Skip to content

Commit f6d1efe

Browse files
committed
cvp-2129 PR analysis
1 parent f588eba commit f6d1efe

File tree

6 files changed

+243
-0
lines changed

6 files changed

+243
-0
lines changed

pr-analysis/AvgMergeTimes.png

125 KB
Loading

pr-analysis/PR_analysis.py

Lines changed: 156 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,156 @@
1+
from pandas.core.frame import DataFrame
2+
import pandas as pd
3+
from pandas import json_normalize
4+
import numpy as np
5+
import matplotlib.pyplot as plt
6+
import time
7+
from datetime import datetime, timedelta
8+
import matplotlib.dates as mdates
9+
from matplotlib.dates import DayLocator, HourLocator, DateFormatter, drange
10+
from graphql_query import get_PR_data
11+
12+
13+
def createDateColumn(dataframe):
    """Add a 'Date Merged' column to *dataframe*.

    Parses the ISO-8601 timestamp strings in 'node.mergedAt' into
    datetime.date objects. Unmerged PRs (missing mergedAt) get the
    sentinel string "None" so downstream code can filter them out.

    Returns the same DataFrame with the new column attached.
    """
    new_date_col = []  # parsed dates (or "None") in row order
    format_str = r"%Y-%m-%dT%H:%M:%SZ"
    for merged_at in dataframe['node.mergedAt']:
        # NOTE(fix): the original used two separate `if` tests
        # (`!= None` then `== None`); any value that was neither —
        # e.g. NaN, which pandas uses for missing data — appended
        # nothing and made the column shorter than the frame.
        if merged_at is not None:
            # Convert the timestamp string to a plain date.
            newdate = datetime.strptime(merged_at, format_str)
            new_date_col.append(newdate.date())
        else:
            # Keep the string sentinel — process_data() filters on it.
            new_date_col.append("None")
    dataframe['Date Merged'] = new_date_col

    return dataframe
28+
29+
30+
def numPRMerged_graph(df):
    """Plot a bar chart of how many PRs were merged on each date.

    Expects *df* to carry a 'dates' column (datetime.date values) and a
    'counts' column. Saves the figure to PRmergeRates.png and shows it.
    """
    # Earliest and latest merge dates bound the reference date range.
    merge_dates = df['dates']
    merge_counts = df['counts']
    earliest = min(merge_dates)
    latest = max(merge_dates)

    # Build an evenly spaced range between the two extremes; its
    # smallest gap (in matplotlib float-date units) becomes the bar
    # width. 12-week spacing, as in the original layout.
    spaced = mdates.drange(earliest, latest, timedelta(weeks=12))
    spaced = mdates.date2num(spaced)
    bar_width = np.diff(spaced).min()

    fig, ax = plt.subplots()

    # Plot the dates directly on the x-axis rather than 0..n positions.
    ax.bar(merge_dates, merge_counts.tolist(), align='center',
           width=bar_width, ec='blue')

    # Interpret x values as dates and rotate the labels to fit.
    ax.xaxis_date()
    fig.autofmt_xdate()

    plt.ylabel('Counts')
    plt.xlabel('Dates')
    plt.title('Number of PRs merged over time')
    plt.savefig('PRmergeRates.png', dpi=400)
    plt.show()
66+
67+
def computeMergetime(created_at, merged_at):
    """Return the whole number of days between creation and merge.

    Both arguments are ISO-8601 timestamp strings of the form
    "%Y-%m-%dT%H:%M:%SZ"; the fractional part of a day is truncated.
    """
    fmt = r"%Y-%m-%dT%H:%M:%SZ"
    created = datetime.strptime(created_at, fmt)
    merged = datetime.strptime(merged_at, fmt)
    # 86400 seconds in a day; int() truncates toward zero.
    days = (merged - created).total_seconds() / 86400
    return int(days)
75+
76+
def addlabels(x, y):
    """Write each bar's value centered at its x position on the plot."""
    for pos, _ in enumerate(x):
        # y is indexed by label (matches position for a RangeIndex frame).
        plt.text(pos, y[pos], y[pos], ha='center')
79+
80+
def avgMergetime_graph(df):
    """Draw a labelled bar chart of the average merge time per month.

    Expects *df* to carry 'Merged_YM' (year-month periods) and
    'mergetime' (average days, int). Saves the figure to
    AvgMergeTimes.png and shows it.
    """
    months = df['Merged_YM']
    avg_days = df['mergetime']

    fig, ax = plt.subplots()
    positions = np.arange(len(months))  # one slot per month on the x-axis
    plt.bar(positions, avg_days)
    plt.xticks(positions, months)  # label each slot with its month

    # Rotate the x-axis tick labels so they don't overlap.
    fig.autofmt_xdate()
    ax.xaxis_date()
    addlabels(months, avg_days)  # annotate each bar with its value

    plt.xlabel("Dates")
    plt.ylabel("Merge Time in Days")
    plt.title("Avg Merge Times")
    plt.savefig('AvgMergeTimes.png', dpi=400)
    plt.show()
98+
99+
def avgMergetime(df):
    """Compute each PR's merge time, average it per month, and plot it.

    Adds a 'mergetime' column (days from creation to merge, or the
    sentinel string "None" for unmerged PRs) and a 'Merged_YM' period
    column to *df*, then draws a bar chart of the monthly averages.
    """
    # 1. Per-PR merge time. NOTE(fix): the original tested
    #    `row['node.mergedAt'] != None`, which is True for NaN — the
    #    value pandas substitutes for missing data — and would crash
    #    strptime. pd.notna() covers both None and NaN.
    mergetime_ = []
    for created, merged in zip(df['node.createdAt'], df['node.mergedAt']):
        if pd.notna(merged):
            mergetime_.append(computeMergetime(created, merged))
        else:
            mergetime_.append("None")  # sentinel for unmerged PRs
    df['mergetime'] = mergetime_

    # 2. Average merge time per month. Rows with a missing mergedAt
    #    become NaT periods and are dropped by groupby automatically,
    #    so the "None" sentinels never reach the mean.
    df['Merged_YM'] = pd.to_datetime(df['node.mergedAt']).dt.to_period('M')
    new_df = df.filter(['Merged_YM', 'mergetime'], axis=1)

    group_mean = new_df.groupby('Merged_YM')['mergetime'].mean()
    mean_df = group_mean.reset_index()

    mean_df['mergetime'] = mean_df.mergetime.astype(int)  # truncate to whole days

    # 3. Create the bar graph.
    avgMergetime_graph(mean_df)
127+
128+
def process_data(dataframe):
    """Process the queried PR data and produce both analysis graphs.

    Called from main() with the DataFrame built by get_PR_data().
    """
    # Parse the merge timestamps into a real date column.
    dataframe = createDateColumn(dataframe)

    # Count how many PRs were merged on each date, then rename the
    # resulting columns to something readable.
    frequency = dataframe['Date Merged'].value_counts()
    df_value_counts = pd.DataFrame(frequency).reset_index()
    df_value_counts.columns = ['dates', 'counts']

    # Drop the sentinel row counting PRs that were never merged.
    dateFreq = df_value_counts.loc[df_value_counts["dates"] != "None"]

    # 1. Number of PRs merged over time.
    numPRMerged_graph(dateFreq)
    # 2. Average PR merge time per month.
    avgMergetime(dataframe)
    # TODO: pie chart of author-is-merger vs author-is-not-the-merger.
148+
149+
150+
def main():
    """Fetch PR data from the GitHub GraphQL API and run the analysis."""
    pr_cursor = None  # None means: start from the first page of results
    res_data = get_PR_data(pr_cursor)
    process_data(res_data)


if __name__ == "__main__":
    # NOTE(fix): the original called main() unconditionally, so merely
    # importing this module triggered a full GitHub query and analysis
    # run. Guard the entry point instead.
    main()

pr-analysis/PRmergeRates.png

92.9 KB
Loading

pr-analysis/README.md

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,8 @@
1+
# PR Analysis on operator-test-playbooks
2+
3+
To run the code, use this command: **python PR_analysis.py**
4+
5+
**NOTE**: Before you run the code, you must add a GitHub token in the **graphql_query.py** file by replacing **Add_Your_Token_Here** in the headers.
6+
The token can be generated by following this [guide](https://docs.github.com/en/github/authenticating-to-github/keeping-your-account-and-data-secure/creating-a-personal-access-token)
7+
8+
Once the code is finished running, you will have two png files saved in your folder. These are the graphs from the analysis of the queried data.

pr-analysis/graphql_query.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,67 @@
1+
import json
import os
from string import Template

import pandas as pd
import requests
from python_graphql_client import GraphqlClient
6+
7+
# SECURITY(fix): the original committed a real personal access token here.
# Never commit secrets — the README says to replace the placeholder; better
# still, export GITHUB_TOKEN in your environment and it is picked up here.
headers = {"Authorization": "token " + os.environ.get("GITHUB_TOKEN", "Add_Your_Token_Here")}
8+
9+
10+
def run_query(query):
    """POST *query* to the GitHub GraphQL endpoint and return the JSON reply.

    Raises an Exception carrying the HTTP status code and the query text
    when the request is not fulfilled (status != 200).
    """
    response = requests.post(
        'https://api.github.com/graphql',
        json={'query': query},
        headers=headers,
    )
    if response.status_code != 200:
        raise Exception("Query failed to run by returning code of {}. {}".format(response.status_code, query))
    # 200 means the request was fulfilled.
    return response.json()
17+
18+
def build_query(pr_cursor):
    """Return the GraphQL query string for one page of pull requests.

    *pr_cursor* must already be formatted for the API: either the
    literal string "null" (first page) or a double-quoted cursor
    (see format_cursor).
    """
    query_template = Template("""{
    repository(owner: "redhat-openshift-ecosystem", name: "operator-test-playbooks") {
        pullRequests(first: 15, after: $cursor) {
            pageInfo{
                hasNextPage
                endCursor
            }
            edges {
                node {
                    author {
                        login
                    }
                    mergedBy {
                        login
                    }
                    createdAt
                    mergedAt
                }
            }
        }
    }
}
""")
    return query_template.substitute(cursor=pr_cursor)
43+
44+
def format_cursor(cursor):
    """Wrap *cursor* in double quotation marks, as the API requires."""
    return f'"{cursor}"'
47+
48+
#This function will create and return a data frame with the data returned from the query
49+
def get_PR_data(cursor):
50+
all_data = []
51+
hasNextPage = True
52+
while (hasNextPage == True):
53+
cursor = "null" if cursor is None else format_cursor(cursor)
54+
getPRinfo = build_query(cursor)
55+
result = run_query(getPRinfo)
56+
#print(result)
57+
data_frame = pd.json_normalize(result['data']['repository']['pullRequests']['edges'])
58+
page_info = pd.json_normalize(result['data']['repository']['pullRequests']['pageInfo'])
59+
#print(data_frame)
60+
#print(page_info)
61+
all_data.append(data_frame)
62+
cursor = page_info.loc[0,'endCursor'] #update cursor
63+
hasNextPage = page_info.loc[0,'hasNextPage'] #update hasNextPage
64+
res_data = pd.concat(all_data) #creating a df with all PRs
65+
res_data.pop('node.mergedBy')
66+
#print(res_data)
67+
return res_data

pr-analysis/stats.md

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,12 @@
1+
# Operator Test Playbooks
2+
3+
## History
4+
5+
### Number of PRs over time
6+
![Number of PR over time](PRmergeRates.png)
7+
Download : [png](PRmergeRates.png) [pdf](PRmergeRates.pdf)
8+
9+
### Average PR merging time
10+
![Average PR merging time](AvgMergeTimes.png)
11+
Download : [png](AvgMergeTimes.png) [pdf](AvgMergeTimes.pdf)
12+

0 commit comments

Comments
 (0)