Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions docs/index.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# PR History Statistics

## Number of PR merged over time

![Number of PR over time](pr-analysis/PRmergeRates.png)
Download: [png](pr-analysis/PRmergeRates.png)

## Average PR merging time

![Average PR merging time](pr-analysis/AvgMergeTimes.png)
Download: [png](pr-analysis/AvgMergeTimes.png)

Binary file added docs/pr-analysis/AvgMergeTimes.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
205 changes: 205 additions & 0 deletions docs/pr-analysis/PR_Info_Monthly.csv

Large diffs are not rendered by default.

172 changes: 172 additions & 0 deletions docs/pr-analysis/PR_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import matplotlib.dates as mdates
from matplotlib.dates import DayLocator, HourLocator, DateFormatter, drange
from graphql_query import get_PR_data


def createDateColumn(dataframe):
    """Add a 'Date Merged' column of ``datetime.date`` objects to *dataframe*.

    Parses the ISO-8601 UTC timestamps in the 'node.mergedAt' column
    (``YYYY-MM-DDTHH:MM:SSZ``). Rows for unmerged PRs (no timestamp)
    get the sentinel string "None", which downstream code filters out.

    Returns the same dataframe, mutated in place.
    """
    format_str = r"%Y-%m-%dT%H:%M:%SZ"
    new_dates = []
    for raw in dataframe['node.mergedAt']:
        # isinstance guards against both None and float NaN (pandas may
        # store missing values either way); the original double-if only
        # handled None and would produce a short, misaligned column for NaN.
        if isinstance(raw, str):
            new_dates.append(datetime.strptime(raw, format_str).date())
        else:
            new_dates.append("None")
    dataframe['Date Merged'] = new_dates

    return dataframe


def numPRMerged_graph(df):
    """Plot the number of merged PRs per day as a bar chart.

    Expects *df* with a 'dates' column (``datetime.date`` values) and a
    'counts' column. Saves the figure to PRmergeRates.png in the current
    directory and then opens an interactive window (``plt.show()`` blocks
    until it is closed).
    """
    # get oldest and youngest dates from the list
    datelist = df['dates']
    oldest = min(datelist)
    youngest = max(datelist)
    # Sample the full date span at 12-week intervals; this sequence is
    # only used below to derive a bar width, not as the plot's x-values.
    timegap = 12
    dates = mdates.drange(oldest, youngest, timedelta(weeks=timegap))
    # data
    counts = df['counts']
    # Set up the axes and figure
    fig, ax = plt.subplots()
    # Convert the datetimes into matplotlib's float-based date format so
    # the spacing (and hence the bar width) can be computed numerically.
    dates = mdates.date2num(dates)
    # NOTE(review): with a fixed 12-week step every gap is equal, so this
    # width is ~84 days — far wider than the per-day x positions used
    # below, so adjacent bars can overlap. Confirm this is intended.
    width = np.diff(dates).min()

    # Bar plot keyed directly on the dates instead of against
    # x-values of [0, 1, 2, ...]
    ax.bar(datelist, counts.tolist(), align='center', width=width, ec='blue')

    # Tell matplotlib to interpret the x-axis values as dates
    ax.xaxis_date()

    # Make space for and rotate the x-axis tick labels
    fig.autofmt_xdate()
    plt.ylabel('Counts')
    plt.xlabel('Dates')
    plt.title('Number of PRs merged over time')
    plt.savefig('PRmergeRates.png', dpi=400)
    plt.show()


def computeMergetime(created_at, merged_at):
    """Return the whole number of days between PR creation and merge.

    Both arguments are ISO-8601 UTC timestamp strings of the form
    ``YYYY-MM-DDTHH:MM:SSZ``. Any fractional part of a day is truncated.
    """
    timestamp_format = r"%Y-%m-%dT%H:%M:%SZ"
    created = datetime.strptime(created_at, timestamp_format)
    merged = datetime.strptime(merged_at, timestamp_format)
    elapsed = merged - created
    # 86400 seconds per day; int() truncates toward zero.
    return int(elapsed.total_seconds() / 86400)


def addlabels(x, y):
    """Annotate each bar of the current bar chart with its value.

    ``x`` supplies the number of bars; for bar index ``i`` the label
    text is ``y[i]``, centred horizontally on the bar.
    """
    for index in range(len(x)):
        value = y[index]
        plt.text(index, value, value, ha='center')


def avgMergetime_graph(df):
    """Plot the average PR merge time per month as a labelled bar chart.

    Expects *df* with a 'Merged_YM' column (year-month periods) and a
    'mergetime' column (integer average days). Saves the figure to
    AvgMergeTimes.png in the current directory and then opens an
    interactive window (``plt.show()`` blocks until it is closed).
    """
    x = df['Merged_YM']
    y = df['mergetime']
    fig, ax = plt.subplots()
    # Plot against integer positions, then relabel the ticks with the
    # year-month values.
    x_pos = np.arange(len(x))  # <--
    plt.bar(x_pos, y)
    plt.xticks(x_pos, x)  # <--
    # Make space for and rotate the x-axis tick labels
    fig.autofmt_xdate()
    ax.xaxis_date()
    # Write each bar's value above it.
    addlabels(x, y)
    plt.xlabel("Dates")
    plt.ylabel("Merge Time in Days")
    plt.title("Avg Merge Times")
    plt.savefig('AvgMergeTimes.png', dpi=400)
    plt.show()


def avgMergetime(df):
    """Compute each PR's merge time, average it per month, and plot it.

    Mutates *df* in place, adding two columns: 'mergetime' (days from
    creation to merge, or the string "None" for unmerged PRs) and
    'Merged_YM' (the merge year-month as a pandas Period; NaT when
    unmerged). Delegates plotting to avgMergetime_graph().
    """
    # 1. calculate the mergetime for each PR and add to the dataframe
    mergetime_ = []

    for index, row in df.iterrows():
        if (row.loc['node.mergedAt'] is not None):
            mergetime = computeMergetime(row.loc['node.createdAt'],
                                         row.loc['node.mergedAt'])
            mergetime_.append(mergetime)
        else:
            # Unmerged PR: string sentinel keeps the list the same
            # length as the frame so the column assignment aligns.
            mergetime_.append("None")
    df['mergetime'] = mergetime_

    # 2. calculate the average merge time for each month.
    # Unmerged rows become NaT here, and groupby drops the NaT group,
    # so the "None" sentinels never reach mean().
    df['Merged_YM'] = pd.to_datetime(df['node.mergedAt']).dt.to_period('M')
    new_df = df.filter(['Merged_YM', 'mergetime'], axis=1)
    group_mean = new_df.groupby('Merged_YM')['mergetime'].mean()
    mean_df = group_mean.reset_index()
    # change from float to int (truncates the fractional day)
    mean_df['mergetime'] = mean_df.mergetime.astype(int)

    # 3. create a bar graph
    avgMergetime_graph(mean_df)


def getMonthlyPRinfo(df):
    """Write per-PR info (merge month, title, URL) to PR_Info_Monthly.csv.

    Keeps only the 'Merged_YM', 'node.title' and 'node.url' columns of
    *df* and writes them, without the index, to PR_Info_Monthly.csv in
    the current directory so PRs can be looked up by year-month.
    """
    new_df = df.filter(['Merged_YM', 'node.title', 'node.url'], axis=1)
    # NOTE: a previous `new_df.groupby('Merged_YM')` call here discarded
    # its result — a no-op, removed; the CSV content is unchanged.
    new_df.to_csv('PR_Info_Monthly.csv', index=False)


def process_data(dataframe):
    """Process the queried PR data and produce all three outputs.

    Called from main() with the raw flattened query results. Produces:
    a per-day merge-count graph, an average-merge-time graph, and the
    PR_Info_Monthly.csv table (all written to the current directory).
    """
    # add a new column for just the date in date format
    dataframe = createDateColumn(dataframe)
    # get the frequency of each date
    frequency = dataframe['Date Merged'].value_counts()
    # converting to df and assigning new names to the columns.
    # NOTE(review): the column layout that value_counts/reset_index
    # produces varies across pandas versions; the hard rename below
    # assumes exactly two columns — verify against the pinned pandas.
    df_value_counts = pd.DataFrame(frequency)
    df_value_counts = df_value_counts.reset_index()
    # change column names
    df_value_counts.columns = ['dates', 'counts']
    # drop the sentinel row for unmerged PRs ("None" string set by
    # createDateColumn)
    dateFreq = df_value_counts.loc[df_value_counts["dates"] != "None"]

    # 1. Create a graph for number of PRs merged over time
    numPRMerged_graph(dateFreq)
    # 2. Create a graph for avg PR merge time
    avgMergetime(dataframe)
    # 3. A table with PR info for each month
    getMonthlyPRinfo(dataframe)


def main():
    """Entry point: fetch all PR data from the GitHub GraphQL API and
    generate the graphs and the CSV summary."""
    # Start from the first page of results (no cursor yet).
    pr_cursor = None
    res_data = get_PR_data(pr_cursor)
    process_data(res_data)


if __name__ == "__main__":
    # Guard the call so importing this module (e.g. for testing or
    # reuse) does not trigger network requests and plotting.
    main()
Binary file added docs/pr-analysis/PRmergeRates.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
22 changes: 22 additions & 0 deletions docs/pr-analysis/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# PR Analysis on operator-test-playbooks

## How to run the code:

### Install Necessary Modules
`pip install -r requirements.txt`

**Note:** you may need to install the python3-tkinter package for graphs to display.

### Add a Token
Before you run the code, you must add a personal github token. The token can be generated by following this [guide](https://docs.github.com/en/github/authenticating-to-github/keeping-your-account-and-data-secure/creating-a-personal-access-token)

You can add the token in the command line by running `export GH_TOKEN=<yourtoken>`

To run the code, you can use this command: `python PR_analysis.py`

### Results
Once the code is finished running, you will have two png files and one csv file saved in this folder.

These are the graphs from the analysis of the queried data, together with information about which PRs were merged in each month.

In PR_Info_Monthly.csv you can search all the Pull Requests based on Year-Month.
76 changes: 76 additions & 0 deletions docs/pr-analysis/graphql_query.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
from string import Template
from pandas import json_normalize
import pandas as pd
import requests
import os

# Personal access token used to authenticate with the GitHub API.
# Read from the environment (see README: `export GH_TOKEN=<token>`);
# raises KeyError at import time when GH_TOKEN is not set.
GH_TOKEN = os.environ['GH_TOKEN']
# Authorization header attached to every GraphQL request.
headers = {"Authorization": f"Bearer {GH_TOKEN}"}


def run_query(query):
    """POST *query* to the GitHub GraphQL endpoint and return the
    decoded JSON response.

    Raises an Exception carrying the HTTP status code and the query
    text when the request is not fulfilled (status code != 200).
    """
    response = requests.post('https://api.github.com/graphql',
                             json={'query': query}, headers=headers)
    # Guard clause: anything other than 200 is treated as a failure.
    if response.status_code != 200:
        raise Exception(
            "Query failed to run by returning code of {}. {}".format(
                response.status_code, query))
    return response.json()


def build_query(pr_cursor):
return Template("""{
repository(owner: "redhat-openshift-ecosystem", name: "operator-test-playbooks") {
pullRequests(first: 15, after: $cursor) {
pageInfo{
hasNextPage
endCursor
}
edges {
node {
author {
login
}
mergedBy {
login
}
createdAt
mergedAt
title
url
}
}
}
}
}
""").substitute({'cursor': pr_cursor})


def format_cursor(cursor):
    """Wrap *cursor* in double quotes, as the GraphQL API requires
    string cursors to appear in the query text."""
    return f'"{cursor}"'


def get_PR_data(cursor):
    """Fetch every pull request of the repository, page by page, and
    return them as a single pandas DataFrame.

    Parameters
    ----------
    cursor : str or None
        GraphQL end-cursor to resume pagination from; pass None to
        start at the first page.
    """
    all_data = []
    hasNextPage = True
    while hasNextPage:
        # GraphQL expects the literal `null` for the first page and a
        # double-quoted cursor string for subsequent pages.
        cursor = "null" if cursor is None else format_cursor(cursor)
        getPRinfo = build_query(cursor)
        result = run_query(getPRinfo)
        # Flatten the nested JSON: one row per PR edge.
        data_frame = pd.json_normalize(result['data']['repository']['pullRequests']['edges'])
        page_info = pd.json_normalize(result['data']['repository']['pullRequests']['pageInfo'])
        all_data.append(data_frame)
        cursor = page_info.loc[0, 'endCursor']  # update cursor
        hasNextPage = page_info.loc[0, 'hasNextPage']  # update hasNextPage
    res_data = pd.concat(all_data)  # creating a df with all PRs
    # Drop the mergedBy column; it is not used by the analysis.
    # NOTE(review): json_normalize emits a plain 'node.mergedBy' column
    # only for pages where mergedBy is null (otherwise it flattens to
    # 'node.mergedBy.login'), so this pop() could raise KeyError if no
    # page contains a null mergedBy — confirm against live data.
    res_data.pop('node.mergedBy')

    return res_data
14 changes: 14 additions & 0 deletions docs/pr-analysis/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
jsonpatch==1.32
jsonpointer==2.1
jsonschema==3.2.0
matplotlib==3.4.2
numpy==1.21.1
pandas==1.3.0
python-dateutil==2.8.0
requests==2.25.1
requests-file==1.4.3
requests-ftp==0.3.1
requests-ntlm==1.1.0
requestsexceptions==1.4.0
simplejson==3.17.2
ujson==4.0.2
1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
site_name: Operator Test Playbooks - PR History