Skip to content

Commit 3a9299b

Browse files
committed
cvp-2129 PR analysis
1 parent f588eba commit 3a9299b

File tree

6 files changed

+480
-0
lines changed

6 files changed

+480
-0
lines changed

pr-analysis/AvgMergeTimes.png

125 KB
Loading

pr-analysis/PR_Info_Monthly.csv

Lines changed: 205 additions & 0 deletions
Large diffs are not rendered by default.

pr-analysis/PR_analysis.py

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
from pandas.core.frame import DataFrame
2+
import pandas as pd
3+
from pandas import json_normalize
4+
import numpy as np
5+
import matplotlib.pyplot as plt
6+
import time
7+
from datetime import datetime, timedelta
8+
import matplotlib.dates as mdates
9+
from matplotlib.dates import DayLocator, HourLocator, DateFormatter, drange
10+
from graphql_query import get_PR_data
11+
12+
13+
def createDateColumn(dataframe):
    """Add a 'Date Merged' column of real date objects to the frame.

    Each 'node.mergedAt' GitHub timestamp string is parsed with the
    "%Y-%m-%dT%H:%M:%SZ" format; unmerged PRs (None) are recorded as the
    sentinel string "None" so row counts stay aligned.
    """
    timestamp_format = r"%Y-%m-%dT%H:%M:%SZ"
    merged_dates = []
    for raw in dataframe['node.mergedAt']:
        if raw is None:
            # PR was never merged — keep a placeholder entry.
            merged_dates.append("None")
        else:
            # Parse the timestamp string and keep only the calendar date.
            merged_dates.append(datetime.strptime(raw, timestamp_format).date())
    dataframe['Date Merged'] = merged_dates

    return dataframe
31+
32+
33+
def numPRMerged_graph(df):
    """Bar-plot the number of PRs merged per date; save as PRmergeRates.png."""
    merge_dates = df['dates']
    merge_counts = df['counts']

    # Build an evenly spaced (12-week) grid spanning the first to last merge
    # date; its smallest spacing becomes the bar width.
    grid = mdates.drange(min(merge_dates), max(merge_dates), timedelta(weeks=12))

    fig, ax = plt.subplots()

    # matplotlib works internally with float-based dates, hence the conversion.
    grid = mdates.date2num(grid)
    bar_width = np.diff(grid).min()

    # Plot the per-date counts directly against the date values rather than
    # against positional x-values of [0, 1, 2, ...].
    ax.bar(merge_dates, merge_counts.tolist(), align='center', width=bar_width, ec='blue')

    # Interpret the x-axis values as dates.
    ax.xaxis_date()

    # Make space for and rotate the x-axis tick labels.
    fig.autofmt_xdate()
    plt.ylabel('Counts')
    plt.xlabel('Dates')
    plt.title('Number of PRs merged over time')
    plt.savefig('PRmergeRates.png', dpi=400)
    plt.show()
66+
67+
68+
def computeMergetime(created_at, merged_at):
    """Return the whole number of days between PR creation and merge.

    Both arguments are GitHub "%Y-%m-%dT%H:%M:%SZ" timestamp strings.
    The result is truncated toward zero (int), not rounded.
    """
    fmt = r"%Y-%m-%dT%H:%M:%SZ"
    delta = datetime.strptime(merged_at, fmt) - datetime.strptime(created_at, fmt)
    # 86400 seconds in a day.
    return int(delta.total_seconds() / 86400)
76+
77+
78+
def addlabels(x, y):
    """Annotate the current bar chart: draw value y[i] centred over bar i."""
    for idx, _ in enumerate(x):
        plt.text(idx, y[idx], y[idx], ha='center')
81+
82+
83+
def avgMergetime_graph(df):
    """Bar-plot monthly average merge times; save as AvgMergeTimes.png."""
    months = df['Merged_YM']
    averages = df['mergetime']

    fig, ax = plt.subplots()
    positions = np.arange(len(months))  # one x-axis slot per month
    plt.bar(positions, averages)
    plt.xticks(positions, months)  # label each slot with its month

    # Make space for and rotate the x-axis tick labels.
    fig.autofmt_xdate()
    ax.xaxis_date()
    addlabels(months, averages)

    plt.xlabel("Dates")
    plt.ylabel("Merge Time in Days")
    plt.title("Avg Merge Times")
    plt.savefig('AvgMergeTimes.png', dpi=400)
    plt.show()
101+
102+
103+
def avgMergetime(df):
    """Compute each PR's merge time, average it per month, and plot the result."""
    # 1. Per-PR merge time in whole days ("None" for PRs never merged).
    merge_days = []
    for _, row in df.iterrows():
        merged_at = row.loc['node.mergedAt']
        if merged_at is None:
            merge_days.append("None")
        else:
            merge_days.append(computeMergetime(row.loc['node.createdAt'],
                                               merged_at))
    df['mergetime'] = merge_days

    # 2. Average merge time per merge month. Unmerged PRs get NaT for
    #    Merged_YM and therefore drop out of the groupby.
    df['Merged_YM'] = pd.to_datetime(df['node.mergedAt']).dt.to_period('M')
    monthly = df.filter(['Merged_YM', 'mergetime'], axis=1)
    mean_df = monthly.groupby('Merged_YM')['mergetime'].mean().reset_index()
    # Truncate the float averages to whole days.
    mean_df['mergetime'] = mean_df.mergetime.astype(int)

    # 3. Bar graph of the monthly averages.
    avgMergetime_graph(mean_df)
129+
130+
131+
def getMonthlyPRinfo(df):
    """Write PR_Info_Monthly.csv listing each PR's merge month, title and URL.

    Assumes df already carries the 'Merged_YM' column added by avgMergetime();
    DataFrame.filter silently drops any column that is missing — TODO confirm
    callers always run avgMergetime first.
    """
    new_df = df.filter(['Merged_YM', 'node.title', 'node.url'], axis=1)
    # The original called new_df.groupby('Merged_YM') here and discarded the
    # result; groupby returns a new object, so that call was a no-op and has
    # been removed.
    new_df.to_csv('PR_Info_Monthly.csv', index=False)
138+
139+
140+
def process_data(dataframe):
    """Drive the analysis: build the merge-date column, then produce both
    graphs and the monthly CSV from the queried PR data."""
    # Parse the merge timestamps into real date objects.
    dataframe = createDateColumn(dataframe)

    # Count how many PRs were merged on each date and turn the result into a
    # two-column frame.
    counts = dataframe['Date Merged'].value_counts()
    freq_df = pd.DataFrame(counts)
    freq_df = freq_df.reset_index()
    freq_df.columns = ['dates', 'counts']
    # Drop the sentinel row for PRs that were never merged.
    merged_only = freq_df.loc[freq_df["dates"] != "None"]

    # 1. Graph: number of PRs merged over time.
    numPRMerged_graph(merged_only)
    # 2. Graph: average PR merge time per month.
    avgMergetime(dataframe)
    # 3. CSV table with PR info for each month.
    getMonthlyPRinfo(dataframe)
    # TODO: pie chart of author-is-merger vs author-is-not-the-merger.
164+
165+
166+
def main():
    """Fetch all PR data via the GraphQL query and run the full analysis."""
    # None means "start from the first page" (no cursor yet).
    res_data = get_PR_data(None)
    process_data(res_data)


# Guard the entry point so importing this module does not immediately run the
# network-bound analysis (the original called main() unconditionally).
if __name__ == "__main__":
    main()

pr-analysis/PRmergeRates.png

92.8 KB
Loading

pr-analysis/README.md

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# PR Analysis on operator-test-playbooks
2+
3+
To run the code, you can use this command: **python PR_analysis.py**
4+
5+
**NOTE** Before you run the code, you must add a personal github token. The token can be generated by following this [guide](https://docs.github.com/en/github/authenticating-to-github/keeping-your-account-and-data-secure/creating-a-personal-access-token)
6+
7+
### Two ways to add token:
8+
9+
1. You can do this by adding the token in **graphql_query.py** file by replacing **Add_Your_Token_Here** in the headers (uncomment this line).
10+
Make sure to remove this token before pushing any changes.
11+
12+
2. You can add the token in the command line by following this [guide](https://docs.github.com/en/github/authenticating-to-github/keeping-your-account-and-data-secure/creating-a-personal-access-token#using-a-token-on-the-command-line)
13+
14+
Once the code is finished running, you will have two png files and one csv file saved in your folder. These are the graphs from the analysis of the queried data and the information about which PRs have been merged in each month.
15+
16+
# Operator Test Playbooks
17+
18+
## History
19+
20+
## Number of PRs over time
21+
![Number of PRs over time](PRmergeRates.png)
22+
Download : [png](PRmergeRates.png)
23+
24+
## Average PR merging time
25+
![Average PR merging time](AvgMergeTimes.png)
26+
Download : [png](AvgMergeTimes.png)

pr-analysis/graphql_query.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
from string import Template
2+
from python_graphql_client import GraphqlClient
3+
import pandas as pd
4+
import requests
5+
import os
6+
7+
# GitHub API auth header. Prefer the GITHUB_TOKEN environment variable so a
# real token never has to be committed; the placeholder default keeps the old
# "paste your token here" workflow working as a fallback.
# Remember to remove any pasted token before pushing changes.
headers = {"Authorization": "token " + os.environ.get("GITHUB_TOKEN",
                                                      "Add_Your_Token_Here")}
9+
10+
11+
def run_query(query):
    """POST the GraphQL query to the GitHub API and return the decoded JSON.

    Raises Exception (with the HTTP status code and the query text) whenever
    GitHub answers anything other than 200.
    """
    response = requests.post('https://api.github.com/graphql',
                             json={'query': query}, headers=headers)
    # 200 means the request was fulfilled.
    if response.status_code != 200:
        raise Exception(
            "Query failed to run by returning code of {}. {}".format(
                response.status_code, query))
    return response.json()
23+
24+
25+
def build_query(pr_cursor):
    """Return the paginated pull-request query with $cursor substituted.

    pr_cursor must already be GraphQL-ready: either the literal string
    "null" or a double-quoted cursor (see format_cursor).
    """
    query_template = Template("""{
    repository(owner: "redhat-openshift-ecosystem", name: "operator-test-playbooks") {
        pullRequests(first: 15, after: $cursor) {
            pageInfo{
                hasNextPage
                endCursor
            }
            edges {
                node {
                    author {
                        login
                    }
                    mergedBy {
                        login
                    }
                    createdAt
                    mergedAt
                    title
                    url
                }
            }
        }
    }
}
""")
    return query_template.substitute(cursor=pr_cursor)
51+
52+
53+
def format_cursor(cursor):
    """Wrap a cursor in double quotation marks, as the GraphQL API requires."""
    return f'"{cursor}"'
56+
57+
58+
def get_PR_data(cursor):
    """Page through every pull request and return one combined DataFrame.

    cursor is the raw GraphQL cursor to start from (None for the first page).
    """
    pages = []
    has_next = True
    while has_next:
        # The API expects either the literal null or a double-quoted cursor.
        cursor = "null" if cursor is None else format_cursor(cursor)
        result = run_query(build_query(cursor))
        pull_requests = result['data']['repository']['pullRequests']
        pages.append(pd.json_normalize(pull_requests['edges']))
        page_info = pd.json_normalize(pull_requests['pageInfo'])
        cursor = page_info.loc[0, 'endCursor']        # raw cursor for next page
        has_next = page_info.loc[0, 'hasNextPage']
    res_data = pd.concat(pages)  # one frame with all PRs
    # Drop the 'node.mergedBy' column. NOTE(review): DataFrame.pop raises
    # KeyError if the column is absent — presumably json_normalize always
    # produces it for this query; verify against real API responses.
    res_data.pop('node.mergedBy')

    return res_data

0 commit comments

Comments
 (0)