Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
12 changes: 12 additions & 0 deletions docs/index.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,12 @@
# PR History Statistics

## Number of PR merged over time

![Number of PR over time](pr-analysis/PRmergeRates.png)
Download: [png](pr-analysis/PRmergeRates.png)

## Average PR merging time

![Average PR merging time](pr-analysis/AvgMergeTimes.png)
Download: [png](pr-analysis/AvgMergeTimes.png)

Binary file added docs/pr-analysis/AvgMergeTimes.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
205 changes: 205 additions & 0 deletions docs/pr-analysis/PR_Info_Monthly.csv

Large diffs are not rendered by default.

172 changes: 172 additions & 0 deletions docs/pr-analysis/PR_analysis.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,172 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import matplotlib.dates as mdates
from matplotlib.dates import DayLocator, HourLocator, DateFormatter, drange
from graphql_query import get_PR_data


def createDateColumn(dataframe):
    """Add a 'Date Merged' column of ``datetime.date`` objects to *dataframe*.

    Parses the ISO-8601 UTC timestamps in the 'node.mergedAt' column
    (``YYYY-MM-DDTHH:MM:SSZ``). Rows for unmerged PRs (no timestamp)
    get the sentinel string "None", which downstream code filters out.

    Returns the same dataframe, mutated in place.
    """
    format_str = r"%Y-%m-%dT%H:%M:%SZ"
    new_dates = []
    for raw in dataframe['node.mergedAt']:
        # isinstance guards against both None and float NaN (pandas may
        # store missing values either way); the original double-if only
        # handled None and would produce a short, misaligned column for NaN.
        if isinstance(raw, str):
            new_dates.append(datetime.strptime(raw, format_str).date())
        else:
            new_dates.append("None")
    dataframe['Date Merged'] = new_dates

    return dataframe


def numPRMerged_graph(df):
    """Plot the number of merged PRs per day as a bar chart.

    Expects *df* with a 'dates' column (``datetime.date`` values) and a
    'counts' column. Saves the figure to PRmergeRates.png in the current
    directory and then opens an interactive window (``plt.show()`` blocks
    until it is closed).
    """
    # get oldest and youngest dates from the list
    datelist = df['dates']
    oldest = min(datelist)
    youngest = max(datelist)
    # Sample the full date span at 12-week intervals; this sequence is
    # only used below to derive a bar width, not as the plot's x-values.
    timegap = 12
    dates = mdates.drange(oldest, youngest, timedelta(weeks=timegap))
    # data
    counts = df['counts']
    # Set up the axes and figure
    fig, ax = plt.subplots()
    # Convert the datetimes into matplotlib's float-based date format so
    # the spacing (and hence the bar width) can be computed numerically.
    dates = mdates.date2num(dates)
    # NOTE(review): with a fixed 12-week step every gap is equal, so this
    # width is ~84 days — far wider than the per-day x positions used
    # below, so adjacent bars can overlap. Confirm this is intended.
    width = np.diff(dates).min()

    # Bar plot keyed directly on the dates instead of against
    # x-values of [0, 1, 2, ...]
    ax.bar(datelist, counts.tolist(), align='center', width=width, ec='blue')

    # Tell matplotlib to interpret the x-axis values as dates
    ax.xaxis_date()

    # Make space for and rotate the x-axis tick labels
    fig.autofmt_xdate()
    plt.ylabel('Counts')
    plt.xlabel('Dates')
    plt.title('Number of PRs merged over time')
    plt.savefig('PRmergeRates.png', dpi=400)
    plt.show()


def computeMergetime(created_at, merged_at):
    """Return the whole number of days between PR creation and merge.

    Both arguments are ISO-8601 UTC timestamp strings of the form
    ``YYYY-MM-DDTHH:MM:SSZ``. Any fractional part of a day is truncated.
    """
    timestamp_format = r"%Y-%m-%dT%H:%M:%SZ"
    created = datetime.strptime(created_at, timestamp_format)
    merged = datetime.strptime(merged_at, timestamp_format)
    elapsed = merged - created
    # 86400 seconds per day; int() truncates toward zero.
    return int(elapsed.total_seconds() / 86400)


def addlabels(x, y):
    """Annotate each bar of the current bar chart with its value.

    ``x`` supplies the number of bars; for bar index ``i`` the label
    text is ``y[i]``, centred horizontally on the bar.
    """
    for index in range(len(x)):
        value = y[index]
        plt.text(index, value, value, ha='center')


def avgMergetime_graph(df):
    """Plot the average PR merge time per month as a labelled bar chart.

    Expects *df* with a 'Merged_YM' column (year-month periods) and a
    'mergetime' column (integer average days). Saves the figure to
    AvgMergeTimes.png in the current directory and then opens an
    interactive window (``plt.show()`` blocks until it is closed).
    """
    x = df['Merged_YM']
    y = df['mergetime']
    fig, ax = plt.subplots()
    # Plot against integer positions, then relabel the ticks with the
    # year-month values.
    x_pos = np.arange(len(x))  # <--
    plt.bar(x_pos, y)
    plt.xticks(x_pos, x)  # <--
    # Make space for and rotate the x-axis tick labels
    fig.autofmt_xdate()
    ax.xaxis_date()
    # Write each bar's value above it.
    addlabels(x, y)
    plt.xlabel("Dates")
    plt.ylabel("Merge Time in Days")
    plt.title("Avg Merge Times")
    plt.savefig('AvgMergeTimes.png', dpi=400)
    plt.show()


def avgMergetime(df):
    """Compute each PR's merge time, average it per month, and plot it.

    Mutates *df* in place, adding two columns: 'mergetime' (days from
    creation to merge, or the string "None" for unmerged PRs) and
    'Merged_YM' (the merge year-month as a pandas Period; NaT when
    unmerged). Delegates plotting to avgMergetime_graph().
    """
    # 1. calculate the mergetime for each PR and add to the dataframe
    mergetime_ = []

    for index, row in df.iterrows():
        if (row.loc['node.mergedAt'] is not None):
            mergetime = computeMergetime(row.loc['node.createdAt'],
                                         row.loc['node.mergedAt'])
            mergetime_.append(mergetime)
        else:
            # Unmerged PR: string sentinel keeps the list the same
            # length as the frame so the column assignment aligns.
            mergetime_.append("None")
    df['mergetime'] = mergetime_

    # 2. calculate the average merge time for each month.
    # Unmerged rows become NaT here, and groupby drops the NaT group,
    # so the "None" sentinels never reach mean().
    df['Merged_YM'] = pd.to_datetime(df['node.mergedAt']).dt.to_period('M')
    new_df = df.filter(['Merged_YM', 'mergetime'], axis=1)
    group_mean = new_df.groupby('Merged_YM')['mergetime'].mean()
    mean_df = group_mean.reset_index()
    # change from float to int (truncates the fractional day)
    mean_df['mergetime'] = mean_df.mergetime.astype(int)

    # 3. create a bar graph
    avgMergetime_graph(mean_df)


def getMonthlyPRinfo(df):
    """Write per-PR info (merge month, title, URL) to PR_Info_Monthly.csv.

    Keeps only the 'Merged_YM', 'node.title' and 'node.url' columns of
    *df* and writes them, without the index, to PR_Info_Monthly.csv in
    the current directory so PRs can be looked up by year-month.
    """
    new_df = df.filter(['Merged_YM', 'node.title', 'node.url'], axis=1)
    # NOTE: a previous `new_df.groupby('Merged_YM')` call here discarded
    # its result — a no-op, removed; the CSV content is unchanged.
    new_df.to_csv('PR_Info_Monthly.csv', index=False)


def process_data(dataframe):
    """Process the queried PR data and produce all three outputs.

    Called from main() with the raw flattened query results. Produces:
    a per-day merge-count graph, an average-merge-time graph, and the
    PR_Info_Monthly.csv table (all written to the current directory).
    """
    # add a new column for just the date in date format
    dataframe = createDateColumn(dataframe)
    # get the frequency of each date
    frequency = dataframe['Date Merged'].value_counts()
    # converting to df and assigning new names to the columns.
    # NOTE(review): the column layout that value_counts/reset_index
    # produces varies across pandas versions; the hard rename below
    # assumes exactly two columns — verify against the pinned pandas.
    df_value_counts = pd.DataFrame(frequency)
    df_value_counts = df_value_counts.reset_index()
    # change column names
    df_value_counts.columns = ['dates', 'counts']
    # drop the sentinel row for unmerged PRs ("None" string set by
    # createDateColumn)
    dateFreq = df_value_counts.loc[df_value_counts["dates"] != "None"]

    # 1. Create a graph for number of PRs merged over time
    numPRMerged_graph(dateFreq)
    # 2. Create a graph for avg PR merge time
    avgMergetime(dataframe)
    # 3. A table with PR info for each month
    getMonthlyPRinfo(dataframe)


def main():
    """Entry point: fetch all PR data from the GitHub GraphQL API and
    generate the graphs and the CSV summary."""
    # Start from the first page of results (no cursor yet).
    pr_cursor = None
    res_data = get_PR_data(pr_cursor)
    process_data(res_data)


if __name__ == "__main__":
    # Guard the call so importing this module (e.g. for testing or
    # reuse) does not trigger network requests and plotting.
    main()
Binary file added docs/pr-analysis/PRmergeRates.png
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
22 changes: 22 additions & 0 deletions docs/pr-analysis/README.md
Original file line number Diff line number Diff line change
@@ -0,0 +1,22 @@
# PR Analysis on operator-test-playbooks

## How to run the code:

### Install Necessary Modules
`pip install -r requirements.txt`

**Note:** you may need to install the python3-tkinter package for graphs to display.

### Add a Token
Before you run the code, you must add a personal github token. The token can be generated by following this [guide](https://docs.github.com/en/github/authenticating-to-github/keeping-your-account-and-data-secure/creating-a-personal-access-token)

You can add the token in the command line by running `export GH_TOKEN=<yourtoken>`

To run the code, you can use this command: `python PR_analysis.py`

### Results
Once the code is finished running, you will have two png files and one csv file saved in this folder.

These are the graphs from the analysis of the queried data, together with information about which PRs were merged in each month.

In PR_Info_Monthly.csv you can search all the Pull Requests based on Year-Month.
76 changes: 76 additions & 0 deletions docs/pr-analysis/graphql_query.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,76 @@
from string import Template
from pandas import json_normalize
import pandas as pd
import requests
import os

# Personal access token used to authenticate with the GitHub API.
# Read from the environment (see README: `export GH_TOKEN=<token>`);
# raises KeyError at import time when GH_TOKEN is not set.
GH_TOKEN = os.environ['GH_TOKEN']
# Authorization header attached to every GraphQL request.
headers = {"Authorization": f"Bearer {GH_TOKEN}"}


def run_query(query):
    """POST *query* to the GitHub GraphQL endpoint and return the
    decoded JSON response.

    Raises an Exception carrying the HTTP status code and the query
    text when the request is not fulfilled (status code != 200).
    """
    response = requests.post('https://api.github.com/graphql',
                             json={'query': query}, headers=headers)
    # Guard clause: anything other than 200 is treated as a failure.
    if response.status_code != 200:
        raise Exception(
            "Query failed to run by returning code of {}. {}".format(
                response.status_code, query))
    return response.json()


def build_query(pr_cursor):
return Template("""{
repository(owner: "redhat-openshift-ecosystem", name: "operator-test-playbooks") {
pullRequests(first: 15, after: $cursor) {
pageInfo{
hasNextPage
endCursor
}
edges {
node {
author {
login
}
mergedBy {
login
}
createdAt
mergedAt
title
url
}
}
}
}
}
""").substitute({'cursor': pr_cursor})


def format_cursor(cursor):
    """Wrap *cursor* in double quotes, as the GraphQL API requires
    string cursors to appear in the query text."""
    return f'"{cursor}"'


def get_PR_data(cursor):
    """Fetch every pull request of the repository, page by page, and
    return them as a single pandas DataFrame.

    Parameters
    ----------
    cursor : str or None
        GraphQL end-cursor to resume pagination from; pass None to
        start at the first page.
    """
    all_data = []
    hasNextPage = True
    while hasNextPage:
        # GraphQL expects the literal `null` for the first page and a
        # double-quoted cursor string for subsequent pages.
        cursor = "null" if cursor is None else format_cursor(cursor)
        getPRinfo = build_query(cursor)
        result = run_query(getPRinfo)
        # Flatten the nested JSON: one row per PR edge.
        data_frame = pd.json_normalize(result['data']['repository']['pullRequests']['edges'])
        page_info = pd.json_normalize(result['data']['repository']['pullRequests']['pageInfo'])
        all_data.append(data_frame)
        cursor = page_info.loc[0, 'endCursor']  # update cursor
        hasNextPage = page_info.loc[0, 'hasNextPage']  # update hasNextPage
    res_data = pd.concat(all_data)  # creating a df with all PRs
    # Drop the mergedBy column; it is not used by the analysis.
    # NOTE(review): json_normalize emits a plain 'node.mergedBy' column
    # only for pages where mergedBy is null (otherwise it flattens to
    # 'node.mergedBy.login'), so this pop() could raise KeyError if no
    # page contains a null mergedBy — confirm against live data.
    res_data.pop('node.mergedBy')

    return res_data
14 changes: 14 additions & 0 deletions docs/pr-analysis/requirements.txt
Original file line number Diff line number Diff line change
@@ -0,0 +1,14 @@
jsonpatch==1.32
jsonpointer==2.1
jsonschema==3.2.0
matplotlib==3.4.2
numpy==1.21.1
pandas==1.3.0
python-dateutil==2.8.0
requests==2.25.1
requests-file==1.4.3
requests-ftp==0.3.1
requests-ntlm==1.1.0
requestsexceptions==1.4.0
simplejson==3.17.2
ujson==4.0.2
1 change: 1 addition & 0 deletions mkdocs.yml
Original file line number Diff line number Diff line change
@@ -0,0 +1 @@
site_name: Operator Test Playbooks - PR History