Commit 4512e5a

cvp-2129 PR analysis
1 parent f588eba commit 4512e5a

7 files changed: +501 −0 lines changed

pr-analysis/AvgMergeTimes.png

125 KB

pr-analysis/PR_Info_Monthly.csv

Lines changed: 205 additions & 0 deletions
Large diffs are not rendered by default.

pr-analysis/PR_analysis.py

Lines changed: 172 additions & 0 deletions
@@ -0,0 +1,172 @@
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from datetime import datetime, timedelta
import matplotlib.dates as mdates
from graphql_query import get_PR_data


def createDateColumn(dataframe):
    """Add a 'Date Merged' column holding datetime.date values
    parsed from the 'node.mergedAt' timestamp strings."""

    newDatecol = []
    format_str = r"%Y-%m-%dT%H:%M:%SZ"
    for i in dataframe['node.mergedAt']:
        if i is not None:
            # parse the ISO timestamp string and keep only the date part
            newdate = datetime.strptime(i, format_str)
            newDatecol.append(newdate.date())
        else:
            # unmerged PRs have no merge date
            newDatecol.append("None")
    dataframe['Date Merged'] = newDatecol

    return dataframe


def numPRMerged_graph(df):
    """Create a bar graph of the number of PRs merged over time."""

    # get the oldest and youngest dates from the list
    datelist = df['dates']
    oldest = min(datelist)
    youngest = max(datelist)
    timegap = 12
    dates = mdates.drange(oldest, youngest, timedelta(weeks=timegap))

    counts = df['counts']
    # set up the axes and figure
    fig, ax = plt.subplots()
    # convert the datetimes to matplotlib's float-based date format;
    # the smallest gap in the 12-week grid sets the bar width
    dates = mdates.date2num(dates)
    width = np.diff(dates).min()

    # plot the counts against the merge dates themselves rather than
    # against x-values of [0, 1, 2, ...]
    ax.bar(datelist, counts.tolist(), align='center', width=width, ec='blue')

    # tell matplotlib to interpret the x-axis values as dates
    ax.xaxis_date()

    # make space for and rotate the x-axis tick labels
    fig.autofmt_xdate()
    plt.ylabel('Counts')
    plt.xlabel('Dates')
    plt.title('Number of PRs merged over time')
    plt.savefig('PRmergeRates.png', dpi=400)
    plt.show()


def computeMergetime(created_at, merged_at):
    """Return the merge time in whole days."""

    format_str = r"%Y-%m-%dT%H:%M:%SZ"
    date_created = datetime.strptime(created_at, format_str)
    date_merged = datetime.strptime(merged_at, format_str)
    # return the difference in days [86400 secs in a day]
    time_diff = (date_merged - date_created).total_seconds() / 86400
    return int(time_diff)


def addlabels(x, y):
    """Create labels for the bars in a bar chart."""

    for i in range(len(x)):
        plt.text(i, y[i], y[i], ha='center')


def avgMergetime_graph(df):
    """Create a bar graph of the average merge time per month."""

    x = df['Merged_YM']
    y = df['mergetime']
    fig, ax = plt.subplots()
    x_pos = np.arange(len(x))
    plt.bar(x_pos, y)
    plt.xticks(x_pos, x)
    # make space for and rotate the x-axis tick labels
    fig.autofmt_xdate()
    ax.xaxis_date()
    addlabels(x, y)
    plt.xlabel("Dates")
    plt.ylabel("Merge Time in Days")
    plt.title("Avg Merge Times")
    plt.savefig('AvgMergeTimes.png', dpi=400)
    plt.show()


def avgMergetime(df):
    """Calculate the average merge time per month and produce a graph."""

    # 1. calculate the merge time for each PR and add it to the dataframe
    mergetime_ = []

    for index, row in df.iterrows():
        if row.loc['node.mergedAt'] is not None:
            mergetime = computeMergetime(row.loc['node.createdAt'],
                                         row.loc['node.mergedAt'])
            mergetime_.append(mergetime)
        else:
            # unmerged PRs get NaN so the column stays numeric
            # and groupby().mean() below works reliably
            mergetime_.append(np.nan)
    df['mergetime'] = mergetime_

    # 2. calculate the average merge time for each month
    df['Merged_YM'] = pd.to_datetime(df['node.mergedAt']).dt.to_period('M')
    new_df = df.filter(['Merged_YM', 'mergetime'], axis=1)
    group_mean = new_df.groupby('Merged_YM')['mergetime'].mean()
    mean_df = group_mean.reset_index()
    # change from float to int
    mean_df['mergetime'] = mean_df.mergetime.astype(int)

    # 3. create a bar graph
    avgMergetime_graph(mean_df)


def getMonthlyPRinfo(df):
    """Write a CSV file with the title and URL of
    the PRs merged in each month."""

    new_df = df.filter(['Merged_YM', 'node.title', 'node.url'], axis=1)
    # order the rows by month so each month's PRs appear together
    new_df = new_df.sort_values('Merged_YM')
    new_df.to_csv('PR_Info_Monthly.csv', index=False)


def process_data(dataframe):
    """Process the data gathered from the query into a dataframe
    and generate the graphs and the CSV file."""

    # add a new column for just the date, in date format
    dataframe = createDateColumn(dataframe)
    # get the frequency of each date
    frequency = dataframe['Date Merged'].value_counts()
    # convert to a dataframe and assign new names to the columns
    df_value_counts = pd.DataFrame(frequency)
    df_value_counts = df_value_counts.reset_index()
    df_value_counts.columns = ['dates', 'counts']
    # drop the row counting unmerged PRs ("None" dates)
    dateFreq = df_value_counts.loc[df_value_counts["dates"] != "None"]

    # 1. create a graph of the number of PRs merged over time
    numPRMerged_graph(dateFreq)
    # 2. create a graph of the average PR merge time
    avgMergetime(dataframe)
    # 3. a table with PR info for each month
    getMonthlyPRinfo(dataframe)


def main():
    # get data from the GraphQL query
    pr_cursor = None
    res_data = get_PR_data(pr_cursor)
    process_data(res_data)


if __name__ == "__main__":
    main()

pr-analysis/PRmergeRates.png

93 KB

pr-analysis/README.md

Lines changed: 34 additions & 0 deletions
@@ -0,0 +1,34 @@
# PR Analysis on operator-test-playbooks

## How to run the code

### Install Necessary Modules

`pip install -r requirements.txt`

**Note:** you may need to install the python3-tkinter package for the graphs to display.

### Add a Token

Before you run the code, you must add a personal GitHub token. The token can be generated by following this [guide](https://docs.github.com/en/github/authenticating-to-github/keeping-your-account-and-data-secure/creating-a-personal-access-token).

You can add the token on the command line by running `export GH_TOKEN=<yourtoken>`.
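
The export must happen in the same shell session you run the script from, because `graphql_query.py` reads the variable from the environment when it is imported:

```python
import os

# the GraphQL requests authenticate with the token taken from the environment
GH_TOKEN = os.environ['GH_TOKEN']
headers = {"Authorization": f"Bearer {GH_TOKEN}"}
```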

To run the code, use this command: `python PR_analysis.py`

### Results

Once the code has finished running, you will have two PNG files and one CSV file saved in your folder.

These are the graphs from the analysis of the queried data and a table of the PRs that were merged in each month.

In PR_Info_Monthly.csv you can look up all the pull requests merged in a given year and month.
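
For example, a quick sketch of filtering the CSV down to one month with pandas (the column names are the ones written by `getMonthlyPRinfo`; the month value is only an illustration):

```python
import pandas as pd

# load the monthly PR table written by PR_analysis.py
prs = pd.read_csv('PR_Info_Monthly.csv')

# Merged_YM is serialized as a year-month string such as "2021-06"
one_month = prs[prs['Merged_YM'] == '2021-06']
print(one_month[['node.title', 'node.url']])
```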

# Operator Test Playbooks

## History

## Number of PRs merged over time

![Number of PRs merged over time](PRmergeRates.png)

Download: [png](PRmergeRates.png)

## Average PR merge time

![Average PR merge time](AvgMergeTimes.png)

Download: [png](AvgMergeTimes.png)

pr-analysis/graphql_query.py

Lines changed: 76 additions & 0 deletions
@@ -0,0 +1,76 @@
from string import Template
import pandas as pd
import requests
import os

GH_TOKEN = os.environ['GH_TOKEN']
headers = {"Authorization": f"Bearer {GH_TOKEN}"}


def run_query(query):
    """A simple function that uses requests.post
    to make the API call. Note the json= section."""

    request = requests.post('https://api.github.com/graphql',
                            json={'query': query}, headers=headers)
    if request.status_code == 200:  # 200 means the request was fulfilled
        return request.json()
    else:
        raise Exception(
            "Query failed to run by returning code of {}. {}".format(
                request.status_code, query))


def build_query(pr_cursor):
    """Build the paginated pull-request query, starting after the given cursor."""
    return Template("""{
  repository(owner: "redhat-openshift-ecosystem", name: "operator-test-playbooks") {
    pullRequests(first: 15, after: $cursor) {
      pageInfo {
        hasNextPage
        endCursor
      }
      edges {
        node {
          author {
            login
          }
          mergedBy {
            login
          }
          createdAt
          mergedAt
          title
          url
        }
      }
    }
  }
}
""").substitute({'cursor': pr_cursor})


def format_cursor(cursor):
    """Wrap the cursor in double quotes, as required by the API."""
    return '"{}"'.format(cursor)


def get_PR_data(cursor):
    """Page through every pull request and return a data
    frame with the data returned from the query."""

    all_data = []
    hasNextPage = True
    while hasNextPage:
        cursor = "null" if cursor is None else format_cursor(cursor)
        getPRinfo = build_query(cursor)
        result = run_query(getPRinfo)
        pull_requests = result['data']['repository']['pullRequests']
        all_data.append(pd.json_normalize(pull_requests['edges']))
        page_info = pull_requests['pageInfo']
        cursor = page_info['endCursor']  # update the cursor
        hasNextPage = page_info['hasNextPage']  # update hasNextPage
    res_data = pd.concat(all_data)  # one dataframe with all PRs
    # the mergedBy details are not used in the analysis; drop the column if present
    res_data = res_data.drop(columns=['node.mergedBy'], errors='ignore')

    return res_data

pr-analysis/requirements.txt

Lines changed: 14 additions & 0 deletions
@@ -0,0 +1,14 @@
jsonpatch==1.32
jsonpointer==2.1
jsonschema==3.2.0
matplotlib==3.4.2
numpy==1.21.1
pandas==1.3.0
python-dateutil==2.8.0
requests==2.25.1
requests-file==1.4.3
requests-ftp==0.3.1
requests-ntlm==1.1.0
requestsexceptions==1.4.0
simplejson==3.17.2
ujson==4.0.2
