Skip to content

Commit 3a9299b

Browse files
committed
cvp-2129 PR analysis
1 parent f588eba commit 3a9299b

File tree

6 files changed

+480
-0
lines changed

6 files changed

+480
-0
lines changed

pr-analysis/AvgMergeTimes.png

125 KB
Loading

pr-analysis/PR_Info_Monthly.csv

Lines changed: 205 additions & 0 deletions
Large diffs are not rendered by default.

pr-analysis/PR_analysis.py

Lines changed: 173 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,173 @@
1+
from pandas.core.frame import DataFrame
2+
import pandas as pd
3+
from pandas import json_normalize
4+
import numpy as np
5+
import matplotlib.pyplot as plt
6+
import time
7+
from datetime import datetime, timedelta
8+
import matplotlib.dates as mdates
9+
from matplotlib.dates import DayLocator, HourLocator, DateFormatter, drange
10+
from graphql_query import get_PR_data
11+
12+
13+
def createDateColumn(dataframe):
    """Add a 'Date Merged' column of real date objects to the frame.

    Each 'node.mergedAt' GitHub timestamp string is parsed with the
    "%Y-%m-%dT%H:%M:%SZ" format; unmerged PRs (None) are recorded as the
    sentinel string "None" so row counts stay aligned.
    """
    timestamp_format = r"%Y-%m-%dT%H:%M:%SZ"
    merged_dates = []
    for raw in dataframe['node.mergedAt']:
        if raw is None:
            # PR was never merged — keep a placeholder entry.
            merged_dates.append("None")
        else:
            # Parse the timestamp string and keep only the calendar date.
            merged_dates.append(datetime.strptime(raw, timestamp_format).date())
    dataframe['Date Merged'] = merged_dates

    return dataframe
31+
32+
33+
def numPRMerged_graph(df):
    """Bar-plot the number of PRs merged per date; save as PRmergeRates.png."""
    merge_dates = df['dates']
    merge_counts = df['counts']

    # Build an evenly spaced (12-week) grid spanning the first to last merge
    # date; its smallest spacing becomes the bar width.
    grid = mdates.drange(min(merge_dates), max(merge_dates), timedelta(weeks=12))

    fig, ax = plt.subplots()

    # matplotlib works internally with float-based dates, hence the conversion.
    grid = mdates.date2num(grid)
    bar_width = np.diff(grid).min()

    # Plot the per-date counts directly against the date values rather than
    # against positional x-values of [0, 1, 2, ...].
    ax.bar(merge_dates, merge_counts.tolist(), align='center', width=bar_width, ec='blue')

    # Interpret the x-axis values as dates.
    ax.xaxis_date()

    # Make space for and rotate the x-axis tick labels.
    fig.autofmt_xdate()
    plt.ylabel('Counts')
    plt.xlabel('Dates')
    plt.title('Number of PRs merged over time')
    plt.savefig('PRmergeRates.png', dpi=400)
    plt.show()
66+
67+
68+
def computeMergetime(created_at, merged_at):
    """Return the whole number of days between PR creation and merge.

    Both arguments are GitHub "%Y-%m-%dT%H:%M:%SZ" timestamp strings.
    The result is truncated toward zero (int), not rounded.
    """
    fmt = r"%Y-%m-%dT%H:%M:%SZ"
    delta = datetime.strptime(merged_at, fmt) - datetime.strptime(created_at, fmt)
    # 86400 seconds in a day.
    return int(delta.total_seconds() / 86400)
76+
77+
78+
def addlabels(x, y):
    """Annotate the current bar chart: draw value y[i] centred over bar i."""
    for idx, _ in enumerate(x):
        plt.text(idx, y[idx], y[idx], ha='center')
81+
82+
83+
def avgMergetime_graph(df):
    """Bar-plot monthly average merge times; save as AvgMergeTimes.png."""
    months = df['Merged_YM']
    averages = df['mergetime']

    fig, ax = plt.subplots()
    positions = np.arange(len(months))  # one x-axis slot per month
    plt.bar(positions, averages)
    plt.xticks(positions, months)  # label each slot with its month

    # Make space for and rotate the x-axis tick labels.
    fig.autofmt_xdate()
    ax.xaxis_date()
    addlabels(months, averages)

    plt.xlabel("Dates")
    plt.ylabel("Merge Time in Days")
    plt.title("Avg Merge Times")
    plt.savefig('AvgMergeTimes.png', dpi=400)
    plt.show()
101+
102+
103+
def avgMergetime(df):
    """Compute each PR's merge time, average it per month, and plot the result."""
    # 1. Per-PR merge time in whole days ("None" for PRs never merged).
    merge_days = []
    for _, row in df.iterrows():
        merged_at = row.loc['node.mergedAt']
        if merged_at is None:
            merge_days.append("None")
        else:
            merge_days.append(computeMergetime(row.loc['node.createdAt'],
                                               merged_at))
    df['mergetime'] = merge_days

    # 2. Average merge time per merge month. Unmerged PRs get NaT for
    #    Merged_YM and therefore drop out of the groupby.
    df['Merged_YM'] = pd.to_datetime(df['node.mergedAt']).dt.to_period('M')
    monthly = df.filter(['Merged_YM', 'mergetime'], axis=1)
    mean_df = monthly.groupby('Merged_YM')['mergetime'].mean().reset_index()
    # Truncate the float averages to whole days.
    mean_df['mergetime'] = mean_df.mergetime.astype(int)

    # 3. Bar graph of the monthly averages.
    avgMergetime_graph(mean_df)
129+
130+
131+
def getMonthlyPRinfo(df):
    """Write PR_Info_Monthly.csv listing each PR's merge month, title and URL.

    Assumes df already carries the 'Merged_YM' column added by avgMergetime();
    DataFrame.filter silently drops any column that is missing — TODO confirm
    callers always run avgMergetime first.
    """
    new_df = df.filter(['Merged_YM', 'node.title', 'node.url'], axis=1)
    # The original called new_df.groupby('Merged_YM') here and discarded the
    # result; groupby returns a new object, so that call was a no-op and has
    # been removed.
    new_df.to_csv('PR_Info_Monthly.csv', index=False)
138+
139+
140+
def process_data(dataframe):
    """Drive the analysis: build the merge-date column, then produce both
    graphs and the monthly CSV from the queried PR data."""
    # Parse the merge timestamps into real date objects.
    dataframe = createDateColumn(dataframe)

    # Count how many PRs were merged on each date and turn the result into a
    # two-column frame.
    counts = dataframe['Date Merged'].value_counts()
    freq_df = pd.DataFrame(counts)
    freq_df = freq_df.reset_index()
    freq_df.columns = ['dates', 'counts']
    # Drop the sentinel row for PRs that were never merged.
    merged_only = freq_df.loc[freq_df["dates"] != "None"]

    # 1. Graph: number of PRs merged over time.
    numPRMerged_graph(merged_only)
    # 2. Graph: average PR merge time per month.
    avgMergetime(dataframe)
    # 3. CSV table with PR info for each month.
    getMonthlyPRinfo(dataframe)
    # TODO: pie chart of author-is-merger vs author-is-not-the-merger.
164+
165+
166+
def main():
    """Fetch all PR data via the GraphQL query and run the full analysis."""
    # None means "start from the first page" (no cursor yet).
    res_data = get_PR_data(None)
    process_data(res_data)


# Guard the entry point so importing this module does not immediately run the
# network-bound analysis (the original called main() unconditionally).
if __name__ == "__main__":
    main()

pr-analysis/PRmergeRates.png

92.8 KB
Loading

pr-analysis/README.md

Lines changed: 26 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,26 @@
1+
# PR Analysis on operator-test-playbooks
2+
3+
To run the code, you can use this command: **python PR_analysis.py**
4+
5+
**NOTE** Before you run the code, you must add a personal github token. The token can be generated by following this [guide](https://docs.github.com/en/github/authenticating-to-github/keeping-your-account-and-data-secure/creating-a-personal-access-token)
6+
7+
### Two ways to add token:
8+
9+
1. You can do this by adding the token in **graphql_query.py** file by replacing **Add_Your_Token_Here** in the headers (uncomment this line).
10+
Make sure to remove this token before pushing any changes.
11+
12+
2. You can add the token in the command line by following this [guide](https://docs.github.com/en/github/authenticating-to-github/keeping-your-account-and-data-secure/creating-a-personal-access-token#using-a-token-on-the-command-line)
13+
14+
Once the code is finished running, you will have two png files and one csv file saved in your folder. These are the graphs from the analysis of the queried data and the information about which PRs have been merged in each month.
15+
16+
# Operator Test Playbooks
17+
18+
## History
19+
20+
## Number of PRs over time
21+
![Number of PRs over time](PRmergeRates.png)
22+
Download : [png](PRmergeRates.png)
23+
24+
## Average PR merging time
25+
![Average PR merging time](AvgMergeTimes.png)
26+
Download : [png](AvgMergeTimes.png)

pr-analysis/graphql_query.py

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
from string import Template
2+
from python_graphql_client import GraphqlClient
3+
import pandas as pd
4+
import requests
5+
import os
6+
7+
# GitHub API auth header. Prefer the GITHUB_TOKEN environment variable so a
# real token never has to be committed; the placeholder default keeps the old
# "paste your token here" workflow working as a fallback.
# Remember to remove any pasted token before pushing changes.
headers = {"Authorization": "token " + os.environ.get("GITHUB_TOKEN",
                                                      "Add_Your_Token_Here")}
9+
10+
11+
def run_query(query):
    """POST the GraphQL query to the GitHub API and return the decoded JSON.

    Raises Exception (with the HTTP status code and the query text) whenever
    GitHub answers anything other than 200.
    """
    response = requests.post('https://api.github.com/graphql',
                             json={'query': query}, headers=headers)
    # 200 means the request was fulfilled.
    if response.status_code != 200:
        raise Exception(
            "Query failed to run by returning code of {}. {}".format(
                response.status_code, query))
    return response.json()
23+
24+
25+
def build_query(pr_cursor):
    """Return the paginated pull-request query with $cursor substituted.

    pr_cursor must already be GraphQL-ready: either the literal string
    "null" or a double-quoted cursor (see format_cursor).
    """
    query_template = Template("""{
    repository(owner: "redhat-openshift-ecosystem", name: "operator-test-playbooks") {
        pullRequests(first: 15, after: $cursor) {
            pageInfo{
                hasNextPage
                endCursor
            }
            edges {
                node {
                    author {
                        login
                    }
                    mergedBy {
                        login
                    }
                    createdAt
                    mergedAt
                    title
                    url
                }
            }
        }
    }
}
""")
    return query_template.substitute(cursor=pr_cursor)
51+
52+
53+
def format_cursor(cursor):
    """Wrap a cursor in double quotation marks, as the GraphQL API requires."""
    return f'"{cursor}"'
56+
57+
58+
def get_PR_data(cursor):
    """Page through every pull request and return one combined DataFrame.

    cursor is the raw GraphQL cursor to start from (None for the first page).
    """
    pages = []
    has_next = True
    while has_next:
        # The API expects either the literal null or a double-quoted cursor.
        cursor = "null" if cursor is None else format_cursor(cursor)
        result = run_query(build_query(cursor))
        pull_requests = result['data']['repository']['pullRequests']
        pages.append(pd.json_normalize(pull_requests['edges']))
        page_info = pd.json_normalize(pull_requests['pageInfo'])
        cursor = page_info.loc[0, 'endCursor']        # raw cursor for next page
        has_next = page_info.loc[0, 'hasNextPage']
    res_data = pd.concat(pages)  # one frame with all PRs
    # Drop the 'node.mergedBy' column. NOTE(review): DataFrame.pop raises
    # KeyError if the column is absent — presumably json_normalize always
    # produces it for this query; verify against real API responses.
    res_data.pop('node.mergedBy')

    return res_data

0 commit comments

Comments
 (0)