|
#!/usr/bin/env python2
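
"""Accumulate per-file stats (authors, assignees, lgtmers, approvers, ...) for
the files touched by a list of GitHub PRs. See main() for details of the
output format."""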
|
import argparse |
|
import json |
|
import logging |
|
import os |
|
import re |
|
import sys |
|
|
|
from collections import OrderedDict |
|
|
|
import requests |
|
|
|
# TODO: doesn't paginate, so results are capped at the first page (callers request per_page=100)
|
class DumbGitHubCache(object): |
|
def __init__(self, workdir): |
|
self.workdir = workdir |
|
if not os.path.exists(workdir): |
|
os.makedirs(workdir) |
|
|
|
self.token = os.environ['GITHUB_TOKEN'] |
|
self.host = 'https://api.github.com' |
|
self.headers = { |
|
'authorization': "token %s" % self.token |
|
} |
|
|
|
def write_json(self, data, filename): |
|
dirname = os.path.dirname(filename) |
|
if not os.path.exists(dirname): |
|
os.makedirs(dirname) |
|
with open(filename, 'w') as fp: |
|
json.dump(data, fp) |
|
|
|
def get_json(self, path, localpath): |
|
localpath = localpath[1:] if localpath.startswith("/") else localpath |
|
filename = os.path.join(self.workdir, localpath) |
|
if os.path.exists(filename): |
|
# logging.info("get %s HIT cache at %s", path, filename) |
|
with open(filename) as fp: |
|
return json.load(fp) |
|
# logging.info("get %s missed cache at %s", path, filename) |
|
r = requests.get("%s%s" % (self.host, path), headers=self.headers) |
|
r.raise_for_status() |
|
data = r.json() |
|
self.write_json(data, filename) |
|
return data |
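
# Example usage (hypothetical PR number; requires GITHUB_TOKEN to be exported):
#   cache = DumbGitHubCache('data/pr-file-info')
#   pr = cache.get_json('/repos/kubernetes/kubernetes/pulls/12345',
#                       'repos/kubernetes/kubernetes/pulls/12345.json')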
|
|
|
|
|
def loginfo(msg): |
|
logging.info(msg) |
|
|
|
|
|
def main(workdir, prfile, repo, file_regex, min_count, users): |
|
""" |
|
Given a list of pr numbers and a repo, pull down the pr, its reviews. |
|
and the files that it touches. Then walk through all prs, and |
|
accumulate some per-file stats for all files touched by all prs. |
|
|
|
eg: |
|
foo/bar/baz: |
|
assignees: |
|
spiffxp: 23 |
|
labels: |
|
size/XL: 2 |
|
size/S: 21 |
|
|
|
means: |
|
of all the prs surveyed, for those that touched file foo/bar/baz: |
|
- spiffxp was an assignee on 23 prs |
|
- the size/XL label was on 2 prs |
|
- he size/S label was on 21 prs |
|
|
|
Other keys: |
|
authors: was a pr author |
|
requested_reviewers: was requested for review on a pr |
|
lgtmers: issued /lgtm on a pr (OR clicked "approve changes" in github's ui) |
|
approvers: issued /approve on a pr |
|
triagers: issued area/kind/sig/priority/lifecycle/milestone commands on a pr |
|
holders: issued a /hold on a pr (OR clicked "request changes" in github ui) |
|
|
|
|
|
Other ideas: |
|
- sift these against OWNERS files somehow |
|
- take file of org/repo#123 instead of --repo org/repo + file of 123 |
|
- filter by date ranges |
|
- filter by labels |
|
- (maybe these should be github queries) |
|
""" |
|
|
|
setup_logging() |
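
    # assumes GITHUB_TOKEN is set in the environment (DumbGitHubCache reads it),
    # and that prfile contains one PR number per line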
|
|
|
ghcache = DumbGitHubCache(workdir) |
|
|
|
    def cache_get(endpoint):
        # reuse the API path as the on-disk cache path, eg
        # https://api.github.com/repos/org/repo/pulls/123 is cached at
        # <workdir>/repos/org/repo/pulls/123.json
        ep = endpoint.replace('https://api.github.com', '')
        return ghcache.get_json(ep, "%s.json" % ep)
|
|
|
with open(prfile) as fp: |
|
prnums = [l.strip() for l in fp] |
|
|
|
|
|
|
# IMO while some of these can be done by author, they are more meaningful |
|
# signal when done by someone other than author |
|
# TODO(spiffxp): /lgtm is meaningless if not done by a member |
|
# TODO(spiffxp): /approve is meaningless if not in OWNERS |
|
commands = { |
|
'close': re.compile(r'^/close', re.MULTILINE), |
|
'lgtm': re.compile(r'^/lgtm', re.MULTILINE), |
|
'approve': re.compile(r'^/approve', re.MULTILINE), |
|
'triage': re.compile(r'^/(remove-)?(area|kind|sig|priority|milestone|lifecycle)', re.MULTILINE), |
|
'hold': re.compile(r'^/hold$', re.MULTILINE), |
|
# other ideas: |
|
# test-shepherd: retest|test|ok-to-test |
|
# review-shepherd: (un)(assign|cc) (if not self) |
|
} |
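
    # note: these patterns are anchored to the start of a line (re.MULTILINE),
    # so a comment body like "/lgtm\nthanks!" counts as an lgtm, while an
    # inline "please /lgtm when ready" does not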
|
|
|
def process_comments(comments, processed): |
|
"""comments: [{user: ..., body: ...}], processed:{k:[] for k in commands}]""" |
|
leftovers=[] |
|
for c in comments: |
|
# ignore reviews or comments by users that no longer exist; they have user: null |
|
if c['user'] is None: |
|
continue |
|
matched = False |
|
for name, regex in commands.iteritems(): |
|
if regex.findall(c['body']): |
|
matched = True |
|
processed[name].append(c) |
|
# TODO(spiffxp): make these distinct commands? |
|
if 'state' in c: |
|
if c['state'] == "APPROVED": |
|
matched = True |
|
processed['lgtm'].append(c) |
|
elif c['state'] == "CHANGES_REQUESTED": |
|
matched = True |
|
processed['hold'].append(c) |
|
if not matched: |
|
leftovers.append(c) |
|
return leftovers |
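
    # eg: processed['lgtm'] ends up with comments whose body has a line starting
    # with /lgtm, plus reviews in the APPROVED state; the returned leftovers are
    # comments and reviews that matched no command at all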
|
|
|
# get the prs we'll be dealing with |
|
prs = {} |
|
for num in prnums: |
|
pr_id = '%s#%s' % (repo, num) |
|
url = 'https://api.github.com/repos/%s/pulls/%s' % (repo, num) |
|
loginfo("pr: %s..." % url) |
|
pr = cache_get(url) |
|
loginfo(" files...") |
|
files = cache_get("%s/files?per_page=100" % url) |
|
|
|
loginfo(" reviews...") |
|
reviews = cache_get("%s/reviews?per_page=100" % url) |
|
loginfo(" comments...") |
|
comments = cache_get("%s?per_page=100" % pr['comments_url']) |
|
loginfo(" review_comments...") |
|
review_comments = cache_get("%s?per_page=100" % pr['review_comments_url']) |
|
|
|
# things I want to filter out of reviews |
|
# - reviews of {body:"", state:"COMMENTED"}, means they dropped |
|
# review comments, so the review itself isn't meaningful |
|
github_reviews = filter(lambda x: x['body'] != "" or x['state'] != "COMMENTED", reviews) |
|
|
|
processed={name:[] for name in commands} |
|
github_reviews = process_comments(github_reviews, processed) |
|
comments = process_comments(comments, processed) |
|
review_comments = process_comments(review_comments, processed) |
|
|
|
# TODO: try parsing out events like so |
|
# did they use the native review ui to approve? cool |
|
# did they issue an approve? cool |
|
# did they issue an lgtm? cool |
|
# did they triage? cool |
|
# if their comment was none of these things, they commented |
|
# if their review_comment was none of these things, they review commented |
|
filenames = [f['filename'] for f in files] |
|
labels = [l['name'] for l in pr['labels']] |
|
prs[pr_id] = { |
|
'id': pr_id, |
|
'url': pr['html_url'], |
|
'author': pr['user']['login'], |
|
'assignees': [x['login'] for x in pr['assignees']], |
|
'requested_reviewers': [x['login'] for x in pr['requested_reviewers']], |
|
'lgtms': [{ |
|
'login': c['user']['login'], |
|
'html_url': c['html_url'], |
|
} for c in processed['lgtm']], |
|
'approves': [{ |
|
'login': c['user']['login'], |
|
'html_url': c['html_url'], |
|
} for c in processed['approve']], |
|
'triages': [{ |
|
'login': c['user']['login'], |
|
'html_url': c['html_url'], |
|
} for c in processed['triage']], |
|
'holds': [{ |
|
'login': c['user']['login'], |
|
'html_url': c['html_url'], |
|
} for c in processed['hold']], |
|
'files': filenames, |
|
'labels': labels, |
|
'merged': pr['merged'], |
|
'github_reviews': [{ |
|
'login': r['user']['login'], |
|
'body': r['body'], |
|
'state': r['state'], |
|
'html_url': r['html_url'] |
|
} for r in github_reviews], |
|
} |
|
|
|
# filter to prs touching files that match a regex |
|
file_pattern = re.compile(file_regex) |
|
prs_matching_file_regex = [] |
|
for pr_id, info in prs.iteritems(): |
|
f_files = [f for f in info['files'] if file_pattern.match(f)] |
|
relevant_files = len(f_files) != 0 |
|
if relevant_files: |
|
prs_matching_file_regex.append(info) |
|
|
|
# swap around: for every file, what prs touched it |
|
file_to_prs = {} |
|
for info in prs_matching_file_regex: |
|
for f in info['files']: |
|
file_prs = file_to_prs.get(f, []) |
|
file_prs.append(info) |
|
file_to_prs[f]=file_prs |
|
|
|
ignore_users = users is None or len(users) == 0 |
|
def user_count_tuple_matches(x): |
|
return x[1] >= min_count and (ignore_users or x[0] in users) |
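
    # the big expression below builds, for every file, per-user counts of PRs in
    # each role (author, assignees, lgtmers, ...), drops entries below --min-count
    # (or outside --users), sorts each role's users by count, and finally sorts
    # files by their total author count, so the busiest files come last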
|
|
|
    file_info = OrderedDict(sorted(
        # build each file's summary as a list of (key, value) tuples so the
        # OrderedDict keeps this key order (a plain dict literal would not)
        [(
            f, OrderedDict([
                ('author', OrderedDict(sorted(
                    filter(user_count_tuple_matches,
                        map(lambda x: (x[0], len(x[1])),
                            group_by(lambda x: x['author'], file_prs).iteritems())),
                    key=lambda x: x[1]
                ))),
                ('assignees', OrderedDict(sorted(
                    filter(user_count_tuple_matches,
                        map(lambda x: (x[0], len(x[1])),
                            categorize_by(lambda x: x['assignees'], file_prs).iteritems())),
                    key=lambda x: x[1]
                ))),
                ('requested_reviewers', OrderedDict(sorted(
                    filter(user_count_tuple_matches,
                        map(lambda x: (x[0], len(x[1])),
                            categorize_by(lambda x: x['requested_reviewers'], file_prs).iteritems())),
                    key=lambda x: x[1]
                ))),
                # ('labels', OrderedDict(sorted(
                #     filter(lambda x: x[1] >= min_count,
                #         map(lambda x: (x[0], len(x[1])),
                #             categorize_by(lambda x: x['labels'], file_prs).iteritems())),
                #     key=lambda x: x[1]
                # ))),
                # ('github_reviewers', OrderedDict(sorted(
                #     filter(lambda x: len(x[1]) >= min_count,
                #         map(lambda x: (x[0], [{
                #             'html_url': r['html_url'],
                #             'state': r['state']
                #         } for pr in x[1] for r in pr['github_reviews'] if r['login'] == x[0]]),
                #             categorize_by(lambda x: set([r['login'] for r in x['github_reviews']]), file_prs).iteritems())),
                #     key=lambda x: x[1]
                # ))),
                ('lgtmers', OrderedDict(sorted(
                    filter(user_count_tuple_matches,
                        map(lambda x: (x[0], len([[r['html_url'] for r in pr['lgtms'] if r['login'] == x[0]] for pr in x[1]])),
                            categorize_by(lambda x: set([r['login'] for r in x['lgtms']]), file_prs).iteritems())),
                    key=lambda x: x[1]
                ))),
                ('approvers', OrderedDict(sorted(
                    filter(user_count_tuple_matches,
                        map(lambda x: (x[0], len([[r['html_url'] for r in pr['approves'] if r['login'] == x[0]] for pr in x[1]])),
                            categorize_by(lambda x: set([r['login'] for r in x['approves']]), file_prs).iteritems())),
                    key=lambda x: x[1]
                ))),
                ('triagers', OrderedDict(sorted(
                    filter(user_count_tuple_matches,
                        map(lambda x: (x[0], len([[r['html_url'] for r in pr['triages'] if r['login'] == x[0]] for pr in x[1]])),
                            categorize_by(lambda x: set([r['login'] for r in x['triages']]), file_prs).iteritems())),
                    key=lambda x: x[1]
                ))),
                ('holders', OrderedDict(sorted(
                    filter(user_count_tuple_matches,
                        map(lambda x: (x[0], len([[r['html_url'] for r in pr['holds'] if r['login'] == x[0]] for pr in x[1]])),
                            categorize_by(lambda x: set([r['login'] for r in x['holds']]), file_prs).iteritems())),
                    key=lambda x: x[1]
                ))),
                # TODO(spiffxp): include commenter / review_commenter
            ])
        ) for f, file_prs in file_to_prs.iteritems()],
        # sort by files that have had the most PRs against them (by adding up author counts)
        key=lambda x: reduce(lambda r, x2: r + x2[1], x[1]['author'].items(), 0)
    ))
|
|
|
print json.dumps(file_info, indent=2) |
|
|
|
# naming things is hard: |
|
# - categorize_by: a thing can be in multiple categories |
|
# - group_by: a thing can only be in one group |
|
|
|
# given fn(value)->[keys] fn and [values], return {key:[values]} |
|
def categorize_by(fn, xs): |
|
r = {} |
|
for x in xs: |
|
ks = fn(x) |
|
for k in ks: |
|
vs = r.get(k, []) |
|
vs.append(x) |
|
r[k] = vs |
|
return r |
|
|
|
# given fn(value)->key fn and [values], return {key:[values]} |
|
def group_by(fn, xs): |
|
r = {} |
|
for x in xs: |
|
k = fn(x) |
|
vs = r.get(k, []) |
|
vs.append(x) |
|
r[k] = vs |
|
return r |
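
# eg, with hypothetical prs:
#   group_by(lambda pr: pr['author'], prs) puts each pr under exactly one author
#   categorize_by(lambda pr: pr['assignees'], prs) puts a pr with two assignees
#   under both assignees' keys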
|
|
|
def setup_logging(): |
|
"""Initialize logging to screen""" |
|
# See https://docs.python.org/2/library/logging.html#logrecord-attributes |
|
# [IWEF]mmdd HH:MM:SS.mmm] msg |
|
fmt = '%(levelname).1s%(asctime)s.%(msecs)03d] %(message)s' # pylint: disable=line-too-long |
|
datefmt = '%m%d %H:%M:%S' |
|
logging.basicConfig( |
|
level=logging.INFO, |
|
format=fmt, |
|
datefmt=datefmt, |
|
) |
|
|
|
if __name__ == '__main__': |
|
    parser = argparse.ArgumentParser(description='Summarize per-file review activity for the prs listed in prfile')
|
    parser.add_argument('prfile', nargs='?', default='prfile', help='file containing a list of pr numbers, one per line')
|
parser.add_argument('--repo', default='kubernetes/kubernetes', help='repo that will be queried') |
|
parser.add_argument('--file-regex', default='.*', help='only consider prs whose filenames match this regex') |
|
    parser.add_argument('--min-count', default=1, type=int, help='minimum count of occurrences to be included in summary')
|
parser.add_argument('--workdir', default='data/pr-file-info', help='Work directory to cache things') |
|
parser.add_argument('--users', nargs='+', help='Filter to just these users') |
|
args = parser.parse_args() |
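
    # example invocation (hypothetical script name; token elided):
    #   GITHUB_TOKEN=... ./pr_file_stats.py prfile --repo kubernetes/kubernetes \
    #       --file-regex '^test/e2e/' --min-count 2 --users spiffxp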
|
|
|
main(args.workdir, args.prfile, args.repo, args.file_regex, int(args.min_count), args.users) |