To run:
python fetch_metrics.py --users=richkuz,rkuzsma
It outputs a chart for each user showing activity (PRs, commits, lines changed, and cycle time), plus an overall chart for all users over time.
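The script is configured through environment variables (see the CONFIGURATION block in the code). A typical setup, assuming a personal access token with read access to the repos in question (the token and org values below are placeholders):
export GITHUB_TOKEN='your_token_here'
export GITHUB_ORG='my-org'   # optional: limit results to one org
export MONTHS_BACK=6         # optional: defaults to 12
python fetch_metrics.py --users=richkuz,rkuzsma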
All of this code was generated by Cursor CLI with Gemini Flash 3.
import os
import argparse
from github import Github, Auth
from datetime import datetime, timedelta, timezone
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed
# --- CONFIGURATION ---
ACCESS_TOKEN = os.getenv('GITHUB_TOKEN', 'YOUR_GITHUB_TOKEN')
USER_NAMES = os.getenv('GITHUB_USERS', 'user1,user2') # Comma-separated list
ORG_FILTER = os.getenv('GITHUB_ORG', '') # Optional: limit to a specific org
MONTHS_BACK = int(os.getenv('MONTHS_BACK', 12))
# ---------------------
auth = Auth.Token(ACCESS_TOKEN)
g = Github(auth=auth, per_page=100)
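# Start of the lookback window; months are approximated as 30 days each.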
start_date = datetime.now(timezone.utc) - timedelta(days=MONTHS_BACK * 30)
def process_pr(issue):
try:
# Parse repo name from URL to avoid extra API calls
url_parts = issue.url.split('/')
repo_name = f"{url_parts[4]}/{url_parts[5]}"
pr = issue.as_pull_request()
# 1. Cycle Time (Creation to Merge)
pr_merged_at = pr.merged_at
pr_created_at = pr.created_at
        # Normalize to UTC in case the API returns naive datetimes
        if pr_merged_at.tzinfo is None:
            pr_merged_at = pr_merged_at.replace(tzinfo=timezone.utc)
        if pr_created_at.tzinfo is None:
            pr_created_at = pr_created_at.replace(tzinfo=timezone.utc)
cycle_time_hrs = (pr_merged_at - pr_created_at).total_seconds() / 3600
# These attributes trigger an API call to fetch PR details
additions = pr.additions
deletions = pr.deletions
size = additions + deletions
commits = pr.commits
# This triggers another API call
reviews = pr.get_reviews().totalCount
return {
'repo': repo_name,
'number': pr.number,
'title': pr.title,
'user': pr.user.login,
            'date': pr_merged_at.date(),  # use the tz-normalized timestamp from above
'cycle_time_hrs': cycle_time_hrs,
'size': size,
'commits': commits,
'reviews_received': reviews,
'additions': additions,
'deletions': deletions
}
except Exception as e:
print(f" Error processing PR #{issue.number}: {e}")
return None
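# Search for a user's merged PRs and process only those not already in the local cache.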
def fetch_metrics_for_user(username, since_date=None, existing_keys=None):
if existing_keys is None:
existing_keys = set()
    # Format the fetch-from date as YYYY-MM-DD for the search query;
    # fall back to the global start date when no since_date is given.
    if hasattr(since_date, 'strftime'):
        fetch_from_str = since_date.strftime('%Y-%m-%d')
    else:
        fetch_from_str = start_date.strftime('%Y-%m-%d')
# GitHub search query for merged PRs by this author
query = f"author:{username} is:pr is:merged merged:>={fetch_from_str}"
if ORG_FILTER:
query += f" org:{ORG_FILTER}"
print(f"Applying ORG_FILTER: {ORG_FILTER}")
print(f"Searching for merged PRs by {username}: {query}...")
try:
pulls = g.search_issues(query=query)
total_count = pulls.totalCount
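        # Note: the GitHub Search API returns at most 1,000 results per query.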
print(f"DEBUG: GitHub Search found {total_count} PRs for {username} since {fetch_from_str}")
if total_count == 0:
print(f"No merged PRs found for {username} matching query.")
return []
# Filter out PRs already in cache
new_pulls = []
for p in pulls:
url_parts = p.url.split('/')
repo_name = f"{url_parts[4]}/{url_parts[5]}"
key = f"{repo_name}#{p.number}"
if key not in existing_keys:
new_pulls.append(p)
if not new_pulls:
print(f"All {total_count} PRs for {username} are already in cache. Skipping fetch.")
return []
print(f"Processing {len(new_pulls)} new PRs (out of {total_count} total) in parallel...")
user_data = []
# Use ThreadPoolExecutor for parallel processing
with ThreadPoolExecutor(max_workers=10) as executor:
futures = {executor.submit(process_pr, issue): issue for issue in new_pulls}
count = 0
for future in as_completed(futures):
result = future.result()
if result:
user_data.append(result)
count += 1
if count % 20 == 0:
print(f" Processed {count}/{len(new_pulls)} PRs for {username}...")
return user_data
except Exception as e:
print(f"Error searching PRs for {username}: {e}")
return []
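# Load the CSV cache, fetch only new PRs for each user, and return the merged dataset.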
def fetch_all_metrics():
cache_file = 'github_metrics.csv'
existing_df = pd.DataFrame()
existing_keys = set()
if os.path.exists(cache_file):
print(f"Loading existing data from {cache_file}...")
existing_df = pd.read_csv(cache_file)
# Ensure date column is datetime objects for comparison
existing_df['date'] = pd.to_datetime(existing_df['date']).dt.date
if not existing_df.empty:
# Create a set of keys (repo#number) for fast lookup
existing_df['key'] = existing_df['repo'] + "#" + existing_df['number'].astype(str)
existing_keys = set(existing_df['key'].tolist())
users = [u.strip() for u in USER_NAMES.split(',')]
all_new_data = []
for username in users:
fetch_from = start_date.date()
if not existing_df.empty and 'user' in existing_df.columns:
user_cache = existing_df[existing_df['user'].str.lower() == username.lower()]
if not user_cache.empty:
cache_min = user_cache['date'].min()
cache_max = user_cache['date'].max()
if cache_min <= fetch_from:
fetch_from = cache_max
print(f"User {username} has cache from {cache_min} to {cache_max}. Checking for updates since {fetch_from}...")
else:
print(f"User {username} cache starts at {cache_min}, but we need data from {fetch_from}. Re-fetching older data...")
all_new_data.extend(fetch_metrics_for_user(username, fetch_from, existing_keys))
if not all_new_data:
print("No new data to add.")
if not existing_df.empty and 'key' in existing_df.columns:
existing_df = existing_df.drop(columns=['key'])
return existing_df
new_df = pd.DataFrame(all_new_data)
# Combine and deduplicate
combined_df = pd.concat([existing_df, new_df], ignore_index=True)
combined_df = combined_df.drop_duplicates(subset=['repo', 'number'], keep='last')
combined_df = combined_df.sort_values('date')
if 'key' in combined_df.columns:
combined_df = combined_df.drop(columns=['key'])
return combined_df
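# Draw a three-axis bar chart: PR count (left), with commits and lines changed on two right-hand axes.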
def create_activity_plot(ax, stats, title, all_months):
x_labels = [str(m) for m in all_months]
x_pos = np.arange(len(all_months))
# Widths for visual hierarchy
width_pr = 0.4
width_commits = 0.3
width_lines = 0.2
# Create twin axes
ax_twin_commits = ax.twinx()
ax_twin_lines = ax.twinx()
# Position the axes
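    # The lines-changed spine is pushed 60 points outward so its labels clear the commits axis.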
ax_twin_lines.spines['right'].set_position(('outward', 60))
# 1. Total Lines (Tertiary) - FURTHEST RIGHT - Lowest Z-order
ax_twin_lines.bar(x_pos + 0.25, stats['total_size'].fillna(0), width_lines, color='#E2E8F0', label='Lines Changed')
ax_twin_lines.set_ylabel('Total Lines Changed', color='#64748B')
ax_twin_lines.set_zorder(1)
# 2. Total Commits (Secondary) - MIDDLE RIGHT - Middle Z-order
ax_twin_commits.bar(x_pos, stats['total_commits'].fillna(0), width_commits, color='#94A3B8', label='Commits')
ax_twin_commits.set_ylabel('Total Commits', color='#475569')
ax_twin_commits.set_zorder(2)
# 3. PR Count (Primary) - LEFT - Top Z-order
ax.bar(x_pos - 0.2, stats['pr_count'].fillna(0), width_pr, color='#2E5BFF', edgecolor='black', linewidth=0.5, label='PRs')
ax.set_ylabel('Number of PRs', color='#1E40AF', fontweight='bold')
ax.set_xticks(x_pos)
ax.set_xticklabels(x_labels, rotation=45, ha='right')
ax.set_zorder(3)
ax.patch.set_visible(False)
ax.set_title(title, fontweight='bold', pad=20)
# Combined legend
h1, l1 = ax.get_legend_handles_labels()
h2, l2 = ax_twin_lines.get_legend_handles_labels()
h3, l3 = ax_twin_commits.get_legend_handles_labels()
ax.legend(h1 + h3 + h2, l1 + l3 + l2, loc='upper left', prop={'size': 8}, frameon=True)
def generate_user_report(df, username):
user_df = df[df['user'].str.lower() == username.lower()].copy()
if user_df.empty:
print(f"Warning: No data found for user {username}")
return ""
user_df['month'] = pd.to_datetime(user_df['date']).dt.to_period('M')
all_months = pd.period_range(start=user_df['month'].min(), end=user_df['month'].max(), freq='M')
def get_monthly_stats(data_df):
if 'commits' not in data_df.columns:
data_df['commits'] = 0
return data_df.groupby('month').agg(
pr_count=('repo', 'count'),
total_size=('size', 'sum'),
total_commits=('commits', 'sum'),
median_cycle=('cycle_time_hrs', 'median')
).reindex(all_months)
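    # Treat the top decile by size or cycle time as outliers; with 5 or fewer PRs, keep everything.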
size_90 = user_df['size'].quantile(0.9) if len(user_df) > 5 else float('inf')
cycle_90 = user_df['cycle_time_hrs'].quantile(0.9) if len(user_df) > 5 else float('inf')
user_df_clean = user_df[(user_df['size'] <= size_90) & (user_df['cycle_time_hrs'] <= cycle_90)].copy()
stats_full = get_monthly_stats(user_df)
stats_clean = get_monthly_stats(user_df_clean)
fig, axes = plt.subplots(1, 2, figsize=(20, 8))
plt.subplots_adjust(wspace=0.6, bottom=0.2, right=0.85)
create_activity_plot(axes[0], stats_full, f'Activity (All PRs): {username}', all_months)
create_activity_plot(axes[1], stats_clean, f'Activity (Outliers Removed): {username}', all_months)
chart_filename = f'user_activity_{username}.png'
plt.savefig(chart_filename)
plt.close()
fig, ax = plt.subplots(figsize=(10, 6))
x_pos = np.arange(len(all_months))
ax.plot(x_pos, stats_full['median_cycle'], marker='o', label='All PRs', color='orange', linewidth=2)
if len(user_df) > 5:
ax.plot(x_pos, stats_clean['median_cycle'], marker='s', label='All PRs (Outliers Removed)', color='green', linestyle='--', linewidth=2)
ax.set_title(f'Cycle Time Trends: {username}')
ax.set_ylabel('Median Hours to Merge')
ax.set_xticks(x_pos)
ax.set_xticklabels([str(m) for m in all_months], rotation=45, ha='right')
ax.legend()
ax.grid(True, alpha=0.3)
cycle_chart_filename = f'user_cycle_{username}.png'
plt.savefig(cycle_chart_filename)
plt.close()
outlier_count = len(user_df) - len(user_df_clean)
outliers_df = user_df[(user_df['size'] > size_90) | (user_df['cycle_time_hrs'] > cycle_90)].copy()
outliers_df = outliers_df.sort_values('date', ascending=False)
outlier_rows = ""
for _, row in outliers_df.iterrows():
outlier_rows += f"<tr><td>#{row['number']}</td><td>{row['date']}</td><td>{row['title']}</td><td>{row['commits']}</td><td>{row['size']}</td><td>{row['cycle_time_hrs']:.1f}h</td></tr>"
outlier_table = f"""
<details style="margin-top: 15px; border: 1px solid #eab308; border-radius: 4px; background: white;">
<summary style="cursor: pointer; padding: 10px; font-weight: bold; color: #854d0e; background-color: #fefce8;">
▶ View {outlier_count} Outlier PR Details
</summary>
<div style="padding: 10px; overflow-x: auto;">
<table style="font-size: 0.9em; width: 100%; margin-bottom: 0;">
<thead>
<tr style="background-color: #fef9c3;">
<th>PR #</th>
<th>Date</th>
<th>Title</th>
<th>Commits</th>
<th>Lines Changed</th>
<th>Cycle Time</th>
</tr>
</thead>
<tbody>
{outlier_rows}
</tbody>
</table>
</div>
</details>
""" if outlier_count > 0 else ""
outlier_desc = f"""
<div style="background-color: #fffbdd; padding: 15px; border-left: 5px solid #d4a017; margin: 20px 0; border-radius: 4px;">
<strong>Outlier Breakdown:</strong> We identified <strong>{outlier_count}</strong> PRs that were significantly larger or took much longer than usual.
These are removed from the "Activity (Outliers Removed)" and cycle time trend lines.
<br><em>Thresholds: Size > {size_90:.0f} lines OR Cycle Time > {cycle_90:.1f}h.</em>
{outlier_table}
</div>
""" if outlier_count > 0 else "<p>No significant outliers detected.</p>"
    # Pre-compute medians so the table expressions stay readable
    median_full = user_df['cycle_time_hrs'].median()
    median_clean = user_df_clean['cycle_time_hrs'].median()
    cycle_delta_pct = ((median_full - median_clean) / median_full * 100) if median_full > 0 else 0
    return f"""<div class="user-section">
    <h2>User: {username}</h2>
    <div class="chart-container"><img src="{chart_filename}" alt="Activity for {username}"></div>
    <div class="chart-container" style="max-width: 800px; margin: 0 auto 30px auto;"><img src="{cycle_chart_filename}" alt="Cycle Time for {username}"></div>
    {outlier_desc}
    <table>
      <thead><tr><th>Metric (Last {MONTHS_BACK} Mo)</th><th>Full History</th><th>Typical Work</th><th>Impact</th></tr></thead>
      <tbody>
        <tr><td>Total Merged PRs</td><td>{len(user_df)}</td><td>{len(user_df_clean)}</td><td>-{outlier_count} PRs</td></tr>
        <tr><td>Median Cycle Time</td><td>{median_full:.1f}h</td><td>{median_clean:.1f}h</td><td>{cycle_delta_pct:.1f}% faster</td></tr>
        <tr><td>Total Lines Changed</td><td>{user_df['size'].sum():,.0f}</td><td>{user_df_clean['size'].sum():,.0f}</td><td>-{(user_df['size'].sum() - user_df_clean['size'].sum()):,.0f} lines</td></tr>
        <tr><td>Total Commits</td><td>{user_df['commits'].sum():,.0f}</td><td>{user_df_clean['commits'].sum():,.0f}</td><td>-{(user_df['commits'].sum() - user_df_clean['commits'].sum()):,.0f} commits</td></tr>
      </tbody>
    </table>
    </div>"""
def generate_team_report(df):
    if df.empty:
        return ""
df['month'] = pd.to_datetime(df['date']).dt.to_period('M')
all_months = pd.period_range(start=df['month'].min(), end=df['month'].max(), freq='M')
def get_monthly_stats(data_df):
        if 'commits' not in data_df.columns:
            data_df['commits'] = 0
        return data_df.groupby('month').agg(
            pr_count=('repo', 'count'),
            total_size=('size', 'sum'),
            total_commits=('commits', 'sum'),
            median_cycle=('cycle_time_hrs', 'median')
        ).reindex(all_months)
size_90 = df['size'].quantile(0.9) if len(df) > 5 else float('inf')
cycle_90 = df['cycle_time_hrs'].quantile(0.9) if len(df) > 5 else float('inf')
df_clean = df[(df['size'] <= size_90) & (df['cycle_time_hrs'] <= cycle_90)].copy()
stats_full = get_monthly_stats(df)
stats_clean = get_monthly_stats(df_clean)
fig, axes = plt.subplots(1, 2, figsize=(20, 8))
plt.subplots_adjust(wspace=0.6, bottom=0.2, right=0.85)
create_activity_plot(axes[0], stats_full, 'Team Activity (All PRs)', all_months)
create_activity_plot(axes[1], stats_clean, 'Team Activity (Outliers Removed)', all_months)
plt.savefig('team_activity.png')
plt.close()
fig, ax = plt.subplots(figsize=(10, 6))
x_pos = np.arange(len(all_months))
ax.plot(x_pos, stats_full['median_cycle'], marker='o', label='All PRs', color='orange', linewidth=2)
if len(df) > 5:
ax.plot(x_pos, stats_clean['median_cycle'], marker='s', label='All PRs (Outliers Removed)', color='green', linestyle='--', linewidth=2)
ax.set_title('Overall Team Cycle Time Trends')
ax.set_ylabel('Median Hours to Merge')
ax.set_xticks(x_pos)
ax.set_xticklabels([str(m) for m in all_months], rotation=45, ha='right')
ax.legend()
ax.grid(True, alpha=0.3)
plt.savefig('team_cycle.png')
plt.close()
return f"""
<div class="user-section">
<h1>Overall Team Trends</h1>
<div class="chart-container">
<img src="team_activity.png" alt="Team Activity">
</div>
<div class="chart-container" style="max-width: 800px; margin: 0 auto 30px auto;">
<img src="team_cycle.png" alt="Team Cycle Time">
</div>
<table>
<thead>
<tr>
<th>Metric (Team Total - Last {MONTHS_BACK} Mo)</th>
<th>Full History</th>
<th>Typical Work (Clean)</th>
</tr>
</thead>
<tbody>
<tr><td>Total Merged PRs</td><td>{len(df)}</td><td>{len(df_clean)}</td></tr>
<tr><td>Median Cycle Time</td><td>{df['cycle_time_hrs'].median():.1f}h</td><td>{df_clean['cycle_time_hrs'].median():.1f}h</td></tr>
<tr><td>Total Lines Changed</td><td>{df['size'].sum():,.0f}</td><td>{df_clean['size'].sum():,.0f}</td></tr>
</tbody>
</table>
</div>
"""
def generate_html_report(df, user_reports_html, team_report_html):
total_prs = len(df)
unique_users = sorted(df['user'].unique().tolist(), key=str.lower)
total_authors = len(unique_users)
avg_cycle_time = df['cycle_time_hrs'].mean()
median_cycle_time = df['cycle_time_hrs'].median()
user_names_list = ", ".join(unique_users)
html_content = f"""
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="UTF-8">
<meta name="viewport" content="width=device-width, initial-scale=1.0">
<title>GitHub Code Trends</title>
<style>
body {{ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; line-height: 1.6; color: #333; max-width: 1200px; margin: 0 auto; padding: 20px; background-color: #f6f8fa; }}
h1, h2, h3 {{ color: #24292e; border-bottom: 1px solid #eaecef; padding-bottom: 10px; }}
.stats-grid {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 20px; margin-bottom: 30px; }}
.stat-card {{ background: white; padding: 20px; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12); text-align: center; }}
.stat-value {{ font-size: 24px; font-weight: bold; color: #0366d6; }}
.stat-label {{ font-size: 14px; color: #586069; }}
.user-list {{ background: #fff; padding: 15px; border-radius: 6px; border: 1px solid #eaecef; margin-bottom: 30px; font-size: 0.95em; }}
.chart-container {{ background: white; padding: 20px; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12); margin-bottom: 30px; text-align: center; }}
.chart-container img {{ max-width: 100%; height: auto; }}
.user-section {{ background: white; padding: 20px; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12); margin-bottom: 50px; }}
table {{ width: 100%; border-collapse: collapse; margin-bottom: 30px; background: white; }}
th, td {{ text-align: left; padding: 12px; border-bottom: 1px solid #eaecef; }}
th {{ background-color: #f6f8fa; }}
tr:hover {{ background-color: #f1f8ff; }}
</style>
</head>
<body>
<h1>GitHub Code Trends</h1>
<p>Generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
<div class="user-list">
<strong>Tracked Users ({total_authors}):</strong> {user_names_list}
</div>
<div class="stats-grid">
<div class="stat-card"><div class="stat-value">{total_prs}</div><div class="stat-label">Total Merged PRs</div></div>
<div class="stat-card"><div class="stat-value">{total_authors}</div><div class="stat-label">Tracked Users</div></div>
<div class="stat-card"><div class="stat-value">{avg_cycle_time:.1f}h</div><div class="stat-label">Avg Cycle Time</div></div>
<div class="stat-card"><div class="stat-value">{median_cycle_time:.1f}h</div><div class="stat-label">Median Cycle Time</div></div>
</div>
{team_report_html}
<h2>Individual User Trends</h2>
{user_reports_html}
</body>
</html>
"""
with open('engineering_report.html', 'w') as f:
f.write(html_content)
print("Full HTML report generated: engineering_report.html")
def generate_dashboard(df):
users = df['user'].unique()
user_reports_html = ""
for username in users:
print(f"Generating charts for {username}...")
user_reports_html += generate_user_report(df, username)
print("Generating Overall Team charts...")
team_report_html = generate_team_report(df)
generate_html_report(df, user_reports_html, team_report_html)
if __name__ == "__main__":
parser = argparse.ArgumentParser(description='Fetch GitHub metrics for users.')
parser.add_argument('--users', type=str, help='Comma-separated list of GitHub usernames')
parser.add_argument('--clear-cache', action='store_true', help='Clear the local CSV cache before running')
args = parser.parse_args()
if args.users:
USER_NAMES = args.users
if ACCESS_TOKEN == 'YOUR_GITHUB_TOKEN' or USER_NAMES == 'user1,user2':
print("Error: Please set GITHUB_TOKEN and either GITHUB_USERS environment variable or --users argument.")
print("Example: export GITHUB_TOKEN='your_token_here'")
print(" python fetch_metrics.py --users='username1,username2'")
exit(1)
if args.clear_cache and os.path.exists('github_metrics.csv'):
print("Clearing cache file.")
os.remove('github_metrics.csv')
df = fetch_all_metrics()
    if df.empty:
        print("No data found.")
        exit(0)
df.to_csv('github_metrics.csv', index=False)
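    # The CSV cache keeps full history; the dashboard below is limited to the configured window.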
dashboard_start_date = start_date.date()
df['temp_date'] = pd.to_datetime(df['date']).dt.date
df_filtered = df[df['temp_date'] >= dashboard_start_date].copy()
    if df_filtered.empty:
        print(f"No data found since {dashboard_start_date}.")
        exit(0)
generate_dashboard(df_filtered)