Generate GitHub Stats

To run:

python fetch_metrics.py --users=richkuz,rkuzsma
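
The script authenticates with a GitHub personal access token read from the GITHUB_TOKEN environment variable:

export GITHUB_TOKEN='your_token_here'
python fetch_metrics.py --users=richkuz,rkuzsma

Optionally, set GITHUB_ORG to limit results to a single organization and MONTHS_BACK (default 12) to change the lookback window.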

It outputs a chart for each user showing activity such as PRs, commits, lines changed, and cycle time, plus an overall chart for all users over time.
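
Everything is written to the working directory: engineering_report.html embeds the per-user charts (user_activity_<username>.png, user_cycle_<username>.png) and the team charts (team_activity.png, team_cycle.png), and fetched PR data is cached in github_metrics.csv so repeat runs only fetch new PRs. That cache is plain CSV, so you can also explore it directly; a minimal sketch, assuming the script has run at least once:

import pandas as pd

df = pd.read_csv('github_metrics.csv', parse_dates=['date'])
# Median merge cycle time per author, in hours
print(df.groupby('user')['cycle_time_hrs'].median())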

All of this code was generated by Cursor CLI with Gemini Flash 3.

import os
import sys
import argparse
from github import Github, Auth
from datetime import datetime, timedelta, timezone
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
from concurrent.futures import ThreadPoolExecutor, as_completed

# --- CONFIGURATION ---
ACCESS_TOKEN = os.getenv('GITHUB_TOKEN', 'YOUR_GITHUB_TOKEN')
USER_NAMES = os.getenv('GITHUB_USERS', 'user1,user2') # Comma-separated list
ORG_FILTER = os.getenv('GITHUB_ORG', '') # Optional: limit to a specific org
MONTHS_BACK = int(os.getenv('MONTHS_BACK', 12))
# ---------------------

auth = Auth.Token(ACCESS_TOKEN)
g = Github(auth=auth, per_page=100)
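# Approximate each month as 30 days for the lookback window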
start_date = datetime.now(timezone.utc) - timedelta(days=MONTHS_BACK * 30)

def process_pr(issue):
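    """Collect metrics for one merged PR; designed to run in a worker thread.

    Accessing pr.additions/deletions/commits and pr.get_reviews() each cost
    extra GitHub API calls beyond the initial search result.
    """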
    try:
        # Parse repo name from URL to avoid extra API calls
        url_parts = issue.url.split('/')
        repo_name = f"{url_parts[4]}/{url_parts[5]}"
        pr = issue.as_pull_request()
        
        # 1. Cycle Time (Creation to Merge)
        pr_merged_at = pr.merged_at
        pr_created_at = pr.created_at
        # Normalize to UTC-aware datetimes before subtracting
        if pr_merged_at.tzinfo is None:
            pr_merged_at = pr_merged_at.replace(tzinfo=timezone.utc)
        if pr_created_at.tzinfo is None:
            pr_created_at = pr_created_at.replace(tzinfo=timezone.utc)
        
        cycle_time_hrs = (pr_merged_at - pr_created_at).total_seconds() / 3600
        
        # These attributes trigger an API call to fetch PR details
        additions = pr.additions
        deletions = pr.deletions
        size = additions + deletions
        commits = pr.commits
        
        # This triggers another API call
        reviews = pr.get_reviews().totalCount
        
        return {
            'repo': repo_name,
            'number': pr.number,
            'title': pr.title,
            'user': pr.user.login,
            'date': pr_merged_at.date(),
            'cycle_time_hrs': cycle_time_hrs,
            'size': size,
            'commits': commits,
            'reviews_received': reviews,
            'additions': additions,
            'deletions': deletions
        }
    except Exception as e:
        print(f"  Error processing PR #{issue.number}: {e}")
        return None

def fetch_metrics_for_user(username, since_date=None, existing_keys=None):
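    """Search merged PRs authored by `username` since `since_date`.

    PRs whose repo#number key already appears in `existing_keys` (the local
    CSV cache) are skipped, so only new PRs pay the detail API calls.
    """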
    if existing_keys is None:
        existing_keys = set()
    # Format since_date (a date or datetime) for the search query
    if hasattr(since_date, 'strftime'):
        fetch_from_str = since_date.strftime('%Y-%m-%d')
    else:
        fetch_from_str = start_date.strftime('%Y-%m-%d')
    
    # GitHub search query for merged PRs by this author
    query = f"author:{username} is:pr is:merged merged:>={fetch_from_str}"
    if ORG_FILTER:
        query += f" org:{ORG_FILTER}"
        print(f"Applying ORG_FILTER: {ORG_FILTER}")
        
    print(f"Searching for merged PRs by {username}: {query}...")
    
    try:
        pulls = g.search_issues(query=query)
        total_count = pulls.totalCount
        print(f"DEBUG: GitHub Search found {total_count} PRs for {username} since {fetch_from_str}")
        
        if total_count == 0:
            print(f"No merged PRs found for {username} matching query.")
            return []
            
        # Filter out PRs already in cache
        new_pulls = []
        for p in pulls:
            url_parts = p.url.split('/')
            repo_name = f"{url_parts[4]}/{url_parts[5]}"
            key = f"{repo_name}#{p.number}"
            if key not in existing_keys:
                new_pulls.append(p)
        
        if not new_pulls:
            print(f"All {total_count} PRs for {username} are already in cache. Skipping fetch.")
            return []

        print(f"Processing {len(new_pulls)} new PRs (out of {total_count} total) in parallel...")
        
        user_data = []
        # Use ThreadPoolExecutor for parallel processing
        with ThreadPoolExecutor(max_workers=10) as executor:
            futures = {executor.submit(process_pr, issue): issue for issue in new_pulls}
            
            count = 0
            for future in as_completed(futures):
                result = future.result()
                if result:
                    user_data.append(result)
                
                count += 1
                if count % 20 == 0:
                    print(f"  Processed {count}/{len(new_pulls)} PRs for {username}...")
        
        return user_data
    except Exception as e:
        print(f"Error searching PRs for {username}: {e}")
        return []

def fetch_all_metrics():
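    """Load the github_metrics.csv cache, fetch only PRs newer than each
    user's cached range, and return the combined, deduplicated DataFrame."""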
    cache_file = 'github_metrics.csv'
    existing_df = pd.DataFrame()
    existing_keys = set()
    
    if os.path.exists(cache_file):
        print(f"Loading existing data from {cache_file}...")
        existing_df = pd.read_csv(cache_file)
        # Ensure date column is datetime objects for comparison
        existing_df['date'] = pd.to_datetime(existing_df['date']).dt.date
        if not existing_df.empty:
            # Create a set of keys (repo#number) for fast lookup
            existing_df['key'] = existing_df['repo'] + "#" + existing_df['number'].astype(str)
            existing_keys = set(existing_df['key'].tolist())
    
    users = [u.strip() for u in USER_NAMES.split(',')]
    all_new_data = []
    
    for username in users:
        fetch_from = start_date.date()
        if not existing_df.empty and 'user' in existing_df.columns:
            user_cache = existing_df[existing_df['user'].str.lower() == username.lower()]
            if not user_cache.empty:
                cache_min = user_cache['date'].min()
                cache_max = user_cache['date'].max()
                
                if cache_min <= fetch_from:
                    fetch_from = cache_max
                    print(f"User {username} has cache from {cache_min} to {cache_max}. Checking for updates since {fetch_from}...")
                else:
                    print(f"User {username} cache starts at {cache_min}, but we need data from {fetch_from}. Re-fetching older data...")
        
        all_new_data.extend(fetch_metrics_for_user(username, fetch_from, existing_keys))
    
    if not all_new_data:
        print("No new data to add.")
        if not existing_df.empty and 'key' in existing_df.columns:
            existing_df = existing_df.drop(columns=['key'])
        return existing_df
    
    new_df = pd.DataFrame(all_new_data)
    
    # Combine and deduplicate
    combined_df = pd.concat([existing_df, new_df], ignore_index=True)
    combined_df = combined_df.drop_duplicates(subset=['repo', 'number'], keep='last')
    combined_df = combined_df.sort_values('date')
    
    if 'key' in combined_df.columns:
        combined_df = combined_df.drop(columns=['key'])
    
    return combined_df

def create_activity_plot(ax, stats, title, all_months):
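    """Draw a monthly grouped bar chart: PR count on the main axis, with
    total commits and total lines changed on two twin y-axes."""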
    x_labels = [str(m) for m in all_months]
    x_pos = np.arange(len(all_months))
    
    # Widths for visual hierarchy
    width_pr = 0.4
    width_commits = 0.3
    width_lines = 0.2
    
    # Create twin axes
    ax_twin_commits = ax.twinx()
    ax_twin_lines = ax.twinx()
    
    # Position the axes
    ax_twin_lines.spines['right'].set_position(('outward', 60))
    
    # 1. Total Lines (Tertiary) - FURTHEST RIGHT - Lowest Z-order
    ax_twin_lines.bar(x_pos + 0.25, stats['total_size'].fillna(0), width_lines, color='#E2E8F0', label='Lines Changed')
    ax_twin_lines.set_ylabel('Total Lines Changed', color='#64748B')
    ax_twin_lines.set_zorder(1)
    
    # 2. Total Commits (Secondary) - MIDDLE RIGHT - Middle Z-order
    ax_twin_commits.bar(x_pos, stats['total_commits'].fillna(0), width_commits, color='#94A3B8', label='Commits')
    ax_twin_commits.set_ylabel('Total Commits', color='#475569')
    ax_twin_commits.set_zorder(2)
    
    # 3. PR Count (Primary) - LEFT - Top Z-order
    ax.bar(x_pos - 0.2, stats['pr_count'].fillna(0), width_pr, color='#2E5BFF', edgecolor='black', linewidth=0.5, label='PRs')
    ax.set_ylabel('Number of PRs', color='#1E40AF', fontweight='bold')
    ax.set_xticks(x_pos)
    ax.set_xticklabels(x_labels, rotation=45, ha='right')
    ax.set_zorder(3)
    ax.patch.set_visible(False)
    
    ax.set_title(title, fontweight='bold', pad=20)
    
    # Combined legend
    h1, l1 = ax.get_legend_handles_labels()
    h2, l2 = ax_twin_lines.get_legend_handles_labels()
    h3, l3 = ax_twin_commits.get_legend_handles_labels()
    ax.legend(h1 + h3 + h2, l1 + l3 + l2, loc='upper left', prop={'size': 8}, frameon=True)

def generate_user_report(df, username):
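    """Render one user's activity and cycle-time charts plus an outlier
    table, and return that user's HTML section for the report."""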
    user_df = df[df['user'].str.lower() == username.lower()].copy()
    if user_df.empty:
        print(f"Warning: No data found for user {username}")
        return ""
    
    user_df['month'] = pd.to_datetime(user_df['date']).dt.to_period('M')
    all_months = pd.period_range(start=user_df['month'].min(), end=user_df['month'].max(), freq='M')
    
    def get_monthly_stats(data_df):
        if 'commits' not in data_df.columns:
            data_df['commits'] = 0
        return data_df.groupby('month').agg(
            pr_count=('repo', 'count'),
            total_size=('size', 'sum'),
            total_commits=('commits', 'sum'),
            median_cycle=('cycle_time_hrs', 'median')
        ).reindex(all_months)
    
    size_90 = user_df['size'].quantile(0.9) if len(user_df) > 5 else float('inf')
    cycle_90 = user_df['cycle_time_hrs'].quantile(0.9) if len(user_df) > 5 else float('inf')
    user_df_clean = user_df[(user_df['size'] <= size_90) & (user_df['cycle_time_hrs'] <= cycle_90)].copy()
    
    stats_full = get_monthly_stats(user_df)
    stats_clean = get_monthly_stats(user_df_clean)

    fig, axes = plt.subplots(1, 2, figsize=(20, 8))
    plt.subplots_adjust(wspace=0.6, bottom=0.2, right=0.85)
    create_activity_plot(axes[0], stats_full, f'Activity (All PRs): {username}', all_months)
    create_activity_plot(axes[1], stats_clean, f'Activity (Outliers Removed): {username}', all_months)
    chart_filename = f'user_activity_{username}.png'
    plt.savefig(chart_filename)
    plt.close()

    fig, ax = plt.subplots(figsize=(10, 6))
    x_pos = np.arange(len(all_months))
    ax.plot(x_pos, stats_full['median_cycle'], marker='o', label='All PRs', color='orange', linewidth=2)
    if len(user_df) > 5:
        ax.plot(x_pos, stats_clean['median_cycle'], marker='s', label='Outliers Removed', color='green', linestyle='--', linewidth=2)
    ax.set_title(f'Cycle Time Trends: {username}')
    ax.set_ylabel('Median Hours to Merge')
    ax.set_xticks(x_pos)
    ax.set_xticklabels([str(m) for m in all_months], rotation=45, ha='right')
    ax.legend()
    ax.grid(True, alpha=0.3)
    cycle_chart_filename = f'user_cycle_{username}.png'
    plt.savefig(cycle_chart_filename)
    plt.close()

    outlier_count = len(user_df) - len(user_df_clean)
    outliers_df = user_df[(user_df['size'] > size_90) | (user_df['cycle_time_hrs'] > cycle_90)].copy()
    outliers_df = outliers_df.sort_values('date', ascending=False)
    
    outlier_rows = ""
    for _, row in outliers_df.iterrows():
        outlier_rows += f"<tr><td>#{row['number']}</td><td>{row['date']}</td><td>{row['title']}</td><td>{row['commits']}</td><td>{row['size']}</td><td>{row['cycle_time_hrs']:.1f}h</td></tr>"
    
    outlier_table = f"""
        <details style="margin-top: 15px; border: 1px solid #eab308; border-radius: 4px; background: white;">
            <summary style="cursor: pointer; padding: 10px; font-weight: bold; color: #854d0e; background-color: #fefce8;">
                ▶ View {outlier_count} Outlier PR Details
            </summary>
            <div style="padding: 10px; overflow-x: auto;">
                <table style="font-size: 0.9em; width: 100%; margin-bottom: 0;">
                    <thead>
                        <tr style="background-color: #fef9c3;">
                            <th>PR #</th>
                            <th>Date</th>
                            <th>Title</th>
                            <th>Commits</th>
                            <th>Lines Changed</th>
                            <th>Cycle Time</th>
                        </tr>
                    </thead>
                    <tbody>
                        {outlier_rows}
                    </tbody>
                </table>
            </div>
        </details>
    """ if outlier_count > 0 else ""

    outlier_desc = f"""
    <div style="background-color: #fffbdd; padding: 15px; border-left: 5px solid #d4a017; margin: 20px 0; border-radius: 4px;">
        <strong>Outlier Breakdown:</strong> We identified <strong>{outlier_count}</strong> PRs that were significantly larger or took much longer than usual. 
        These are removed from the "Activity (Outliers Removed)" and cycle time trend lines.
        <br><em>Thresholds: Size > {size_90:.0f} lines OR Cycle Time > {cycle_90:.1f}h.</em>
        {outlier_table}
    </div>
    """ if outlier_count > 0 else "<p>No significant outliers detected.</p>"
    
    return f"""<div class="user-section"><h2>User: {username}</h2><div class="chart-container"><img src="{chart_filename}" alt="Activity for {username}"></div><div class="chart-container" style="max-width: 800px; margin: 0 auto 30px auto;"><img src="{cycle_chart_filename}" alt="Cycle Time for {username}"></div>{outlier_desc}<table><thead><tr><th>Metric (Last {MONTHS_BACK} Mo)</th><th>Full History</th><th>Typical Work</th><th>Impact</th></tr></thead><tbody><tr><td>Total Merged PRs</td><td>{len(user_df)}</td><td>{len(user_df_clean)}</td><td>-{outlier_count} PRs</td></tr><tr><td>Median Cycle Time</td><td>{user_df['cycle_time_hrs'].median():.1f}h</td><td>{user_df_clean['cycle_time_hrs'].median():.1f}h</td><td>{((user_df['cycle_time_hrs'].median() - user_df_clean['cycle_time_hrs'].median())/user_df['cycle_time_hrs'].median()*100) if user_df['cycle_time_hrs'].median() > 0 else 0:.1f}% faster</td></tr><tr><td>Total Lines Changed</td><td>{user_df['size'].sum():,.0f}</td><td>{user_df_clean['size'].sum():,.0f}</td><td>-{(user_df['size'].sum() - user_df_clean['size'].sum()):,.0f} lines</td></tr><tr><td>Total Commits</td><td>{user_df['commits'].sum():,.0f}</td><td>{user_df_clean['commits'].sum():,.0f}</td><td>-{(user_df['commits'].sum() - user_df_clean['commits'].sum()):,.0f} commits</td></tr></tbody></table></div>"""

def generate_team_report(df):
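    """Render team-wide activity and cycle-time charts and return the
    corresponding HTML section."""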
    if df.empty:
        return ""
    df['month'] = pd.to_datetime(df['date']).dt.to_period('M')
    all_months = pd.period_range(start=df['month'].min(), end=df['month'].max(), freq='M')
    
    def get_monthly_stats(data_df):
        if 'commits' not in data_df.columns:
            data_df['commits'] = 0
        return data_df.groupby('month').agg(
            pr_count=('repo', 'count'),
            total_size=('size', 'sum'),
            total_commits=('commits', 'sum'),
            median_cycle=('cycle_time_hrs', 'median')
        ).reindex(all_months)
    
    size_90 = df['size'].quantile(0.9) if len(df) > 5 else float('inf')
    cycle_90 = df['cycle_time_hrs'].quantile(0.9) if len(df) > 5 else float('inf')
    df_clean = df[(df['size'] <= size_90) & (df['cycle_time_hrs'] <= cycle_90)].copy()
    stats_full = get_monthly_stats(df)
    stats_clean = get_monthly_stats(df_clean)

    fig, axes = plt.subplots(1, 2, figsize=(20, 8))
    plt.subplots_adjust(wspace=0.6, bottom=0.2, right=0.85)
    create_activity_plot(axes[0], stats_full, 'Team Activity (All PRs)', all_months)
    create_activity_plot(axes[1], stats_clean, 'Team Activity (Outliers Removed)', all_months)
    plt.savefig('team_activity.png')
    plt.close()

    fig, ax = plt.subplots(figsize=(10, 6))
    x_pos = np.arange(len(all_months))
    ax.plot(x_pos, stats_full['median_cycle'], marker='o', label='All PRs', color='orange', linewidth=2)
    if len(df) > 5:
        ax.plot(x_pos, stats_clean['median_cycle'], marker='s', label='Outliers Removed', color='green', linestyle='--', linewidth=2)
    ax.set_title('Overall Team Cycle Time Trends')
    ax.set_ylabel('Median Hours to Merge')
    ax.set_xticks(x_pos)
    ax.set_xticklabels([str(m) for m in all_months], rotation=45, ha='right')
    ax.legend()
    ax.grid(True, alpha=0.3)
    plt.savefig('team_cycle.png')
    plt.close()

    return f"""
    <div class="user-section">
        <h1>Overall Team Trends</h1>
        <div class="chart-container">
            <img src="team_activity.png" alt="Team Activity">
        </div>
        <div class="chart-container" style="max-width: 800px; margin: 0 auto 30px auto;">
            <img src="team_cycle.png" alt="Team Cycle Time">
        </div>
        <table>
            <thead>
                <tr>
                    <th>Metric (Team Total - Last {MONTHS_BACK} Mo)</th>
                    <th>Full History</th>
                    <th>Typical Work (Clean)</th>
                </tr>
            </thead>
            <tbody>
                <tr><td>Total Merged PRs</td><td>{len(df)}</td><td>{len(df_clean)}</td></tr>
                <tr><td>Median Cycle Time</td><td>{df['cycle_time_hrs'].median():.1f}h</td><td>{df_clean['cycle_time_hrs'].median():.1f}h</td></tr>
                <tr><td>Total Lines Changed</td><td>{df['size'].sum():,.0f}</td><td>{df_clean['size'].sum():,.0f}</td></tr>
            </tbody>
        </table>
    </div>
    """

def generate_html_report(df, user_reports_html, team_report_html):
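    """Assemble engineering_report.html from the team and per-user sections."""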
    total_prs = len(df)
    unique_users = sorted(df['user'].unique().tolist(), key=str.lower)
    total_authors = len(unique_users)
    avg_cycle_time = df['cycle_time_hrs'].mean()
    median_cycle_time = df['cycle_time_hrs'].median()
    
    user_names_list = ", ".join(unique_users)
    
    html_content = f"""
    <!DOCTYPE html>
    <html lang="en">
    <head>
        <meta charset="UTF-8">
        <meta name="viewport" content="width=device-width, initial-scale=1.0">
        <title>GitHub Code Trends</title>
        <style>
            body {{ font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, Helvetica, Arial, sans-serif; line-height: 1.6; color: #333; max-width: 1200px; margin: 0 auto; padding: 20px; background-color: #f6f8fa; }}
            h1, h2, h3 {{ color: #24292e; border-bottom: 1px solid #eaecef; padding-bottom: 10px; }}
            .stats-grid {{ display: grid; grid-template-columns: repeat(auto-fit, minmax(200px, 1fr)); gap: 20px; margin-bottom: 30px; }}
            .stat-card {{ background: white; padding: 20px; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12); text-align: center; }}
            .stat-value {{ font-size: 24px; font-weight: bold; color: #0366d6; }}
            .stat-label {{ font-size: 14px; color: #586069; }}
            .user-list {{ background: #fff; padding: 15px; border-radius: 6px; border: 1px solid #eaecef; margin-bottom: 30px; font-size: 0.95em; }}
            .chart-container {{ background: white; padding: 20px; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12); margin-bottom: 30px; text-align: center; }}
            .chart-container img {{ max-width: 100%; height: auto; }}
            .user-section {{ background: white; padding: 20px; border-radius: 6px; box-shadow: 0 1px 3px rgba(0,0,0,0.12); margin-bottom: 50px; }}
            table {{ width: 100%; border-collapse: collapse; margin-bottom: 30px; background: white; }}
            th, td {{ text-align: left; padding: 12px; border-bottom: 1px solid #eaecef; }}
            th {{ background-color: #f6f8fa; }}
            tr:hover {{ background-color: #f1f8ff; }}
        </style>
    </head>
    <body>
        <h1>GitHub Code Trends</h1>
        <p>Generated on {datetime.now().strftime('%Y-%m-%d %H:%M:%S')}</p>
        
        <div class="user-list">
            <strong>Tracked Users ({total_authors}):</strong> {user_names_list}
        </div>

        <div class="stats-grid">
            <div class="stat-card"><div class="stat-value">{total_prs}</div><div class="stat-label">Total Merged PRs</div></div>
            <div class="stat-card"><div class="stat-value">{total_authors}</div><div class="stat-label">Tracked Users</div></div>
            <div class="stat-card"><div class="stat-value">{avg_cycle_time:.1f}h</div><div class="stat-label">Avg Cycle Time</div></div>
            <div class="stat-card"><div class="stat-value">{median_cycle_time:.1f}h</div><div class="stat-label">Median Cycle Time</div></div>
        </div>

        {team_report_html}

        <h2>Individual User Trends</h2>
        {user_reports_html}
    </body>
    </html>
    """
    # The report contains non-ASCII characters (e.g. the ▶ marker), so write UTF-8 explicitly
    with open('engineering_report.html', 'w', encoding='utf-8') as f:
        f.write(html_content)
    print("Full HTML report generated: engineering_report.html")

def generate_dashboard(df):
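    """Generate charts for each user plus the team, then write the HTML report."""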
    users = df['user'].unique()
    user_reports_html = ""
    for username in users:
        print(f"Generating charts for {username}...")
        user_reports_html += generate_user_report(df, username)
    print("Generating Overall Team charts...")
    team_report_html = generate_team_report(df)
    generate_html_report(df, user_reports_html, team_report_html)

if __name__ == "__main__":
    parser = argparse.ArgumentParser(description='Fetch GitHub metrics for users.')
    parser.add_argument('--users', type=str, help='Comma-separated list of GitHub usernames')
    parser.add_argument('--clear-cache', action='store_true', help='Clear the local CSV cache before running')
    args = parser.parse_args()

    if args.users:
        USER_NAMES = args.users

    if ACCESS_TOKEN == 'YOUR_GITHUB_TOKEN' or USER_NAMES == 'user1,user2':
        print("Error: Please set GITHUB_TOKEN and either GITHUB_USERS environment variable or --users argument.")
        print("Example: export GITHUB_TOKEN='your_token_here'")
        print("         python fetch_metrics.py --users='username1,username2'")
        sys.exit(1)

    if args.clear_cache and os.path.exists('github_metrics.csv'):
        print("Clearing cache file.")
        os.remove('github_metrics.csv')
    df = fetch_all_metrics()
    if df.empty:
        print("No data found."); exit(0)
    df.to_csv('github_metrics.csv', index=False)
    dashboard_start_date = start_date.date()
    df['temp_date'] = pd.to_datetime(df['date']).dt.date
    df_filtered = df[df['temp_date'] >= dashboard_start_date].copy()
    if df_filtered.empty:
        print(f"No data found since {dashboard_start_date}."); exit(0)
    generate_dashboard(df_filtered)