Created
March 9, 2026 09:57
-
-
Save ParagEkbote/9cca606410c62597d89fa84310c762f3 to your computer and use it in GitHub Desktop.
Open-Source Contributions Insight
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| import json | |
| import subprocess | |
| import requests | |
| from datetime import datetime | |
| from statistics import mean, median, stdev | |
| from collections import defaultdict | |
| from pathlib import Path | |
| GITHUB_TOKEN = os.environ.get("GITHUB_TOKEN", "") | |
| HEADERS = { | |
| "Authorization": f"Bearer {GITHUB_TOKEN}", | |
| "Accept": "application/vnd.github.v3+json", | |
| } | |
| SEARCH_AUTHOR = "ParagEkbote" | |
| MERGED_PRS_JSON = "merged_prs.json" | |
| def parse_time(t): | |
| return datetime.strptime(t, "%Y-%m-%dT%H:%M:%SZ") | |
| def get_repo_root() -> Path: | |
| """ | |
| Return the git repository root as a Path if available. | |
| Falls back to current working directory if not in a git repo or git isn't available. | |
| """ | |
| try: | |
| root = ( | |
| subprocess.check_output( | |
| ["git", "rev-parse", "--show-toplevel"], | |
| stderr=subprocess.DEVNULL, | |
| ) | |
| .decode() | |
| .strip() | |
| ) | |
| return Path(root) | |
| except Exception: | |
| # Not a git repo or git not installed; fallback to cwd | |
| cwd = Path.cwd() | |
| print( | |
| f"β οΈ Could not determine git root. Falling back to current directory: {cwd}" | |
| ) | |
| return cwd | |
| def fetch_all_merged_prs(): | |
| print("π₯ Fetching merged PRs via GraphQL...") | |
| url = "https://api.github.com/graphql" | |
| cursor = None | |
| all_prs = [] | |
| while True: | |
| query = f""" | |
| {{ | |
| search( | |
| query: "author:{SEARCH_AUTHOR} is:pr is:merged sort:created-desc", | |
| type: ISSUE, | |
| first: 100 | |
| {f', after: "{cursor}"' if cursor else ""} | |
| ) {{ | |
| pageInfo {{ | |
| hasNextPage | |
| endCursor | |
| }} | |
| nodes {{ | |
| ... on PullRequest {{ | |
| url | |
| title | |
| createdAt | |
| mergedAt | |
| additions | |
| deletions | |
| changedFiles | |
| repository {{ | |
| nameWithOwner | |
| description | |
| stargazerCount | |
| primaryLanguage {{ | |
| name | |
| }} | |
| }} | |
| author {{ | |
| login | |
| }} | |
| }} | |
| }} | |
| }} | |
| }} | |
| """ | |
| resp = requests.post(url, headers=HEADERS, json={"query": query}) | |
| data = resp.json() | |
| if "errors" in data: | |
| raise Exception(data["errors"]) | |
| search = data["data"]["search"] | |
| all_prs.extend(search["nodes"]) | |
| if not search["pageInfo"]["hasNextPage"]: | |
| break | |
| cursor = search["pageInfo"]["endCursor"] | |
| external = [ | |
| pr for pr in all_prs | |
| if not pr["repository"]["nameWithOwner"].startswith(f"{SEARCH_AUTHOR}/") | |
| ] | |
| print(f"βοΈ Found {len(external)} merged external PRs.") | |
| return external | |
| def fetch_pr_reviews_comments_files(pr_url): | |
| parts = pr_url.split("/") | |
| owner, repo, number = parts[3], parts[4], parts[-1] | |
| base = f"https://api.github.com/repos/{owner}/{repo}/pulls/{number}" | |
| pr_reviews = requests.get(base + "/reviews", headers=HEADERS).json() | |
| pr_comments = requests.get(base + "/comments", headers=HEADERS).json() | |
| pr_files = requests.get(base + "/files", headers=HEADERS).json() | |
| return pr_reviews, pr_comments, pr_files | |
| FOLDER_MAP = { | |
| "docs": "docs", | |
| "documentation": "docs", | |
| "examples": "examples", | |
| "example": "examples", | |
| "tests": "tests", | |
| "test": "tests", | |
| "models": "models", | |
| "model": "models", | |
| "utils": "utils", | |
| "ops": "ops", | |
| "src": "src", | |
| "api": "api", | |
| "core": "core", | |
| "config": "config", | |
| "lib": "lib", | |
| } | |
| def categorize_file(path): | |
| parts = path.split("/") | |
| if len(parts) == 0: | |
| return "other" | |
| first = parts[0].lower() | |
| for key, label in FOLDER_MAP.items(): | |
| if first.startswith(key): | |
| return label | |
| return "other" | |
| def extract_notable_prs(prs): | |
| """Extract notable PRs for case study examples""" | |
| notable = [] | |
| for pr in prs: | |
| size = pr["additions"] + pr["deletions"] | |
| repo = pr["repository"]["nameWithOwner"] | |
| stars = pr["repository"]["stargazerCount"] | |
| # High-impact criteria | |
| if size > 500 or stars > 1000 or pr["changedFiles"] > 10: | |
| notable.append({ | |
| "title": pr["title"], | |
| "url": pr["url"], | |
| "repo": repo, | |
| "size": size, | |
| "stars": stars, | |
| "files": pr["changedFiles"], | |
| "created": pr["createdAt"] | |
| }) | |
| # Sort by stars and size | |
| notable.sort(key=lambda x: (x["stars"], x["size"]), reverse=True) | |
| return notable[:10] # Top 10 | |
| def compute_pr_insights(prs): | |
| merge_durations = [] | |
| review_iterations = [] | |
| review_comments_count = [] | |
| pr_sizes = [] | |
| touched_files = [] | |
| first_try_merge_flags = [] | |
| repo_contribution_count = defaultdict(int) | |
| folder_counts = defaultdict(int) | |
| language_counts = defaultdict(int) | |
| # For case study analysis | |
| impact_scores = [] | |
| monthly_merge = defaultdict(list) | |
| monthly_first_try = defaultdict(list) | |
| monthly_pr_count = defaultdict(int) | |
| # Notable contributions | |
| large_prs = [] | |
| quick_merges = [] | |
| for pr in prs: | |
| url = pr["url"] | |
| repo = pr["repository"]["nameWithOwner"] | |
| created = parse_time(pr["createdAt"]) | |
| merged = parse_time(pr["mergedAt"]) | |
| month = created.strftime("%Y-%m") | |
| repo_contribution_count[repo] += 1 | |
| monthly_pr_count[month] += 1 | |
| # Language tracking | |
| lang = pr["repository"].get("primaryLanguage") | |
| if lang: | |
| language_counts[lang["name"]] += 1 | |
| # Merge time | |
| merge_days = (merged - created).total_seconds() / 86400 | |
| merge_durations.append(merge_days) | |
| monthly_merge[month].append(merge_days) | |
| if merge_days < 1: | |
| quick_merges.append((pr["title"], pr["url"], merge_days)) | |
| # PR size | |
| size = pr["additions"] + pr["deletions"] | |
| pr_sizes.append(size) | |
| touched_files.append(pr["changedFiles"]) | |
| if size > 500: | |
| large_prs.append((pr["title"], pr["url"], size)) | |
| # Impact score (stars * size weight) | |
| stars = pr["repository"]["stargazerCount"] | |
| impact = stars * (1 + size / 1000) | |
| impact_scores.append(impact) | |
| # Reviews & comments | |
| reviews, comments, files = fetch_pr_reviews_comments_files(url) | |
| review_iterations.append(len(reviews)) | |
| review_comments_count.append(len(comments)) | |
| # First-try merge | |
| states = [r.get("state", "").lower() for r in reviews] | |
| ok = "changes_requested" not in states | |
| first_try_merge_flags.append(ok) | |
| monthly_first_try[month].append(int(ok)) | |
| # Folder categories | |
| for f in files: | |
| folder = categorize_file(f["filename"]) | |
| folder_counts[folder] += 1 | |
| # Summaries | |
| repeat_repos = sum(1 for repo, cnt in repo_contribution_count.items() if cnt > 1) | |
| top_repo = max(repo_contribution_count, key=lambda r: repo_contribution_count[r]) | |
| # Calculate consistency metrics | |
| merge_stdev = stdev(merge_durations) if len(merge_durations) > 1 else 0 | |
| size_stdev = stdev(pr_sizes) if len(pr_sizes) > 1 else 0 | |
| return { | |
| "total_prs": len(prs), | |
| "avg_merge_days": mean(merge_durations), | |
| "median_merge_days": median(merge_durations), | |
| "merge_stdev": merge_stdev, | |
| "avg_review_iterations": mean(review_iterations), | |
| "avg_review_comments": mean(review_comments_count), | |
| "avg_pr_size": mean(pr_sizes), | |
| "median_pr_size": median(pr_sizes), | |
| "max_pr_size": max(pr_sizes), | |
| "min_pr_size": min(pr_sizes), | |
| "size_stdev": size_stdev, | |
| "avg_files_touched": mean(touched_files), | |
| "first_try_merge_rate": sum(first_try_merge_flags) / len(first_try_merge_flags), | |
| "repeat_repos": repeat_repos, | |
| "total_repos": len(repo_contribution_count), | |
| "top_repo": top_repo, | |
| "top_repo_count": repo_contribution_count[top_repo], | |
| "contribution_summary": folder_counts, | |
| "language_distribution": language_counts, | |
| "avg_impact_score": mean(impact_scores), | |
| "monthly_activity": monthly_pr_count, | |
| "large_prs": sorted(large_prs, key=lambda x: x[2], reverse=True)[:5], | |
| "quick_merges": sorted(quick_merges, key=lambda x: x[2])[:5], | |
| "repo_contributions": dict(sorted(repo_contribution_count.items(), | |
| key=lambda x: x[1], reverse=True)[:10]) | |
| } | |
| def export_case_study_markdown(insights, prs, filename="oss_case_study.md"): | |
| """Generate comprehensive case study markdown""" | |
| repo_root = get_repo_root() | |
| out_path = repo_root / filename | |
| notable_prs = extract_notable_prs(prs) | |
| lines = [] | |
| # Header | |
| lines.append("# π Open Source Contribution Case Study") | |
| lines.append("\n**Developer:** Parag Ekbote") | |
| lines.append(f"**Analysis Date:** {datetime.now().strftime('%B %d, %Y')}") | |
| lines.append(f"**Total Contributions Analyzed:** {insights['total_prs']} merged PRs\n") | |
| lines.append("---\n") | |
| # Executive Summary | |
| lines.append("## π― Executive Summary\n") | |
| lines.append(f"This case study analyzes **{insights['total_prs']} merged pull requests** across " | |
| f"**{insights['total_repos']} open-source repositories**. The analysis reveals consistent " | |
| f"contribution patterns with a **{insights['first_try_merge_rate']*100:.1f}%** first-try merge rate " | |
| f"and an average merge time of **{insights['avg_merge_days']:.2f} days**.\n") | |
| # Key Highlights | |
| lines.append("### Key Highlights\n") | |
| lines.append(f"- β **{insights['first_try_merge_rate']*100:.1f}%** of PRs merged without change requests") | |
| lines.append(f"- π Contributed to **{insights['repeat_repos']}** repositories multiple times") | |
| lines.append(f"- π¦ Average PR size: **{insights['avg_pr_size']:.0f} lines** (showing focused, reviewable changes)") | |
| lines.append(f"- β‘ **{len(insights['quick_merges'])}** PRs merged within 24 hours") | |
| lines.append(f"- ποΈ Largest contribution: **{insights['max_pr_size']:,} lines of code**\n") | |
| # Contribution Metrics | |
| lines.append("---\n") | |
| lines.append("## π Contribution Metrics\n") | |
| lines.append("### Merge Performance\n") | |
| lines.append("| Metric | Value | Insight |") | |
| lines.append("|--------|-------|---------|") | |
| lines.append(f"| Average Merge Time | {insights['avg_merge_days']:.2f} days | " | |
| f"{'Fast turnaround' if insights['avg_merge_days'] < 3 else 'Standard process'} |") | |
| lines.append(f"| Median Merge Time | {insights['median_merge_days']:.2f} days | " | |
| f"Consistent with average |") | |
| lines.append(f"| Standard Deviation | {insights['merge_stdev']:.2f} days | " | |
| f"{'Low variance' if insights['merge_stdev'] < 5 else 'Varied timelines'} |") | |
| lines.append(f"| First-Try Merge Rate | {insights['first_try_merge_rate']*100:.1f}% | " | |
| f"{'Excellent' if insights['first_try_merge_rate'] > 0.7 else 'Good'} code quality |") | |
| lines.append("\n### Code Review Engagement\n") | |
| lines.append("| Metric | Value |") | |
| lines.append("|--------|-------|") | |
| lines.append(f"| Avg Review Iterations | {insights['avg_review_iterations']:.2f} |") | |
| lines.append(f"| Avg Review Comments | {insights['avg_review_comments']:.2f} |") | |
| lines.append("\n### PR Size & Complexity\n") | |
| lines.append("| Metric | Value | Context |") | |
| lines.append("|--------|-------|---------|") | |
| lines.append(f"| Average PR Size | {insights['avg_pr_size']:.0f} LoC | " | |
| f"{'Small, focused' if insights['avg_pr_size'] < 200 else 'Moderate scope'} |") | |
| lines.append(f"| Median PR Size | {insights['median_pr_size']:.0f} LoC | Typical contribution size |") | |
| lines.append(f"| Largest PR | {insights['max_pr_size']:,} LoC | Capability for large-scale work |") | |
| lines.append(f"| Smallest PR | {insights['min_pr_size']} LoC | Attention to detail |") | |
| lines.append(f"| Files per PR | {insights['avg_files_touched']:.2f} | " | |
| f"{'Focused' if insights['avg_files_touched'] < 5 else 'Cross-cutting'} changes |") | |
| # Repository Landscape | |
| lines.append("\n---\n") | |
| lines.append("## π Repository Landscape\n") | |
| lines.append(f"### Top Repositories ({len(insights['repo_contributions'])} shown)\n") | |
| for i, (repo, count) in enumerate(insights['repo_contributions'].items(), 1): | |
| lines.append(f"{i}. **[{repo}](https://github.com/{repo})** β {count} PR{'s' if count > 1 else ''}") | |
| lines.append(f"\n**Most Active Repository:** {insights['top_repo']} " | |
| f"({insights['top_repo_count']} contributions)\n") | |
| # Technology Stack | |
| lines.append("### π» Technology Stack\n") | |
| top_langs = sorted(insights['language_distribution'].items(), | |
| key=lambda x: x[1], reverse=True)[:8] | |
| for lang, count in top_langs: | |
| percentage = (count / insights['total_prs']) * 100 | |
| lines.append(f"- **{lang}**: {count} PRs ({percentage:.1f}%)") | |
| # Contribution Areas | |
| lines.append("\n### ποΈ Contribution Focus Areas\n") | |
| top_folders = sorted(insights['contribution_summary'].items(), | |
| key=lambda x: x[1], reverse=True)[:8] | |
| for folder, count in top_folders: | |
| lines.append(f"- **{folder.title()}**: {count} file modifications") | |
| # Notable Contributions | |
| lines.append("\n---\n") | |
| lines.append("## π Notable Contributions\n") | |
| if insights['large_prs']: | |
| lines.append("### Significant PRs (by size)\n") | |
| for i, (title, url, size) in enumerate(insights['large_prs'], 1): | |
| lines.append(f"{i}. [{title}]({url}) β {size:,} LoC") | |
| if insights['quick_merges']: | |
| lines.append("\n### Fast-Tracked Merges (< 24 hours)\n") | |
| for i, (title, url, hours) in enumerate(insights['quick_merges'], 1): | |
| lines.append(f"{i}. [{title}]({url}) β {hours*24:.1f} hours") | |
| if notable_prs: | |
| lines.append("\n### High-Impact Projects\n") | |
| lines.append("*Based on repository popularity and contribution size*\n") | |
| for i, pr in enumerate(notable_prs[:5], 1): | |
| lines.append(f"{i}. **[{pr['repo']}]({pr['url']})** " | |
| f"(β {pr['stars']:,}) β {pr['size']:,} LoC, {pr['files']} files") | |
| # Activity Patterns | |
| lines.append("\n---\n") | |
| lines.append("## π Activity Patterns\n") | |
| if insights['monthly_activity']: | |
| sorted_months = sorted(insights['monthly_activity'].items()) | |
| total_months = len(sorted_months) | |
| avg_per_month = insights['total_prs'] / total_months if total_months > 0 else 0 | |
| lines.append(f"**Active Period:** {sorted_months[0][0]} to {sorted_months[-1][0]}") | |
| lines.append(f"**Average PRs per Month:** {avg_per_month:.1f}") | |
| lines.append(f"**Most Active Month:** {max(sorted_months, key=lambda x: x[1])[0]} " | |
| f"({max(insights['monthly_activity'].values())} PRs)\n") | |
| # Insights & Patterns | |
| lines.append("---\n") | |
| lines.append("## π‘ Key Insights\n") | |
| lines.append("### Code Quality Indicators") | |
| lines.append(f"- High first-try merge rate ({insights['first_try_merge_rate']*100:.1f}%) suggests strong " | |
| "understanding of project standards") | |
| lines.append(f"- Average of {insights['avg_review_comments']:.1f} review comments indicates constructive " | |
| "collaboration") | |
| lines.append(f"- Consistent PR sizing (Ο = {insights['size_stdev']:.0f}) shows disciplined contribution approach\n") | |
| lines.append("### Community Engagement") | |
| lines.append(f"- Repeat contributions to {insights['repeat_repos']} repositories demonstrates sustained engagement") | |
| lines.append(f"- Work spans {insights['total_repos']} different projects showing adaptability") | |
| lines.append(f"- Diverse technology stack ({len(insights['language_distribution'])} languages) " | |
| "indicates technical versatility\n") | |
| # Methodology | |
| lines.append("---\n") | |
| lines.append("## π Methodology\n") | |
| lines.append("This case study analyzes merged pull requests using GitHub's GraphQL API. " | |
| "Metrics include:\n") | |
| lines.append("- **Merge Performance**: Time from PR creation to merge, review iterations") | |
| lines.append("- **Code Metrics**: Lines changed, files modified, PR complexity") | |
| lines.append("- **Impact Analysis**: Repository popularity, contribution scope") | |
| lines.append("- **Quality Indicators**: First-try merge rate, review engagement\n") | |
| lines.append("---\n") | |
| lines.append(f"*Generated automatically from GitHub data on {datetime.now().strftime('%Y-%m-%d')}*") | |
| # Write to project root | |
| with open(out_path, "w") as f: | |
| f.write("\n".join(lines)) | |
| print(f"π Case study exported to: {out_path}") | |
| return out_path | |
| # ------------------------------------------------------- | |
| # MAIN | |
| # ------------------------------------------------------- | |
| if __name__ == "__main__": | |
| print("π Starting OSS Case Study Generation...\n") | |
| # Fetch data | |
| prs = fetch_all_merged_prs() | |
| # Save raw data | |
| with open(MERGED_PRS_JSON, "w") as f: | |
| json.dump(prs, f, indent=2) | |
| print(f"πΎ Saved PR list to {MERGED_PRS_JSON}\n") | |
| # Compute insights | |
| print("π Analyzing contribution patterns...") | |
| insights = compute_pr_insights(prs) | |
| # Display summary | |
| print("\n" + "="*50) | |
| print("ANALYSIS SUMMARY") | |
| print("="*50) | |
| print(f"Total PRs: {insights['total_prs']}") | |
| print(f"Total Repositories: {insights['total_repos']}") | |
| print(f"Avg Merge Time: {insights['avg_merge_days']:.2f} days") | |
| print(f"First-Try Merge Rate: {insights['first_try_merge_rate']*100:.1f}%") | |
| print(f"Most Active Repo: {insights['top_repo']}") | |
| print("="*50 + "\n") | |
| # Generate case study | |
| output_file = export_case_study_markdown(insights, prs) | |
| print("β Case study generation complete!") | |
| print(f"π File location: {output_file}") |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment