Skip to content

Instantly share code, notes, and snippets.

@innomatics
Created December 19, 2025 05:16
Show Gist options
  • Select an option

  • Save innomatics/c6951ab275cfd2900cb666579b59b459 to your computer and use it in GitHub Desktop.

Select an option

Save innomatics/c6951ab275cfd2900cb666579b59b459 to your computer and use it in GitHub Desktop.
#!/bin/bash
# GitHub Repository Statistics Script (GraphQL Version)
#
# Collects the merged pull requests created during the current calendar year
# for every repository named on the command line and prints rankings of
# contributors and pull requests.
#
# Usage: ./github_stats.sh owner/repo1 owner/repo2 ...
# Requires: GITHUB_TOKEN environment variable for authentication
#           curl and jq available on PATH
set -euo pipefail

# Colors for output (ANSI escape sequences, interpreted by `echo -e`)
readonly RED='\033[0;31m'
readonly GREEN='\033[0;32m'
readonly YELLOW='\033[1;33m'
readonly BLUE='\033[0;34m'
readonly NC='\033[0m' # No Color

# Check if GITHUB_TOKEN is set
if [ -z "${GITHUB_TOKEN:-}" ]; then
  echo -e "${RED}Error: GITHUB_TOKEN environment variable is not set${NC}" >&2
  echo "Please set it with: export GITHUB_TOKEN='your_github_token'" >&2
  exit 1
fi

# Check if repositories are provided
if [ $# -eq 0 ]; then
  echo -e "${RED}Error: No repositories provided${NC}" >&2
  echo "Usage: $0 owner/repo1 owner/repo2 ..." >&2
  exit 1
fi

# Fail early, with a clear message, when an external tool the script calls
# is missing instead of dying halfway through the first request
for required_tool in curl jq; do
  if ! command -v "$required_tool" > /dev/null 2>&1; then
    echo -e "${RED}Error: required command '$required_tool' was not found in PATH${NC}" >&2
    exit 1
  fi
done

# Get current year. START_DATE is written as a UTC timestamp (trailing "Z"),
# so the year is taken in UTC as well to keep the boundary consistent.
CURRENT_YEAR=$(date -u +%Y)
START_DATE="${CURRENT_YEAR}-01-01T00:00:00Z"

# Temporary directory for data. The trap body is single-quoted so the path is
# expanded (and quoted) when the trap runs rather than when it is installed.
TEMP_DIR=$(mktemp -d)
trap 'rm -rf -- "$TEMP_DIR"' EXIT

echo -e "${BLUE}=== GitHub Repository Statistics for ${CURRENT_YEAR} ===${NC}"
echo -e "${BLUE}=== (PRs created in ${CURRENT_YEAR} and merged) ===${NC}\n"

# Initialize data files:
#   PR_FILE           - JSON array, one record per pull request
#   CONTRIBUTORS_FILE - JSON object keyed by author login
PR_FILE="$TEMP_DIR/prs.json"
CONTRIBUTORS_FILE="$TEMP_DIR/contributors.json"
echo "[]" > "$PR_FILE"
echo "{}" > "$CONTRIBUTORS_FILE"
#######################################
# Function to call GitHub GraphQL API
#
# Sends one query to https://api.github.com/graphql and leaves the outcome in
# files under TEMP_DIR for the caller to read.
# Globals:
#   GITHUB_TOKEN (read) - token sent in the Authorization header
#   TEMP_DIR     (read) - directory that receives the output files
# Arguments:
#   $1 - GraphQL query text
# Outputs:
#   $TEMP_DIR/graphql_response.json - response body (empty when the transfer
#                                     failed before a body was received)
#   $TEMP_DIR/http_code.txt         - HTTP status code; "000" when curl could
#                                     not complete the request
# Returns:
#   0 once the status file is written. A transfer failure is reported through
#   the status code instead of the exit status, so a caller running under
#   `set -e` can retry rather than being terminated.
#######################################
graphql_query() {
  local query="$1"
  local temp_response="$TEMP_DIR/graphql_response.json"
  local http_code payload

  # Let jq build the request body so the query text is JSON-escaped correctly
  payload=$(jq -cn --arg query "$query" '{query: $query}')

  # Start from an empty file so a failed transfer cannot leave the body of an
  # earlier request behind for the caller to misread
  : > "$temp_response"

  # -w prints the status code on stdout while -o sends the body to the file.
  # NOTE(review): the token is passed on curl's command line and is therefore
  # visible in the process list while the request is running.
  http_code=$(curl -s -w "%{http_code}" -o "$temp_response" \
    -X POST \
    -H "Authorization: bearer $GITHUB_TOKEN" \
    -H "Content-Type: application/json" \
    -d "$payload" \
    https://api.github.com/graphql) || http_code=${http_code:-000}

  # Write HTTP code to a separate file
  echo "${http_code:-000}" > "$TEMP_DIR/http_code.txt"
}
# Process each repository
#
# For every owner/repo argument: page through the repository's merged pull
# requests (most recently created first), keep those created on or after
# START_DATE, append one record per PR to PR_FILE and add the PR's numbers to
# its author's running totals in CONTRIBUTORS_FILE.

# Files written by graphql_query on every call
response_file="$TEMP_DIR/graphql_response.json"
http_code_file="$TEMP_DIR/http_code.txt"
# Normalised records of the page currently being processed
page_file="$TEMP_DIR/page_prs.json"
# Maximum number of attempts per page request
max_retries=3

for REPO in "$@"; do
  echo -e "${GREEN}Processing repository: ${REPO}${NC}"

  # Validate repo format. Both parts are restricted to letters, digits, '.',
  # '_' and '-' because they are interpolated into the GraphQL query text
  # below; a quote or backslash would otherwise change the query.
  if [[ ! "$REPO" =~ ^([A-Za-z0-9._-]+)/([A-Za-z0-9._-]+)$ ]]; then
    echo -e "${RED}Invalid repository format: $REPO (expected: owner/repo)${NC}" >&2
    continue
  fi
  OWNER="${BASH_REMATCH[1]}"
  REPO_NAME="${BASH_REMATCH[2]}"

  echo " Fetching pull requests created since ${START_DATE}..."
  has_next_page=true
  after_cursor="null"
  pr_count=0

  while [ "$has_next_page" = true ]; do
    # GraphQL query for one page of merged PRs, newest creation date first.
    # Note: only the MERGED state is filtered by the API; the createdAt
    # cut-off is applied locally after the page is received.
    query="query {
      repository(owner: \"$OWNER\", name: \"$REPO_NAME\") {
        pullRequests(
          first: 100
          states: MERGED
          orderBy: {field: CREATED_AT, direction: DESC}
          after: $after_cursor
        ) {
          pageInfo {
            hasNextPage
            endCursor
          }
          nodes {
            number
            title
            createdAt
            mergedAt
            additions
            deletions
            bodyText
            author {
              login
            }
            reviews(first: 100) {
              totalCount
            }
            comments(first: 1) {
              totalCount
            }
          }
        }
      }
    }"

    # Retry logic with exponential backoff (waits of 2 and 4 seconds between
    # the three attempts)
    retry_count=0
    success=false
    while [ "$retry_count" -lt "$max_retries" ] && [ "$success" = false ]; do
      # Call API (writes to files)
      graphql_query "$query"
      http_code=$(cat "$http_code_file")

      # Classify the outcome: HTTP failure, GraphQL-level error, or success
      if [ "$http_code" != "200" ]; then
        failure_kind="http"
        failure_summary="HTTP Error: Status $http_code"
      elif jq -e '.errors' "$response_file" > /dev/null 2>&1; then
        failure_kind="graphql"
        error_msg=$(jq -r '.errors[0].message // "Unknown error"' "$response_file" 2> /dev/null) \
          || error_msg="Unknown error"
        failure_summary="API Error: $error_msg"
      else
        success=true
        continue
      fi

      retry_count=$((retry_count + 1))
      if [ "$retry_count" -lt "$max_retries" ]; then
        wait_time=$((2 ** retry_count))
        echo -e "${YELLOW} ${failure_summary}${NC}" >&2
        echo " Retrying in ${wait_time} seconds (attempt $((retry_count + 1))/$max_retries)..." >&2
        sleep "$wait_time"
      elif [ "$failure_kind" = "http" ]; then
        echo -e "${RED}HTTP Error after $max_retries attempts: Status $http_code${NC}" >&2
        response_body=$(cat "$response_file" 2> /dev/null) || response_body=""
        if [ -n "$response_body" ]; then
          printf '%s\n' "$response_body" >&2
        fi
      else
        echo -e "${RED}GraphQL Error after $max_retries attempts:${NC}" >&2
        jq '.errors' "$response_file" >&2
      fi
    done

    # All attempts failed: give up on this repository, keep what was collected
    if [ "$success" = false ]; then
      break
    fi

    # Check if repository data exists
    if ! jq -e '.data.repository' "$response_file" > /dev/null 2>&1; then
      echo -e "${RED}Error: Repository not found or no access${NC}" >&2
      break
    fi

    # Extract page info with safe defaults
    has_next_page=$(jq -r '.data.repository.pullRequests.pageInfo.hasNextPage // false' "$response_file")
    end_cursor=$(jq -r '.data.repository.pullRequests.pageInfo.endCursor // ""' "$response_file")

    # Oldest creation date on this page ("null" for an empty page); used
    # below to decide whether older pages can still contain matches
    oldest_created=$(jq -r '
      [(.data.repository.pullRequests.nodes // []) | .[] | .createdAt]
      | map(select(. != null))
      | min
    ' "$response_file")

    # Normalise the page in a single jq pass: drop null nodes and PRs created
    # before START_DATE, default missing fields, and compute the derived
    # sizes. body_size is the UTF-8 byte length of the description text, and
    # any count that is not a non-negative integer is treated as 0.
    jq --arg repo "$REPO" --arg start "$START_DATE" '
      def nonneg_int:
        if type == "number" and . >= 0 and . == floor then . else 0 end;
      (.data.repository.pullRequests.nodes // [])
      | map(select(. != null and .createdAt >= $start))
      | map(
          (.additions | nonneg_int) as $additions
          | (.deletions | nonneg_int) as $deletions
          | ((.reviews.totalCount | nonneg_int)
              + (.comments.totalCount | nonneg_int)) as $review_comments
          | {
              repo: $repo,
              number: ((.number // 0) | tostring),
              user: (.author.login // "unknown"),
              title: (.title // "No title"),
              additions: $additions,
              deletions: $deletions,
              diff_size: ($additions + $deletions),
              body_size: ((.bodyText // "") | utf8bytelength),
              review_comments: $review_comments
            }
        )
    ' "$response_file" > "$page_file"

    page_count=$(jq 'length' "$page_file")
    pr_count=$((pr_count + page_count))

    if [ "$page_count" -gt 0 ]; then
      # Append the page's records to the PR file (one merge per page)
      jq -s '.[0] + .[1]' "$PR_FILE" "$page_file" > "$TEMP_DIR/prs_tmp.json"
      mv "$TEMP_DIR/prs_tmp.json" "$PR_FILE"

      # Fold every PR of the page into its author's running totals
      jq -s '
        .[1] as $page
        | reduce $page[] as $pr (.[0];
            .[$pr.user] |= {
              deletions: ((.deletions // 0) + $pr.deletions),
              total_diff: ((.total_diff // 0) + $pr.diff_size),
              pr_count: ((.pr_count // 0) + 1),
              review_comments: ((.review_comments // 0) + $pr.review_comments)
            }
          )
      ' "$CONTRIBUTORS_FILE" "$page_file" > "$TEMP_DIR/contributors_tmp.json"
      mv "$TEMP_DIR/contributors_tmp.json" "$CONTRIBUTORS_FILE"
    fi

    echo " Processed page (found $page_count PRs created in ${CURRENT_YEAR})"

    # Stop if we've gone past our start date (based on creation date): pages
    # are ordered newest first, so later pages only hold older PRs
    if [ "$oldest_created" != "null" ] && [ "$oldest_created" \< "$START_DATE" ]; then
      echo " Reached PRs created before ${CURRENT_YEAR}, stopping..."
      break
    fi

    # Update cursor for next page
    if [ "$has_next_page" = "true" ]; then
      after_cursor="\"$end_cursor\""
      sleep 0.5 # Rate limiting
    fi
  done

  echo " Found $pr_count merged PRs created in ${CURRENT_YEAR}"
  echo ""
done
# Generate statistics
#
# Prints the overall merged-PR count followed by five top-10 rankings built
# from PR_FILE (one record per PR) and CONTRIBUTORS_FILE (totals per author).
echo -e "${BLUE}=== Compiling Statistics ===${NC}\n"

# jq definition shared by the contributor rankings: the contributor entries
# with the github-actions accounts filtered out
contributor_prelude='def contributors:
  to_entries
  | [.[] | select(.key != "github-actions[bot]" and .key != "github-actions")];'

# Total merged PRs
TOTAL_MERGED=$(jq 'length' "$PR_FILE")
echo -e "${YELLOW}Total merged pull requests created in ${CURRENT_YEAR}:${NC} $TOTAL_MERGED"
echo ""

# Top 10 contributors by deleted lines (exclude github-actions)
echo -e "${YELLOW}Top 10 contributors by deleted lines of code:${NC}"
jq -r "$contributor_prelude"'
  contributors
  | [.[] | {user: .key, deletions: .value.deletions}]
  | sort_by(-.deletions)
  | .[:10]
  | .[]
  | "\(.deletions) lines - \(.user)"
' "$CONTRIBUTORS_FILE" | nl
echo ""

# Top 10 smallest PRs by diff size (exclude zero-diff PRs)
echo -e "${YELLOW}Top 10 smallest PRs by diff size (excluding zero diffs):${NC}"
jq -r '
  [.[] | select(.diff_size > 0)]
  | sort_by(.diff_size)
  | .[:10]
  | .[]
  | "\(.diff_size) lines - \(.repo)#\(.number): \(.title)"
' "$PR_FILE" | nl
echo ""

# Top 10 PRs by description size
echo -e "${YELLOW}Top 10 PRs by description size (bytes):${NC}"
jq -r '
  sort_by(-.body_size)
  | .[:10]
  | .[]
  | "\(.body_size) bytes - \(.repo)#\(.number): \(.title)"
' "$PR_FILE" | nl
echo ""

# Top 10 contributors by smallest average PR size (exclude github-actions)
echo -e "${YELLOW}Top 10 contributors by smallest average PR size:${NC}"
jq -r "$contributor_prelude"'
  contributors
  | [.[] | {
      user: .key,
      avg_size: ((.value.total_diff / .value.pr_count) | floor),
      pr_count: .value.pr_count
    }]
  | sort_by(.avg_size)
  | .[:10]
  | .[]
  | "\(.avg_size) avg lines (\(.pr_count) PRs) - \(.user)"
' "$CONTRIBUTORS_FILE" | nl
echo ""

# Top 10 contributors by PR review comment count (exclude github-actions)
echo -e "${YELLOW}Top 10 contributors by PR review comment count:${NC}"
jq -r "$contributor_prelude"'
  contributors
  | [.[] | {user: .key, review_comments: .value.review_comments}]
  | sort_by(-.review_comments)
  | .[:10]
  | .[]
  | "\(.review_comments) review comments - \(.user)"
' "$CONTRIBUTORS_FILE" | nl
echo ""

echo -e "${GREEN}Statistics compilation complete!${NC}"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment