Skip to content

Instantly share code, notes, and snippets.

@JaySon-Huang
Last active October 21, 2025 14:17
Show Gist options
  • Select an option

  • Save JaySon-Huang/d48bd473d5c1ed2ca5b7e608050542c6 to your computer and use it in GitHub Desktop.

Select an option

Save JaySon-Huang/d48bd473d5c1ed2ca5b7e608050542c6 to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import os
import sys
import time
import subprocess
import argparse
from datetime import datetime
from pathlib import Path
def run_git_command(args, cwd=None):
    """Run a git command and return its trimmed standard output.

    Args:
        args: Sequence of arguments passed to ``git`` (without the leading
            ``git`` itself).
        cwd: Directory to run the command in; ``None`` uses the current
            working directory.

    Returns:
        The command's stdout with surrounding whitespace stripped, or ``None``
        when git exits with a non-zero status or cannot be executed at all.
        A diagnostic message is printed in both failure cases.
    """
    try:
        result = subprocess.run(
            ['git'] + list(args),
            cwd=cwd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            encoding='utf-8',
            # Output such as `git blame` echoes file contents, which are not
            # guaranteed to be valid UTF-8. Replace undecodable bytes instead
            # of raising and losing the whole output.
            errors='replace',
        )
    except Exception as e:
        # Deliberately broad: this helper is best-effort and callers treat
        # None as "no data" (e.g. git missing, invalid cwd).
        print(f"Failed to execute git command: {e}")
        return None

    if result.returncode != 0:
        print(f"Git error: {result.stderr.strip()}")
        return None
    return result.stdout.strip()
def is_git_repo():
    """Return True when the current working directory is inside a Git work tree."""
    answer = run_git_command(['rev-parse', '--is-inside-work-tree'])
    # git prints the literal string "true" inside a work tree; a failed
    # command yields None, which compares unequal.
    return answer == 'true'
def get_tracked_files(path_filter=None):
    """Return the paths of all tracked files, optionally limited by a pathspec.

    Args:
        path_filter: Optional pathspec handed to ``git ls-files`` (for example
            ``'src/'``). Falsy values list the whole repository.

    Returns:
        A list of repository-relative paths; empty when nothing matches or the
        git command fails.
    """
    # -z makes git emit NUL-terminated, unquoted paths. Without it, paths that
    # contain non-ASCII or special characters are C-quoted (e.g. "\346\226..."),
    # and such quoted names cannot be passed back to other git commands.
    cmd = ['ls-files', '-z']
    if path_filter:
        # "--" ensures the filter is always treated as a pathspec, never as an option.
        cmd.extend(['--', path_filter])
    output = run_git_command(cmd)
    if not output:
        return []
    return [path for path in output.split('\0') if path]
def get_file_authors(file_path):
    """Return the sorted, de-duplicated author names from a file's history.

    The history is read with ``git log --follow`` so commits made before a
    rename are included. An empty list is returned when the git command fails.
    """
    log_args = ['log', '--follow', '--format=%aN', '--', file_path]
    raw = run_git_command(log_args)
    if raw is None:
        return []

    unique_names = set()
    for line in raw.splitlines():
        name = line.strip()
        if name:
            unique_names.add(name)
    return sorted(unique_names)
def get_git_blame(file_path):
    """Return the ``git blame`` output for a file.

    Args:
        file_path: Path of the file to blame, relative to the working directory.

    Returns:
        The blame text, or an empty string when the git command fails.
    """
    # "--" separates options/revisions from the path, matching the log helpers,
    # so a path that starts with "-" or looks like a revision is not misparsed.
    output = run_git_command(['blame', '--', file_path])
    return "" if output is None else output
def get_git_log(file_path, with_follow):
    """Return the ``git log`` text for a file.

    When ``with_follow`` is truthy the log is produced with ``--follow`` so
    history across renames is included. An empty string is returned when the
    git command fails.
    """
    follow_flag = ['--follow'] if with_follow else []
    log_text = run_git_command(['log'] + follow_flag + ['--', file_path])
    return log_text if log_text is not None else ""
def write_header(file_handle, title, filters=None):
    """Write the standard report header to an open text file.

    The header consists of the title, a "Generated on" timestamp (local time),
    one line per entry in ``filters`` (if any), and a 60-character ``=`` rule.
    """
    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    header_lines = [title, f"Generated on: {timestamp}"]
    if filters:
        header_lines.extend(f"{description}" for description in filters)
    header_lines.append("=" * 60)
    for line in header_lines:
        file_handle.write(line + "\n")
# Authors checked against when --authors is not given on the command line.
_DEFAULT_AUTHORS = [
    "Calvin Neo", "Fu Zhe", "JaySon", "Lloyd-Pottiger", "hongyunyan", "jinhelin", "lidezhu", "yanweiqi", "yibin",
    "Wenxuan", "hzh0425", "iosmanthus", "SeaRise", "Zhigao Tong", "xufei", "xzhangxian1008", "guo-shaoge", "ruoxi",
    "jiaqizho", "Arenatlx", "Flowyi", "hehechen", "Jiarui Li", "Liqi Geng", "Meng Xin", "EasonBall", "ywqzzy",
    "Zequan", "虎", "flow", "Han Fei", "Soup", "Elsa", "Xue Zhenliang", "Yilong Li", "birdstorm", "YangKeao",
    "xiongjiwei", "Schrodinger ZHU Yifan", "Tong Zhigao", "Shenghui Wu", "虚弱球", "Hangjie Mo", "yongman",
    "lei yu", "Mattias Jonsson", "ystaticy", "Liu Cong", "Lynn", "Zhi Qi", "Shenghui Wu", "Zhexuan Yang",
    "Zhuhe Fang", "shuke", "Yu Lei", "wuhuizuo", "bestwoody", "Ning Xu", "Kira Yoshikage", "S1mple", "TONG, Zhigao",
    "tison", "Liangliang Gu", "Annie of the Stars", "HuaiyuXu", "lizhenhuan", "AntiTopQuark", "Iggie Wang", "Xuanyi Li",
    "baishen", "chAnge", "dongjunduo", "likzn", "ds", "EricZequan", "Qiaolin Yu", "Jiang Hua", "yuzhao.jyz", "Jiading Guo",
    "Jack Yu", "Jigao Luo", "nauta", "Rossi Sun", "ChangRui-Ryan",
]

# Files under these path prefixes that have non-specified authors are diverted
# to a separate report that also includes git blame / git log output.
# Kept as a tuple so it can be passed directly to str.startswith().
_MANUAL_EXCLUDE_FILE_PREFIXES = (
    "dbms/pch",
    "dbms/src/Flash/",
    "dbms/src/Storages/DeltaMerge/",
    "dbms/src/Storages/Page/",
    "dbms/src/Storages/KVStore/",
    "dbms/src/Storages/System/StoragesSystemDT",
    "dbms/src/Core/TiFlashDisaggregatedMode.h",
    "dbms/src/Storages/RegionQueryInfo",
    "libs/libclara",
    "libs/libprocess_metrics",
    "libs/libsymbolization",
    "libs/libglibc-compatibility/musl/COPYRIGHT",
    "libs/libmemcpy/impl/LICENSE",
)

_ENTRY_SEPARATOR = "-" * 60 + "\n"
_SECTION_SEPARATOR = "-" * 20 + "\n"


def _write_author_split(handle, file, specified_in_file, others_in_file):
    """Write a file name followed by its specified/other author lines."""
    handle.write(f"File: {file}\n")
    if specified_in_file:
        handle.write(f"Specified Authors: {', '.join(specified_in_file)}\n")
    if others_in_file:
        handle.write(f"Other Authors: {', '.join(others_in_file)}\n")


def _write_history_details(handle, file):
    """Write git blame plus both git log variants (plain and --follow) for a file."""
    handle.write("Git Blame Output:\n")
    handle.write(f"{get_git_blame(file)}\n")
    handle.write(_SECTION_SEPARATOR)
    handle.write("Git Log Output (no follow):\n")
    handle.write(f"{get_git_log(file, with_follow=False)}\n")
    handle.write(_SECTION_SEPARATOR)
    handle.write("Git Log Output (follow rename):\n")
    handle.write(f"{get_git_log(file, with_follow=True)}\n")


def main():
    """Categorize tracked files by authorship and write four text reports.

    Every tracked file (optionally limited by ``--path``) is listed in the
    "all files" report. It is then placed in exactly one of:

    * the "only specified" report, when all of its authors are in the
      specified author list (compared case-insensitively);
    * the "manual exclude" report, when it has other authors and its path
      starts with one of the built-in prefixes; this report also contains
      git blame and git log output for the file;
    * the "with others" report, for every remaining file.

    The author list defaults to a built-in list and can be replaced with one
    or more ``--authors`` flags. Exits with status 1 when no author is
    specified or the current directory is not a Git repository, and with
    status 0 when no tracked files match.
    """
    # Imported here so the function brings its own dependency into scope.
    from contextlib import ExitStack

    parser = argparse.ArgumentParser(
        description="Analyze Git repo files and categorize them based on their authors.",
        formatter_class=argparse.RawTextHelpFormatter,  # Preserve line breaks in help text
    )
    parser.add_argument(
        '--authors',
        type=str,
        action='append',  # Allow multiple --authors flags
        default=None,
        help="""Specify the list of authors to check against (case-insensitive).
Can be used multiple times, e.g., --authors alice --authors bob
When omitted, the built-in author list is used."""
    )
    parser.add_argument(
        '--path',
        type=str,
        help="Filter files within a specific path (e.g., 'src/', 'docs/README.md')."
    )
    parser.add_argument(
        '--output-all',
        type=str,
        default='report_all_files.txt',
        help="Output file path for the list of all files with their authors (default: report_all_files.txt)"
    )
    parser.add_argument(
        '--output-only-specified',
        type=str,
        default='report_files_only_specified_authors.txt',
        help="Output file path for files modified ONLY by the specified authors (default: report_files_only_specified_authors.txt)"
    )
    parser.add_argument(
        '--output-with-others',
        type=str,
        default='report_files_with_other_authors.txt',
        help="Output file path for files modified by other authors (including or excluding specified ones) (default: report_files_with_other_authors.txt)"
    )
    parser.add_argument(
        '--output-with-others-manual-exclude',
        type=str,
        default='report_files_with_other_authors_manual_exclude.txt',
        help="Output file path for files modified by other authors, excluding certain paths (default: report_files_with_other_authors_manual_exclude.txt)"
    )
    args = parser.parse_args()

    # Command-line authors replace (not extend) the built-in list.
    if args.authors is None:
        args.authors = list(_DEFAULT_AUTHORS)

    # Normalize specified author names to lowercase for case-insensitive comparison
    specified_authors_lower = {a.lower().strip() for a in args.authors if a.strip()}
    if not specified_authors_lower:
        print("Error: At least one author must be specified using --authors.")
        sys.exit(1)

    # Check if we're in a Git repo
    if not is_git_repo():
        print("Error: Current directory is not a Git repository.")
        sys.exit(1)

    print("Fetching list of tracked files...")
    # Pass path filter to git ls-files if provided
    files = get_tracked_files(args.path)
    if not files:
        print("No tracked files found matching the criteria.")
        sys.exit(0)

    total_files = len(files)
    print(f"Analyzing authorship for {total_files} files...")

    # Prepare filter descriptions for headers
    filter_descs = [f"Specified Author(s): {', '.join(args.authors)}"]
    if args.path:
        filter_descs.append(f"Path Filter: {args.path}")

    output_path_all = Path(args.output_all)
    output_path_only_specified = Path(args.output_only_specified)
    output_path_with_others = Path(args.output_with_others)
    output_path_with_others_manual_exclude = Path(args.output_with_others_manual_exclude)

    count_only_specified = 0
    count_with_others = 0
    count_with_others_manual_exclude = 0

    # Open every report once for the whole run instead of reopening them in
    # append mode for each file; ExitStack closes all of them on exit.
    with ExitStack() as stack:
        f_all = stack.enter_context(output_path_all.open('w', encoding='utf-8'))
        f_only = stack.enter_context(output_path_only_specified.open('w', encoding='utf-8'))
        f_others = stack.enter_context(output_path_with_others.open('w', encoding='utf-8'))
        f_others_manual = stack.enter_context(
            output_path_with_others_manual_exclude.open('w', encoding='utf-8')
        )

        write_header(f_all, "Report: All Files and Their Authors", filter_descs)
        write_header(f_only, "Report: Files Modified ONLY by Specified Authors", filter_descs)
        write_header(f_others, "Report: Files Modified by Other Authors", filter_descs)
        write_header(f_others_manual, "Report: Files Modified by Other Authors (Manual Exclude)", filter_descs)

        start_time = time.time()
        for idx, file in enumerate(files, 1):
            authors = get_file_authors(file)
            author_names = ", ".join(authors)
            # Normalize file authors to lowercase for comparison
            file_authors_lower = {a.lower().strip() for a in authors if a.strip()}

            # Every file goes into the 'all files' report
            f_all.write(f"File: {file}\n")
            f_all.write(f"Authors: {author_names}\n")
            f_all.write(_ENTRY_SEPARATOR)

            # Case 1: modified ONLY by specified authors, i.e. the file's
            # author set is a non-empty subset of the specified authors.
            if file_authors_lower and file_authors_lower.issubset(specified_authors_lower):
                f_only.write(f"File: {file}\n")
                f_only.write(f"Authors: {author_names}\n")
                f_only.write(_ENTRY_SEPARATOR)
                count_only_specified += 1
            else:
                # Case 2: modified by other authors (only others, or a mix).
                # A file whose author list is empty (e.g. git log failed)
                # also lands here, with no author lines written.
                # `authors` is already sorted and unique, so order is preserved.
                specified_in_file = [a for a in authors if a.lower().strip() in specified_authors_lower]
                others_in_file = [a for a in authors if a.lower().strip() not in specified_authors_lower]

                if file.startswith(_MANUAL_EXCLUDE_FILE_PREFIXES):
                    # Paths under a manual-exclude prefix are kept out of the
                    # regular 'with others' report and written, together with
                    # their blame/log history, to the dedicated report.
                    _write_author_split(f_others_manual, file, specified_in_file, others_in_file)
                    _write_history_details(f_others_manual, file)
                    f_others_manual.write(_ENTRY_SEPARATOR)
                    count_with_others_manual_exclude += 1
                else:
                    _write_author_split(f_others, file, specified_in_file, others_in_file)
                    f_others.write(_ENTRY_SEPARATOR)
                    count_with_others += 1

            # Progress (carriage return rewrites the same console line)
            print(f"\rProcessing: {idx}/{total_files} files", end="", flush=True)
        end_time = time.time()

    print("\n\nAnalysis complete!")
    print(f"Time taken: {end_time - start_time:.3f} seconds")
    print(f"Report for all files saved to: {output_path_all}")
    print(f"Report for files by only specified authors saved to: {output_path_only_specified}")
    print(f"Report for files with other authors saved to: {output_path_with_others}")
    print(f"Report for files with other authors (manual exclude) saved to: {output_path_with_others_manual_exclude}")
    print(f"Files modified ONLY by specified authors: {count_only_specified}")
    print(f"Files modified by other authors (or a mix): {count_with_others}")
    print(f"Files modified by other authors (manual exclude applied): {count_with_others_manual_exclude}")
# Script entry point: run the analysis only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()
@JaySon-Huang
Copy link
Author

Collect the list of authors who have committed since 2019:

git log --since="2019-01-01" --format='%aN' | sort | uniq

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment