|
#!/usr/bin/env python3 |
|
|
|
import os |
|
import sys |
|
import time |
|
import subprocess |
|
import argparse |
|
from datetime import datetime |
|
from pathlib import Path |
|
|
|
|
|
def run_git_command(args, cwd=None):
    """Run ``git`` with the given arguments and return its standard output.

    Args:
        args: Iterable of command-line arguments placed after ``git``.
        cwd: Directory to run the command in; ``None`` uses the current
            working directory of this process.

    Returns:
        The command's stdout with surrounding whitespace stripped, or
        ``None`` when git could not be started or exited with a non-zero
        status. A diagnostic is printed to stderr in both failure cases.
    """
    try:
        result = subprocess.run(
            ['git', *args],
            cwd=cwd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            text=True,
            encoding='utf-8',
            # Blame/log output may contain bytes that are not valid UTF-8;
            # substitute them instead of discarding the whole output.
            errors='replace',
        )
    except (OSError, ValueError, subprocess.SubprocessError) as e:
        # OSError covers a missing git executable or a bad cwd; ValueError
        # covers arguments that cannot be passed to the OS.
        print(f"Failed to execute git command: {e}", file=sys.stderr)
        return None

    if result.returncode != 0:
        print(f"Git error: {result.stderr.strip()}", file=sys.stderr)
        return None
    return result.stdout.strip()
|
|
|
|
|
def is_git_repo():
    """Return True when the current directory is inside a Git work tree."""
    # `git rev-parse --is-inside-work-tree` prints the literal word "true"
    # inside a work tree; a failed command yields None, which compares False.
    inside_work_tree = run_git_command(['rev-parse', '--is-inside-work-tree'])
    return inside_work_tree == 'true'
|
|
|
|
|
def get_tracked_files(path_filter=None):
    """List the files tracked by Git, optionally limited to a pathspec.

    Args:
        path_filter: Optional pathspec passed to ``git ls-files``
            (e.g. ``'src/'``). Falsy values mean "no filter".

    Returns:
        A list of tracked file paths, or an empty list when the git
        command fails or nothing matches.
    """
    # -z makes git emit NUL-terminated, unquoted paths. Without it, names
    # containing non-ASCII or special characters are C-quoted and can no
    # longer be passed back to other git commands verbatim.
    cmd = ['ls-files', '-z']
    if path_filter:
        # "--" keeps a filter that starts with "-" from being read as an option.
        cmd += ['--', path_filter]

    output = run_git_command(cmd)
    if not output:
        return []
    # The final NUL terminator produces one empty trailing field; drop it.
    return [name for name in output.split('\0') if name]
|
|
|
|
|
def get_file_authors(file_path):
    """Return the sorted, de-duplicated author names from a file's history.

    The history is read with ``git log --follow`` so that commits made
    before a rename are included. An empty list is returned when the git
    command fails.
    """
    log_output = run_git_command(['log', '--follow', '--format=%aN', '--', file_path])
    if log_output is None:
        return []

    # One author name per line; collapse duplicates and ignore blank lines.
    unique_names = {line.strip() for line in log_output.splitlines()}
    unique_names.discard('')
    return sorted(unique_names)
|
|
|
|
|
def get_git_blame(file_path):
    """Return the ``git blame`` output for a file.

    Args:
        file_path: Path of the file to blame.

    Returns:
        The blame text, or an empty string when the git command fails.
    """
    # "--" separates the path from options/revisions so that a file name
    # starting with "-" or matching a ref name is still treated as a path.
    output = run_git_command(['blame', '--', file_path])
    return "" if output is None else output
|
|
|
|
|
def get_git_log(file_path, with_follow):
    """Return the ``git log`` text for a file.

    When ``with_follow`` is true the ``--follow`` option is added so the
    history continues across renames. An empty string is returned when the
    git command fails.
    """
    cmd = ['log']
    if with_follow:
        cmd.append('--follow')
    cmd.extend(['--', file_path])

    history = run_git_command(cmd)
    return "" if history is None else history
|
|
|
|
|
def write_header(file_handle, title, filters=None):
    """Write a standard report header to an open text file.

    The header consists of the title, a "Generated on" timestamp (local
    time), one line per filter description, and a 60-character ``=`` rule.

    Args:
        file_handle: Writable text file object.
        title: First line of the header.
        filters: Optional iterable of filter description lines. A single
            string is accepted and written as one line.
    """
    # A bare string would otherwise be iterated character by character.
    if isinstance(filters, str):
        filters = [filters]

    timestamp = datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    lines = [f"{title}\n", f"Generated on: {timestamp}\n"]
    if filters:
        lines.extend(f"{filter_desc}\n" for filter_desc in filters)
    lines.append("=" * 60 + "\n")

    file_handle.write("".join(lines))
|
|
|
|
|
# Authors treated as "specified" when no --authors flag is given.
_DEFAULT_AUTHORS = [
    "Calvin Neo", "Fu Zhe", "JaySon", "Lloyd-Pottiger", "hongyunyan", "jinhelin", "lidezhu", "yanweiqi", "yibin",
    "Wenxuan", "hzh0425", "iosmanthus", "SeaRise", "Zhigao Tong", "xufei", "xzhangxian1008", "guo-shaoge", "ruoxi",
    "jiaqizho", "Arenatlx", "Flowyi", "hehechen", "Jiarui Li", "Liqi Geng", "Meng Xin", "EasonBall", "ywqzzy",
    "Zequan", "虎", "flow", "Han Fei", "Soup", "Elsa", "Xue Zhenliang", "Yilong Li", "birdstorm", "YangKeao",
    "xiongjiwei", "Schrodinger ZHU Yifan", "Tong Zhigao", "Shenghui Wu", "虚弱球", "Hangjie Mo", "yongman",
    "lei yu", "Mattias Jonsson", "ystaticy", "Liu Cong", "Lynn", "Zhi Qi", "Shenghui Wu", "Zhexuan Yang",
    "Zhuhe Fang", "shuke", "Yu Lei", "wuhuizuo", "bestwoody", "Ning Xu", "Kira Yoshikage", "S1mple", "TONG, Zhigao",
    "tison", "Liangliang Gu", "Annie of the Stars", "HuaiyuXu", "lizhenhuan", "AntiTopQuark", "Iggie Wang", "Xuanyi Li",
    "baishen", "chAnge", "dongjunduo", "likzn", "ds", "EricZequan", "Qiaolin Yu", "Jiang Hua", "yuzhao.jyz", "Jiading Guo",
    "Jack Yu", "Jigao Luo", "nauta", "Rossi Sun", "ChangRui-Ryan",
]

# Files with other authors whose path starts with one of these prefixes are
# written to the separate "manual exclude" report (with blame and log
# details) instead of the regular "other authors" report.
_MANUAL_EXCLUDE_PREFIXES = (
    "dbms/pch",
    "dbms/src/Flash/",
    "dbms/src/Storages/DeltaMerge/",
    "dbms/src/Storages/Page/",
    "dbms/src/Storages/KVStore/",
    "dbms/src/Storages/System/StoragesSystemDT",
    "dbms/src/Core/TiFlashDisaggregatedMode.h",
    "dbms/src/Storages/RegionQueryInfo",
    "libs/libclara",
    "libs/libprocess_metrics",
    "libs/libsymbolization",
    "libs/libglibc-compatibility/musl/COPYRIGHT",
    "libs/libmemcpy/impl/LICENSE",
)

_ENTRY_SEPARATOR = "-" * 60 + "\n"
_SECTION_SEPARATOR = "-" * 20 + "\n"


def _write_authors_entry(handle, file_path, author_names):
    """Write a "File / Authors" entry followed by the entry separator."""
    handle.write(f"File: {file_path}\n")
    handle.write(f"Authors: {author_names}\n")
    handle.write(_ENTRY_SEPARATOR)


def _write_author_breakdown(handle, file_path, specified_in_file, others_in_file):
    """Write the file name and its specified/other author lines (no separator)."""
    handle.write(f"File: {file_path}\n")
    if specified_in_file:
        handle.write(f"Specified Authors: {', '.join(specified_in_file)}\n")
    if others_in_file:
        handle.write(f"Other Authors: {', '.join(others_in_file)}\n")


def _write_history(handle, file_path):
    """Write the blame output and both log variants for one file."""
    handle.write("Git Blame Output:\n")
    handle.write(f"{get_git_blame(file_path)}\n")
    handle.write(_SECTION_SEPARATOR)
    handle.write("Git Log Output (no follow):\n")
    handle.write(f"{get_git_log(file_path, with_follow=False)}\n")
    handle.write(_SECTION_SEPARATOR)
    handle.write("Git Log Output (follow rename):\n")
    handle.write(f"{get_git_log(file_path, with_follow=True)}\n")


def main():
    """Categorize tracked files by whether only the specified authors touched them.

    Four reports are written:
      * all files with their authors,
      * files modified only by the specified authors,
      * files that also (or only) have other authors,
      * files with other authors whose path matches a manual-exclude
        prefix, including blame and log output for manual inspection.

    Exits with status 1 when no author is specified or the current
    directory is not a Git repository, and with status 0 when no tracked
    file matches.
    """
    parser = argparse.ArgumentParser(
        description="Analyze Git repo files and categorize them based on their authors.",
        formatter_class=argparse.RawTextHelpFormatter,  # keep manual line breaks in help text
    )
    parser.add_argument(
        '--authors',
        type=str,
        action='append',  # allow multiple --authors flags
        help=(
            "Specify the list of authors to check against (case-insensitive).\n"
            "Can be used multiple times, e.g., --authors alice --authors bob\n"
            "Files will be categorized based on these authors.\n"
            "Defaults to the built-in author list when omitted."
        ),
    )
    parser.add_argument(
        '--path',
        type=str,
        help="Filter files within a specific path (e.g., 'src/', 'docs/README.md').",
    )
    parser.add_argument(
        '--output-all',
        type=str,
        default='report_all_files.txt',
        help="Output file path for the list of all files with their authors (default: report_all_files.txt)",
    )
    parser.add_argument(
        '--output-only-specified',
        type=str,
        default='report_files_only_specified_authors.txt',
        help="Output file path for files modified ONLY by the specified authors (default: report_files_only_specified_authors.txt)",
    )
    parser.add_argument(
        '--output-with-others',
        type=str,
        default='report_files_with_other_authors.txt',
        help="Output file path for files modified by other authors (including or excluding specified ones) (default: report_files_with_other_authors.txt)",
    )
    parser.add_argument(
        '--output-with-others-manual-exclude',
        type=str,
        default='report_files_with_other_authors_manual_exclude.txt',
        help="Output file path for files modified by other authors, excluding certain paths (default: report_files_with_other_authors_manual_exclude.txt)",
    )

    args = parser.parse_args()

    # Fall back to the built-in list when --authors was not given.
    specified_authors = args.authors or _DEFAULT_AUTHORS

    # Normalize specified author names to lowercase for case-insensitive comparison.
    specified_authors_lower = {a.lower().strip() for a in specified_authors if a.strip()}

    if not specified_authors_lower:
        print("Error: At least one author must be specified using --authors.", file=sys.stderr)
        sys.exit(1)

    if not is_git_repo():
        print("Error: Current directory is not a Git repository.", file=sys.stderr)
        sys.exit(1)

    print("Fetching list of tracked files...")
    files = get_tracked_files(args.path)

    if not files:
        print("No tracked files found matching the criteria.")
        sys.exit(0)

    total_files = len(files)
    print(f"Analyzing authorship for {total_files} files...")

    # Filter descriptions repeated in every report header.
    filter_descs = [f"Specified Author(s): {', '.join(specified_authors)}"]
    if args.path:
        filter_descs.append(f"Path Filter: {args.path}")

    output_path_all = Path(args.output_all)
    output_path_only_specified = Path(args.output_only_specified)
    output_path_with_others = Path(args.output_with_others)
    output_path_with_others_manual_exclude = Path(args.output_with_others_manual_exclude)

    count_only_specified = 0
    count_with_others = 0
    count_with_others_manual_exclude = 0

    # Open every report once and keep it open for the whole run instead of
    # reopening each file in append mode for every tracked file.
    with output_path_all.open('w', encoding='utf-8') as f_all, \
            output_path_only_specified.open('w', encoding='utf-8') as f_only, \
            output_path_with_others.open('w', encoding='utf-8') as f_others, \
            output_path_with_others_manual_exclude.open('w', encoding='utf-8') as f_others_manual:
        write_header(f_all, "Report: All Files and Their Authors", filter_descs)
        write_header(f_only, "Report: Files Modified ONLY by Specified Authors", filter_descs)
        write_header(f_others, "Report: Files Modified by Other Authors", filter_descs)
        write_header(f_others_manual, "Report: Files Modified by Other Authors (Manual Exclude)", filter_descs)

        start_time = time.perf_counter()
        for idx, file_path in enumerate(files, 1):
            authors = get_file_authors(file_path)
            author_names = ", ".join(authors)
            # Normalize file authors to lowercase for comparison.
            file_authors_lower = {a.lower().strip() for a in authors if a.strip()}

            _write_authors_entry(f_all, file_path, author_names)

            if file_authors_lower and file_authors_lower.issubset(specified_authors_lower):
                # Case 1: every (non-empty set of) author is a specified author.
                _write_authors_entry(f_only, file_path, author_names)
                count_only_specified += 1
            else:
                # Case 2: only other authors, a mix of specified and other
                # authors, or no author information at all.
                specified_in_file = sorted(
                    {a for a in authors if a.lower().strip() in specified_authors_lower}
                )
                others_in_file = sorted(
                    {a for a in authors if a.lower().strip() not in specified_authors_lower}
                )

                if file_path.startswith(_MANUAL_EXCLUDE_PREFIXES):
                    # Manually excluded paths go to their own report, with
                    # blame and log output attached for manual review.
                    _write_author_breakdown(f_others_manual, file_path, specified_in_file, others_in_file)
                    _write_history(f_others_manual, file_path)
                    f_others_manual.write(_ENTRY_SEPARATOR)
                    count_with_others_manual_exclude += 1
                else:
                    _write_author_breakdown(f_others, file_path, specified_in_file, others_in_file)
                    f_others.write(_ENTRY_SEPARATOR)
                    count_with_others += 1

            # Single-line progress indicator, rewritten in place.
            print(f"\rProcessing: {idx}/{total_files} files", end="", flush=True)

        end_time = time.perf_counter()

    print("\n\nAnalysis complete!")
    print(f"Time taken: {end_time - start_time:.3f} seconds")
    print(f"Report for all files saved to: {output_path_all}")
    print(f"Report for files by only specified authors saved to: {output_path_only_specified}")
    print(f"Report for files with other authors saved to: {output_path_with_others}")
    print(f"Report for files with other authors (manual exclude) saved to: {output_path_with_others_manual_exclude}")
    print(f"Files modified ONLY by specified authors: {count_only_specified}")
    print(f"Files modified by other authors (or a mix): {count_with_others}")
    print(f"Files modified by other authors (manual exclude applied): {count_with_others_manual_exclude}")
|
|
|
|
|
# Run the analysis only when this file is executed as a script, not when
# it is imported as a module.
if __name__ == '__main__':
    main()
# Collect the authors since the year 2019.