Created November 2, 2025 10:37
Create backdated git commits based on file change dates
#!/usr/bin/env python3
# -*- coding: utf-8 -*-
"""
git_historic_commit.py — import a tree and create backdated commits.

Behavior
- Scans the source directory (which may live outside or inside the repo).
- Copies files into the repo under --dest (default: repo root '.').
- Commit order/dates follow source mtimes.
- Filters match *source-relative* POSIX paths (Unicode NFC).
"""
from __future__ import annotations
import argparse, fnmatch, os, shutil, subprocess, sys, unicodedata
from dataclasses import dataclass
from datetime import datetime, timezone
from pathlib import Path
from typing import Iterable, List
@dataclass
class FileInfo:
    src_rel: Path   # relative to source root
    src_abs: Path   # absolute source
    mtime_ns: int
    dst_rel: Path   # destination path inside repo (relative to repo root)
def run(cmd: List[str], check=True, capture=False, env=None, cwd: Path | None = None):
    return subprocess.run(
        cmd, check=check, text=True, capture_output=capture, env=env, cwd=str(cwd) if cwd else None
    )
def git_root() -> Path:
    out = run(["git", "rev-parse", "--show-toplevel"], capture=True)
    return Path(out.stdout.strip())
def is_git_repo() -> bool:
    try:
        git_root()
        return True
    except subprocess.CalledProcessError:
        return False
def nfc(s: str) -> str:
    return unicodedata.normalize("NFC", s)
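# NFC normalization matters because macOS commonly reports filenames in decomposed
# form (NFD); normalizing both paths and patterns (e.g. nfc("e\u0301") == "\u00e9")
# keeps the globs portable across platforms.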
def expand_globs(patterns: Iterable[str]) -> List[str]:
    """For '**/name.ext' also add 'name.ext'. Strip a leading './'."""
    out: List[str] = []
    for p in patterns:
        # Only strip a literal './' prefix; str.lstrip('./') would also eat leading
        # dots from patterns such as '.env*'.
        if p.startswith("./"):
            p = p[2:]
        out.append(p)
        if p.startswith("**/"):
            out.append(p[3:])  # also match root files
    return out
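# e.g. expand_globs(["**/*.log"]) -> ["**/*.log", "*.log"]: the extra "*.log" is
# needed because fnmatch treats the '/' in '**/' literally, so '**/*.log' never
# matches files sitting directly in the source root.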
def match_any_glob(rel_posix: str, patterns: Iterable[str], case_insensitive: bool) -> bool:
    if not patterns:
        return False
    s = rel_posix if not case_insensitive else rel_posix.lower()
    for pat in patterns:
        p = pat if not case_insensitive else pat.lower()
        if fnmatch.fnmatchcase(s, p):
            return True
    return False
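# e.g. match_any_glob("docs/notes.md", ["**/*.md"], False) -> True; with
# case_insensitive=True both the path and each pattern are lowercased before matching.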
def iter_src_files(base: Path):
    for p in base.rglob("*"):
        if p.is_file():
            yield p
def to_iso8601_z(ts_ns: int) -> str:
    seconds = ts_ns // 1_000_000_000
    dt = datetime.fromtimestamp(seconds, tz=timezone.utc).astimezone()
    return dt.isoformat(timespec="seconds")
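# Note: despite the "_z" suffix this returns *local* time with a UTC offset
# (e.g. "2025-11-02T10:37:00+01:00"), which git accepts as an ISO 8601 date.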
def group_by_mtime(files: List[FileInfo], window_seconds: int) -> List[List[FileInfo]]:
    if not files:
        return []
    files.sort(key=lambda f: f.mtime_ns)
    groups: List[List[FileInfo]] = []
    cur = [files[0]]
    anchor = files[0].mtime_ns
    window_ns = window_seconds * 1_000_000_000
    for f in files[1:]:
        if f.mtime_ns - anchor <= window_ns:
            cur.append(f)
        else:
            groups.append(cur)
            cur = [f]
            anchor = f.mtime_ns
    groups.append(cur)
    return groups
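# Example: with window_seconds=60, mtimes at t, t+10s and t+3600s produce two groups,
# [t, t+10s] and [t+3600s]; the window is anchored at the first file of each group and
# does not slide with every new member.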
def plan(src_root: Path, repo_root: Path, dest_root_rel: Path,
         excludes: List[str], respect_gitignore: bool,
         exclude_dirs: List[str], include_only: List[str],
         includes: List[str],  # force-include overrides exclude
         ignore_case: bool, debug_filters: bool) -> List[FileInfo]:
    src_root = src_root.resolve()
    dest_root_rel = Path(dest_root_rel.as_posix().rstrip("/")) if dest_root_rel else Path(".")
    items: List[FileInfo] = []
    # Normalize + expand patterns (so '**/*.ext' also matches '*.ext')
    ex_files = [nfc(p) for p in expand_globs(excludes)]
    ex_dirs = [nfc(p) for p in expand_globs(exclude_dirs)]
    inc_only = [nfc(p) for p in expand_globs(include_only)]
    inc_override = [nfc(p) for p in expand_globs(includes)]
    # For case-insensitive matching, lowercase the patterns up front so they line up
    # with the lowercased relative paths built below.
    if ignore_case:
        ex_files = [p.lower() for p in ex_files]
        ex_dirs = [p.lower() for p in ex_dirs]
        inc_only = [p.lower() for p in inc_only]
        inc_override = [p.lower() for p in inc_override]
    for src_abs in iter_src_files(src_root):
        try:
            src_rel = src_abs.relative_to(src_root)
        except Exception:
            src_rel = Path(os.path.relpath(src_abs, src_root))
        rel_posix = nfc(src_rel.as_posix())
        key = rel_posix if not ignore_case else rel_posix.lower()
        # 1) directory-level exclude (on any parent)
        skip_dir = any(
            fnmatch.fnmatchcase(
                (p.as_posix() if not ignore_case else p.as_posix().lower()), pat
            )
            for pat in ex_dirs
            for p in [src_rel] + list(src_rel.parents)
        )
        if skip_dir:
            if debug_filters: print(f"[skip:dir] {rel_posix}")
            continue
        # 2) include-only gate (whitelist)
        if inc_only and not match_any_glob(key, inc_only, False):
            if debug_filters: print(f"[skip:inc] {rel_posix}")
            continue
        # 3) file-level excludes (blacklist) …
        excluded = match_any_glob(key, ex_files, False)
        # 4) … with include override to re-include specific matches
        if excluded and match_any_glob(key, inc_override, False):
            excluded = False
            if debug_filters: print(f"[reinc] {rel_posix}")
        if excluded:
            if debug_filters: print(f"[skip:exc] {rel_posix}")
            continue
        try:
            st = src_abs.stat()
        except FileNotFoundError:
            continue
        dst_rel = (dest_root_rel / src_rel)
        items.append(FileInfo(src_rel=src_rel, src_abs=src_abs, mtime_ns=st.st_mtime_ns, dst_rel=dst_rel))
    if not items:
        return []
    if respect_gitignore:
        proc = subprocess.Popen(
            ["git", "check-ignore", "--stdin"],
            cwd=repo_root, stdin=subprocess.PIPE, stdout=subprocess.PIPE,
            stderr=subprocess.DEVNULL, text=True
        )
        stdin = "\n".join(i.dst_rel.as_posix() for i in items)
        stdout, _ = proc.communicate(stdin)
        ignored = set(stdout.splitlines())
        kept = []
        for i in items:
            if i.dst_rel.as_posix() in ignored:
                if debug_filters: print(f"[skip:.gi] {i.dst_rel.as_posix()}")
            else:
                kept.append(i)
        items = kept
    return items
def has_pending_changes(repo: Path) -> bool:
    proc = run(["git", "status", "--porcelain"], capture=True, cwd=repo)
    return bool(proc.stdout.strip())
def ensure_parent(dst_abs: Path):
    dst_abs.parent.mkdir(parents=True, exist_ok=True)
def copy_preserve_mtime(src: Path, dst: Path):
    ensure_parent(dst)
    shutil.copy2(src, dst)
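# shutil.copy2 also copies metadata (including mtime), so the working-tree files keep
# the timestamps the commits are dated with.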
def git_check_ignored(repo: Path, rel_paths: List[str]) -> List[str]:
    if not rel_paths:
        return []
    proc = subprocess.Popen(
        ["git", "check-ignore", "--stdin"],
        cwd=repo, stdin=subprocess.PIPE, stdout=subprocess.PIPE, stderr=subprocess.DEVNULL, text=True
    )
    stdout, _ = proc.communicate("\n".join(rel_paths))
    return [line.strip() for line in stdout.splitlines()]
def main() -> int:
    ap = argparse.ArgumentParser(description="Create backdated commits by importing a directory tree.")
    ap.add_argument("source", type=Path, help="Path to the *source* directory to import (may be outside the repo).")
    ap.add_argument("--dest", type=Path, default=None,
                    help="Destination subdir *inside the repo* (default: repo root '.').")
    ap.add_argument("--exclude", action="append", default=[], help="Glob (source-relative) to exclude; repeatable.")
    ap.add_argument("--exclude-dir", action="append", default=[], help="Directory globs to exclude whole subtrees.")
    ap.add_argument("--include", action="append", default=[],
                    help="Glob to *force-include* files even if they match an exclude; repeatable.")
    ap.add_argument("--include-only", action="append", default=[], help="If set, only files matching any of these globs pass.")
    ap.add_argument("--ignore-case", action="store_true", help="Make include/exclude matching case-insensitive.")
    ap.add_argument("--debug-filters", action="store_true", help="Print why files are included/excluded.")
    ap.add_argument("--group-seconds", type=int, default=0,
                    help="Group files with mtimes within N seconds into a single commit.")
    ap.add_argument("--respect-gitignore", action="store_true",
                    help="Skip files whose *destination* would be ignored by .gitignore.")
    ap.add_argument("--force-add-ignored", action="store_true",
                    help="Force-add files Git would normally ignore (uses 'git add -f').")
    ap.add_argument("--message", default="historical import",
                    help="Commit message prefix (timestamp appended).")
    ap.add_argument("--dry-run", action="store_true", help="Plan only; do not copy or commit.")
    ap.add_argument("--verbose", action="store_true", help="Print copy/add/commit operations as they happen.")
    args = ap.parse_args()
    if not is_git_repo():
        print("Error: not inside a git repository.", file=sys.stderr)
        return 2
    repo = git_root()
    os.chdir(repo)  # run all git ops at repo root
    src_root = args.source.resolve()
    if not src_root.exists() or not src_root.is_dir():
        print(f"Error: source does not exist or is not a directory: {src_root}", file=sys.stderr)
        return 2
    # DEFAULT DEST: repo root ('.') unless explicitly provided
    dest_rel = args.dest if args.dest is not None else Path(".")
    if dest_rel.is_absolute():
        print("--dest must be a path inside the repo (relative).", file=sys.stderr)
        return 2
    items = plan(
        src_root, repo,
        dest_rel,
        args.exclude,
        args.respect_gitignore,
        args.exclude_dir,
        args.include_only,
        args.include,  # include overrides
        args.ignore_case,
        args.debug_filters,
    )
    if not items:
        print("Nothing to import (after filtering).")
        return 0
    groups = group_by_mtime(items, args.group_seconds)
    total = sum(len(g) for g in groups)
    display_prefix = "" if dest_rel.as_posix() in (".", "") else (dest_rel.as_posix() + "/")
    print(f"Planned commits: {len(groups)} (total files: {total})")
    for idx, grp in enumerate(groups, 1):
        ts_iso = to_iso8601_z(max(f.mtime_ns for f in grp))
        print(f"[{idx:04d}] {len(grp):4d} file(s) @ {ts_iso}")
        for f in grp[:5]:
            print(f" - {display_prefix}{f.src_rel.as_posix()}")
        if len(grp) > 5:
            print(f" ... +{len(grp)-5} more")
    if args.dry_run:
        return 0
    if has_pending_changes(repo):
        print("Error: working tree not clean. Commit or stash your changes first.", file=sys.stderr)
        return 2
    for i, grp in enumerate(groups, 1):
        ts_ns = max(f.mtime_ns for f in grp)
        ts_iso = to_iso8601_z(ts_ns)
        for f in grp:
            dst_abs = (repo / f.dst_rel)
            if args.verbose:
                print(f"[copy] {f.src_abs} -> {dst_abs}")
            copy_preserve_mtime(f.src_abs, dst_abs)
        paths = [str(f.dst_rel) for f in grp]
        ignored_now = git_check_ignored(repo, paths)
        if ignored_now and not args.force_add_ignored and args.verbose:
            for p in ignored_now:
                print(f"[info] '{p}' is ignored by .gitignore; not forcing add (use --force-add-ignored to include).")
        add_cmd = ["git", "add"]
        if args.force_add_ignored:
            add_cmd.append("-f")
        add_cmd += ["--"] + paths
        if args.verbose:
            print(f"[add ] {add_cmd}")
        run(add_cmd, cwd=repo)
        proc = run(["git", "diff", "--cached", "--quiet"], check=False, cwd=repo)
        if proc.returncode == 0:
            if args.verbose:
                print(f"[skip] nothing changed for group #{i}; unstaging {len(paths)} path(s)")
            run(["git", "reset", "--"] + paths, check=False, cwd=repo)
            continue
        env = os.environ.copy()
        env["GIT_AUTHOR_DATE"] = ts_iso
        env["GIT_COMMITTER_DATE"] = ts_iso
        msg = f"{args.message}: {len(grp)} file(s) up to {ts_iso}"
        if args.verbose:
            print(f"[commit] {msg}")
        run(["git", "commit", "-m", msg, "--no-gpg-sign", "--date", ts_iso], env=env, cwd=repo)
        print(f"Committed [{i}/{len(groups)}]: {len(grp)} file(s) @ {ts_iso}")
    print("Done.")
    return 0
if __name__ == "__main__":
    sys.exit(main())