Created
February 16, 2026 12:02
-
-
Save minhvt22/9e91b0f201e51f32e926bb6cd2c11e48 to your computer and use it in GitHub Desktop.
generate_llms_txt.py — Generate an llms.txt file for a codebase.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| generate_llms_txt.py — Generate an llms.txt file for a codebase. | |
| Usage: | |
| python generate_llms_txt.py [ROOT_DIR] [OPTIONS] | |
| Options: | |
| --output FILE Output file (default: llms.txt) | |
| --max-file-kb N Skip files larger than N KB (default: 500) | |
| --include-ext Comma-separated extra extensions to include | |
| --exclude-ext Comma-separated extensions to exclude | |
| --no-content Only list file tree, don't embed file contents | |
| Examples: | |
| python generate_llms_txt.py . | |
| python generate_llms_txt.py ~/myproject --output context.txt --max-file-kb 100 | |
| python generate_llms_txt.py . --include-ext .env,.cfg --exclude-ext .min.js | |
| """ | |
| import argparse | |
| import os | |
| import subprocess | |
| import sys | |
| from pathlib import Path | |
| # --------------------------------------------------------------------------- | |
| # Defaults | |
| # --------------------------------------------------------------------------- | |
# Output file name used when --output is not supplied on the command line.
DEFAULT_OUTPUT = "llms.txt"
# Per-file size cap, in KB, used when --max-file-kb is not supplied.
DEFAULT_MAX_FILE_KB = 500
# Extensions treated as text / source code by default.
# collect_files() looks up the lowercased file suffix in this set, falling
# back to the whole lowercased file name when the file has no suffix, which
# is why dot-file names such as ".gitignore" appear here as "extensions".
TEXT_EXTENSIONS = {
    # Web
    ".html",
    ".htm",
    ".css",
    ".scss",
    ".sass",
    ".less",
    ".js",
    ".jsx",
    ".ts",
    ".tsx",
    ".vue",
    ".svelte",
    # Backend
    ".py",
    ".rb",
    ".php",
    ".java",
    ".kt",
    ".scala",
    ".go",
    ".rs",
    ".c",
    ".cpp",
    ".cc",
    ".h",
    ".hpp",
    ".cs",
    ".swift",
    ".m",
    ".ex",
    ".exs",
    ".erl",
    ".hs",
    ".lua",
    ".r",
    ".R",
    ".jl",
    # Shell / scripting
    ".sh",
    ".bash",
    ".zsh",
    ".fish",
    ".ps1",
    ".bat",
    ".cmd",
    # Config / data
    ".json",
    ".yaml",
    ".yml",
    ".toml",
    ".ini",
    ".cfg",
    ".conf",
    ".xml",
    ".env",
    ".env.example",
    ".properties",
    # Docs / markup
    ".md",
    ".mdx",
    ".rst",
    ".txt",
    ".adoc",
    # Database
    ".sql",
    ".graphql",
    ".gql",
    # Build / infra
    ".dockerfile",
    ".tf",
    ".hcl",
    ".nix",
    # Misc
    ".lock",
    ".gitignore",
    ".gitattributes",
    ".editorconfig",
    ".eslintrc",
    ".prettierrc",
    ".babelrc",
}
# Always skip these regardless of extension.
# NOTE(review): collect_files() only compares this set against directory
# names while pruning the walk; file-like entries (".DS_Store", "Thumbs.db",
# ".coverage") are not matched against file names there.
ALWAYS_SKIP_NAMES = {
    "node_modules",
    ".git",
    ".hg",
    ".svn",
    "__pycache__",
    ".mypy_cache",
    ".ruff_cache",
    ".pytest_cache",
    ".tox",
    "venv",
    ".venv",
    "env",
    "dist",
    "build",
    "out",
    ".next",
    ".nuxt",
    "coverage",
    ".coverage",
    ".nyc_output",
    ".DS_Store",
    "Thumbs.db",
}
| # --------------------------------------------------------------------------- | |
| # .gitignore helpers | |
| # --------------------------------------------------------------------------- | |
def load_gitignore_patterns(root: Path) -> list[str]:
    """Return a flat list of patterns from all .gitignore files in the tree.

    Every pattern is rewritten so that it is expressed relative to *root*,
    always using forward slashes:

    * ``foo`` in ``sub/.gitignore`` becomes ``sub/foo``.
    * ``/foo`` (anchored to the .gitignore's own directory) in
      ``sub/.gitignore`` becomes ``sub/foo``; at the root it becomes ``foo``.
    * Patterns starting with ``*`` are kept verbatim (treated as global).
    * A leading ``!`` (negation) is preserved in front of the rewritten
      pattern instead of being buried behind the directory prefix.

    Blank lines and ``#`` comments are dropped. A .gitignore that cannot be
    read is skipped rather than aborting the whole scan.

    Args:
        root: Directory whose tree is searched for ``.gitignore`` files.

    Returns:
        The rewritten patterns, in the order the files were discovered.
    """
    patterns: list[str] = []
    for gitignore in root.rglob(".gitignore"):
        # rglob only yields paths below root, so relative_to cannot fail.
        rel_dir = gitignore.parent.relative_to(root).as_posix()
        prefix = "" if rel_dir == "." else f"{rel_dir}/"

        try:
            text = gitignore.read_text(encoding="utf-8", errors="replace")
        except OSError:
            # Unreadable file (permissions, or a directory named .gitignore).
            continue

        for raw_line in text.splitlines():
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue

            # Peel off the negation marker so the prefix lands after it.
            negated = line.startswith("!")
            if negated:
                line = line[1:]
                if not line:
                    continue

            if line.startswith("/"):
                # Anchored to the directory holding this .gitignore.
                pattern = prefix + line.lstrip("/")
            elif line.startswith("*"):
                # Wildcard-led patterns are kept unanchored.
                pattern = line
            else:
                pattern = prefix + line

            patterns.append(f"!{pattern}" if negated else pattern)
    return patterns
def is_ignored_by_git(path: Path, root: Path) -> bool:
    """Ask git whether *path* is ignored (handles nested .gitignore files).

    Runs ``git check-ignore -q`` with *root* as the working directory; one
    git process is spawned per call.

    Args:
        path: Path to test; expected to live under *root*.
        root: Directory git is run from.

    Returns:
        True only when git exits with status 0 (the path is ignored).
        False when the path is not ignored, lies outside *root*, or git
        could not be run at all.
    """
    try:
        rel = path.relative_to(root)
    except ValueError:
        # A path outside root cannot be governed by root's ignore rules.
        return False

    try:
        result = subprocess.run(
            # "--" stops option parsing so a path starting with "-" is safe.
            ["git", "check-ignore", "-q", "--", str(rel)],
            cwd=root,
            capture_output=True,
            check=False,
        )
    except (OSError, subprocess.SubprocessError):
        # git missing, cwd missing/unreadable, or the process failed to run.
        return False
    return result.returncode == 0
def git_available(root: Path) -> bool:
    """Report whether git can be run and *root* sits inside a git repository.

    Probes with ``git rev-parse --git-dir`` executed from *root*; a zero exit
    status means yes. A missing git executable (or missing *root*) surfaces
    as FileNotFoundError and is reported as "not available".
    """
    probe_cmd = ["git", "rev-parse", "--git-dir"]
    try:
        completed = subprocess.run(probe_cmd, cwd=root, capture_output=True)
    except FileNotFoundError:
        return False
    else:
        return completed.returncode == 0
| # --------------------------------------------------------------------------- | |
| # File collection | |
| # --------------------------------------------------------------------------- | |
# Hidden files that are kept even though dot-files are skipped by default.
_ALLOWED_DOTFILES = frozenset(
    {
        ".gitignore",
        ".env.example",
        ".editorconfig",
        ".eslintrc",
        ".prettierrc",
        ".babelrc",
    }
)

# Lowercased file names that are included whatever their extension is.
_SPECIAL_FILENAMES = frozenset(
    {
        "makefile",
        "dockerfile",
        "procfile",
        "rakefile",
        "gemfile",
        "cmakelists.txt",
        "cargo.toml",
        "go.mod",
        "go.sum",
        "pyproject.toml",
        "setup.cfg",
        "setup.py",
    }
)


def collect_files(
    root: Path,
    max_bytes: int,
    include_ext: set[str],
    exclude_ext: set[str],
    use_git: bool,
) -> list[Path]:
    """Walk *root* and return the sorted list of files to include.

    A file is kept when all of the following hold:

    * its lowercased name does not end with any entry of *exclude_ext*
      (so multi-part endings such as ``.min.js`` work, and exclusions also
      override the special-filename list);
    * it is not a hidden dot-file, unless its suffix is an accepted
      extension or its name is on the dot-file allow-list;
    * its suffix (or its whole name, for suffix-less files and multi-dot
      names like ``.env.example``) is an accepted extension, or its name is
      a well-known special file (Makefile, Dockerfile, ...);
    * it can be stat'ed and is no larger than *max_bytes*;
    * git does not report it as ignored (only when *use_git* is true).

    Directories named in ALWAYS_SKIP_NAMES, or ignored by git, are pruned
    from the walk entirely.

    Args:
        root: Directory to scan.
        max_bytes: Largest file size, in bytes, that is still included.
        include_ext: Extra extensions (".ext" form) accepted on top of
            TEXT_EXTENSIONS.
        exclude_ext: Extensions (".ext" form) that are never accepted.
        use_git: Whether to consult ``git check-ignore`` for each candidate.

    Returns:
        Matching paths, sorted by their path relative to *root*.
    """
    valid_ext = (TEXT_EXTENSIONS | include_ext) - exclude_ext
    # str.endswith takes a tuple; empty strings are dropped because every
    # name "ends with" the empty string.
    excluded_endings = tuple(e.lower() for e in exclude_ext if e)
    collected: list[Path] = []

    for dirpath, dirnames, filenames in os.walk(root, topdown=True):
        current = Path(dirpath)

        # Prune directories in-place (this is what steers os.walk).
        dirnames[:] = [
            d
            for d in dirnames
            if d not in ALWAYS_SKIP_NAMES
            and not (use_git and is_ignored_by_git(current / d, root))
        ]

        for fname in filenames:
            fpath = current / fname
            lower_name = fname.lower()
            suffix = fpath.suffix.lower()

            # Explicit exclusions win over every inclusion rule below.
            if excluded_endings and lower_name.endswith(excluded_endings):
                continue

            # Skip dot-files (hidden) unless they're common config files.
            if (
                fname.startswith(".")
                and suffix not in valid_ext
                and fname not in _ALLOWED_DOTFILES
            ):
                continue

            # Extension check; suffix-less files (Dockerfile, .gitignore)
            # are looked up by their whole name instead.
            ext = suffix or lower_name
            if (
                ext not in valid_ext
                and lower_name not in valid_ext  # e.g. ".env.example"
                and lower_name not in _SPECIAL_FILENAMES
            ):
                continue

            # Size check; files that vanish or cannot be stat'ed are skipped.
            try:
                size = fpath.stat().st_size
            except OSError:
                continue
            if size > max_bytes:
                continue

            # gitignore check last: it spawns a process, so run it only for
            # files that survived the cheap filters.
            if use_git and is_ignored_by_git(fpath, root):
                continue

            collected.append(fpath)

    return sorted(collected, key=lambda p: p.relative_to(root))
| # --------------------------------------------------------------------------- | |
| # Output generation | |
| # --------------------------------------------------------------------------- | |
def file_tree(files: list[Path], root: Path) -> str:
    """Render *files* as an indented text tree relative to *root*.

    Each directory is printed once, the first time a file beneath it is
    encountered, followed by the file itself. Indentation is one space per
    nesting level. Entries appear in the order *files* is given.
    """
    rendered: list[str] = []
    announced: set[Path] = set()

    for file_path in files:
        parts = file_path.relative_to(root).parts
        dir_names, leaf = parts[:-1], parts[-1]

        ancestor = root
        for level, name in enumerate(dir_names):
            ancestor = ancestor / name
            if ancestor in announced:
                continue
            announced.add(ancestor)
            rendered.append(f"{' ' * level}📁 {name}/")

        rendered.append(f"{' ' * len(dir_names)}📄 {leaf}")

    return "\n".join(rendered)
def _fence_for(content: str) -> str:
    """Return a backtick fence longer than any backtick run in *content*."""
    longest = run = 0
    for ch in content:
        run = run + 1 if ch == "`" else 0
        if run > longest:
            longest = run
    return "`" * max(3, longest + 1)


def generate(
    root: Path,
    output: Path,
    max_bytes: int,
    include_ext: set[str],
    exclude_ext: set[str],
    no_content: bool,
) -> None:
    """Scan *root* and write the llms.txt document to *output*.

    The document has a small header, a file tree, and (unless *no_content*
    is true) the contents of every collected file in a fenced code block.
    Progress and a final size summary are printed to stdout.

    Args:
        root: Directory to scan.
        output: File to (over)write with the generated document.
        max_bytes: Files larger than this many bytes are left out.
        include_ext: Extra extensions to accept.
        exclude_ext: Extensions to reject.
        no_content: When true, write only the header and file tree.
    """
    use_git = git_available(root)
    if use_git:
        print("✓ Git detected — respecting .gitignore via `git check-ignore`")
    else:
        print("⚠ No git repo found — .gitignore patterns not applied")
    print(f" Scanning {root} …", end="", flush=True)
    files = collect_files(root, max_bytes, include_ext, exclude_ext, use_git)

    # A previous run's output inside root would otherwise be collected and
    # then read back while it is being overwritten.
    output_resolved = output.resolve()
    files = [f for f in files if f.resolve() != output_resolved]
    print(f" {len(files)} files found")

    skipped = 0
    with output.open("w", encoding="utf-8") as out:
        # ── Header ──────────────────────────────────────────────────────
        resolved_root = root.resolve()
        out.write(f"# llms.txt — {resolved_root.name}\n")
        out.write(f"# Generated from: {resolved_root}\n")
        out.write(f"# Files included: {len(files)}\n")
        out.write(f"# .gitignore respected: {use_git}\n\n")

        # ── File tree ───────────────────────────────────────────────────
        out.write("## File Tree\n\n")
        out.write("```\n")
        out.write(file_tree(files, root))
        out.write("\n```\n\n")

        # ── File contents ───────────────────────────────────────────────
        if not no_content:
            out.write("## File Contents\n\n")
            for fpath in files:
                rel = fpath.relative_to(root)
                try:
                    content = fpath.read_text(encoding="utf-8", errors="replace")
                except OSError as exc:
                    out.write(f"### {rel}\n\n*Could not read file: {exc}*\n\n")
                    skipped += 1
                    continue
                ext = fpath.suffix.lstrip(".") or "text"
                # Lengthen the fence when the file itself contains ``` so
                # the embedded block cannot be closed prematurely.
                fence = _fence_for(content)
                out.write(f"### {rel}\n\n")
                out.write(f"{fence}{ext}\n")
                out.write(content)
                if not content.endswith("\n"):
                    out.write("\n")
                out.write(f"{fence}\n\n")

    # Summary is printed for every mode, including --no-content.
    if skipped:
        print(f"⚠ {skipped} file(s) could not be read (see output for details)")
    size_kb = output.stat().st_size / 1024
    print(f"✓ Written → {output} ({size_kb:,.1f} KB)")
| # --------------------------------------------------------------------------- | |
| # CLI | |
| # --------------------------------------------------------------------------- | |
def parse_ext_arg(raw: str) -> set[str]:
    """Turn a comma-separated extension list into a set of '.ext' strings.

    Surrounding whitespace is trimmed, empty items are dropped, and a
    leading dot is added to any item that lacks one.
    """
    trimmed = (piece.strip() for piece in raw.split(","))
    return {
        item if item.startswith(".") else f".{item}"
        for item in trimmed
        if item
    }
def main() -> None:
    """Command-line entry point: parse arguments and run generate().

    Exits with an error message when the given root is not a directory.
    A relative --output path is interpreted against the current working
    directory.
    """
    cli = argparse.ArgumentParser(
        description="Generate llms.txt for a codebase, respecting .gitignore.",
        epilog=__doc__,
        formatter_class=argparse.RawDescriptionHelpFormatter,
    )
    cli.add_argument("root", nargs="?", default=".", help="Root directory (default: .)")
    cli.add_argument("--output", default=DEFAULT_OUTPUT, help="Output file path")
    cli.add_argument(
        "--max-file-kb",
        type=int,
        default=DEFAULT_MAX_FILE_KB,
        help=f"Skip files larger than N KB (default: {DEFAULT_MAX_FILE_KB})",
    )
    cli.add_argument(
        "--include-ext",
        default="",
        help="Extra extensions to include, e.g. '.csv,.log'",
    )
    cli.add_argument(
        "--exclude-ext",
        default="",
        help="Extensions to exclude, e.g. '.min.js,.map'",
    )
    cli.add_argument(
        "--no-content",
        action="store_true",
        help="Only write the file tree, skip file contents",
    )
    opts = cli.parse_args()

    root_dir = Path(opts.root).resolve()
    if not root_dir.is_dir():
        sys.exit(f"Error: '{root_dir}' is not a directory.")

    requested = Path(opts.output)
    out_path = requested if requested.is_absolute() else Path.cwd() / requested

    generate(
        root=root_dir,
        output=out_path,
        max_bytes=opts.max_file_kb * 1024,
        include_ext=parse_ext_arg(opts.include_ext),
        exclude_ext=parse_ext_arg(opts.exclude_ext),
        no_content=opts.no_content,
    )
# Run the CLI only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment