Skip to content

Instantly share code, notes, and snippets.

@minhvt22
Created February 16, 2026 12:02
Show Gist options
  • Select an option

  • Save minhvt22/9e91b0f201e51f32e926bb6cd2c11e48 to your computer and use it in GitHub Desktop.

Select an option

Save minhvt22/9e91b0f201e51f32e926bb6cd2c11e48 to your computer and use it in GitHub Desktop.
generate_llms_txt.py — Generate an llms.txt file for a codebase.
#!/usr/bin/env python3
"""
generate_llms_txt.py — Generate an llms.txt file for a codebase.
Usage:
python generate_llms_txt.py [ROOT_DIR] [OPTIONS]
Options:
--output FILE Output file (default: llms.txt)
--max-file-kb N Skip files larger than N KB (default: 500)
--include-ext Comma-separated extra extensions to include
--exclude-ext Comma-separated extensions to exclude
--no-content Only list file tree, don't embed file contents
Examples:
python generate_llms_txt.py .
python generate_llms_txt.py ~/myproject --output context.txt --max-file-kb 100
python generate_llms_txt.py . --include-ext .env,.cfg --exclude-ext .min.js
"""
import argparse
import os
import subprocess
import sys
from pathlib import Path
# ---------------------------------------------------------------------------
# Defaults
# ---------------------------------------------------------------------------
# Output file name used when --output is not given on the command line.
DEFAULT_OUTPUT = "llms.txt"
# Per-file size limit, in KB, used when --max-file-kb is not given.
# main() multiplies the chosen value by 1024 before handing it to generate().
DEFAULT_MAX_FILE_KB = 500
# Extensions treated as text / source code by default.
# collect_files() starts from this set, adds the --include-ext values and
# removes the --exclude-ext values. Entries are looked up using the
# lower-cased file suffix, or the lower-cased file name when the file has no
# suffix (which is how dot-only names such as ".gitignore" can match).
TEXT_EXTENSIONS = {
    # Web
    ".html",
    ".htm",
    ".css",
    ".scss",
    ".sass",
    ".less",
    ".js",
    ".jsx",
    ".ts",
    ".tsx",
    ".vue",
    ".svelte",
    # Backend
    ".py",
    ".rb",
    ".php",
    ".java",
    ".kt",
    ".scala",
    ".go",
    ".rs",
    ".c",
    ".cpp",
    ".cc",
    ".h",
    ".hpp",
    ".cs",
    ".swift",
    ".m",
    ".ex",
    ".exs",
    ".erl",
    ".hs",
    ".lua",
    ".r",
    ".R",
    ".jl",
    # Shell / scripting
    ".sh",
    ".bash",
    ".zsh",
    ".fish",
    ".ps1",
    ".bat",
    ".cmd",
    # Config / data
    ".json",
    ".yaml",
    ".yml",
    ".toml",
    ".ini",
    ".cfg",
    ".conf",
    ".xml",
    ".env",
    ".env.example",
    ".properties",
    # Docs / markup
    ".md",
    ".mdx",
    ".rst",
    ".txt",
    ".adoc",
    # Database
    ".sql",
    ".graphql",
    ".gql",
    # Build / infra
    ".dockerfile",
    ".tf",
    ".hcl",
    ".nix",
    # Misc (mostly whole dot-file names rather than true suffixes)
    ".lock",
    ".gitignore",
    ".gitattributes",
    ".editorconfig",
    ".eslintrc",
    ".prettierrc",
    ".babelrc",
}
# Always skip these regardless of extension.
# collect_files() prunes any directory whose name appears here while walking
# the tree, so nothing below such a directory is ever visited. The comparison
# is an exact, case-sensitive match on the bare name (not the path).
ALWAYS_SKIP_NAMES = {
    # Dependency and VCS directories
    "node_modules",
    ".git",
    ".hg",
    ".svn",
    # Python caches and virtual environments
    "__pycache__",
    ".mypy_cache",
    ".ruff_cache",
    ".pytest_cache",
    ".tox",
    "venv",
    ".venv",
    "env",
    # Build output
    "dist",
    "build",
    "out",
    ".next",
    ".nuxt",
    # Coverage artefacts
    "coverage",
    ".coverage",
    ".nyc_output",
    # OS metadata
    ".DS_Store",
    "Thumbs.db",
}
# ---------------------------------------------------------------------------
# .gitignore helpers
# ---------------------------------------------------------------------------
def load_gitignore_patterns(root: Path) -> list[str]:
    """Return a flat list of patterns from all .gitignore files in the tree.

    Every ``.gitignore`` found under *root* (recursively) is read and its
    non-blank, non-comment lines are rewritten relative to *root*:

    * a pattern from ``sub/.gitignore`` is prefixed with ``sub/`` so that it
      stays anchored to the directory that declared it; this now also holds
      for rooted patterns (``/build`` in ``sub/.gitignore`` -> ``sub/build``),
    * a leading ``!`` (negation) is kept in front of the rewritten pattern,
    * patterns that start with ``*`` are returned unchanged (unanchored).

    Prefixes always use ``/`` as separator, independent of the host OS.
    A ``.gitignore`` that cannot be read is skipped rather than aborting the
    whole scan.

    Args:
        root: Directory to search.

    Returns:
        The rewritten patterns, in the order the files were discovered.
    """
    patterns: list[str] = []
    for gitignore in root.rglob(".gitignore"):
        try:
            rel_dir = gitignore.parent.relative_to(root)
            text = gitignore.read_text(encoding="utf-8", errors="replace")
        except (ValueError, OSError):
            # OSError also covers a *directory* that happens to be named
            # ".gitignore", which rglob() would report as well.
            continue
        prefix = "" if str(rel_dir) == "." else rel_dir.as_posix() + "/"
        for raw_line in text.splitlines():
            line = raw_line.strip()
            if not line or line.startswith("#"):
                continue
            negated = line.startswith("!")
            body = line[1:] if negated else line
            if not body:
                continue
            if body.startswith("*"):
                # Wildcard-leading patterns are left unanchored.
                pattern = body
            else:
                # Anchor to the declaring directory; a leading "/" means
                # "relative to this .gitignore", not "relative to root".
                pattern = prefix + body.lstrip("/")
            patterns.append(("!" if negated else "") + pattern)
    return patterns
def is_ignored_by_git(path: Path, root: Path) -> bool:
    """Ask git whether *path* is ignored, honouring nested .gitignore files.

    Runs ``git check-ignore -q -- <relative path>`` with *root* as working
    directory. One git process is spawned per call.

    Args:
        path: File or directory to test; expected to live under *root*.
        root: Directory the git command is run from.

    Returns:
        True only when git exits with status 0 (path is ignored). Any other
        outcome yields False: a non-zero exit status, a path that is not
        located under *root*, git not being installed, or the process failing
        to start.
    """
    try:
        rel = path.relative_to(root)
    except ValueError:
        # A path outside root cannot be ignored by root's rules.
        return False
    try:
        result = subprocess.run(
            # "--" stops option parsing so names starting with "-" are safe.
            ["git", "check-ignore", "-q", "--", str(rel)],
            cwd=root,
            capture_output=True,
        )
    except (OSError, subprocess.SubprocessError):
        # OSError covers a missing git binary as well as an unusable cwd.
        return False
    return result.returncode == 0
def git_available(root: Path) -> bool:
    """Return True if git is available and *root* is inside a git repo.

    Runs ``git rev-parse --git-dir`` with *root* as working directory and
    reports whether it exited with status 0.

    Args:
        root: Directory to probe.

    Returns:
        False when git exits non-zero, when the git executable cannot be
        found, or when the process cannot be started at all (for example
        because *root* does not exist or is not a directory).
    """
    try:
        result = subprocess.run(
            ["git", "rev-parse", "--git-dir"],
            cwd=root,
            capture_output=True,
        )
    except (OSError, subprocess.SubprocessError):
        # Same failure handling as is_ignored_by_git(): never let the probe
        # itself abort the run.
        return False
    return result.returncode == 0
# ---------------------------------------------------------------------------
# File collection
# ---------------------------------------------------------------------------
def collect_files(
    root: Path,
    max_bytes: int,
    include_ext: set[str],
    exclude_ext: set[str],
    use_git: bool,
) -> list[Path]:
    """Walk *root* and return the sorted list of files to include.

    A file is kept when all of the following hold:

    * none of its parent directories is named in ``ALWAYS_SKIP_NAMES`` or
      (with *use_git*) ignored by git, and its own name is not in
      ``ALWAYS_SKIP_NAMES``;
    * its lower-cased name does not end with any entry of *exclude_ext*; this
      is what makes compound exclusions such as ``.min.js`` work, and an
      explicit exclusion wins over every inclusion rule below;
    * it is not a hidden dot-file, unless its suffix is an accepted extension
      or its name is on the small allow-list of common config files;
    * its suffix (or, failing that, its whole name) is an accepted extension,
      or it is one of the well-known extension-less project files;
    * its size can be read and does not exceed *max_bytes*;
    * with *use_git*, git does not report it as ignored.

    All extension comparisons are case-insensitive.

    Args:
        root: Directory to scan.
        max_bytes: Largest file size, in bytes, that is still included.
        include_ext: Extra extensions (with leading dot) to accept.
        exclude_ext: Extensions / name endings (with leading dot) to reject.
        use_git: Whether to consult ``git check-ignore`` for each candidate.

    Returns:
        Absolute-or-relative paths exactly as produced from *root*, sorted by
        their path relative to *root*.
    """
    # Normalise case once so every later comparison is case-insensitive.
    excluded = tuple(e.lower() for e in exclude_ext)
    valid_ext = {e.lower() for e in TEXT_EXTENSIONS | include_ext} - set(excluded)
    # Dot-files that are accepted even though they have no recognised suffix.
    hidden_allowed = {
        ".gitignore",
        ".env.example",
        ".editorconfig",
        ".eslintrc",
        ".prettierrc",
        ".babelrc",
    }
    # Well-known project files matched by their full lower-cased name.
    special_names = {
        "makefile",
        "dockerfile",
        "procfile",
        "rakefile",
        "gemfile",
        "cmakelists.txt",
        "cargo.toml",
        "go.mod",
        "go.sum",
        "pyproject.toml",
        "setup.cfg",
        "setup.py",
    }
    collected = []
    for dirpath, dirnames, filenames in os.walk(root, topdown=True):
        current = Path(dirpath)
        # Prune directories in-place (modifies os.walk iteration)
        dirnames[:] = [
            d
            for d in dirnames
            if d not in ALWAYS_SKIP_NAMES
            and not (use_git and is_ignored_by_git(current / d, root))
        ]
        for fname in filenames:
            if fname in ALWAYS_SKIP_NAMES:
                continue
            fpath = current / fname
            lower_name = fname.lower()
            suffix = fpath.suffix.lower()
            # Explicit exclusions, matched on the end of the name so that
            # multi-part endings like ".min.js" are honoured.
            if excluded and lower_name.endswith(excluded):
                continue
            # Skip dot-files (hidden) unless they're common config files
            if (
                fname.startswith(".")
                and suffix not in valid_ext
                and fname not in hidden_allowed
            ):
                continue
            # Extension check; the whole-name lookup covers names such as
            # ".gitignore" (no suffix) and ".env.example" (suffix ".example").
            if (
                suffix not in valid_ext
                and lower_name not in valid_ext
                and lower_name not in special_names
            ):
                continue
            # Size check
            try:
                size = fpath.stat().st_size
            except OSError:
                continue
            if size > max_bytes:
                continue
            # gitignore check last: it spawns a process, so run it only for
            # files that survived the cheap filters.
            if use_git and is_ignored_by_git(fpath, root):
                continue
            collected.append(fpath)
    return sorted(collected, key=lambda p: p.relative_to(root))
# ---------------------------------------------------------------------------
# Output generation
# ---------------------------------------------------------------------------
def file_tree(files: list[Path], root: Path) -> str:
    """Render *files* as an indented text tree relative to *root*.

    Each directory is printed once, the first time a file beneath it is
    encountered, followed by the file itself. Nesting depth is expressed by
    one leading space per level. The input order is kept as given.

    Args:
        files: Paths located under *root*.
        root: Directory the paths are displayed relative to.

    Returns:
        The tree as a single newline-joined string (empty for no files).
    """
    rows: list[str] = []
    emitted_dirs: set[Path] = set()
    for file_path in files:
        parts = file_path.relative_to(root).parts
        dir_names, leaf = parts[:-1], parts[-1]
        ancestor = root
        for level, name in enumerate(dir_names):
            ancestor = ancestor / name
            if ancestor in emitted_dirs:
                continue
            emitted_dirs.add(ancestor)
            rows.append(" " * level + f"📁 {name}/")
        rows.append(" " * len(dir_names) + f"📄 {leaf}")
    return "\n".join(rows)
def generate(
    root: Path,
    output: Path,
    max_bytes: int,
    include_ext: set[str],
    exclude_ext: set[str],
    no_content: bool,
) -> None:
    """Scan *root* and write the llms.txt report to *output*.

    The report consists of a ``#`` comment header, a fenced "File Tree"
    section and, unless *no_content* is set, a "File Contents" section with
    one fenced block per file. Progress and a final summary are printed to
    stdout.

    Args:
        root: Directory to scan.
        output: File to (over)write with the report.
        max_bytes: Largest file size, in bytes, that is still included.
        include_ext: Extra extensions to accept.
        exclude_ext: Extensions to reject.
        no_content: When true, only the header and file tree are written.

    Raises:
        OSError: If *output* cannot be opened or written.
    """
    use_git = git_available(root)
    if use_git:
        print("✓ Git detected — respecting .gitignore via `git check-ignore`")
    else:
        print("⚠ No git repo found — .gitignore patterns not applied")
    print(f" Scanning {root} …", end="", flush=True)
    files = collect_files(root, max_bytes, include_ext, exclude_ext, use_git)
    # A report left under root by a previous run would otherwise be collected
    # and then read back while it is being rewritten.
    output_resolved = output.resolve()
    files = [f for f in files if f.resolve() != output_resolved]
    print(f" {len(files)} files found")
    skipped = 0
    with output.open("w", encoding="utf-8") as out:
        # ── Header ──────────────────────────────────────────────────────────
        out.write(f"# llms.txt — {root.resolve().name}\n")
        out.write(f"# Generated from: {root.resolve()}\n")
        out.write(f"# Files included: {len(files)}\n")
        out.write(f"# .gitignore respected: {use_git}\n\n")
        # ── File tree ───────────────────────────────────────────────────────
        out.write("## File Tree\n\n")
        out.write("```\n")
        out.write(file_tree(files, root))
        out.write("\n```\n\n")
        # ── File contents ───────────────────────────────────────────────────
        if not no_content:
            out.write("## File Contents\n\n")
            for fpath in files:
                rel = fpath.relative_to(root)
                try:
                    content = fpath.read_text(encoding="utf-8", errors="replace")
                except OSError as exc:
                    out.write(f"### {rel}\n\n*Could not read file: {exc}*\n\n")
                    skipped += 1
                    continue
                ext = fpath.suffix.lstrip(".") or "text"
                # Grow the fence until it cannot collide with backtick runs
                # inside the file (Markdown sources routinely contain ```).
                fence = "```"
                while fence in content:
                    fence += "`"
                out.write(f"### {rel}\n\n")
                out.write(f"{fence}{ext}\n")
                out.write(content)
                if not content.endswith("\n"):
                    out.write("\n")
                out.write(f"{fence}\n\n")
    # The summary is printed for --no-content runs as well.
    if skipped:
        print(f"⚠ {skipped} file(s) could not be read (see output for details)")
    size_kb = output.stat().st_size / 1024
    print(f"✓ Written → {output} ({size_kb:,.1f} KB)")
# ---------------------------------------------------------------------------
# CLI
# ---------------------------------------------------------------------------
def parse_ext_arg(raw: str) -> set[str]:
    """Parse a comma-separated list of extensions into normalised form.

    Each entry is stripped of surrounding whitespace, lower-cased and given a
    leading dot if it lacks one. Lower-casing matters because file suffixes
    are lower-cased before being compared against these values, so an entry
    such as ``.CSV`` would otherwise never match. Empty entries are dropped.

    Args:
        raw: Text such as ``".csv, log"``; may be empty.

    Returns:
        The set of normalised extensions, e.g. ``{".csv", ".log"}``.
    """
    result: set[str] = set()
    for part in raw.split(","):
        ext = part.strip().lower()
        if ext:
            result.add(ext if ext.startswith(".") else f".{ext}")
    return result
def main() -> None:
    """Command-line entry point: parse arguments and run generate().

    Exits with an error message when the root argument is not a directory,
    and with an argparse usage error when --max-file-kb is negative (a
    negative limit would silently filter out every file). A relative
    --output path is resolved against the current working directory.
    """
    parser = argparse.ArgumentParser(
        description="Generate llms.txt for a codebase, respecting .gitignore.",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=__doc__,
    )
    parser.add_argument(
        "root", nargs="?", default=".", help="Root directory (default: .)"
    )
    parser.add_argument("--output", default=DEFAULT_OUTPUT, help="Output file path")
    parser.add_argument(
        "--max-file-kb",
        type=int,
        default=DEFAULT_MAX_FILE_KB,
        help=f"Skip files larger than N KB (default: {DEFAULT_MAX_FILE_KB})",
    )
    parser.add_argument(
        "--include-ext", default="", help="Extra extensions to include, e.g. '.csv,.log'"
    )
    parser.add_argument(
        "--exclude-ext", default="", help="Extensions to exclude, e.g. '.min.js,.map'"
    )
    parser.add_argument(
        "--no-content",
        action="store_true",
        help="Only write the file tree, skip file contents",
    )
    args = parser.parse_args()
    if args.max_file_kb < 0:
        parser.error("--max-file-kb must be non-negative")
    root = Path(args.root).resolve()
    if not root.is_dir():
        sys.exit(f"Error: '{root}' is not a directory.")
    output = Path(args.output)
    if not output.is_absolute():
        output = Path.cwd() / output
    generate(
        root=root,
        output=output,
        # The CLI takes kilobytes; generate() works in bytes.
        max_bytes=args.max_file_kb * 1024,
        include_ext=parse_ext_arg(args.include_ext),
        exclude_ext=parse_ext_arg(args.exclude_ext),
        no_content=args.no_content,
    )


if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment