Created
October 9, 2025 13:04
-
-
Save jfrobbins/924627d4ae9c41d9479c3186ad53dda6 to your computer and use it in GitHub Desktop.
A tool to recursively scan a directory and collapse its contents into a single file. It provides intelligent API and structure parsing for Python files and works generically with any other readable text file, such as C# (.cs), Java, JSON, Markdown, etc. Ideal for creating a comprehensive project overview for AI analysis.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| # -*- coding: utf-8 -*- | |
| # --- MIT License --- | |
| # | |
| # Copyright (c) 2025 [Jon Robbins] | |
| # | |
| # Permission is hereby granted, free of charge, to any person obtaining a copy | |
| # of this software and associated documentation files (the "Software"), to deal | |
| # in the Software without restriction, including without limitation the rights | |
| # to use, copy, modify, merge, publish, distribute, sublicense, and/or sell | |
| # copies of the Software, and to permit persons to whom the Software is | |
| # furnished to do so, subject to the following conditions: | |
| # | |
| # The above copyright notice and this permission notice shall be included in all | |
| # copies or substantial portions of the Software. | |
| # | |
| # THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR | |
| # IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, | |
| # FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE | |
| # AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER | |
| # LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, | |
| # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE | |
| # SOFTWARE. | |
| # -------------------- | |
| """ | |
| pyCollapse: A tool to recursively scan a directory and collapse its contents | |
| into a single file. It provides intelligent API and structure parsing for | |
| Python files and works generically with any other readable text file, | |
| such as C# (.cs), Java, JSON, Markdown, etc. Ideal for creating a | |
| comprehensive project overview for AI analysis. 🤖 | |
| --- | |
| ## USAGE | |
| --- | |
| The script is run from the command line and accepts several arguments. | |
| **Syntax:** | |
| `python pyCollapse.py <DIRECTORY> --output <OUTPUT_FILE> [OPTIONS]` | |
| **Required Arguments:** | |
| `DIRECTORY` The path to the root directory you want to scan. | |
| `--output, -o` The name of the final output file. If using compression, | |
| this should be the name of the archive (e.g., 'project.zip'). | |
| **Optional Arguments:** | |
| `--format` The output format. Currently only 'markdown' is supported. | |
| (Default: markdown) | |
| `--compression` The compression type. Currently only 'zip' is supported. | |
| If omitted, the output will be an uncompressed text file. | |
| **Examples:** | |
| *# 1. Collapse './my_project' into a single markdown file* | |
| `python pyCollapse.py ./my_project --output summary.md` | |
| *# 2. Collapse a C# project into a zipped markdown file* | |
| `python pyCollapse.py ./src/MyWebApp -o WebApp_source.zip --compression zip` | |
| --- | |
| ## HOW IT WORKS | |
| --- | |
| 1. **Recursive Scan:** The script walks through every file and subdirectory | |
| starting from the specified root directory. | |
| 2. **File Filtering:** It uses a list of binary file extensions (`BINARY_EXTENSIONS`) | |
| to skip non-text files like images, archives, and compiled code. | |
| 3. **Content Processing:** | |
| - **Python Files (.py):** These are treated specially. The script uses Python's | |
| `ast` module to parse the code's structure, generating a summary of | |
| classes, methods, and functions. This summary is placed above the full | |
| source code in the output. | |
| - **Other Text Files:** Any other file not identified as binary (e.g., .cs, | |
| .java, .js, .json, .txt, .md) is handled by a generic processor. It reads | |
| the file's content and wraps it in a markdown code block, using the file's | |
| extension for syntax highlighting. | |
| 4. **Aggregation & Output:** All the processed content is concatenated into a | |
| single string and then written to the specified output file, with optional | |
| zip compression. | |
| --- | |
| ## EXTENDING THE SCRIPT | |
| --- | |
| You can easily add custom processors for other file types to extract more | |
| detailed information, similar to the Python parser. | |
| **Goal:** Add a special parser for `.my_format` files. | |
| **Step 1: Create a New Handler Function** | |
| Create a new Python function that accepts a `pathlib.Path` object and returns | |
| a formatted string. Add this function in the "File Handlers" section of the script. | |
| def process_my_format_file(file_path: Path) -> str: | |
| ''' | |
| Parses a .my_format file and extracts its key features. | |
| ''' | |
| try: | |
| content = file_path.read_text(encoding='utf-8') | |
| # --- Your custom parsing logic goes here --- | |
| # For example, let's find all lines starting with 'FEATURE:' | |
| features = [ | |
| line.split(':', 1)[1].strip() | |
| for line in content.splitlines() | |
| if line.startswith('FEATURE:') | |
| ] | |
| summary = "#### Key Features\\n" + "\\n".join(f"- {f}" for f in features) | |
| # --- End of custom logic --- | |
| # Return the content formatted in markdown | |
| return ( | |
| f"---\\n" | |
| f"### File: `{file_path}` (My Format)\\n\\n" | |
| f"{summary}\\n\\n" | |
| f"#### Raw Content\\n" | |
| f"```my_format\\n{content}\\n```\\n\\n" | |
| ) | |
| except Exception as e: | |
| print(f"Warning: Could not process .my_format file '{file_path}': {e}", file=sys.stderr) | |
| return "" # Return empty string on failure | |
| **Step 2: Integrate the Handler in `main()`** | |
| Find the `if/elif/else` block inside the `main()` function's loop and add a | |
| condition to call your new function for the correct file extension. | |
| # In the main() function's loop: | |
| # ... | |
| if file_path.suffix == '.py': | |
| processed_content = process_python_file(file_path) | |
| # ADD YOUR NEW HANDLER HERE | |
| elif file_path.suffix == '.my_format': | |
| processed_content = process_my_format_file(file_path) | |
| else: | |
| # All other readable files are handled generically | |
| processed_content = process_generic_file(file_path) | |
| # ... | |
| """ | |
| import os | |
| import argparse | |
| import ast | |
| import zipfile | |
| import sys | |
| from pathlib import Path | |
| # --- Constants --- | |
# Heuristic set of file extensions to ignore.
# The goal is to process any text-based source code or data file, so only
# formats that are binary (or, for '.svg' and '.lock', deliberately excluded
# by the original author) are listed. Entries are lower-case and include the
# leading dot, because callers compare against `Path.suffix.lower()`.
BINARY_EXTENSIONS = {
    # Compiled code & packages
    '.pyc', '.pyo', '.pyd', '.o', '.a', '.so', '.dll', '.dylib', '.exe',
    '.class', '.nupkg', '.jar', '.whl',
    # Archives
    '.zip', '.tar', '.gz', '.tgz', '.bz2', '.xz', '.7z', '.rar', '.egg',
    # Images
    '.png', '.jpg', '.jpeg', '.gif', '.bmp', '.tiff', '.ico', '.svg', '.webp',
    # Audio/Video
    '.mp3', '.wav', '.flac', '.ogg', '.mp4', '.avi', '.mov', '.mkv', '.webm',
    # Documents & Fonts
    '.pdf', '.doc', '.docx', '.xls', '.xlsx', '.ppt', '.pptx', '.odt',
    '.ttf', '.otf', '.eot', '.woff', '.woff2',
    # Other
    '.db', '.sqlite', '.sqlite3', '.dat', '.bin', '.iso', '.img', '.lock',
}
| # --- API Parsing for Python Files --- | |
class PythonApiVisitor(ast.NodeVisitor):
    """
    An AST (Abstract Syntax Tree) visitor that walks through Python source code
    to extract information about classes, methods, and functions.

    After ``visit(tree)`` returns, ``api_details`` holds one dict per class and
    per non-method function, in the order the visitor encountered them:

    * classes:   ``{"type": "Class", "name", "docstring", "methods": [...]}``
    * functions: ``{"type": "Function", "name", "args", "docstring"}``

    Methods use the function layout with ``"type": "Method"`` and are stored in
    the ``"methods"`` list of the class that directly contains them. Function
    bodies are not descended into, so definitions nested inside a function are
    not reported.
    """

    def __init__(self):
        self.api_details = []
        # Entry of the class whose body is currently being walked, or None
        # when the visitor is outside any class.
        self._current_class = None

    @staticmethod
    def _format_args(arguments: ast.arguments) -> str:
        """Render every parameter name of a signature as a comma-separated string.

        Positional-only, ``*args``, keyword-only and ``**kwargs`` parameters are
        included, with the ``/`` and bare ``*`` markers where the signature
        needs them. Defaults and annotations are omitted.
        """
        # `posonlyargs` only exists on Python 3.8+; tolerate older ASTs.
        parts = [arg.arg for arg in getattr(arguments, "posonlyargs", [])]
        if parts:
            parts.append("/")
        parts.extend(arg.arg for arg in arguments.args)
        if arguments.vararg:
            parts.append(f"*{arguments.vararg.arg}")
        elif arguments.kwonlyargs:
            # Keyword-only parameters without *args are introduced by a bare '*'.
            parts.append("*")
        parts.extend(arg.arg for arg in arguments.kwonlyargs)
        if arguments.kwarg:
            parts.append(f"**{arguments.kwarg.arg}")
        return ", ".join(parts)

    def visit_ClassDef(self, node: ast.ClassDef):
        """Extracts information about a class definition."""
        class_info = {
            "type": "Class",
            "name": node.name,
            "docstring": ast.get_docstring(node) or "No docstring.",
            "methods": []
        }
        self.api_details.append(class_info)
        # Remember the enclosing class (if any) and restore it afterwards, so
        # that methods defined after a nested class are still attributed to
        # the outer class instead of being recorded as top-level functions.
        enclosing_class = self._current_class
        self._current_class = class_info
        try:
            self.generic_visit(node)
        finally:
            self._current_class = enclosing_class

    def visit_FunctionDef(self, node: ast.FunctionDef):
        """Extracts information about a function or method definition."""
        func_info = {
            "type": "Function",
            "name": node.name,
            "args": self._format_args(node.args),
            "docstring": ast.get_docstring(node) or "No docstring."
        }
        if self._current_class:
            func_info["type"] = "Method"
            self._current_class["methods"].append(func_info)
        else:
            self.api_details.append(func_info)
        # No generic_visit(): the function body is intentionally not walked.

    def visit_AsyncFunctionDef(self, node: ast.AsyncFunctionDef):
        """Handles async functions similarly to regular functions."""
        self.visit_FunctionDef(node)
| def _format_api_summary_markdown(api_details: list) -> str: | |
| """ | |
| Takes structured API data and formats it into human-readable markdown. | |
| """ | |
| summary = ["#### API & Code Structure Summary"] | |
| if not api_details: | |
| summary.append("\n*No classes or functions found.*") | |
| return "".join(summary) | |
| for item in api_details: | |
| if item['type'] == 'Class': | |
| summary.append(f"\n- **Class `{item['name']}`**") | |
| summary.append(f" - *{item['docstring'].strip()}*") | |
| for method in item['methods']: | |
| summary.append(f" - **Method `def {method['name']}({method['args']})`**") | |
| summary.append(f" - *{method['docstring'].strip()}*") | |
| elif item['type'] == 'Function': | |
| summary.append(f"\n- **Function `def {item['name']}({item['args']})`**") | |
| summary.append(f" - *{item['docstring'].strip()}*") | |
| return "\n".join(summary) | |
| # --- File Handlers --- | |
def process_python_file(file_path: Path) -> str:
    """
    Reads a Python file, parses it to extract an API summary, and formats
    the output with the summary followed by the full source code.

    Args:
        file_path: Path of the ``.py`` file to read (decoded as UTF-8).

    Returns:
        A markdown section containing the file path, the API summary and the
        complete source in a fenced ``python`` code block. If the file cannot
        be read, parsed or summarised, a warning is printed to stderr and the
        result of ``process_generic_file(file_path)`` is returned instead.
    """
    try:
        content = file_path.read_text(encoding='utf-8')
        # Passing the filename makes syntax-error messages point at the file.
        tree = ast.parse(content, filename=str(file_path))
        visitor = PythonApiVisitor()
        visitor.visit(tree)
        api_summary = _format_api_summary_markdown(visitor.api_details)
    except Exception as e:
        # Deliberate best-effort: any failure degrades to the generic handler.
        print(f"Warning: Could not parse Python file '{file_path}': {e}. Treating as generic text.", file=sys.stderr)
        return process_generic_file(file_path)

    # Source that itself contains ``` (e.g. markdown in a docstring) would
    # close a three-backtick fence early. Grow the fence until it does not
    # occur anywhere in the content.
    fence = "```"
    while fence in content:
        fence += "`"
    return (
        f"---\n"
        f"### File: `{file_path}`\n\n"
        f"{api_summary}\n\n"
        f"#### Full Source Code\n"
        f"{fence}python\n{content}\n{fence}\n\n"
    )
def process_generic_file(file_path: Path) -> str:
    """
    Reads any text-based file (e.g., C#, Java, JSON, TXT, MD) and formats
    its content into a markdown code block, labeling it with its original path.

    Args:
        file_path: Path of the file to read (decoded as UTF-8).

    Returns:
        A markdown section with the file path and the content in a fenced code
        block whose language tag is the file extension without the dot
        (``text`` when there is no extension). Returns ``""`` after printing a
        warning to stderr if the file is not valid UTF-8 or cannot be read.
    """
    try:
        content = file_path.read_text(encoding='utf-8')
    except UnicodeDecodeError:
        print(f"Warning: Skipping file with unsupported encoding: {file_path}", file=sys.stderr)
        return ""
    except Exception as e:
        # Deliberate best-effort: one unreadable file must not abort the run.
        print(f"Warning: Could not read file '{file_path}': {e}", file=sys.stderr)
        return ""

    lang = file_path.suffix.lstrip('.') if file_path.suffix else 'text'
    # Content that itself contains ``` (very common in markdown files) would
    # close a three-backtick fence early. Grow the fence until it does not
    # occur anywhere in the content.
    fence = "```"
    while fence in content:
        fence += "`"
    return (
        f"---\n"
        f"### File: `{file_path}`\n\n"
        f"{fence}{lang}\n{content}\n{fence}\n\n"
    )
| # --- Main Execution --- | |
def main(argv=None):
    """
    Main function to parse arguments and orchestrate the file processing.

    Walks the input directory, skips files whose extension is listed in
    ``BINARY_EXTENSIONS``, runs every remaining file through the matching
    handler and writes the concatenated result to the output path, either as
    plain text or as a single entry inside a zip archive.

    Args:
        argv: Optional list of command-line arguments (without the program
            name). ``None`` (the default) makes argparse read ``sys.argv``.

    Exits with status 1 if the input path is not a directory or the output
    cannot be written.
    """
    parser = argparse.ArgumentParser(
        description="Collapse a source code directory into a single formatted text file for analysis.",
        formatter_class=argparse.RawTextHelpFormatter,
        epilog="For more details on usage and extending the script, read the main docstring in the script file."
    )
    parser.add_argument(
        "directory",
        type=str,
        help="The root directory to scan recursively."
    )
    parser.add_argument(
        "--output",
        "-o",
        type=str,
        required=True,
        help="Path for the final output file (e.g., 'project_summary.md' or 'project.zip')."
    )
    parser.add_argument(
        "--format",
        type=str,
        default="markdown",
        choices=["markdown"],
        help="The output format for the collapsed text (default: markdown)."
    )
    parser.add_argument(
        "--compression",
        type=str,
        default=None,
        choices=["zip"],
        help="Compression type for the output file (e.g., 'zip')."
    )
    args = parser.parse_args(argv)

    input_path = Path(args.directory)
    if not input_path.is_dir():
        print(f"Error: Input path '{input_path}' is not a valid directory.", file=sys.stderr)
        sys.exit(1)

    output_path = Path(args.output)
    # Resolved once so that an output file left inside the scanned tree by a
    # previous run can be recognised and excluded from the new collapse.
    resolved_output = output_path.resolve()

    print(f"🚀 Starting collapse of '{input_path}'...")
    all_content = []
    for root, dirs, files in os.walk(input_path):
        # Sorting in place fixes the order in which os.walk descends, making
        # the output deterministic across runs and platforms.
        dirs.sort()
        current_dir = Path(root)
        for filename in sorted(files):
            file_path = current_dir / filename
            suffix = file_path.suffix.lower()
            if suffix in BINARY_EXTENSIONS:
                continue
            # Cheap name check first; only resolve when the names match.
            if filename == output_path.name and file_path.resolve() == resolved_output:
                continue
            print(f" 📄 Processing: {file_path}")
            # Case-insensitive, consistent with the binary-extension check.
            if suffix == '.py':
                processed_content = process_python_file(file_path)
            else:
                processed_content = process_generic_file(file_path)
            if processed_content:
                all_content.append(processed_content)

    final_output_str = "".join(all_content)

    if args.compression == 'zip':
        # The archive holds one entry named after the archive, with the
        # format name as its extension (e.g. project.zip -> project.markdown).
        internal_filename = output_path.with_suffix(f".{args.format}").name
        print(f"\n📦 Compressing output to '{output_path}' with internal file '{internal_filename}'...")
        try:
            with zipfile.ZipFile(output_path, 'w', zipfile.ZIP_DEFLATED) as zf:
                zf.writestr(internal_filename, final_output_str)
        except Exception as e:
            print(f"Error: Failed to write to zip file '{output_path}': {e}", file=sys.stderr)
            sys.exit(1)
    else:
        print(f"\n💾 Writing output to '{output_path}'...")
        try:
            output_path.write_text(final_output_str, encoding='utf-8')
        except Exception as e:
            print(f"Error: Failed to write to file '{output_path}': {e}", file=sys.stderr)
            sys.exit(1)
    print("✅ Done!")


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment