Created
July 20, 2025 19:11
-
-
Save farzadhallaji/b2fccf77620e901a8f6bcfd3fe8cc81c to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env python3
import argparse
import io
import os
import re
import sys
from concurrent.futures import ThreadPoolExecutor
def search_file(path: str, regex: re.Pattern, encoding: str, bufsize: int) -> None:
    """Print every line of `path` that matches `regex`.

    The file is opened in text mode and iterated line by line rather than
    read in one go. Each hit is written to stdout as::

        path:lineno:line

    where `lineno` is 1-based and `line` has its trailing whitespace
    (including the newline) stripped.

    Args:
        path: File to scan.
        regex: Compiled pattern; a line is a hit when `regex.search`
            finds a match anywhere in it.
        encoding: Text encoding used to decode the file. Bytes that cannot
            be decoded are dropped (`errors='ignore'`).
        bufsize: Passed to `open()` as its `buffering` argument.

    Any exception raised while opening or reading the file is reported on
    stderr as ``Skipping <path>: <error>`` and swallowed, so a single bad
    file does not abort the rest of the scan.
    """
    try:
        with open(path, 'r', encoding=encoding, errors='ignore',
                  buffering=bufsize) as f:
            for lineno, line in enumerate(f, start=1):
                if regex.search(line):
                    # Emit the record and its newline in a single write():
                    # print() writes them separately, which lets output
                    # from parallel workers interleave mid-record.
                    sys.stdout.write(f"{path}:{lineno}:{line.rstrip()}\n")
    except Exception as e:
        # Deliberately broad: this is the per-file best-effort boundary.
        sys.stderr.write(f"Skipping {path!r}: {e}\n")
def gather_csvs(root_dir: str):
    """Yield the path of every .csv file found anywhere under `root_dir`.

    The extension check is case-insensitive (``.csv``, ``.CSV``, ...).
    Paths are produced lazily, built by joining the directory reported by
    `os.walk` with the file name, so they are relative or absolute exactly
    as `root_dir` is.

    Traversal order is deterministic: within each directory, files are
    yielded in sorted name order, and subdirectories are visited in sorted
    name order. If `root_dir` contains no matching files, nothing is
    yielded.
    """
    for dirpath, dirnames, filenames in os.walk(root_dir):
        # Sorting in place controls the order in which os.walk descends;
        # without it the order is whatever the filesystem returns.
        dirnames.sort()
        for filename in sorted(filenames):
            if filename.lower().endswith('.csv'):
                yield os.path.join(dirpath, filename)
def main():
    """Command-line entry point: grep every CSV under a directory.

    Parses the command line, compiles the search pattern once, collects the
    CSV paths under ``--path`` with `gather_csvs`, and runs `search_file`
    on each of them, either sequentially or on a thread pool when
    ``--workers`` is greater than 1.

    Matches go to stdout; diagnostics go to stderr. The process exits with
    status 1 when the pattern is not a valid regex or when no CSV files
    are found.
    """
    parser = argparse.ArgumentParser(
        description="Recursively grep CSVs for a regex (streaming, low RAM)."
    )
    parser.add_argument('-p', '--pattern', required=True,
                        help="Regex (in quotes) to search for.")
    parser.add_argument('-F', '--fixed-strings', action='store_true',
                        help="Treat the pattern as a literal string "
                             "instead of a regex.")
    parser.add_argument('-d', '--path', default='.',
                        help="Directory to scan for .csv files.")
    parser.add_argument('-w', '--workers', type=int, default=1,
                        help="Number of files to scan in parallel (default 1).")
    parser.add_argument('--buffer', type=int,
                        default=io.DEFAULT_BUFFER_SIZE * 8,
                        help="I/O buffer size in bytes (default 8× OS default).")
    parser.add_argument('--encoding', default='utf8',
                        help="File encoding (default utf8).")
    args = parser.parse_args()

    # Diagnostic only: keep it off stdout so the match output stays clean.
    print('io.DEFAULT_BUFFER_SIZE', io.DEFAULT_BUFFER_SIZE,
          'buffer:', args.buffer, file=sys.stderr)

    # Compile once up front. The pattern is a real regex unless -F is
    # given; escaping it unconditionally would make every metacharacter
    # literal and the re.error handler below unreachable.
    pattern = re.escape(args.pattern) if args.fixed_strings else args.pattern
    try:
        regex = re.compile(pattern)
    except re.error as e:
        print(f"Invalid regex: {e}", file=sys.stderr)
        sys.exit(1)

    # Materialized so the "nothing to do" case can be detected up front.
    csv_paths = list(gather_csvs(args.path))
    if not csv_paths:
        print("No CSV files found.", file=sys.stderr)
        sys.exit(1)

    if args.workers > 1:
        with ThreadPoolExecutor(max_workers=args.workers) as ex:
            futures = [
                ex.submit(search_file, path, regex, args.encoding, args.buffer)
                for path in csv_paths
            ]
            # Collect results so an unexpected worker error is raised here
            # instead of being silently dropped with the future.
            for future in futures:
                future.result()
    else:
        for path in csv_paths:
            search_file(path, regex, args.encoding, args.buffer)
# Run the CLI only when executed as a script, not when imported.
if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment