Skip to content

Instantly share code, notes, and snippets.

@farzadhallaji
Created July 20, 2025 19:11
Show Gist options
  • Select an option

  • Save farzadhallaji/b2fccf77620e901a8f6bcfd3fe8cc81c to your computer and use it in GitHub Desktop.

Select an option

Save farzadhallaji/b2fccf77620e901a8f6bcfd3fe8cc81c to your computer and use it in GitHub Desktop.
#!/usr/bin/env python3
import argparse
import io
import os
import re
import sys
from concurrent.futures import ThreadPoolExecutor
def search_file(path: str, regex: re.Pattern, encoding: str, bufsize: int):
    """Print every line of `path` that matches `regex`.

    The file is opened in text mode and iterated line by line, so only the
    current line (plus the I/O buffer) is held at a time. Each hit is
    written to stdout as ``path:lineno:line`` with trailing whitespace
    removed; line numbers start at 1.

    Args:
        path: File to scan.
        regex: Compiled pattern; a line is a hit when ``regex.search``
            finds a match anywhere in it.
        encoding: Text encoding used to decode the file. Undecodable bytes
            are dropped (``errors='ignore'``) instead of aborting the scan.
        bufsize: Value passed straight to ``open(..., buffering=...)``.

    Returns:
        None. Any error raised while opening or reading the file is
        reported on stderr as ``Skipping '<path>': <error>`` and the file
        is abandoned; hits already printed for that file stay printed.
    """
    try:
        with open(path, 'r', encoding=encoding, errors='ignore',
                  buffering=bufsize) as f:
            for lineno, line in enumerate(f, start=1):
                if regex.search(line):
                    # One write per record: print() emits the text and the
                    # newline separately, which lets output from parallel
                    # workers interleave in the middle of a line.
                    sys.stdout.write(f"{path}:{lineno}:{line.rstrip()}\n")
    except Exception as e:
        # Deliberate best-effort boundary: one unreadable file (missing,
        # permission denied, unknown encoding, bad buffer size, ...) must
        # not stop the scan of the remaining files.
        sys.stderr.write(f"Skipping {path!r}: {e}\n")
def gather_csvs(root_dir: str):
    """Yield the path of every ``.csv`` file found beneath `root_dir`.

    The tree is traversed with ``os.walk``; a file qualifies when its
    lower-cased name ends in ``.csv``, so ``DATA.CSV`` is included too.
    Each yielded path is the containing directory joined with the file name.
    """
    for folder, _subdirs, names in os.walk(root_dir):
        yield from (
            os.path.join(folder, name)
            for name in names
            if name.lower().endswith('.csv')
        )
def main(argv=None):
    """Command-line entry point: grep every ``.csv`` under a directory.

    Parses the command line, compiles the search pattern once, collects the
    CSV paths via ``gather_csvs`` and runs ``search_file`` on each of them,
    either sequentially or on a thread pool when ``--workers`` exceeds 1.

    Args:
        argv: Argument list to parse. ``None`` (the default) makes argparse
            read ``sys.argv[1:]``, which is the behaviour of a plain
            ``main()`` call.

    Raises:
        SystemExit: with status 1 when the pattern is not a valid regex or
            when no CSV file is found (argparse also exits on bad usage).
    """
    p = argparse.ArgumentParser(
        description="Recursively grep CSVs for a regex (streaming, low RAM)."
    )
    p.add_argument('-p', '--pattern', required=True,
                   help="Regex (in quotes) to search for.")
    p.add_argument('-F', '--fixed-strings', action='store_true',
                   help="Treat the pattern as a literal string, not a regex.")
    p.add_argument('-d', '--path', default='.',
                   help="Directory to scan for .csv files.")
    p.add_argument('-w', '--workers', type=int, default=1,
                   help="Number of files to scan in parallel (default 1).")
    p.add_argument('--buffer', type=int,
                   default=io.DEFAULT_BUFFER_SIZE * 8,
                   help="I/O buffer size in bytes (default 8× OS default).")
    p.add_argument('--encoding', default='utf8',
                   help="File encoding (default utf8).")
    args = p.parse_args(argv)

    # Diagnostic only: sent to stderr so stdout carries nothing but matches.
    print('io.DEFAULT_BUFFER_SIZE', io.DEFAULT_BUFFER_SIZE,
          'buffer:', args.buffer, file=sys.stderr)

    # Compile once up front. The pattern is a real regex, as the help text
    # promises; escaping happens only when a literal match is requested.
    source = re.escape(args.pattern) if args.fixed_strings else args.pattern
    try:
        regex = re.compile(source)
    except re.error as e:
        print(f"Invalid regex: {e}", file=sys.stderr)
        sys.exit(1)

    # Materialised so the "nothing to scan" case can be detected up front.
    csv_paths = list(gather_csvs(args.path))
    if not csv_paths:
        print("No CSV files found.", file=sys.stderr)
        sys.exit(1)

    if args.workers > 1:
        with ThreadPoolExecutor(max_workers=args.workers) as ex:
            futures = [
                ex.submit(search_file, path,
                          regex, args.encoding, args.buffer)
                for path in csv_paths
            ]
        # Leaving the `with` block waits for every task; asking each future
        # for its result re-raises anything a worker did not handle itself
        # instead of letting it vanish with the discarded future.
        for future in futures:
            future.result()
    else:
        for path in csv_paths:
            search_file(path, regex, args.encoding, args.buffer)
# Start the CLI only when this file is executed as a script; importing it
# as a module just defines the functions.
if __name__ == "__main__":
    main()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment