Skip to content

Instantly share code, notes, and snippets.

@rhenter
Created June 4, 2025 23:02
Show Gist options
  • Select an option

  • Save rhenter/ff7a93585e659a5dc092ac321b95ef09 to your computer and use it in GitHub Desktop.

Select an option

Save rhenter/ff7a93585e659a5dc092ac321b95ef09 to your computer and use it in GitHub Desktop.
import logging
import os
from concurrent.futures import ThreadPoolExecutor
from pathlib import Path
from typing import Dict, List, Union
class PDFCrawler:
    """Find PDF files under one or more directories and process them in threads.

    Each source directory is handled in batches: the PDFs that sit directly in
    the source directory form one batch, and every first-level subfolder
    (searched recursively) forms another. Each batch is processed by its own
    thread pool of ``max_workers_per_folder`` workers.
    """

    def __init__(
        self,
        source: Union[str, Path, Dict[str, Union[str, Path]]],
        max_workers_per_folder: int = 4,
    ):
        """Store the normalized source directories and the pool size.

        Args:
            source: A single directory (``str`` or ``Path``) or a dict whose
                values are directories. Dict keys are ignored.
            max_workers_per_folder: Number of worker threads used for each
                batch of PDFs.

        Raises:
            ValueError: If ``source`` is not a str, Path, or dict.
        """
        self.source_paths: List[Path] = self._parse_source(source)
        self.max_workers_per_folder = max_workers_per_folder

    def _parse_source(
        self, source: Union[str, Path, Dict[str, Union[str, Path]]]
    ) -> List[Path]:
        """Normalize ``source`` into a list of user-expanded, resolved paths.

        Raises:
            ValueError: If ``source`` is not a str, Path, or dict.
        """
        if isinstance(source, dict):
            return [Path(p).expanduser().resolve() for p in source.values()]
        if isinstance(source, (str, Path)):
            return [Path(source).expanduser().resolve()]
        raise ValueError("Invalid source type")

    def _get_root_subfolders(self, root_path: Path) -> List[Path]:
        """Return all first-level subdirectories of ``root_path``."""
        return [p for p in root_path.iterdir() if p.is_dir()]

    def _find_pdfs_recursively(self, base_path: Path) -> List[Path]:
        """Return every file matching ``*.pdf`` anywhere under ``base_path``."""
        return [file for file in base_path.rglob("*.pdf") if file.is_file()]

    def _find_pdfs_in_folder(self, folder: Path) -> List[Path]:
        """Return the files matching ``*.pdf`` directly inside ``folder``."""
        return [file for file in folder.glob("*.pdf") if file.is_file()]

    def _process_batch(self, pdfs: List[Path]) -> None:
        """Run ``process_pdf`` on every path in ``pdfs`` using one thread pool.

        A failure in one file is logged and does not stop the other files.
        """
        if not pdfs:
            # Nothing to do; avoid creating a pool for an empty batch.
            return
        with ThreadPoolExecutor(max_workers=self.max_workers_per_folder) as executor:
            futures = {executor.submit(self.process_pdf, pdf): pdf for pdf in pdfs}
        # Leaving the ``with`` block waits for all tasks, so every future is
        # finished here. Inspect each one so worker exceptions are reported
        # instead of being silently dropped.
        for future, pdf in futures.items():
            error = future.exception()
            if error is not None:
                logging.getLogger(__name__).error(
                    "Failed to process %s", pdf, exc_info=error
                )

    def crawl(self) -> None:
        """Process every PDF found under each source directory.

        Sources that do not exist or are not directories are skipped.
        """
        for base_path in self.source_paths:
            # is_dir() is also False for a missing path.
            if not base_path.is_dir():
                continue
            # PDFs directly in the source directory belong to no subfolder, so
            # they get a batch of their own; otherwise they would be skipped
            # whenever the directory also contains subfolders.
            self._process_batch(self._find_pdfs_in_folder(base_path))
            # For each first-level subfolder, use a separate thread pool.
            for folder in self._get_root_subfolders(base_path):
                self._process_batch(self._find_pdfs_recursively(folder))

    @staticmethod
    def process_pdf(file_path: Path):
        """Handle a single PDF; the default implementation prints its path."""
        # Placeholder function – replace with your actual processing logic
        print(f"[PDF]: {file_path}")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment