Created
June 4, 2025 23:02
-
-
Save rhenter/ff7a93585e659a5dc092ac321b95ef09 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import os | |
| from pathlib import Path | |
| from typing import Union, Dict, List | |
| from concurrent.futures import ThreadPoolExecutor | |
class PDFCrawler:
    """Discover PDF files under one or more directories and process them.

    Each source directory is scanned as follows:

    * PDFs located directly in the source directory form one batch.
    * Every first-level subfolder is searched recursively and forms its own
      batch.

    Each batch is handed to a ``ThreadPoolExecutor`` limited to
    ``max_workers_per_folder`` threads; batches run one after another.
    Override or replace ``process_pdf`` to plug in real processing logic.
    """

    def __init__(
        self,
        source: Union[str, Path, Dict[str, Union[str, Path]]],
        max_workers_per_folder: int = 4,
    ):
        """Store the normalized source paths and the per-batch thread limit.

        Args:
            source: A single directory (``str`` or ``Path``) or a dict whose
                values are directories. Dict keys are ignored.
            max_workers_per_folder: Maximum number of threads used for each
                batch of PDFs.

        Raises:
            ValueError: If ``source`` is not a ``str``, ``Path`` or ``dict``.
        """
        self.source_paths: List[Path] = self._parse_source(source)
        self.max_workers_per_folder = max_workers_per_folder

    def _parse_source(
        self, source: Union[str, Path, Dict[str, Union[str, Path]]]
    ) -> List[Path]:
        """Return ``source`` as a list of absolute, user-expanded paths."""
        if isinstance(source, dict):
            return [Path(p).expanduser().resolve() for p in source.values()]
        if isinstance(source, (str, Path)):
            return [Path(source).expanduser().resolve()]
        raise ValueError("Invalid source type")

    def _get_root_subfolders(self, root_path: Path) -> List[Path]:
        """Return all first-level subdirectories of ``root_path``."""
        return [p for p in root_path.iterdir() if p.is_dir()]

    @staticmethod
    def _is_pdf(path: Path) -> bool:
        """Return True if ``path`` is a file whose name ends in ``.pdf``.

        The comparison ignores case so that ``REPORT.PDF`` is found on
        case-sensitive filesystems as well.
        """
        return path.name.lower().endswith(".pdf") and path.is_file()

    def _find_pdfs_recursively(self, base_path: Path) -> List[Path]:
        """Return every PDF file at any depth under ``base_path``."""
        # Walk everything and filter by name: a "*.pdf" glob pattern is
        # case-sensitive on POSIX and would skip upper-case extensions.
        return [file for file in base_path.rglob("*") if self._is_pdf(file)]

    def _process_batch(self, pdfs: List[Path]) -> None:
        """Run ``process_pdf`` over ``pdfs`` in a bounded thread pool.

        Raises:
            Exception: The first exception raised by ``process_pdf`` (in
                input order) is re-raised here instead of being dropped.
        """
        if not pdfs:
            # Nothing to do; avoid spinning up an idle pool.
            return
        with ThreadPoolExecutor(max_workers=self.max_workers_per_folder) as executor:
            # ``Executor.map`` only surfaces worker exceptions when its
            # result iterator is consumed, so drain it explicitly.
            for _ in executor.map(self.process_pdf, pdfs):
                pass

    def crawl(self) -> None:
        """Process every PDF found under each configured source directory.

        Sources that do not exist or are not directories are skipped.

        Raises:
            Exception: Whatever ``process_pdf`` raises for a file; the crawl
                stops at the first failing batch.
        """
        for base_path in self.source_paths:
            # ``is_dir`` is already False for paths that do not exist.
            if not base_path.is_dir():
                continue

            root_folders = self._get_root_subfolders(base_path)
            if not root_folders:
                # Flat directory: everything lives directly in the root.
                self._process_batch(self._find_pdfs_recursively(base_path))
                continue

            # PDFs stored next to the subfolders belong to no subfolder
            # batch, so handle them as a batch of their own.
            self._process_batch([p for p in base_path.iterdir() if self._is_pdf(p)])

            # One bounded thread pool per first-level subfolder.
            for folder in root_folders:
                self._process_batch(self._find_pdfs_recursively(folder))

    @staticmethod
    def process_pdf(file_path: Path):
        """Handle a single PDF; the default implementation prints its path."""
        # Placeholder function – replace with your actual processing logic
        print(f"[PDF]: {file_path}")
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment