Skip to content

Instantly share code, notes, and snippets.

@dhbrojas
Created July 18, 2025 11:11
Show Gist options
  • Select an option

  • Save dhbrojas/2eb76b2c12200134c58d866bf2a9a362 to your computer and use it in GitHub Desktop.

Select an option

Save dhbrojas/2eb76b2c12200134c58d866bf2a9a362 to your computer and use it in GitHub Desktop.
Parquet Streaming Reader
from collections import deque
from typing import Any, Dict, List

import pyarrow.parquet as pq
class ParquetReader:
    """Row-by-row streaming iterator over a Parquet file.

    Record batches are pulled lazily from ``ParquetFile.iter_batches`` and
    only the current batch is converted to Python objects. Each row is
    returned as a dictionary produced by the batch's ``to_pylist()``.

    ``len(reader)`` reports the number of rows that have not been returned
    yet, so it shrinks as the reader is consumed.
    """

    def __init__(self, file: str, batch_size: int = 256):
        """Open *file* and prepare lazy batch iteration.

        Args:
            file: Path of the Parquet file, passed to ``pq.ParquetFile``.
            batch_size: Forwarded to ``iter_batches`` as the batch size.
        """
        self.fp = pq.ParquetFile(file)
        # Total row count is taken from the file metadata, not by scanning.
        self.num_rows = self.fp.metadata.num_rows
        self.num_rows_read = 0
        # Rows of the current batch that have not been returned yet
        # (None until the first batch is loaded).
        self.batch = None
        self.batches = self.fp.iter_batches(batch_size=batch_size)

    def __len__(self) -> int:
        """Return the number of rows still to be read."""
        return self.num_rows - self.num_rows_read

    def __iter__(self) -> "ParquetReader":
        """Return the reader itself so it works in ``for`` loops."""
        return self

    def __next__(self) -> Dict[str, Any]:
        """Return the next row.

        Raises:
            StopIteration: When every row counted in the metadata has been
                returned, or when the underlying batch iterator runs dry.
        """
        if self.num_rows_read >= self.num_rows:
            raise StopIteration
        # Loop rather than a single check so zero-length batches are skipped
        # instead of failing on the pop below.
        while not self.batch:
            # A deque gives O(1) removal from the front; list.pop(0) is O(n).
            self.batch = deque(next(self.batches).to_pylist())
        row = self.batch.popleft()
        self.num_rows_read += 1
        return row
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment