Skip to content

Instantly share code, notes, and snippets.

@James-Rocker
Created December 26, 2024 20:06
Show Gist options
  • Select an option

  • Save James-Rocker/984c8cda674ea9c68c94d96e12ba545f to your computer and use it in GitHub Desktop.

Select an option

Save James-Rocker/984c8cda674ea9c68c94d96e12ba545f to your computer and use it in GitHub Desktop.
Comparing Pandera vs Pydantic
import pandas as pd
import time
from pydantic import BaseModel, ValidationError, Field
import pandera as pa
from pandera import Column, DataFrameSchema
# Generate a synthetic DataFrame
def generate_data(n: int) -> pd.DataFrame:
    """Build a synthetic DataFrame with ``n`` rows numbered from 1 to ``n``.

    Columns, in order:
        id:     the row number (1..n)
        name:   ``"name_<row number>"``
        age:    the row number modulo 100
        salary: the row number times 1000.0
    """
    # A range can be iterated repeatedly, so one object feeds every column.
    row_numbers = range(1, n + 1)
    columns = {
        "id": list(row_numbers),
        "name": [f"name_{k}" for k in row_numbers],
        "age": [k % 100 for k in row_numbers],
        "salary": [k * 1000.0 for k in row_numbers],
    }
    return pd.DataFrame(columns)
# Pydantic model for validating a row
class RowModel(BaseModel):
    """Pydantic model describing a single record of the benchmark DataFrame.

    Every field is required: none of them supplies a default value.
    """

    # Row identifier; must be greater than or equal to 1.
    id: int = Field(default=..., ge=1)
    name: str
    # Must lie in the closed interval [0, 100].
    age: int = Field(default=..., ge=0, le=100)
    # Must be non-negative.
    salary: float = Field(default=..., ge=0.0)
# Pandera schema for validating the entire DataFrame
# One Column entry per DataFrame column; each carries its expected type and
# the value-range checks applied to that column.
schema = DataFrameSchema(
    {
        "id": Column(int, checks=[pa.Check.ge(1)]),
        "name": Column(str),
        "age": Column(int, checks=[pa.Check.ge(0), pa.Check.le(100)]),
        "salary": Column(float, checks=[pa.Check.ge(0.0)]),
    }
)
# Validate rows using Pydantic
def validate_with_pydantic(df: pd.DataFrame) -> bool:
    """Validate ``df`` one record at a time by constructing a ``RowModel``.

    Each record that raises ``ValidationError`` is reported on stdout and
    the loop continues with the next record.

    Returns:
        True when no record raised ``ValidationError``, otherwise False.
    """
    failures = 0
    for record in df.to_dict(orient="records"):
        try:
            RowModel(**record)
        except ValidationError as exc:
            failures += 1
            print(f"Pydantic Validation Error: {exc}")
    return failures == 0
# Validate entire DataFrame using Pandera
def validate_with_pandera(df: pd.DataFrame) -> bool:
    """Validate the whole of ``df`` against the module-level ``schema``.

    A ``SchemaError`` raised by ``schema.validate`` is reported on stdout.

    Returns:
        True when validation completed without ``SchemaError``, else False.
    """
    try:
        schema.validate(df)
    except pa.errors.SchemaError as exc:
        print(f"Pandera Validation Error: {exc}")
        return False
    return True
# Benchmark function
def benchmark(n: int) -> None:
    """Time Pydantic and Pandera validation on the same ``n``-row DataFrame.

    The DataFrame is generated once with ``generate_data`` and passed to
    ``validate_with_pydantic`` and then ``validate_with_pandera``. For each
    validator, the pass/fail result and the elapsed time are printed.

    Args:
        n: Number of rows in the synthetic DataFrame.
    """
    df = generate_data(n)

    # time.perf_counter() is used rather than time.time(): it is a monotonic,
    # high-resolution clock intended for measuring short durations, whereas
    # the wall clock can be adjusted while a measurement is in progress.

    # Benchmark Pydantic validation
    started = time.perf_counter()
    pydantic_valid = validate_with_pydantic(df)
    pydantic_time = time.perf_counter() - started

    # Benchmark Pandera validation
    started = time.perf_counter()
    pandera_valid = validate_with_pandera(df)
    pandera_time = time.perf_counter() - started

    # Results
    print(f"Pydantic validation passed: {pydantic_valid}, Time taken: {pydantic_time:.6f} seconds")
    print(f"Pandera validation passed: {pandera_valid}, Time taken: {pandera_time:.6f} seconds")
# Run the benchmark with 10000 rows
if __name__ == "__main__":
    # Script entry point: run the comparison on a 10,000-row DataFrame.
    benchmark(10_000)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment