@zzstoatzz
Created November 30, 2025 21:19
"""proof of concept: image moderation with pydantic-ai.
tests pydantic-ai's ability to analyze images and detect policy violations
with different policy strictness levels.
usage:
uv run sandbox/test_image_moderation.py
"""

import asyncio
import os
from pathlib import Path
from typing import Literal

import dotenv
import logfire
from pydantic import BaseModel, Field
from pydantic_ai import Agent
from pydantic_ai.messages import BinaryImage
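
# note (assumption, not part of the original gist): .env is expected to supply
# the credentials for the gateway model used below, plus LOGFIRE_WRITE_TOKEN
# for logfire instrumentation.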
dotenv.load_dotenv()

logfire.configure(token=os.getenv("LOGFIRE_WRITE_TOKEN"))
logfire.instrument_pydantic_ai()


class ModerationResult(BaseModel):
    """structured moderation result."""

    is_safe: bool = Field(description="whether the image passes moderation")
    violated_categories: list[str] = Field(
        default_factory=list,
        description="categories that were violated (empty if safe)",
    )
    severity: Literal["safe", "low", "medium", "high"] = Field(
        description="severity of violations"
    )
    explanation: str = Field(description="explanation of the decision")
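
    # illustrative only (not from the gist): a flagged result might look like
    # ModerationResult(
    #     is_safe=False,
    #     violated_categories=["weapons"],
    #     severity="high",
    #     explanation="image depicts firearms, prohibited under the strict policy",
    # )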


async def moderate_image(image_path: Path, policy: str) -> ModerationResult:
    """moderate an image against a policy using pydantic-ai.

    args:
        image_path: path to image file
        policy: moderation policy description

    returns:
        structured moderation result
    """
    # read image bytes
    image_bytes = image_path.read_bytes()

    # create BinaryImage from raw bytes
    binary_image = BinaryImage(data=image_bytes, media_type="image/png")
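
    # suggestion (not in the original): media_type is hardcoded to PNG here;
    # for arbitrary files it could be guessed from the filename, e.g.
    # mimetypes.guess_type(image_path.name)[0]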

    # create the moderation agent with structured output; temperature is set
    # to 0 per run below for more consistent moderation decisions
    moderator_agent = Agent[None, ModerationResult](
        "gateway/anthropic:claude-sonnet-4-5", output_type=ModerationResult
    )

    # use instructions kwarg for policy enforcement
    instructions = f"""
    Analyze the provided image against this content policy:

    {policy}

    Be accurate and conservative - only flag clear violations.
    """

    result = await moderator_agent.run(
        ["this is the image to moderate", binary_image],
        instructions=instructions,
        model_settings={"temperature": 0},
    )
    return result.output
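
# usage sketch (not in the original gist): main() below awaits the two policy
# checks sequentially; they could also run concurrently, e.g.
#     strict, nuanced = await asyncio.gather(
#         moderate_image(path, strict_policy),
#         moderate_image(path, nuanced_policy),
#     )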


async def main() -> None:
    """run image moderation proof of concept."""
    sandbox = Path(__file__).parent
    test_image = sandbox / "handguns.png"

    if not test_image.exists():
        raise FileNotFoundError(f"test image not found: {test_image}")

    # policy 1: strict - no weapons at all
    strict_policy = """
    Content Policy - STRICT:

    PROHIBITED CONTENT:
    - Firearms, guns, or any violent weapons
    - Depictions of weapons regardless of context

    Any image containing weapons should be flagged with HIGH severity.
    """

    # policy 2: nuanced - weapons ok if not depicting violence
    nuanced_policy = """
    Content Policy - NUANCED:

    PROHIBITED CONTENT:
    - Explicit acts of violence
    - Weapons being used to harm people or animals
    - Threatening or aggressive poses with weapons

    ALLOWED CONTENT:
    - Static displays of weapons (e.g., collections, product photos)
    - Historical or educational weapon imagery
    - Weapons not being actively used for violence

    Only flag images showing actual violence or threatening behavior.
    Neutral weapon displays without violent context are acceptable.
    """
print("\n" + "=" * 70)
print(f"testing image moderation with: {test_image.name}")
print("=" * 70)
# test 1: strict policy (should flag)
print("\ntest 1: strict policy (no weapons allowed)")
print("-" * 70)
strict_result = await moderate_image(test_image, strict_policy)
print(f"is_safe: {strict_result.is_safe}")
print(f"violated_categories: {strict_result.violated_categories}")
print(f"severity: {strict_result.severity}")
print(f"explanation: {strict_result.explanation}")
# test 2: nuanced policy (should pass - no violence depicted)
print("\ntest 2: nuanced policy (weapons ok if no violence)")
print("-" * 70)
nuanced_result = await moderate_image(test_image, nuanced_policy)
print(f"is_safe: {nuanced_result.is_safe}")
print(f"violated_categories: {nuanced_result.violated_categories}")
print(f"severity: {nuanced_result.severity}")
print(f"explanation: {nuanced_result.explanation}")
print("\n" + "=" * 70)
print("proof of concept complete")
print("=" * 70)
# validate results match expectations
print("\nexpectations:")
print(f" strict policy should flag: {not strict_result.is_safe} ✓")
print(
f" nuanced policy should pass: {nuanced_result.is_safe} {'✓' if nuanced_result.is_safe else '✗'}"
)


if __name__ == "__main__":
    asyncio.run(main())