@zzstoatzz
Created November 30, 2025 21:19
"""proof of concept: image moderation with pydantic-ai.
tests pydantic-ai's ability to analyze images and detect policy violations
with different policy strictness levels.
usage:
uv run sandbox/test_image_moderation.py
"""

import asyncio
import os
from pathlib import Path
from typing import Literal

import dotenv
import logfire
from pydantic import BaseModel, Field
from pydantic_ai import Agent
from pydantic_ai.messages import BinaryImage
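
# note (assumption, not part of the original gist): .env is expected to supply
# the credentials for the gateway model used below, plus LOGFIRE_WRITE_TOKEN
# for logfire instrumentation.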
dotenv.load_dotenv()

logfire.configure(token=os.getenv("LOGFIRE_WRITE_TOKEN"))
logfire.instrument_pydantic_ai()


class ModerationResult(BaseModel):
    """structured moderation result."""

    is_safe: bool = Field(description="whether the image passes moderation")
    violated_categories: list[str] = Field(
        default_factory=list,
        description="categories that were violated (empty if safe)",
    )
    severity: Literal["safe", "low", "medium", "high"] = Field(
        description="severity of violations"
    )
    explanation: str = Field(description="explanation of the decision")
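
    # illustrative only (not from the gist): a flagged result might look like
    # ModerationResult(
    #     is_safe=False,
    #     violated_categories=["weapons"],
    #     severity="high",
    #     explanation="image depicts firearms, prohibited under the strict policy",
    # )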


async def moderate_image(image_path: Path, policy: str) -> ModerationResult:
    """moderate an image against a policy using pydantic-ai.

    args:
        image_path: path to image file
        policy: moderation policy description

    returns:
        structured moderation result
    """
    # read image bytes
    image_bytes = image_path.read_bytes()

    # create BinaryImage from raw bytes
    binary_image = BinaryImage(data=image_bytes, media_type="image/png")
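
    # suggestion (not in the original): media_type is hardcoded to PNG here;
    # for arbitrary files it could be guessed from the filename, e.g.
    # mimetypes.guess_type(image_path.name)[0]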

    # create the moderation agent with structured output; temperature is set
    # to 0 per run below for more consistent moderation decisions
    moderator_agent = Agent[None, ModerationResult](
        "gateway/anthropic:claude-sonnet-4-5", output_type=ModerationResult
    )

    # use instructions kwarg for policy enforcement
    instructions = f"""
    Analyze the provided image against this content policy:

    {policy}

    Be accurate and conservative - only flag clear violations.
    """

    result = await moderator_agent.run(
        ["this is the image to moderate", binary_image],
        instructions=instructions,
        model_settings={"temperature": 0},
    )
    return result.output
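
# usage sketch (not in the original gist): main() below awaits the two policy
# checks sequentially; they could also run concurrently, e.g.
#     strict, nuanced = await asyncio.gather(
#         moderate_image(path, strict_policy),
#         moderate_image(path, nuanced_policy),
#     )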


async def main() -> None:
    """run image moderation proof of concept."""
    sandbox = Path(__file__).parent
    test_image = sandbox / "handguns.png"

    if not test_image.exists():
        raise FileNotFoundError(f"test image not found: {test_image}")

    # policy 1: strict - no weapons at all
    strict_policy = """
    Content Policy - STRICT:

    PROHIBITED CONTENT:
    - Firearms, guns, or any violent weapons
    - Depictions of weapons regardless of context

    Any image containing weapons should be flagged with HIGH severity.
    """

    # policy 2: nuanced - weapons ok if not depicting violence
    nuanced_policy = """
    Content Policy - NUANCED:

    PROHIBITED CONTENT:
    - Explicit acts of violence
    - Weapons being used to harm people or animals
    - Threatening or aggressive poses with weapons

    ALLOWED CONTENT:
    - Static displays of weapons (e.g., collections, product photos)
    - Historical or educational weapon imagery
    - Weapons not being actively used for violence

    Only flag images showing actual violence or threatening behavior.
    Neutral weapon displays without violent context are acceptable.
    """
print("\n" + "=" * 70)
print(f"testing image moderation with: {test_image.name}")
print("=" * 70)
# test 1: strict policy (should flag)
print("\ntest 1: strict policy (no weapons allowed)")
print("-" * 70)
strict_result = await moderate_image(test_image, strict_policy)
print(f"is_safe: {strict_result.is_safe}")
print(f"violated_categories: {strict_result.violated_categories}")
print(f"severity: {strict_result.severity}")
print(f"explanation: {strict_result.explanation}")
# test 2: nuanced policy (should pass - no violence depicted)
print("\ntest 2: nuanced policy (weapons ok if no violence)")
print("-" * 70)
nuanced_result = await moderate_image(test_image, nuanced_policy)
print(f"is_safe: {nuanced_result.is_safe}")
print(f"violated_categories: {nuanced_result.violated_categories}")
print(f"severity: {nuanced_result.severity}")
print(f"explanation: {nuanced_result.explanation}")
print("\n" + "=" * 70)
print("proof of concept complete")
print("=" * 70)
# validate results match expectations
print("\nexpectations:")
print(f" strict policy should flag: {not strict_result.is_safe} ✓")
print(
f" nuanced policy should pass: {nuanced_result.is_safe} {'✓' if nuanced_result.is_safe else '✗'}"
)


if __name__ == "__main__":
    asyncio.run(main())