Skip to content

Instantly share code, notes, and snippets.

@caleb-kaiser
Last active April 2, 2025 20:04
Show Gist options
  • Select an option

  • Save caleb-kaiser/c79b6b545e61c87290acc108360bb58c to your computer and use it in GitHub Desktop.

Select an option

Save caleb-kaiser/c79b6b545e61c87290acc108360bb58c to your computer and use it in GitHub Desktop.
import json
import os
from typing import List, Optional

import openai
from opik.evaluation.metrics import BaseMetric
from opik.evaluation.metrics.score_result import ScoreResult
from opik.guardrail import Guardrail, Transformation
from presidio_analyzer import AnalyzerEngine
from presidio_anonymizer import AnonymizerEngine
# Step 1. Define a metric that checks if a response contains content about a specific topic.
class TopicFilterMetric(BaseMetric):
    """Metric that asks an LLM whether a response discusses a restricted topic.

    The score is 1.0 when the model reports that the topic is absent and 0.0
    when it reports that the topic is present. A model reply that cannot be
    interpreted is also scored 0.0 (fail closed), so text is never passed on
    the strength of an unparseable verdict.

    Args:
        topic: Human-readable description of the restricted topic.
        api_key: API key for the OpenAI-compatible endpoint. When omitted,
            the ``OPENROUTER_API_KEY`` environment variable is read each time
            ``score`` runs. Credentials must not be committed to source.
        base_url: Base URL of the OpenAI-compatible endpoint.
        model: Model identifier sent with every completion request.
    """

    def __init__(
        self,
        topic: str,
        *,
        api_key: Optional[str] = None,
        base_url: str = "https://openrouter.ai/api/v1",
        model: str = "gpt-3.5-turbo",
    ):
        self.topic = topic
        self.base_url = base_url
        self.model = model
        self._api_key = api_key
        super().__init__(name=f"topic_filter_{topic}")

    async def score(self, output: str, **kwargs) -> ScoreResult:
        """Score ``output`` by asking the model whether it covers the topic.

        Args:
            output: Text to check.
            **kwargs: Accepted for interface compatibility; unused here.

        Returns:
            A ``ScoreResult`` whose value is 1.0 only when the model returned
            ``contains_topic: false``. Metadata carries ``contains_topic``
            (``None`` if the reply was unusable) and ``reason``.

        Raises:
            ValueError: If no API key was supplied and ``OPENROUTER_API_KEY``
                is not set.
        """
        api_key = self._api_key or os.environ.get("OPENROUTER_API_KEY")
        if not api_key:
            raise ValueError(
                "No API key configured: pass api_key= to TopicFilterMetric "
                "or set the OPENROUTER_API_KEY environment variable."
            )

        prompt = (
            f"Analyze the following text and determine if it contains content about {self.topic}.\n"
            "Return a JSON object with two fields:\n"
            f"- 'contains_topic': boolean indicating if the text contains content about {self.topic}\n"
            "- 'reason': string explaining your decision\n"
            f"Text to analyze: {output}"
        )

        # A client is created per call (not cached on the instance) so it is
        # never reused across different event loops; the context manager
        # closes its HTTP connections once the request has completed.
        async with openai.AsyncOpenAI(
            api_key=api_key, base_url=self.base_url
        ) as client:
            response = await client.chat.completions.create(
                model=self.model,
                messages=[{"role": "user", "content": prompt}],
                response_format={"type": "json_object"},
            )

        contains_topic, reason = self._parse_verdict(
            response.choices[0].message.content
        )
        return ScoreResult(
            name=self.name,
            # Only an explicit boolean False passes; True and "unknown" block.
            value=1.0 if contains_topic is False else 0.0,
            metadata={"contains_topic": contains_topic, "reason": reason},
        )

    @staticmethod
    def _parse_verdict(raw):
        """Return ``(contains_topic, reason)`` extracted from the model reply.

        ``contains_topic`` is ``None`` when ``raw`` is not a JSON object that
        has a boolean ``contains_topic`` field; ``reason`` then explains what
        was wrong with the reply.
        """
        try:
            verdict = json.loads(raw)
        except (TypeError, ValueError):
            # TypeError: content was None; ValueError: invalid JSON text.
            return None, f"Could not parse model response as JSON: {raw!r}"
        if not isinstance(verdict, dict) or not isinstance(
            verdict.get("contains_topic"), bool
        ):
            return (
                None,
                f"Model response lacks a boolean 'contains_topic' field: {raw!r}",
            )
        return verdict["contains_topic"], verdict.get("reason", "No reason provided")
# Step 2. Define a PII detection metric
class PIIDetectionMetric(BaseMetric):
    """Metric that fails any text in which the Presidio analyzer finds PII."""

    def __init__(self):
        super().__init__(name="pii_detection")
        self.analyzer = AnalyzerEngine()

    async def score(self, output: str, **kwargs) -> ScoreResult:
        """Run the analyzer over ``output`` and score 1.0 only if nothing is found.

        The metadata lists the entity type and the ``(start, end)`` offsets of
        every finding, plus a short human-readable ``reason``.
        """
        findings = self.analyzer.analyze(text=output, language="en")

        if findings:
            value = 0.0
            reason = f"Found {len(findings)} PII instances"
        else:
            value = 1.0
            reason = "No PII detected"

        # Flatten the analyzer results into plain, easily printable lists.
        entity_types = []
        spans = []
        for finding in findings:
            entity_types.append(finding.entity_type)
            spans.append((finding.start, finding.end))

        details = {
            "pii_types": entity_types,
            "pii_locations": spans,
            "reason": reason,
        }
        return ScoreResult(name=self.name, value=value, metadata=details)
# Step 3. Define a PII removal transformation
class PIIRemovalTransformation(Transformation):
    """Transformation that passes text through Presidio's analyzer and anonymizer."""

    def __init__(self):
        # The bound method is only registered here; it is not invoked until
        # the transformation runs, by which time both engines exist.
        super().__init__(
            name="pii_removal",
            description="Removes PII using Microsoft's Presidio",
            transform_fn=self._remove_pii,
        )
        self.analyzer = AnalyzerEngine()
        self.anonymizer = AnonymizerEngine()

    def _remove_pii(self, text: str) -> str:
        """Return the anonymizer's text output for the PII detected in ``text``."""
        detections = self.analyzer.analyze(text=text, language="en")
        return self.anonymizer.anonymize(
            text=text, analyzer_results=detections
        ).text
# Step 4. Define a guardrail that uses both metrics and the PII transformation
def _report_blocked(output) -> None:
    """Print the reasons reported by the metrics that actually failed.

    Printing only ``output.results[0]`` would show the reason of a metric
    that passed whenever a later metric is the one that blocked the response.
    """
    # NOTE(review): assumes output.results holds the same score objects that
    # calculate_success receives (each with .value and .metadata) - confirm.
    failing_reasons = [
        result.metadata["reason"]
        for result in output.results
        if result.value != 1.0
    ]
    print(f"Response blocked: {'; '.join(failing_reasons)}")


# The guardrail passes only when every metric scores exactly 1.0.
guardrail = Guardrail(
    metrics=[
        TopicFilterMetric(
            topic="Formula 1 racing",
        ),
        PIIDetectionMetric(),
    ],
    transformations=[PIIRemovalTransformation()],
    calculate_success=lambda results: all(result.value == 1.0 for result in results),
    on_success=lambda output: print(f"Response passed: {output.results[0].metadata['reason']}"),
    on_failure=_report_blocked,
)
# Step 5. Run the guardrail
def _print_outcome(outcome) -> None:
    """Print a guardrail outcome.

    A blocked outcome lists each metric's name and reason; a passing outcome
    shows the transformed (anonymized) text.
    """
    if not outcome.passed:
        print("Response was blocked!")
        # A distinct loop name avoids shadowing the outcome being iterated.
        for metric_result in outcome.results:
            print(f"{metric_result.name}: {metric_result.metadata['reason']}")
    else:
        print("Response passed all checks!")
        print(f"Anonymized text: {outcome.transformed_output}")


async def main():
    """Run the guardrail over two sample LLM outputs and print each outcome."""
    sample_outputs = (
        # On-topic (Formula 1) text that also contains PII.
        "The Monaco Grand Prix is one of the most prestigious races in Formula 1. My phone number is 555-123-4567 and I live at 123 Main St.",
        # Text with PII only.
        "On the 1st of April, 2025, I will be in Monaco. I will be visiting my friend Harry Houdini. His phone number is 555-123-4567 and he lives at 123 Main St, Monaco City, Monaco.",
    )
    # Samples are evaluated one at a time so the printed output stays in order.
    for llm_output in sample_outputs:
        _print_outcome(await guardrail(llm_output=llm_output))
if __name__ == "__main__":
    # asyncio is only needed when this file is executed as a script.
    import asyncio

    demo = main()
    asyncio.run(demo)
@caleb-kaiser
Copy link
Author

Output looks like:

# First prompt
Response was blocked!
topic_filter_Formula 1 racing: The text mentions the Monaco Grand Prix, which is a well-known race in Formula 1, indicating that the content is about Formula 1 racing.
pii_detection: No PII detected

# Second prompt
Response passed: The text does not contain any content related to Formula 1 racing. It primarily talks about visiting a friend in a specific location and providing contact details.
Response passed all checks!
Anonymized text: On <DATE_TIME>, I will be in <LOCATION>. I will be visiting my friend <PERSON>. His phone number is <PHONE_NUMBER> and he lives at 123 <LOCATION>, <LOCATION>, <LOCATION>.

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment