Created
November 13, 2025 17:42
-
-
Save yossiovadia/c1a0e822e836d73db68ea9fe9e321adc to your computer and use it in GitHub Desktop.
Pure Python LoRA PII Model Validator - Bypasses semantic-router to test model directly
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/env python3 | |
| """ | |
| Pure Python LoRA PII Model Test | |
| ================================ | |
| This test loads the LoRA PII model directly using HuggingFace transformers | |
| and runs inference WITHOUT using ANY semantic-router code (no Go, no Rust FFI). | |
| This proves whether the 0.9 confidence comes from: | |
| - The model itself ✓ | |
| - Our semantic-router code ✗ | |
| """ | |
| import json | |
| import torch | |
| from transformers import ( | |
| AutoTokenizer, | |
| AutoModelForTokenClassification, | |
| pipeline, | |
| ) | |
| from peft import PeftModel, PeftConfig | |
def load_lora_pii_model(model_path: str):
    """Load a PII token-classification model, preferring the LoRA/PEFT layout.

    The path is first interpreted as a PEFT adapter directory (base model +
    LoRA weights); if anything in that load path fails, it falls back to
    loading the same path as a plain ``transformers`` model.

    Returns:
        A ``(model, tokenizer)`` pair.
    """
    print(f"Loading LoRA PII model from: {model_path}")
    print("=" * 80)

    try:
        # A readable PeftConfig means this directory holds LoRA adapter weights.
        cfg = PeftConfig.from_pretrained(model_path)
        print("✓ Detected LoRA model")
        print(f"  Base model: {cfg.base_model_name_or_path}")
        print(f"  Task type: {cfg.task_type}")

        # Load the frozen base model, then attach the LoRA adapter on top.
        base = AutoModelForTokenClassification.from_pretrained(
            cfg.base_model_name_or_path
        )
        lora_model = PeftModel.from_pretrained(base, model_path)
        tok = AutoTokenizer.from_pretrained(cfg.base_model_name_or_path)

        print("✓ Model loaded successfully\n")
        return lora_model, tok
    except Exception as exc:
        # Anything went wrong on the LoRA path -> try a standard load instead.
        print(f"✗ Error loading LoRA model: {exc}")
        print("Trying to load as standard transformers model...")
        plain = AutoModelForTokenClassification.from_pretrained(model_path)
        tok = AutoTokenizer.from_pretrained(model_path)
        print("✓ Model loaded successfully\n")
        return plain, tok
def test_pii_detection(model, tokenizer, test_cases):
    """Run every test case through a token-classification ("ner") pipeline.

    Prints each detected entity with its score.

    Returns:
        Tuple ``(all_confidences, total_detections)``: the flat list of
        per-entity confidence scores and the number of detected entities.
    """
    # "simple" aggregation merges subword tokens back into whole words.
    detector = pipeline(
        "ner",
        model=model,
        tokenizer=tokenizer,
        aggregation_strategy="simple",
    )

    print("Running PII Detection Tests")
    print("=" * 80)

    scores = []
    detected = 0
    total_cases = len(test_cases)

    for idx, case in enumerate(test_cases, 1):
        print(f"\n[Test {idx}/{total_cases}] {case['name']}")
        print(f"Text: \"{case['text']}\"")

        entities = detector(case["text"])
        if not entities:
            print("Result: ✓ No PII detected")
            continue

        print(f"Result: ⚠️ PII detected ({len(entities)} entities)")
        detected += len(entities)
        for pos, ent in enumerate(entities, 1):
            score = ent["score"]
            scores.append(score)
            print(
                f"  Entity {pos}: type={ent['entity_group']}, "
                f"word=\"{ent['word']}\", "
                f"confidence={score:.6f}"
            )

    return scores, detected
def analyze_confidence_distribution(confidences, total_detections):
    """Print a frequency table and summary statistics of confidence scores.

    Args:
        confidences: Flat list of per-entity confidence scores.
        total_detections: Number of detected entities; normally equals
            ``len(confidences)``. Used as the percentage denominator; if it
            is 0 while ``confidences`` is non-empty, ``len(confidences)`` is
            used instead (the original code raised ZeroDivisionError here).
    """
    print("\n" + "=" * 80)
    print("STATISTICAL ANALYSIS")
    print("=" * 80)
    print(f"Total PII detections: {total_detections}")
    print(f"Unique confidence values: {len(set(confidences))}")

    if not confidences:
        print("No detections to analyze")
        return

    # Guard the percentage denominator: callers should pass
    # total_detections == len(confidences), but don't crash if they don't.
    denom = total_detections or len(confidences)

    # Count occurrences of each distinct confidence value.
    from collections import Counter

    confidence_counts = Counter(confidences)
    print("\nConfidence distribution:")
    for conf in sorted(confidence_counts, reverse=True):
        count = confidence_counts[conf]
        percentage = (count / denom) * 100
        print(f"  {conf:.6f}: {count} detections ({percentage:.1f}%)")

    # Statistical summary
    import statistics

    print("\nStatistical Summary:")
    print(f"  Min: {min(confidences):.6f}")
    print(f"  Max: {max(confidences):.6f}")
    print(f"  Mean: {statistics.mean(confidences):.6f}")
    print(f"  Median: {statistics.median(confidences):.6f}")
    if len(confidences) > 1:
        # stdev needs at least two data points.
        print(f"  Std Dev: {statistics.stdev(confidences):.6f}")

    # Conclusion: a single distinct value is the smoking gun for a
    # model that emits a hard-coded/uniform confidence.
    print("\n" + "=" * 80)
    print("CONCLUSION")
    print("=" * 80)
    if len(set(confidences)) == 1:
        uniform_conf = confidences[0]
        print(f"✓ ALL {total_detections} detections returned EXACTLY {uniform_conf:.6f}")
        print("\nThis proves the uniform confidence comes from the MODEL ITSELF,")
        print("not from semantic-router code (Go or Rust).")
        print("\nThis test used:")
        print("  - Pure Python (no Go code)")
        print("  - HuggingFace transformers (no Rust candle-binding)")
        print("  - Direct model inference (no semantic-router)")
    else:
        print("✓ Model returns varying confidence scores (probabilistic)")
    print("=" * 80)
def main():
    """Entry point: load the model, run all test cases, summarize results."""
    print("\n")
    print("=" * 80)
    print("PURE PYTHON LoRA PII MODEL TEST")
    print("=" * 80)
    print("This test loads and runs the LoRA PII model WITHOUT any semantic-router code.")
    print("It proves whether 0.9 confidence is from the model or our code.\n")

    # Local checkout of the LoRA adapter under test.
    model_path = "models/lora_pii_detector_bert-base-uncased_model"

    # One case per PII category, plus two clean sentences as negatives.
    test_cases = [
        {"name": "Email", "text": "Contact me at john.doe@example.com for more info"},
        {"name": "Phone", "text": "Call me at (555) 123-4567 anytime"},
        {"name": "SSN", "text": "My SSN is 123-45-6789"},
        {"name": "Credit Card", "text": "Card number: 4532-1488-0343-6467"},
        {
            "name": "Multiple PII",
            "text": "Email john@example.com, phone (555)123-4567",
        },
        {"name": "No PII #1", "text": "The weather is nice today"},
        {"name": "No PII #2", "text": "Machine learning is fascinating"},
    ]

    try:
        model, tokenizer = load_lora_pii_model(model_path)
        confidences, total_detections = test_pii_detection(
            model, tokenizer, test_cases
        )
        analyze_confidence_distribution(confidences, total_detections)
    except Exception as exc:
        print(f"\n✗ Error: {exc}")
        import traceback

        traceback.print_exc()


if __name__ == "__main__":
    main()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment