Created
January 13, 2026 00:45
-
-
Save thiagobutignon/8e30ca0e6522d874cb88ea247be06541 to your computer and use it in GitHub Desktop.
privacy_budget_lab.py
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| import opendp.prelude as dp | |
| dp.enable_features("contrib") | |
| import pandas as pd | |
| import numpy as np | |
# =============================================================================
# DATASET SETUP
# =============================================================================
# Toy customer table: 20 rows with a sequential id, an age, and a purchase
# amount. Every query further down reads from this frame.
customers = pd.DataFrame(
    {
        "customer_id": range(1, 21),
        "age": [
            25, 34, 45, 29, 52, 38, 41, 33, 47, 36,
            28, 31, 42, 26, 39, 44, 30, 35, 48, 27,
        ],
        "purchase_amount": [
            150, 320, 890, 200, 450, 275, 680, 180, 520, 340,
            210, 380, 560, 190, 410, 620, 240, 290, 470, 160,
        ],
    }
)

# Section banner, a ten-row preview, then the row count and a blank line.
print("=" * 60, "CUSTOMER DATASET", "=" * 60, sep="\n")
print(customers.head(10))
print(f"\nTotal customers in dataset: {len(customers)}", end="\n\n")
# =============================================================================
# ACTIVITY 1: BASIC PRIVACY BUDGET IMPLEMENTATION
# =============================================================================
# Privacy parameters shared by the count queries that follow.
epsilon = 0.5  # per-query privacy budget; smaller values mean stronger privacy
sensitivity = 1  # adding or removing one customer changes a count by exactly 1

# Running total of the privacy budget spent across all queries.
total_epsilon_consumed = 0.0

print(
    "=" * 60,
    "ACTIVITY 1: BASIC PRIVACY BUDGET IMPLEMENTATION",
    "=" * 60,
    sep="\n",
)
# The trailing empty argument reproduces the blank separator line.
print(
    f"Initial Privacy Budget (ε): {epsilon}",
    f"Query Sensitivity: {sensitivity}",
    "",
    sep="\n",
)
# Differentially private count built on OpenDP's Laplace measurement.
def private_count(data, epsilon, sensitivity):
    """Return a differentially private count of ``data`` via the Laplace mechanism.

    The true count is ``len(data)``. Noise is drawn by an OpenDP Laplace
    measurement defined over an integer atom domain with the absolute
    distance metric, using scale ``b = sensitivity / epsilon``.

    Parameters:
    - data: DataFrame or array to count (anything supporting ``len``)
    - epsilon: Privacy budget parameter; must be > 0 (smaller = more privacy)
    - sensitivity: Query sensitivity; must be >= 0 (for a count query = 1)

    Returns:
    - Noisy count, clamped so it is never negative

    Raises:
    - ValueError: if ``epsilon`` is not strictly positive (including NaN) or
      ``sensitivity`` is negative (or NaN)
    """
    # Fail fast with a clear message instead of a ZeroDivisionError (epsilon
    # of 0) or an opaque library error (negative scale). The negated
    # comparisons also reject NaN, which compares False to everything.
    if not epsilon > 0:
        raise ValueError(f"epsilon must be a positive number, got {epsilon!r}")
    if not sensitivity >= 0:
        raise ValueError(
            f"sensitivity must be a non-negative number, got {sensitivity!r}"
        )

    true_count = len(data)

    # Laplace scale b = Δf / ε.
    scale = sensitivity / epsilon

    # Input space for the measurement: an integer scalar under absolute distance.
    input_space = dp.atom_domain(T=int), dp.absolute_distance(T=int)
    laplace_mechanism = input_space >> dp.m.then_laplace(scale=scale)

    noisy_count = laplace_mechanism(true_count)

    # A count cannot be negative; clamping only post-processes the released
    # value and never consults the raw data again.
    return max(0, noisy_count)
# Run the private count over the entire customer table and report how far
# the noisy answer landed from the exact one.
true_customer_count = len(customers)
private_customer_count = private_count(customers, epsilon, sensitivity)
print(
    f"True count (all customers): {true_customer_count}",
    f"Private count (with noise): {private_customer_count}",
    f"Noise added: {private_customer_count - true_customer_count}",
    f"Privacy budget consumed: {epsilon}",
    "",  # blank separator line
    sep="\n",
)

# Charge this query against the running privacy budget.
total_epsilon_consumed = total_epsilon_consumed + epsilon
# =============================================================================
# PRACTICE CHALLENGE 1: AGE RANGE QUERY (25-35 years)
# =============================================================================
print(
    "=" * 60,
    "PRACTICE CHALLENGE 1: AGE RANGE QUERY (25-35)",
    "=" * 60,
    sep="\n",
)

# Keep only customers aged 25 through 35; `between` includes both endpoints.
age_filtered = customers[customers['age'].between(25, 35)]

# Exact count for the age range, then the noisy count at the same epsilon.
true_age_count = len(age_filtered)
private_age_count = private_count(age_filtered, epsilon, sensitivity)

# Laplace scale, reported for transparency only.
noise_scale = sensitivity / epsilon

print(
    f"Age range filter: 25-35 years",
    f"True count (ages 25-35): {true_age_count}",
    f"Private count (with noise): {private_age_count}",
    f"Noise added: {private_age_count - true_age_count}",
    f"Noise scale (b = Δf/ε): {noise_scale:.2f}",
    f"Privacy budget consumed for this query: {epsilon}",
    "",  # blank separator line
    sep="\n",
)

# Charge this query against the running privacy budget.
total_epsilon_consumed = total_epsilon_consumed + epsilon
# =============================================================================
# ADDITIONAL ANALYSIS: MULTIPLE AGE SEGMENTS
# =============================================================================
print("=" * 60, "BONUS: MULTIPLE AGE SEGMENT ANALYSIS", "=" * 60, sep="\n")

# Marketing age segments as (label, minimum age, maximum age); both age
# bounds are treated as inclusive by the filter that consumes them.
age_segments = [
    ("Young Adults (25-30)", 25, 30),
    ("Mid-Career (31-40)", 31, 40),
    ("Mature (41-50)", 41, 50),
]

# Each segment query gets a smaller epsilon because several queries are run.
epsilon_per_segment = 0.3

print(f"Privacy budget per segment: {epsilon_per_segment}", end="\n\n")
# One private count per age segment; every iteration spends
# `epsilon_per_segment` from the running budget.
for segment_name, lower, upper in age_segments:
    # Rows whose age falls inside the segment (both bounds inclusive).
    in_segment = customers['age'].between(lower, upper)
    segment_data = customers[in_segment]

    # Exact count first, then the noisy count released for this segment.
    true_segment_count = len(segment_data)
    private_segment_count = private_count(
        segment_data, epsilon_per_segment, sensitivity
    )

    print(
        f"{segment_name}:",
        f" True count: {true_segment_count}",
        f" Private count: {private_segment_count}",
        f" Budget consumed: {epsilon_per_segment}",
        "",  # blank separator line
        sep="\n",
    )

    # Charge this segment's query against the running privacy budget.
    total_epsilon_consumed = total_epsilon_consumed + epsilon_per_segment
# =============================================================================
# PRIVACY BUDGET SUMMARY
# =============================================================================
print("=" * 60, "PRIVACY BUDGET SUMMARY", "=" * 60, sep="\n")

# Mean epsilon over the five queries run above (full dataset, the 25-35 age
# range, and the three age segments).
average_epsilon = total_epsilon_consumed / 5

print(f"Total privacy budget consumed (ε): {total_epsilon_consumed:.2f}")
print(f"Number of queries executed: 5")
print(f"Average ε per query: {average_epsilon:.2f}")
print()

# Closing takeaways, printed one per line and followed by a final rule.
print(
    "Key Insights:",
    "- Lower ε values provide stronger privacy but more noise",
    "- Privacy budget is additive across queries",
    "- Must carefully allocate budget for multiple analyses",
    "- Trade-off between privacy protection and analytical utility",
    "=" * 60,
    sep="\n",
)
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment