efficient and reliable sampling methodologies for pass@k and pass^k evaluation
import math
import numpy as np
from scipy.stats import beta
from collections import defaultdict
# ==============================================================================
# 1. Metric Estimation Functions
# ==============================================================================
def pass_at_k_unbiased_estimator(n, c, k):
    """
    Computes the unbiased estimator for pass@k.

    Args:
        n (int): Total number of solutions generated.
        c (int): Number of correct solutions among n.
        k (int): The k in pass@k.

    Returns:
        float: The pass@k estimate.
    """
    if n < k:
        raise ValueError("n must be greater than or equal to k")
    if n - c < k:
        # Trivial case: not enough failures to pick k failures.
        return 1.0
    # The estimator is 1 - P(all k sampled solutions fail).
    # The probability of all k failing is comb(n - c, k) / comb(n, k);
    # the ratio is taken in log space so very large binomial coefficients
    # do not overflow a float.
    log_comb_n_k = math.log(math.comb(n, k))
    log_comb_n_minus_c_k = math.log(math.comb(n - c, k))
    return 1.0 - math.exp(log_comb_n_minus_c_k - log_comb_n_k)
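# Sanity check with assumed numbers (not part of the original gist): with
# n = 10 samples, c = 3 correct, and k = 2, the estimator gives
# 1 - C(7, 2) / C(10, 2) = 1 - 21 / 45 ≈ 0.5333.
assert abs(pass_at_k_unbiased_estimator(10, 3, 2) - (1 - 21 / 45)) < 1e-12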
def pass_power_k_estimator(n, c, k):
    """
    Computes the pass^k estimator.

    Args:
        n (int): Total number of solutions generated.
        c (int): Number of correct solutions among n.
        k (int): The k in pass^k.

    Returns:
        float: The pass^k estimate.
    """
    if n == 0:
        return 0.0
    # The probability of a single success is estimated as c / n.
    # For k independent attempts, the probability of all k succeeding is (c / n)^k.
    p = c / n
    return p ** k
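# Optional, hedged alternative (not in the original gist): the plug-in (c / n)^k
# above is biased upward for k >= 2, since E[(c / n)^k] >= p^k by Jensen's
# inequality. If the n samples are independent Bernoulli(p) trials, the ratio of
# binomial coefficients C(c, k) / C(n, k) (a falling-factorial ratio) is an
# unbiased estimator of p^k. A minimal sketch:
def pass_power_k_unbiased_estimator(n, c, k):
    """Unbiased estimate of p^k from c successes in n Bernoulli trials (sketch)."""
    if n < k:
        raise ValueError("n must be greater than or equal to k")
    if c < k:
        # Fewer successes than k: the falling-factorial product is zero.
        return 0.0
    return math.comb(c, k) / math.comb(n, k)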
# ==============================================================================
# 2. Beta-Binomial Dynamic Sampling
# ==============================================================================
def get_beta_binomial_params(n, c):
    """
    Computes the parameters (alpha, beta) of the posterior beta distribution
    for the problem's success probability, given n trials and c successes.

    We use a uniform Beta(1, 1) as the prior.

    Args:
        n (int): Number of trials.
        c (int): Number of successes.

    Returns:
        tuple: A tuple (alpha, beta) for the posterior beta distribution.
    """
    return c + 1, n - c + 1
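# Illustrative example with assumed numbers (not from the original gist):
# starting from the uniform Beta(1, 1) prior, observing c = 6 successes in
# n = 10 trials yields the posterior Beta(7, 5), whose mean is 7 / 12 ≈ 0.583.
assert get_beta_binomial_params(10, 6) == (7, 5)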
def dynamic_sampling(problem, generator_func, max_n=500, tolerance=0.01):
    """
    Dynamically samples solutions for a single problem using a beta-binomial model.

    The process stops when the posterior standard deviation of the underlying
    success probability is sufficiently small, or when max_n samples have been
    drawn. This follows the spirit of Kulal et al. (2019) and other Bayesian
    evaluation methods.

    Args:
        problem (Problem): The problem object (e.g., prompt, tests).
        generator_func (function): A function that takes a problem and returns
            a generated solution.
        max_n (int): The maximum number of samples to generate.
        tolerance (float): The desired standard deviation of the posterior.
            A smaller value means more samples and higher confidence.

    Returns:
        tuple: A tuple (n, c), where n is the total number of samples and
            c is the number of correct ones.
    """
    c = 0
    # First, run a small pilot study to get an initial estimate.
    pilot_n = min(10, max_n)
    solutions = [generator_func(problem) for _ in range(pilot_n)]
    c += sum(1 for sol in solutions if is_correct(sol, problem))
    n = pilot_n
    while n < max_n:
        alpha, beta_param = get_beta_binomial_params(n, c)
        # Standard deviation of the posterior Beta(alpha, beta) distribution.
        posterior_std = math.sqrt(
            (alpha * beta_param) / ((alpha + beta_param) ** 2 * (alpha + beta_param + 1))
        )
        if posterior_std < tolerance:
            break
        # Otherwise, sample one more solution and update the counts.
        n += 1
        solution = generator_func(problem)
        if is_correct(solution, problem):
            c += 1
    return n, c
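# Optional companion utility (an assumption, not part of the original gist):
# the same Beta posterior that drives the stopping rule can also report an
# equal-tailed credible interval for the per-problem success probability,
# using the scipy.stats.beta distribution imported at the top of the file.
def success_probability_credible_interval(n, c, confidence=0.95):
    """Equal-tailed credible interval for the success probability (sketch)."""
    alpha, beta_param = get_beta_binomial_params(n, c)
    lower, upper = beta.interval(confidence, alpha, beta_param)
    return lower, upper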
# ==============================================================================
# 3. Main Evaluation Function and Helper Classes
# ==============================================================================
class Problem:
    def __init__(self, problem_id, prompt, tests):
        self.problem_id = problem_id
        self.prompt = prompt
        self.tests = tests
def is_correct(solution, problem):
    """
    A placeholder function for running tests against a generated solution.
    In a real-world scenario, this would involve a sandboxed execution environment.

    Args:
        solution (str): The generated code solution.
        problem (Problem): The problem object containing test cases.

    Returns:
        bool: True if the solution passes all tests, False otherwise.
    """
    # This is a mock implementation: it only checks that the solution is
    # reasonably long and contains a return statement. A real check would run
    # problem.tests in a robust, sandboxed execution environment.
    try:
        return len(solution) > 50 and 'return' in solution
    except Exception:
        return False
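# Hedged sketch (an assumption, not the original gist's harness): one common
# approximation of the "sandboxed execution" mentioned above is to run the
# candidate solution plus its test code in a separate Python subprocess with a
# hard timeout. This bounds runtime only; it is NOT real isolation of the
# filesystem or network, and it assumes problem.tests is a list of executable
# assertion strings.
def run_tests_in_subprocess(solution, tests, timeout_seconds=5):
    """Return True if the solution plus concatenated test code exits cleanly (sketch)."""
    import os
    import subprocess
    import sys
    import tempfile

    program = solution + "\n\n" + "\n".join(tests)
    with tempfile.NamedTemporaryFile('w', suffix='.py', delete=False) as handle:
        handle.write(program)
        path = handle.name
    try:
        completed = subprocess.run(
            [sys.executable, path],
            capture_output=True,
            timeout=timeout_seconds,
        )
        return completed.returncode == 0
    except subprocess.TimeoutExpired:
        return False
    finally:
        os.remove(path)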
def evaluate_experiment(problems, generation_func, k_values, dynamic_sampling_enabled=True):
    """
    Executes the evaluation experiment and computes pass@k and pass^k.

    Args:
        problems (list): A list of Problem objects.
        generation_func (function): A function to generate solutions.
        k_values (list): A list of k values to compute metrics for.
        dynamic_sampling_enabled (bool): Whether to use dynamic sampling or a fixed n.

    Returns:
        dict: A dictionary containing the computed pass@k and pass^k results.
    """
    results_per_problem = defaultdict(dict)
    for problem in problems:
        if dynamic_sampling_enabled:
            n, c = dynamic_sampling(problem, generation_func)
        else:
            # If not dynamic, use a fixed n: at least the largest k, and at least 200.
            max_k = max(k_values)
            fixed_n = max(max_k, 200)
            solutions = [generation_func(problem) for _ in range(fixed_n)]
            c = sum(1 for sol in solutions if is_correct(sol, problem))
            n = fixed_n
        for k in k_values:
            if k > n:
                print(f"Warning: k={k} > n={n} for problem {problem.problem_id}. Skipping pass@k.")
                continue
            pass_at_k = pass_at_k_unbiased_estimator(n, c, k)
            pass_power_k = pass_power_k_estimator(n, c, k)
            results_per_problem[problem.problem_id][f'pass@{k}'] = pass_at_k
            results_per_problem[problem.problem_id][f'pass^{k}'] = pass_power_k
        results_per_problem[problem.problem_id]['n'] = n
        results_per_problem[problem.problem_id]['c'] = c
    # Aggregate per-problem metrics into dataset-level averages.
    final_metrics = defaultdict(list)
    for problem_id, metrics in results_per_problem.items():
        for metric, value in metrics.items():
            if metric.startswith('pass'):
                final_metrics[metric].append(value)
    average_metrics = {metric: np.mean(values) for metric, values in final_metrics.items()}
    return {
        'problem_level_results': results_per_problem,
        'average_results': average_metrics,
    }
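# Optional add-on (an assumption, not in the original gist): dataset-level
# averages are themselves noisy when the problem set is small. A simple
# percentile bootstrap over problems attaches a confidence interval to each
# averaged metric returned by evaluate_experiment.
def bootstrap_metric_interval(values, n_resamples=2000, confidence=0.95, seed=0):
    """Percentile bootstrap interval for the mean of per-problem metric values (sketch)."""
    rng = np.random.default_rng(seed)
    values = np.asarray(values, dtype=float)
    means = np.array([
        rng.choice(values, size=len(values), replace=True).mean()
        for _ in range(n_resamples)
    ])
    lower = np.percentile(means, (1 - confidence) / 2 * 100)
    upper = np.percentile(means, (1 + confidence) / 2 * 100)
    return lower, upper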
# ==============================================================================
# 4. Example Usage
# ==============================================================================
if __name__ == '__main__':
    # Define a set of mock problems.
    mock_problems = [
        Problem(problem_id='prob_001', prompt='Write a function that adds two numbers.', tests=[]),
        Problem(problem_id='prob_002', prompt='Write a function that reverses a string.', tests=[]),
        # Add more problems as needed.
    ]

    # Define a mock generation function that simulates a model which is correct
    # about 60% of the time. The "correct" branch emits a solution that the
    # mock is_correct check accepts (long enough and containing a return
    # statement); the "wrong" branch emits one it rejects.
    def mock_generator(problem):
        if np.random.rand() < 0.6:
            return (
                f"def solution_for_{problem.problem_id}(*args, **kwargs):\n"
                "    # simulated correct solution\n"
                "    return sum(args)"
            )
        else:
            return "def broken_solution(*args):\n    pass  # no result produced"
    # Define the k values to evaluate.
    k_values_to_test = [1, 10, 50, 100]

    # Run the experiment with dynamic sampling.
    print("Running experiment with dynamic beta-binomial sampling...")
    dynamic_results = evaluate_experiment(
        problems=mock_problems,
        generation_func=mock_generator,
        k_values=k_values_to_test,
        dynamic_sampling_enabled=True,
    )
    print("\n--- Dynamic Sampling Results ---")
    print("Average Metrics:", dynamic_results['average_results'])
    # Optional: print detailed results per problem.
    # for p_id, res in dynamic_results['problem_level_results'].items():
    #     print(f"Problem {p_id}: n={res['n']}, c={res['c']}, Results: {res}")

    # Run the experiment with a fixed n.
    print("\nRunning experiment with fixed n=200...")
    fixed_results = evaluate_experiment(
        problems=mock_problems,
        generation_func=mock_generator,
        k_values=k_values_to_test,
        dynamic_sampling_enabled=False,
    )
    print("\n--- Fixed Sampling Results (n=200) ---")
    print("Average Metrics:", fixed_results['average_results'])
    # Optional: print detailed results per problem.
    # for p_id, res in fixed_results['problem_level_results'].items():
    #     print(f"Problem {p_id}: n={res['n']}, c={res['c']}, Results: {res}")