efficient and reliable sampling methodologies for pass@k and pass^k evaluation
import math
import numpy as np
from scipy.stats import beta
from collections import defaultdict
# ==============================================================================
# 1. Metric Estimation Functions
# ==============================================================================
def pass_at_k_unbiased_estimator(n, c, k):
    """
    Computes the unbiased estimator for pass@k.

    Args:
        n (int): Total number of solutions generated.
        c (int): Number of correct solutions among n.
        k (int): The k in pass@k.

    Returns:
        float: The pass@k estimate.
    """
    if n < k:
        raise ValueError("n must be greater than or equal to k")
    if n - c < k:
        # Trivial case: not enough failures to pick k failures.
        return 1.0
    # The estimator is 1 - P(all k sampled solutions fail).
    # The probability of all k failing is comb(n - c, k) / comb(n, k);
    # the ratio is taken in log space so very large binomial coefficients
    # do not overflow a float.
    log_comb_n_k = math.log(math.comb(n, k))
    log_comb_n_minus_c_k = math.log(math.comb(n - c, k))
    return 1.0 - math.exp(log_comb_n_minus_c_k - log_comb_n_k)
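# Sanity check with assumed numbers (not part of the original gist): with
# n = 10 samples, c = 3 correct, and k = 2, the estimator gives
# 1 - C(7, 2) / C(10, 2) = 1 - 21 / 45 ≈ 0.5333.
assert abs(pass_at_k_unbiased_estimator(10, 3, 2) - (1 - 21 / 45)) < 1e-12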
def pass_power_k_estimator(n, c, k):
    """
    Computes the pass^k estimator.

    Args:
        n (int): Total number of solutions generated.
        c (int): Number of correct solutions among n.
        k (int): The k in pass^k.

    Returns:
        float: The pass^k estimate.
    """
    if n == 0:
        return 0.0
    # The probability of a single success is estimated as c / n.
    # For k independent attempts, the probability of all k succeeding is (c / n)^k.
    p = c / n
    return p ** k
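# Optional, hedged alternative (not in the original gist): the plug-in (c / n)^k
# above is biased upward for k >= 2, since E[(c / n)^k] >= p^k by Jensen's
# inequality. If the n samples are independent Bernoulli(p) trials, the ratio of
# binomial coefficients C(c, k) / C(n, k) (a falling-factorial ratio) is an
# unbiased estimator of p^k. A minimal sketch:
def pass_power_k_unbiased_estimator(n, c, k):
    """Unbiased estimate of p^k from c successes in n Bernoulli trials (sketch)."""
    if n < k:
        raise ValueError("n must be greater than or equal to k")
    if c < k:
        # Fewer successes than k: the falling-factorial product is zero.
        return 0.0
    return math.comb(c, k) / math.comb(n, k)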
# ==============================================================================
# 2. Beta-Binomial Dynamic Sampling
# ==============================================================================
def get_beta_binomial_params(n, c):
    """
    Computes the parameters (alpha, beta) of the posterior beta distribution
    for the problem's success probability, given n trials and c successes.

    We use a uniform Beta(1, 1) as the prior.

    Args:
        n (int): Number of trials.
        c (int): Number of successes.

    Returns:
        tuple: A tuple (alpha, beta) for the posterior beta distribution.
    """
    return c + 1, n - c + 1
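# Illustrative example with assumed numbers (not from the original gist):
# starting from the uniform Beta(1, 1) prior, observing c = 6 successes in
# n = 10 trials yields the posterior Beta(7, 5), whose mean is 7 / 12 ≈ 0.583.
assert get_beta_binomial_params(10, 6) == (7, 5)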
def dynamic_sampling(problem, generator_func, max_n=500, tolerance=0.01):
    """
    Dynamically samples solutions for a single problem using a beta-binomial model.

    The process stops when the posterior standard deviation of the underlying
    success probability is sufficiently small, or when max_n samples have been
    drawn. This follows the spirit of Kulal et al. (2019) and other Bayesian
    evaluation methods.

    Args:
        problem (Problem): The problem object (e.g., prompt, tests).
        generator_func (function): A function that takes a problem and returns
            a generated solution.
        max_n (int): The maximum number of samples to generate.
        tolerance (float): The desired standard deviation of the posterior.
            A smaller value means more samples and higher confidence.

    Returns:
        tuple: A tuple (n, c), where n is the total number of samples and
            c is the number of correct ones.
    """
    c = 0
    # First, run a small pilot study to get an initial estimate.
    pilot_n = min(10, max_n)
    solutions = [generator_func(problem) for _ in range(pilot_n)]
    c += sum(1 for sol in solutions if is_correct(sol, problem))
    n = pilot_n
    while n < max_n:
        alpha, beta_param = get_beta_binomial_params(n, c)
        # Standard deviation of the posterior Beta(alpha, beta) distribution.
        posterior_std = math.sqrt(
            (alpha * beta_param) / ((alpha + beta_param) ** 2 * (alpha + beta_param + 1))
        )
        if posterior_std < tolerance:
            break
        # Otherwise, sample one more solution and update the counts.
        n += 1
        solution = generator_func(problem)
        if is_correct(solution, problem):
            c += 1
    return n, c
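# Optional companion utility (an assumption, not part of the original gist):
# the same Beta posterior that drives the stopping rule can also report an
# equal-tailed credible interval for the per-problem success probability,
# using the scipy.stats.beta distribution imported at the top of the file.
def success_probability_credible_interval(n, c, confidence=0.95):
    """Equal-tailed credible interval for the success probability (sketch)."""
    alpha, beta_param = get_beta_binomial_params(n, c)
    lower, upper = beta.interval(confidence, alpha, beta_param)
    return lower, upper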
# ==============================================================================
# 3. Main Evaluation Function and Helper Classes
# ==============================================================================
class Problem:
    def __init__(self, problem_id, prompt, tests):
        self.problem_id = problem_id
        self.prompt = prompt
        self.tests = tests
def is_correct(solution, problem):
    """
    A placeholder function for running tests against a generated solution.
    In a real-world scenario, this would involve a sandboxed execution environment.

    Args:
        solution (str): The generated code solution.
        problem (Problem): The problem object containing test cases.

    Returns:
        bool: True if the solution passes all tests, False otherwise.
    """
    # This is a mock implementation: it only checks that the solution is
    # reasonably long and contains a return statement. A real check would run
    # problem.tests in a robust, sandboxed execution environment.
    try:
        return len(solution) > 50 and 'return' in solution
    except Exception:
        return False
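# Hedged sketch (an assumption, not the original gist's harness): one common
# approximation of the "sandboxed execution" mentioned above is to run the
# candidate solution plus its test code in a separate Python subprocess with a
# hard timeout. This bounds runtime only; it is NOT real isolation of the
# filesystem or network, and it assumes problem.tests is a list of executable
# assertion strings.
def run_tests_in_subprocess(solution, tests, timeout_seconds=5):
    """Return True if the solution plus concatenated test code exits cleanly (sketch)."""
    import os
    import subprocess
    import sys
    import tempfile

    program = solution + "\n\n" + "\n".join(tests)
    with tempfile.NamedTemporaryFile('w', suffix='.py', delete=False) as handle:
        handle.write(program)
        path = handle.name
    try:
        completed = subprocess.run(
            [sys.executable, path],
            capture_output=True,
            timeout=timeout_seconds,
        )
        return completed.returncode == 0
    except subprocess.TimeoutExpired:
        return False
    finally:
        os.remove(path)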
def evaluate_experiment(problems, generation_func, k_values, dynamic_sampling_enabled=True):
    """
    Executes the evaluation experiment and computes pass@k and pass^k.

    Args:
        problems (list): A list of Problem objects.
        generation_func (function): A function to generate solutions.
        k_values (list): A list of k values to compute metrics for.
        dynamic_sampling_enabled (bool): Whether to use dynamic sampling or a fixed n.

    Returns:
        dict: A dictionary containing the computed pass@k and pass^k results.
    """
    results_per_problem = defaultdict(dict)
    for problem in problems:
        if dynamic_sampling_enabled:
            n, c = dynamic_sampling(problem, generation_func)
        else:
            # If not dynamic, use a fixed n: at least the largest k, and at least 200.
            max_k = max(k_values)
            fixed_n = max(max_k, 200)
            solutions = [generation_func(problem) for _ in range(fixed_n)]
            c = sum(1 for sol in solutions if is_correct(sol, problem))
            n = fixed_n
        for k in k_values:
            if k > n:
                print(f"Warning: k={k} > n={n} for problem {problem.problem_id}. Skipping pass@k.")
                continue
            pass_at_k = pass_at_k_unbiased_estimator(n, c, k)
            pass_power_k = pass_power_k_estimator(n, c, k)
            results_per_problem[problem.problem_id][f'pass@{k}'] = pass_at_k
            results_per_problem[problem.problem_id][f'pass^{k}'] = pass_power_k
        results_per_problem[problem.problem_id]['n'] = n
        results_per_problem[problem.problem_id]['c'] = c
    # Aggregate per-problem metrics into dataset-level averages.
    final_metrics = defaultdict(list)
    for problem_id, metrics in results_per_problem.items():
        for metric, value in metrics.items():
            if metric.startswith('pass'):
                final_metrics[metric].append(value)
    average_metrics = {metric: np.mean(values) for metric, values in final_metrics.items()}
    return {
        'problem_level_results': results_per_problem,
        'average_results': average_metrics,
    }
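# Optional add-on (an assumption, not in the original gist): dataset-level
# averages are themselves noisy when the problem set is small. A simple
# percentile bootstrap over problems attaches a confidence interval to each
# averaged metric returned by evaluate_experiment.
def bootstrap_metric_interval(values, n_resamples=2000, confidence=0.95, seed=0):
    """Percentile bootstrap interval for the mean of per-problem metric values (sketch)."""
    rng = np.random.default_rng(seed)
    values = np.asarray(values, dtype=float)
    means = np.array([
        rng.choice(values, size=len(values), replace=True).mean()
        for _ in range(n_resamples)
    ])
    lower = np.percentile(means, (1 - confidence) / 2 * 100)
    upper = np.percentile(means, (1 + confidence) / 2 * 100)
    return lower, upper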
# ==============================================================================
# 4. Example Usage
# ==============================================================================
if __name__ == '__main__':
    # Define a set of mock problems.
    mock_problems = [
        Problem(problem_id='prob_001', prompt='Write a function that adds two numbers.', tests=[]),
        Problem(problem_id='prob_002', prompt='Write a function that reverses a string.', tests=[]),
        # Add more problems as needed.
    ]

    # Define a mock generation function that simulates a model which is correct
    # about 60% of the time. The "correct" branch emits a solution that the
    # mock is_correct check accepts (long enough and containing a return
    # statement); the "wrong" branch emits one it rejects.
    def mock_generator(problem):
        if np.random.rand() < 0.6:
            return (
                f"def solution_for_{problem.problem_id}(*args, **kwargs):\n"
                "    # simulated correct solution\n"
                "    return sum(args)"
            )
        else:
            return "def broken_solution(*args):\n    pass  # no result produced"
    # Define the k values to evaluate.
    k_values_to_test = [1, 10, 50, 100]

    # Run the experiment with dynamic sampling.
    print("Running experiment with dynamic beta-binomial sampling...")
    dynamic_results = evaluate_experiment(
        problems=mock_problems,
        generation_func=mock_generator,
        k_values=k_values_to_test,
        dynamic_sampling_enabled=True,
    )
    print("\n--- Dynamic Sampling Results ---")
    print("Average Metrics:", dynamic_results['average_results'])
    # Optional: print detailed results per problem.
    # for p_id, res in dynamic_results['problem_level_results'].items():
    #     print(f"Problem {p_id}: n={res['n']}, c={res['c']}, Results: {res}")

    # Run the experiment with a fixed n.
    print("\nRunning experiment with fixed n=200...")
    fixed_results = evaluate_experiment(
        problems=mock_problems,
        generation_func=mock_generator,
        k_values=k_values_to_test,
        dynamic_sampling_enabled=False,
    )
    print("\n--- Fixed Sampling Results (n=200) ---")
    print("Average Metrics:", fixed_results['average_results'])
    # Optional: print detailed results per problem.
    # for p_id, res in fixed_results['problem_level_results'].items():
    #     print(f"Problem {p_id}: n={res['n']}, c={res['c']}, Results: {res}")