Tom Adamczewski tadamcz

## README.md

      
              4 files
            
          
              0 forks
            
          
                0 comments
              
            
              0 stars
            
          
                tadamcz
                / README.md
            
            
              Created
              December 9, 2025 18:58
            
              
                Epoch AI implementation of SimpleQA Verified
              
          
Taken from inspect_evals @ `ef181cdc95dc2d92b0f9dff8b6c0756b42128b78`, with the following modifications:

- Don't pass in temperatures or grader models (we have our own defaults/policies).
- Use tool calling for more structured output in the scorer (theirs was extremely brittle).
- Only calculate the % correct over all questions (it's the only metric we can compare with other benchmarks).


## __init__.py
"""
Based on [1] by @max-kaufmann as a starting point, with substantial modifications; not intended to
be a drop-in alternative.

[1] https://github.com/UKGovernmentBEIS/inspect_evals/tree/b30a1aab73217e035d5aa22fd0526c70650e4b3e/src/inspect_evals/swe_bench
"""

import json
import logging
import platform

## Dockerfile
FROM python:3.10-bookworm

# Install system dependencies required for gmpy2
RUN apt-get update && apt-get install -y \
    libmpfr-dev \
    libmpc-dev \
    libgmp-dev \
    && rm -rf /var/lib/apt/lists/*

COPY agent-python-requirements.txt .

## __init__.py
from typing import Dict, Any

from inspect_ai import Task, task, Epochs
from inspect_ai.dataset import Sample, hf_dataset
from inspect_ai.scorer import scorer, Score, Target, CORRECT, INCORRECT, accuracy, stderr
from inspect_ai.solver import generate, prompt_template, TaskState

from bench.model import default_grader_model

SOLUTION_TEMPLATE = """Please solve this AIME problem step by step. The answer is an integer ranging from

## gpqa.py
import random
from string import ascii_uppercase
from typing import Dict, Any
from inspect_ai import task, Task
from inspect_ai.dataset import Sample
from inspect_ai.dataset._sources.hf import hf_dataset
import inspect_ai.solver
import inspect_ai.scorer


## __init__.py
from typing import Optional, List

from inspect_ai._eval.registry import task
from inspect_ai._eval.task import Task
from inspect_ai.dataset._sources.hf import hf_dataset

from bench.model import DEFAULT_GRADER_MODEL
from bench.task.hendrycks_math.dataset import filter_dataset, record_to_sample
from bench.task.hendrycks_math.scorer import (
    normalized_string_match,

## jekyll_picture_tag_hook.rb
# Jekyll plugin to replace Markdown image syntax with {% picture %} tag for crafting responsive images
# Adapted from https://gist.github.com/mmistakes/77c68fbb07731a456805a7b473f47841
# Use as a gem: https://github.com/tadamcz/jekyll-markdown-responsive-image


Jekyll::Hooks.register [:pages, :documents], :pre_render do |post, payload|
  file_ext = post.extname.tr('.', '')

  # This regex will match all of the following correctly:
  #

## diffocop.sh
# Run `rubocop -A` (auto-correct) on only the Ruby files that differ from origin/master.
#
# --diff-filter=ACMRTUXB keeps every change type except deletions, so paths that
#   no longer exist are never handed to rubocop.
# -z together with `xargs -0` passes paths NUL-delimited, so file names containing
#   spaces or other unusual characters survive intact (without -z git may also
#   quote such paths).
# The '*.rb' pathspec lets git do the extension filtering; in a pathspec `*` also
#   matches `/`, so Ruby files in any directory are included.
# `xargs -r` skips the run entirely when no Ruby file changed. GNU xargs would
#   otherwise invoke rubocop once with no paths, auto-correcting the whole project.
#   (BSD xargs already skips empty input and accepts -r for compatibility.)
git diff origin/master --name-only --diff-filter=ACMRTUXB -z -- '*.rb' \
  | xargs -0 -r bundle exec rubocop -A --force-exclusion
	"""
	Based on [1] by @max-kaufmann as a starting point, with substantial modifications; not intended to
	be a drop-in alternative.

	[1] https://github.com/UKGovernmentBEIS/inspect_evals/tree/b30a1aab73217e035d5aa22fd0526c70650e4b3e/src/inspect_evals/swe_bench
	"""

	import json
	import logging
	import platform
	FROM python:3.10-bookworm

	# Install system dependencies required for gmpy2
	RUN apt-get update && apt-get install -y \
	libmpfr-dev \
	libmpc-dev \
	libgmp-dev \
	&& rm -rf /var/lib/apt/lists/*

	COPY agent-python-requirements.txt .
	from typing import Dict, Any

	from inspect_ai import Task, task, Epochs
	from inspect_ai.dataset import Sample, hf_dataset
	from inspect_ai.scorer import scorer, Score, Target, CORRECT, INCORRECT, accuracy, stderr
	from inspect_ai.solver import generate, prompt_template, TaskState

	from bench.model import default_grader_model

	SOLUTION_TEMPLATE = """Please solve this AIME problem step by step. The answer is an integer ranging from
	import random
	from string import ascii_uppercase
	from typing import Dict, Any
	from inspect_ai import task, Task
	from inspect_ai.dataset import Sample
	from inspect_ai.dataset._sources.hf import hf_dataset
	import inspect_ai.solver
	import inspect_ai.scorer
	from typing import Optional, List

	from inspect_ai._eval.registry import task
	from inspect_ai._eval.task import Task
	from inspect_ai.dataset._sources.hf import hf_dataset

	from bench.model import DEFAULT_GRADER_MODEL
	from bench.task.hendrycks_math.dataset import filter_dataset, record_to_sample
	from bench.task.hendrycks_math.scorer import (
	normalized_string_match,
	# Jekyll plugin to replace Markdown image syntax with {% picture %} tag for crafting responsive images
	# Adapted from https://gist.github.com/mmistakes/77c68fbb07731a456805a7b473f47841
	# Use as a gem: https://github.com/tadamcz/jekyll-markdown-responsive-image


	Jekyll::Hooks.register [:pages, :documents], :pre_render do |post, payload|
	file_ext = post.extname.tr('.', '')

	# This regex will match all of the following correctly:
	#