# /// script
# dependencies = [
#     "openai-agents",
#     "openai",
#     "numpy",
# ]
# ///
import asyncio
import json
from dataclasses import dataclass

import numpy as np
from agents import Agent, ItemHelpers, ModelSettings, RunConfig, Runner, WebSearchTool
from openai import AsyncOpenAI


@dataclass
class TrajectoryStep:
    reasoning: str
    tool_name: str | None
    tool_input: str | None
    tool_output: str | None


@dataclass
class Trajectory:
    traject_id: str
    path: list[TrajectoryStep]  # Agent's reasoning and tool interactions
    final_output: str           # Agent's final answer
    input_query: str            # User's original request


async def agentic_loop(
    agent: Agent,
    input_query: str,
    temperature: float,
    *,
    run_id: str,
) -> Trajectory:
    """
    Run the agent once on the query and record its reasoning and tool interactions
    as a Trajectory.

    The trace is rebuilt from ``result.new_items``; the item types checked below
    ("message_output_item", "tool_call_item", "tool_call_output_item") follow the
    openai-agents run-item conventions and may need adjusting across SDK versions.
    """
    run_config = RunConfig(model_settings=ModelSettings(temperature=temperature))
    result = await Runner.run(agent, input_query, run_config=run_config)

    trace: list[TrajectoryStep] = []
    for item in result.new_items:
        if item.type == "message_output_item":
            # Intermediate assistant messages are recorded as "thought" steps.
            trace.append(TrajectoryStep(
                reasoning=ItemHelpers.text_message_output(item),
                tool_name=None,
                tool_input=None,
                tool_output=None,
            ))
        elif item.type == "tool_call_item":
            raw = item.raw_item  # function or hosted-tool call payload
            trace.append(TrajectoryStep(
                reasoning="",
                tool_name=getattr(raw, "name", type(raw).__name__),
                tool_input=getattr(raw, "arguments", None),
                tool_output=None,
            ))
        elif item.type == "tool_call_output_item":
            # Attach the output to the most recent tool call, if any.
            if trace and trace[-1].tool_name is not None:
                trace[-1].tool_output = str(item.output)

    return Trajectory(
        traject_id=run_id,
        path=trace,
        final_output=str(result.final_output),
        input_query=input_query,
    )


async def generate_trajectory_group(
    agent: Agent,
    user_query: str,
    N: int,
) -> list[Trajectory]:
    """
    Generate one greedy anchor trajectory (temperature=0) and N-1 higher-temperature
    samples for diversity.
    """
    anchor: Trajectory = await agentic_loop(agent, user_query, temperature=0.0, run_id="anchor")
    coros = [
        agentic_loop(
            agent,
            user_query,
            temperature=0.8,  # higher-temperature sampling for diverse rollouts
            run_id=f"sample_{i}",
        )
        for i in range(1, N)
    ]
    high_entropy_samples: list[Trajectory] = await asyncio.gather(*coros)
    return [anchor, *high_entropy_samples]


def build_judge_prompt(
    user_query: str,
    path_a: str, answer_a: str,
    path_b: str, answer_b: str,
) -> str:
    # Judge rubric follows the structure from the paper; the materials are spliced
    # into section I and the literal JSON braces in section V are escaped for the f-string.
    return f"""
You are a rigorously minded “Comprehensive LLM Writing Evaluator” versed in writing-assessment methodology. Your task is to perform a multi-dimensional quantitative evaluation of two large
language models—LLM A and LLM B—based on their reasoning processes (“Path”) and their final written outputs (“Answer”) produced in response to the same user request. Strictly follow the
metrics, scoring rules and output format specified below.
I. Input Format for the Items to Be Evaluated
(When evaluating, refer only to the text provided) ——————————
<USER_QUERY>
{user_query}
</USER_QUERY>
<PATH_A>
{path_a}
</PATH_A>
<PATH_B>
{path_b}
</PATH_B>
<ANSWER_A>
{answer_a}
</ANSWER_A>
<ANSWER_B>
{answer_b}
</ANSWER_B> ——————————
II. Path Evaluation (assess PATH only; ignore ANSWER)
[Dimension Descriptions]
1. Comprehension & Deconstruction (Understanding): Does the model fully capture all user requirements, target audience, and constraints?
2. Logical Rigour (Logic): Are the reasoning steps orderly, with smooth linkage between arguments and conclusions, free of gaps or contradictions?
3. Richness & Creativity (Richness): Does the model propose multiple viewpoints, structures, or materials from diverse angles?
[Scoring Rules]
• Each dimension: 0-10 points (0 = “entirely missing”, 10 = “outstanding”).
• Overall path score (overall_p) = arithmetic mean of the three dimensions, rounded to the nearest integer.
III. Answer Evaluation (assess ANSWER with reference to its PATH)
[Dimension Descriptions]
1. Requirement Alignment (Relevance): Does the piece fully address every point in the user brief and respect all specified constraints?
2. Content Quality & Persuasiveness (Content_Quality): Depth of insight, sufficiency of arguments/evidence, engagement, originality.
3. Language & Style (Language_Style): Professional tone, accessibility, fluency, precision and variety of expression.
4. Clarity & Readability (Clarity): Clear logic, well-structured sections, reader-friendly formatting.
[Scoring Rules]
• Each dimension: 0-10 points (0 = “entirely missing”, 10 = “outstanding”).
• Overall answer score (overall_a) = arithmetic mean of the four dimensions, rounded to the nearest integer.
IV. Combined Score & Winner Determination
1. combined_score = 0.4 × overall_p + 0.6 × overall_a, rounded to one decimal place.
2. If both models obtain the same combined_score, declare “Tie”.
V. Output Format (strictly follow; do not add, remove or reorder fields)
{{
  "analysis": {{
    "path_A": "<80-120 Chinese characters: highlight strengths and weaknesses of A’s path>",
    "path_B": "<80-120 Chinese characters: highlight strengths and weaknesses of B’s path>",
    "answer_A": "<80-120 Chinese characters: highlight strengths and weaknesses of A’s answer>",
    "answer_B": "<80-120 Chinese characters: highlight strengths and weaknesses of B’s answer>"
  }},
  "path_scores": {{
    "LLM_A": {{
      "understanding": <0-10>,
      "logic": <0-10>,
      "richness": <0-10>,
      "overall_p": <0-10>
    }},
    "LLM_B": {{
      "understanding": <0-10>,
      "logic": <0-10>,
      "richness": <0-10>,
      "overall_p": <0-10>
    }}
  }},
  "answer_scores": {{
    "LLM_A": {{
      "relevance": <0-10>,
      "content_quality": <0-10>,
      "language_style": <0-10>,
      "clarity": <0-10>,
      "overall_a": <0-10>
    }},
    "LLM_B": {{
      "relevance": <0-10>,
      "content_quality": <0-10>,
      "language_style": <0-10>,
      "clarity": <0-10>,
      "overall_a": <0-10>
    }}
  }},
  "combined_scores": {{
    "LLM_A": <0-10>,
    "LLM_B": <0-10>
  }},
  "winner": "<LLM_A | LLM_B | Tie>"
}}
VI. Critical Requirements
1. Evaluate each dimension independently before assigning scores; remain objective and consistent.
2. Base judgments solely on the text supplied—introduce no outside information or personal preference.
3. If the answer output repeatedly contains duplicate content or includes reasoning traces wrapped in <think> tags, the answer_scores should be severely penalized.
4. Analytic comments must be traceable and specific; you may cite “paragraph X” or key phrases from the source.
5. Output must be valid JSON so that downstream programs can parse it.
"""


async def arena_judge(
    traj_a: Trajectory,
    traj_b: Trajectory,
    llm_client: AsyncOpenAI,
    model: str = "gpt-4o",  # any JSON-mode-capable chat model
) -> tuple[float, float]:
    """
    Calls the LLM judge and returns (score_a, score_b) as floats for ArenaRL.
    """
    def render_path(traj: Trajectory) -> str:
        # Flatten reasoning and tool interactions into the judge's PATH block.
        parts = []
        for step in traj.path:
            if step.reasoning:
                parts.append(step.reasoning)
            if step.tool_name:
                parts.append(f"[tool call] {step.tool_name}({step.tool_input}) -> {step.tool_output}")
        return "\n".join(parts)

    prompt = build_judge_prompt(
        traj_a.input_query,
        path_a=render_path(traj_a),
        answer_a=traj_a.final_output,
        path_b=render_path(traj_b),
        answer_b=traj_b.final_output,
    )
    response = await llm_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0,  # always a deterministic judge
        response_format={"type": "json_object"},
    )
    j = json.loads(response.choices[0].message.content)
    score_a = float(j["combined_scores"]["LLM_A"])
    score_b = float(j["combined_scores"]["LLM_B"])
    return score_a, score_b


async def arena_advantage_scoring(traj_group: list[Trajectory], arena_judge) -> list[float]:
    """
    traj_group: [anchor, traj_1, ..., traj_{N-1}]
    arena_judge: async function (traj_a, traj_b) -> (score_a, score_b)
    Returns: standardized advantages A_i, one per trajectory, in traj_group order.
    """
    N = len(traj_group)
    anchor = traj_group[0]
    others = traj_group[1:]

    # Phase 1: Anchor-Based Seeding
    # Score each candidate against the greedy anchor to obtain seed scores.
    S_init = {}
    anchor_scores = []
    for traj in others:
        s_i, s_anchor = await arena_judge(traj, anchor)
        S_init[traj.traject_id] = s_i
        anchor_scores.append(s_anchor)
    # The anchor is seeded with the average of the scores it received.
    S_init[anchor.traject_id] = np.mean(anchor_scores)

    # Sort descending by seed score to obtain the seeding order P.
    P = sorted(traj_group, key=lambda t: S_init[t.traject_id], reverse=True)

    # Accumulated scores across all judged matches.
    V = {t.traject_id: S_init[t.traject_id] for t in traj_group}
    tiers = []

    # Phase 2: Single-Elimination Tournament
    # Standard bracket seeding: top seeds are spread out so they can only meet late.
    B = [None] * N
    idx_head = 0
    idx_tail = N - 1
    for k in range(1, N // 2 + 1):
        t_high, t_low = P[k - 1], P[N - k]
        if k % 2 == 1:  # odd pair index: fill from the front
            B[idx_head] = t_high
            B[idx_head + 1] = t_low
            idx_head += 2
        else:  # even pair index: fill from the back
            B[idx_tail] = t_high
            B[idx_tail - 1] = t_low
            idx_tail -= 2
    if N % 2 == 1:
        # With odd N the middle seed is never paired above; place it in the remaining slot.
        B[idx_head] = P[N // 2]
    B = [x for x in B if x is not None]

    while len(B) > 1:
        W_round, L_round = [], []
        for k in range(len(B) // 2):
            ta, tb = B[2 * k], B[2 * k + 1]
            sa, sb = await arena_judge(ta, tb)
            V[ta.traject_id] += sa
            V[tb.traject_id] += sb
            if sa > sb:
                W_round.append(ta)
                L_round.append(tb)
            else:
                W_round.append(tb)
                L_round.append(ta)
        if len(B) % 2 == 1:
            # Odd bracket size: the unpaired trajectory gets a bye into the next round.
            W_round.append(B[-1])
        tiers.append(L_round)
        B = W_round
    tiers.append(B)  # champion is the final tier

    # Phase 3: Hierarchical Rank Assignment
    # Walk tiers from champion downwards; break ties inside a tier by accumulated score.
    k = 0
    ranks = {}
    for tier in reversed(tiers):
        tier_sorted = sorted(tier, key=lambda t: V[t.traject_id], reverse=True)
        for t in tier_sorted:
            ranks[t.traject_id] = k
            k += 1

    # Phase 4: Standardized Advantage Calculation
    # Linear rank-to-reward mapping, then z-score standardization across the group.
    rewards = {}
    for t in traj_group:
        rewards[t.traject_id] = 1 - ranks[t.traject_id] / (N - 1)
    μ_r = np.mean(list(rewards.values()))
    σ_r = np.std(list(rewards.values()))
    eps = 1e-8
    A = [(rewards[t.traject_id] - μ_r) / (σ_r + eps) for t in traj_group]
    return A
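

# Quick illustration of the Phase-4 mapping (hypothetical helper, not part of the
# training loop): for N trajectories ranked 0 (champion) .. N-1, rewards fall linearly
# from 1 to 0 and the standardized advantages come out zero-mean; e.g. for N=4 the
# rewards are [1.0, 2/3, 1/3, 0.0].
def _rank_to_advantage_example(N: int = 4) -> list[float]:
    ranks = np.arange(N)           # 0 = champion, N-1 = last place
    rewards = 1 - ranks / (N - 1)  # linear rank-to-reward mapping
    return list((rewards - rewards.mean()) / (rewards.std() + 1e-8))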


INSTRUCTIONS = (
    "You are a research assistant. Given a search term, you search the web for that term and "
    "produce a concise summary of the results. The summary must be 2-3 paragraphs and less than 300 "
    "words. Capture the main points. Write succinctly, no need to have complete sentences or good "
    "grammar. This will be consumed by someone synthesizing a report, so it's vital you capture the "
    "essence and ignore any fluff. Do not include any additional commentary other than the summary "
    "itself."
)

search_agent = Agent(
    name="Search agent",
    model="gpt-5.2",
    instructions=INSTRUCTIONS,
    tools=[WebSearchTool()],
)


async def run_example():
    # Agent with tools, defined above per the openai-agents docs
    agent = search_agent
    user_query = "Write an essay on the evolution of programming languages."
    N = 8

    # Generation: one greedy anchor plus N-1 diverse rollouts
    trajectories = await generate_trajectory_group(agent, user_query, N)

    # LLM judge client
    llm_client = AsyncOpenAI(api_key="YOUR_KEY", base_url="https://api.openai.com/v1")

    # Arena scoring function that wraps the async judge
    async def arena_judge_wrapper(t1, t2):
        return await arena_judge(t1, t2, llm_client)

    advantages = await arena_advantage_scoring(trajectories, arena_judge_wrapper)
    print("Standardized Advantages:", advantages)


if __name__ == "__main__":
    asyncio.run(run_example())