Last active
January 15, 2026 09:27
-
-
Save DiTo97/4d0ba4106193ad7e343da7f24a0c3ee5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# /// script
# dependencies = [
#     "numpy",
#     "openai",
#     "openai-agents",
# ]
# ///
| import asyncio | |
| import json | |
| from dataclasses import dataclass | |
| from typing import Sequence, AsyncGenerator | |
| import numpy as np | |
| from agents import Agent, Runner, Tool, WebSearchTool | |
| from openai import AsyncOpenAI | |
| @dataclass | |
| class TrajectoryStep: | |
| reasoning: str | |
| tool_name: str | None | |
| tool_input: str | None | |
| tool_output: str | None | |
@dataclass
class Trajectory:
    """A complete agent rollout for one user request."""

    # Identifier of the rollout (e.g. "anchor" or "sample_3").
    traject_id: str
    # Agent's reasoning and tool interactions, in the order they were recorded.
    path: list[TrajectoryStep]
    # Agent's final answer.
    final_output: str
    # User's original request.
    input_query: str
async def agentic_loop(
    agent: Agent,
    input_query: str,
    temperature: float,
    *,
    run_id: str,
) -> Trajectory:
    """Run the agent once and record its reasoning path and final answer.

    The agent is executed a single time so that the recorded path and the
    final answer always come from the same rollout (running it once for the
    trace and a second time for the answer would pair unrelated samples
    whenever ``temperature`` is above zero).

    Args:
        agent: The agent to run.
        input_query: The user's request, passed to the agent as its input.
        temperature: Sampling temperature, applied through the run-level
            model settings so it overrides the agent's own settings.
        run_id: Identifier stored as ``traject_id`` on the result.

    Returns:
        A ``Trajectory`` whose ``path`` lists the thoughts and tool
        interactions that preceded the final answer.
    """
    # Local import: these names live in the `agents` package the file already
    # depends on, but are not part of its top-level import line.
    from agents import ItemHelpers, ModelSettings, RunConfig

    def raw_field(raw, name):
        # Raw response items may be SDK model objects or plain dicts.
        if isinstance(raw, dict):
            return raw.get(name)
        return getattr(raw, name, None)

    def as_text(value):
        # TrajectoryStep fields are declared as `str | None`.
        if value is None or isinstance(value, str):
            return value
        return str(value)

    def thought(text):
        return TrajectoryStep(
            reasoning=text, tool_name=None, tool_input=None, tool_output=None
        )

    result = await Runner.run(
        agent,
        input_query,
        run_config=RunConfig(model_settings=ModelSettings(temperature=temperature)),
    )

    steps: list[TrajectoryStep] = []
    pending_calls: dict[str, TrajectoryStep] = {}  # call_id -> step awaiting output
    last_thought = ""
    for item in result.new_items:
        if item.type == "message_output_item":
            last_thought = ItemHelpers.text_message_output(item)
            steps.append(thought(last_thought))
        elif item.type == "reasoning_item":
            summary = raw_field(item.raw_item, "summary") or []
            text = "\n".join(
                filter(None, (raw_field(part, "text") for part in summary))
            )
            if text:
                last_thought = text
                steps.append(thought(text))
        elif item.type == "tool_call_item":
            raw = item.raw_item
            # Function calls expose `name`/`arguments`; hosted tools (such as
            # web search) only expose their `type` and, possibly, an `action`.
            step = TrajectoryStep(
                reasoning=last_thought,
                tool_name=raw_field(raw, "name") or raw_field(raw, "type"),
                tool_input=as_text(
                    raw_field(raw, "arguments") or raw_field(raw, "action")
                ),
                tool_output=None,
            )
            steps.append(step)
            call_id = raw_field(raw, "call_id")
            if call_id is not None:
                pending_calls[call_id] = step
        elif item.type == "tool_call_output_item":
            # Pair the output with the call that produced it.
            step = pending_calls.pop(raw_field(item.raw_item, "call_id"), None)
            if step is not None:
                step.tool_output = as_text(item.output)

    final_output = result.final_output
    # The closing message *is* the answer; keep it out of the path so the
    # path only describes what led up to it.
    if steps and steps[-1].tool_name is None and steps[-1].reasoning == final_output:
        steps.pop()

    return Trajectory(
        traject_id=run_id,
        path=steps,
        final_output=final_output,
        input_query=input_query,
    )
async def generate_trajectory_group(
    agent: Agent,
    user_query: str,
    N: int,
    *,
    sample_temperature: float = 0.8,
) -> list[Trajectory]:
    """Generate one anchor trajectory plus ``N - 1`` sampled trajectories.

    The anchor is produced at temperature 0 with run id ``"anchor"``; the
    samples use ``sample_temperature`` with run ids ``"sample_1"`` ...
    ``"sample_{N-1}"``. All rollouts are independent, so they are started
    together instead of waiting for the anchor first.

    Args:
        agent: The agent to roll out.
        user_query: The request given to every rollout.
        N: Total group size including the anchor. For ``N <= 1`` only the
            anchor is generated.
        sample_temperature: Temperature for the non-anchor samples. Defaults
            to 0.8. NOTE(review): the previous hard-coded value was 8, which
            is outside the 0-2 range accepted by the OpenAI API; 0.8 is
            assumed to be what was intended -- confirm.

    Returns:
        ``[anchor, sample_1, ..., sample_{N-1}]`` in that order.
    """
    rollouts = [agentic_loop(agent, user_query, temperature=0, run_id="anchor")]
    rollouts.extend(
        agentic_loop(
            agent,
            user_query,
            temperature=sample_temperature,
            run_id=f"sample_{i}",
        )
        for i in range(1, N)
    )
    # gather() returns results in the order the awaitables were passed in,
    # so the anchor stays at index 0.
    return list(await asyncio.gather(*rollouts))
# Static opening of the judge prompt: role description and Section I heading.
# Kept as a plain (non-f) string so no brace escaping is needed.
_JUDGE_PROMPT_PREAMBLE = """
You are a rigorously minded “Comprehensive LLM Writing Evaluator” versed in writing-assessment methodology. Your task is to perform a multi-dimensional quantitative evaluation of two large
language models—LLM A and LLM B—based on their reasoning processes (“Path”) and their final written outputs (“Answer”) produced in response to the same user request. Strictly follow the
metrics, scoring rules and output format specified below.
I. Input Format for the Items to Be Evaluated
(When evaluating, refer only to the text provided) ——————————
"""

# Static rubric of the judge prompt: Sections II-VI, including the literal
# JSON output schema (plain string, so its braces are emitted verbatim).
_JUDGE_PROMPT_RUBRIC = """II. Path Evaluation (assess PATH only; ignore ANSWER)
[Dimension Descriptions]
1. Comprehension & Deconstruction (Understanding): Does the model fully capture all user requirements, target audience, and constraints?
2. Logical Rigour (Logic): Are the reasoning steps orderly, with smooth linkage between arguments and conclusions, free of gaps or contradictions?
3. Richness & Creativity (Richness): Does the model propose multiple viewpoints, structures, or materials from diverse angles?
[Scoring Rules]
• Each dimension: 0-10 points (0 = “entirely missing”, 10 = “outstanding”).
• Overall path score (overall_p) = arithmetic mean of the three dimensions, rounded to the nearest integer.
III. Answer Evaluation (assess ANSWER with reference to its PATH)
[Dimension Descriptions]
1. Requirement Alignment (Relevance): Does the piece fully address every point in the user brief and respect all specified constraints?
2. Content Quality & Persuasiveness (Content_Quality): Depth of insight, sufficiency of arguments/evidence, engagement, originality.
3. Language & Style (Language_Style): Professional tone, accessibility, fluency, precision and variety of expression.
4. Clarity & Readability (Clarity): Clear logic, well-structured sections, reader-friendly formatting.
[Scoring Rules]
• Each dimension: 0-10 points (0 = “entirely missing”, 10 = “outstanding”).
• Overall answer score (overall_a) = arithmetic mean of the four dimensions, rounded to the nearest integer.
IV. Combined Score & Winner Determination
1. combined_score = 0.4 × overall_p + 0.6 × overall_a, rounded to one decimal place.
2. If both models obtain the same combined_score, declare “Tie”.
V. Output Format (strictly follow; do not add, remove or reorder fields)
{
  "analysis": {
    "path_A": "<80-120 Chinese characters: highlight strengths and weaknesses of A’s path>",
    "path_B": "<80-120 Chinese characters: highlight strengths and weaknesses of B’s path>",
    "answer_A": "<80-120 Chinese characters: highlight strengths and weaknesses of A’s answer>",
    "answer_B": "<80-120 Chinese characters: highlight strengths and weaknesses of B’s answer>"
  },
  "path_scores": {
    "LLM_A": {
      "understanding": <0-10>,
      "logic": <0-10>,
      "richness": <0-10>,
      "overall_p": <0-10>
    },
    "LLM_B": {
      "understanding": <0-10>,
      "logic": <0-10>,
      "richness": <0-10>,
      "overall_p": <0-10>
    }
  },
  "answer_scores": {
    "LLM_A": {
      "relevance": <0-10>,
      "content_quality": <0-10>,
      "language_style": <0-10>,
      "clarity": <0-10>,
      "overall_a": <0-10>
    },
    "LLM_B": {
      "relevance": <0-10>,
      "content_quality": <0-10>,
      "language_style": <0-10>,
      "clarity": <0-10>,
      "overall_a": <0-10>
    }
  },
  "combined_scores": {
    "LLM_A": <0-10>,
    "LLM_B": <0-10>
  },
  "winner": "<LLM_A | LLM_B | Tie>"
}
VI. Critical Requirements
1. Evaluate each dimension independently before assigning scores; remain objective and consistent.
2. Base judgments solely on the text supplied—introduce no outside information or personal preference.
3. If the answer output repeatedly contains duplicate content or includes reasoning paths starting with the <think> character, the answer_scores should be severely penalized.
4. Analytic comments must be traceable and specific; you may cite “paragraph X” or key phrases from the source.
5. Output must be valid JSON so that downstream programs can parse it.
"""


def build_judge_prompt(
    user_query: str,
    path_a: str, answer_a: str,
    path_b: str, answer_b: str
) -> str:
    """Build the pairwise judge prompt for two candidate trajectories.

    The five arguments are inserted into the tagged input section
    (``<USER_QUERY>``, ``<PATH_A>``, ``<PATH_B>``, ``<ANSWER_A>``,
    ``<ANSWER_B>``); the surrounding instructions and the JSON output schema
    are fixed text.

    Args:
        user_query: The user's original request.
        path_a: Reasoning path of candidate A.
        answer_a: Final answer of candidate A.
        path_b: Reasoning path of candidate B.
        answer_b: Final answer of candidate B.

    Returns:
        The complete prompt text.
    """
    # Only this section is an f-string. Interpolated values are inserted
    # verbatim, so braces inside the candidates' text are harmless.
    tagged_inputs = (
        f"<USER_QUERY>\n{user_query}\n</USER_QUERY>\n"
        f"<PATH_A>\n{path_a}\n</PATH_A>\n"
        f"<PATH_B>\n{path_b}\n</PATH_B>\n"
        f"<ANSWER_A>\n{answer_a}\n</ANSWER_A>\n"
        f"<ANSWER_B>\n{answer_b}\n</ANSWER_B> ——————————\n"
    )
    return _JUDGE_PROMPT_PREAMBLE + tagged_inputs + _JUDGE_PROMPT_RUBRIC
async def arena_judge(
    traj_a: Trajectory,
    traj_b: Trajectory,
    llm_client: AsyncOpenAI,
    model: str = "gpt-4"
) -> tuple[float, float]:
    """Ask the LLM judge to compare two trajectories.

    Args:
        traj_a: Candidate presented to the judge as "LLM A". Its
            ``input_query`` is used as the user request in the prompt.
        traj_b: Candidate presented to the judge as "LLM B".
        llm_client: Client used for the chat-completions call.
        model: Judge model name.
            NOTE(review): the request uses JSON mode; confirm that the
            default model accepts ``response_format={"type": "json_object"}``.

    Returns:
        ``(score_a, score_b)``: the judge's ``combined_scores`` for
        ``LLM_A`` and ``LLM_B`` as floats.
    """

    def render_path(traj):
        # The judged path is the reasoning text of every step, one per line.
        return "\n".join(step.reasoning for step in traj.path)

    judge_prompt = build_judge_prompt(
        traj_a.input_query,
        path_a=render_path(traj_a),
        answer_a=traj_a.final_output,
        path_b=render_path(traj_b),
        answer_b=traj_b.final_output,
    )

    completion = await llm_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": judge_prompt}],
        temperature=0.0,  # Always deterministic judge
        response_format={"type": "json_object"},
    )

    verdict = json.loads(completion.choices[0].message.content)
    combined = verdict["combined_scores"]
    return float(combined["LLM_A"]), float(combined["LLM_B"])
async def arena_advantage_scoring(traj_group, arena_judge):
    """Rank a trajectory group with a seeded tournament and standardize the result.

    This is a coroutine: the caller in this file awaits it and passes an
    async judge, so the judge is awaited here. A plain synchronous judge is
    accepted as well.

    Args:
        traj_group: ``[anchor, traj_1, ..., traj_N-1]``. Each entry exposes
            its id either as a ``traject_id`` attribute or, for plain dict
            records, under the ``"traject_id"`` key. Ids must be unique.
        arena_judge: ``judge(traj_a, traj_b) -> (score_a, score_b)``, either
            a regular function or a coroutine function.

    Returns:
        A list of standardized advantages (plain floats), one per trajectory
        and in the same order as ``traj_group``. An empty group yields ``[]``
        and a single trajectory yields ``[0.0]``.
    """
    import inspect

    def traj_id(traj):
        # Support both attribute-style trajectories and dict records.
        return traj["traject_id"] if isinstance(traj, dict) else traj.traject_id

    async def judge(traj_a, traj_b):
        # Normalise sync and async judges to a single awaitable interface.
        outcome = arena_judge(traj_a, traj_b)
        if inspect.isawaitable(outcome):
            outcome = await outcome
        score_a, score_b = outcome
        return score_a, score_b

    n = len(traj_group)
    if n == 0:
        return []
    if n == 1:
        # Nothing to compare against; the only possible advantage is zero.
        return [0.0]

    anchor, others = traj_group[0], traj_group[1:]

    # Phase 1: anchor-based seeding -- every candidate is judged against the
    # anchor. The comparisons are independent, so they run concurrently.
    seed_results = await asyncio.gather(*(judge(t, anchor) for t in others))
    seed = {traj_id(t): s_i for t, (s_i, _) in zip(others, seed_results)}
    # The anchor's seed is the average of the scores it received.
    seed[traj_id(anchor)] = float(np.mean([s_anchor for _, s_anchor in seed_results]))

    seeded = sorted(traj_group, key=lambda t: seed[traj_id(t)], reverse=True)
    accumulated = dict(seed)  # running total of every score a trajectory earns

    # Phase 2: single-elimination tournament.
    # Pair the k-th best seed with the k-th worst, alternately filling the
    # bracket from the front and from the back.
    bracket = [None] * n
    head, tail = 0, n - 1
    for k in range(1, n // 2 + 1):
        high, low = seeded[k - 1], seeded[n - k]
        if k % 2 == 1:
            bracket[head], bracket[head + 1] = high, low
            head += 2
        else:
            bracket[tail], bracket[tail - 1] = high, low
            tail -= 2
    if n % 2 == 1:
        # Odd group: the middle seed has no partner; it takes the one slot
        # left free (head == tail here) and receives a bye.
        bracket[head] = seeded[n // 2]

    tiers = []  # losers of each round, earliest round first
    while len(bracket) > 1:
        pairs = [
            (bracket[i], bracket[i + 1]) for i in range(0, len(bracket) - 1, 2)
        ]
        results = await asyncio.gather(*(judge(a, b) for a, b in pairs))
        winners, losers = [], []
        for (traj_a, traj_b), (score_a, score_b) in zip(pairs, results):
            accumulated[traj_id(traj_a)] += score_a
            accumulated[traj_id(traj_b)] += score_b
            if score_a > score_b:
                winners.append(traj_a)
                losers.append(traj_b)
            else:
                # A tie advances the second entry, as before.
                winners.append(traj_b)
                losers.append(traj_a)
        if len(bracket) % 2 == 1:
            # Unpaired last entry advances on a bye so that every trajectory
            # ends up in exactly one tier.
            winners.append(bracket[-1])
        tiers.append(losers)
        bracket = winners
    tiers.append(bracket)  # champion forms the final tier

    # Phase 3: hierarchical rank assignment -- later tiers rank higher;
    # within a tier, order by accumulated score (descending).
    ranks = {}
    next_rank = 0
    for tier in reversed(tiers):
        for t in sorted(tier, key=lambda t: accumulated[traj_id(t)], reverse=True):
            ranks[traj_id(t)] = next_rank
            next_rank += 1

    # Phase 4: standardized advantage -- rank 0 maps to reward 1, rank N-1 to 0.
    rewards = np.array(
        [1 - ranks[traj_id(t)] / (n - 1) for t in traj_group], dtype=float
    )
    eps = 1e-8
    advantages = (rewards - rewards.mean()) / (rewards.std() + eps)
    return advantages.tolist()
# System instructions for the search agent, assembled one sentence at a time.
INSTRUCTIONS = " ".join(
    (
        "You are a research assistant.",
        "Given a search term, you search the web for that term and produce a concise summary of the results.",
        "The summary must be 2-3 paragraphs and less than 300 words.",
        "Capture the main points.",
        "Write succinctly, no need to have complete sentences or good grammar.",
        "This will be consumed by someone synthesizing a report, so its vital you capture the essence and ignore any fluff.",
        "Do not include any additional commentary other than the summary itself.",
    )
)
# Agent used by the example: follows INSTRUCTIONS and has a single tool,
# the hosted web search.
search_agent = Agent(
    name="Search agent",
    instructions=INSTRUCTIONS,
    tools=[WebSearchTool()],
    model="gpt-5.2",
)
async def run_example():
    """Generate a trajectory group for a sample request, judge it, and print the advantages."""
    essay_request = "Write an essay on the evolution of programming languages."
    group_size = 8

    # Generation: one anchor plus sampled rollouts from the search agent.
    rollouts = await generate_trajectory_group(search_agent, essay_request, group_size)

    # Client used only by the LLM judge.
    judge_client = AsyncOpenAI(api_key="YOUR_KEY", base_url="https://api.openai.com/v1")

    async def judge_pair(first, second):
        # Bind the judge client so the scorer only has to supply the pair.
        return await arena_judge(first, second, judge_client)

    standardized = await arena_advantage_scoring(rollouts, judge_pair)
    print("Standardized Advantages:", standardized)
if __name__ == "__main__":
    # The example coroutine in this file is `run_example`; the previously
    # referenced `run_arena_rl_example` is not defined anywhere.
    asyncio.run(run_example())
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment