DiTo97/ArenaRL.py

## ArenaRL.py
# /// script
# dependencies = [
#   "openai-agents",
# ]
# ///

import asyncio
import json
from dataclasses import dataclass
from typing import Sequence, AsyncGenerator

import numpy as np
from agents import Agent, Runner, Tool, WebSearchTool
from openai import AsyncOpenAI


@dataclass
class TrajectoryStep:
    reasoning: str
    tool_name: str | None
    tool_input: str | None
    tool_output: str | None


@dataclass
class Trajectory:
    """A complete agent run: the query, the recorded steps, and the final answer."""

    # Identifier of the run; within this file it is "anchor" or "sample_<i>".
    traject_id: str
    path: list[TrajectoryStep]      # Agent's reasoning and tool interactions
    final_output: str               # Agent's final answer
    input_query: str                # User's original request


async def agentic_loop(
    agent: Agent,
    input_query: str,
    temperature: float,
    *,
    run_id: str
) -> Trajectory:
    """
    Run the agent once on ``input_query`` and record its reasoning and tool use.

    The agent is executed a single time, so the recorded path and the final
    answer always belong to the same run.

    Args:
        agent: Agent to execute.
        input_query: The user's request, passed to the agent unchanged.
        temperature: Sampling temperature for this run only. It is applied via
            ``RunConfig.model_settings`` because ``Runner`` itself takes no
            ``temperature`` argument.
        run_id: Identifier stored on the returned trajectory.

    Returns:
        A ``Trajectory`` whose ``path`` holds one step per reasoning summary,
        intermediate message, and tool call produced during the run.
    """
    # Local import: keeps this function usable with only the file-level
    # ``Agent``/``Runner`` imports in place.
    from agents import ItemHelpers, ModelSettings, RunConfig

    result = await Runner.run(
        agent,
        input_query,
        run_config=RunConfig(model_settings=ModelSettings(temperature=temperature)),
    )
    final_output = result.final_output

    trace: list[TrajectoryStep] = []
    open_calls: dict[str, TrajectoryStep] = {}   # call id -> step awaiting its tool output
    prior_message = ""                           # latest text emitted before a tool call

    for item in result.new_items:
        raw = item.raw_item
        if item.type in ("reasoning_item", "message_output_item"):
            if item.type == "reasoning_item":
                # Reasoning items expose their text as a list of summary parts.
                text = "\n".join(part.text for part in (getattr(raw, "summary", None) or []))
            else:
                text = ItemHelpers.text_message_output(item)
            # The final answer is stored separately; do not repeat it in the path.
            if not text or text == final_output:
                continue
            prior_message = text
            trace.append(TrajectoryStep(
                reasoning=text,
                tool_name=None,
                tool_input=None,
                tool_output=None
            ))
        elif item.type == "tool_call_item":
            # Hosted tools (e.g. web search) carry no ``name``/``arguments``;
            # fall back to the raw item's ``type`` so the step is still labelled.
            step = TrajectoryStep(
                reasoning=prior_message,
                tool_name=getattr(raw, "name", None) or getattr(raw, "type", None),
                tool_input=getattr(raw, "arguments", None),
                tool_output=None
            )
            trace.append(step)
            call_id = getattr(raw, "call_id", None) or getattr(raw, "id", None)
            if call_id:
                open_calls[call_id] = step
        elif item.type == "tool_call_output_item":
            # The raw output item may be a plain dict or an object depending on SDK version.
            if isinstance(raw, dict):
                call_id = raw.get("call_id")
            else:
                call_id = getattr(raw, "call_id", None)
            step = open_calls.pop(call_id, None)
            if step is not None:
                step.tool_output = str(item.output)

    return Trajectory(
        traject_id=run_id,
        path=trace,
        final_output=final_output,
        input_query=input_query
    )


async def generate_trajectory_group(
    agent: Agent,
    user_query: str,
    N: int,
    *,
    sample_temperature: float = 0.8,
) -> list[Trajectory]:
    """
    Generate one anchor trajectory (temperature=0) and N-1 sampled trajectories.

    Args:
        agent: Agent used for every run.
        user_query: Request given to the agent in every run.
        N: Total group size including the anchor; ``N <= 1`` yields only the anchor.
        sample_temperature: Temperature for the N-1 non-anchor runs. The previous
            hard-coded value of 8 lies outside the 0-2 range accepted by the
            OpenAI API. NOTE(review): 0.8 is presumed to be the intended value - confirm.

    Returns:
        ``[anchor, sample_1, ..., sample_{N-1}]``, anchor always first.
    """
    runs = [agentic_loop(agent, user_query, temperature=0, run_id="anchor")]
    runs.extend(
        agentic_loop(agent, user_query, temperature=sample_temperature, run_id=f"sample_{i}")
        for i in range(1, N)
    )
    # The runs are independent, so the anchor is executed concurrently with the
    # samples; gather returns results in submission order, keeping the anchor first.
    return list(await asyncio.gather(*runs))


def build_judge_prompt(
    user_query: str,
    path_a: str, answer_a: str,
    path_b: str, answer_b: str
) -> str:
    """
    Build the pairwise judge prompt for two candidate responses.

    Args:
        user_query: The user's original writing request.
        path_a: Reasoning path of candidate A.
        answer_a: Final answer of candidate A.
        path_b: Reasoning path of candidate B.
        answer_b: Final answer of candidate B.

    Returns:
        The full prompt with the five inputs placed inside their tagged
        sections, followed by the scoring rubric and the required JSON
        output schema.
    """
    # Plug into the structure from the paper.
    # Only the five inputs are interpolated; the braces of the JSON output
    # schema are doubled so the f-string emits them literally.
    return f"""
You are a rigorously minded “Comprehensive LLM Writing Evaluator” versed in writing-assessment methodology. Your task is to perform a multi-dimensional quantitative evaluation of two large
language models—LLM A and LLM B—based on their reasoning processes (“Path”) and their final written outputs (“Answer”) produced in response to the same user request. Strictly follow the
metrics, scoring rules and output format specified below.
I. Input Format for the Items to Be Evaluated
(When evaluating, refer only to the text provided) ——————————
<USER_QUERY>
{user_query}
</USER_QUERY>
<PATH_A>
{path_a}
</PATH_A>
<PATH_B>
{path_b}
</PATH_B>
<ANSWER_A>
{answer_a}
</ANSWER_A>
<ANSWER_B>
{answer_b}
</ANSWER_B> ——————————
II. Path Evaluation (assess PATH only; ignore ANSWER)
[Dimension Descriptions]
1. Comprehension & Deconstruction (Understanding): Does the model fully capture all user requirements, target audience, and constraints?
2. Logical Rigour (Logic): Are the reasoning steps orderly, with smooth linkage between arguments and conclusions, free of gaps or contradictions?
3. Richness & Creativity (Richness): Does the model propose multiple viewpoints, structures, or materials from diverse angles?
[Scoring Rules]
• Each dimension: 0-10 points (0 = “entirely missing”, 10 = “outstanding”).
• Overall path score (overall_p) = arithmetic mean of the three dimensions, rounded to the nearest integer.
III. Answer Evaluation (assess ANSWER with reference to its PATH)
[Dimension Descriptions]
1. Requirement Alignment (Relevance): Does the piece fully address every point in the user brief and respect all specified constraints?
2. Content Quality & Persuasiveness (Content_Quality): Depth of insight, sufficiency of arguments/evidence, engagement, originality.
3. Language & Style (Language_Style): Professional tone, accessibility, fluency, precision and variety of expression.
4. Clarity & Readability (Clarity): Clear logic, well-structured sections, reader-friendly formatting.
[Scoring Rules]
• Each dimension: 0-10 points (0 = “entirely missing”, 10 = “outstanding”).
• Overall answer score (overall_a) = arithmetic mean of the four dimensions, rounded to the nearest integer.
IV. Combined Score & Winner Determination
1. combined_score = 0.4 × overall_p + 0.6 × overall_a, rounded to one decimal place.
2. If both models obtain the same combined_score, declare “Tie”.
V. Output Format (strictly follow; do not add, remove or reorder fields)
{{
"analysis": {{
"path_A": "<80-120 Chinese characters: highlight strengths and weaknesses of A’s path>",
"path_B": "<80-120 Chinese characters: highlight strengths and weaknesses of B’s path>",
"answer_A": "<80-120 Chinese characters: highlight strengths and weaknesses of A’s answer>",
"answer_B": "<80-120 Chinese characters: highlight strengths and weaknesses of B’s answer>"
}},
"path_scores": {{
"LLM_A": {{
"understanding": <0-10>,
"logic": <0-10>,
"richness": <0-10>,
"overall_p": <0-10>
}},
"LLM_B": {{
"understanding": <0-10>,
"logic": <0-10>,
"richness": <0-10>,
"overall_p": <0-10>
}}
}},
"answer_scores": {{
"LLM_A": {{
"relevance": <0-10>,
"content_quality": <0-10>,
"language_style": <0-10>,
"clarity": <0-10>,
"overall_a": <0-10>
}},
"LLM_B": {{
"relevance": <0-10>,
"content_quality": <0-10>,
"language_style": <0-10>,
"clarity": <0-10>,
"overall_a": <0-10>
}}
}},
"combined_scores": {{
"LLM_A": <0-10>,
"LLM_B": <0-10>
}},
"winner": "<LLM_A | LLM_B | Tie>"
}}
VI. Critical Requirements
1. Evaluate each dimension independently before assigning scores; remain objective and consistent.
2. Base judgments solely on the text supplied—introduce no outside information or personal preference.
3. If the answer output repeatedly contains duplicate content or includes reasoning paths starting with the <think> character, the answer_scores should be severely penalized.
4. Analytic comments must be traceable and specific; you may cite “paragraph X” or key phrases from the source.
5. Output must be valid JSON so that downstream programs can parse it.
    """


async def arena_judge(
    traj_a: Trajectory,
    traj_b: Trajectory,
    llm_client: AsyncOpenAI,
    model: str = "gpt-4"
) -> tuple[float, float]:
    """
    Ask the LLM judge to compare two trajectories and return their combined scores.

    Args:
        traj_a: Trajectory presented to the judge as "LLM A".
        traj_b: Trajectory presented to the judge as "LLM B".
        llm_client: Client used for the chat-completion request.
        model: Judge model name.

    Returns:
        ``(score_a, score_b)`` taken from the judge's ``combined_scores`` field.

    Raises:
        ValueError: If the judge returns no content, invalid JSON, or JSON
            without usable ``combined_scores`` entries.
    """
    # Only the reasoning text of each step is shown to the judge; steps with
    # no reasoning text are skipped so the join never sees a non-string.
    prompt = build_judge_prompt(
        traj_a.input_query,
        path_a="\n".join(s.reasoning for s in traj_a.path if s.reasoning),
        answer_a=traj_a.final_output,
        path_b="\n".join(s.reasoning for s in traj_b.path if s.reasoning),
        answer_b=traj_b.final_output,
    )
    # NOTE(review): JSON mode requires a model that supports
    # response_format={"type": "json_object"}; confirm the default "gpt-4" does.
    response = await llm_client.chat.completions.create(
        model=model,
        messages=[{"role": "user", "content": prompt}],
        temperature=0.0,  # Always deterministic judge
        response_format={"type": "json_object"}
    )
    content = response.choices[0].message.content
    if not content:
        # content can be None (e.g. a refusal); json.loads(None) would raise an opaque TypeError.
        raise ValueError("Judge returned an empty response; cannot extract scores.")
    verdict = json.loads(content)  # json.JSONDecodeError is itself a ValueError
    try:
        scores = verdict["combined_scores"]
        return float(scores["LLM_A"]), float(scores["LLM_B"])
    except (KeyError, TypeError) as err:
        raise ValueError(
            f"Judge response lacks usable combined_scores: {content!r}"
        ) from err


def _traj_id(traj):
    """Return a trajectory's id from a ``Trajectory``-like object or a dict-like record."""
    try:
        return traj.traject_id
    except AttributeError:
        return traj["traject_id"]


def _seed_bracket(seeded):
    """Lay out seeds (best first) so that the top seeds start at opposite ends of the bracket."""
    n = len(seeded)
    front, back = [], []
    for k in range(1, n // 2 + 1):
        high, low = seeded[k - 1], seeded[n - k]
        if k % 2 == 1:      # odd pairing: fill from the front
            front.extend((high, low))
        else:               # even pairing: fill from the back
            back[:0] = (low, high)
    bracket = front + back
    if n % 2 == 1:
        # Odd field: the median seed has no partner; it enters last and gets a first-round bye.
        bracket.append(seeded[n // 2])
    return bracket


def _arena_tournament(traj_group):
    """Generator running the tournament; yields batches of pairs to judge, returns the advantages.

    Each ``yield`` hands out a list of ``(traj_a, traj_b)`` pairs and expects a
    list of ``(score_a, score_b)`` results in the same order to be sent back.
    """
    n = len(traj_group)
    if n == 0:
        return []
    if n == 1:
        return [0.0]        # a lone trajectory has nothing to be compared against

    anchor, candidates = traj_group[0], traj_group[1:]

    # Phase 1: Anchor-Based Seeding - every candidate is scored against the anchor.
    seed_results = yield [(cand, anchor) for cand in candidates]
    seed = {}
    anchor_scores = []
    for cand, (s_cand, s_anchor) in zip(candidates, seed_results):
        seed[_traj_id(cand)] = s_cand
        anchor_scores.append(s_anchor)
    seed[_traj_id(anchor)] = float(np.mean(anchor_scores))  # anchor gets average of its scores

    seeded = sorted(traj_group, key=lambda t: seed[_traj_id(t)], reverse=True)
    total = dict(seed)      # accumulated score per trajectory (seed + every match played)

    # Phase 2: Single-Elimination Tournament.
    bracket = _seed_bracket(seeded)
    tiers = []
    while len(bracket) > 1:
        pairs = [(bracket[i], bracket[i + 1]) for i in range(0, len(bracket) - 1, 2)]
        results = yield pairs
        winners, losers = [], []
        for (ta, tb), (sa, sb) in zip(pairs, results):
            total[_traj_id(ta)] += sa
            total[_traj_id(tb)] += sb
            if sa > sb:
                winners.append(ta)
                losers.append(tb)
            else:           # ties go to the second entrant
                winners.append(tb)
                losers.append(ta)
        tiers.append(losers)
        # With an odd number of entrants the last one sits this round out; it
        # moves to the front so that it is guaranteed to play in the next round.
        bye = bracket[-1:] if len(bracket) % 2 == 1 else []
        bracket = bye + winners
    tiers.append(bracket)   # Champion final tier

    # Phase 3: Hierarchical Rank Assignment - later eliminations rank higher;
    # within a tier, the higher accumulated score ranks higher.
    ranks = {}
    next_rank = 0
    for tier in reversed(tiers):
        for t in sorted(tier, key=lambda t: total[_traj_id(t)], reverse=True):
            ranks[_traj_id(t)] = next_rank
            next_rank += 1

    # Phase 4: Standardized Advantage Calculation.
    rewards = np.array([1 - ranks[_traj_id(t)] / (n - 1) for t in traj_group], dtype=float)
    eps = 1e-8
    return ((rewards - rewards.mean()) / (rewards.std() + eps)).tolist()


def _run_tournament_sync(rounds, judge):
    """Drive the tournament generator with a synchronous judge."""
    try:
        batch = next(rounds)
        while True:
            batch = rounds.send([judge(a, b) for a, b in batch])
    except StopIteration as done:
        return done.value


async def _run_tournament_async(rounds, judge):
    """Drive the tournament generator with an async judge, judging each batch concurrently."""
    try:
        batch = next(rounds)
        while True:
            scores = await asyncio.gather(*(judge(a, b) for a, b in batch))
            batch = rounds.send(scores)
    except StopIteration as done:
        return done.value


def arena_advantage_scoring(traj_group, arena_judge):
    """
    Rank a trajectory group with a seeded single-elimination tournament and
    return standardized advantages.

    traj_group: [anchor, traj_1, ..., traj_N-1]. Items may be ``Trajectory``-like
        objects exposing ``traject_id`` or dict-like records with a
        ``"traject_id"`` key.
    arena_judge: function(traj_a, traj_b) -> score_a, score_b. May be a plain
        function or a coroutine function.

    Returns: standardized advantages A_i for each trajectory, in the order of
        ``traj_group``. With a plain judge the list is returned directly; with
        a coroutine-function judge an awaitable resolving to the list is
        returned, so callers write ``await arena_advantage_scoring(...)``.

    Group sizes that are not a power of two are handled with byes. An empty
    group yields ``[]`` and a single trajectory yields ``[0.0]``.
    """
    import inspect  # local import: the module header does not import it

    rounds = _arena_tournament(list(traj_group))
    if inspect.iscoroutinefunction(arena_judge):
        return _run_tournament_async(rounds, arena_judge)
    return _run_tournament_sync(rounds, arena_judge)


# System prompt for the web-search summarisation agent, assembled from its
# sentence fragments.
INSTRUCTIONS = "".join([
    "You are a research assistant. Given a search term, you search the web for that term and ",
    "produce a concise summary of the results. The summary must be 2-3 paragraphs and less than 300 ",
    "words. Capture the main points. Write succinctly, no need to have complete sentences or good ",
    "grammar. This will be consumed by someone synthesizing a report, so its vital you capture the ",
    "essence and ignore any fluff. Do not include any additional commentary other than the summary ",
    "itself.",
])

# Agent that answers by searching the web and summarising what it finds.
_search_tools = [WebSearchTool()]
search_agent = Agent(
    tools=_search_tools,
    instructions=INSTRUCTIONS,
    model="gpt-5.2",
    name="Search agent",
)


async def run_example():
    """
    End-to-end demo: sample a trajectory group, judge it in the arena, and
    print the standardized advantages.

    The judge client takes its API key from the ``OPENAI_API_KEY`` environment
    variable instead of a hard-coded placeholder.
    """
    # Agent with tools, loaded per openai-agents-python docs
    agent = search_agent
    user_query = "Write an essay on the evolution of programming languages."
    N = 8

    # Generation
    trajectories = await generate_trajectory_group(agent, user_query, N)

    # LLM Judge client. With ``api_key`` omitted the client reads
    # OPENAI_API_KEY from the environment; ``async with`` closes its
    # connection pool once scoring is done.
    async with AsyncOpenAI(base_url="https://api.openai.com/v1") as llm_client:

        # Arena scoring function that wraps your async judge
        async def arena_judge_wrapper(t1, t2):
            return await arena_judge(t1, t2, llm_client)

        advantages = await arena_advantage_scoring(trajectories, arena_judge_wrapper)

    print("Standardized Advantages:", advantages)


if __name__ == "__main__":
    # The demo coroutine defined in this file is ``run_example``; the
    # previously referenced ``run_arena_rl_example`` does not exist.
    asyncio.run(run_example())
	# /// script
	# dependencies = [
	# "openai-agents",
	# ]
	# ///

	import asyncio
	import json
	from dataclasses import dataclass
	from typing import Sequence, AsyncGenerator

	import numpy as np
	from agents import Agent, Runner, Tool, WebSearchTool
	from openai import AsyncOpenAI


	@dataclass
	class TrajectoryStep:
	reasoning: str
	tool_name: str | None
	tool_input: str | None
	tool_output: str | None


	@dataclass
	class Trajectory:
	traject_id: str
	path: list[TrajectoryStep] # Agent's reasoning and tool interactions
	final_output: str # Agent's final answer
	input_query: str # User's original request


	async def agentic_loop(
	agent: Agent,
	input_query: str,
	temperature: float,
	*,
	run_id: str
	) -> Trajectory:
	"""
	Runs an agentic loop, tracing all model/tool reasoning and actions for a given scenario.
	"""
	trace: list[TrajectoryStep] = []
	# The Runner API gives detailed yield results for tracing steps
	async for event in Runner.run_with_trace_async(
	agent,
	input_query,
	temperature=temperature
	):
	if event.type == "tool":
	trace.append(TrajectoryStep(
	reasoning=event.prior_message,
	tool_name=event.tool_name,
	tool_input=event.tool_input,
	tool_output=event.tool_output
	))
	elif event.type == "thought":
	trace.append(TrajectoryStep(
	reasoning=event.message,
	tool_name=None,
	tool_input=None,
	tool_output=None
	))
	# When iteration ends, the response is available
	final_output = await Runner.run_async(agent, input_query, temperature=temperature)
	return Trajectory(
	traject_id=run_id,
	path=trace,
	final_output=final_output.final_output,
	input_query=input_query
	)


	async def generate_trajectory_group(
	agent: Agent,
	user_query: str,
	N: int,
	) -> list[Trajectory]:
	"""
	Generate one anchor trajectory (temperature=0) and (N-1) samples (temperature=8).
	"""
	anchor: Trajectory = await agentic_loop(agent, user_query, temperature=0, run_id="anchor")
	coros = [
	agentic_loop(agent, user_query, temperature=8, run_id=f"sample_{i}")
	for i in range(1, N)
	]
	high_entropy_samples: list[Trajectory] = await asyncio.gather(*coros)
	return [anchor, *high_entropy_samples]


	def build_judge_prompt(
	user_query: str,
	path_a: str, answer_a: str,
	path_b: str, answer_b: str
	) -> str:
	# Plug into the structure from the paper
	return f"""
	You are a rigorously minded “Comprehensive LLM Writing Evaluator” versed in writing-assessment methodology. Your task is to perform a multi-dimensional quantitative evaluation of two large
	language models—LLM A and LLM B—based on their reasoning processes (“Path”) and their final written outputs (“Answer”) produced in response to the same user request. Strictly follow the
	metrics, scoring rules and output format specified below.
	I. Input Format for the Items to Be Evaluated
	(When evaluating, refer only to the text provided) ——————————
	<USER_QUERY>
	{The user’s original writing request}
	</USER_QUERY>
	<PATH_A>
	{LLM A’s complete chain of thought / reasoning path, including key-point breakdown, information retrieval, structural planning, etc.}
	</PATH_A>
	<PATH_B>
	{LLM B’s complete chain of thought / reasoning path, including key-point breakdown, information retrieval, structural planning, etc.}
	</PATH_B>
	<ANSWER_A>
	{LLM A’s final article / copy}
	</ANSWER_A>
	<ANSWER_B>
	{LLM B’s final article / copy}
	</ANSWER_B> ——————————
	II. Path Evaluation (assess PATH only; ignore ANSWER)
	[Dimension Descriptions]
	1. Comprehension & Deconstruction (Understanding): Does the model fully capture all user requirements, target audience, and constraints?
	2. Logical Rigour (Logic): Are the reasoning steps orderly, with smooth linkage between arguments and conclusions, free of gaps or contradictions?
	3. Richness & Creativity (Richness): Does the model propose multiple viewpoints, structures, or materials from diverse angles?
	[Scoring Rules]
	• Each dimension: 0-10 points (0 = “entirely missing”, 10 = “outstanding”).
	• Overall path score (overall_p) = arithmetic mean of the three dimensions, rounded to the nearest integer.
	III. Answer Evaluation (assess ANSWER with reference to its PATH)
	[Dimension Descriptions]
	1. Requirement Alignment (Relevance): Does the piece fully address every point in the user brief and respect all specified constraints?
	2. Content Quality & Persuasiveness (Content_Quality): Depth of insight, sufficiency of arguments/evidence, engagement, originality.
	3. Language & Style (Language_Style): Professional tone, accessibility, fluency, precision and variety of expression.
	4. Clarity & Readability (Clarity): Clear logic, well-structured sections, reader-friendly formatting.
	[Scoring Rules]
	• Each dimension: 0-10 points (0 = “entirely missing”, 10 = “outstanding”).
	• Overall answer score (overall_a) = arithmetic mean of the four dimensions, rounded to the nearest integer.
	IV. Combined Score & Winner Determination
	1. combined_score = 0.4 × overall_p + 0.6 × overall_a, rounded to one decimal place.
	2. If both models obtain the same combined_score, declare “Tie”.
	V. Output Format (strictly follow; do not add, remove or reorder fields)
	{
	"analysis": {
	"path_A": "<80-120 Chinese characters: highlight strengths and weaknesses of A’s path>",
	"path_B": "<80-120 Chinese characters: highlight strengths and weaknesses of B’s path>",
	"answer_A": "<80-120 Chinese characters: highlight strengths and weaknesses of A’s answer>",
	"answer_B": "<80-120 Chinese characters: highlight strengths and weaknesses of B’s answer>"
	},
	"path_scores": {
	"LLM_A": {
	"understanding": <0-10>,
	"logic": <0-10>,
	"richness": <0-10>,
	"overall_p": <0-10>
	},
	"LLM_B": {
	"understanding": <0-10>,
	"logic": <0-10>,
	"richness": <0-10>,
	"overall_p": <0-10>
	}
	},
	"answer_scores": {
	"LLM_A": {
	"relevance": <0-10>,
	"content_quality": <0-10>,
	"language_style": <0-10>,
	"clarity": <0-10>,
	"overall_a": <0-10>
	},
	"LLM_B": {
	"relevance": <0-10>,
	"content_quality": <0-10>,
	"language_style": <0-10>,
	"clarity": <0-10>,
	"overall_a": <0-10>
	}
	},
	"combined_scores": {
	"LLM_A": <0-10>,
	"LLM_B": <0-10>
	},
	"winner": "<LLM_A | LLM_B | Tie>"
	}
	VI. Critical Requirements
	1. Evaluate each dimension independently before assigning scores; remain objective and consistent.
	2. Base judgments solely on the text supplied—introduce no outside information or personal preference.
	3. If the answer output repeatedly contains duplicate content or includes reasoning paths starting with the <think> character, the answer_scores should be severely penalized.
	4. Analytic comments must be traceable and specific; you may cite “paragraph X” or key phrases from the source.
	5. Output must be valid JSON so that downstream programs can parse it.
	"""


	async def arena_judge(
	traj_a: Trajectory,
	traj_b: Trajectory,
	llm_client: AsyncOpenAI,
	model: str = "gpt-4"
	) -> tuple[float, float]:
	"""
	Calls the LLM judge and returns (score_a, score_b) as floats for ArenaRL.
	"""
	prompt = build_judge_prompt(
	traj_a.input_query,
	path_a="\n".join(s.reasoning for s in traj_a.path),
	answer_a=traj_a.final_output,
	path_b="\n".join(s.reasoning for s in traj_b.path),
	answer_b=traj_b.final_output,
	)
	response = await llm_client.chat.completions.create(
	model=model,
	messages=[{"role": "user", "content": prompt}],
	temperature=0.0, # Always deterministic judge
	response_format={"type": "json_object"}
	)
	j = json.loads(response.choices[0].message.content)
	score_a, score_b = float(j["combined_scores"]["LLM_A"]), float(j["combined_scores"]["LLM_B"])
	return score_a, score_b


	def arena_advantage_scoring(traj_group, arena_judge):
	"""
	traj_group: [anchor, traj_1, ..., traj_N-1]
	arena_judge: function(traj_a, traj_b) -> score_a, score_b
	Returns: standardized advantages A_i for each trajectory
	"""
	N = len(traj_group)
	anchor = traj_group[0]
	others = traj_group[1:]

	# Phase 1: Anchor-Based Seeding
	# For each candidate, score vs anchor
	S_init = {}
	anchor_scores = []
	for i, traj in enumerate(others):
	s_i, s_anchor = arena_judge(traj, anchor)
	S_init[traj['traject_id']] = s_i
	anchor_scores.append(s_anchor)
	S_init[anchor['traject_id']] = np.mean(anchor_scores) # anchor gets average of its scores

	# Sort descendingly by seed scores
	ordered = sorted(traj_group, key=lambda t: S_init[t['traject_id']], reverse=True)
	P = ordered

	# Accumulated scores
	V = {t['traject_id']: S_init[t['traject_id']] for t in traj_group}
	tiers = []

	# Phase 2: Single-Elimination Tournament
	B = [None] * N
	idx_head = 0
	idx_tail = N - 1
	for k in range(1, N//2 + 1):
	t_high, t_low = P[k-1], P[N-k]
	if k % 2 == 1: # odd, fill from front
	B[idx_head] = t_high
	B[idx_head+1] = t_low
	idx_head += 2
	else: # even, fill from back
	B[idx_tail] = t_high
	B[idx_tail-1] = t_low
	idx_tail -= 2

	B = [x for x in B if x is not None]
	while len(B) > 1:
	W_round, L_round = [], []
	for k in range(len(B)//2):
	ta, tb = B[2*k], B[2*k+1]
	sa, sb = arena_judge(ta, tb)
	V[ta['traject_id']] += sa
	V[tb['traject_id']] += sb
	if sa > sb:
	W_round.append(ta)
	L_round.append(tb)
	else:
	W_round.append(tb)
	L_round.append(ta)
	tiers.append(L_round)
	B = W_round
	tiers.append(B) # Champion final tier

	# Phase 3: Hierarchical Rank Assignment
	k = 0
	ranks = {}
	for tier in reversed(tiers):
	# Sort within tier by accumulated score, descending
	tier_sorted = sorted(tier, key=lambda t: V[t['traject_id']], reverse=True)
	for t in tier_sorted:
	ranks[t['traject_id']] = k
	k += 1

	# Phase 4: Standardized Advantage Calculation
	rewards = {}
	for t in traj_group:
	rewards[t['traject_id']] = 1 - ranks[t['traject_id']] / (N - 1)
	μ_r = np.mean(list(rewards.values()))
	σ_r = np.std(list(rewards.values()))
	eps = 1e-8
	A = []
	for t in traj_group:
	A_i = (rewards[t['traject_id']] - μ_r) / (σ_r + eps)
	A.append(A_i)

	return A


	INSTRUCTIONS = (
	"You are a research assistant. Given a search term, you search the web for that term and "
	"produce a concise summary of the results. The summary must be 2-3 paragraphs and less than 300 "
	"words. Capture the main points. Write succinctly, no need to have complete sentences or good "
	"grammar. This will be consumed by someone synthesizing a report, so its vital you capture the "
	"essence and ignore any fluff. Do not include any additional commentary other than the summary "
	"itself."
	)

	search_agent = Agent(
	name="Search agent",
	model="gpt-5.2",
	instructions=INSTRUCTIONS,
	tools=[WebSearchTool()],
	)


	async def run_example():
	# Agent with tools, loaded per openai-agents-python docs
	agent = search_agent
	user_query = "Write an essay on the evolution of programming languages."
	N = 8

	# Generation
	trajectories = await generate_trajectory_group(agent, user_query, N)

	# LLM Judge client
	llm_client = AsyncOpenAI(api_key="YOUR_KEY", base_url="https://api.openai.com/v1")

	# Arena scoring function that wraps your async judge
	async def arena_judge_wrapper(t1, t2):
	return await arena_judge(t1, t2, llm_client)

	advantages = await arena_advantage_scoring(trajectories, arena_judge_wrapper)
	print("Standardized Advantages:", advantages)


	if __name__ == "__main__":
	asyncio.run(run_arena_rl_example())
No results found