# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
qgallouedec / train.py
Last active September 13, 2025 23:17
from datasets import load_dataset
from trl import SFTTrainer
dataset = load_dataset("trl-lib/Capybara", split="train")
trainer = SFTTrainer(
    model="Qwen/Qwen2.5-0.5B",
    train_dataset=dataset,
)
trainer.train()
trainer.push_to_hub("Qwen2.5-0.5B-SFT")
import pandas as pd
from datetime import datetime, timedelta
from datasets import load_dataset
# Helper function to filter data based on time range
def filter_date_range(df, date_col, start_date, end_date):
    return df[(df[date_col] >= start_date) & (df[date_col] < end_date)]
# Get the current time
now = datetime.now()
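# A minimal usage sketch of filter_date_range, reusing the imports and `now` above.
# The column name "created_at" and the 7-day window are hypothetical, not from the gist.
df = pd.DataFrame({"created_at": [now - timedelta(days=i) for i in range(30)]})
last_week = filter_date_range(df, "created_at", now - timedelta(days=7), now)
print(len(last_week))  # 7 rows fall inside the half-open window [now - 7 days, now)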
from transformers import pipeline
from string import Template
template = Template(
    """Please act as an impartial judge and evaluate the quality of the responses provided by
two AI assistants to the user question displayed below. Your evaluation should consider
factors such as helpfulness and relevance. Ensure that the order in which the responses
were presented does not influence your decision. Answer just by [[A]] if assistant A is better,
[[B]] if assistant B is better, and [[C]] for a tie."""
)
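# A hedged sketch of how this judge prompt could be wired to the `pipeline` import above.
# The placeholder names ($question, $answer_a, $answer_b), the judge model, and the sample
# answers are all assumptions, not part of the original gist.
judge_template = Template(
    template.template
    + "\n\n[Question]\n$question\n\n[Assistant A]\n$answer_a\n\n[Assistant B]\n$answer_b"
)
judge = pipeline("text-generation", model="Qwen/Qwen2.5-0.5B-Instruct")
prompt = judge_template.substitute(
    question="What is the capital of France?",
    answer_a="Paris.",
    answer_b="I think it might be Lyon.",
)
verdict = judge(prompt, max_new_tokens=8, return_full_text=False)[0]["generated_text"]
print(verdict)  # expected to contain [[A]], [[B]] or [[C]]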
import urllib.request
from datetime import datetime
import wandb
import yaml
from yaml.loader import SafeLoader
atari_ids = [
# "AdventureNoFrameskip-v4",
# "AirRaidNoFrameskip-v4",
qgallouedec / MyBeautifulGraph0.py
Last active November 28, 2021 17:09
MyBeautifulGraph0
from manim import * # I'll skip this line for the following
class MyBeautifulGraph(Scene):
    def construct(self):
        axes = Axes()
        self.add(axes)
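# To render this scene, one common manim CLI invocation (the quality flags are just one choice):
#   manim -pql MyBeautifulGraph0.py MyBeautifulGraph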
for i_episode in range(nb_episodes):
    # HERE: THE CODE TO
    # initialize the episode and loop until the game is over
    # ...
    # improve the policy after each transition
    # ...
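# One possible way to fill in this skeleton, assuming a Gym-style `env` and a
# hypothetical `update_policy(state, action, reward, next_state)` helper (neither
# is defined in the gist; this is only a sketch).
for i_episode in range(nb_episodes):
    state = env.reset()
    done = False
    while not done:
        action = policy(state)  # `policy` is assumed to map a state to an action
        next_state, reward, done, info = env.step(action)
        update_policy(state, action, reward, next_state)  # improve the policy after each transition
        state = next_state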
# if the episode ends with a reward
if reward:
    # decrease epsilon
    for t in range(len(states)):
        # Compute the discounted return Gt from time t
        # Gt = rewards[t] + gamma * Gt
        Gt = compute_gain(rewards, t, gamma)
        # \delta_t = G_t - Q(S_t, A_t)
        delta_t = Gt - Q[states[t]][actions[t]]
        # Increment the visit counter for the state-action pair
        N[states[t]][actions[t]] += 1
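# The gist calls compute_gain without showing it; following the recursive comment
# "Gt = rewards[t] + gamma * Gt", one possible implementation (a sketch, not
# necessarily the author's) is:
def compute_gain(rewards, t, gamma):
    # Discounted return from time t: G_t = r_t + gamma * r_{t+1} + gamma^2 * r_{t+2} + ...
    Gt = 0.0
    for reward in reversed(rewards[t:]):
        Gt = reward + gamma * Gt
    return Gt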
# \delta_t = R_{t+1} + \gamma * Q(S_{t+1}, A_{t+1}) - Q(S_t, A_t)
delta_t = reward + gamma * Q[next_state][next_action] - Q[state][action]
# Add alpha * \delta_t to the current action-value estimate
# Q(S_t, A_t) += alpha * \delta_t
Q[state][action] += alpha * delta_t
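# A hedged sketch of one full SARSA episode around the update above, assuming a
# Gym-style `env` and a hypothetical `epsilon_greedy_action(Q, state, eps)` helper.
state = env.reset()
action = epsilon_greedy_action(Q, state, eps)
done = False
while not done:
    next_state, reward, done, info = env.step(action)
    next_action = epsilon_greedy_action(Q, next_state, eps)
    # TD error: delta_t = R_{t+1} + gamma * Q(S_{t+1}, A_{t+1}) - Q(S_t, A_t)
    delta_t = reward + gamma * Q[next_state][next_action] - Q[state][action]
    Q[state][action] += alpha * delta_t
    state, action = next_state, next_action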
import numpy as np

# Initialize Q and N
Q_pi = np.ones((nb_states, nb_actions))
N = np.zeros_like(Q_pi)
for i_generation in range(nb_generation):
    # Define π' = ε-greedy policy w.r.t. Q_π
    pi = epsilon_greedy_policy(Q_pi, eps)
    # Generate one episode by following π'
    states, actions, rewards = generate_episode(env, pi)
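# Neither helper used above is shown in the gist; the sketches below follow the
# comments (ε-greedy policy w.r.t. Q, then one generated episode) under assumed
# signatures and a Gym-style `env`.
def epsilon_greedy_policy(Q, eps):
    nb_actions = Q.shape[1]
    def pi(state):
        # With probability eps pick a random action, otherwise the greedy one
        if np.random.rand() < eps:
            return np.random.randint(nb_actions)
        return int(np.argmax(Q[state]))
    return pi

def generate_episode(env, pi):
    states, actions, rewards = [], [], []
    state = env.reset()
    done = False
    while not done:
        action = pi(state)
        next_state, reward, done, info = env.step(action)
        states.append(state)
        actions.append(action)
        rewards.append(reward)
        state = next_state
    return states, actions, rewards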