# coding=utf-8
# Copyright 2025 The HuggingFace Inc. team
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
qgallouedec / train.py
Last active September 13, 2025 23:17
from datasets import load_dataset
from trl import SFTTrainer
dataset = load_dataset("trl-lib/Capybara", split="train")
trainer = SFTTrainer(
    model="Qwen/Qwen2.5-0.5B",
    train_dataset=dataset,
)
trainer.train()
trainer.push_to_hub("Qwen2.5-0.5B-SFT")
import pandas as pd
from datetime import datetime, timedelta
from datasets import load_dataset
# Helper function to filter data based on time range
def filter_date_range(df, date_col, start_date, end_date):
    return df[(df[date_col] >= start_date) & (df[date_col] < end_date)]
# Get the current time
now = datetime.now()
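# A minimal usage sketch of filter_date_range, reusing the imports and `now` above.
# The column name "created_at" and the 7-day window are hypothetical, not from the gist.
df = pd.DataFrame({"created_at": [now - timedelta(days=i) for i in range(30)]})
last_week = filter_date_range(df, "created_at", now - timedelta(days=7), now)
print(len(last_week))  # 7 rows fall inside the half-open window [now - 7 days, now)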
from transformers import pipeline
from string import Template
template = Template(
    """Please act as an impartial judge and evaluate the quality of the responses provided by
two AI assistants to the user question displayed below. Your evaluation should consider
factors such as helpfulness and relevance. Ensure that the order in which the responses
were presented does not influence your decision. Answer just by [[A]] if assistant A is better,
[[B]] if assistant B is better, and [[C]] for a tie."""
)
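# A hedged sketch of how this judge prompt could be wired to the `pipeline` import above.
# The placeholder names ($question, $answer_a, $answer_b), the judge model, and the sample
# answers are all assumptions, not part of the original gist.
judge_template = Template(
    template.template
    + "\n\n[Question]\n$question\n\n[Assistant A]\n$answer_a\n\n[Assistant B]\n$answer_b"
)
judge = pipeline("text-generation", model="Qwen/Qwen2.5-0.5B-Instruct")
prompt = judge_template.substitute(
    question="What is the capital of France?",
    answer_a="Paris.",
    answer_b="I think it might be Lyon.",
)
verdict = judge(prompt, max_new_tokens=8, return_full_text=False)[0]["generated_text"]
print(verdict)  # expected to contain [[A]], [[B]] or [[C]]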
import urllib.request
from datetime import datetime
import wandb
import yaml
from yaml.loader import SafeLoader
atari_ids = [
# "AdventureNoFrameskip-v4",
# "AirRaidNoFrameskip-v4",
qgallouedec / MyBeautifulGraph0.py
Last active November 28, 2021 17:09
MyBeautifulGraph0
from manim import * # I'll skip this line for the following
class MyBeautifulGraph(Scene):
    def construct(self):
        axes = Axes()
        self.add(axes)
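# To render this scene, one common manim CLI invocation (the quality flags are just one choice):
#   manim -pql MyBeautifulGraph0.py MyBeautifulGraph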
for i_episode in range(nb_episodes):
    # HERE: THE CODE TO
    # initialize the episode and loop until the game is over
    # ...
    # improve the policy after each transition
    # ...
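# One possible way to fill in this skeleton, assuming a Gym-style `env` and a
# hypothetical `update_policy(state, action, reward, next_state)` helper (neither
# is defined in the gist; this is only a sketch).
for i_episode in range(nb_episodes):
    state = env.reset()
    done = False
    while not done:
        action = policy(state)  # `policy` is assumed to map a state to an action
        next_state, reward, done, info = env.step(action)
        update_policy(state, action, reward, next_state)  # improve the policy after each transition
        state = next_state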
# if the episode ends with a reward
if reward:
    # decrease epsilon
    for t in range(len(states)):
        # Compute the discounted return Gt from time t
        # Gt = rewards[t] + gamma * Gt
        Gt = compute_gain(rewards, t, gamma)
        # \delta_t = G_t - Q(S_t, A_t)
        delta_t = Gt - Q[states[t]][actions[t]]
        # Increment the visit counter for the state-action pair
        N[states[t]][actions[t]] += 1
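# The gist calls compute_gain without showing it; following the recursive comment
# "Gt = rewards[t] + gamma * Gt", one possible implementation (a sketch, not
# necessarily the author's) is:
def compute_gain(rewards, t, gamma):
    # Discounted return from time t: G_t = r_t + gamma * r_{t+1} + gamma^2 * r_{t+2} + ...
    Gt = 0.0
    for reward in reversed(rewards[t:]):
        Gt = reward + gamma * Gt
    return Gt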
# \delta_t = R_{t+1} + \gamma * Q(S_{t+1}, A_{t+1}) - Q(S_t, A_t)
delta_t = reward + gamma * Q[next_state][next_action] - Q[state][action]
# Add alpha * \delta_t to the current action-value estimate
# Q(S_t, A_t) += alpha * \delta_t
Q[state][action] += alpha * delta_t
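# A hedged sketch of one full SARSA episode around the update above, assuming a
# Gym-style `env` and a hypothetical `epsilon_greedy_action(Q, state, eps)` helper.
state = env.reset()
action = epsilon_greedy_action(Q, state, eps)
done = False
while not done:
    next_state, reward, done, info = env.step(action)
    next_action = epsilon_greedy_action(Q, next_state, eps)
    # TD error: delta_t = R_{t+1} + gamma * Q(S_{t+1}, A_{t+1}) - Q(S_t, A_t)
    delta_t = reward + gamma * Q[next_state][next_action] - Q[state][action]
    Q[state][action] += alpha * delta_t
    state, action = next_state, next_action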
import numpy as np

# Initialize Q and N
Q_pi = np.ones((nb_states, nb_actions))
N = np.zeros_like(Q_pi)
for i_generation in range(nb_generation):
    # Define π' = ε-greedy policy w.r.t. Q_π
    pi = epsilon_greedy_policy(Q_pi, eps)
    # Generate one episode by following π'
    states, actions, rewards = generate_episode(env, pi)
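# Neither helper used above is shown in the gist; the sketches below follow the
# comments (ε-greedy policy w.r.t. Q, then one generated episode) under assumed
# signatures and a Gym-style `env`.
def epsilon_greedy_policy(Q, eps):
    nb_actions = Q.shape[1]
    def pi(state):
        # With probability eps pick a random action, otherwise the greedy one
        if np.random.rand() < eps:
            return np.random.randint(nb_actions)
        return int(np.argmax(Q[state]))
    return pi

def generate_episode(env, pi):
    states, actions, rewards = [], [], []
    state = env.reset()
    done = False
    while not done:
        action = pi(state)
        next_state, reward, done, info = env.step(action)
        states.append(state)
        actions.append(action)
        rewards.append(reward)
        state = next_state
    return states, actions, rewards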