GPT-OSS Fine-Tuning
# train.py
# Run with: accelerate launch --num_processes 4 train.py
# Make sure `accelerate config` is set up for DDP; otherwise accelerate will fall back to defaults.
import os
import torch
from datasets import load_dataset
from peft import LoraConfig, get_peft_model, TaskType
from transformers import (
    AutoModelForCausalLM,
    AutoTokenizer,
    BitsAndBytesConfig,
    TrainingArguments,
)
from trl import SFTTrainer, SFTConfig
import wandb
wandb.init(project="gpt-oss-finetune", name="gpt-oss-20b-lora")
# Model and tokenizer
model_name = "openai/gpt-oss-20b"
tokenizer = AutoTokenizer.from_pretrained(model_name)
if tokenizer.pad_token is None:
    tokenizer.pad_token = tokenizer.eos_token
# Quantization config with bitsandbytes
bnb_config = BitsAndBytesConfig(
    load_in_4bit=True,
    bnb_4bit_use_double_quant=True,
    bnb_4bit_quant_type="nf4",
    bnb_4bit_compute_dtype=torch.bfloat16,
)
# Load model
model = AutoModelForCausalLM.from_pretrained(
    model_name,
    quantization_config=bnb_config,
    # With `accelerate launch` (DDP), each process should keep the whole model on its own GPU,
    # so map to the local rank instead of letting "auto" shard it across devices.
    device_map={"": int(os.environ.get("LOCAL_RANK", "0"))},
    torch_dtype=torch.bfloat16,
    trust_remote_code=True,  # If needed for custom model
)
# LoRA config - specific targets for efficiency on this MoE model
peft_config = LoraConfig(
    task_type=TaskType.CAUSAL_LM,
    inference_mode=False,
    r=8,
    lora_alpha=16,
    lora_dropout=0.1,
    target_modules=[
        "7.mlp.experts.gate_up_proj",
        "7.mlp.experts.down_proj",
        "15.mlp.experts.gate_up_proj",
        "15.mlp.experts.down_proj",
        "23.mlp.experts.gate_up_proj",
        "23.mlp.experts.down_proj",
    ],
)
# Apply LoRA
model = get_peft_model(model, peft_config)
model.print_trainable_parameters()
# Load dataset
dataset = load_dataset("HuggingFaceH4/Multilingual-Thinking", split="train")
# Formatting function - assuming dataset has 'messages' in chat format
# Adjust based on actual dataset structure; this formats to a simple prompt-response
def formatting_prompts_func(example):
    # Note: tokenizer.apply_chat_template(messages, tokenize=False) could be used
    # instead to follow the model's own chat template.
    messages = example["messages"]
    prompt = ""
    for msg in messages:
        role = msg["role"]
        content = msg["content"]
        if role == "user":
            prompt += f"<|user|>\n{content}<|end|>\n"
        elif role == "assistant":
            prompt += f"<|assistant|>\n{content}<|end|>\n"
    return prompt.strip()
# Training config
training_args = SFTConfig(
    output_dir="./gpt-oss-20b-finetuned",
    num_train_epochs=1,
    per_device_train_batch_size=2,  # Adjust based on memory; effective batch size = 2 * 4 GPUs * gradient_accumulation_steps
    gradient_accumulation_steps=4,
    gradient_checkpointing=True,
    optim="paged_adamw_8bit",
    save_steps=500,
    logging_steps=10,
    learning_rate=2e-4,
    weight_decay=0.001,
    fp16=False,
    bf16=True,
    max_grad_norm=0.3,
    max_steps=-1,
    warmup_ratio=0.03,
    group_by_length=True,
    lr_scheduler_type="cosine",
    report_to="wandb",
    run_name="gpt-oss-20b-multilingual",
    push_to_hub=False,  # Set to True if you want to push
    # hub_model_id="your-username/gpt-oss-20b-finetuned",
)
# Trainer
# Note: in newer TRL releases, max_seq_length/packing move into SFTConfig and the
# tokenizer is passed as `processing_class`; adjust to your installed TRL version.
trainer = SFTTrainer(
    model=model,
    train_dataset=dataset,
    formatting_func=formatting_prompts_func,
    max_seq_length=2048,
    tokenizer=tokenizer,
    args=training_args,
    packing=False,  # Set to True for better efficiency if no formatting func
)
# Train
trainer.train()
# Save
trainer.save_model()
wandb.finish()
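
Once training finishes, a quick generation pass with the model still in memory is a useful sanity check that the adapter is being applied. The prompt below is purely illustrative and mirrors the simple chat format used in formatting_prompts_func:

# Quick post-training sanity check (illustrative prompt; in a multi-process run
# you would typically only do this on the main process)
model.eval()
prompt = "<|user|>\nExplain why the sky is blue, in French.<|end|>\n<|assistant|>\n"
inputs = tokenizer(prompt, return_tensors="pt").to(model.device)
with torch.no_grad():
    outputs = model.generate(**inputs, max_new_tokens=128)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))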

Unsloth Integration

Unsloth is an open-source framework for fine-tuning and reinforcement learning that trains LLMs (such as Llama, OpenAI gpt-oss, Mistral, Gemma, DeepSeek, and more) up to 2× faster with up to 80% less VRAM. Models trained with Unsloth can also be evaluated, run, and deployed with other inference engines such as llama.cpp, Ollama, and vLLM.

The library provides a streamlined, Hugging Face-compatible workflow for training, evaluation, inference, and deployment, and is fully compatible with SFTTrainer.

Key Features

  • Training support for all transformer-compatible models: text-to-speech (TTS), multimodal, BERT, RL, and more
  • Supports full fine-tuning, pretraining, LoRA, QLoRA, 8-bit training, and more
  • Works on Linux, Windows, Colab, and Kaggle with NVIDIA GPUs; AMD and Intel support is coming soon
  • Supports most features TRL supports, including RLHF (GSPO, GRPO, DPO etc.)
  • Hand-written Triton kernels and a manual backprop engine ensure no accuracy degradation (0% approximation error)

Installation

Pip Install

Local Installation (Linux recommended):

pip install unsloth

You can also install Unsloth by following the official documentation. Once installed, incorporating Unsloth into your workflow is simple: instead of loading AutoModelForCausalLM, you just load a FastLanguageModel as follows:

import torch
from trl import SFTConfig, SFTTrainer
from unsloth import FastLanguageModel

max_length = 2048 # Supports automatic RoPE Scaling, so choose any number

# Load model
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/mistral-7b",
    max_seq_length=max_length,
    dtype=None,  # None for auto detection. Float16 for Tesla T4, V100, Bfloat16 for Ampere+
    load_in_4bit=True,  # Use 4bit quantization to reduce memory usage. Can be False
)

# Do model patching and add fast LoRA weights
model = FastLanguageModel.get_peft_model(
    model,
    r=16,
    target_modules=[
        "q_proj",
        "k_proj",
        "v_proj",
        "o_proj",
        "gate_proj",
        "up_proj",
        "down_proj",
    ],
    lora_alpha=16,
    lora_dropout=0,  # Dropout = 0 is currently optimized
    bias="none",  # Bias = "none" is currently optimized
    use_gradient_checkpointing=True,
    random_state=3407,
)

training_args = SFTConfig(output_dir="./output", max_length=max_length)

# `dataset` is assumed to be a Hugging Face Dataset prepared earlier, e.g. via datasets.load_dataset(...)
trainer = SFTTrainer(
    model=model,
    args=training_args,
    train_dataset=dataset,
)
trainer.train()

The saved model is fully compatible with Hugging Face's transformers library. Learn more about unsloth in their official repository.
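
As a quick illustration of that compatibility, an adapter saved with save_pretrained (or trainer.save_model) can be reloaded with plain transformers and peft. A minimal sketch; the "lora_model" directory is a placeholder and is assumed to also contain the saved tokenizer:

from peft import AutoPeftModelForCausalLM
from transformers import AutoTokenizer

# "lora_model" is a placeholder for wherever the adapter was saved
model = AutoPeftModelForCausalLM.from_pretrained("lora_model", device_map="auto")
tokenizer = AutoTokenizer.from_pretrained("lora_model")

inputs = tokenizer("Hello, how are you?", return_tensors="pt").to(model.device)
outputs = model.generate(**inputs, max_new_tokens=64)
print(tokenizer.decode(outputs[0], skip_special_tokens=True))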

Docker Install

docker run -d -e JUPYTER_PASSWORD="mypassword" \
  -p 8888:8888 -p 2222:22 \
  -v $(pwd)/work:/workspace/work \
  --gpus all \
  unsloth/unsloth

Access Jupyter Lab at http://localhost:8888 and start fine-tuning!

Training

These are some core settings you can toggle before training:

  • max_seq_length = 2048 – Controls context length. While Llama-3 supports 8192, we recommend 2048 for testing. Unsloth enables 4× longer context fine-tuning.
  • dtype = None – Defaults to None for auto-detection; use torch.float16 on older GPUs (e.g. Tesla T4, V100) and torch.bfloat16 on Ampere or newer.
  • load_in_4bit = True – Enables 4-bit quantization, reducing memory use roughly 4× for fine-tuning. Set it to False for 16-bit LoRA fine-tuning.
  • To enable full fine-tuning (FFT), set full_finetuning = True. For 8-bit fine-tuning, set load_in_8bit = True. Note: only one of these training modes can be set to True at a time (see the sketch just below).
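
As a rough sketch of how these settings fit together (the model name and the exact flag combination are illustrative, not a recipe):

from unsloth import FastLanguageModel

# Pick exactly one precision/quantization mode (see the notes above)
model, tokenizer = FastLanguageModel.from_pretrained(
    model_name="unsloth/mistral-7b",
    max_seq_length=2048,     # context length used for fine-tuning
    dtype=None,              # None = auto-detect; torch.float16 (T4/V100) or torch.bfloat16 (Ampere+)
    load_in_4bit=True,       # QLoRA-style 4-bit training; set to False for 16-bit LoRA
    # load_in_8bit=True,     # alternatively, 8-bit fine-tuning
    # full_finetuning=True,  # or full fine-tuning (FFT)
)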

For more information on configuring Unsloth's hyperparameters and features, read their documentation guide here.

Saving the model

Unsloth lets you save just the fine-tuned LoRA adapter, which is a small set of files, or push it to the Hugging Face Hub if you want to upload your model. Remember to create a Hugging Face access token and pass it when pushing.
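
A minimal sketch of both options (standard PEFT/Hub method names; the repository id and token are placeholders):

# Save only the LoRA adapter locally, along with the tokenizer
model.save_pretrained("lora_model")
tokenizer.save_pretrained("lora_model")

# Or push the adapter to the Hugging Face Hub (requires an access token)
model.push_to_hub("your-username/lora_model", token="hf_...")
tokenizer.push_to_hub("your-username/lora_model", token="hf_...")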

Saving to GGUF

To save to GGUF, Unsloth uses llama.cpp. To save locally:

model.save_pretrained_gguf("directory", tokenizer, quantization_method = "q4_k_m")
model.save_pretrained_gguf("directory", tokenizer, quantization_method = "q8_0")
model.save_pretrained_gguf("directory", tokenizer, quantization_method = "f16")

To push to the hub:

model.push_to_hub_gguf("hf_username/directory", tokenizer, quantization_method = "q4_k_m")
model.push_to_hub_gguf("hf_username/directory", tokenizer, quantization_method = "q8_0")

Saving to vLLM

To save to 16-bit for vLLM, use:

model.save_pretrained_merged("model", tokenizer, save_method = "merged_16bit")
model.push_to_hub_merged("hf/model", tokenizer, save_method = "merged_16bit", token = "")
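
The merged 16-bit checkpoint can then be loaded directly by vLLM. A minimal sketch using vLLM's offline Python API, assuming the "model" directory saved above:

from vllm import LLM, SamplingParams

# Load the merged 16-bit model saved with save_pretrained_merged(...)
llm = LLM(model="model")
sampling_params = SamplingParams(temperature=0.7, max_tokens=64)
outputs = llm.generate(["Explain LoRA in one sentence."], sampling_params)
print(outputs[0].outputs[0].text)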