# Axolotl config: LoRA fine-tune of Mistral-Small-24B-Base on IRC chat logs
# Gist by @davidar, created May 17, 2025
# Base model configuration
base_model: mistralai/Mistral-Small-24B-Base-2501
model_type: MistralForCausalLM
tokenizer_type: AutoTokenizer
trust_remote_code: true
tokenizer_use_fast: true
# Device settings - simpler approach for multi-GPU
# Use balanced loading with 4-bit quantization
device_map: "balanced"
# Memory settings - optimized for dual A40 GPUs
load_in_4bit: true # Essential for fitting 24B on A40s
load_in_8bit: false # Don't use 8-bit alongside 4-bit
bf16: true # Use bfloat16 for better numerical stability than fp16
low_cpu_mem_usage: true # Reduces CPU memory usage during model loading
# Memory and speed optimizations
flash_attention: true # Significant memory savings AND speed improvement
gradient_checkpointing: true
# Dataset configuration
datasets:
  - path: json
    data_files: ./chatlogs.jsonl
    type: completion
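# Sketch of what one line of chatlogs.jsonl is assumed to look like: the
# `completion` dataset type trains on a raw `text` field per JSONL record, so
# each record would hold a chunk of raw IRC log. The field name and log
# formatting below are illustrative, not taken from the gist:
# {"text": "<alice> anyone around?\n<bob> yeah, what's up?"}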
# Output tracking
dataset_prepared_path: last_run_prepared
output_dir: ./outputs/irc-mistral-24b-run1
val_set_size: 0.01
# Sequence and training settings
sequence_len: 4096 # captures ~200 IRC messages
sample_packing: true # Essential for efficient training with conversation data
pad_to_sequence_len: true # Helps with stable memory usage
train_on_inputs: true # Train on both inputs and outputs for conversation modeling
eval_sample_packing: false # Keep the eval split unpacked; packing it together with sample_packing is a common source of eval errors
# LoRA configuration
adapter: lora
lora_r: 128 # Scaled up for 24B model (from 64 for 7B)
lora_alpha: 256 # Scaled up for 24B model (from 128 for 7B)
lora_dropout: 0.1 # Maintained from successful 7B runs
lora_target_modules: # Target all key transformer components
- q_proj
- v_proj
- k_proj
- o_proj
- gate_proj
- down_proj
- up_proj
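# Note: alpha/r stays at 2 (256/128 here vs 128/64 on the 7B runs), so the
# effective LoRA scaling factor matches the smaller-model setup.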
# Training hyperparameters - optimized for speed while maintaining quality
micro_batch_size: 1
gradient_accumulation_steps: 16 # Maintain effective batch size
num_epochs: 2 # Consistent with successful 7B runs
optimizer: adamw_torch # Standard optimizer choice for LLMs
lr_scheduler: cosine # Smooth learning rate decay
learning_rate: 0.00008 # Scaled down for 24B stability
weight_decay: 0.01 # Consistent with successful 7B runs
warmup_ratio: 0.05 # Add proper warmup for training stability
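# Rough batch-size arithmetic, assuming a single training process (device_map
# "balanced" shards the model across the two A40s rather than running data
# parallel): effective batch = micro_batch_size (1) x gradient_accumulation_steps (16)
# = 16 packed 4096-token sequences per optimizer step.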
# Evaluation and checkpointing
evals_per_epoch: 3
include_tokens_per_second: true # Track performance metrics
# Performance and quality monitoring
group_by_length: true # Group similar sequence lengths for efficiency
shuffle_merged_datasets: true # Ensure proper dataset shuffling
# Wandb integration
wandb_project: irc-llm-training
wandb_entity: davidar
wandb_name: irc-mistral-24b-run1
wandb_log_model: "false"
eval_table_size: 5 # Show 5 samples in WandB UI for qualitative assessment
# Mistral model configuration
is_mistral_derived_model: true
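# Typical Axolotl invocation for a config like this (the filename is
# illustrative; point it at wherever this YAML is saved):
#   python -m axolotl.cli.preprocess irc-mistral-24b.yml
#   accelerate launch -m axolotl.cli.train irc-mistral-24b.yml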