# Base model configuration
base_model: mistralai/Mistral-Small-24B-Base-2501
model_type: MistralForCausalLM
tokenizer_type: AutoTokenizer
trust_remote_code: true
tokenizer_use_fast: true

# Device settings - simpler approach for multi-GPU
# Use balanced loading with 4-bit quantization
device_map: "balanced"

# Memory settings - optimized for dual A40 GPUs
load_in_4bit: true # Essential for fitting 24B on A40s
load_in_8bit: false # Don't use 8-bit alongside 4-bit
bf16: true # Use bfloat16 for better numerical stability than fp16
low_cpu_mem_usage: true # Reduces CPU memory usage during model loading

# Memory and speed optimizations
flash_attention: true # Significant memory savings AND speed improvement
gradient_checkpointing: true

# Dataset configuration
datasets:
  - path: json
    data_files: ./chatlogs.jsonl
    type: completion
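# Illustrative record in ./chatlogs.jsonl (hypothetical content; the
# `completion` dataset type reads raw text from a "text" field by default):
#   {"text": "12:01 <alice> anyone around?\n12:02 <bob> yep, what's up?"}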
# Output tracking
dataset_prepared_path: last_run_prepared
output_dir: ./outputs/irc-mistral-24b-run1
val_set_size: 0.01

# Sequence and training settings
sequence_len: 4096 # captures ~200 IRC messages
sample_packing: true # Essential for efficient training with conversation data
pad_to_sequence_len: true # Helps with stable memory usage
train_on_inputs: true # Train on both inputs and outputs for conversation modeling
eval_sample_packing: false # Not supported together with sample_packing
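# With sample_packing enabled, several short IRC excerpts are concatenated into
# each 4096-token sequence, so little compute is spent on padding tokens.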
# LoRA configuration
adapter: lora
lora_r: 128 # Scaled up for 24B model (from 64 for 7B)
lora_alpha: 256 # Scaled up for 24B model (from 128 for 7B)
lora_dropout: 0.1 # Maintained from successful 7B runs
lora_target_modules: # Target all key transformer components
  - q_proj
  - v_proj
  - k_proj
  - o_proj
  - gate_proj
  - down_proj
  - up_proj
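# The LoRA scaling factor alpha / r stays at 256 / 128 = 2, the same ratio as
# the 7B runs (128 / 64), so the relative strength of the adapter update is unchanged.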
# Training hyperparameters - optimized for speed while maintaining quality
micro_batch_size: 1
gradient_accumulation_steps: 16 # Maintain effective batch size
num_epochs: 2 # Consistent with successful 7B runs
optimizer: adamw_torch # Standard optimizer choice for LLMs
lr_scheduler: cosine # Smooth learning rate decay
learning_rate: 0.00008 # Scaled down for 24B stability
weight_decay: 0.01 # Consistent with successful 7B runs
warmup_ratio: 0.05 # Add proper warmup for training stability
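# Effective batch size: with device_map "balanced" the model is sharded across
# the two A40s rather than replicated, so each optimizer step sees
# micro_batch_size * gradient_accumulation_steps = 1 * 16 = 16 packed sequences.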
# Evaluation and checkpointing
evals_per_epoch: 3
include_tokens_per_second: true # Track performance metrics

# Performance and quality monitoring
group_by_length: true # Group similar sequence lengths for efficiency
shuffle_merged_datasets: true # Ensure proper dataset shuffling

# Wandb integration
wandb_project: irc-llm-training
wandb_entity: davidar
wandb_name: irc-mistral-24b-run1
wandb_log_model: "false"
eval_table_size: 5 # Show 5 samples in the WandB UI for qualitative assessment

# Mistral model configuration
is_mistral_derived_model: true
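# Usage sketch (assumes Axolotl is installed; the filename below is hypothetical):
#   accelerate launch -m axolotl.cli.train irc-mistral-24b.yml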