Test model speed with vLLM for various parameters. Usage: chmod +x bench.sh && ./bench.sh
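Note: some of the checkpoints below (notably the Llama and Gemma models) are gated on Hugging Face, so you will likely need to authenticate before the first run, for example:

    huggingface-cli login    # or: export HF_TOKEN=<your token>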
#!/bin/bash
# ==============================================================================
# 🚀 AUTOMATED BENCHMARK SUITE - NVIDIA H200 (FIXED CLI VERSION)
# ==============================================================================

# Keep 5 prompts to showcase the H200's parallel throughput
COMMON_ARGS="--input-len 1000 --output-len 500 --num-prompts 5 --gpu-memory-utilization 0.95"
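# Note: with --num-prompts 5 and --output-len 500, the "output tokens/s" reported
# below is the aggregate across all concurrent requests; a rough per-request figure
# is that value divided by 5 (assuming all five prompts are batched together).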

# List of models to benchmark
MODELS=(
    "google/gemma-3-4b-it"
    "google/gemma-3-12b-it"
    "google/gemma-3-27b-it"
    "mistralai/Mistral-Small-24B-Instruct-2501"
    "meta-llama/Llama-3.3-70B-Instruct"
)

# Special configuration for the 70B model (speculative decoding draft)
BIG_MODEL="meta-llama/Llama-3.3-70B-Instruct"
DRAFT_MODEL="meta-llama/Llama-3.2-3B-Instruct"

OUTPUT_FILE="benchmark_results_h200.csv"
echo "Model,Configuration,Output Speed (tok/s)" > "$OUTPUT_FILE"

run_bench() {
    local model="$1"
    local name="$2"
    local extra_args="$3"

    echo "----------------------------------------------------------------"
    echo "🧪 Benchmarking: $model"
    echo "⚙️ Config: $name"
    echo "----------------------------------------------------------------"

    # --- FIX HERE: use 'vllm bench throughput' ---
    OUTPUT=$(vllm bench throughput \
        --model "$model" \
        --dtype bfloat16 \
        $COMMON_ARGS \
        $extra_args 2>&1)

    # Print the raw throughput line for visual inspection
    echo "$OUTPUT" | grep "Throughput:"

    # Extract the output throughput (output tokens/s - usually the 7th field)
    # Typical format: "Throughput: 1.42 requests/s, 2130.75 total tokens/s, 710.25 output tokens/s"
    RESULT=$(echo "$OUTPUT" | grep "Throughput:" | sed 's/,//g' | awk '{print $7}')
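    # Alternative sketch (assumption: the summary line always contains
    # "<value> output tokens/s"): anchoring on that label is more robust than a
    # fixed field position if the field order ever changes.
    # RESULT=$(echo "$OUTPUT" | grep -oE '[0-9.]+ output tokens/s' | awk '{print $1}')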
    if [ -z "$RESULT" ]; then
        RESULT="Error"
    fi

    echo "✅ Captured Speed: $RESULT tok/s"
    echo "$model,$name,$RESULT" >> "$OUTPUT_FILE"

    # Cleanup: clear the torch.compile cache between runs
    rm -rf ~/.cache/vllm/torch_compile_cache/* 2>/dev/null
    sleep 1
}
| echo "Starting Benchmark Suite (CLI Version)..." | |
| for model in "${MODELS[@]}"; do | |
| # 1. TEST STANDARD (BF16 - Qualité Max) | |
| run_bench "$model" "BF16 (Native)" "" | |
| # 2. TEST FP8 (Turbo - Vitesse Max) | |
| run_bench "$model" "FP8 (Turbo)" "--quantization fp8 --kv-cache-dtype fp8" | |
| # 3. TEST SPECULATIVE (Seulement pour le 70B) | |
| if [ "$model" == "$BIG_MODEL" ]; then | |
| echo "⏳ Downloading draft model for Speculative decoding (first run only)..." | |
| SPEC_ARGS="--speculative-model $DRAFT_MODEL --num-speculative-tokens 5 --quantization fp8 --kv-cache-dtype fp8" | |
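        # Note (assumption): newer vLLM releases may expose speculative decoding
        # through a single --speculative-config JSON argument instead of the two
        # flags above; check `vllm bench throughput --help` for what your version accepts.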
        run_bench "$model" "Speculative + FP8" "$SPEC_ARGS"
    fi
done

echo ""
echo "================================================================"
echo "🎉 RESULTS (Output tokens/s)"
echo "Reminder: this is the aggregate throughput across 5 concurrent users"
echo "----------------------------------------------------------------"
column -s, -t < "$OUTPUT_FILE"
echo "================================================================"