@ArthurDelannoyazerty
Created January 16, 2026 16:12
Test model speed with vLLM across various parameters. Run with: chmod +x bench.sh && ./bench.sh
#!/bin/bash
# ==============================================================================
# 🚀 AUTOMATED BENCHMARK SUITE - NVIDIA H200 (FIXED CLI VERSION)
# ==============================================================================
# Keep 5 prompts to show off the H200's parallel throughput
COMMON_ARGS="--input-len 1000 --output-len 500 --num-prompts 5 --gpu-memory-utilization 0.95"
# Model list
MODELS=(
"google/gemma-3-4b-it"
"google/gemma-3-12b-it"
"google/gemma-3-27b-it"
"mistralai/Mistral-Small-24B-Instruct-2501"
"meta-llama/Llama-3.3-70B-Instruct"
)
# Special configuration for the 70B model
BIG_MODEL="meta-llama/Llama-3.3-70B-Instruct"
DRAFT_MODEL="meta-llama/Llama-3.2-3B-Instruct"
OUTPUT_FILE="benchmark_results_h200.csv"
echo "Model,Configuration,Output Speed (tok/s)" > "$OUTPUT_FILE"
run_bench() {
  local model=$1
  local name=$2
  local extra_args=$3

  echo "----------------------------------------------------------------"
  echo "🧪 Benchmarking: $model"
  echo "⚙️  Config: $name"
  echo "----------------------------------------------------------------"

  # --- FIX: use the 'vllm bench throughput' subcommand ---
  # COMMON_ARGS and extra_args are left unquoted on purpose so they word-split into flags.
  OUTPUT=$(vllm bench throughput \
    --model "$model" \
    --dtype bfloat16 \
    $COMMON_ARGS \
    $extra_args 2>&1)

  # Print the raw throughput line for a visual sanity check
  echo "$OUTPUT" | grep "Throughput:"

  # Extract the output throughput (output tokens/s is the 7th field once commas are stripped)
  # Typical format: "Throughput: 1.42 requests/s, 2130.75 total tokens/s, 710.25 output tokens/s"
  RESULT=$(echo "$OUTPUT" | grep "Throughput:" | sed 's/,//g' | awk '{print $7}')
  if [ -z "$RESULT" ]; then
    RESULT="Error"
  fi

  echo "✅ Captured Speed: $RESULT tok/s"
  echo "$model,$name,$RESULT" >> "$OUTPUT_FILE"

  # Cleanup: drop the torch.compile cache between runs
  rm -rf ~/.cache/vllm/torch_compile_cache/* 2>/dev/null
  sleep 1
}
echo "Starting Benchmark Suite (CLI Version)..."

for model in "${MODELS[@]}"; do
  # 1. STANDARD TEST (BF16 - max quality)
  run_bench "$model" "BF16 (Native)" ""

  # 2. FP8 TEST (turbo - max speed)
  run_bench "$model" "FP8 (Turbo)" "--quantization fp8 --kv-cache-dtype fp8"

  # 3. SPECULATIVE DECODING TEST (70B only)
  if [ "$model" == "$BIG_MODEL" ]; then
    echo "⏳ Downloading draft model for speculative decoding (first run only)..."
    SPEC_ARGS="--speculative-model $DRAFT_MODEL --num-speculative-tokens 5 --quantization fp8 --kv-cache-dtype fp8"
    run_bench "$model" "Speculative + FP8" "$SPEC_ARGS"
  fi
done

echo ""
echo "================================================================"
echo "🎉 RESULTS (output tokens/s)"
echo "Reminder: this is the aggregate throughput across 5 simultaneous prompts"
echo "----------------------------------------------------------------"
column -s, -t "$OUTPUT_FILE"
echo "================================================================"
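Once the suite finishes, the CSV can be re-ranked by measured speed for a quick comparison. A minimal sketch, assuming the `benchmark_results_h200.csv` layout written above; the sample rows here are hypothetical placeholders, not real measurements:

```shell
# Hypothetical sample rows standing in for real bench.sh output (placeholder numbers).
cat > benchmark_results_h200.csv <<'EOF'
Model,Configuration,Output Speed (tok/s)
google/gemma-3-4b-it,BF16 (Native),900.10
google/gemma-3-4b-it,FP8 (Turbo),1200.50
EOF

# Keep the header, then sort the data rows numerically (descending) on the 3rd CSV field.
{ head -n 1 benchmark_results_h200.csv
  tail -n +2 benchmark_results_h200.csv | sort -t, -k3,3 -rn
} | column -s, -t
```

Rows whose speed column reads `Error` are non-numeric, so `sort -n` pushes them toward the bottom of the ranking.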