@rizar
Created September 26, 2025 01:06
bench
#!/usr/bin/env python3
"""
vLLM Load Test Script
Makes a configurable number of concurrent requests (default 512) to the
vLLM server at localhost:8000 and reports aggregate statistics.
"""
import sys
import asyncio
import os
from pprint import pprint
import aiohttp
import time
import json
from typing import Dict, Any


async def make_request(session: aiohttp.ClientSession, request_id: int) -> Dict[str, Any]:
    """Make a single request to the vLLM server."""
    url = "http://localhost:8000/v1/completions"
    # Sample payload for a vLLM completion request
    payload = {
        "model": os.environ["LOCAL_MODEL"],  # Adjust model name as needed
        "prompt": f"Hello, this is request #{request_id}. Please write a very long novel.",
        "max_tokens": 512,
        "temperature": 0.7,
        "top_p": 0.9,
        "stream": False
    }
    headers = {
        "Content-Type": "application/json"
    }
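    # A successful response is OpenAI-style JSON, roughly:
    #   {"choices": [{"text": ...}],
    #    "usage": {"prompt_tokens": ..., "completion_tokens": ..., "total_tokens": ...}}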
    start = time.time()
    try:
        async with session.post(url, json=payload, headers=headers) as response:
            result = {
                "request_id": request_id,
                "status_code": response.status,
                "success": response.status == 200,
                "response_time": None,
                "error": None,
                "tokens_generated": 0
            }
            if response.status == 200:
                try:
                    response_data = await response.json()
                    result["response_data"] = response_data
                    # Extract token count from response
                    if "usage" in response_data and "completion_tokens" in response_data["usage"]:
                        result["tokens_generated"] = response_data["usage"]["completion_tokens"]
                except Exception as e:
                    result["error"] = f"Failed to parse JSON: {str(e)}"
                    result["success"] = False
            else:
                result["error"] = f"HTTP {response.status}: {await response.text()}"
            # Record per-request latency (this field was previously declared but never set)
            result["response_time"] = time.time() - start
            return result
    except asyncio.TimeoutError:
        return {
            "request_id": request_id,
            "status_code": None,
            "success": False,
            "error": "Request timeout",
            "response_time": None,
            "tokens_generated": 0
        }
    except Exception as e:
        return {
            "request_id": request_id,
            "status_code": None,
            "success": False,
            "error": f"Request failed: {str(e)}",
            "response_time": None,
            "tokens_generated": 0
        }


async def run_load_test(num_requests: int = 512) -> Dict[str, Any]:
    """Run the load test with the specified number of concurrent requests and return a stats dictionary."""
    print(f"Starting load test with {num_requests} concurrent requests to vLLM at localhost:8000")
    print("=" * 70)
    # Configure the session: a generous total timeout, and connection limits high
    # enough that all requests can be in flight at once instead of queuing client-side
    timeout = aiohttp.ClientTimeout(total=3600)
    connector = aiohttp.TCPConnector(limit=num_requests, limit_per_host=num_requests)
    async with aiohttp.ClientSession(timeout=timeout, connector=connector) as session:
        # Warmup phase - 128 requests without logging stats
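        # (Untimed warmup presumably lets the server absorb one-time costs such as
        # cache population or CUDA graph capture before the measured run starts.)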
print("Running warmup with 128 requests...")
warmup_tasks = [make_request(session, i+1) for i in range(128)]
await asyncio.gather(*warmup_tasks, return_exceptions=True)
print("Warmup completed.")
start_time = time.time()
# Create all request tasks for main benchmark
tasks = [make_request(session, i+1) for i in range(num_requests)]
# Execute all requests concurrently
print(f"Sending {num_requests} requests...")
results = await asyncio.gather(*tasks, return_exceptions=True)
end_time = time.time()
total_time = end_time - start_time
# Process results
successful_requests = 0
failed_requests = 0
error_counts = {}
total_tokens = 0
for result in results:
if isinstance(result, Exception):
failed_requests += 1
error_type = type(result).__name__
error_counts[error_type] = error_counts.get(error_type, 0) + 1
elif result.get("success", False):
successful_requests += 1
total_tokens += result.get("tokens_generated", 0)
else:
failed_requests += 1
error = result.get("error", "Unknown error")
error_counts[error] = error_counts.get(error, 0) + 1
# Create stats dictionary
stats = {
"total_requests": num_requests,
"successful_requests": successful_requests,
"failed_requests": failed_requests,
"success_rate": (successful_requests/num_requests)*100,
"total_time": total_time,
"requests_per_second": num_requests/total_time if total_time > 0 else 0,
"average_response_time": total_time/num_requests if num_requests > 0 else 0,
"total_tokens_generated": total_tokens,
"tokens_per_second": total_tokens/total_time if total_time > 0 else 0,
"average_tokens_per_request": total_tokens/successful_requests if successful_requests > 0 else 0,
"error_counts": error_counts
}
# Print summary
print("\n" + "=" * 70)
print("LOAD TEST RESULTS")
print("=" * 70)
pprint(stats)
if stats['error_counts']:
print("\nError breakdown:")
for error, count in stats['error_counts'].items():
print(f" {error}: {count}")
return stats


async def main():
    """Main function to run the load test."""
    os.makedirs(os.environ['RESULTS_DIR'], exist_ok=True)
    try:
        # Request count comes from the first command-line argument (default 512)
        num_requests = int(sys.argv[1]) if len(sys.argv) > 1 else 512
        stats = await run_load_test(num_requests)
        # Optionally save stats to file
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        filename = f"{os.environ['RESULTS_DIR']}/vllm_load_test_stats_{timestamp}.json"
        with open(filename, 'w') as f:
            json.dump(stats, f, indent=2, default=str)
        print(f"Stats saved to {filename}")
    except KeyboardInterrupt:
        print("\nLoad test interrupted by user")
    except Exception as e:
        print(f"Load test failed: {str(e)}")


if __name__ == "__main__":
    # Run the async main function
    asyncio.run(main())