bench
#!/usr/bin/env python3
"""
vLLM Load Test Script
Makes N concurrent requests (default 512) to the vLLM server at localhost:8000.
"""
import sys
import asyncio
import os
from pprint import pprint
import aiohttp
import time
import json
from typing import Dict, Any

async def make_request(session: aiohttp.ClientSession, request_id: int) -> Dict[str, Any]:
    """Make a single request to the vLLM server and return a per-request result dict."""
    url = "http://localhost:8000/v1/completions"

    # Sample payload for a vLLM completion request
    payload = {
        "model": os.environ["LOCAL_MODEL"],  # Adjust model name as needed
        "prompt": f"Hello, this is request #{request_id}. Please write a very long novel.",
        "max_tokens": 512,
        "temperature": 0.7,
        "top_p": 0.9,
        "stream": False
    }
    headers = {
        "Content-Type": "application/json"
    }

    start_time = time.time()
    try:
        async with session.post(url, json=payload, headers=headers) as response:
            result = {
                "request_id": request_id,
                "status_code": response.status,
                "success": response.status == 200,
                "response_time": None,
                "error": None,
                "tokens_generated": 0
            }
            if response.status == 200:
                try:
                    response_data = await response.json()
                    result["response_data"] = response_data
                    # Extract the generated-token count from the usage block
                    if "usage" in response_data and "completion_tokens" in response_data["usage"]:
                        result["tokens_generated"] = response_data["usage"]["completion_tokens"]
                except Exception as e:
                    result["error"] = f"Failed to parse JSON: {str(e)}"
                    result["success"] = False
            else:
                result["error"] = f"HTTP {response.status}: {await response.text()}"
            # Record wall-clock latency for this request
            result["response_time"] = time.time() - start_time
            return result
    except asyncio.TimeoutError:
        return {
            "request_id": request_id,
            "status_code": None,
            "success": False,
            "error": "Request timeout",
            "response_time": time.time() - start_time,
            "tokens_generated": 0
        }
    except Exception as e:
        return {
            "request_id": request_id,
            "status_code": None,
            "success": False,
            "error": f"Request failed: {str(e)}",
            "response_time": time.time() - start_time,
            "tokens_generated": 0
        }

async def run_load_test(num_requests: int = 512) -> Dict[str, Any]:
    """Run the load test with the specified number of concurrent requests and return a stats dictionary."""
    print(f"Starting load test with {num_requests} concurrent requests to vLLM at localhost:8000")
    print("=" * 70)

    # Configure the session with a generous timeout and enough connections for full concurrency
    timeout = aiohttp.ClientTimeout(total=3600)
    connector = aiohttp.TCPConnector(limit=num_requests, limit_per_host=num_requests)

    async with aiohttp.ClientSession(timeout=timeout, connector=connector) as session:
        # Warmup phase - 128 requests whose results are discarded
        print("Running warmup with 128 requests...")
        warmup_tasks = [make_request(session, i + 1) for i in range(128)]
        await asyncio.gather(*warmup_tasks, return_exceptions=True)
        print("Warmup completed.")

        start_time = time.time()

        # Create all request tasks for the main benchmark
        tasks = [make_request(session, i + 1) for i in range(num_requests)]

        # Execute all requests concurrently
        print(f"Sending {num_requests} requests...")
        results = await asyncio.gather(*tasks, return_exceptions=True)

        end_time = time.time()
        total_time = end_time - start_time

        # Process results
        successful_requests = 0
        failed_requests = 0
        error_counts = {}
        total_tokens = 0
        response_times = []

        for result in results:
            if isinstance(result, Exception):
                failed_requests += 1
                error_type = type(result).__name__
                error_counts[error_type] = error_counts.get(error_type, 0) + 1
            elif result.get("success", False):
                successful_requests += 1
                total_tokens += result.get("tokens_generated", 0)
                if result.get("response_time") is not None:
                    response_times.append(result["response_time"])
            else:
                failed_requests += 1
                error = result.get("error", "Unknown error")
                error_counts[error] = error_counts.get(error, 0) + 1

        # Create the stats dictionary
        stats = {
            "total_requests": num_requests,
            "successful_requests": successful_requests,
            "failed_requests": failed_requests,
            "success_rate": (successful_requests / num_requests) * 100,
            "total_time": total_time,
            "requests_per_second": num_requests / total_time if total_time > 0 else 0,
            # Mean measured per-request latency in seconds (successful requests only)
            "average_response_time": sum(response_times) / len(response_times) if response_times else 0,
            "total_tokens_generated": total_tokens,
            "tokens_per_second": total_tokens / total_time if total_time > 0 else 0,
            "average_tokens_per_request": total_tokens / successful_requests if successful_requests > 0 else 0,
            "error_counts": error_counts
        }

        # Print summary
        print("\n" + "=" * 70)
        print("LOAD TEST RESULTS")
        print("=" * 70)
        pprint(stats)

        if stats["error_counts"]:
            print("\nError breakdown:")
            for error, count in stats["error_counts"].items():
                print(f"  {error}: {count}")

        return stats

async def main():
    """Main function to run the load test."""
    os.makedirs(os.environ["RESULTS_DIR"], exist_ok=True)
    try:
        # Number of requests comes from the first CLI argument, defaulting to 512
        num_requests = int(sys.argv[1]) if len(sys.argv) > 1 else 512
        stats = await run_load_test(num_requests)

        # Optionally save stats to a timestamped JSON file
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        filename = f"{os.environ['RESULTS_DIR']}/vllm_load_test_stats_{timestamp}.json"
        with open(filename, "w") as f:
            json.dump(stats, f, indent=2, default=str)
        print(f"Stats saved to {filename}")
    except KeyboardInterrupt:
        print("\nLoad test interrupted by user")
    except Exception as e:
        print(f"Load test failed: {str(e)}")


if __name__ == "__main__":
    # Run the async main function
    asyncio.run(main())
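
Usage note (not part of the gist): the script reads LOCAL_MODEL and RESULTS_DIR from the environment, takes the request count as its first CLI argument, and assumes a vLLM server is already listening on localhost:8000. The sketch below shows one way run_load_test could be driven programmatically, assuming the file above is saved as bench.py; the model name and results directory are placeholder values, not values from the gist.

# Minimal sketch of driving the benchmark programmatically instead of via the CLI.
# Assumes the gist above is saved as bench.py and a vLLM server is running locally.
import asyncio
import os

os.environ.setdefault("LOCAL_MODEL", "your-model-name")  # placeholder: model served by vLLM
os.environ.setdefault("RESULTS_DIR", "results")          # placeholder output directory

from bench import run_load_test

stats = asyncio.run(run_load_test(num_requests=64))
print(stats["tokens_per_second"], stats["success_rate"])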