@rizar
Created September 26, 2025 01:06
bench
#!/usr/bin/env python3
"""
vLLM Load Test Script
Makes a configurable number of concurrent requests (default 512) to the
vLLM server at localhost:8000 and reports aggregate statistics.
"""
import sys
import asyncio
import os
from pprint import pprint
import aiohttp
import time
import json
from typing import Dict, Any


async def make_request(session: aiohttp.ClientSession, request_id: int) -> Dict[str, Any]:
    """Make a single request to the vLLM server."""
    url = "http://localhost:8000/v1/completions"
    # Sample payload for a vLLM completion request
    payload = {
        "model": os.environ["LOCAL_MODEL"],  # Adjust model name as needed
        "prompt": f"Hello, this is request #{request_id}. Please write a very long novel.",
        "max_tokens": 512,
        "temperature": 0.7,
        "top_p": 0.9,
        "stream": False
    }
    headers = {
        "Content-Type": "application/json"
    }
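    # A successful response is OpenAI-style JSON, roughly:
    #   {"choices": [{"text": ...}],
    #    "usage": {"prompt_tokens": ..., "completion_tokens": ..., "total_tokens": ...}}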
    start = time.time()
    try:
        async with session.post(url, json=payload, headers=headers) as response:
            result = {
                "request_id": request_id,
                "status_code": response.status,
                "success": response.status == 200,
                "response_time": None,
                "error": None,
                "tokens_generated": 0
            }
            if response.status == 200:
                try:
                    response_data = await response.json()
                    result["response_data"] = response_data
                    # Extract token count from response
                    if "usage" in response_data and "completion_tokens" in response_data["usage"]:
                        result["tokens_generated"] = response_data["usage"]["completion_tokens"]
                except Exception as e:
                    result["error"] = f"Failed to parse JSON: {str(e)}"
                    result["success"] = False
            else:
                result["error"] = f"HTTP {response.status}: {await response.text()}"
            # Record per-request latency (this field was previously declared but never set)
            result["response_time"] = time.time() - start
            return result
    except asyncio.TimeoutError:
        return {
            "request_id": request_id,
            "status_code": None,
            "success": False,
            "error": "Request timeout",
            "response_time": None,
            "tokens_generated": 0
        }
    except Exception as e:
        return {
            "request_id": request_id,
            "status_code": None,
            "success": False,
            "error": f"Request failed: {str(e)}",
            "response_time": None,
            "tokens_generated": 0
        }


async def run_load_test(num_requests: int = 512) -> Dict[str, Any]:
    """Run the load test with the specified number of concurrent requests and return a stats dictionary."""
    print(f"Starting load test with {num_requests} concurrent requests to vLLM at localhost:8000")
    print("=" * 70)
    # Configure the session: a generous total timeout, and connection limits high
    # enough that all requests can be in flight at once instead of queuing client-side
    timeout = aiohttp.ClientTimeout(total=3600)
    connector = aiohttp.TCPConnector(limit=num_requests, limit_per_host=num_requests)
    async with aiohttp.ClientSession(timeout=timeout, connector=connector) as session:
        # Warmup phase - 128 requests without logging stats
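        # (Untimed warmup presumably lets the server absorb one-time costs such as
        # cache population or CUDA graph capture before the measured run starts.)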
print("Running warmup with 128 requests...")
warmup_tasks = [make_request(session, i+1) for i in range(128)]
await asyncio.gather(*warmup_tasks, return_exceptions=True)
print("Warmup completed.")
start_time = time.time()
# Create all request tasks for main benchmark
tasks = [make_request(session, i+1) for i in range(num_requests)]
# Execute all requests concurrently
print(f"Sending {num_requests} requests...")
results = await asyncio.gather(*tasks, return_exceptions=True)
end_time = time.time()
total_time = end_time - start_time
# Process results
successful_requests = 0
failed_requests = 0
error_counts = {}
total_tokens = 0
for result in results:
if isinstance(result, Exception):
failed_requests += 1
error_type = type(result).__name__
error_counts[error_type] = error_counts.get(error_type, 0) + 1
elif result.get("success", False):
successful_requests += 1
total_tokens += result.get("tokens_generated", 0)
else:
failed_requests += 1
error = result.get("error", "Unknown error")
error_counts[error] = error_counts.get(error, 0) + 1
# Create stats dictionary
stats = {
"total_requests": num_requests,
"successful_requests": successful_requests,
"failed_requests": failed_requests,
"success_rate": (successful_requests/num_requests)*100,
"total_time": total_time,
"requests_per_second": num_requests/total_time if total_time > 0 else 0,
"average_response_time": total_time/num_requests if num_requests > 0 else 0,
"total_tokens_generated": total_tokens,
"tokens_per_second": total_tokens/total_time if total_time > 0 else 0,
"average_tokens_per_request": total_tokens/successful_requests if successful_requests > 0 else 0,
"error_counts": error_counts
}
# Print summary
print("\n" + "=" * 70)
print("LOAD TEST RESULTS")
print("=" * 70)
pprint(stats)
if stats['error_counts']:
print("\nError breakdown:")
for error, count in stats['error_counts'].items():
print(f" {error}: {count}")
return stats


async def main():
    """Main function to run the load test."""
    os.makedirs(os.environ['RESULTS_DIR'], exist_ok=True)
    try:
        # Request count comes from the first command-line argument (default 512)
        num_requests = int(sys.argv[1]) if len(sys.argv) > 1 else 512
        stats = await run_load_test(num_requests)
        # Optionally save stats to file
        timestamp = time.strftime("%Y%m%d_%H%M%S")
        filename = f"{os.environ['RESULTS_DIR']}/vllm_load_test_stats_{timestamp}.json"
        with open(filename, 'w') as f:
            json.dump(stats, f, indent=2, default=str)
        print(f"Stats saved to {filename}")
    except KeyboardInterrupt:
        print("\nLoad test interrupted by user")
    except Exception as e:
        print(f"Load test failed: {str(e)}")


if __name__ == "__main__":
    # Run the async main function
    asyncio.run(main())