shunting314/try-vllm-offline-api.py

## try-vllm-offline-api.py
import torch
from torch import nn
from torch import distributed
import contextlib
import os

from vllm import LLM, SamplingParams

# vLLM environment knobs for this script.
# NOTE(review): presumably "0" keeps the V1 engine inside this process so the
# torch profiler used further down can observe it -- confirm against vLLM docs.
os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
# Respect an attention backend already chosen by the caller's environment;
# otherwise fall back to FLEX_ATTENTION.
os.environ.setdefault("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")

class script_args:
    """Hard-coded settings for this script; edit the values below to change a run."""

    # Checkpoint handed to vLLM's LLM(model=...).
    # Alternative checkpoint: "Qwen/Qwen3-0.6B"
    model_name = "meta-llama/Meta-Llama-3-8B"

    # True: run a warm-up generate(), wrap the measured generate() in
    # torch.profiler, and export a Chrome trace afterwards.
    profile = True

    # True: pass compilation_config=None to LLM.
    # False: pass a CompilationConfig with mode and cudagraph_mode set to NONE.
    compile = True

if __name__ == "__main__":
    # Use a real profiler only when requested; nullcontext() keeps the `with`
    # statement below valid when profiling is off.
    if script_args.profile:
        profile = torch.profiler.profile(with_stack=True)
    else:
        profile = contextlib.nullcontext()

    # compile=True passes no override (compilation_config=None); compile=False
    # passes a config whose compilation mode and CUDA graph mode are both NONE.
    if script_args.compile:
        compilation_config = None
    else:
        from vllm.config import CompilationConfig, CUDAGraphMode, CompilationMode
        compilation_config = CompilationConfig(cudagraph_mode=CUDAGraphMode.NONE, mode=CompilationMode.NONE)

    llm = LLM(model=script_args.model_name, compilation_config=compilation_config)
    sampling_params = SamplingParams(temperature=0.7, max_tokens=128)
    # Alternative: sampling_params = SamplingParams(temperature=0, max_tokens=128)

    # The same prompt is submitted four times in one batch.
    requests = [
        # "Tell me a joke.",
        "How to estimate the value of pi in mathematics?",
        "How to estimate the value of pi in mathematics?",
        "How to estimate the value of pi in mathematics?",
        "How to estimate the value of pi in mathematics?",
        # "How does quicksort work?",
    ]

    if script_args.profile:
        # Warm-up pass before the profiled run (presumably to keep one-time
        # start-up costs out of the trace); its outputs are not needed.
        llm.generate(requests, sampling_params)

    with profile:
        outputs = llm.generate(requests, sampling_params)

    assert len(outputs) == len(requests), (
        f"expected {len(requests)} outputs, got {len(outputs)}"
    )
    # Print the first completion of each request, in request order.
    for i, output in enumerate(outputs):
        print(f"Response for request {i}: {output.outputs[0].text}")

    if script_args.profile:
        path = "/tmp/profile.json"
        profile.export_chrome_trace(path)
        print(f"Profile written to {path}")
	import torch
	from torch import nn
	from torch import distributed
	import contextlib
	import os

	from vllm import LLM, SamplingParams

	# vLLM environment knobs; an attention backend already present in the
	# environment is kept, otherwise FLEX_ATTENTION is used.
	os.environ["VLLM_ENABLE_V1_MULTIPROCESSING"] = "0"
	os.environ["VLLM_ATTENTION_BACKEND"] = os.getenv("VLLM_ATTENTION_BACKEND", "FLEX_ATTENTION")

	class script_args:
	    """Hard-coded settings for this script; edit the values below to change a run."""

	    # Alternative checkpoint: "Qwen/Qwen3-0.6B"
	    model_name = "meta-llama/Meta-Llama-3-8B"
	    # True: warm up, profile the measured generate() call, export a trace.
	    profile = True
	    # True: compilation_config=None; False: compilation and CUDA graphs set to NONE.
	    compile = True

	if __name__ == "__main__":
	    # Use a real profiler only when requested; nullcontext() keeps the `with`
	    # statement below valid when profiling is off.
	    if script_args.profile:
	        profile = torch.profiler.profile(with_stack=True)
	    else:
	        profile = contextlib.nullcontext()


	    if script_args.compile:
	        compilation_config = None
	    else:
	        from vllm.config import CompilationConfig, CUDAGraphMode, CompilationMode
	        compilation_config = CompilationConfig(cudagraph_mode=CUDAGraphMode.NONE, mode=CompilationMode.NONE)

	    llm = LLM(model=script_args.model_name, compilation_config=compilation_config)
	    sampling_params = SamplingParams(temperature=0.7, max_tokens=128)
	    # Alternative: sampling_params = SamplingParams(temperature=0, max_tokens=128)

	    # The same prompt is submitted four times in one batch.
	    requests = [
	        # "Tell me a joke.",
	        "How to estimate the value of pi in mathematics?",
	        "How to estimate the value of pi in mathematics?",
	        "How to estimate the value of pi in mathematics?",
	        "How to estimate the value of pi in mathematics?",
	        # "How does quicksort work?",
	    ]

	    if script_args.profile:
	        # Warm-up pass before the profiled run (presumably to keep one-time
	        # start-up costs out of the trace); its outputs are not needed.
	        llm.generate(requests, sampling_params)

	    with profile:
	        outputs = llm.generate(requests, sampling_params)

	    assert len(outputs) == len(requests), (
	        f"expected {len(requests)} outputs, got {len(outputs)}"
	    )
	    # Print the first completion of each request, in request order.
	    for i, output in enumerate(outputs):
	        print(f"Response for request {i}: {output.outputs[0].text}")

	    if script_args.profile:
	        path = "/tmp/profile.json"
	        profile.export_chrome_trace(path)
	        print(f"Profile written to {path}")
No results found