@cyysky
Last active January 13, 2026 08:03
tencent/WeDLM-8B-Instruct OpenAI-compatible server (20251230_wedlm_openai_server.py)
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import List, Optional, Dict, Any
import uvicorn
import time
from transformers import AutoTokenizer
from wedlm import LLM, SamplingParams

app = FastAPI(title="WeDLM OpenAI Compatible API")

# Add CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Initialize model and tokenizer
try:
    print("Loading WeDLM model...")
    llm = LLM(model="WeDLM-8B-Instruct")
    tokenizer = AutoTokenizer.from_pretrained("WeDLM-8B-Instruct", trust_remote_code=True)
    print("Model loaded successfully!")
except Exception as e:
    print(f"Error loading model: {e}")
    llm = None
    tokenizer = None

# Pydantic models for request/response
class ModelInfo(BaseModel):
    id: str
    object: str = "model"
    created: int
    owned_by: str = "organization-name"

class Message(BaseModel):
    role: str
    content: str

class ChatCompletionRequest(BaseModel):
    model: str
    messages: List[Message]
    temperature: Optional[float] = 0.7
    max_tokens: Optional[int] = 512
    top_p: Optional[float] = 1.0
    frequency_penalty: Optional[float] = 0.0
    presence_penalty: Optional[float] = 0.0
    stream: Optional[bool] = False

class ChatCompletionChoice(BaseModel):
    index: int
    message: Message
    finish_reason: Optional[str] = "stop"

class Usage(BaseModel):
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int

class ChatCompletionResponse(BaseModel):
    id: str
    object: str = "chat.completion"
    created: int
    model: str
    choices: List[ChatCompletionChoice]
    usage: Usage

# Routes
@app.get("/v1/models", response_model=List[ModelInfo])
async def list_models():
    """List available models"""
    if llm is None:
        raise HTTPException(status_code=503, detail="Model not loaded")
    return [
        ModelInfo(
            id="WeDLM-8B-Instruct",
            created=int(time.time()),
            owned_by="WeDLM"
        )
    ]

@app.get("/v1/models/{model_name}", response_model=ModelInfo)
async def get_model_info(model_name: str):
    """Get information about a specific model"""
    if llm is None:
        raise HTTPException(status_code=503, detail="Model not loaded")
    if model_name != "WeDLM-8B-Instruct":
        raise HTTPException(status_code=404, detail="Model not found")
    return ModelInfo(
        id=model_name,
        created=int(time.time()),
        owned_by="WeDLM"
    )

@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def create_chat_completion(request: ChatCompletionRequest):
    """Create chat completion"""
    if llm is None:
        raise HTTPException(status_code=503, detail="Model not loaded")
    if request.model != "WeDLM-8B-Instruct":
        raise HTTPException(status_code=404, detail="Model not found")
    try:
        # Convert messages to the format expected by the tokenizer
        messages_content = [{"role": msg.role, "content": msg.content} for msg in request.messages]

        # Apply chat template
        text = tokenizer.apply_chat_template(
            messages_content,
            tokenize=False,
            add_generation_prompt=True
        )

        # Generate response
        sampling_params = SamplingParams(
            temperature=request.temperature,
            max_tokens=request.max_tokens,
            top_p=request.top_p
        )
        outputs = llm.generate([text], sampling_params)
        generated_text = outputs[0]["text"]

        # Extract assistant's response (remove the prompt)
        assistant_response = generated_text.split("assistant\n")[-1].strip()

        # Calculate token counts
        prompt_tokens = len(tokenizer.encode(text))
        completion_tokens = len(tokenizer.encode(assistant_response))

        response = ChatCompletionResponse(
            id=f"chatcmpl-{int(time.time())}",
            created=int(time.time()),
            model=request.model,
            choices=[
                ChatCompletionChoice(
                    index=0,
                    message=Message(role="assistant", content=assistant_response),
                    finish_reason="stop"
                )
            ],
            usage=Usage(
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
                total_tokens=prompt_tokens + completion_tokens
            )
        )
        return response
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Generation error: {str(e)}")

@app.get("/health")
async def health_check():
    """Health check endpoint"""
    if llm is None:
        return {"status": "unhealthy", "detail": "Model not loaded"}
    return {"status": "healthy"}

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)

WeDLM OpenAI-Compatible Server Setup

This gist provides a complete setup guide for deploying Tencent's WeDLM model behind an OpenAI-compatible API server built with FastAPI. Follow the instructions below to install dependencies, download the model, and launch the server.

Prerequisites

  • Operating System: Linux or macOS (with bash support)
  • Python: Version 3.13 or higher
  • Hardware: GPU with sufficient VRAM to run WeDLM-8B-Instruct (recommended: 24GB+ VRAM)
  • Disk Space: At least 20GB for model files and dependencies

Installation

Execute the following commands in your terminal to set up the environment and install all required components:

# Install uv package manager
wget -qO- https://astral.sh/uv/install.sh | sh

# Create Python 3.13 virtual environment
uv venv --python 3.13 venv-wedlm

# Activate virtual environment
source venv-wedlm/bin/activate

# Install WeDLM from source
pip install git+https://github.com/tencent/WeDLM.git

# Download WeDLM-8B-Instruct model files
hf download tencent/WeDLM-8B-Instruct --local-dir WeDLM-8B-Instruct

# Install flash-attention for optimized inference
uv pip install flash-attn --no-build-isolation

# Install FastAPI and server dependencies
uv pip install fastapi uvicorn pydantic
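
To confirm that everything installed correctly before launching the server, you can run a quick import check from the activated virtual environment. This is a minimal sketch; it assumes PyTorch was pulled in as a dependency of the WeDLM install and that the model files sit in the local WeDLM-8B-Instruct directory created by the download step above.

# verify_install.py - quick sanity check of the environment
import torch
from transformers import AutoTokenizer
from wedlm import LLM, SamplingParams  # installed from the GitHub repository above

print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())

# Loading the tokenizer from the downloaded directory confirms the model files are in place
tokenizer = AutoTokenizer.from_pretrained("WeDLM-8B-Instruct", trust_remote_code=True)
print("Tokenizer loaded, vocabulary size:", len(tokenizer))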

Command Explanations

The installation process consists of several steps, each serving a specific purpose in setting up your development environment.

uv is a modern, extremely fast Python package installer and environment manager written in Rust. It significantly accelerates the installation process compared to traditional tools like pip.

The virtual environment creation isolates your WeDLM dependencies from other Python projects on your system, preventing version conflicts and ensuring reproducible deployments.

WeDLM installation pulls the latest version directly from Tencent's GitHub repository, ensuring you have access to the most recent features and bug fixes.

Model download retrieves the WeDLM-8B-Instruct model weights from HuggingFace Hub and stores them locally in the WeDLM-8B-Instruct directory for inference.
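
If you prefer to script the download instead of using the hf CLI, the huggingface_hub library (installed alongside transformers) exposes snapshot_download; the sketch below is intended as an equivalent of the hf download command above.

# Download the model weights into the local WeDLM-8B-Instruct directory
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="tencent/WeDLM-8B-Instruct",
    local_dir="WeDLM-8B-Instruct",
)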

flash-attn provides an optimized FlashAttention implementation that substantially reduces memory usage and increases inference speed for transformer models, making large-model deployment more efficient.

FastAPI, uvicorn, and pydantic form the foundation of your API server, providing fast async HTTP capabilities and robust data validation.

Running the Server

After completing the installation, save the server script above as 20251230_wedlm_openai_server.py and launch the WeDLM OpenAI-compatible API server:

python 20251230_wedlm_openai_server.py

Once started, the server will be accessible at http://localhost:8000 by default. The API is fully compatible with OpenAI's chat completions endpoint format, allowing you to use standard OpenAI client libraries with minimal code changes.
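
A quick way to verify the server is up is to query the /health and /v1/models endpoints defined in the script. The sketch below uses the requests library, which you may need to install separately (uv pip install requests).

import requests

# /health returns {"status": "healthy"} once the model has loaded
print(requests.get("http://localhost:8000/health").json())

# /v1/models lists the single WeDLM-8B-Instruct entry served by this script
print(requests.get("http://localhost:8000/v1/models").json())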

API Usage Example

With your server running, you can make requests using any OpenAI-compatible client:

from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="not-needed"
)

response = client.chat.completions.create(
    model="WeDLM-8B-Instruct",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Explain how quantum computing works."}
    ],
    temperature=0.7,
    max_tokens=1024
)

print(response.choices[0].message.content)

Environment Variables

Server behavior can be customized with the following environment variables. Note that the example script above hard-codes its values, so it must be adapted to read these variables; see the sketch after the table.

Variable         Description                         Default
MODEL_PATH       Path to WeDLM model directory       WeDLM-8B-Instruct
HOST             Server bind address                 0.0.0.0
PORT             Server port                         8000
DEVICE           Compute device (cuda, cpu, mps)     cuda
MAX_BATCH_SIZE   Maximum batch size for inference    1
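
The sketch below shows one way to read these variables with the defaults from the table. The names come from the table and are not read by the script as published; they are a hypothetical wiring, not part of the gist's code.

import os

# Hypothetical wiring for the variables above; substitute these values for the
# literals in 20251230_wedlm_openai_server.py, e.g. LLM(model=MODEL_PATH) and
# uvicorn.run(app, host=HOST, port=PORT).
MODEL_PATH = os.getenv("MODEL_PATH", "WeDLM-8B-Instruct")
HOST = os.getenv("HOST", "0.0.0.0")
PORT = int(os.getenv("PORT", "8000"))
DEVICE = os.getenv("DEVICE", "cuda")
MAX_BATCH_SIZE = int(os.getenv("MAX_BATCH_SIZE", "1"))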

Troubleshooting

If you encounter CUDA out-of-memory errors, reduce the batch size or select which GPU the server uses by setting CUDA_VISIBLE_DEVICES appropriately. For systems with limited VRAM, consider model quantization or reducing the max_tokens parameter in your requests.

For flash-attention installation issues on certain systems, ensure a recent CUDA toolkit is installed and that your PyTorch build was compiled against the same CUDA version as the toolkit on your system.
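
To check which CUDA version your PyTorch build was compiled against, a quick check from Python:

import torch

# flash-attn is built against the CUDA version baked into the installed PyTorch wheel,
# so this should match the toolkit version reported by nvcc --version
print("PyTorch:", torch.__version__)
print("Compiled against CUDA:", torch.version.cuda)
print("GPU available:", torch.cuda.is_available())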

@exlaw

exlaw commented Dec 30, 2025

Thanks for your work.

I think the correct API use example should be

from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="not-needed"
)

response = client.chat.completions.create(
    model="WeDLM-8B-Instruct",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Explain how quantum computing works."}
    ],
    temperature=0.7,
    max_tokens=1024
)

print(response.choices[0].message.content)

@cyysky (Author)

cyysky commented Dec 30, 2025

> Thanks for your work. I think the correct API use example should be [...]

thanks, readme updated πŸ‘πŸ‘πŸ‘

@cnmoro
Copy link

cnmoro commented Jan 8, 2026

Is it possible to run this with any kind of quantization?

@cyysky (Author)

cyysky commented Jan 13, 2026

> Is it possible to run this with any kind of quantization?

I haven't had a chance to test it with quantization yet.
