@cyysky
Last active January 13, 2026 08:03
tencent/WeDLM-8B-Instruct OpenAI-compatible server (20251230_wedlm_openai_server.py)
from fastapi import FastAPI, HTTPException
from fastapi.middleware.cors import CORSMiddleware
from pydantic import BaseModel
from typing import List, Optional, Dict, Any
import uvicorn
import time
from transformers import AutoTokenizer
from wedlm import LLM, SamplingParams

app = FastAPI(title="WeDLM OpenAI Compatible API")

# Add CORS middleware
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Initialize model and tokenizer
try:
    print("Loading WeDLM model...")
    llm = LLM(model="WeDLM-8B-Instruct")
    tokenizer = AutoTokenizer.from_pretrained("WeDLM-8B-Instruct", trust_remote_code=True)
    print("Model loaded successfully!")
except Exception as e:
    print(f"Error loading model: {e}")
    llm = None
    tokenizer = None

# Pydantic models for request/response
class ModelInfo(BaseModel):
    id: str
    object: str = "model"
    created: int
    owned_by: str = "organization-name"

class Message(BaseModel):
    role: str
    content: str

class ChatCompletionRequest(BaseModel):
    model: str
    messages: List[Message]
    temperature: Optional[float] = 0.7
    max_tokens: Optional[int] = 512
    top_p: Optional[float] = 1.0
    frequency_penalty: Optional[float] = 0.0
    presence_penalty: Optional[float] = 0.0
    stream: Optional[bool] = False

class ChatCompletionChoice(BaseModel):
    index: int
    message: Message
    finish_reason: Optional[str] = "stop"

class Usage(BaseModel):
    prompt_tokens: int
    completion_tokens: int
    total_tokens: int

class ChatCompletionResponse(BaseModel):
    id: str
    object: str = "chat.completion"
    created: int
    model: str
    choices: List[ChatCompletionChoice]
    usage: Usage

# Routes
@app.get("/v1/models", response_model=List[ModelInfo])
async def list_models():
    """List available models"""
    if llm is None:
        raise HTTPException(status_code=503, detail="Model not loaded")
    return [
        ModelInfo(
            id="WeDLM-8B-Instruct",
            created=int(time.time()),
            owned_by="WeDLM"
        )
    ]

@app.get("/v1/models/{model_name}", response_model=ModelInfo)
async def get_model_info(model_name: str):
    """Get information about a specific model"""
    if llm is None:
        raise HTTPException(status_code=503, detail="Model not loaded")
    if model_name != "WeDLM-8B-Instruct":
        raise HTTPException(status_code=404, detail="Model not found")
    return ModelInfo(
        id=model_name,
        created=int(time.time()),
        owned_by="WeDLM"
    )

@app.post("/v1/chat/completions", response_model=ChatCompletionResponse)
async def create_chat_completion(request: ChatCompletionRequest):
    """Create chat completion"""
    if llm is None:
        raise HTTPException(status_code=503, detail="Model not loaded")
    if request.model != "WeDLM-8B-Instruct":
        raise HTTPException(status_code=404, detail="Model not found")
    try:
        # Convert messages to the format expected by the tokenizer
        messages_content = [{"role": msg.role, "content": msg.content} for msg in request.messages]

        # Apply chat template
        text = tokenizer.apply_chat_template(
            messages_content,
            tokenize=False,
            add_generation_prompt=True
        )

        # Generate response
        sampling_params = SamplingParams(
            temperature=request.temperature,
            max_tokens=request.max_tokens,
            top_p=request.top_p
        )
        outputs = llm.generate([text], sampling_params)
        generated_text = outputs[0]["text"]

        # Extract assistant's response (remove the prompt)
        assistant_response = generated_text.split("assistant\n")[-1].strip()

        # Calculate token counts
        prompt_tokens = len(tokenizer.encode(text))
        completion_tokens = len(tokenizer.encode(assistant_response))

        response = ChatCompletionResponse(
            id=f"chatcmpl-{int(time.time())}",
            created=int(time.time()),
            model=request.model,
            choices=[
                ChatCompletionChoice(
                    index=0,
                    message=Message(role="assistant", content=assistant_response),
                    finish_reason="stop"
                )
            ],
            usage=Usage(
                prompt_tokens=prompt_tokens,
                completion_tokens=completion_tokens,
                total_tokens=prompt_tokens + completion_tokens
            )
        )
        return response
    except Exception as e:
        raise HTTPException(status_code=500, detail=f"Generation error: {str(e)}")

@app.get("/health")
async def health_check():
    """Health check endpoint"""
    if llm is None:
        return {"status": "unhealthy", "detail": "Model not loaded"}
    return {"status": "healthy"}

if __name__ == "__main__":
    uvicorn.run(app, host="0.0.0.0", port=8000)

WeDLM OpenAI-Compatible Server Setup

This gist provides a complete setup guide for deploying Tencent's WeDLM model behind an OpenAI-compatible API server built with FastAPI. Follow the instructions below to install dependencies, download the model, and launch the server.

Prerequisites

  • Operating System: Linux or macOS (with bash support)
  • Python: Version 3.13 or higher
  • Hardware: GPU with sufficient VRAM to run WeDLM-8B-Instruct (recommended: 24GB+ VRAM)
  • Disk Space: At least 20GB for model files and dependencies

Installation

Execute the following commands in your terminal to set up the environment and install all required components:

# Install uv package manager
wget -qO- https://astral.sh/uv/install.sh | sh

# Create Python 3.13 virtual environment
uv venv --python 3.13 venv-wedlm

# Activate virtual environment
source venv-wedlm/bin/activate

# Install WeDLM from source
pip install git+https://github.com/tencent/WeDLM.git

# Download WeDLM-8B-Instruct model files
hf download tencent/WeDLM-8B-Instruct --local-dir WeDLM-8B-Instruct

# Install flash-attention for optimized inference
uv pip install flash-attn --no-build-isolation

# Install FastAPI and server dependencies
uv pip install fastapi uvicorn pydantic
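
To confirm that everything installed correctly before launching the server, you can run a quick import check from the activated virtual environment. This is a minimal sketch; it assumes PyTorch was pulled in as a dependency of the WeDLM install and that the model files sit in the local WeDLM-8B-Instruct directory created by the download step above.

# verify_install.py - quick sanity check of the environment
import torch
from transformers import AutoTokenizer
from wedlm import LLM, SamplingParams  # installed from the GitHub repository above

print("torch:", torch.__version__, "| CUDA available:", torch.cuda.is_available())

# Loading the tokenizer from the downloaded directory confirms the model files are in place
tokenizer = AutoTokenizer.from_pretrained("WeDLM-8B-Instruct", trust_remote_code=True)
print("Tokenizer loaded, vocabulary size:", len(tokenizer))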

Command Explanations

The installation process consists of several steps, each serving a specific purpose in setting up your development environment.

uv is a modern, extremely fast Python package installer and environment manager written in Rust. It significantly accelerates the installation process compared to traditional tools like pip.

The virtual environment creation isolates your WeDLM dependencies from other Python projects on your system, preventing version conflicts and ensuring reproducible deployments.

WeDLM installation pulls the latest version directly from Tencent's GitHub repository, ensuring you have access to the most recent features and bug fixes.

Model download retrieves the WeDLM-8B-Instruct model weights from HuggingFace Hub and stores them locally in the WeDLM-8B-Instruct directory for inference.
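
If you prefer to script the download instead of using the hf CLI, the huggingface_hub library (installed alongside transformers) exposes snapshot_download; the sketch below is intended as an equivalent of the hf download command above.

# Download the model weights into the local WeDLM-8B-Instruct directory
from huggingface_hub import snapshot_download

snapshot_download(
    repo_id="tencent/WeDLM-8B-Instruct",
    local_dir="WeDLM-8B-Instruct",
)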

flash-attn provides an optimized FlashAttention implementation that substantially reduces memory usage and increases inference speed for transformer models, making large-model deployment more efficient.

FastAPI, uvicorn, and pydantic form the foundation of your API server, providing fast async HTTP capabilities and robust data validation.

Running the Server

After completing the installation, save the server script above as 20251230_wedlm_openai_server.py and launch the WeDLM OpenAI-compatible API server:

python 20251230_wedlm_openai_server.py

Once started, the server will be accessible at http://localhost:8000 by default. The API is fully compatible with OpenAI's chat completions endpoint format, allowing you to use standard OpenAI client libraries with minimal code changes.
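
A quick way to verify the server is up is to query the /health and /v1/models endpoints defined in the script. The sketch below uses the requests library, which you may need to install separately (uv pip install requests).

import requests

# /health returns {"status": "healthy"} once the model has loaded
print(requests.get("http://localhost:8000/health").json())

# /v1/models lists the single WeDLM-8B-Instruct entry served by this script
print(requests.get("http://localhost:8000/v1/models").json())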

API Usage Example

With your server running, you can make requests using any OpenAI-compatible client:

from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="not-needed"
)

response = client.chat.completions.create(
    model="WeDLM-8B-Instruct",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Explain how quantum computing works."}
    ],
    temperature=0.7,
    max_tokens=1024
)

print(response.choices[0].message.content)

Environment Variables

Server behavior can be customized with the following environment variables. Note that the example script above hard-codes its values, so it must be adapted to read these variables; see the sketch after the table.

Variable         Description                         Default
MODEL_PATH       Path to WeDLM model directory       WeDLM-8B-Instruct
HOST             Server bind address                 0.0.0.0
PORT             Server port                         8000
DEVICE           Compute device (cuda, cpu, mps)     cuda
MAX_BATCH_SIZE   Maximum batch size for inference    1
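
The sketch below shows one way to read these variables with the defaults from the table. The names come from the table and are not read by the script as published; they are a hypothetical wiring, not part of the gist's code.

import os

# Hypothetical wiring for the variables above; substitute these values for the
# literals in 20251230_wedlm_openai_server.py, e.g. LLM(model=MODEL_PATH) and
# uvicorn.run(app, host=HOST, port=PORT).
MODEL_PATH = os.getenv("MODEL_PATH", "WeDLM-8B-Instruct")
HOST = os.getenv("HOST", "0.0.0.0")
PORT = int(os.getenv("PORT", "8000"))
DEVICE = os.getenv("DEVICE", "cuda")
MAX_BATCH_SIZE = int(os.getenv("MAX_BATCH_SIZE", "1"))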

Troubleshooting

If you encounter CUDA out-of-memory errors, reduce the batch size or select which GPU the server uses by setting CUDA_VISIBLE_DEVICES appropriately. For systems with limited VRAM, consider model quantization or reducing the max_tokens parameter in your requests.

For flash-attention installation issues on certain systems, ensure a recent CUDA toolkit is installed and that your PyTorch build was compiled against the same CUDA version as the toolkit on your system.
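
To check which CUDA version your PyTorch build was compiled against, a quick check from Python:

import torch

# flash-attn is built against the CUDA version baked into the installed PyTorch wheel,
# so this should match the toolkit version reported by nvcc --version
print("PyTorch:", torch.__version__)
print("Compiled against CUDA:", torch.version.cuda)
print("GPU available:", torch.cuda.is_available())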

@exlaw

exlaw commented Dec 30, 2025

Thanks for your work.

I think the correct API use example should be

from openai import OpenAI

client = OpenAI(
    base_url="http://localhost:8000/v1",
    api_key="not-needed"
)

response = client.chat.completions.create(
    model="WeDLM-8B-Instruct",
    messages=[
        {"role": "system", "content": "You are a helpful assistant."},
        {"role": "user", "content": "Explain how quantum computing works."}
    ],
    temperature=0.7,
    max_tokens=1024
)

print(response.choices[0].message.content)

@cyysky (Author)

cyysky commented Dec 30, 2025

> Thanks for your work. I think the correct API use example should be [...]

thanks, readme updated πŸ‘πŸ‘πŸ‘

@cnmoro
Copy link

cnmoro commented Jan 8, 2026

Is it possible to run this with any kind of quantization?

@cyysky (Author)

cyysky commented Jan 13, 2026

> Is it possible to run this with any kind of quantization?

I haven't had a chance to test it with quantization yet.
