Skip to content

Instantly share code, notes, and snippets.

@gordinmitya
Created November 11, 2025 12:07
Show Gist options
  • Select an option

  • Save gordinmitya/769647b364ea3854ce14f2e57dea6140 to your computer and use it in GitHub Desktop.

Select an option

Save gordinmitya/769647b364ea3854ce14f2e57dea6140 to your computer and use it in GitHub Desktop.
# Docker Compose definition for a llama.cpp HTTP server (CUDA image) that
# serves the Qwen3-VL model files mounted from ./models.
services:
  qwen3-vl:
    image: ghcr.io/ggml-org/llama.cpp:server-cuda
    environment:
      # Very large layer count; presumably intended to offload every layer
      # to the GPU.
      - LLAMA_ARG_N_GPU_LAYERS=999
      # Projector file used alongside the main model for image input.
      - LLAMA_ARG_MMPROJ=/models/mmproj-BF16.gguf
      - LLAMA_ARG_MODEL=/models/Qwen3-VL-4B-Instruct-UD-Q8_K_XL.gguf
    ports:
      # host:container
      - "8080:8080"
    volumes:
      # The .gguf files referenced above must exist in ./models on the host.
      - ./models:/models
    deploy:
      resources:
        reservations:
          devices:
            # Reserve all NVIDIA GPUs for this container.
            - driver: nvidia
              count: all
              capabilities: [gpu]
import base64
from pathlib import Path
import cv2
from openai import OpenAI
from pydantic import BaseModel, Field
# pip install openai opencv-python pydantic
class Person(BaseModel):
    # Structured-output schema: the attributes the model is asked to report
    # about the person in the picture. Documented with comments rather than a
    # class docstring so the generated JSON schema stays exactly as before.

    # Estimated age, whole years.
    age: int = Field(
        ...,
        description="The person's age in years",
    )

    # Free-form string; the description steers the model to two values.
    gender: str = Field(
        ...,
        description="The person's gender. Male or female",
    )
class VisionClient:
    """Send an image to an OpenAI-compatible chat endpoint and get a reply.

    The defaults target the local llama.cpp server on port 8080 and the
    Qwen3-VL model file it loads; both can be overridden per instance.
    """

    def __init__(
        self,
        base_url="http://localhost:8080",
        model="/models/Qwen3-VL-4B-Instruct-UD-Q8_K_XL.gguf",
        api_key="any",
    ):
        """Create the underlying OpenAI client.

        Args:
            base_url: Root URL of the OpenAI-compatible server.
            model: Model identifier sent with every request.
            api_key: Placeholder key; the OpenAI client requires a value.
        """
        self.client = OpenAI(
            base_url=base_url,
            api_key=api_key,
        )
        self.model = model

    def encode_image(self, image_path: "str | Path") -> str:
        """Return the image at *image_path* as base64-encoded JPEG text.

        Files with a ``.jpg``/``.jpeg`` suffix are read as-is; any other
        format is decoded with OpenCV and re-encoded to JPEG first.

        Args:
            image_path: Location of the image, as a string or ``Path``.

        Returns:
            The JPEG bytes, base64-encoded and decoded as UTF-8 text.

        Raises:
            ValueError: If OpenCV cannot read the file or cannot encode it.
        """
        path = Path(image_path)
        if path.suffix.lower() in {".jpg", ".jpeg"}:
            data = path.read_bytes()
        else:
            # cv2.imread expects a plain string filename, so convert
            # explicitly to keep Path arguments working.
            image = cv2.imread(str(path))
            if image is None:
                raise ValueError(f"Could not read image: {image_path}")
            ok, buffer = cv2.imencode(".jpg", image)
            if not ok:
                raise ValueError("Could not encode image to JPEG")
            data = buffer.tobytes()
        return base64.b64encode(data).decode("utf-8")

    def analyze_image(self, image_path, prompt="what's in this image?"):
        """Ask the model about an image, constrained to the ``Person`` schema.

        Args:
            image_path: Location of the image to analyze.
            prompt: Text instruction sent together with the image.

        Returns:
            The raw ``content`` of the first choice's message (the text the
            server returned), not a parsed ``Person`` instance.
        """
        base64_image = self.encode_image(image_path)
        response = self.client.chat.completions.parse(
            model=self.model,
            messages=[
                {
                    "role": "user",
                    "content": [
                        {"type": "text", "text": prompt},
                        {
                            "type": "image_url",
                            "image_url": {
                                "url": f"data:image/jpeg;base64,{base64_image}"
                            },
                        },
                    ],
                }
            ],
            response_format=Person,
            temperature=0.7,
            top_p=0.8,
            presence_penalty=1.5,
            frequency_penalty=0.0,  # optional
            max_tokens=32768,
            seed=3407,
            # Sampler options outside the OpenAI schema are forwarded verbatim
            # in the request body.
            extra_body={
                "top_k": 20,
                # llama.cpp's server reads "repeat_penalty";
                # "repetition_penalty" is not a field it recognises.
                "repeat_penalty": 1.0,
            },
        )
        return response.choices[0].message.content
if __name__ == "__main__":
    # Demo entry point: only contact the server when the file is run as a
    # script, so importing the module has no network side effects.
    vision_client = VisionClient()
    print(vision_client.analyze_image("person.jpg"))
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment