import asyncio

from dotenv import load_dotenv
from vision_agents.core import Agent, User, cli
from vision_agents.core.agents import AgentLauncher
from vision_agents.core.events import CallSessionParticipantJoinedEvent
from vision_agents.plugins import deepgram, elevenlabs, getstream, openai, smart_turn

load_dotenv()
"""
A Tech Troubleshooting Vision Agent Using Baseten and Qwen3-VL
This example demonstrates Baseten Qwen3-VL integration with Vision Agents for vision-related tasks.
This example creates an agent that uses:
- Baseten Qwen3-VL for LLM processing
- Stream for edge/real-time communication
- Deepgram for speech-to-text (STT)
- ElevenLabs for text-to-speech (TTS)
- Smart Turn for turn detection
Requirements:
- STREAM_API_KEY and STREAM_API_SECRET
- OPENAI_API_KEY and BASE_URL (for Baseten VLM)
- DEEPGRAM_API_KEY
- ELEVENLABS_API_KEY
"""


async def create_agent(**kwargs) -> Agent:
    # Initialize the Baseten VLM, served through an OpenAI-compatible
    # chat-completions endpoint (configured via OPENAI_API_KEY and BASE_URL)
    llm = openai.ChatCompletionsVLM(model="qwen-3-vl-32b")

    # Create an agent with video understanding capabilities
    agent = Agent(
        edge=getstream.Edge(),
        agent_user=User(name="Video Assistant", id="agent"),
        instructions="Read @plugin_examples/baseten_qwen3vl_instructions.md",
        llm=llm,
        turn_detection=smart_turn.TurnDetection(),
        stt=deepgram.STT(),
        tts=elevenlabs.TTS(),
        processors=[],
    )
    return agent


async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
    await agent.create_user()
    call = await agent.create_call(call_type, call_id)

    @agent.events.subscribe
    async def on_participant_joined(event: CallSessionParticipantJoinedEvent):
        # Greet human participants shortly after they join; skip the agent itself
        if event.participant.user.id != "agent":
            await asyncio.sleep(2)
            await agent.simple_response(
                "Describe what you currently see and help to fix it if possible"
            )

    with await agent.join(call):
        await agent.edge.open_demo(call)
        # The agent will automatically process video frames and respond to user input
        await agent.finish()


if __name__ == "__main__":
    cli(AgentLauncher(create_agent=create_agent, join_call=join_call))
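
# Typical invocation, assuming this file is saved as baseten_qwen3vl_example.py
# (an assumed filename) and that the cli() helper needs no extra flags:
#
#   python baseten_qwen3vl_example.py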