"""
A Tech-Troubleshooting Vision Agent Using Baseten and Qwen3-VL

This example demonstrates the Baseten Qwen3-VL integration with Vision Agents
for vision-related tasks. It creates an agent that uses:

- Baseten Qwen3-VL for LLM processing
- Stream for edge/real-time communication
- Deepgram for speech-to-text (STT)
- ElevenLabs for text-to-speech (TTS)
- Smart Turn for turn detection

Required environment variables:

- STREAM_API_KEY and STREAM_API_SECRET
- OPENAI_API_KEY and BASE_URL (for the Baseten VLM)
- DEEPGRAM_API_KEY
- ELEVENLABS_API_KEY
"""

import asyncio

from dotenv import load_dotenv

from vision_agents.core import Agent, User, cli
from vision_agents.core.agents import AgentLauncher
from vision_agents.core.events import CallSessionParticipantJoinedEvent
from vision_agents.plugins import deepgram, elevenlabs, getstream, openai, smart_turn

load_dotenv()

async def create_agent(**kwargs) -> Agent:
    # Initialize the Baseten VLM. The OpenAI-compatible plugin is assumed to
    # pick up OPENAI_API_KEY and BASE_URL from the environment (see the module
    # docstring above).
    llm = openai.ChatCompletionsVLM(model="qwen-3-vl-32b")

    # Create an agent with video understanding capabilities
    agent = Agent(
        edge=getstream.Edge(),
        agent_user=User(name="Video Assistant", id="agent"),
        instructions="Read @plugin_examples/baseten_qwen3vl_instructions.md",
        llm=llm,
        turn_detection=smart_turn.TurnDetection(),
        stt=deepgram.STT(),
        tts=elevenlabs.TTS(),
        processors=[],
    )
    return agent
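
# Usage sketch: the AgentLauncher below presumably invokes create_agent for
# you, but the factory can also be awaited directly from your own async code:
#
#   agent = await create_agent()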

async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
    await agent.create_user()
    call = await agent.create_call(call_type, call_id)

    @agent.events.subscribe
    async def on_participant_joined(event: CallSessionParticipantJoinedEvent):
        # Greet human participants only, after a short pause
        if event.participant.user.id != "agent":
            await asyncio.sleep(2)
            await agent.simple_response(
                "Describe what you currently see and help fix it if possible."
            )

    with await agent.join(call):
        await agent.edge.open_demo(call)
        # The agent automatically processes video frames and responds to user input
        await agent.finish()

if __name__ == "__main__":
    cli(AgentLauncher(create_agent=create_agent, join_call=join_call))
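
# To run this example (a sketch, assuming the vision-agents package and the
# plugins above are installed, and the environment variables are set):
#
#   python <this_file>.py
#
# Running the script hands control to the AgentLauncher CLI defined above.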