import asyncio

from dotenv import load_dotenv
from vision_agents.core import Agent, User, cli
from vision_agents.core.agents import AgentLauncher
from vision_agents.core.events import CallSessionParticipantJoinedEvent
from vision_agents.plugins import deepgram, elevenlabs, getstream, openai, smart_turn

load_dotenv()
"""
A Tech Troubleshooting Vision Agent Using Baseten and Qwen3-VL
This example demonstrates Baseten Qwen3-VL integration with Vision Agents for vision-related tasks.
This example creates an agent that uses:
- Baseten Qwen3-VL for LLM processing
- Stream for edge/real-time communication
- Deepgram for speech-to-text (STT)
- ElevenLabs for text-to-speech (TTS)
- Smart Turn for turn detection
Requirements:
- STREAM_API_KEY and STREAM_API_SECRET
- OPENAI_API_KEY and BASE_URL (for Baseten VLM)
- DEEPGRAM_API_KEY
- ELEVENLABS_API_KEY
"""


async def create_agent(**kwargs) -> Agent:
    # Initialize the Baseten VLM, served through an OpenAI-compatible
    # chat-completions endpoint (configured via OPENAI_API_KEY and BASE_URL)
    llm = openai.ChatCompletionsVLM(model="qwen-3-vl-32b")

    # Create an agent with video understanding capabilities
    agent = Agent(
        edge=getstream.Edge(),
        agent_user=User(name="Video Assistant", id="agent"),
        instructions="Read @plugin_examples/baseten_qwen3vl_instructions.md",
        llm=llm,
        turn_detection=smart_turn.TurnDetection(),
        stt=deepgram.STT(),
        tts=elevenlabs.TTS(),
        processors=[],
    )
    return agent


async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
    await agent.create_user()
    call = await agent.create_call(call_type, call_id)

    @agent.events.subscribe
    async def on_participant_joined(event: CallSessionParticipantJoinedEvent):
        # Greet human participants shortly after they join; skip the agent itself
        if event.participant.user.id != "agent":
            await asyncio.sleep(2)
            await agent.simple_response(
                "Describe what you currently see and help to fix it if possible"
            )

    with await agent.join(call):
        await agent.edge.open_demo(call)
        # The agent will automatically process video frames and respond to user input
        await agent.finish()


if __name__ == "__main__":
    cli(AgentLauncher(create_agent=create_agent, join_call=join_call))
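
# Typical invocation, assuming this file is saved as baseten_qwen3vl_example.py
# (an assumed filename) and that the cli() helper needs no extra flags:
#
#   python baseten_qwen3vl_example.py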