From bab9998065702a68318eaaf5bc14aa918de9305a Mon Sep 17 00:00:00 2001 From: Daniil Gusev Date: Tue, 20 Jan 2026 12:10:27 +0100 Subject: [PATCH 1/6] Migrate examples to the new Runner API The new `Runner` class allows running agents in both console and server modes. --- DEVELOPMENT.md | 74 +++++-- README.md | 186 ++++++++++-------- agents-core/vision_agents/core/__init__.py | 2 - .../vision_agents/core/cli/__init__.py | 0 .../vision_agents/core/cli/cli_runner.py | 152 -------------- examples/01_simple_agent_example/README.md | 34 ++-- .../simple_agent_example.py | 4 +- examples/02_golf_coach_example/README.md | 35 ++-- .../golf_coach_example.py | 6 +- .../golf_coach_with_metrics.py | 11 +- .../RUNNING_THE_EXAMPLE.md | 16 +- .../football_commentator_example.py | 7 +- examples/05_security_camera_example/README.md | 46 +++-- .../security_camera_example.py | 10 +- .../06_prometheus_metrics_example/README.md | 8 +- .../prometheus_metrics_example.py | 9 +- examples/07_deploy_example/Dockerfile | 2 +- examples/07_deploy_example/Dockerfile.gpu | 2 +- examples/07_deploy_example/README.md | 10 +- examples/07_deploy_example/deploy_example.py | 9 +- .../aws_realtime_function_calling_example.py | 10 +- .../aws/example/aws_realtime_nova_example.py | 4 +- plugins/cartesia/example/README.md | 6 +- plugins/cartesia/example/main.py | 5 +- plugins/cartesia/example/narrator-example.py | 9 +- plugins/decart/example/README.md | 29 ++- plugins/decart/example/decart_example.py | 4 +- plugins/deepgram/example/README.md | 5 +- .../deepgram/example/deepgram_tts_example.py | 8 +- plugins/elevenlabs/example/README.md | 20 +- .../elevenlabs/example/elevenlabs_example.py | 8 +- .../example/fast_whisper_example.py | 7 +- plugins/fish/example/README.md | 13 +- plugins/fish/example/fish_example.py | 8 +- plugins/heygen/example/README.md | 60 +++--- plugins/heygen/example/avatar_example.py | 8 +- .../heygen/example/avatar_realtime_example.py | 8 +- plugins/huggingface/example/README.md | 6 +- 
plugins/huggingface/example/main.py | 7 +- .../inworld/example/inworld_tts_example.py | 8 +- plugins/kokoro/example/README.md | 14 +- plugins/kokoro/example/kokoro_example.py | 8 +- plugins/moondream/README.md | 77 +++++--- .../example/moondream_vlm_example.py | 10 +- plugins/nvidia/example/README.md | 4 +- plugins/nvidia/example/main.py | 7 +- .../qwen_vl_example/qwen_vl_example.py | 4 +- .../openrouter/example/openrouter_example.py | 12 +- plugins/pocket/example/README.md | 3 +- plugins/pocket/example/pocket_example.py | 8 +- plugins/qwen/example/README.md | 18 +- plugins/qwen/example/qwen_realtime_example.py | 4 +- plugins/roboflow/example/README.md | 3 +- plugins/roboflow/example/roboflow_example.py | 5 +- plugins/sample_plugin/example/my_example.py | 8 +- .../smart_turn/example/smart_turn_example.py | 7 +- plugins/vogent/example/vogent_example.py | 7 +- plugins/wizper/example/README.md | 7 +- plugins/wizper/example/wizper_example.py | 10 +- 59 files changed, 528 insertions(+), 544 deletions(-) delete mode 100644 agents-core/vision_agents/core/cli/__init__.py delete mode 100644 agents-core/vision_agents/core/cli/cli_runner.py diff --git a/DEVELOPMENT.md b/DEVELOPMENT.md index eabfe4155..7988f3039 100644 --- a/DEVELOPMENT.md +++ b/DEVELOPMENT.md @@ -11,18 +11,27 @@ pre-commit install ``` To setup your .env + ```bash cp env.example .env ``` ## Running + ```bash -uv run examples/01_simple_agent_example/simple_agent_example.py +uv run examples/01_simple_agent_example/simple_agent_example.py run ``` ### Running with a video file as input + ```bash -uv run --video-track-override +uv run run --video-track-override +``` + +### Running as an HTTP server + +```bash +uv run serve --host= --port= ``` ## Tests @@ -34,6 +43,7 @@ uv run py.test -m "not integration" -n auto ``` Integration test. 
(requires secrets in place, see .env setup) + ``` uv run py.test -m "integration" -n auto ``` @@ -60,7 +70,6 @@ uv run ruff check --fix ### Mypy type checks - ``` uv run mypy --install-types --non-interactive -p vision_agents ``` @@ -119,8 +128,10 @@ To see how the agent work open up agents.py Some important things about audio inside the library: 1. WebRTC uses Opus 48khz stereo but inside the library audio is always in PCM format -2. Plugins / AI models work with different PCM formats, passing bytes around without a container type leads to kaos and is forbidden -3. PCM data is always passed around using the `PcmData` object which contains information about sample rate, channels and format +2. Plugins / AI models work with different PCM formats, passing bytes around without a container type leads to kaos and + is forbidden +3. PCM data is always passed around using the `PcmData` object which contains information about sample rate, channels + and format 4. Audio resampling can be done using `PcmData.resample` method 5. Adjusting from stereo to mono and vice-versa can be done using the `PcmData.resample` method 6. `PcmData` comes with convenience constructor methods to build from bytes, iterators, ndarray, ... 
@@ -132,6 +143,7 @@ import asyncio from getstream.video.rtc.track_util import PcmData from openai import AsyncOpenAI + async def example(): client = AsyncOpenAI(api_key="sk-42") @@ -162,6 +174,7 @@ async def example(): await play_pcm_with_ffplay(resampled_pcm) + if __name__ == "__main__": asyncio.run(example()) ``` @@ -177,6 +190,7 @@ Sometimes you need to test audio manually, here's some tips: ## Creating PcmData ### from_bytes + Build from raw PCM bytes ```python @@ -186,6 +200,7 @@ PcmData.from_bytes(audio_bytes, sample_rate=16000, format=AudioFormat.S16, chann ``` ### from_numpy + Build from numpy arrays with automatic dtype/shape conversion ```python @@ -194,6 +209,7 @@ PcmData.from_numpy(np.array([1, 2], np.int16), sample_rate=16000, format=AudioFo ``` ### from_response + Construct from API response (bytes, iterators, async iterators, objects with .data) ```python @@ -204,6 +220,7 @@ PcmData.from_response( ``` ### from_av_frame + Create from PyAV AudioFrame ```python @@ -213,6 +230,7 @@ PcmData.from_av_frame(frame) ## Converting Format ### to_float32 + Convert samples to float32 in [-1, 1] ```python @@ -220,6 +238,7 @@ pcm_f32 = pcm.to_float32() ``` ### to_int16 + Convert samples to int16 PCM format ```python @@ -227,6 +246,7 @@ pcm_s16 = pcm.to_int16() ``` ### to_bytes + Return interleaved PCM bytes ```python @@ -234,6 +254,7 @@ audio_bytes = pcm.to_bytes() ``` ### to_wav_bytes + Return WAV file bytes (header + frames) ```python @@ -253,6 +274,7 @@ pcm = pcm.resample(16000, target_channels=1) # to 16khz, mono ## Manipulating Audio ### append + Append another PcmData in-place (adjusts format/rate automatically) ```python @@ -260,6 +282,7 @@ pcm.append(other_pcm) ``` ### copy + Create a deep copy ```python @@ -267,6 +290,7 @@ pcm_copy = pcm.copy() ``` ### clear + Clear all samples in-place (keeps metadata) ```python @@ -276,6 +300,7 @@ pcm.clear() ## Slicing and Chunking ### head + Keep only the first N seconds ```python @@ -283,6 +308,7 @@ pcm_head = 
pcm.head(duration_s=3.0) ``` ### tail + Keep only the last N seconds ```python @@ -290,6 +316,7 @@ pcm_tail = pcm.tail(duration_s=5.0) ``` ### chunks + Iterate over fixed-size chunks with optional overlap ```python @@ -318,7 +345,8 @@ pcm = await queue.get_duration(100) # AudioTrack -Use `getstream.video.rtc.AudioTrack` if you need to publish audio using PyAV, this class ensures that `recv` paces audio correctly every 20ms. +Use `getstream.video.rtc.AudioTrack` if you need to publish audio using PyAV, this class ensures that `recv` paces audio +correctly every 20ms. - Use `.write()` method to enqueue audio (PcmData) - Use `.flush()` to empty all the enqueued audio (eg. barge-in event) @@ -347,8 +375,10 @@ This prevents mistakes related to handling audio with different formats, sample ### Testing -Many of the underlying APIs change daily. To ensure things work we keep 2 sets of tests. Integration tests and unit tests. -Integration tests run once a day to verify that changes to underlying APIs didn't break the framework. Some testing guidelines +Many of the underlying APIs change daily. To ensure things work we keep 2 sets of tests. Integration tests and unit +tests. +Integration tests run once a day to verify that changes to underlying APIs didn't break the framework. Some testing +guidelines - Every plugin needs an integration test - Limit usage of response capturing style testing. (since they diverge from reality) @@ -442,11 +472,13 @@ metrics.set_meter_provider( start_http_server(port=9464) ``` -You can now see the metrics at `http://localhost:9464/metrics` (make sure that your Python program keeps running), after this you can setup your Prometheus server to scrape this endpoint. +You can now see the metrics at `http://localhost:9464/metrics` (make sure that your Python program keeps running), after +this you can setup your Prometheus server to scrape this endpoint. 
### Profiling -The `Profiler` class uses `pyinstrument` to profile your agent's performance and generate an HTML report showing where time is spent during execution. +The `Profiler` class uses `pyinstrument` to profile your agent's performance and generate an HTML report showing where +time is spent during execution. #### Example usage: @@ -456,6 +488,7 @@ from vision_agents.core import User, Agent from vision_agents.core.profiling import Profiler from vision_agents.plugins import getstream, gemini, deepgram, elevenlabs, vogent + async def start_agent() -> None: agent = Agent( edge=getstream.Edge(), @@ -475,12 +508,13 @@ async def start_agent() -> None: ``` The profiler automatically: + - Starts profiling when the agent is created - Stops profiling when the agent finishes (on `AgentFinishEvent`) - Saves an HTML report to the specified output path (default: `./profile.html`) -You can open the generated HTML file in a browser to view the performance profile, which shows a timeline of function calls and where time is spent during agent execution. - +You can open the generated HTML file in a browser to view the performance profile, which shows a timeline of function +calls and where time is spent during agent execution. ### Queuing @@ -498,13 +532,15 @@ You can open the generated HTML file in a browser to view the performance profil ### Video Frames & Tracks -- Track.recv errors will fail silently. The API is to return a frame. Never return None. and wait till the next frame is available -- When using frame.to_ndarray(format="rgb24") specify the format. Typically you want rgb24 when connecting/sending to Yolo etc +- Track.recv errors will fail silently. The API is to return a frame. Never return None. and wait till the next frame is + available +- When using frame.to_ndarray(format="rgb24") specify the format. 
Typically you want rgb24 when connecting/sending to + Yolo etc - QueuedVideoTrack is a writable/queued video track implementation which is useful when forwarding video - ### Loading Resources in Plugins (aka "warmup") -Some plugins require to download and use external resources like models to work. + +Some plugins require to download and use external resources like models to work. For example: @@ -512,7 +548,7 @@ For example: - Video processors using `YOLO` models In order to standardise how these resources are loaded and to make it performant, the framework provides a special ABC -`vision_agents.core.warmup.Warmable`. +`vision_agents.core.warmup.Warmable`. To use it, simply subclass it and define the required methods. Note that `Warmable` supports generics to leverage type checking. @@ -551,12 +587,10 @@ class FasterWhisperSTT(STT, Warmable[WhisperModel]): # This method will be called every time a new agent is initialized. # The warmup process is now complete. self._whisper_model = whisper - + ... ``` - - ## Onboarding Plan for new contributors **Audio Formats** diff --git a/README.md b/README.md index 054220832..bb9566243 100644 --- a/README.md +++ b/README.md @@ -14,18 +14,21 @@ https://github.com/user-attachments/assets/d9778ab9-938d-4101-8605-ff879c29b0e4 - ### Multi-modal AI agents that watch, listen, and understand video. -Vision Agents give you the building blocks to create intelligent, low-latency video experiences powered by your models, your infrastructure, and your use cases. +Vision Agents give you the building blocks to create intelligent, low-latency video experiences powered by your models, +your infrastructure, and your use cases. ### Key Highlights - **Video AI:** Built for real-time video AI. Combine YOLO, Roboflow, and others with Gemini/OpenAI in real-time. -- **Low Latency:** Join quickly (500ms) and maintain audio/video latency under 30ms using [Stream's edge network](https://getstream.io/video/). 
+- **Low Latency:** Join quickly (500ms) and maintain audio/video latency under 30ms + using [Stream's edge network](https://getstream.io/video/). - **Open:** Built by Stream, but works with any video edge network. -- **Native APIs:** Native SDK methods from OpenAI (`create response`), Gemini (`generate`), and Claude (`create message`) — always access the latest LLM capabilities. -- **SDKs:** SDKs for React, Android, iOS, Flutter, React Native, and Unity, powered by Stream's ultra-low-latency network. +- **Native APIs:** Native SDK methods from OpenAI (`create response`), Gemini (`generate`), and Claude ( + `create message`) — always access the latest LLM capabilities. +- **SDKs:** SDKs for React, Android, iOS, Flutter, React Native, and Unity, powered by Stream's ultra-low-latency + network. https://github.com/user-attachments/assets/d66587ea-7af4-40c4-9966-5c04fbcf467c @@ -38,8 +41,10 @@ https://github.com/user-attachments/assets/d66587ea-7af4-40c4-9966-5c04fbcf467c https://github.com/user-attachments/assets/d1258ac2-ca98-4019-80e4-41ec5530117e This example shows you how to build golf coaching AI with YOLO and Gemini Live. -Combining a fast object detection model (like YOLO) with a full realtime AI is useful for many different video AI use cases. -For example: Drone fire detection, sports/video game coaching, physical therapy, workout coaching, just dance style games etc. +Combining a fast object detection model (like YOLO) with a full realtime AI is useful for many different video AI use +cases. +For example: Drone fire detection, sports/video game coaching, physical therapy, workout coaching, just dance style +games etc. 
```python # partial example, full example: examples/02_golf_coach_example/golf_coach_example.py @@ -48,7 +53,7 @@ agent = Agent( agent_user=agent_user, instructions="Read @golf_coach.md", llm=gemini.Realtime(fps=10), - #llm=openai.Realtime(fps=1), # Careful with FPS can get expensive + # llm=openai.Realtime(fps=1), # Careful with FPS can get expensive processors=[ultralytics.YOLOPoseProcessor(model_path="yolo11n-pose.pt", device="cuda")], ) ``` @@ -57,9 +62,11 @@ agent = Agent( https://github.com/user-attachments/assets/92a2cdd8-909c-46d8-aab7-039a90efc186 -This example shows a security camera system that detects faces, tracks packages and detects when a package is stolen. It automatically generates "WANTED" posters, posting them to X in real-time. +This example shows a security camera system that detects faces, tracks packages and detects when a package is stolen. It +automatically generates "WANTED" posters, posting them to X in real-time. -It combines face recognition, YOLOv11 object detection, Nano Banana and Gemini for a complete security workflow with voice interaction. +It combines face recognition, YOLOv11 object detection, Nano Banana and Gemini for a complete security workflow with +voice interaction. ```python # partial example, full example: examples/04_security_camera_example/security_camera_example.py @@ -82,9 +89,11 @@ agent = Agent( ### Cluely style Invisible Assistant (coming soon) -Apps like Cluely offer realtime coaching via an invisible overlay. This example shows you how you can build your own invisible assistant. +Apps like Cluely offer realtime coaching via an invisible overlay. This example shows you how you can build your own +invisible assistant. It combines Gemini realtime (to watch your screen and audio), and doesn't broadcast audio (only text). 
This approach -is quite versatile and can be used for: Sales coaching, job interview cheating, physical world/ on the job coaching with glasses +is quite versatile and can be used for: Sales coaching, job interview cheating, physical world/ on the job coaching with +glasses Demo video @@ -110,53 +119,52 @@ agent = Agent( **Step 3: Obtain your Stream API credentials** -Get a free API key from [Stream](https://getstream.io/). Developers receive **333,000 participant minutes** per month, plus extra credits via the Maker Program. +Get a free API key from [Stream](https://getstream.io/). Developers receive **333,000 participant minutes** per month, +plus extra credits via the Maker Program. ## Features -| **Feature** | **Description** | -|------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------| -| **True real-time via WebRTC** | Stream directly to model providers that support it for instant visual understanding. | -| **Interval/processor pipeline** | For providers without WebRTC, process frames with pluggable video processors (e.g., YOLO, Roboflow, or custom PyTorch/ONNX) before/after model calls. | -| **Turn detection & diarization** | Keep conversations natural; know when the agent should speak or stay quiet and who's talking. | -| **Voice activity detection (VAD)** | Trigger actions intelligently and use resources efficiently. | -| **Speech↔Text↔Speech** | Enable low-latency loops for smooth, conversational voice UX. | -| **Tool/function calling** | Execute arbitrary code and APIs mid-conversation. Create Linear issues, query weather, trigger telephony, or hit internal services. 
| +| **Feature** | **Description** | +|-------------------------------------|-------------------------------------------------------------------------------------------------------------------------------------------------------| +| **True real-time via WebRTC** | Stream directly to model providers that support it for instant visual understanding. | +| **Interval/processor pipeline** | For providers without WebRTC, process frames with pluggable video processors (e.g., YOLO, Roboflow, or custom PyTorch/ONNX) before/after model calls. | +| **Turn detection & diarization** | Keep conversations natural; know when the agent should speak or stay quiet and who's talking. | +| **Voice activity detection (VAD)** | Trigger actions intelligently and use resources efficiently. | +| **Speech↔Text↔Speech** | Enable low-latency loops for smooth, conversational voice UX. | +| **Tool/function calling** | Execute arbitrary code and APIs mid-conversation. Create Linear issues, query weather, trigger telephony, or hit internal services. | | **Built-in memory via Stream Chat** | Agents recall context naturally across turns and sessions. | -| **Text back-channel** | Message the agent silently during a call. | -| **Phone and RAG** | Interact with the Agent via inbound or outbound phone calls using Twilio and Turbopuffer | - +| **Text back-channel** | Message the agent silently during a call. 
| +| **Phone and RAG** | Interact with the Agent via inbound or outbound phone calls using Twilio and Turbopuffer | ## Out-of-the-Box Integrations -| **Plugin Name** | **Description** | **Docs Link** | -|-------------|-------------|-----------| -| AWS Bedrock | Realtime speech-to-speech plugin using Amazon Nova models with automatic reconnection | [AWS](https://visionagents.ai/integrations/aws-bedrock) | -| AWS Polly | TTS plugin using Amazon's cloud-based service with natural-sounding voices and neural engine support | [AWS Polly](https://visionagents.ai/integrations/aws-polly) | -| Cartesia | TTS plugin for realistic voice synthesis in real-time voice applications | [Cartesia](https://visionagents.ai/integrations/cartesia) | -| Decart | Real-time AI video transformation service for applying artistic styles and effects to video streams | [Decart](https://visionagents.ai/integrations/decart) | -| Deepgram | STT plugin for fast, accurate real-time transcription with speaker diarization | [Deepgram](https://visionagents.ai/integrations/deepgram) | -| ElevenLabs | TTS plugin with highly realistic and expressive voices for conversational agents | [ElevenLabs](https://visionagents.ai/integrations/elevenlabs) | -| Fast-Whisper | High-performance STT plugin using OpenAI's Whisper model with CTranslate2 for fast inference | [Fast-Whisper](https://visionagents.ai/integrations/fast-whisper) | -| Fish Audio | STT and TTS plugin with automatic language detection and voice cloning capabilities | [Fish Audio](https://visionagents.ai/integrations/fish) | -| Gemini | Realtime API for building conversational agents with support for both voice and video | [Gemini](https://visionagents.ai/integrations/gemini) | -| HeyGen | Realtime interactive avatars powered by [HeyGen](https://heygen.com/) | [HeyGen](https://visionagents.ai/integrations/heygen) | -| Inworld | TTS plugin with high-quality streaming voices for real-time conversational AI agents | 
[Inworld](https://visionagents.ai/integrations/inworld) | -| Kokoro | Local TTS engine for offline voice synthesis with low latency | [Kokoro](https://visionagents.ai/integrations/kokoro) | -| Moondream | Moondream provides realtime detection and VLM capabilities. Developers can choose from using the hosted API or running locally on their CUDA devices. Vision Agents supports Moondream's Detect, Caption and VQA skills out-of-the-box. | [Moondream](https://visionagents.ai/integrations/moondream) | -| NVIDIA Cosmos 2 | VLM plugin using NVIDIA's Cosmos 2 models for video understanding with automatic frame buffering and streaming responses | [NVIDIA](https://visionagents.ai/integrations/nvidia) | -| OpenAI | Realtime API for building conversational agents with out of the box support for real-time video directly over WebRTC, LLMs and Open AI TTS | [OpenAI](https://visionagents.ai/integrations/openai) | -| OpenRouter | LLM plugin providing access to multiple providers (Anthropic, Google, OpenAI) through a unified API | [OpenRouter](https://visionagents.ai/integrations/openrouter) | -| Qwen | Realtime audio plugin using Alibaba's Qwen3 with native audio output and built-in speech recognition | [Qwen](https://visionagents.ai/integrations/qwen) | -| Roboflow | Object detection processor using Roboflow's hosted API or local RF-DETR models | [Roboflow](https://visionagents.ai/integrations/roboflow) | -| Smart Turn | Advanced turn detection system combining Silero VAD, Whisper, and neural models for natural conversation flow | [Smart Turn](https://visionagents.ai/integrations/smart-turn) | -| TurboPuffer | RAG plugin using TurboPuffer for hybrid search (vector + BM25) with Gemini embeddings for retrieval augmented generation | [TurboPuffer](https://visionagents.ai/guides/rag) | -| Twilio | Voice call integration plugin enabling bidirectional audio streaming via Twilio Media Streams with call registry and audio conversion | 
[Twilio](https://github.com/GetStream/Vision-Agents/tree/main/examples/03_phone_and_rag_example) | -| Ultralytics | Real-time pose detection processor using YOLO models with skeleton overlays | [Ultralytics](https://visionagents.ai/integrations/ultralytics) | -| Vogent | Neural turn detection system for intelligent turn-taking in voice conversations | [Vogent](https://visionagents.ai/integrations/vogent) | -| Wizper | STT plugin with real-time translation capabilities powered by Whisper v3 | [Wizper](https://visionagents.ai/integrations/wizper) | -| xAI | LLM plugin using xAI's Grok models with advanced reasoning and real-time knowledge | [xAI](https://visionagents.ai/integrations/xai) | - +| **Plugin Name** | **Description** | **Docs Link** | +|-----------------|-----------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|--------------------------------------------------------------------------------------------------| +| AWS Bedrock | Realtime speech-to-speech plugin using Amazon Nova models with automatic reconnection | [AWS](https://visionagents.ai/integrations/aws-bedrock) | +| AWS Polly | TTS plugin using Amazon's cloud-based service with natural-sounding voices and neural engine support | [AWS Polly](https://visionagents.ai/integrations/aws-polly) | +| Cartesia | TTS plugin for realistic voice synthesis in real-time voice applications | [Cartesia](https://visionagents.ai/integrations/cartesia) | +| Decart | Real-time AI video transformation service for applying artistic styles and effects to video streams | [Decart](https://visionagents.ai/integrations/decart) | +| Deepgram | STT plugin for fast, accurate real-time transcription with speaker diarization | [Deepgram](https://visionagents.ai/integrations/deepgram) | +| ElevenLabs | TTS plugin with highly realistic and expressive voices for 
conversational agents | [ElevenLabs](https://visionagents.ai/integrations/elevenlabs) | +| Fast-Whisper | High-performance STT plugin using OpenAI's Whisper model with CTranslate2 for fast inference | [Fast-Whisper](https://visionagents.ai/integrations/fast-whisper) | +| Fish Audio | STT and TTS plugin with automatic language detection and voice cloning capabilities | [Fish Audio](https://visionagents.ai/integrations/fish) | +| Gemini | Realtime API for building conversational agents with support for both voice and video | [Gemini](https://visionagents.ai/integrations/gemini) | +| HeyGen | Realtime interactive avatars powered by [HeyGen](https://heygen.com/) | [HeyGen](https://visionagents.ai/integrations/heygen) | +| Inworld | TTS plugin with high-quality streaming voices for real-time conversational AI agents | [Inworld](https://visionagents.ai/integrations/inworld) | +| Kokoro | Local TTS engine for offline voice synthesis with low latency | [Kokoro](https://visionagents.ai/integrations/kokoro) | +| Moondream | Moondream provides realtime detection and VLM capabilities. Developers can choose from using the hosted API or running locally on their CUDA devices. Vision Agents supports Moondream's Detect, Caption and VQA skills out-of-the-box. 
| [Moondream](https://visionagents.ai/integrations/moondream) | +| NVIDIA Cosmos 2 | VLM plugin using NVIDIA's Cosmos 2 models for video understanding with automatic frame buffering and streaming responses | [NVIDIA](https://visionagents.ai/integrations/nvidia) | +| OpenAI | Realtime API for building conversational agents with out of the box support for real-time video directly over WebRTC, LLMs and Open AI TTS | [OpenAI](https://visionagents.ai/integrations/openai) | +| OpenRouter | LLM plugin providing access to multiple providers (Anthropic, Google, OpenAI) through a unified API | [OpenRouter](https://visionagents.ai/integrations/openrouter) | +| Qwen | Realtime audio plugin using Alibaba's Qwen3 with native audio output and built-in speech recognition | [Qwen](https://visionagents.ai/integrations/qwen) | +| Roboflow | Object detection processor using Roboflow's hosted API or local RF-DETR models | [Roboflow](https://visionagents.ai/integrations/roboflow) | +| Smart Turn | Advanced turn detection system combining Silero VAD, Whisper, and neural models for natural conversation flow | [Smart Turn](https://visionagents.ai/integrations/smart-turn) | +| TurboPuffer | RAG plugin using TurboPuffer for hybrid search (vector + BM25) with Gemini embeddings for retrieval augmented generation | [TurboPuffer](https://visionagents.ai/guides/rag) | +| Twilio | Voice call integration plugin enabling bidirectional audio streaming via Twilio Media Streams with call registry and audio conversion | [Twilio](https://github.com/GetStream/Vision-Agents/tree/main/examples/03_phone_and_rag_example) | +| Ultralytics | Real-time pose detection processor using YOLO models with skeleton overlays | [Ultralytics](https://visionagents.ai/integrations/ultralytics) | +| Vogent | Neural turn detection system for intelligent turn-taking in voice conversations | [Vogent](https://visionagents.ai/integrations/vogent) | +| Wizper | STT plugin with real-time translation capabilities powered by Whisper 
v3 | [Wizper](https://visionagents.ai/integrations/wizper) | +| xAI | LLM plugin using xAI's Grok models with advanced reasoning and real-time knowledge | [xAI](https://visionagents.ai/integrations/xai) | ## Processors @@ -176,20 +184,20 @@ Check out our getting started guide at [VisionAgents.ai](https://visionagents.ai **Quickstart:** [Building a Voice AI app](https://visionagents.ai/introduction/voice-agents) **Quickstart:** [Building a Video AI app](https://visionagents.ai/introduction/video-agents) -**Tutorial:** [Building real-time sports coaching](https://github.com/GetStream/Vision-Agents/tree/main/examples/02_golf_coach_example) +**Tutorial: +** [Building real-time sports coaching](https://github.com/GetStream/Vision-Agents/tree/main/examples/02_golf_coach_example) **Tutorial:** [Building a real-time meeting assistant](https://github.com/GetStream/Vision-Agents#) ## Examples -| 🔮 Demo Applications | | -|:-----|--------------------------------------------------------------------------------| -|

Cartesia

Using Cartesia's Sonic 3 model to visually look at what's in the frame and tell a story with emotion.

• Real-time visual understanding
• Emotional storytelling
• Frame-by-frame analysis

[>Source Code and tutorial](https://github.com/GetStream/Vision-Agents/tree/main/plugins/cartesia/example) | Cartesia Demo | -|

Realtime Stable Diffusion

Realtime stable diffusion using Vision Agents and Decart's Mirage 2 model to create interactive scenes and stories.

• Real-time video restyling
• Interactive scene generation
• Stable diffusion integration

[>Source Code and tutorial](https://github.com/GetStream/Vision-Agents/tree/main/plugins/decart/example) | Mirage Demo | -|

Golf Coach

Using Gemini Live together with Vision Agents and Ultralytics YOLO, we're able to track the user's pose and provide realtime actionable feedback on their golf game.

• Real-time pose tracking
• Actionable coaching feedback
• YOLO pose detection
• Gemini Live integration

[>Source Code and tutorial](https://github.com/GetStream/Vision-Agents/tree/main/examples/02_golf_coach_example) | Golf Coach Demo | -|

GeoGuesser

Together with OpenAI Realtime and Vision Agents, we can take GeoGuesser to the next level by asking it to identify places in our real world surroundings.

• Real-world location identification
• OpenAI Realtime integration
• Visual scene understanding

[>Source Code and tutorial](https://visionagents.ai/integrations/openai#openai-realtime)| GeoGuesser Demo | -|

Phone and RAG

Interact with your Agent over the phone using Twilio. This example demonstrates how to use TurboPuffer for Retrieval Augmented Generation (RAG) to give your agent specialized knowledge.

• Inbound/Outbound telephony
• Twilio Media Streams integration
• Vector search with TurboPuffer
• Retrieval Augmented Generation

[>Source Code and tutorial](https://github.com/GetStream/Vision-Agents/tree/main/examples/03_phone_and_rag_example) | Phone and RAG Demo | -|

Security Camera

A security camera with face recognition, package detection and automated theft response. Generates WANTED posters with Nano Banana and posts them to X when packages disappear.

• Face detection & named recognition
• YOLOv11 package detection
• Automated WANTED poster generation
• Real-time X posting

[>Source Code and tutorial](https://github.com/GetStream/Vision-Agents/tree/main/examples/04_security_camera_example) | Security Camera Demo | - +| 🔮 Demo Applications | | +|:--------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------------|-----------------------------------------------------------------------------------------| +|

Cartesia

Using Cartesia's Sonic 3 model to look at what's in the frame and tell a story with emotion.

• Real-time visual understanding
• Emotional storytelling
• Frame-by-frame analysis

[>Source Code and tutorial](https://github.com/GetStream/Vision-Agents/tree/main/plugins/cartesia/example) | Cartesia Demo | +|

Realtime Stable Diffusion

Realtime stable diffusion using Vision Agents and Decart's Mirage 2 model to create interactive scenes and stories.

• Real-time video restyling
• Interactive scene generation
• Stable diffusion integration

[>Source Code and tutorial](https://github.com/GetStream/Vision-Agents/tree/main/plugins/decart/example) | Mirage Demo | +|

Golf Coach

Using Gemini Live together with Vision Agents and Ultralytics YOLO, we're able to track the user's pose and provide realtime actionable feedback on their golf game.

• Real-time pose tracking
• Actionable coaching feedback
• YOLO pose detection
• Gemini Live integration

[>Source Code and tutorial](https://github.com/GetStream/Vision-Agents/tree/main/examples/02_golf_coach_example) | Golf Coach Demo | +|

GeoGuesser

Together with OpenAI Realtime and Vision Agents, we can take GeoGuesser to the next level by asking it to identify places in our real-world surroundings.

• Real-world location identification
• OpenAI Realtime integration
• Visual scene understanding

[>Source Code and tutorial](https://visionagents.ai/integrations/openai#openai-realtime) | GeoGuesser Demo | +|

Phone and RAG

Interact with your Agent over the phone using Twilio. This example demonstrates how to use TurboPuffer for Retrieval Augmented Generation (RAG) to give your agent specialized knowledge.

• Inbound/Outbound telephony
• Twilio Media Streams integration
• Vector search with TurboPuffer
• Retrieval Augmented Generation

[>Source Code and tutorial](https://github.com/GetStream/Vision-Agents/tree/main/examples/03_phone_and_rag_example) | Phone and RAG Demo | +|

Security Camera

A security camera with face recognition, package detection and automated theft response. Generates WANTED posters with Nano Banana and posts them to X when packages disappear.

• Face detection & named recognition
• YOLOv11 package detection
• Automated WANTED poster generation
• Real-time X posting

[>Source Code and tutorial](https://github.com/GetStream/Vision-Agents/tree/main/examples/04_security_camera_example) | Security Camera Demo | ## Development @@ -203,25 +211,25 @@ Want to add your platform or provider? Reach out to **nash@getstream.io**. Our favorite people & projects to follow for vision AI -| [](https://x.com/demishassabis) | [](https://x.com/OfficialLoganK) | [](https://x.com/ultralytics) | -| :----------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------: | -| [@demishassabis](https://x.com/demishassabis)
CEO @ Google DeepMind
Won a Nobel prize | [@OfficialLoganK](https://x.com/OfficialLoganK)
Product Lead @ Gemini
Posts about robotics vision | [@ultralytics](https://x.com/ultralytics)
Various fast vision AI models
Pose, detect, segment, classify | +| [](https://x.com/demishassabis) | [](https://x.com/OfficialLoganK) | [](https://x.com/ultralytics) | +|:--------------------------------------------------------------------------------------------------------------------------------------:|:---------------------------------------------------------------------------------------------------------------------------------------:|:------------------------------------------------------------------------------------------------------------------------------------:| +| [@demishassabis](https://x.com/demishassabis)
CEO @ Google DeepMind
Won a Nobel prize | [@OfficialLoganK](https://x.com/OfficialLoganK)
Product Lead @ Gemini
Posts about robotics vision | [@ultralytics](https://x.com/ultralytics)
Various fast vision AI models
Pose, detect, segment, classify | -| [](https://x.com/skalskip92) | [](https://x.com/moondreamai) | [](https://x.com/kwindla) | -| :---------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------: | -| [@skalskip92](https://x.com/skalskip92)
Open Source Lead @ Roboflow
Building tools for vision AI | [@moondreamai](https://x.com/moondreamai)
The tiny vision model that could
Lightweight, fast, efficient | [@kwindla](https://x.com/kwindla)
Pipecat / Daily
Sharing AI and vision insights | +| [](https://x.com/skalskip92) | [](https://x.com/moondreamai) | [](https://x.com/kwindla) | +|:-----------------------------------------------------------------------------------------------------------------------------------:|:------------------------------------------------------------------------------------------------------------------------------------:|:--------------------------------------------------------------------------------------------------------------------------------:| +| [@skalskip92](https://x.com/skalskip92)
Open Source Lead @ Roboflow
Building tools for vision AI | [@moondreamai](https://x.com/moondreamai)
The tiny vision model that could
Lightweight, fast, efficient | [@kwindla](https://x.com/kwindla)
Pipecat / Daily
Sharing AI and vision insights | -| [](https://x.com/juberti) | [](https://x.com/romainhuet) | [](https://x.com/thorwebdev) | -| :-------------------------------------------------------------------------------------------------: | :---------------------------------------------------------------------------------------------------------------------: | :-----------------------------------------------------------------------------------------------: | -| [@juberti](https://x.com/juberti)
Head of Realtime AI @ OpenAI
Realtime AI systems | [@romainhuet](https://x.com/romainhuet)
Head of DX @ OpenAI
Developer tooling & APIs | [@thorwebdev](https://x.com/thorwebdev)
Eleven Labs
Voice and AI experiments | +| [](https://x.com/juberti) | [](https://x.com/romainhuet) | [](https://x.com/thorwebdev) | +|:--------------------------------------------------------------------------------------------------------------------------------:|:-----------------------------------------------------------------------------------------------------------------------------------:|:-----------------------------------------------------------------------------------------------------------------------------------:| +| [@juberti](https://x.com/juberti)
Head of Realtime AI @ OpenAI
Realtime AI systems | [@romainhuet](https://x.com/romainhuet)
Head of DX @ OpenAI
Developer tooling & APIs | [@thorwebdev](https://x.com/thorwebdev)
Eleven Labs
Voice and AI experiments | -| [](https://x.com/mervenoyann) | [](https://x.com/stash_pomichter) | [](https://x.com/Mentraglass) | -| :------------------------------------------------------------------------------------------------------: | :-------------------------------------------------------------------------------------------------------------------: | :----------------------------------------------------------------------------------------------------------------------: | -| [@mervenoyann](https://x.com/mervenoyann)
Hugging Face
Posts extensively about Video AI | [@stash_pomichter](https://x.com/stash_pomichter)
Spatial memory for robots
Robotics & AI navigation | [@Mentraglass](https://x.com/Mentraglass)
Open-source smart glasses
Open-Source, hackable AR glasses with AI capabilities built in | +| [](https://x.com/mervenoyann) | [](https://x.com/stash_pomichter) | [](https://x.com/Mentraglass) | +|:------------------------------------------------------------------------------------------------------------------------------------:|:----------------------------------------------------------------------------------------------------------------------------------------:|:---------------------------------------------------------------------------------------------------------------------------------------------------:| +| [@mervenoyann](https://x.com/mervenoyann)
Hugging Face
Posts extensively about Video AI | [@stash_pomichter](https://x.com/stash_pomichter)
Spatial memory for robots
Robotics & AI navigation | [@Mentraglass](https://x.com/Mentraglass)
Open-source smart glasses
Open-Source, hackable AR glasses with AI capabilities built in | -| [](https://x.com/vikhyatk) | -| :----------------------------------------------------------------------------------------------------------------------: | -| [@vikhyatk](https://x.com/vikhyatk)
AI Engineer
Open-source AI projects, Creator of Moondream AI | +| [](https://x.com/vikhyatk) | +|:-------------------------------------------------------------------------------------------------------------------------------:| +| [@vikhyatk](https://x.com/vikhyatk)
AI Engineer
Open-source AI projects, Creator of Moondream AI | ## Inspiration @@ -242,37 +250,43 @@ Our favorite people & projects to follow for vision AI - Improved openAI & Gemini realtime performance - Audio & Video utilities -### 0.3 - Examples and Deploys- Jan +### 0.3 - Examples and Deploys - Jan + - Production-grade HTTP API for agent deployment (`uv run serve`) - Metrics & Observability stack - Phone/voice integration with RAG capabilities -- 10 new LLM plugins ([AWS Nova 2](plugins/aws), [Qwen 3 Realtime](plugins/qwen), [NVIDIA Cosmos 2](plugins/nvidia), [Pocket TTS](plugins/pocket), [Deepgram TTS](plugins/deepgram), [OpenRouter](plugins/openrouter), [HuggingFace Inference](plugins/huggingface), [Roboflow](plugins/roboflow), [Twilio](plugins/twilio), [Turbopuffer](plugins/turbopuffer)) -- Real-world examples ([security camera](examples/05_security_camera_example), [phone integration](examples/03_phone_and_rag_example), [football commentator](examples/04_football_commentator_example), [Docker deployment with GPU support](examples/07_deploy_example), [agent server](examples/08_agent_server_example)) +- 10 new LLM + plugins ([AWS Nova 2](plugins/aws), [Qwen 3 Realtime](plugins/qwen), [NVIDIA Cosmos 2](plugins/nvidia), [Pocket TTS](plugins/pocket), [Deepgram TTS](plugins/deepgram), [OpenRouter](plugins/openrouter), [HuggingFace Inference](plugins/huggingface), [Roboflow](plugins/roboflow), [Twilio](plugins/twilio), [Turbopuffer](plugins/turbopuffer)) +- Real-world + examples ([security camera](examples/05_security_camera_example), [phone integration](examples/03_phone_and_rag_example), [football commentator](examples/04_football_commentator_example), [Docker deployment with GPU support](examples/07_deploy_example), [agent server](examples/08_agent_server_example)) - Stability: Fixes for participant sync, video frame handling, agent lifecycle, and screen sharing ### 0.4 Documentation/polish + - Excellence on documentation/polish - Better Roboflow annotation docs - 
Automated workflows for maintenance - Local camera/audio support AND/OR WebRTC connection - Embedded/robotics examples - ## Vision AI limitations Video AI is the frontier of AI. The state of the art is changing daily to help models understand live video. While building the integrations, here are the limitations we've noticed (Dec 2025) -* Video AI struggles with small text. If you want the AI to read the score in a game it will often get it wrong and hallucinate -* Longer videos can cause the AI to lose context. For instance if it's watching a soccer match it will get confused after 30 seconds -* Most applications require a combination of small specialized models like Yolo/Roboflow/Moondream, API calls to get more context and larger models like gemini/openAI +* Video AI struggles with small text. If you want the AI to read the score in a game it will often get it wrong and + hallucinate +* Longer videos can cause the AI to lose context. For instance if it's watching a soccer match it will get confused + after 30 seconds +* Most applications require a combination of small specialized models like Yolo/Roboflow/Moondream, API calls to get + more context and larger models like gemini/openAI * Image size & FPS need to stay relatively low due to performance constraints * Video doesn’t trigger responses in realtime models. You always need to send audio/text to trigger a response. - ## We are hiring -Join the team behind this project - we’re hiring a Staff Python Engineer to architect, build, and maintain a powerful toolkit for developers integrating voice and video AI into their products. +Join the team behind this project - we’re hiring a Staff Python Engineer to architect, build, and maintain a powerful +toolkit for developers integrating voice and video AI into their products. 
[Apply here](https://jobs.ashbyhq.com/stream/3bea7dba-54e1-4c71-aa02-712a075842df?utm_source=Jmv9QOkznl) diff --git a/agents-core/vision_agents/core/__init__.py b/agents-core/vision_agents/core/__init__.py index 8e6f6c1da..a5bd009ef 100644 --- a/agents-core/vision_agents/core/__init__.py +++ b/agents-core/vision_agents/core/__init__.py @@ -1,13 +1,11 @@ from vision_agents.core.agents import Agent from vision_agents.core.agents.agent_launcher import AgentLauncher, AgentSession -from vision_agents.core.cli.cli_runner import cli from vision_agents.core.edge.types import User from vision_agents.core.runner import Runner, ServeOptions __all__ = [ "Agent", "User", - "cli", "AgentLauncher", "AgentSession", "Runner", diff --git a/agents-core/vision_agents/core/cli/__init__.py b/agents-core/vision_agents/core/cli/__init__.py deleted file mode 100644 index e69de29bb..000000000 diff --git a/agents-core/vision_agents/core/cli/cli_runner.py b/agents-core/vision_agents/core/cli/cli_runner.py deleted file mode 100644 index 95b37c104..000000000 --- a/agents-core/vision_agents/core/cli/cli_runner.py +++ /dev/null @@ -1,152 +0,0 @@ -""" -Generic CLI runner for Vision Agents examples. - -Provides a Click-based CLI with common options for debugging and logging. -""" - -import asyncio -import logging -import warnings -from typing import TYPE_CHECKING, Optional -from uuid import uuid4 - -import click -from vision_agents.core.utils.logging import configure_sdk_logger - -if TYPE_CHECKING: - from vision_agents.core.agents.agent_launcher import AgentLauncher - - -asyncio_logger = logging.getLogger("asyncio") - -logger = logging.getLogger(__name__) - - -def cli(launcher: "AgentLauncher") -> None: - """ - Create and run a CLI from an AgentLauncher. 
- - Usage: - if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) - - Args: - launcher: AgentLauncher instance with create_agent and join_call functions - """ - - @click.command() - @click.option( - "--call-type", - type=str, - default="default", - help="Call type for the video call", - ) - @click.option( - "--call-id", - type=str, - default=None, - help="Call ID for the video call (auto-generated if not provided)", - ) - @click.option( - "--debug", - is_flag=True, - default=False, - help="Enable debug mode", - ) - @click.option( - "--log-level", - type=click.Choice( - ["DEBUG", "INFO", "WARNING", "ERROR", "CRITICAL"], case_sensitive=False - ), - default="INFO", - help="Set the logging level", - ) - @click.option( - "--no-demo", - is_flag=True, - default=False, - help="Disable opening the demo UI", - ) - @click.option( - "--video-track-override", - type=click.Path(dir_okay=False, exists=True, resolve_path=True), - default=None, - help="Optional local video track override for debugging. " - "This track will play instead of any incoming video track.", - ) - def run_agent( - call_type: str, - call_id: Optional[str], - debug: bool, - log_level: str, - no_demo: bool, - video_track_override: Optional[str], - ) -> None: - """Run the agent with the specified configuration.""" - # Configure logging - numeric_level = getattr(logging, log_level.upper(), logging.INFO) - configure_sdk_logger(level=numeric_level) - - # Suppress dataclasses_json missing value RuntimeWarnings. - # They pollute the output and cannot be fixed by the users. - warnings.filterwarnings( - "ignore", category=RuntimeWarning, module="dataclasses_json.core" - ) - - # Generate call ID if not provided - if call_id is None: - call_id = str(uuid4()) - - async def _run(): - logger.info("🚀 Launching agent...") - - try: - # Start the agent launcher. 
- await launcher.start() - - # Create the agent - agent = await launcher.launch() - if video_track_override: - agent.set_video_track_override_path(video_track_override) - - logger.info("✅ Agent warmed up and ready") - - # Open demo UI by default - if ( - not no_demo - and hasattr(agent, "edge") - and hasattr(agent.edge, "open_demo_for_agent") - ): - logger.info("🌐 Opening demo UI...") - await agent.edge.open_demo_for_agent(agent, call_type, call_id) - - # Join call if join_call function is provided - logger.info(f"📞 Joining call: {call_type}/{call_id}") - session = await launcher.start_session( - call_id, call_type, video_track_override_path=video_track_override - ) - await session.wait() - except KeyboardInterrupt: - logger.info("🛑 Received interrupt signal, shutting down gracefully...") - except Exception as e: - logger.error(f"❌ Error running agent: {e}", exc_info=True) - raise - finally: - await launcher.stop() - - asyncio_logger_level = asyncio_logger.level - - try: - asyncio.run(_run(), debug=debug) - except KeyboardInterrupt: - # Temporarily suppress asyncio error logging during cleanup - asyncio_logger_level = asyncio_logger.level - # Suppress KeyboardInterrupt and asyncio errors during cleanup - asyncio_logger.setLevel(logging.CRITICAL) - logger.info("👋 Agent shutdown complete") - finally: - # Restore original logging level - asyncio_logger.setLevel(asyncio_logger_level) - - # Invoke the click command - run_agent() diff --git a/examples/01_simple_agent_example/README.md b/examples/01_simple_agent_example/README.md index f23b300e4..b3f148617 100644 --- a/examples/01_simple_agent_example/README.md +++ b/examples/01_simple_agent_example/README.md @@ -1,6 +1,7 @@ # Simple Agent Example -This example shows you how to build a basic video AI agent using [Vision Agents](https://visionagents.ai/). The agent can have conversations with users through video and voice input and output. 
+This example shows you how to build a basic video AI agent using [Vision Agents](https://visionagents.ai/). The agent +can have conversations with users through video and voice input and output. - Listens to user speech and converts it to text - Processes the conversation using an LLM (Large Language Model) @@ -11,11 +12,11 @@ This example shows you how to build a basic video AI agent using [Vision Agents] - Python 3.13 or higher - API keys for: - - [OpenAI](https://openai.com) (for the LLM) - - [Elevenlabs](https://elevenlabs.io/) (for text-to-speech) - - [Deepgram](https://deepgram.com/) (for speech-to-text) - - [Stream](https://getstream.io/) (for video/audio infrastructure) - - [Smart Turn](https://fal.ai/models/fal-ai/smart-turn) (for turn detection) + - [OpenAI](https://openai.com) (for the LLM) + - [Elevenlabs](https://elevenlabs.io/) (for text-to-speech) + - [Deepgram](https://deepgram.com/) (for speech-to-text) + - [Stream](https://getstream.io/) (for video/audio infrastructure) + - [Smart Turn](https://fal.ai/models/fal-ai/smart-turn) (for turn detection) ## Installation @@ -41,11 +42,13 @@ This example shows you how to build a basic video AI agent using [Vision Agents] ## Running the Example Run the agent: + ```bash -uv run simple_agent_example.py +uv run simple_agent_example.py run ``` The agent will: + 1. Create a video call 2. Open a demo UI in your browser 3. Join the call and start listening @@ -70,6 +73,7 @@ agent = Agent( ``` **Components:** + - `edge`: Handles low-latency audio/video transport - `agent_user`: Sets the agent's name and ID - `instructions`: Tells the agent how to behave @@ -88,17 +92,17 @@ async with agent.join(call): ``` This code: + 1. Creates a new video call with a unique ID 2. Has the agent join the call 3. Keeps the agent running until the call ends **Note:** The CLI automatically opens the demo UI by default. Use `--no-demo` flag to disable it. 
- - ### Alternative: Using Realtime LLMs -You can simplify the setup by using a realtime LLM like OpenAI Realtime or Gemini Live. These models handle speech-to-text and text-to-speech internally: +You can simplify the setup by using a realtime LLM like OpenAI Realtime or Gemini Live. These models handle +speech-to-text and text-to-speech internally: ```python agent = Agent( @@ -112,7 +116,8 @@ agent = Agent( ### Native API Access -Vision Agents gives you direct access to native LLM APIs. You can use OpenAI's `create_response` method or any other provider-specific features: +Vision Agents gives you direct access to native LLM APIs. You can use OpenAI's `create_response` method or any other +provider-specific features: ```python await llm.create_response(input=[ @@ -130,18 +135,21 @@ await llm.create_response(input=[ ### Change the Instructions -Edit the `instructions` parameter to change how your agent behaves. The instructions file (`instructions.md`) shows an example of adding personality. +Edit the `instructions` parameter to change how your agent behaves. The instructions file (`instructions.md`) shows an +example of adding personality. ### Use Different Models You can swap out any component: + - Try `openai.Realtime()` for lower latency - Use `gemini.Realtime()` for Google's model - Switch TTS providers to `elevenlabs.TTS()` or `kokoro.TTS()` ### Add Processors -Add items to the `processors` list to give your agent new capabilities. See the golf coach example for how to use YOLO for object detection. +Add items to the `processors` list to give your agent new capabilities. See the golf coach example for how to use YOLO +for object detection. 
## Next Steps diff --git a/examples/01_simple_agent_example/simple_agent_example.py b/examples/01_simple_agent_example/simple_agent_example.py index 05fe0e3a9..37bdcc1ac 100644 --- a/examples/01_simple_agent_example/simple_agent_example.py +++ b/examples/01_simple_agent_example/simple_agent_example.py @@ -2,7 +2,7 @@ from typing import Any, Dict from dotenv import load_dotenv -from vision_agents.core import Agent, AgentLauncher, User, cli +from vision_agents.core import Agent, AgentLauncher, Runner, User from vision_agents.core.utils.examples import get_weather_by_location from vision_agents.plugins import ( deepgram, @@ -77,4 +77,4 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() diff --git a/examples/02_golf_coach_example/README.md b/examples/02_golf_coach_example/README.md index 259237405..1e7f07f1f 100644 --- a/examples/02_golf_coach_example/README.md +++ b/examples/02_golf_coach_example/README.md @@ -1,23 +1,26 @@ # Golf Coach Example -This example shows you how to build a real-time golf coaching AI using [Vision Agents](https://visionagents.ai/). The agent uses video processing to watch golf swings and provide feedback through voice conversation. +This example shows you how to build a real-time golf coaching AI using [Vision Agents](https://visionagents.ai/). The +agent uses video processing to watch golf swings and provide feedback through voice conversation. 
In this example, the AI golf coach will: + - Watches video of the user's golf swing - Uses [YOLO](https://www.ultralytics.com/yolo) pose detection to analyze body position and movement - Processes the video in real-time with an LLM (Large Language Model) - Provides voice feedback on the swing technique - Runs on Stream's low-latency edge network -This approach combines a fast object detection model (YOLO) with a full realtime AI. You can apply this pattern to other video AI use cases like sports coaching, physical therapy, workout coaching, or drone monitoring. +This approach combines a fast object detection model (YOLO) with a full realtime AI. You can apply this pattern to other +video AI use cases like sports coaching, physical therapy, workout coaching, or drone monitoring. ## Prerequisites - Python 3.13 or higher - API keys for: - - [Gemini](https://ai.google.dev/) (for realtime LLM with vision) - - [Stream](https://getstream.io/) (for video/audio infrastructure) - - Alternatively: [OpenAI](https://openai.com) (if using OpenAI Realtime instead) + - [Gemini](https://ai.google.dev/) (for realtime LLM with vision) + - [Stream](https://getstream.io/) (for video/audio infrastructure) + - Alternatively: [OpenAI](https://openai.com) (if using OpenAI Realtime instead) ## Installation @@ -25,7 +28,7 @@ This approach combines a fast object detection model (YOLO) with a full realtime ```bash cd examples/02_golf_coach_example ``` - + 2. Install dependencies using uv: ```bash uv sync @@ -46,11 +49,13 @@ This approach combines a fast object detection model (YOLO) with a full realtime ## Running the Example Run the agent: + ```bash -uv run golf_coach_example.py +uv run golf_coach_example.py run ``` The agent will: + 1. Create a video call 2. Open a demo UI in your browser 3. 
Join the call and start watching @@ -74,6 +79,7 @@ agent = Agent( ``` **Components:** + - `edge`: Handles low-latency audio/video transport - `agent_user`: Sets the agent's name and ID - `instructions`: Loads coaching instructions from `golf_coach.md` @@ -82,14 +88,16 @@ agent = Agent( ### Understanding Processors -Processors enable the agent to analyze video in real-time. The `YOLOPoseProcessor` detects human poses and body positions in each video frame. This information is sent to the LLM so it can understand the user's body movement during the golf swing. +Processors enable the agent to analyze video in real-time. The `YOLOPoseProcessor` detects human poses and body +positions in each video frame. This information is sent to the LLM so it can understand the user's body movement during +the golf swing. The `fps=10` parameter means the LLM processes 10 frames per second. Higher FPS gives more detail but costs more. - ### Instructions File The `golf_coach.md` file contains detailed coaching guidelines. It tells the agent: + - How to behave (personality and tone) - What to look for in a golf swing - How to provide feedback @@ -102,14 +110,16 @@ You can modify this file to change the coaching style or add more specific guida ### Change the FPS Adjust how many frames per second the LLM processes: + ```python -llm=gemini.Realtime(fps=5) # Lower FPS = less expensive -llm=gemini.Realtime(fps=15) # Higher FPS = more detailed analysis +llm = gemini.Realtime(fps=5) # Lower FPS = less expensive +llm = gemini.Realtime(fps=15) # Higher FPS = more detailed analysis ``` ### Use OpenAI Instead of Gemini Switch to OpenAI's realtime API: + ```python agent = Agent( edge=getstream.Edge(), @@ -124,6 +134,7 @@ Both models support video processing with YOLO. 
### Modify the Coaching Style Edit the `golf_coach.md` file to change: + - The agent's personality - The coaching focus areas - The level of detail in feedback @@ -132,6 +143,7 @@ Edit the `golf_coach.md` file to change: ### Use Different YOLO Models Try other YOLO models for different use cases: + ```python # For general object detection ultralytics.YOLOProcessor(model_path="yolo11n.pt") @@ -148,7 +160,6 @@ ultralytics.YOLOPoseProcessor(model_path="yolo11n-pose.pt") 4. **Analysis**: The LLM watches the swing and evaluates technique 5. **Feedback**: The agent speaks feedback based on coaching guidelines - ## Learn More - [Building a Voice AI app](https://visionagents.ai/introduction/voice-agents) diff --git a/examples/02_golf_coach_example/golf_coach_example.py b/examples/02_golf_coach_example/golf_coach_example.py index fdd39f0b4..7b3bbd8d4 100644 --- a/examples/02_golf_coach_example/golf_coach_example.py +++ b/examples/02_golf_coach_example/golf_coach_example.py @@ -1,9 +1,9 @@ import logging from dotenv import load_dotenv -from vision_agents.core import Agent, User, cli +from vision_agents.core import Agent, Runner, User from vision_agents.core.agents import AgentLauncher -from vision_agents.plugins import getstream, ultralytics, gemini +from vision_agents.plugins import gemini, getstream, ultralytics logger = logging.getLogger(__name__) @@ -39,4 +39,4 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() diff --git a/examples/02_golf_coach_example/golf_coach_with_metrics.py b/examples/02_golf_coach_example/golf_coach_with_metrics.py index dd9f3a7e0..482bcade3 100644 --- a/examples/02_golf_coach_example/golf_coach_with_metrics.py +++ b/examples/02_golf_coach_example/golf_coach_with_metrics.py @@ -2,15 +2,15 @@ Run with: cd examples/02_golf_coach_example - uv run 
python golf_coach_with_metrics.py --call-type default --call-id test-metrics + uv run python golf_coach_with_metrics.py run --call-type default --call-id test-metrics Then open http://localhost:9464/metrics to see real-time metrics. """ # Configure OpenTelemetry BEFORE importing vision_agents from opentelemetry import metrics -from opentelemetry.sdk.metrics import MeterProvider from opentelemetry.exporter.prometheus import PrometheusMetricReader +from opentelemetry.sdk.metrics import MeterProvider from prometheus_client import start_http_server # Start Prometheus HTTP server on port 9464 @@ -26,11 +26,10 @@ import logging # noqa: E402 from dotenv import load_dotenv # noqa: E402 - -from vision_agents.core import User, Agent, cli # noqa: E402 +from vision_agents.core import Agent, Runner, User # noqa: E402 from vision_agents.core.agents import AgentLauncher # noqa: E402 from vision_agents.core.observability import MetricsCollector # noqa: E402 -from vision_agents.plugins import getstream, ultralytics, openai # noqa: E402 +from vision_agents.plugins import getstream, openai, ultralytics # noqa: E402 logger = logging.getLogger(__name__) logging.basicConfig(level=logging.INFO) @@ -85,4 +84,4 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() diff --git a/examples/04_football_commentator_example/RUNNING_THE_EXAMPLE.md b/examples/04_football_commentator_example/RUNNING_THE_EXAMPLE.md index f479834ca..6b01fc92e 100644 --- a/examples/04_football_commentator_example/RUNNING_THE_EXAMPLE.md +++ b/examples/04_football_commentator_example/RUNNING_THE_EXAMPLE.md @@ -1,12 +1,13 @@ # Football Commentator Example -A real-time AI sports commentator that watches football footage and provides play-by-play commentary using OpenAI Realtime and Roboflow object detection. 
+A real-time AI sports commentator that watches football footage and provides play-by-play commentary using OpenAI +Realtime and Roboflow object detection. ## Setup 1. **Get API keys:** - - OpenAI API key: https://platform.openai.com/api-keys - - GetStream API key: https://getstream.io + - OpenAI API key: https://platform.openai.com/api-keys + - GetStream API key: https://getstream.io 2. **Configure environment:** ```bash @@ -17,7 +18,7 @@ A real-time AI sports commentator that watches football footage and provides pla 3. **Run the example:** ```bash - uv run football_commentator_example.py + uv run football_commentator_example.py run ``` ## What It Does @@ -38,8 +39,9 @@ The agent: 4. Watch the annotated video and listen to the AI commentary To use a local video file instead of screen sharing: + ```bash -uv run football_commentator_example.py --video-track-override path/to/football.mp4 +uv run football_commentator_example.py run --video-track-override path/to/football.mp4 ``` ## Configuration @@ -49,8 +51,8 @@ Edit `football_commentator_example.py` to customize: ```python roboflow.RoboflowLocalDetectionProcessor( classes=["person", "sports ball"], # Objects to detect - conf_threshold=0.5, # Detection confidence (0-1) - fps=5, # Detection frame rate + conf_threshold=0.5, # Detection confidence (0-1) + fps=5, # Detection frame rate ) ``` diff --git a/examples/04_football_commentator_example/football_commentator_example.py b/examples/04_football_commentator_example/football_commentator_example.py index 5aaa5a080..f8bb60e40 100644 --- a/examples/04_football_commentator_example/football_commentator_example.py +++ b/examples/04_football_commentator_example/football_commentator_example.py @@ -2,12 +2,11 @@ import random from dotenv import load_dotenv -from vision_agents.core import Agent, User, cli +from utils import Debouncer +from vision_agents.core import Agent, Runner, User from vision_agents.core.agents import AgentLauncher from vision_agents.plugins import 
getstream, openai, roboflow -from utils import Debouncer - logger = logging.getLogger(__name__) load_dotenv() @@ -70,4 +69,4 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() diff --git a/examples/05_security_camera_example/README.md b/examples/05_security_camera_example/README.md index e4ca97736..b827dc66f 100644 --- a/examples/05_security_camera_example/README.md +++ b/examples/05_security_camera_example/README.md @@ -1,11 +1,14 @@ # Security Camera Demo -A real-time security camera demo with face recognition, package detection, and automated package theft response - including wanted poster generation and posting to X. +A real-time security camera demo with face recognition, package detection, and automated package theft response - +including wanted poster generation and posting to X. 
## Features -- 🎥 **Real-time Face Detection**: Uses [face_recognition](https://github.com/ageitgey/face_recognition) library for accurate face detection and recognition -- 📦 **Package Detection**: [YOLOv11](https://docs.ultralytics.com/models/yolo11/)-based object detection for packages and boxes +- 🎥 **Real-time Face Detection**: Uses [face_recognition](https://github.com/ageitgey/face_recognition) library for + accurate face detection and recognition +- 📦 **Package Detection**: [YOLOv11](https://docs.ultralytics.com/models/yolo11/)-based object detection for packages + and boxes - 🚨 **Package Theft Detection**: Detects when a package disappears and identifies the suspect - 🖼️ **Wanted Poster Generation**: Automatically creates a wanted poster when a package is "stolen" - 🐦 **X Integration**: Posts wanted posters to [X](https://developer.x.com) automatically @@ -73,7 +76,8 @@ The AI assistant has access to: - Python 3.13+ - Webcam/camera access - [GetStream](https://getstream.io) account for video transport -- API keys for [Gemini](https://ai.google.dev), [Deepgram](https://deepgram.com), and [ElevenLabs](https://elevenlabs.io) +- API keys for [Gemini](https://ai.google.dev), [Deepgram](https://deepgram.com), + and [ElevenLabs](https://elevenlabs.io) - (Optional) [X Developer API](https://developer.x.com) credentials for posting wanted posters ### Installation @@ -118,7 +122,7 @@ X_ACCESS_TOKEN_SECRET=your_x_access_token_secret ### Running the Demo ```bash -uv run security_camera_example.py +uv run security_camera_example.py run ``` The agent will join a call and start monitoring the video feed for faces and packages. 
@@ -150,15 +154,15 @@ You can adjust the processor parameters in `security_camera_example.py`: ```python security_processor = SecurityCameraProcessor( - fps=5, # Frames per second to process - time_window=1800, # Time window in seconds (30 min) - thumbnail_size=80, # Size of thumbnails in pixels - detection_interval=2.0, # Seconds between face detection with identity matching - bbox_update_interval=0.3, # Seconds between fast bbox updates for tracking + fps=5, # Frames per second to process + time_window=1800, # Time window in seconds (30 min) + thumbnail_size=80, # Size of thumbnails in pixels + detection_interval=2.0, # Seconds between face detection with identity matching + bbox_update_interval=0.3, # Seconds between fast bbox updates for tracking model_path="weights_custom.pt", # YOLO model for package detection - package_conf_threshold=0.7, # Package detection confidence threshold - max_tracked_packages=1, # Single-package mode for demo - face_match_tolerance=0.6, # Face matching tolerance (lower = stricter) + package_conf_threshold=0.7, # Package detection confidence threshold + max_tracked_packages=1, # Single-package mode for demo + face_match_tolerance=0.6, # Face matching tolerance (lower = stricter) ) ``` @@ -182,17 +186,23 @@ Uses a custom [YOLOv11](https://docs.ultralytics.com/models/yolo11/) model (`wei - Open_package - Package -The model runs package detection at configurable intervals with IoU-based tracking to maintain package identity across frames. +The model runs package detection at configurable intervals with IoU-based tracking to maintain package identity across +frames. ### About the Custom Model -The `weights_custom.pt` file is a YOLOv11 object detection model we trained using [Roboflow](https://roboflow.com) with [SAM 3](https://blog.roboflow.com/sam3/) for assisted labeling. SAM 3's text-prompt segmentation made it fast to annotate packages and boxes accurately. 
+The `weights_custom.pt` file is a YOLOv11 object detection model we trained using [Roboflow](https://roboflow.com) +with [SAM 3](https://blog.roboflow.com/sam3/) for assisted labeling. SAM 3's text-prompt segmentation made it fast to +annotate packages and boxes accurately. -**We are not distributing `weights_custom.pt`.** To run this demo, you'll need to provide your own YOLO model. +**We are not distributing `weights_custom.pt`.** To run this demo, you'll need to provide your own YOLO model. Options: -- **Train your own**: Use [Roboflow](https://roboflow.com) to label a dataset and train a YOLOv11 model. See their [YOLOv11 training guide](https://blog.roboflow.com/yolov11-how-to-train-custom-data/). -- **Find a pre-trained model**: Search [Roboflow Universe](https://universe.roboflow.com) for "package detection" datasets and models. + +- **Train your own**: Use [Roboflow](https://roboflow.com) to label a dataset and train a YOLOv11 model. See + their [YOLOv11 training guide](https://blog.roboflow.com/yolov11-how-to-train-custom-data/). +- **Find a pre-trained model**: Search [Roboflow Universe](https://universe.roboflow.com) for "package detection" + datasets and models. Place your model weights at `weights_custom.pt` in this directory, or change the `model_path` parameter. 
diff --git a/examples/05_security_camera_example/security_camera_example.py b/examples/05_security_camera_example/security_camera_example.py index c64334793..3a6f5dee7 100644 --- a/examples/05_security_camera_example/security_camera_example.py +++ b/examples/05_security_camera_example/security_camera_example.py @@ -15,11 +15,6 @@ import numpy as np from dotenv import load_dotenv - -from vision_agents.core import Agent, User, cli -from vision_agents.core.agents import AgentLauncher -from vision_agents.plugins import deepgram, elevenlabs, gemini, getstream - from poster_generator import generate_and_post_poster from security_camera_processor import ( PackageDetectedEvent, @@ -28,6 +23,9 @@ PersonDisappearedEvent, SecurityCameraProcessor, ) +from vision_agents.core import Agent, Runner, User +from vision_agents.core.agents import AgentLauncher +from vision_agents.plugins import deepgram, elevenlabs, gemini, getstream load_dotenv() @@ -278,4 +276,4 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() diff --git a/examples/06_prometheus_metrics_example/README.md b/examples/06_prometheus_metrics_example/README.md index 3f7a84cd9..70505b56b 100644 --- a/examples/06_prometheus_metrics_example/README.md +++ b/examples/06_prometheus_metrics_example/README.md @@ -5,6 +5,7 @@ Export real metrics from Stream Agents to Prometheus using OpenTelemetry. ## Overview This example demonstrates how to: + 1. Configure OpenTelemetry with a Prometheus exporter 2. Attach `MetricsCollector` to an agent for opt-in metrics collection 3. 
Scrape metrics from the `/metrics` endpoint during a live video call @@ -14,7 +15,7 @@ This example demonstrates how to: ```bash cd examples/03_prometheus_metrics_example uv sync -uv run python prometheus_metrics_example.py --call-type default --call-id test-metrics +uv run python prometheus_metrics_example.py run --call-type default --call-id test-metrics ``` Then open http://localhost:9464/metrics in your browser to see real-time metrics as you talk to the agent. @@ -22,6 +23,7 @@ Then open http://localhost:9464/metrics in your browser to see real-time metrics ## Metrics Available ### LLM Metrics + - `llm_latency_ms` - Total response latency (histogram) - `llm_time_to_first_token_ms` - Time to first token for streaming (histogram) - `llm_tokens_input` - Input/prompt tokens consumed (counter) @@ -31,21 +33,25 @@ Then open http://localhost:9464/metrics in your browser to see real-time metrics - `llm_tool_latency_ms` - Tool execution latency (histogram) ### STT Metrics + - `stt_latency_ms` - STT processing latency (histogram) - `stt_audio_duration_ms` - Duration of audio processed (histogram) - `stt_errors` - STT errors (counter) ### TTS Metrics + - `tts_latency_ms` - TTS synthesis latency (histogram) - `tts_audio_duration_ms` - Duration of synthesized audio (histogram) - `tts_characters` - Characters synthesized (counter) - `tts_errors` - TTS errors (counter) ### Turn Detection Metrics + - `turn_duration_ms` - Duration of detected turns (histogram) - `turn_trailing_silence_ms` - Trailing silence duration (histogram) ### Realtime LLM Metrics + - `realtime_sessions` - Realtime sessions started (counter) - `realtime_session_duration_ms` - Session duration (histogram) - `realtime_audio_input_bytes` - Audio bytes sent (counter) diff --git a/examples/06_prometheus_metrics_example/prometheus_metrics_example.py b/examples/06_prometheus_metrics_example/prometheus_metrics_example.py index 1c3b8440a..0c85966e4 100644 --- 
a/examples/06_prometheus_metrics_example/prometheus_metrics_example.py +++ b/examples/06_prometheus_metrics_example/prometheus_metrics_example.py @@ -22,8 +22,8 @@ # IMPORTANT: Configure OpenTelemetry BEFORE importing vision_agents # ============================================================================= from opentelemetry import metrics -from opentelemetry.sdk.metrics import MeterProvider from opentelemetry.exporter.prometheus import PrometheusMetricReader +from opentelemetry.sdk.metrics import MeterProvider from prometheus_client import start_http_server # Start Prometheus HTTP server on port 9464 @@ -42,11 +42,10 @@ from typing import Any, Dict # noqa: E402 from dotenv import load_dotenv # noqa: E402 - -from vision_agents.core import Agent, User, AgentLauncher, cli # noqa: E402 +from vision_agents.core import Agent, AgentLauncher, Runner, User # noqa: E402 from vision_agents.core.observability import MetricsCollector # noqa: E402 from vision_agents.core.utils.examples import get_weather_by_location # noqa: E402 -from vision_agents.plugins import deepgram, getstream, gemini, elevenlabs # noqa: E402 +from vision_agents.plugins import deepgram, elevenlabs, gemini, getstream # noqa: E402 load_dotenv() @@ -107,4 +106,4 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() diff --git a/examples/07_deploy_example/Dockerfile b/examples/07_deploy_example/Dockerfile index 8a746d977..305ee7b36 100644 --- a/examples/07_deploy_example/Dockerfile +++ b/examples/07_deploy_example/Dockerfile @@ -15,4 +15,4 @@ EXPOSE 8080 ENV UV_LINK_MODE=copy # Install deps at startup (faster push, slower cold start) -CMD ["sh", "-c", "uv sync --frozen -v && uv run deploy_example.py"] +CMD ["sh", "-c", "uv sync --frozen -v && uv run deploy_example.py run"] diff --git 
a/examples/07_deploy_example/Dockerfile.gpu b/examples/07_deploy_example/Dockerfile.gpu index 31519619b..8f3b45504 100644 --- a/examples/07_deploy_example/Dockerfile.gpu +++ b/examples/07_deploy_example/Dockerfile.gpu @@ -15,4 +15,4 @@ EXPOSE 8080 ENV UV_LINK_MODE=copy # Install deps at startup (faster push, slower cold start) -CMD ["sh", "-c", "uv sync --frozen -v && uv run deploy_example.py"] +CMD ["sh", "-c", "uv sync --frozen -v && uv run deploy_example.py run"] diff --git a/examples/07_deploy_example/README.md b/examples/07_deploy_example/README.md index 9b5bd0f5d..918d8c47f 100644 --- a/examples/07_deploy_example/README.md +++ b/examples/07_deploy_example/README.md @@ -1,11 +1,11 @@ - ## TODO / improvements - merge monitoring and HTTP efforts into this # Tips -* US-east. Services like Stream run a global edge network. But many providers default to US-east. So you typically want to run in US-east for optimal latency +* US-east. Services like Stream run a global edge network. But many providers default to US-east. So you typically want + to run in US-east for optimal latency * CPU build is quick to get up and running. GPU/CUDA takes hours. * This guide uses Nebius, but you could do this with other K8 enabled clouds quite easily * GPU setup needs more checks/testing @@ -21,7 +21,7 @@ cp .env.example .env Next fill in the required variables and run the example locally to verify everything works ``` -uv run deploy_example.py +uv run deploy_example.py run ``` # Requirements @@ -106,10 +106,12 @@ nebius mk8s node-group create \ ``` Available GPU presets: + - `1gpu-16vcpu-200gb` - 1x H200, 16 vCPU, 200GB RAM - `8gpu-128vcpu-1600gb` - 8x H200, 128 vCPU, 1.6TB RAM Available driver presets (see [Nebius GPU docs](https://docs.nebius.com/kubernetes/gpu/set-up)): + - `cuda12` - CUDA 12.4 (default) - `cuda12.4` - CUDA 12.4 - `cuda12.8` - CUDA 12.8 @@ -124,6 +126,7 @@ kubectl get nodes # verify connection # 1. 
Build the Docker image There are two Dockerfiles: + - `Dockerfile` - CPU version (python:3.13-slim, ~150MB) - `Dockerfile.gpu` - GPU version (pytorch:2.9.1-cuda12.8, ~8GB) @@ -193,6 +196,7 @@ kubectl create secret generic vision-agent-env --from-env-file=.env ``` To update secrets: + ``` kubectl delete secret vision-agent-env kubectl create secret generic vision-agent-env --from-env-file=.env diff --git a/examples/07_deploy_example/deploy_example.py b/examples/07_deploy_example/deploy_example.py index 5e7c35ada..fc8be3c6e 100644 --- a/examples/07_deploy_example/deploy_example.py +++ b/examples/07_deploy_example/deploy_example.py @@ -1,11 +1,10 @@ import logging -from typing import Dict, Any +from typing import Any, Dict from dotenv import load_dotenv - -from vision_agents.core import User, Agent, cli, AgentLauncher +from vision_agents.core import Agent, AgentLauncher, Runner, User from vision_agents.core.utils.examples import get_weather_by_location -from vision_agents.plugins import deepgram, getstream, gemini, elevenlabs +from vision_agents.plugins import deepgram, elevenlabs, gemini, getstream logger = logging.getLogger(__name__) @@ -51,4 +50,4 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() diff --git a/plugins/aws/example/aws_realtime_function_calling_example.py b/plugins/aws/example/aws_realtime_function_calling_example.py index 02cd14374..19d84dcb7 100644 --- a/plugins/aws/example/aws_realtime_function_calling_example.py +++ b/plugins/aws/example/aws_realtime_function_calling_example.py @@ -8,15 +8,13 @@ import asyncio import logging from typing import Dict -from typing_extensions import Any from dotenv import load_dotenv - -from vision_agents.core import User, Agent, cli +from typing_extensions import Any +from vision_agents.core import Agent, Runner, 
User from vision_agents.core.agents import AgentLauncher -from vision_agents.plugins import aws, getstream from vision_agents.core.utils.examples import get_weather_by_location - +from vision_agents.plugins import aws, getstream logger = logging.getLogger(__name__) @@ -101,4 +99,4 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() diff --git a/plugins/aws/example/aws_realtime_nova_example.py b/plugins/aws/example/aws_realtime_nova_example.py index 8fcf4f746..6a5ffb7c0 100644 --- a/plugins/aws/example/aws_realtime_nova_example.py +++ b/plugins/aws/example/aws_realtime_nova_example.py @@ -10,7 +10,7 @@ from dotenv import load_dotenv -from vision_agents.core import User, Agent, cli +from vision_agents.core import Agent, Runner, User from vision_agents.core.agents import AgentLauncher from vision_agents.plugins import aws, getstream @@ -46,4 +46,4 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() diff --git a/plugins/cartesia/example/README.md b/plugins/cartesia/example/README.md index 71745ea0e..1a96cef5c 100644 --- a/plugins/cartesia/example/README.md +++ b/plugins/cartesia/example/README.md @@ -1,6 +1,7 @@ # Stream + Cartesia TTS Bot Example -This example demonstrates how to build a text-to-speech bot that joins a Stream video call and greets participants using [Cartesia's](https://cartesia.ai/?utm_medium=partner&utm_source=getstream) Sonic voices. +This example demonstrates how to build a text-to-speech bot that joins a Stream video call and greets participants +using [Cartesia's](https://cartesia.ai/?utm_medium=partner&utm_source=getstream) Sonic voices.
## What it does @@ -35,6 +36,7 @@ You can use your preferred package manager, but we recommend [`uv`](https://docs ## Usage Run the example: + ```bash -uv run main.py +uv run main.py run ``` diff --git a/plugins/cartesia/example/main.py b/plugins/cartesia/example/main.py index 24835cda3..7c1dcafe7 100644 --- a/plugins/cartesia/example/main.py +++ b/plugins/cartesia/example/main.py @@ -20,9 +20,8 @@ import logging from dotenv import load_dotenv - +from vision_agents.core import Runner from vision_agents.core.agents import Agent, AgentLauncher -from vision_agents.core import cli from vision_agents.core.edge.types import User from vision_agents.plugins import cartesia, getstream, openai @@ -57,4 +56,4 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() diff --git a/plugins/cartesia/example/narrator-example.py b/plugins/cartesia/example/narrator-example.py index b92ddfd89..98e4f48e9 100644 --- a/plugins/cartesia/example/narrator-example.py +++ b/plugins/cartesia/example/narrator-example.py @@ -17,15 +17,14 @@ CARTESIA_API_KEY """ +import asyncio import logging from dotenv import load_dotenv - +from vision_agents.core import Runner from vision_agents.core.agents import Agent, AgentLauncher -from vision_agents.core import cli from vision_agents.core.edge.types import User -from vision_agents.plugins import cartesia, getstream, openai, deepgram -import asyncio +from vision_agents.plugins import cartesia, deepgram, getstream, openai logger = logging.getLogger(__name__) @@ -60,4 +59,4 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() diff --git 
a/plugins/decart/example/README.md b/plugins/decart/example/README.md index 95c45560b..68158e986 100644 --- a/plugins/decart/example/README.md +++ b/plugins/decart/example/README.md @@ -1,11 +1,15 @@ # Decart Storyteller Example -This example shows you how to build a real-time storytelling agent using [Vision Agents](https://visionagents.ai/) and [Decart](https://decart.ai/). The agent tells a story while transforming your video feed into an animated style that matches the narrative. +This example shows you how to build a real-time storytelling agent using [Vision Agents](https://visionagents.ai/) +and [Decart](https://decart.ai/). The agent tells a story while transforming your video feed into an animated style that +matches the narrative. In this example, the AI storyteller will: + - Listen to your voice input - Generate a story based on your interactions -- Use [Decart](https://decart.ai/) to restyle your video feed in real-time (e.g., "A cute animated movie with vibrant colours") +- Use [Decart](https://decart.ai/) to restyle your video feed in real-time (e.g., "A cute animated movie with vibrant + colours") - Change the video style dynamically as the story progresses - Speak with an expressive voice using [ElevenLabs](https://elevenlabs.io/) - Run on Stream's low-latency edge network @@ -14,11 +18,11 @@ In this example, the AI storyteller will: - Python 3.10 or higher - API keys for: - - [OpenAI](https://openai.com) (for the LLM) - - [Decart](https://decart.ai/) (for video restyling) - - [ElevenLabs](https://elevenlabs.io/) (for text-to-speech) - - [Deepgram](https://deepgram.com/) (for speech-to-text) - - [Stream](https://getstream.io/) (for video/audio infrastructure) + - [OpenAI](https://openai.com) (for the LLM) + - [Decart](https://decart.ai/) (for video restyling) + - [ElevenLabs](https://elevenlabs.io/) (for text-to-speech) + - [Deepgram](https://deepgram.com/) (for speech-to-text) + - [Stream](https://getstream.io/) (for video/audio infrastructure) ## 
Installation @@ -40,11 +44,13 @@ In this example, the AI storyteller will: ## Running the Example Run the agent: + ```bash -uv run decart_example.py +uv run decart_example.py run ``` The agent will: + 1. Create a video call 2. Open a demo UI in your browser 3. Join the call @@ -74,6 +80,7 @@ agent = Agent( ``` **Components:** + - `processor`: The Decart RestylingProcessor that transforms the video feed. - `llm`: The language model (GPT-4o-mini) that generates the story and controls the processor. - `tts`: ElevenLabs TTS for expressive voice output. @@ -93,7 +100,8 @@ async def change_prompt(prompt: str) -> str: return f"Prompt changed to {prompt}" ``` -This allows the LLM to call `change_prompt("A dark and stormy night")` to instantly change the visual style of the video to match the story's mood. +This allows the LLM to call `change_prompt("A dark and stormy night")` to instantly change the visual style of the video +to match the story's mood. ## Customization @@ -110,7 +118,8 @@ processor = decart.RestylingProcessor( ### Modify the Storytelling Persona -Edit the `instructions` passed to the `Agent` to change the storyteller's personality, tone, or the type of stories they tell. +Edit the `instructions` passed to the `Agent` to change the storyteller's personality, tone, or the type of stories they +tell. 
### Change the Voice diff --git a/plugins/decart/example/decart_example.py b/plugins/decart/example/decart_example.py index dd781802e..410b22fa8 100644 --- a/plugins/decart/example/decart_example.py +++ b/plugins/decart/example/decart_example.py @@ -1,7 +1,7 @@ import logging from dotenv import load_dotenv -from vision_agents.core import Agent, User, cli +from vision_agents.core import Agent, Runner, User from vision_agents.core.agents import AgentLauncher from vision_agents.plugins import decart, deepgram, elevenlabs, getstream, openai @@ -54,4 +54,4 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() diff --git a/plugins/deepgram/example/README.md b/plugins/deepgram/example/README.md index 2591a1f1c..3e305e64b 100644 --- a/plugins/deepgram/example/README.md +++ b/plugins/deepgram/example/README.md @@ -12,6 +12,7 @@ cp .env.example .env ``` Required environment variables: + - `DEEPGRAM_API_KEY` - Your Deepgram API key - `STREAM_API_KEY` - Your Stream API key - `STREAM_API_SECRET` - Your Stream API secret @@ -26,7 +27,7 @@ uv sync 3. Run the example: ```bash -uv run python deepgram_tts_example.py dev --call-type audio_room --call-id test +uv run python deepgram_tts_example.py run --call-type audio_room --call-id test ``` ## Features @@ -42,7 +43,7 @@ Deepgram offers various Aura voice models. You can customize the voice by passin ```python tts = deepgram.TTS(model="aura-2-thalia-en") # Default female voice -tts = deepgram.TTS(model="aura-2-orion-en") # Male voice +tts = deepgram.TTS(model="aura-2-orion-en") # Male voice ``` See [Deepgram TTS Models](https://developers.deepgram.com/docs/tts-models) for all available voices. 
diff --git a/plugins/deepgram/example/deepgram_tts_example.py b/plugins/deepgram/example/deepgram_tts_example.py index 960788b3c..47771ce43 100644 --- a/plugins/deepgram/example/deepgram_tts_example.py +++ b/plugins/deepgram/example/deepgram_tts_example.py @@ -19,11 +19,9 @@ import logging from dotenv import load_dotenv - -from vision_agents.core import User, Agent, cli +from vision_agents.core import Agent, Runner, User from vision_agents.core.agents import AgentLauncher -from vision_agents.plugins import deepgram, getstream, gemini - +from vision_agents.plugins import deepgram, gemini, getstream logger = logging.getLogger(__name__) @@ -64,4 +62,4 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() diff --git a/plugins/elevenlabs/example/README.md b/plugins/elevenlabs/example/README.md index 47381e7be..c93ccb598 100644 --- a/plugins/elevenlabs/example/README.md +++ b/plugins/elevenlabs/example/README.md @@ -1,10 +1,12 @@ # ElevenLabs TTS and STT Example -This directory contains an example demonstrating how to use the ElevenLabs TTS and Scribe v2 STT plugins with Vision Agents. +This directory contains an example demonstrating how to use the ElevenLabs TTS and Scribe v2 STT plugins with Vision +Agents. ## Overview -This example creates an AI agent that uses ElevenLabs' state-of-the-art voice technology for both speech synthesis and recognition. +This example creates an AI agent that uses ElevenLabs' state-of-the-art voice technology for both speech synthesis and +recognition. ## Features @@ -17,12 +19,14 @@ This example creates an AI agent that uses ElevenLabs' state-of-the-art voice te ## Setup 1. Install dependencies: + ```bash cd plugins/elevenlabs/example uv sync ``` 2. 
Create a `.env` file with your API keys: + ```bash # Required for ElevenLabs TTS and STT ELEVENLABS_API_KEY=your_elevenlabs_api_key @@ -38,10 +42,11 @@ GEMINI_API_KEY=your_gemini_api_key ## Running the Example ```bash -uv run elevenlabs_example.py +uv run elevenlabs_example.py run ``` The agent will: + 1. Connect to the GetStream edge network 2. Initialize ElevenLabs TTS and Scribe v2 STT 3. Join a call and greet you @@ -82,19 +87,21 @@ Adjust turn detection sensitivity: ```python turn_detection = smart_turn.TurnDetection( - buffer_in_seconds=2.0, # How long to wait for speech - confidence_threshold=0.5, # How confident to be before ending turn + buffer_in_seconds=2.0, # How long to wait for speech + confidence_threshold=0.5, # How confident to be before ending turn ) ``` ## ElevenLabs Models ### TTS Models + - `eleven_multilingual_v2`: High-quality, emotionally rich (default) - `eleven_flash_v2_5`: Ultra-fast with low latency (~75ms) - `eleven_turbo_v2_5`: Balanced quality and speed ### STT Model + - `scribe_v2_realtime`: Real-time transcription with 99 language support ## Architecture @@ -121,16 +128,19 @@ User Hears Response ## Troubleshooting ### No audio output + - Verify your `ELEVENLABS_API_KEY` is valid - Check your audio device settings - Ensure GetStream connection is established ### Poor transcription quality + - Use 16kHz sample rate audio for optimal results - Speak clearly and avoid background noise - Adjust `vad_threshold` if needed ### High latency + - Consider using `eleven_flash_v2_5` for TTS - Check your network connection - Reduce `buffer_in_seconds` in turn detection diff --git a/plugins/elevenlabs/example/elevenlabs_example.py b/plugins/elevenlabs/example/elevenlabs_example.py index ed0ca74d3..7222c5351 100644 --- a/plugins/elevenlabs/example/elevenlabs_example.py +++ b/plugins/elevenlabs/example/elevenlabs_example.py @@ -17,11 +17,9 @@ import logging from dotenv import load_dotenv - -from vision_agents.core import User, Agent, cli +from 
vision_agents.core import Agent, Runner, User from vision_agents.core.agents import AgentLauncher -from vision_agents.plugins import elevenlabs, getstream, smart_turn, gemini - +from vision_agents.plugins import elevenlabs, gemini, getstream, smart_turn logger = logging.getLogger(__name__) @@ -55,4 +53,4 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() diff --git a/plugins/fast_whisper/example/fast_whisper_example.py b/plugins/fast_whisper/example/fast_whisper_example.py index ff34982dd..790eae6a4 100644 --- a/plugins/fast_whisper/example/fast_whisper_example.py +++ b/plugins/fast_whisper/example/fast_whisper_example.py @@ -9,10 +9,9 @@ import logging from dotenv import load_dotenv - -from vision_agents.core import User, Agent, cli +from vision_agents.core import Agent, Runner, User from vision_agents.core.agents import AgentLauncher -from vision_agents.plugins import fast_whisper, getstream, gemini, elevenlabs, vogent +from vision_agents.plugins import elevenlabs, fast_whisper, gemini, getstream, vogent logger = logging.getLogger(__name__) @@ -57,4 +56,4 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() diff --git a/plugins/fish/example/README.md b/plugins/fish/example/README.md index ace283025..ed285f50f 100644 --- a/plugins/fish/example/README.md +++ b/plugins/fish/example/README.md @@ -5,20 +5,24 @@ This directory contains examples demonstrating how to use the Fish Audio TTS plu ## Examples ### 1. Simple TTS Example (`simple_tts_example.py`) + Basic usage of Fish Audio TTS without a full agent setup. Perfect for testing or simple integrations. ### 2. 
Full Agent Example (`fish_tts_example.py`) + Complete agent setup with Fish Audio TTS, Deepgram STT, and real-time communication. ## Setup 1. Install dependencies: + ```bash cd plugins/fish/example uv sync ``` 2. Create a `.env` file with your API keys: + ```bash # Required for Fish Audio TTS FISH_AUDIO_API_KEY=your_fish_audio_api_key @@ -31,19 +35,16 @@ STREAM_API_SECRET=your_stream_api_secret ## Running the Examples -### Simple TTS Example -```bash -uv run simple_tts_example.py -``` - ### Full Agent Example + ```bash -uv run fish_tts_example.py +uv run fish_example.py run ``` ## What it does The example creates an AI agent that: + - Uses **Fish Audio** for high-quality text-to-speech synthesis - Uses **Deepgram** for speech-to-text transcription - Uses **GetStream** for real-time communication diff --git a/plugins/fish/example/fish_example.py b/plugins/fish/example/fish_example.py index a3639e2cb..9104f6649 100644 --- a/plugins/fish/example/fish_example.py +++ b/plugins/fish/example/fish_example.py @@ -18,11 +18,9 @@ import logging from dotenv import load_dotenv - -from vision_agents.core import User, Agent, cli +from vision_agents.core import Agent, Runner, User from vision_agents.core.agents import AgentLauncher -from vision_agents.plugins import fish, getstream, smart_turn, gemini - +from vision_agents.plugins import fish, gemini, getstream, smart_turn logger = logging.getLogger(__name__) @@ -66,4 +64,4 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() diff --git a/plugins/heygen/example/README.md b/plugins/heygen/example/README.md index a92061714..eff49fb7b 100644 --- a/plugins/heygen/example/README.md +++ b/plugins/heygen/example/README.md @@ -10,7 +10,8 @@ Uses a standard streaming LLM (Gemini) with separate TTS/STT components. Best fo ### 2. 
Realtime LLM (`avatar_realtime_example.py`) -Uses Gemini Realtime with native audio input/output. The avatar lip-syncs to the transcribed text while Gemini handles voice processing. +Uses Gemini Realtime with native audio input/output. The avatar lip-syncs to the transcribed text while Gemini handles +voice processing. ## Setup @@ -30,6 +31,7 @@ cp .env.example .env ``` **For Standard Example** (`avatar_example.py`): + - `HEYGEN_API_KEY` - Get from [HeyGen](https://heygen.com) - `STREAM_API_KEY` and `STREAM_SECRET` - Get from [GetStream](https://getstream.io) - `CARTESIA_API_KEY` - Get from [Cartesia](https://cartesia.ai) @@ -37,6 +39,7 @@ cp .env.example .env - `GOOGLE_API_KEY` - Get from [Google AI Studio](https://makersuite.google.com/app/apikey) **For Realtime Example** (`avatar_realtime_example.py`): + - `HEYGEN_API_KEY` - Get from [HeyGen](https://heygen.com) - `STREAM_API_KEY` and `STREAM_SECRET` - Get from [GetStream](https://getstream.io) - `GOOGLE_API_KEY` - Get from [Google AI Studio](https://makersuite.google.com/app/apikey) @@ -46,16 +49,19 @@ cp .env.example .env From the project root: **Standard Streaming LLM:** + ```bash -uv run plugins/heygen/example/avatar_example.py +uv run plugins/heygen/example/avatar_example.py run ``` **Realtime LLM:** + ```bash -uv run plugins/heygen/example/avatar_realtime_example.py +uv run plugins/heygen/example/avatar_realtime_example.py run ``` Both will: + 1. Start an AI agent with a HeyGen avatar 2. Open a demo UI in your browser 3. The avatar will speak and be ready to chat @@ -65,41 +71,41 @@ Both will: ### Standard Streaming LLM (`avatar_example.py`) 1. 
**Agent Setup**: The agent is configured with: - - Gemini LLM for generating responses - - Cartesia TTS for speech synthesis - - Deepgram STT for speech recognition - - HeyGen AvatarPublisher for avatar video + - Gemini LLM for generating responses + - Cartesia TTS for speech synthesis + - Deepgram STT for speech recognition + - HeyGen AvatarPublisher for avatar video 2. **Avatar Streaming**: When the agent speaks: - - Text is generated by Gemini LLM - - Text is sent to HeyGen for lip-sync - - Audio is synthesized by Cartesia TTS - - HeyGen generates avatar video with lip-sync - - Avatar video and audio are streamed to the call + - Text is generated by Gemini LLM + - Text is sent to HeyGen for lip-sync + - Audio is synthesized by Cartesia TTS + - HeyGen generates avatar video with lip-sync + - Avatar video and audio are streamed to the call 3. **User Interaction**: When you speak: - - Audio is captured from your microphone - - Transcribed to text by Deepgram - - Sent to Gemini LLM for processing - - Response is generated and spoken through the avatar + - Audio is captured from your microphone + - Transcribed to text by Deepgram + - Sent to Gemini LLM for processing + - Response is generated and spoken through the avatar ### Realtime LLM (`avatar_realtime_example.py`) 1. **Agent Setup**: The agent is configured with: - - Gemini Realtime for native audio processing - - HeyGen AvatarPublisher for avatar video + - Gemini Realtime for native audio processing + - HeyGen AvatarPublisher for avatar video 2. 
**Avatar Streaming**: When the agent speaks: - - Gemini Realtime generates audio directly (24kHz PCM) - - Text transcription is sent to HeyGen for lip-sync - - HeyGen generates avatar video with lip-sync - - Gemini's audio is used (HeyGen audio is not forwarded for Realtime LLMs) - - Avatar video and Gemini audio are streamed to the call + - Gemini Realtime generates audio directly (24kHz PCM) + - Text transcription is sent to HeyGen for lip-sync + - HeyGen generates avatar video with lip-sync + - Gemini's audio is used (HeyGen audio is not forwarded for Realtime LLMs) + - Avatar video and Gemini audio are streamed to the call 3. **User Interaction**: When you speak: - - Audio is captured and sent directly to Gemini Realtime - - Gemini processes audio natively (no separate STT needed) - - Response is generated and spoken through the avatar + - Audio is captured and sent directly to Gemini Realtime + - Gemini processes audio natively (no separate STT needed) + - Response is generated and spoken through the avatar ## Customization @@ -133,6 +139,7 @@ heygen.AvatarPublisher( ### Using a Different LLM **With Standard Streaming LLM:** + ```python from vision_agents.plugins import openai, elevenlabs @@ -150,6 +157,7 @@ agent = Agent( ``` **With Realtime LLM:** + ```python from vision_agents.plugins import openai diff --git a/plugins/heygen/example/avatar_example.py b/plugins/heygen/example/avatar_example.py index d056356c4..1dd2a08dc 100644 --- a/plugins/heygen/example/avatar_example.py +++ b/plugins/heygen/example/avatar_example.py @@ -12,13 +12,11 @@ import logging from dotenv import load_dotenv - -from vision_agents.core import User, Agent, cli +from vision_agents.core import Agent, Runner, User from vision_agents.core.agents import AgentLauncher -from vision_agents.plugins import getstream, gemini, heygen, deepgram +from vision_agents.plugins import deepgram, gemini, getstream, heygen from vision_agents.plugins.heygen import VideoQuality - logger = 
logging.getLogger(__name__) load_dotenv() @@ -70,4 +68,4 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() diff --git a/plugins/heygen/example/avatar_realtime_example.py b/plugins/heygen/example/avatar_realtime_example.py index 3714c0c52..d587505eb 100644 --- a/plugins/heygen/example/avatar_realtime_example.py +++ b/plugins/heygen/example/avatar_realtime_example.py @@ -9,13 +9,11 @@ import logging from dotenv import load_dotenv - -from vision_agents.core import User, Agent, cli +from vision_agents.core import Agent, Runner, User from vision_agents.core.agents import AgentLauncher -from vision_agents.plugins import getstream, gemini, heygen +from vision_agents.plugins import gemini, getstream, heygen from vision_agents.plugins.heygen import VideoQuality - logger = logging.getLogger(__name__) load_dotenv() @@ -66,4 +64,4 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() diff --git a/plugins/huggingface/example/README.md b/plugins/huggingface/example/README.md index f7f5e7a0a..679acd699 100644 --- a/plugins/huggingface/example/README.md +++ b/plugins/huggingface/example/README.md @@ -1,6 +1,7 @@ # HuggingFace LLM Example -This example demonstrates how to use HuggingFace's Inference Providers API with Vision Agents to create a conversational voice agent. +This example demonstrates how to use HuggingFace's Inference Providers API with Vision Agents to create a conversational +voice agent. 
## Setup @@ -12,6 +13,7 @@ cp .env.example .env ``` Required environment variables: + - `HF_TOKEN` - Your HuggingFace API token - `STREAM_API_KEY` - Your Stream API key - `STREAM_API_SECRET` - Your Stream API secret @@ -26,7 +28,7 @@ uv sync 3. Run the example: ```bash -uv run main.py dev +uv run main.py run ``` ## Features diff --git a/plugins/huggingface/example/main.py b/plugins/huggingface/example/main.py index bd75a8994..a1d9c3f2a 100644 --- a/plugins/huggingface/example/main.py +++ b/plugins/huggingface/example/main.py @@ -19,10 +19,9 @@ import logging from dotenv import load_dotenv - -from vision_agents.core import User, Agent, cli +from vision_agents.core import Agent, Runner, User from vision_agents.core.agents import AgentLauncher -from vision_agents.plugins import huggingface, getstream, deepgram +from vision_agents.plugins import deepgram, getstream, huggingface logger = logging.getLogger(__name__) @@ -63,4 +62,4 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() diff --git a/plugins/inworld/example/inworld_tts_example.py b/plugins/inworld/example/inworld_tts_example.py index 9c512349a..2fd210c06 100644 --- a/plugins/inworld/example/inworld_tts_example.py +++ b/plugins/inworld/example/inworld_tts_example.py @@ -19,11 +19,9 @@ import logging from dotenv import load_dotenv - -from vision_agents.core import User, Agent, cli +from vision_agents.core import Agent, Runner, User from vision_agents.core.agents import AgentLauncher -from vision_agents.plugins import inworld, getstream, smart_turn, gemini, deepgram - +from vision_agents.plugins import deepgram, gemini, getstream, inworld, smart_turn logger = logging.getLogger(__name__) @@ -65,4 +63,4 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non if __name__ == "__main__": 
- cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() diff --git a/plugins/kokoro/example/README.md b/plugins/kokoro/example/README.md index e669ba02a..ba87c163f 100644 --- a/plugins/kokoro/example/README.md +++ b/plugins/kokoro/example/README.md @@ -4,7 +4,8 @@ This directory contains an example demonstrating how to use the Kokoro TTS plugi ## Overview -Kokoro is an open-weight, offline TTS model that runs locally without requiring API keys. This makes it ideal for privacy-focused applications or environments without internet access. +Kokoro is an open-weight, offline TTS model that runs locally without requiring API keys. This makes it ideal for +privacy-focused applications or environments without internet access. ## Features @@ -30,12 +31,14 @@ Kokoro is an open-weight, offline TTS model that runs locally without requiring ## Setup 1. Install dependencies: + ```bash cd plugins/kokoro/example uv sync ``` 2. Create a `.env` file with your API keys: + ```bash # Required for GetStream STREAM_API_KEY=your_stream_api_key @@ -48,10 +51,11 @@ OPENAI_API_KEY=your_openai_api_key ## Running the Example ```bash -uv run kokoro_example.py +uv run kokoro_example.py run ``` The agent will: + 1. Connect to the GetStream edge network 2. Initialize Kokoro TTS (downloads model on first run) 3. Join a call and greet participants when they join @@ -73,7 +77,8 @@ tts = kokoro.TTS(speed=1.2) ### Available Voices -See the [Kokoro model card](https://huggingface.co/NeuML/kokoro-int8-onnx#speaker-reference) for available voice presets. +See the [Kokoro model card](https://huggingface.co/NeuML/kokoro-int8-onnx#speaker-reference) for available voice +presets. 
## Architecture @@ -92,15 +97,18 @@ User Hears Greeting ## Troubleshooting ### No audio output + - Verify espeak-ng is installed: `espeak-ng --version` - Check that the Kokoro model downloaded successfully - Ensure GetStream connection is established ### Model download issues + - First run downloads the model (~300MB) - Ensure you have internet access for the initial download - Models are cached locally after first download ### Audio quality + - Kokoro outputs at 24kHz sample rate - Ensure your audio track uses matching sample rate diff --git a/plugins/kokoro/example/kokoro_example.py b/plugins/kokoro/example/kokoro_example.py index 495d2cc6d..fb763502a 100644 --- a/plugins/kokoro/example/kokoro_example.py +++ b/plugins/kokoro/example/kokoro_example.py @@ -19,11 +19,9 @@ import logging from dotenv import load_dotenv - -from vision_agents.core import User, Agent, cli +from vision_agents.core import Agent, Runner, User from vision_agents.core.agents import AgentLauncher -from vision_agents.plugins import kokoro, getstream, openai - +from vision_agents.plugins import getstream, kokoro, openai logger = logging.getLogger(__name__) @@ -53,4 +51,4 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() diff --git a/plugins/moondream/README.md b/plugins/moondream/README.md index a8ca442d8..25a7b0088 100644 --- a/plugins/moondream/README.md +++ b/plugins/moondream/README.md @@ -1,11 +1,13 @@ # Moondream Plugin This plugin provides Moondream 3 vision capabilities for vision-agents, including: + - **Object Detection**: Real-time zero-shot object detection on video streams - **Visual Question Answering (VQA)**: Answer questions about video frames - **Image Captioning**: Generate descriptions of video frames -Choose between cloud-hosted or local processing depending on your 
needs. When running locally, we recommend you do so on CUDA enabled devices. +Choose between cloud-hosted or local processing depending on your needs. When running locally, we recommend you do so on +CUDA enabled devices. ## Installation @@ -18,26 +20,31 @@ uv add vision-agents[moondream] ### Detection Processors #### CloudDetectionProcessor (Recommended for Most Users) + - **Use when:** You want a simple setup with no infrastructure management - **Pros:** No model download, no GPU required, automatic updates - **Cons:** Requires API key, 2 RPS rate limit by default (can be increased) - **Best for:** Development, testing, low-to-medium volume applications #### LocalDetectionProcessor (For Advanced Users) + - **Use when:** You need higher throughput, have your own GPU infrastructure, or want to avoid rate limits - **Pros:** No rate limits, no API costs, full control over hardware - **Cons:** Requires GPU for best performance, model download on first use, infrastructure management -- **Best for:** Production deployments, high-volume applications, Digital Ocean Gradient AI GPUs, or custom infrastructure +- **Best for:** Production deployments, high-volume applications, Digital Ocean Gradient AI GPUs, or custom + infrastructure ### Vision Language Models (VLM) #### CloudVLM (Recommended for Most Users) + - **Use when:** You want visual question answering or captioning without managing infrastructure - **Pros:** No model download, no GPU required, automatic updates - **Cons:** Requires API key, rate limits apply - **Best for:** Development, testing, applications requiring VQA or captioning #### LocalVLM (For Advanced Users) + - **Use when:** You need VQA or captioning with higher throughput or want to avoid rate limits - **Pros:** No rate limits, no API costs, full control over hardware - **Cons:** Requires GPU for best performance, model download on first use, infrastructure management @@ -47,7 +54,8 @@ uv add vision-agents[moondream] ### Using CloudDetectionProcessor 
(Hosted) -The `CloudDetectionProcessor` uses Moondream's hosted API. By default it has a 2 RPS (requests per second) rate limit and requires an API key. The rate limit can be adjusted by contacting the Moondream team to request a higher limit. +The `CloudDetectionProcessor` uses Moondream's hosted API. By default it has a 2 RPS (requests per second) rate limit +and requires an API key. The rate limit can be adjusted by contacting the Moondream team to request a higher limit. ```python from vision_agents.plugins import moondream @@ -70,9 +78,12 @@ agent = Agent( ### Using LocalDetectionProcessor (On-Device) -If you are running on your own infrastructure or using a service like Digital Ocean's Gradient AI GPUs, you can use the `LocalDetectionProcessor` which downloads the model from HuggingFace and runs on device. By default it will use CUDA for best performance. Performance will vary depending on your specific hardware configuration. +If you are running on your own infrastructure or using a service like Digital Ocean's Gradient AI GPUs, you can use the +`LocalDetectionProcessor` which downloads the model from HuggingFace and runs on device. By default it will use CUDA for +best performance. Performance will vary depending on your specific hardware configuration. **Note:** The moondream3-preview model is gated and requires HuggingFace authentication: + - Request access at https://huggingface.co/moondream/moondream3-preview - Set `HF_TOKEN` environment variable: `export HF_TOKEN=your_token_here` - Or run: `huggingface-cli login` @@ -112,26 +123,28 @@ processor = moondream.CloudDetectionProcessor( ### Using CloudVLM (Hosted) -The `CloudVLM` uses Moondream's hosted API for visual question answering and captioning. It automatically processes video frames and responds to questions asked via STT (Speech-to-Text). +The `CloudVLM` uses Moondream's hosted API for visual question answering and captioning. 
It automatically processes +video frames and responds to questions asked via STT (Speech-to-Text). ```python import asyncio import os from dotenv import load_dotenv -from vision_agents.core import User, Agent, cli +from vision_agents.core import User, Agent, Runner from vision_agents.core.agents import AgentLauncher from vision_agents.plugins import deepgram, getstream, elevenlabs, moondream from vision_agents.core.events import CallSessionParticipantJoinedEvent load_dotenv() + async def create_agent(**kwargs) -> Agent: # Create a cloud VLM for visual question answering llm = moondream.CloudVLM( api_key=os.getenv("MOONDREAM_API_KEY"), # or set MOONDREAM_API_KEY env var mode="vqa", # or "caption" for image captioning ) - + agent = Agent( edge=getstream.Edge(), agent_user=User(name="My happy AI friend", id="agent"), @@ -141,6 +154,7 @@ async def create_agent(**kwargs) -> Agent: ) return agent + async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None: await agent.create_user() call = await agent.create_call(call_type, call_id) @@ -155,8 +169,9 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non async with agent.join(call): await agent.finish() + if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() ``` ### Using LocalVLM (On-Device) @@ -164,6 +179,7 @@ if __name__ == "__main__": The `LocalVLM` downloads the model from HuggingFace and runs on device. It supports both VQA and captioning modes. **Note:** The moondream3-preview model is gated and requires HuggingFace authentication: + - Request access at https://huggingface.co/moondream/moondream3-preview - Set `HF_TOKEN` environment variable: `export HF_TOKEN=your_token_here` - Or run: `huggingface-cli login` @@ -212,45 +228,59 @@ llm = moondream.CloudVLM( ### CloudDetectionProcessor Parameters -- `api_key`: str - API key for Moondream Cloud API. 
If not provided, will attempt to read from `MOONDREAM_API_KEY` environment variable. -- `detect_objects`: str | List[str] - Object(s) to detect using zero-shot detection. Can be any object name like "person", "car", "basketball". Default: `"person"` +- `api_key`: str - API key for Moondream Cloud API. If not provided, will attempt to read from `MOONDREAM_API_KEY` + environment variable. +- `detect_objects`: str | List[str] - Object(s) to detect using zero-shot detection. Can be any object name like " + person", "car", "basketball". Default: `"person"` - `conf_threshold`: float - Confidence threshold for detections (default: 0.3) - `fps`: int - Frame processing rate (default: 30) - `interval`: int - Processing interval in seconds (default: 0) - `max_workers`: int - Thread pool size for CPU-intensive operations (default: 10) -**Rate Limits:** By default, the Moondream Cloud API has a 2rps (requests per second) rate limit. Contact the Moondream team to request a higher limit. +**Rate Limits:** By default, the Moondream Cloud API has a 2rps (requests per second) rate limit. Contact the Moondream +team to request a higher limit. ### LocalDetectionProcessor Parameters -- `detect_objects`: str | List[str] - Object(s) to detect using zero-shot detection. Can be any object name like "person", "car", "basketball". Default: `"person"` +- `detect_objects`: str | List[str] - Object(s) to detect using zero-shot detection. Can be any object name like " + person", "car", "basketball". Default: `"person"` - `conf_threshold`: float - Confidence threshold for detections (default: 0.3) - `fps`: int - Frame processing rate (default: 30) - `interval`: int - Processing interval in seconds (default: 0) - `max_workers`: int - Thread pool size for CPU-intensive operations (default: 10) -- `force_cpu`: bool - If True, force CPU usage even if CUDA/MPS is available. Auto-detects CUDA, then MPS (Apple Silicon), then defaults to CPU. We recommend running on CUDA for best performance. 
(default: False) +- `force_cpu`: bool - If True, force CPU usage even if CUDA/MPS is available. Auto-detects CUDA, then MPS (Apple + Silicon), then defaults to CPU. We recommend running on CUDA for best performance. (default: False) - `model_name`: str - Hugging Face model identifier (default: "moondream/moondream3-preview") -- `options`: AgentOptions - Model directory configuration. If not provided, uses default which defaults to tempfile.gettempdir() +- `options`: AgentOptions - Model directory configuration. If not provided, uses default which defaults to + tempfile.gettempdir() -**Performance:** Performance will vary depending on your hardware configuration. CUDA is recommended for best performance on NVIDIA GPUs. The model will be downloaded from HuggingFace on first use. +**Performance:** Performance will vary depending on your hardware configuration. CUDA is recommended for best +performance on NVIDIA GPUs. The model will be downloaded from HuggingFace on first use. ### CloudVLM Parameters -- `api_key`: str - API key for Moondream Cloud API. If not provided, will attempt to read from `MOONDREAM_API_KEY` environment variable. -- `mode`: Literal["vqa", "caption"] - "vqa" for visual question answering or "caption" for image captioning (default: "vqa") +- `api_key`: str - API key for Moondream Cloud API. If not provided, will attempt to read from `MOONDREAM_API_KEY` + environment variable. +- `mode`: Literal["vqa", "caption"] - "vqa" for visual question answering or "caption" for image captioning (default: " + vqa") - `max_workers`: int - Thread pool size for CPU-intensive operations (default: 10) -**Rate Limits:** By default, the Moondream Cloud API has rate limits. Contact the Moondream team to request higher limits. +**Rate Limits:** By default, the Moondream Cloud API has rate limits. Contact the Moondream team to request higher +limits. 
### LocalVLM Parameters -- `mode`: Literal["vqa", "caption"] - "vqa" for visual question answering or "caption" for image captioning (default: "vqa") +- `mode`: Literal["vqa", "caption"] - "vqa" for visual question answering or "caption" for image captioning (default: " + vqa") - `max_workers`: int - Thread pool size for async operations (default: 10) -- `force_cpu`: bool - If True, force CPU usage even if CUDA/MPS is available. Auto-detects CUDA, then MPS (Apple Silicon), then defaults to CPU. Note: MPS is automatically converted to CPU due to model compatibility. We recommend running on CUDA for best performance. (default: False) +- `force_cpu`: bool - If True, force CPU usage even if CUDA/MPS is available. Auto-detects CUDA, then MPS (Apple + Silicon), then defaults to CPU. Note: MPS is automatically converted to CPU due to model compatibility. We recommend + running on CUDA for best performance. (default: False) - `model_name`: str - Hugging Face model identifier (default: "moondream/moondream3-preview") - `options`: AgentOptions - Model directory configuration. If not provided, uses default_agent_options() -**Performance:** Performance will vary depending on your hardware configuration. CUDA is recommended for best performance on NVIDIA GPUs. The model will be downloaded from HuggingFace on first use. +**Performance:** Performance will vary depending on your hardware configuration. CUDA is recommended for best +performance on NVIDIA GPUs. The model will be downloaded from HuggingFace on first use. 
## Video Publishing @@ -284,6 +314,7 @@ pytest plugins/moondream/tests/ -k "annotation" -v ## Dependencies ### Required + - `vision-agents` - Core framework - `moondream` - Moondream SDK for cloud API (CloudDetectionProcessor and CloudVLM) - `numpy>=2.0.0` - Array operations @@ -292,10 +323,12 @@ pytest plugins/moondream/tests/ -k "annotation" -v - `aiortc` - WebRTC support ### Local Components Additional Dependencies + - `torch` - PyTorch for model inference - `transformers` - HuggingFace transformers library for model loading -**Note:** LocalDetectionProcessor and LocalVLM both require these dependencies. We recommend only running the model locally on CUDA devices. +**Note:** LocalDetectionProcessor and LocalVLM both require these dependencies. We recommend only running the model +locally on CUDA devices. ## Links diff --git a/plugins/moondream/example/moondream_vlm_example.py b/plugins/moondream/example/moondream_vlm_example.py index 1b689587f..0b5e19d68 100644 --- a/plugins/moondream/example/moondream_vlm_example.py +++ b/plugins/moondream/example/moondream_vlm_example.py @@ -1,12 +1,12 @@ import asyncio import logging -from dotenv import load_dotenv +import os -from vision_agents.core import User, Agent, cli +from dotenv import load_dotenv +from vision_agents.core import Agent, Runner, User from vision_agents.core.agents import AgentLauncher -from vision_agents.plugins import deepgram, getstream, elevenlabs, moondream from vision_agents.core.events import CallSessionParticipantJoinedEvent -import os +from vision_agents.plugins import deepgram, elevenlabs, getstream, moondream logger = logging.getLogger(__name__) @@ -47,4 +47,4 @@ async def on_participant_joined(event: CallSessionParticipantJoinedEvent): if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() diff --git a/plugins/nvidia/example/README.md b/plugins/nvidia/example/README.md index 
32ca125f8..a4d79054f 100644 --- a/plugins/nvidia/example/README.md +++ b/plugins/nvidia/example/README.md @@ -37,10 +37,11 @@ ELEVENLABS_API_KEY=your_elevenlabs_api_key ## Running the Example ```bash -uv run main.py +uv run main.py run ``` The agent will: + 1. Join a video call 2. Automatically buffer video frames 3. Respond to user questions about what it sees in the video @@ -49,6 +50,7 @@ The agent will: ## Usage Once the agent joins the call: + - Speak to the agent and ask questions about what it sees - The agent will analyze the video frames and respond - Example questions: "What do you see?", "Describe the scene", "What's happening?" diff --git a/plugins/nvidia/example/main.py b/plugins/nvidia/example/main.py index 489288baf..907200a50 100644 --- a/plugins/nvidia/example/main.py +++ b/plugins/nvidia/example/main.py @@ -17,10 +17,9 @@ import logging from dotenv import load_dotenv - -from vision_agents.core import User, Agent, cli +from vision_agents.core import Agent, Runner, User from vision_agents.core.agents import AgentLauncher -from vision_agents.plugins import nvidia, getstream, deepgram, elevenlabs +from vision_agents.plugins import deepgram, elevenlabs, getstream, nvidia logger = logging.getLogger(__name__) @@ -61,4 +60,4 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() diff --git a/plugins/openai/examples/qwen_vl_example/qwen_vl_example.py b/plugins/openai/examples/qwen_vl_example/qwen_vl_example.py index 65d385db7..4f39790f4 100644 --- a/plugins/openai/examples/qwen_vl_example/qwen_vl_example.py +++ b/plugins/openai/examples/qwen_vl_example/qwen_vl_example.py @@ -1,7 +1,7 @@ import asyncio from dotenv import load_dotenv -from vision_agents.core import Agent, User, cli +from vision_agents.core import Agent, Runner, User from 
vision_agents.core.agents import AgentLauncher from vision_agents.core.events import CallSessionParticipantJoinedEvent from vision_agents.plugins import deepgram, elevenlabs, getstream, openai @@ -43,4 +43,4 @@ async def on_participant_joined(event: CallSessionParticipantJoinedEvent): if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() diff --git a/plugins/openrouter/example/openrouter_example.py b/plugins/openrouter/example/openrouter_example.py index 45fb718c3..792d58be1 100644 --- a/plugins/openrouter/example/openrouter_example.py +++ b/plugins/openrouter/example/openrouter_example.py @@ -13,19 +13,17 @@ import os from dotenv import load_dotenv - -from vision_agents.core import User, Agent, cli +from vision_agents.core import Agent, Runner, User from vision_agents.core.agents import AgentLauncher from vision_agents.core.mcp import MCPBaseServer, MCPServerRemote from vision_agents.plugins import ( - openrouter, - getstream, - elevenlabs, deepgram, + elevenlabs, + getstream, + openrouter, smart_turn, ) - logger = logging.getLogger(__name__) load_dotenv() @@ -121,4 +119,4 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() diff --git a/plugins/pocket/example/README.md b/plugins/pocket/example/README.md index cc330e087..34d41a277 100644 --- a/plugins/pocket/example/README.md +++ b/plugins/pocket/example/README.md @@ -11,6 +11,7 @@ A Vision Agents example using Pocket TTS for local text-to-speech. 
## Requirements Set the following environment variables: + - `DEEPGRAM_API_KEY` - for speech-to-text - `STREAM_API_KEY` and `STREAM_API_SECRET` - for real-time communication - `GOOGLE_API_KEY` - for Gemini LLM @@ -19,5 +20,5 @@ Set the following environment variables: ```bash cd plugins/pocket/example -uv run pocket_example.py +uv run pocket_example.py run ``` diff --git a/plugins/pocket/example/pocket_example.py b/plugins/pocket/example/pocket_example.py index 3ba8e5298..360e54f2e 100644 --- a/plugins/pocket/example/pocket_example.py +++ b/plugins/pocket/example/pocket_example.py @@ -19,11 +19,9 @@ import logging from dotenv import load_dotenv - -from vision_agents.core import User, Agent, cli +from vision_agents.core import Agent, Runner, User from vision_agents.core.agents import AgentLauncher -from vision_agents.plugins import pocket, getstream, deepgram, gemini - +from vision_agents.plugins import deepgram, gemini, getstream, pocket logger = logging.getLogger(__name__) @@ -60,4 +58,4 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() diff --git a/plugins/qwen/example/README.md b/plugins/qwen/example/README.md index d53aadedf..13d4083c0 100644 --- a/plugins/qwen/example/README.md +++ b/plugins/qwen/example/README.md @@ -31,19 +31,21 @@ DASHSCOPE_API_KEY=your_dashscope_api_key_here 2. 
Run the example: ```bash -uv run python qwen_realtime_example.py +uv run python qwen_realtime_example.py run ``` ## Code Example ```python from dotenv import load_dotenv -from vision_agents.core import Agent, User, cli + +from vision_agents.core import Agent, Runner, User from vision_agents.core.agents import AgentLauncher from vision_agents.plugins import getstream, qwen load_dotenv() + async def create_agent(**kwargs) -> Agent: llm = qwen.Realtime(fps=1) @@ -55,6 +57,7 @@ async def create_agent(**kwargs) -> Agent: ) return agent + async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None: await agent.create_user() call = await agent.create_call(call_type, call_id) @@ -63,8 +66,9 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non await agent.edge.open_demo(call) await agent.finish() + if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() ``` ## Configuration @@ -75,10 +79,10 @@ if __name__ == "__main__": ### Realtime Parameters -| Parameter | Description | Default | -|-----------|-------------|---------| -| `fps` | Video frames per second | `1` | -| `api_key` | DashScope API key | `None` (from env) | +| Parameter | Description | Default | +|-----------|-------------------------|-------------------| +| `fps` | Video frames per second | `1` | +| `api_key` | DashScope API key | `None` (from env) | ## Requirements diff --git a/plugins/qwen/example/qwen_realtime_example.py b/plugins/qwen/example/qwen_realtime_example.py index 292f4ec75..6fb5f43c2 100644 --- a/plugins/qwen/example/qwen_realtime_example.py +++ b/plugins/qwen/example/qwen_realtime_example.py @@ -4,7 +4,7 @@ # This model also does not support text input so once you join the call, simply start speaking to the agent. 
from dotenv import load_dotenv -from vision_agents.core import Agent, User, cli +from vision_agents.core import Agent, Runner, User from vision_agents.core.agents import AgentLauncher from vision_agents.plugins import getstream, qwen @@ -33,4 +33,4 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() diff --git a/plugins/roboflow/example/README.md b/plugins/roboflow/example/README.md index e28f6e3cc..cc4d52d5a 100644 --- a/plugins/roboflow/example/README.md +++ b/plugins/roboflow/example/README.md @@ -21,10 +21,11 @@ cp env.example .env ## Running the Example ```bash -uv run roboflow_example.py +uv run roboflow_example.py run ``` The agent will: + 1. Connect to GetStream 2. Join a video call with object detection enabled 3. Process video frames at 5 FPS using RF-DETR diff --git a/plugins/roboflow/example/roboflow_example.py b/plugins/roboflow/example/roboflow_example.py index 355fb076e..848c2f024 100644 --- a/plugins/roboflow/example/roboflow_example.py +++ b/plugins/roboflow/example/roboflow_example.py @@ -14,8 +14,7 @@ """ from dotenv import load_dotenv - -from vision_agents.core import Agent, User, cli +from vision_agents.core import Agent, Runner, User from vision_agents.core.agents import AgentLauncher from vision_agents.plugins import getstream, openai, roboflow @@ -57,4 +56,4 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() diff --git a/plugins/sample_plugin/example/my_example.py b/plugins/sample_plugin/example/my_example.py index f01585468..aa32f29bb 100644 --- a/plugins/sample_plugin/example/my_example.py +++ 
b/plugins/sample_plugin/example/my_example.py @@ -9,11 +9,9 @@ import logging from dotenv import load_dotenv - -from vision_agents.core import User, Agent, cli +from vision_agents.core import Agent, Runner, User from vision_agents.core.agents import AgentLauncher -from vision_agents.plugins import aws, getstream, cartesia, deepgram, smart_turn - +from vision_agents.plugins import aws, cartesia, deepgram, getstream, smart_turn logger = logging.getLogger(__name__) @@ -57,4 +55,4 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() diff --git a/plugins/smart_turn/example/smart_turn_example.py b/plugins/smart_turn/example/smart_turn_example.py index 4d17341ca..eef64c21a 100644 --- a/plugins/smart_turn/example/smart_turn_example.py +++ b/plugins/smart_turn/example/smart_turn_example.py @@ -1,10 +1,9 @@ import logging from dotenv import load_dotenv - -from vision_agents.core import User, Agent, cli +from vision_agents.core import Agent, Runner, User from vision_agents.core.agents import AgentLauncher -from vision_agents.plugins import getstream, gemini, elevenlabs, smart_turn +from vision_agents.plugins import elevenlabs, gemini, getstream, smart_turn logger = logging.getLogger(__name__) @@ -36,4 +35,4 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() diff --git a/plugins/vogent/example/vogent_example.py b/plugins/vogent/example/vogent_example.py index 67124e3c4..a70919610 100644 --- a/plugins/vogent/example/vogent_example.py +++ b/plugins/vogent/example/vogent_example.py @@ -1,10 +1,9 @@ import logging from dotenv import load_dotenv - -from vision_agents.core 
import User, Agent, cli +from vision_agents.core import Agent, Runner, User from vision_agents.core.agents import AgentLauncher -from vision_agents.plugins import vogent, getstream, gemini, elevenlabs +from vision_agents.plugins import elevenlabs, gemini, getstream, vogent logger = logging.getLogger(__name__) @@ -36,4 +35,4 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() diff --git a/plugins/wizper/example/README.md b/plugins/wizper/example/README.md index 71edba8f5..14dcd4213 100644 --- a/plugins/wizper/example/README.md +++ b/plugins/wizper/example/README.md @@ -15,12 +15,14 @@ Wizper is a speech-to-text service provided by [FAL.ai](https://fal.ai) with bui ## Setup 1. Install dependencies: + ```bash cd plugins/wizper/example uv sync ``` 2. Create a `.env` file with your API keys: + ```bash # Required for Wizper STT FAL_KEY=your_fal_api_key @@ -36,10 +38,11 @@ OPENAI_API_KEY=your_openai_api_key ## Running the Example ```bash -uv run wizper_example.py +uv run wizper_example.py run ``` The agent will: + 1. Connect to the GetStream edge network 2. Initialize Wizper STT 3. 
Join a call and transcribe (and optionally translate) speech @@ -90,10 +93,12 @@ Handler logs/processes transcript ## Troubleshooting ### No transcriptions + - Verify your `FAL_KEY` is valid - Check that GetStream connection is established - Ensure audio is being captured properly ### Translation issues + - Use valid ISO-639-1 language codes (e.g., "fr", "es", "de") - Check FAL.ai supported languages diff --git a/plugins/wizper/example/wizper_example.py b/plugins/wizper/example/wizper_example.py index 8bc8686ce..9f22f61eb 100644 --- a/plugins/wizper/example/wizper_example.py +++ b/plugins/wizper/example/wizper_example.py @@ -19,12 +19,10 @@ import logging from dotenv import load_dotenv - -from vision_agents.core import User, Agent, cli +from vision_agents.core import Agent, Runner, User from vision_agents.core.agents import AgentLauncher -from vision_agents.core.stt.events import STTTranscriptEvent, STTErrorEvent -from vision_agents.plugins import wizper, getstream, openai - +from vision_agents.core.stt.events import STTErrorEvent, STTTranscriptEvent +from vision_agents.plugins import getstream, openai, wizper logger = logging.getLogger(__name__) @@ -74,4 +72,4 @@ async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> Non if __name__ == "__main__": - cli(AgentLauncher(create_agent=create_agent, join_call=join_call)) + Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli() From d096d989525c4b6a98ad21843ff1bd15c39ec33f Mon Sep 17 00:00:00 2001 From: Daniil Gusev Date: Tue, 20 Jan 2026 12:34:07 +0100 Subject: [PATCH 2/6] Fix paths in READMEs --- .../04_football_commentator_example/RUNNING_THE_EXAMPLE.md | 2 +- examples/06_prometheus_metrics_example/README.md | 4 ++-- .../prometheus_metrics_example.py | 2 +- 3 files changed, 4 insertions(+), 4 deletions(-) diff --git a/examples/04_football_commentator_example/RUNNING_THE_EXAMPLE.md b/examples/04_football_commentator_example/RUNNING_THE_EXAMPLE.md index 6b01fc92e..7938e1bca 
100644 --- a/examples/04_football_commentator_example/RUNNING_THE_EXAMPLE.md +++ b/examples/04_football_commentator_example/RUNNING_THE_EXAMPLE.md @@ -11,7 +11,7 @@ Realtime and Roboflow object detection. 2. **Configure environment:** ```bash - cd examples/03_football_commentator_example + cd examples/04_football_commentator_example cp env.example .env # Edit .env with your actual API keys ``` diff --git a/examples/06_prometheus_metrics_example/README.md b/examples/06_prometheus_metrics_example/README.md index 70505b56b..156b67b65 100644 --- a/examples/06_prometheus_metrics_example/README.md +++ b/examples/06_prometheus_metrics_example/README.md @@ -13,7 +13,7 @@ This example demonstrates how to: ## Running the Example ```bash -cd examples/03_prometheus_metrics_example +cd examples/06_prometheus_metrics_example uv sync uv run python prometheus_metrics_example.py run --call-type default --call-id test-metrics ``` @@ -67,7 +67,7 @@ Add this to your `prometheus.yml`: scrape_configs: - job_name: 'stream-agents' static_configs: - - targets: ['localhost:9464'] + - targets: [ 'localhost:9464' ] scrape_interval: 15s ``` diff --git a/examples/06_prometheus_metrics_example/prometheus_metrics_example.py b/examples/06_prometheus_metrics_example/prometheus_metrics_example.py index 0c85966e4..bf35bce97 100644 --- a/examples/06_prometheus_metrics_example/prometheus_metrics_example.py +++ b/examples/06_prometheus_metrics_example/prometheus_metrics_example.py @@ -12,7 +12,7 @@ 4. Metrics are available at http://localhost:9464/metrics Run with: - cd examples/03_prometheus_metrics_example + cd examples/06_prometheus_metrics_example uv run python prometheus_metrics_example.py --call-type default --call-id test-metrics Then open http://localhost:9464/metrics to see real-time metrics as you talk to the agent. 
From cd701eb4c5df0d6b57fd8c89c07945f5988de360 Mon Sep 17 00:00:00 2001 From: Daniil Gusev Date: Tue, 20 Jan 2026 12:37:13 +0100 Subject: [PATCH 3/6] Reformat ai-tts.md --- docs/ai/instructions/ai-tts.md | 17 ++++++++++------- 1 file changed, 10 insertions(+), 7 deletions(-) diff --git a/docs/ai/instructions/ai-tts.md b/docs/ai/instructions/ai-tts.md index bdc4e53d6..1d3bdbbef 100644 --- a/docs/ai/instructions/ai-tts.md +++ b/docs/ai/instructions/ai-tts.md @@ -5,10 +5,10 @@ Build a TTS plugin that streams audio and emits events. Keep it minimal and foll ## What to create - Make sure to follow PEP 420: Do NOT add `__init__.py` in plugin folders. Use this layout: - - `plugins//pyproject.toml` (depends on `vision-agents`) - - `plugins//vision_agents/plugins//tts.py` - - `plugins//tests/test_tts.py` (pytest tests at plugin root) - - `plugins//example/` (optional, see `plugins/fish/example/fish_tts_example.py`) + - `plugins//pyproject.toml` (depends on `vision-agents`) + - `plugins//vision_agents/plugins//tts.py` + - `plugins//tests/test_tts.py` (pytest tests at plugin root) + - `plugins//example/` (optional, see `plugins/deepgram/example/deepgram_tts_example.py`) ## Implementation essentials @@ -31,18 +31,21 @@ Build a TTS plugin that streams audio and emits events. Keep it minimal and foll The plugin constructor should: 1. Rely on env vars to fetch credentials -2. export kwargs that allow developers to pass important params to the model itself (eg. model name, voice ID, API URL, ...) +2. export kwargs that allow developers to pass important params to the model itself (eg. model name, voice ID, API + URL, ...) 3. if applicable the model or client instance 4. have defaults for all params when possible so that ENV var is enough ## Testing and examples - Look at `plugins/fish/tests/test_fish_tts.py` as a reference of what tests for a TTS plugins should look like -- Add pytest tests at `plugins//tests/test_tts.py`. 
Keep them simple: assert that `stream_audio` yields `PcmData` and that `send()` emits `TTSAudioEvent`. +- Add pytest tests at `plugins//tests/test_tts.py`. Keep them simple: assert that `stream_audio` yields + `PcmData` and that `send()` emits `TTSAudioEvent`. - Do not write spec tests with mocks, this is usually not necessary - Make sure to write at least a couple of integration tests, use `TTSSession` to avoid boiler-plate code in testing - Include a minimal example in `plugins//example/` (see `fish_tts_example.py`). ## PCM / Audio management -Use `PcmData` and other utils available from the `getstream.video.rtc.track_util` module. Do not write code that directly manipulates PCM, use the audio utilities instead. +Use `PcmData` and other utils available from the `getstream.video.rtc.track_util` module. Do not write code that +directly manipulates PCM, use the audio utilities instead. From 8d9c9de6bd29a1a4f7f3164235beede70718f30c Mon Sep 17 00:00:00 2001 From: Daniil Gusev Date: Tue, 20 Jan 2026 12:49:54 +0100 Subject: [PATCH 4/6] Fix README.md --- README.md | 9 ++++----- 1 file changed, 4 insertions(+), 5 deletions(-) diff --git a/README.md b/README.md index bb9566243..6b07510b8 100644 --- a/README.md +++ b/README.md @@ -182,11 +182,10 @@ They take care of the hard stuff, like: Check out our getting started guide at [VisionAgents.ai](https://visionagents.ai/). 
-**Quickstart:** [Building a Voice AI app](https://visionagents.ai/introduction/voice-agents) -**Quickstart:** [Building a Video AI app](https://visionagents.ai/introduction/video-agents) -**Tutorial: -** [Building real-time sports coaching](https://github.com/GetStream/Vision-Agents/tree/main/examples/02_golf_coach_example) -**Tutorial:** [Building a real-time meeting assistant](https://github.com/GetStream/Vision-Agents#) +- **Quickstart:** [Building a Voice AI app](https://visionagents.ai/introduction/voice-agents) +- **Quickstart:** [Building a Video AI app](https://visionagents.ai/introduction/video-agents) +- **Tutorial:** [Building a real-time meeting assistant](https://github.com/GetStream/Vision-Agents/tree/main/examples/01_simple_agent_example) +- **Tutorial:** [Building real-time sports coaching](https://github.com/GetStream/Vision-Agents/tree/main/examples/02_golf_coach_example) ## Examples From a56b70a2948692afa056f5013a0462890a4a9e66 Mon Sep 17 00:00:00 2001 From: Daniil Gusev Date: Tue, 20 Jan 2026 12:50:48 +0100 Subject: [PATCH 5/6] Fix prometheus readme --- .../06_prometheus_metrics_example/prometheus_metrics_example.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/06_prometheus_metrics_example/prometheus_metrics_example.py b/examples/06_prometheus_metrics_example/prometheus_metrics_example.py index bf35bce97..7ebdf7936 100644 --- a/examples/06_prometheus_metrics_example/prometheus_metrics_example.py +++ b/examples/06_prometheus_metrics_example/prometheus_metrics_example.py @@ -13,7 +13,7 @@ Run with: cd examples/06_prometheus_metrics_example - uv run python prometheus_metrics_example.py --call-type default --call-id test-metrics + uv run python prometheus_metrics_example.py run --call-type default --call-id test-metrics Then open http://localhost:9464/metrics to see real-time metrics as you talk to the agent. 
""" From d3d9207867c27e1bcefa282ed4eb3c0e2fe1b01f Mon Sep 17 00:00:00 2001 From: Daniil Gusev Date: Tue, 20 Jan 2026 12:56:01 +0100 Subject: [PATCH 6/6] Use deepgram example instead of fish in ai-tts.md --- docs/ai/instructions/ai-tts.md | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/docs/ai/instructions/ai-tts.md b/docs/ai/instructions/ai-tts.md index 1d3bdbbef..8a220ffae 100644 --- a/docs/ai/instructions/ai-tts.md +++ b/docs/ai/instructions/ai-tts.md @@ -38,12 +38,12 @@ The plugin constructor should: ## Testing and examples -- Look at `plugins/fish/tests/test_fish_tts.py` as a reference of what tests for a TTS plugins should look like +- Look at `plugins/deepgram/tests/test_deepgram_tts.py` as a reference of what tests for a TTS plugin should look like - Add pytest tests at `plugins//tests/test_tts.py`. Keep them simple: assert that `stream_audio` yields `PcmData` and that `send()` emits `TTSAudioEvent`. - Do not write spec tests with mocks, this is usually not necessary -- Make sure to write at least a couple of integration tests, use `TTSSession` to avoid boiler-plate code in testing +- Make sure to write at least a couple of integration tests, use `TTSSession` to avoid boilerplate code in testing -- Include a minimal example in `plugins//example/` (see `fish_tts_example.py`). +- Include a minimal example in `plugins//example/` (see `deepgram_tts_example.py`). ## PCM / Audio management