Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
42 changes: 36 additions & 6 deletions plugins/aws/README.md
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
# AWS Plugin for Vision Agents

AWS (Bedrock) integration for Vision Agents framework with support for standard LLM, realtime with Nova Sonic, and text-to-speech with automatic session resumption.
AWS integration for Vision Agents framework with support for standard LLM (Bedrock), realtime with Nova Sonic, text-to-speech (Polly), and streaming speech-to-text (Transcribe).

## Installation

Expand Down Expand Up @@ -80,17 +80,18 @@ See `example/aws_realtime_nova_example.py` for a complete example.

### Text-to-Speech (TTS)

AWS Polly TTS is available for converting text to speech:
AWS Polly synthesises speech from text and streams the resulting audio. Supports both standard and neural engines, plain-text or SSML input, and Polly lexicons for pronunciation overrides.

```python
from vision_agents.plugins import aws

tts = aws.TTS(
region_name="us-east-1",
voice_id="Joanna", # AWS Polly voice ID
engine="neural", # 'standard' or 'neural'
text_type="text", # 'text' or 'ssml'
language_code="en-US"
voice_id="Joanna", # any Polly voice ID
engine="neural", # "standard" | "neural"
text_type="text", # "text" | "ssml"
language_code="en-US",
lexicon_names=None, # optional list of Polly lexicons
)

# Use in agent
Expand All @@ -101,6 +102,35 @@ agent = Agent(
)
```

Credentials follow the standard boto3 chain (env vars, `~/.aws/credentials`, SSO, instance profile, etc.). Pass `aws_access_key_id` + `aws_secret_access_key` (both required together, plus `aws_session_token` for temporary credentials from STS / SSO / assumed roles) or `aws_profile` to override. You may also inject a pre-built boto3 Polly client via `client=...`. `region_name` falls back to `AWS_REGION` / `AWS_DEFAULT_REGION` and finally `us-east-1`.

### Speech-to-Text (STT)

AWS Transcribe streaming STT converts audio to text in realtime. The connection auto-reconnects with exponential backoff on idle timeouts, audio-length limits, and transient errors.

```python
from vision_agents.plugins import aws

stt = aws.STT(
language_code="en-US",
region_name="us-east-1",
show_speaker_label=False,
enable_partial_results_stabilization=False,
partial_results_stability=None, # "high" | "medium" | "low"
)

# Use in agent
agent = Agent(
llm=aws.LLM(model="qwen.qwen3-32b-v1:0"),
stt=stt,
# ... other components
)
```

Credentials follow the standard boto3 chain (env vars, `~/.aws/credentials`, SSO, instance profile, etc.). Pass `aws_access_key_id` + `aws_secret_access_key` (both required together, plus `aws_session_token` for temporary credentials from STS / SSO / assumed roles) or `aws_profile` to override.

See `example/aws_pipeline_example.py` for a complete STT - LLM - TTS pipeline using only AWS components.

## Function Calling

### Standard LLM (aws.LLM)
Expand Down
48 changes: 48 additions & 0 deletions plugins/aws/example/aws_pipeline_example.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,48 @@
"""
AWS STT - LLM - TTS Pipeline Example

Voice agent built entirely from AWS components:
- STT: AWS Transcribe streaming
- LLM: AWS Bedrock (Qwen)
- TTS: AWS Polly
"""

import asyncio
import logging

from dotenv import load_dotenv
from vision_agents.core import Agent, Runner, User
from vision_agents.core.agents import AgentLauncher
from vision_agents.plugins import aws, getstream

logger = logging.getLogger(__name__)

load_dotenv()


async def create_agent(**kwargs) -> Agent:
agent = Agent(
edge=getstream.Edge(),
agent_user=User(name="AWS Voice Agent", id="agent"),
instructions="You are a voice agent. Keep replies short and "
"conversational. Do not use special characters or formatting.",
llm=aws.LLM(model="qwen.qwen3-32b-v1:0", region_name="us-east-1"),
stt=aws.STT(language_code="en-US", region_name="us-east-1"),
tts=aws.TTS(region_name="us-east-1", voice_id="Joanna", engine="neural"),
)

return agent


async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
call = await agent.create_call(call_type, call_id)

async with agent.join(call):
await asyncio.sleep(5)
await agent.simple_response(text="Ask the user about their day.")

await agent.finish()


if __name__ == "__main__":
Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli()
5 changes: 3 additions & 2 deletions plugins/aws/pyproject.toml
Original file line number Diff line number Diff line change
Expand Up @@ -5,16 +5,17 @@ build-backend = "hatchling.build"
[project]
name = "vision-agents-plugins-aws"
dynamic = ["version"]
description = "AWS (Bedrock) LLM integration for Vision Agents"
description = "AWS (Bedrock LLM, Transcribe STT, Polly TTS) integration for Vision Agents"
readme = "README.md"
keywords = ["aws", "bedrock", "LLM", "AI", "voice agents", "agents"]
keywords = ["aws", "bedrock", "transcribe", "polly", "STT", "TTS", "LLM", "AI", "voice agents", "agents"]
requires-python = ">=3.12"
license = "MIT"
dependencies = [
"vision-agents",
"onnxruntime>=1.16.1,<2",
"boto3>=1.42.65,<2",
"aws-sdk-bedrock-runtime>=0.4.0,<1",
"aws-sdk-transcribe-streaming>=0.5.0,<1",
]

[project.urls]
Expand Down
170 changes: 170 additions & 0 deletions plugins/aws/tests/test_aws_stt.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,170 @@
import asyncio

import pytest
from aws_sdk_transcribe_streaming.models import (
Alternative,
Result,
Transcript,
TranscriptEvent,
)
from dotenv import load_dotenv
from vision_agents.core.turn_detection import TurnEndedEvent, TurnStartedEvent
from vision_agents.plugins import aws

from conftest import STTSession

load_dotenv()


class TestTranscribeSTT:
@pytest.fixture
def transcript_event_factory(self):
def factory(
text: str, *, is_partial: bool, start_time: float
) -> TranscriptEvent:
return TranscriptEvent(
transcript=Transcript(
results=[
Result(
result_id="r",
start_time=start_time,
end_time=start_time + 1.0,
is_partial=is_partial,
alternatives=[Alternative(transcript=text, items=[])],
)
]
)
)

return factory

async def test_partial_result_emits_partial_transcript_and_turn_started(
self, participant, transcript_event_factory
):
stt = aws.STT(language_code="en-US")
stt._current_participant = participant
session = STTSession(stt)
turn_started: list[TurnStartedEvent] = []

@stt.events.subscribe
async def on_turn_started(event: TurnStartedEvent):
turn_started.append(event)

stt._handle_transcript_event(
transcript_event_factory("hello", is_partial=True, start_time=0.0)
)
await asyncio.sleep(0.05)

assert [e.text for e in session.partial_transcripts] == ["hello"]
assert not session.transcripts
assert len(turn_started) == 1
assert turn_started[0].participant == participant

async def test_final_result_emits_transcript_and_turn_ended(
self, participant, transcript_event_factory
):
stt = aws.STT(language_code="en-US")
stt._current_participant = participant
session = STTSession(stt)
turn_ended: list[TurnEndedEvent] = []

@stt.events.subscribe
async def on_turn_ended(event: TurnEndedEvent):
turn_ended.append(event)

stt._handle_transcript_event(
transcript_event_factory("hello world", is_partial=False, start_time=0.0)
)
await asyncio.sleep(0.05)

assert [e.text for e in session.transcripts] == ["hello world"]
assert len(turn_ended) == 1
assert turn_ended[0].participant == participant

def test_partial_static_credentials_rejected(self):
with pytest.raises(ValueError, match="provided together"):
aws.STT(aws_access_key_id="AKIA...")
with pytest.raises(ValueError, match="provided together"):
aws.STT(aws_secret_access_key="secret")

async def test_clear_drops_results_before_watermark(
self, participant, transcript_event_factory
):
stt = aws.STT(language_code="en-US")
stt._audio_sent_seconds = 5.0
await stt.clear()

stt._current_participant = participant
session = STTSession(stt)

stt._handle_transcript_event(
transcript_event_factory("stale", is_partial=False, start_time=2.0)
)
stt._handle_transcript_event(
transcript_event_factory("fresh", is_partial=False, start_time=6.0)
)
await asyncio.sleep(0.05)

assert [e.text for e in session.transcripts] == ["fresh"]


@pytest.mark.integration
class TestTranscribeSTTIntegration:
@pytest.fixture
async def stt(self):
stt = aws.STT(language_code="en-US")
try:
await stt.start()
yield stt
finally:
await stt.close()

async def test_transcribe_mia_audio_16khz(
self, stt, mia_audio_16khz_chunked, participant
):
session = STTSession(stt)

for chunk in mia_audio_16khz_chunked:
await stt.process_audio(chunk, participant=participant)

await session.wait_for_result(timeout=30.0)
assert not session.errors, f"Errors occurred: {session.errors}"

full_transcript = session.get_full_transcript().lower()
assert any(
word in full_transcript for word in ["village", "quiet", "mia", "treasures"]
), f"Transcript did not match expected content: {full_transcript!r}"

async def test_partial_transcripts_emitted(
self, stt, mia_audio_16khz_chunked, participant
):
session = STTSession(stt)

for chunk in mia_audio_16khz_chunked:
await stt.process_audio(chunk, participant=participant)

await session.wait_for_result(timeout=30.0)
assert session.partial_transcripts, "No partial transcripts received"

async def test_turn_events_emitted(self, stt, mia_audio_16khz_chunked, participant):
session = STTSession(stt)
turn_started: list[TurnStartedEvent] = []
turn_ended: list[TurnEndedEvent] = []

@stt.events.subscribe
async def on_turn_started(event: TurnStartedEvent):
turn_started.append(event)

@stt.events.subscribe
async def on_turn_ended(event: TurnEndedEvent):
turn_ended.append(event)

for chunk in mia_audio_16khz_chunked:
await stt.process_audio(chunk, participant=participant)

await session.wait_for_result(timeout=30.0)

assert turn_started, "No TurnStartedEvent received"
assert turn_ended, "No TurnEndedEvent received"
assert turn_started[0].participant == participant
assert turn_ended[0].participant == participant
8 changes: 8 additions & 0 deletions plugins/aws/tests/test_tts.py
Original file line number Diff line number Diff line change
Expand Up @@ -30,6 +30,14 @@ async def tts(self) -> aws.TTS: # type: ignore[name-defined]
return aws.TTS(voice_id=os.environ.get("AWS_POLLY_VOICE", "Joanna"))


class TestAWSPollyTTS:
def test_partial_static_credentials_rejected(self):
with pytest.raises(ValueError, match="provided together"):
aws.TTS(aws_access_key_id="AKIA...")
with pytest.raises(ValueError, match="provided together"):
aws.TTS(aws_secret_access_key="secret")


@pytest.mark.skip()
@pytest.mark.integration
class TestAWSPollyTTSIntegration:
Expand Down
3 changes: 2 additions & 1 deletion plugins/aws/vision_agents/plugins/aws/__init__.py
Original file line number Diff line number Diff line change
@@ -1,5 +1,6 @@
from .aws_llm import BedrockLLM as LLM
from .aws_realtime import Realtime
from .stt import TranscribeSTT as STT
from .tts import TTS

__all__ = ["LLM", "Realtime", "TTS"]
__all__ = ["LLM", "Realtime", "STT", "TTS"]
43 changes: 43 additions & 0 deletions plugins/aws/vision_agents/plugins/aws/_credentials.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,43 @@
import asyncio
from typing import Any, Optional

import boto3
from smithy_aws_core.identity.components import (
AWSCredentialsIdentity,
AWSIdentityProperties,
)
from smithy_core.aio.interfaces.identity import IdentityResolver


class Boto3CredentialsResolver(
IdentityResolver[AWSCredentialsIdentity, AWSIdentityProperties]
):
"""IdentityResolver that delegates to boto3.Session for credential resolution.

Supports the full boto3 credential chain: env vars, shared credentials files,
AWS profiles, SSO, EC2 instance profiles, etc.
"""

def __init__(self, profile_name: Optional[str] = None) -> None:
self._session = boto3.Session(profile_name=profile_name)

async def get_identity(
self, *, properties: AWSIdentityProperties, **kwargs: Any
) -> AWSCredentialsIdentity:
# Both calls can block: get_credentials() walks the provider chain
# (file I/O, IMDS, SSO, STS) on first access, and get_frozen_credentials()
# triggers refresh on RefreshableCredentials.
credentials = await asyncio.to_thread(self._session.get_credentials)
if not credentials:
raise ValueError("Unable to load AWS credentials via boto3")

creds = await asyncio.to_thread(credentials.get_frozen_credentials)
if not creds.access_key or not creds.secret_key:
raise ValueError("AWS credentials are incomplete")

return AWSCredentialsIdentity(
access_key_id=creds.access_key,
secret_access_key=creds.secret_key,
session_token=creds.token or None,
expiration=None,
)
Loading
Loading