GetStream · dangusev · May 5, 2026 · Apr 30, 2026 · Apr 30, 2026 · Apr 30, 2026
diff --git a/plugins/aws/README.md b/plugins/aws/README.md
@@ -1,6 +1,6 @@
 # AWS Plugin for Vision Agents
 
-AWS (Bedrock) integration for Vision Agents framework with support for standard LLM, realtime with Nova Sonic, and text-to-speech with automatic session resumption.
+AWS integration for Vision Agents framework with support for standard LLM (Bedrock), realtime with Nova Sonic, text-to-speech (Polly), and streaming speech-to-text (Transcribe).
 
 ## Installation
 
@@ -80,17 +80,18 @@ See `example/aws_realtime_nova_example.py` for a complete example.
 
 ### Text-to-Speech (TTS)
 
-AWS Polly TTS is available for converting text to speech:
+AWS Polly synthesises speech from text and streams the resulting audio. Supports both standard and neural engines, plain-text or SSML input, and Polly lexicons for pronunciation overrides.
 
 ```python
 from vision_agents.plugins import aws
 
 tts = aws.TTS(
     region_name="us-east-1",
-    voice_id="Joanna",  # AWS Polly voice ID
-    engine="neural",  # 'standard' or 'neural'
-    text_type="text",  # 'text' or 'ssml'
-    language_code="en-US"
+    voice_id="Joanna",       # any Polly voice ID
+    engine="neural",         # "standard" | "neural"
+    text_type="text",        # "text" | "ssml"
+    language_code="en-US",
+    lexicon_names=None,      # optional list of Polly lexicons
 )
 
 # Use in agent
@@ -101,6 +102,35 @@ agent = Agent(
 )
 ```
 
+Credentials follow the standard boto3 chain (env vars, `~/.aws/credentials`, SSO, instance profile, etc.). Pass `aws_access_key_id` + `aws_secret_access_key` (both required together, plus `aws_session_token` for temporary credentials from STS / SSO / assumed roles) or `aws_profile` to override. You may also inject a pre-built boto3 Polly client via `client=...`. `region_name` falls back to `AWS_REGION` / `AWS_DEFAULT_REGION` and finally `us-east-1`.
+
+### Speech-to-Text (STT)
+
+AWS Transcribe streaming STT converts audio to text in realtime. The connection auto-reconnects with exponential backoff on idle timeouts, audio-length limits, and transient errors.
+
+```python
+from vision_agents.plugins import aws
+
+stt = aws.STT(
+    language_code="en-US",
+    region_name="us-east-1",
+    show_speaker_label=False,
+    enable_partial_results_stabilization=False,
+    partial_results_stability=None,  # "high" | "medium" | "low"
+)
+
+# Use in agent
+agent = Agent(
+    llm=aws.LLM(model="qwen.qwen3-32b-v1:0"),
+    stt=stt,
+    # ... other components
+)
+```
+
+Credentials follow the standard boto3 chain (env vars, `~/.aws/credentials`, SSO, instance profile, etc.). Pass `aws_access_key_id` + `aws_secret_access_key` (both required together, plus `aws_session_token` for temporary credentials from STS / SSO / assumed roles) or `aws_profile` to override.
+
+See `example/aws_pipeline_example.py` for a complete STT - LLM - TTS pipeline using only AWS components.
+
 ## Function Calling
 
 ### Standard LLM (aws.LLM)

diff --git a/plugins/aws/example/aws_pipeline_example.py b/plugins/aws/example/aws_pipeline_example.py
@@ -0,0 +1,48 @@
+"""
+AWS STT - LLM - TTS Pipeline Example
+
+Voice agent built entirely from AWS components:
+- STT: AWS Transcribe streaming
+- LLM: AWS Bedrock (Qwen)
+- TTS: AWS Polly
+"""
+
+import asyncio
+import logging
+
+from dotenv import load_dotenv
+from vision_agents.core import Agent, Runner, User
+from vision_agents.core.agents import AgentLauncher
+from vision_agents.plugins import aws, getstream
+
+logger = logging.getLogger(__name__)
+
+load_dotenv()
+
+
+async def create_agent(**kwargs) -> Agent:
+    agent = Agent(
+        edge=getstream.Edge(),
+        agent_user=User(name="AWS Voice Agent", id="agent"),
+        instructions="You are a voice agent. Keep replies short and "
+        "conversational. Do not use special characters or formatting.",
+        llm=aws.LLM(model="qwen.qwen3-32b-v1:0", region_name="us-east-1"),
+        stt=aws.STT(language_code="en-US", region_name="us-east-1"),
+        tts=aws.TTS(region_name="us-east-1", voice_id="Joanna", engine="neural"),
+    )
+
+    return agent
+
+
+async def join_call(agent: Agent, call_type: str, call_id: str, **kwargs) -> None:
+    call = await agent.create_call(call_type, call_id)
+
+    async with agent.join(call):
+        await asyncio.sleep(5)
+        await agent.simple_response(text="Ask the user about their day.")
+
+        await agent.finish()
+
+
+if __name__ == "__main__":
+    Runner(AgentLauncher(create_agent=create_agent, join_call=join_call)).cli()
diff --git a/plugins/aws/pyproject.toml b/plugins/aws/pyproject.toml
@@ -5,16 +5,17 @@ build-backend = "hatchling.build"
 [project]
 name = "vision-agents-plugins-aws"
 dynamic = ["version"]
-description = "AWS (Bedrock) LLM integration for Vision Agents"
+description = "AWS (Bedrock LLM, Transcribe STT, Polly TTS) integration for Vision Agents"
 readme = "README.md"
-keywords = ["aws", "bedrock", "LLM", "AI", "voice agents", "agents"]
+keywords = ["aws", "bedrock", "transcribe", "polly", "STT", "TTS", "LLM", "AI", "voice agents", "agents"]
 requires-python = ">=3.12"
 license = "MIT"
 dependencies = [
     "vision-agents",
     "onnxruntime>=1.16.1,<2",
     "boto3>=1.42.65,<2",
     "aws-sdk-bedrock-runtime>=0.4.0,<1",
+    "aws-sdk-transcribe-streaming>=0.5.0,<1",
 ]
 
 [project.urls]

diff --git a/plugins/aws/tests/test_aws_stt.py b/plugins/aws/tests/test_aws_stt.py
@@ -0,0 +1,170 @@
+import asyncio
+
+import pytest
+from aws_sdk_transcribe_streaming.models import (
+    Alternative,
+    Result,
+    Transcript,
+    TranscriptEvent,
+)
+from dotenv import load_dotenv
+from vision_agents.core.turn_detection import TurnEndedEvent, TurnStartedEvent
+from vision_agents.plugins import aws
+
+from conftest import STTSession
+
+load_dotenv()
+
+
+class TestTranscribeSTT:
+    @pytest.fixture
+    def transcript_event_factory(self):
+        def factory(
+            text: str, *, is_partial: bool, start_time: float
+        ) -> TranscriptEvent:
+            return TranscriptEvent(
+                transcript=Transcript(
+                    results=[
+                        Result(
+                            result_id="r",
+                            start_time=start_time,
+                            end_time=start_time + 1.0,
+                            is_partial=is_partial,
+                            alternatives=[Alternative(transcript=text, items=[])],
+                        )
+                    ]
+                )
+            )
+
+        return factory
+
+    async def test_partial_result_emits_partial_transcript_and_turn_started(
+        self, participant, transcript_event_factory
+    ):
+        stt = aws.STT(language_code="en-US")
+        stt._current_participant = participant
+        session = STTSession(stt)
+        turn_started: list[TurnStartedEvent] = []
+
+        @stt.events.subscribe
+        async def on_turn_started(event: TurnStartedEvent):
+            turn_started.append(event)
+
+        stt._handle_transcript_event(
+            transcript_event_factory("hello", is_partial=True, start_time=0.0)
+        )
+        await asyncio.sleep(0.05)
+
+        assert [e.text for e in session.partial_transcripts] == ["hello"]
+        assert not session.transcripts
+        assert len(turn_started) == 1
+        assert turn_started[0].participant == participant
+
+    async def test_final_result_emits_transcript_and_turn_ended(
+        self, participant, transcript_event_factory
+    ):
+        stt = aws.STT(language_code="en-US")
+        stt._current_participant = participant
+        session = STTSession(stt)
+        turn_ended: list[TurnEndedEvent] = []
+
+        @stt.events.subscribe
+        async def on_turn_ended(event: TurnEndedEvent):
+            turn_ended.append(event)
+
+        stt._handle_transcript_event(
+            transcript_event_factory("hello world", is_partial=False, start_time=0.0)
+        )
+        await asyncio.sleep(0.05)
+
+        assert [e.text for e in session.transcripts] == ["hello world"]
+        assert len(turn_ended) == 1
+        assert turn_ended[0].participant == participant
+
+    def test_partial_static_credentials_rejected(self):
+        with pytest.raises(ValueError, match="provided together"):
+            aws.STT(aws_access_key_id="AKIA...")
+        with pytest.raises(ValueError, match="provided together"):
+            aws.STT(aws_secret_access_key="secret")
+
+    async def test_clear_drops_results_before_watermark(
+        self, participant, transcript_event_factory
+    ):
+        stt = aws.STT(language_code="en-US")
+        stt._audio_sent_seconds = 5.0
+        await stt.clear()
+
+        stt._current_participant = participant
+        session = STTSession(stt)
+
+        stt._handle_transcript_event(
+            transcript_event_factory("stale", is_partial=False, start_time=2.0)
+        )
+        stt._handle_transcript_event(
+            transcript_event_factory("fresh", is_partial=False, start_time=6.0)
+        )
+        await asyncio.sleep(0.05)
+
+        assert [e.text for e in session.transcripts] == ["fresh"]
+
+
+@pytest.mark.integration
+class TestTranscribeSTTIntegration:
+    @pytest.fixture
+    async def stt(self):
+        stt = aws.STT(language_code="en-US")
+        try:
+            await stt.start()
+            yield stt
+        finally:
+            await stt.close()
+
+    async def test_transcribe_mia_audio_16khz(
+        self, stt, mia_audio_16khz_chunked, participant
+    ):
+        session = STTSession(stt)
+
+        for chunk in mia_audio_16khz_chunked:
+            await stt.process_audio(chunk, participant=participant)
+
+        await session.wait_for_result(timeout=30.0)
+        assert not session.errors, f"Errors occurred: {session.errors}"
+
+        full_transcript = session.get_full_transcript().lower()
+        assert any(
+            word in full_transcript for word in ["village", "quiet", "mia", "treasures"]
+        ), f"Transcript did not match expected content: {full_transcript!r}"
+
+    async def test_partial_transcripts_emitted(
+        self, stt, mia_audio_16khz_chunked, participant
+    ):
+        session = STTSession(stt)
+
+        for chunk in mia_audio_16khz_chunked:
+            await stt.process_audio(chunk, participant=participant)
+
+        await session.wait_for_result(timeout=30.0)
+        assert session.partial_transcripts, "No partial transcripts received"
+
+    async def test_turn_events_emitted(self, stt, mia_audio_16khz_chunked, participant):
+        session = STTSession(stt)
+        turn_started: list[TurnStartedEvent] = []
+        turn_ended: list[TurnEndedEvent] = []
+
+        @stt.events.subscribe
+        async def on_turn_started(event: TurnStartedEvent):
+            turn_started.append(event)
+
+        @stt.events.subscribe
+        async def on_turn_ended(event: TurnEndedEvent):
+            turn_ended.append(event)
+
+        for chunk in mia_audio_16khz_chunked:
+            await stt.process_audio(chunk, participant=participant)
+
+        await session.wait_for_result(timeout=30.0)
+
+        assert turn_started, "No TurnStartedEvent received"
+        assert turn_ended, "No TurnEndedEvent received"
+        assert turn_started[0].participant == participant
+        assert turn_ended[0].participant == participant
diff --git a/plugins/aws/tests/test_tts.py b/plugins/aws/tests/test_tts.py
@@ -30,6 +30,14 @@ async def tts(self) -> aws.TTS:  # type: ignore[name-defined]
     return aws.TTS(voice_id=os.environ.get("AWS_POLLY_VOICE", "Joanna"))
 
 
+class TestAWSPollyTTS:
+    def test_partial_static_credentials_rejected(self):
+        with pytest.raises(ValueError, match="provided together"):
+            aws.TTS(aws_access_key_id="AKIA...")
+        with pytest.raises(ValueError, match="provided together"):
+            aws.TTS(aws_secret_access_key="secret")
+
+
 @pytest.mark.skip()
 @pytest.mark.integration
 class TestAWSPollyTTSIntegration:

diff --git a/plugins/aws/vision_agents/plugins/aws/__init__.py b/plugins/aws/vision_agents/plugins/aws/__init__.py
@@ -1,5 +1,6 @@
 from .aws_llm import BedrockLLM as LLM
 from .aws_realtime import Realtime
+from .stt import TranscribeSTT as STT
 from .tts import TTS
 
-__all__ = ["LLM", "Realtime", "TTS"]
+__all__ = ["LLM", "Realtime", "STT", "TTS"]
diff --git a/plugins/aws/vision_agents/plugins/aws/_credentials.py b/plugins/aws/vision_agents/plugins/aws/_credentials.py
@@ -0,0 +1,43 @@
+import asyncio
+from typing import Any, Optional
+
+import boto3
+from smithy_aws_core.identity.components import (
+    AWSCredentialsIdentity,
+    AWSIdentityProperties,
+)
+from smithy_core.aio.interfaces.identity import IdentityResolver
+
+
+class Boto3CredentialsResolver(
+    IdentityResolver[AWSCredentialsIdentity, AWSIdentityProperties]
+):
+    """IdentityResolver that delegates to boto3.Session for credential resolution.
+
+    Supports the full boto3 credential chain: env vars, shared credentials files,
+    AWS profiles, SSO, EC2 instance profiles, etc.
+    """
+
+    def __init__(self, profile_name: Optional[str] = None) -> None:
+        self._session = boto3.Session(profile_name=profile_name)
+
+    async def get_identity(
+        self, *, properties: AWSIdentityProperties, **kwargs: Any
+    ) -> AWSCredentialsIdentity:
+        # Both calls can block: get_credentials() walks the provider chain
+        # (file I/O, IMDS, SSO, STS) on first access, and get_frozen_credentials()
+        # triggers refresh on RefreshableCredentials.
+        credentials = await asyncio.to_thread(self._session.get_credentials)
+        if not credentials:
+            raise ValueError("Unable to load AWS credentials via boto3")
+
+        creds = await asyncio.to_thread(credentials.get_frozen_credentials)
+        if not creds.access_key or not creds.secret_key:
+            raise ValueError("AWS credentials are incomplete")
+
+        return AWSCredentialsIdentity(
+            access_key_id=creds.access_key,
+            secret_access_key=creds.secret_key,
+            session_token=creds.token or None,
+            expiration=None,
+        )