From 2e84a449a7f5b6e213996989a492201f937e7169 Mon Sep 17 00:00:00 2001 From: =?UTF-8?q?Mateusz=20S=C5=82uszniak?= Date: Mon, 2 Feb 2026 20:12:19 +0100 Subject: [PATCH 1/2] Fix transcription for longer audio --- apps/speech/screens/SpeechToTextScreen.tsx | 22 +++++++++++++++---- .../models/speech_to_text/asr/ASR.h | 4 +++- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index 1e4525986..da0374da0 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -50,16 +50,30 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { AudioManager.requestRecordingPermissions(); }, []); + async function getAudioFile(sourceUri: string) { + const destination = FileSystem.cacheDirectory + 'audio_file.wav'; + + if (sourceUri.startsWith('http')) { + // Case A: Remote URL -> Download it + const { uri } = await FileSystem.downloadAsync(sourceUri, destination); + return uri; + } else { + // Case B: Local URI -> Copy it + await FileSystem.copyAsync({ + from: sourceUri, + to: destination, + }); + return destination; + } + } + const handleTranscribeFromURL = async () => { if (!audioURL.trim()) { console.warn('Please provide a valid audio file URL'); return; } - const { uri } = await FileSystem.downloadAsync( - audioURL, - FileSystem.cacheDirectory + 'audio_file' - ); + const uri = await getAudioFile(audioURL); const audioContext = new AudioContext({ sampleRate: 16000 }); diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.h index a0ea7e181..41d1578b4 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.h @@ -34,7 +34,9 @@ class ASR { // The maximum number of 
tokens the decoder can generate per chunk constexpr static int32_t kMaxDecodeLength = 128; // Maximum duration of each audio chunk to process (in seconds) - constexpr static int32_t kChunkSize = 30; + // It is intentionally set to 29 since otherwise only the last chunk would be + // correctly transcribed due to the model's positional encoding limit + constexpr static int32_t kChunkSize = 29; // Sampling rate expected by Whisper and the model's audio pipeline (16 kHz) constexpr static int32_t kSamplingRate = 16000; // Minimum allowed chunk length before processing (in audio samples) From c20e7f2840c65348e724279bb778432f20eceff0 Mon Sep 17 00:00:00 2001 From: Mateusz Sluszniak <56299341+msluszniak@users.noreply.github.com> Date: Mon, 2 Feb 2026 20:14:44 +0100 Subject: [PATCH 2/2] Update apps/speech/screens/SpeechToTextScreen.tsx --- apps/speech/screens/SpeechToTextScreen.tsx | 2 -- 1 file changed, 2 deletions(-) diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index da0374da0..da7ed0f7e 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -54,11 +54,9 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { const destination = FileSystem.cacheDirectory + 'audio_file.wav'; if (sourceUri.startsWith('http')) { - // Case A: Remote URL -> Download it const { uri } = await FileSystem.downloadAsync(sourceUri, destination); return uri; } else { - // Case B: Local URI -> Copy it await FileSystem.copyAsync({ from: sourceUri, to: destination,