diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx index 1e4525986..da7ed0f7e 100644 --- a/apps/speech/screens/SpeechToTextScreen.tsx +++ b/apps/speech/screens/SpeechToTextScreen.tsx @@ -50,16 +50,28 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => { AudioManager.requestRecordingPermissions(); }, []); + async function getAudioFile(sourceUri: string) { + const destination = FileSystem.cacheDirectory + 'audio_file.wav'; + + if (sourceUri.startsWith('http')) { + const { uri } = await FileSystem.downloadAsync(sourceUri, destination); + return uri; + } else { + await FileSystem.copyAsync({ + from: sourceUri, + to: destination, + }); + return destination; + } + } + const handleTranscribeFromURL = async () => { if (!audioURL.trim()) { console.warn('Please provide a valid audio file URL'); return; } - const { uri } = await FileSystem.downloadAsync( - audioURL, - FileSystem.cacheDirectory + 'audio_file' - ); + const uri = await getAudioFile(audioURL); const audioContext = new AudioContext({ sampleRate: 16000 }); diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.h index a0ea7e181..41d1578b4 100644 --- a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.h +++ b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.h @@ -34,7 +34,9 @@ class ASR { // The maximum number of tokens the decoder can generate per chunk constexpr static int32_t kMaxDecodeLength = 128; // Maximum duration of each audio chunk to process (in seconds) - constexpr static int32_t kChunkSize = 30; + // It is intentionally set to 29 since otherwise only the last chunk would be + // correctly transcribed due to the model's positional encoding limit + constexpr static int32_t kChunkSize = 29; // Sampling rate expected by Whisper and the model's 
audio pipeline (16 kHz) constexpr static int32_t kSamplingRate = 16000; // Minimum allowed chunk length before processing (in audio samples)