software-mansion · msluszniak · Feb 3, 2026 · Feb 2, 2026 · Feb 2, 2026
diff --git a/apps/speech/screens/SpeechToTextScreen.tsx b/apps/speech/screens/SpeechToTextScreen.tsx
@@ -50,16 +50,28 @@ export const SpeechToTextScreen = ({ onBack }: { onBack: () => void }) => {
     AudioManager.requestRecordingPermissions();
   }, []);
 
+  async function getAudioFile(sourceUri: string) { // Stages the audio at `sourceUri` in the cache directory and resolves to the staged file's URI.
+    const destination = FileSystem.cacheDirectory + 'audio_file.wav'; // fixed target name, shared by every call
+    // Sources starting with 'http' (which also matches 'https') are downloaded; anything else is copied.
+    if (sourceUri.startsWith('http')) {
+      const { uri } = await FileSystem.downloadAsync(sourceUri, destination);
+      return uri; // URI reported back by downloadAsync
+    } else {
+      await FileSystem.copyAsync({
+        from: sourceUri,
+        to: destination,
+      });
+      return destination; // copyAsync's result is not used, so return the target path itself
+    }
+  }
+
   const handleTranscribeFromURL = async () => {
     if (!audioURL.trim()) {
       console.warn('Please provide a valid audio file URL');
       return;
     }
 
-    const { uri } = await FileSystem.downloadAsync(
-      audioURL,
-      FileSystem.cacheDirectory + 'audio_file'
-    );
+    const uri = await getAudioFile(audioURL);
 
     const audioContext = new AudioContext({ sampleRate: 16000 });
 

diff --git a/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.h b/packages/react-native-executorch/common/rnexecutorch/models/speech_to_text/asr/ASR.h
@@ -34,7 +34,9 @@ class ASR {
   // The maximum number of tokens the decoder can generate per chunk
   constexpr static int32_t kMaxDecodeLength = 128;
   // Maximum duration of each audio chunk to process (in seconds)
-  constexpr static int32_t kChunkSize = 30;
+  // It is intentionally set to 29 since otherwise only the last chunk would be
+  // correctly transcribed due to the model's positional encoding limit
+  constexpr static int32_t kChunkSize = 29;
   // Sampling rate expected by Whisper and the model's audio pipeline (16 kHz)
   constexpr static int32_t kSamplingRate = 16000;
   // Minimum allowed chunk length before processing (in audio samples)