diff --git a/App.tsx b/App.tsx index ac8cee15..1020942d 100644 --- a/App.tsx +++ b/App.tsx @@ -14,6 +14,15 @@ import { useTheme } from './src/theme'; import { hardwareService, modelManager, authService, ragService, remoteServerManager } from './src/services'; import logger from './src/utils/logger'; import { useAppStore, useAuthStore, useRemoteServerStore } from './src/stores'; +import { useTTSStore } from './src/stores/ttsStore'; +import { initExecutorch } from 'react-native-executorch'; +import { BareResourceFetcher } from 'react-native-executorch-bare-resource-fetcher'; +import { KokoroTTSManager } from './src/components/KokoroTTSManager'; +import { isExecutorchSupported } from './src/constants/kokoroModels'; + +// Initialise executorch resource fetcher once at module load time. +// This must run before any useTextToSpeech hook is mounted. +initExecutorch({ resourceFetcher: BareResourceFetcher }); import { LockScreen } from './src/screens'; import { useAppState } from './src/hooks/useAppState'; @@ -191,6 +200,9 @@ function App() { // Initialize RAG database tables ragService.ensureReady().catch((err) => logger.error('Failed to initialize RAG service on startup', err)); + // Sync TTS download state so TTSButton / audio mode know models are available + useTTSStore.getState().checkDownloadStatus().catch(() => {}); + // Show the UI immediately setIsInitializing(false); @@ -235,6 +247,7 @@ function App() { + {isExecutorchSupported() && } ({ + ttsService: { + isBackboneDownloaded: jest.fn(), + isVocoderDownloaded: jest.fn(), + downloadBackbone: jest.fn(), + downloadVocoder: jest.fn(), + deleteModels: jest.fn(), + loadModels: jest.fn(), + unloadModels: jest.fn(), + speak: jest.fn(), + stop: jest.fn(), + generateAndSave: jest.fn(), + playFromFile: jest.fn(), + getAudioCacheSizeMB: jest.fn(), + clearAudioCache: jest.fn(), + }, +})); + +jest.mock('../../../src/utils/logger', () => ({ + __esModule: true, + default: { log: jest.fn(), error: jest.fn(), warn: jest.fn() }, 
+})); + +import { useTTSStore } from '../../../src/stores/ttsStore'; +import { ttsService } from '../../../src/services/ttsService'; + +const mockTTS = ttsService as jest.Mocked; +const getState = () => useTTSStore.getState(); + +const resetStore = () => { + useTTSStore.setState({ + isBackboneDownloaded: false, + isVocoderDownloaded: false, + isDownloadingBackbone: false, + isDownloadingVocoder: false, + backboneDownloadProgress: 0, + vocoderDownloadProgress: 0, + isModelLoading: false, + isModelLoaded: false, + isSpeaking: false, + currentMessageId: null, + audioCacheSizeMB: 0, + settings: { interfaceMode: 'chat', enabled: true, autoPlay: false, speed: 1.0, voiceId: '0', kokoroVoiceId: 'af_heart' }, + error: null, + }); +}; + +describe('TTS integration', () => { + beforeEach(() => { + resetStore(); + jest.clearAllMocks(); + mockTTS.getAudioCacheSizeMB.mockResolvedValue(0); + }); + + // ─── Chat Mode ──────────────────────────────────────────────────────────── + + describe('Chat Mode: download → load → speak → stop', () => { + it('completes the full Chat Mode flow', async () => { + // 1. Download + mockTTS.downloadBackbone.mockResolvedValue('/bb.gguf'); + mockTTS.downloadVocoder.mockResolvedValue('/voc.gguf'); + await getState().downloadModels(); + + expect(getState().isBackboneDownloaded).toBe(true); + expect(getState().isVocoderDownloaded).toBe(true); + + // 2. Load + mockTTS.loadModels.mockResolvedValue(undefined); + await getState().loadModels(); + expect(getState().isModelLoaded).toBe(true); + + // 3. Speak + mockTTS.speak.mockResolvedValue(undefined); + mockTTS.stop.mockReturnValue(undefined); + + const speakPromise = getState().speak('hello', 'msg1'); + expect(getState().isSpeaking).toBe(true); + expect(getState().currentMessageId).toBe('msg1'); + + await speakPromise; + expect(getState().isSpeaking).toBe(false); + expect(getState().currentMessageId).toBeNull(); + + // 4. 
Stop mid-speech + mockTTS.speak.mockImplementation( + () => new Promise((resolve) => setTimeout(resolve, 1000)), + ); + getState().speak('second', 'msg2'); + getState().stop(); + expect(getState().isSpeaking).toBe(false); + }); + }); + + // ─── Audio Mode ─────────────────────────────────────────────────────────── + + describe('Audio Mode: download → load → generateAndSave → playMessage → stop', () => { + beforeEach(() => { + useTTSStore.setState({ + settings: { interfaceMode: 'audio', enabled: true, autoPlay: false, speed: 1.0, voiceId: '0', kokoroVoiceId: 'af_heart' }, + }); + }); + + it('completes the full Audio Mode flow', async () => { + // 1. Download + mockTTS.downloadBackbone.mockResolvedValue('/bb.gguf'); + mockTTS.downloadVocoder.mockResolvedValue('/voc.gguf'); + await getState().downloadModels(); + + // 2. Load + mockTTS.loadModels.mockResolvedValue(undefined); + await getState().loadModels(); + expect(getState().isModelLoaded).toBe(true); + + // 3. GenerateAndSave + const mockAudio = { + samples: new Float32Array(100), + durationSeconds: 1.5, + sampleRate: 24000, + waveformData: new Array(200).fill(0.2), + }; + mockTTS.generateAndSave.mockResolvedValue({ path: '/cache/c1/m1.pcm', audio: mockAudio } as any); + mockTTS.getAudioCacheSizeMB.mockResolvedValue(1.5); + + const result = await getState().generateAndSave('hello audio', 'conv1', 'msg1'); + + expect(result.path).toBe('/cache/c1/m1.pcm'); + expect(result.waveformData).toHaveLength(200); + expect(result.durationSeconds).toBe(1.5); + expect(getState().audioCacheSizeMB).toBeCloseTo(1.5); + + // 4. PlayMessage + mockTTS.playFromFile.mockResolvedValue(undefined); + mockTTS.stop.mockReturnValue(undefined); + + const playPromise = getState().playMessage('msg1', '/cache/c1/m1.pcm'); + expect(getState().isSpeaking).toBe(true); + expect(getState().currentMessageId).toBe('msg1'); + + await playPromise; + expect(getState().isSpeaking).toBe(false); + + // 5. 
StopPlayback + getState().stopPlayback(); + expect(mockTTS.stop).toHaveBeenCalled(); + }); + }); + + // ─── Mode switching ─────────────────────────────────────────────────────── + + describe('mode switching', () => { + it('switching interfaceMode to audio takes effect immediately', () => { + expect(getState().settings.interfaceMode).toBe('chat'); + getState().updateSettings({ interfaceMode: 'audio' }); + expect(getState().settings.interfaceMode).toBe('audio'); + }); + + it('switching back to chat mode works', () => { + getState().updateSettings({ interfaceMode: 'audio' }); + getState().updateSettings({ interfaceMode: 'chat' }); + expect(getState().settings.interfaceMode).toBe('chat'); + }); + }); + + // ─── Auto-play ──────────────────────────────────────────────────────────── + + describe('auto-play', () => { + it('speak is called when autoPlay is true and model is loaded', async () => { + useTTSStore.setState({ + isModelLoaded: true, + settings: { interfaceMode: 'chat', enabled: true, autoPlay: true, speed: 1.0, voiceId: '0', kokoroVoiceId: 'af_heart' }, + }); + mockTTS.speak.mockResolvedValue(undefined); + mockTTS.stop.mockReturnValue(undefined); + + // Simulate chat completion triggering speak + await getState().speak('AI response text', 'last-msg-id'); + + expect(mockTTS.speak).toHaveBeenCalledWith( + 'AI response text', + expect.objectContaining({ voiceId: '0', speed: 1.0 }), + expect.any(Function), + ); + }); + }); +}); diff --git a/__tests__/rntl/components/ChatInput.test.tsx b/__tests__/rntl/components/ChatInput.test.tsx index 617430ab..303297d3 100644 --- a/__tests__/rntl/components/ChatInput.test.tsx +++ b/__tests__/rntl/components/ChatInput.test.tsx @@ -51,10 +51,20 @@ jest.mock('../../../src/services/documentService', () => ({ // Mock the stores const mockUseWhisperStore = jest.fn(); const mockUseAppStore = jest.fn(); +const mockUseTTSStore = jest.fn(() => ({ + settings: { interfaceMode: 'chat', enabled: false, speed: 1.0 }, + isBackboneDownloaded: 
false, + isVocoderDownloaded: false, + isModelLoaded: false, + loadModels: jest.fn(), + unloadModels: jest.fn(), + updateSettings: jest.fn(), +})); jest.mock('../../../src/stores', () => ({ useWhisperStore: () => mockUseWhisperStore(), useAppStore: () => mockUseAppStore(), + useTTSStore: () => mockUseTTSStore(), })); // Mock the whisper hook diff --git a/__tests__/rntl/components/GenerationSettingsModal.test.tsx b/__tests__/rntl/components/GenerationSettingsModal.test.tsx index a9ef4647..ed7272b1 100644 --- a/__tests__/rntl/components/GenerationSettingsModal.test.tsx +++ b/__tests__/rntl/components/GenerationSettingsModal.test.tsx @@ -859,13 +859,13 @@ describe('GenerationSettingsModal', () => { }); it('calls handleSliderComplete on text generation slider (no-op)', () => { - const { getByText, getAllByTestId } = render( + const { getByText, queryAllByTestId } = render( , ); fireEvent.press(getByText('TEXT GENERATION')); - const sliders = getAllByTestId('slider'); + const sliders = queryAllByTestId('slider'); // onSlidingComplete is a no-op but should not throw if (sliders.length > 0 && sliders[0].props.onSlidingComplete) { expect(() => sliders[0].props.onSlidingComplete(0.5)).not.toThrow(); @@ -873,13 +873,13 @@ describe('GenerationSettingsModal', () => { }); it('calls handleSliderChange on text slider value change', () => { - const { getByText, getAllByTestId } = render( + const { getByText, queryAllByTestId } = render( , ); fireEvent.press(getByText('TEXT GENERATION')); - const sliders = getAllByTestId('slider'); + const sliders = queryAllByTestId('slider'); if (sliders.length > 0 && sliders[0].props.onValueChange) { sliders[0].props.onValueChange(0.5); expect(mockUpdateSettings).toHaveBeenCalled(); @@ -1070,17 +1070,16 @@ describe('GenerationSettingsModal', () => { expect(mockUpdateSettings).toHaveBeenCalledWith({ enableGpu: true, cacheType: 'f16' }); }); - it('calls updateSettings with gpuLayers value from GPU layers slider', () => { + it('calls updateSettings 
with gpuLayers value from GPU layers stepper', () => { mockStoreValues.settings = { ...defaultSettings, enableGpu: true, gpuLayers: 6, flashAttn: false }; const { getByText, getByTestId } = render(); fireEvent.press(getByText('TEXT GENERATION')); fireEvent.press(getByTestId('modal-text-advanced-toggle')); mockUpdateSettings.mockClear(); - const slider = getByTestId('gpu-layers-slider'); - slider.props.onSlidingComplete(12); + fireEvent.press(getByTestId('gpu-layers-stepper-increment')); - expect(mockUpdateSettings).toHaveBeenCalledWith({ gpuLayers: 12 }); + expect(mockUpdateSettings).toHaveBeenCalledWith({ gpuLayers: 7 }); }); }); }); diff --git a/__tests__/rntl/components/VoiceRecordButton.test.tsx b/__tests__/rntl/components/VoiceRecordButton.test.tsx index b92c45a3..84899278 100644 --- a/__tests__/rntl/components/VoiceRecordButton.test.tsx +++ b/__tests__/rntl/components/VoiceRecordButton.test.tsx @@ -87,16 +87,17 @@ describe('VoiceRecordButton', () => { }); it('shows recording indicator when isRecording is true', () => { - const { getByText } = render( + const { toJSON } = render( ); - // When recording, "Slide to cancel" text appears in the cancel hint - expect(getByText('Slide to cancel')).toBeTruthy(); + // In audio mode (default, !asSendButton), recording shows a stop icon (square) + const treeStr = JSON.stringify(toJSON()); + expect(treeStr).toContain('square'); }); it('shows transcribing state when isTranscribing is true', () => { - const { getByText } = render( + const { toJSON } = render( { /> ); - // Transcribing state shows "Transcribing..." 
text - expect(getByText('Transcribing...')).toBeTruthy(); + // Transcribing state renders a spinning indicator (no text in audio mode) + expect(toJSON()).toBeTruthy(); }); - it('shows partial result text when provided', () => { + it('shows partial result text when provided in chat mode (asSendButton)', () => { const { getByText } = render( @@ -166,7 +168,7 @@ describe('VoiceRecordButton', () => { expect(toJSON()).toBeTruthy(); }); - it('taps unavailable button and triggers alert with error message', () => { + it('taps unavailable button and triggers download prompt alert', () => { const { UNSAFE_getAllByType } = render( { fireEvent.press(touchables[0]); expect(mockShowAlert).toHaveBeenCalledWith( - 'Voice Input Unavailable', - expect.stringContaining('Microphone permission denied'), + 'Download Voice Model', + expect.stringContaining('Download Whisper Small'), expect.any(Array) ); }); - it('taps unavailable button with default error when no error prop', () => { + it('taps unavailable button shows download prompt with size', () => { const { UNSAFE_getAllByType } = render( { fireEvent.press(touchables[0]); expect(mockShowAlert).toHaveBeenCalledWith( - 'Voice Input Unavailable', - expect.stringContaining('No transcription model downloaded'), + 'Download Voice Model', + expect.stringContaining('466 MB'), expect.any(Array) ); }); - it('alert message includes instructions for downloading model', () => { + it('alert message includes Download and Cancel buttons', () => { const { UNSAFE_getAllByType } = render( { fireEvent.press(touchables[0]); expect(mockShowAlert).toHaveBeenCalledWith( - 'Voice Input Unavailable', - expect.stringContaining('Download a Whisper model'), - expect.any(Array) + 'Download Voice Model', + expect.any(String), + expect.arrayContaining([ + expect.objectContaining({ text: 'Cancel' }), + expect.objectContaining({ text: 'Download' }), + ]) ); }); }); @@ -400,11 +405,13 @@ describe('VoiceRecordButton', () => { }); it('does not show cancel hint when not 
recording', () => { - const { queryByText } = render( + const { toJSON } = render( ); - expect(queryByText('Slide to cancel')).toBeNull(); + // Audio mode (default) uses tap-to-toggle, no slide-to-cancel + const treeStr = JSON.stringify(toJSON()); + expect(treeStr).not.toContain('Slide to cancel'); }); it('does not show partial result when partialResult is empty', () => { @@ -418,12 +425,12 @@ describe('VoiceRecordButton', () => { // partialResult is empty, so the partial result container should not render const treeStr = JSON.stringify(toJSON()); - // The cancel hint should still show - expect(treeStr).toContain('Slide to cancel'); + // Audio mode uses tap-to-toggle with a stop icon + expect(treeStr).toContain('square'); }); it('shows recording UI elements but not transcribing when recording', () => { - const { getByText, queryByText } = render( + const { toJSON, queryByText } = render( { // When isRecording is true AND isTranscribing is true, // the component shows recording UI (not transcribing state) - expect(getByText('Slide to cancel')).toBeTruthy(); + const treeStr = JSON.stringify(toJSON()); + expect(treeStr).toContain('square'); expect(queryByText('Transcribing...')).toBeNull(); }); @@ -446,7 +454,7 @@ describe('VoiceRecordButton', () => { }); it('prioritizes model loading state over recording', () => { - const { getByText, queryByText } = render( + const { getByText, toJSON } = render( { ); expect(getByText('Loading...')).toBeTruthy(); - expect(queryByText('Slide to cancel')).toBeNull(); + // Recording UI should not render when loading + const treeStr = JSON.stringify(toJSON()); + expect(treeStr).not.toContain('square'); }); it('prioritizes model loading state over transcribing', () => { - const { getByText, queryByText } = render( + const { getByText, toJSON } = render( { ); expect(getByText('Loading...')).toBeTruthy(); - expect(queryByText('Transcribing...')).toBeNull(); + // Transcribing state should not render when loading + 
expect(toJSON()).toBeTruthy(); }); }); }); diff --git a/__tests__/rntl/screens/DownloadManagerScreen.test.tsx b/__tests__/rntl/screens/DownloadManagerScreen.test.tsx index 2a976dfd..255ab5e6 100644 --- a/__tests__/rntl/screens/DownloadManagerScreen.test.tsx +++ b/__tests__/rntl/screens/DownloadManagerScreen.test.tsx @@ -212,20 +212,23 @@ describe('DownloadManagerScreen', () => { }); it('shows empty state when no downloads', () => { - const { getByText } = render(); - expect(getByText('No active downloads')).toBeTruthy(); + const { getByText, queryByText } = render(); + // Active Downloads section is hidden when there are no active items + expect(queryByText('Active Downloads')).toBeNull(); expect(getByText('No models downloaded yet')).toBeTruthy(); }); it('shows section headers for active and completed', () => { - const { getByText } = render(); - expect(getByText('Active Downloads')).toBeTruthy(); + const { getByText, queryByText } = render(); + // Active Downloads section is hidden when empty + expect(queryByText('Active Downloads')).toBeNull(); + // Downloaded Models section is always shown expect(getByText('Downloaded Models')).toBeTruthy(); }); it('shows empty subtext when no models downloaded', () => { const { getByText } = render(); - expect(getByText('Go to the Models tab to browse and download models')).toBeTruthy(); + expect(getByText('No models downloaded yet')).toBeTruthy(); }); it('renders completed text model with details', () => { @@ -305,11 +308,12 @@ describe('DownloadManagerScreen', () => { expect(getByText(/Total storage used/)).toBeTruthy(); }); - it('shows count badges for active and completed sections', () => { + it('shows count badge for completed section', () => { setupSingleModelState(); const { getByText } = render(); - expect(getByText('0')).toBeTruthy(); + // Active section is hidden when empty (no "0" badge) + // Completed section shows count of 1 expect(getByText('1')).toBeTruthy(); }); @@ -344,7 +348,8 @@ 
describe('DownloadManagerScreen', () => { const { UNSAFE_getAllByType } = render(); const touchables = UNSAFE_getAllByType(TouchableOpacity); - const cancelButtons = touchables.filter((_: any, i: number) => i > 0); + // Skip back button (1) + filter chips (6) = 7 touchables before content + const cancelButtons = touchables.filter((_: any, i: number) => i > 6); if (cancelButtons.length > 0) { fireEvent.press(cancelButtons[0]); } @@ -820,8 +825,8 @@ describe('DownloadManagerScreen', () => { const { UNSAFE_getAllByType, getByTestId } = render(); const touchables = UNSAFE_getAllByType(TouchableOpacity); - // Press the cancel button (second touchable after back button) - const cancelButtons = touchables.filter((_: any, i: number) => i > 0); + // Skip back button (1) + filter chips (6) = 7 touchables before content + const cancelButtons = touchables.filter((_: any, i: number) => i > 6); fireEvent.press(cancelButtons[0]); // Press "Yes" to confirm @@ -852,7 +857,8 @@ describe('DownloadManagerScreen', () => { const { UNSAFE_getAllByType, getByTestId } = render(); const touchables = UNSAFE_getAllByType(TouchableOpacity); - const cancelButtons = touchables.filter((_: any, i: number) => i > 0); + // Skip back button (1) + filter chips (6) = 7 touchables before content + const cancelButtons = touchables.filter((_: any, i: number) => i > 6); fireEvent.press(cancelButtons[0]); await act(async () => { @@ -880,7 +886,8 @@ describe('DownloadManagerScreen', () => { const { UNSAFE_getAllByType, getByTestId } = render(); const touchables = UNSAFE_getAllByType(TouchableOpacity); - const cancelButtons = touchables.filter((_: any, i: number) => i > 0); + // Skip back button (1) + filter chips (6) = 7 touchables before content + const cancelButtons = touchables.filter((_: any, i: number) => i > 6); fireEvent.press(cancelButtons[0]); await act(async () => { @@ -1029,7 +1036,8 @@ describe('DownloadManagerScreen', () => { // Find the cancel button for the RNFS download (which has no 
downloadId) const touchables = result.UNSAFE_getAllByType(TouchableOpacity); - const cancelButtons = touchables.filter((_: any, i: number) => i > 0); + // Skip back button (1) + filter chips (6) = 7 touchables before content + const cancelButtons = touchables.filter((_: any, i: number) => i > 6); if (cancelButtons.length > 0) { fireEvent.press(cancelButtons[0]); @@ -1367,8 +1375,8 @@ describe('DownloadManagerScreen', () => { // Find and press cancel button on the active download const touchables = result.UNSAFE_getAllByType(TouchableOpacity); - // Find cancel buttons (skip back button) - const cancelButtons = touchables.filter((_: any, i: number) => i > 0); + // Skip back button (1) + filter chips (6) = 7 touchables before content + const cancelButtons = touchables.filter((_: any, i: number) => i > 6); if (cancelButtons.length > 0) { fireEvent.press(cancelButtons[0]); diff --git a/__tests__/rntl/screens/ModelSettingsScreen.test.tsx b/__tests__/rntl/screens/ModelSettingsScreen.test.tsx index 026ba7b1..455b376b 100644 --- a/__tests__/rntl/screens/ModelSettingsScreen.test.tsx +++ b/__tests__/rntl/screens/ModelSettingsScreen.test.tsx @@ -644,14 +644,13 @@ describe('ModelSettingsScreen', () => { expect(useAppStore.getState().settings.enableGpu).toBe(true); }); - it('updates gpuLayers when GPU Layers slider completes', () => { + it('updates gpuLayers when GPU Layers stepper is incremented', () => { useAppStore.getState().updateSettings({ enableGpu: true, flashAttn: false, gpuLayers: 6 }); const { getByTestId } = renderWithSections('text'); - const slider = getByTestId('gpu-layers-slider'); - fireEvent(slider, 'slidingComplete', 12); + fireEvent.press(getByTestId('gpu-layers-stepper-increment')); - expect(useAppStore.getState().settings.gpuLayers).toBe(12); + expect(useAppStore.getState().settings.gpuLayers).toBe(7); }); }); }); diff --git a/__tests__/rntl/screens/VoiceSettingsScreen.test.tsx b/__tests__/rntl/screens/VoiceSettingsScreen.test.tsx index a055a2ad..7d459bde 
100644 --- a/__tests__/rntl/screens/VoiceSettingsScreen.test.tsx +++ b/__tests__/rntl/screens/VoiceSettingsScreen.test.tsx @@ -3,15 +3,15 @@ * * Tests for the voice settings screen including: * - Title display - * - Description text about Whisper - * - Download options when no model + * - Privacy note text + * - English and Multilingual model sections * - Back button navigation - * - Downloaded model state (name, status badge, remove button) + * - Active model state (name, badge, remove button) * - Download progress display * - Model download trigger * - Remove model confirmation alert * - Error display and clear - * - Privacy card display + * - Search bar * * Priority: P1 (High) */ @@ -82,6 +82,7 @@ jest.mock('../../../src/components/Button', () => ({ })); const mockDownloadModel = jest.fn(); +const mockDownloadFromUrl = jest.fn(); const mockDeleteModel = jest.fn(); const mockClearError = jest.fn(); @@ -90,6 +91,7 @@ let mockWhisperStoreValues: any = { isDownloading: false, downloadProgress: 0, downloadModel: mockDownloadModel, + downloadFromUrl: mockDownloadFromUrl, deleteModel: mockDeleteModel, error: null, clearError: mockClearError, @@ -101,13 +103,24 @@ jest.mock('../../../src/stores', () => ({ jest.mock('../../../src/services', () => ({ WHISPER_MODELS: [ - { id: 'tiny', name: 'Whisper Tiny', size: '75', description: 'Fastest, lower accuracy' }, - { id: 'base', name: 'Whisper Base', size: '141', description: 'Good accuracy' }, - { id: 'small', name: 'Whisper Small', size: '461', description: 'Better accuracy' }, - { id: 'medium', name: 'Whisper Medium', size: '1500', description: 'Best accuracy' }, + { id: 'tiny.en', name: 'Tiny', size: 75, lang: 'en', description: 'Fastest, English only' }, + { id: 'base.en', name: 'Base', size: 142, lang: 'en', description: 'Better accuracy, English only' }, + { id: 'small.en', name: 'Small', size: 466, lang: 'en', description: 'High accuracy, English only' }, + { id: 'medium.en', name: 'Medium', size: 1500, lang: 'en', 
description: 'Near human-level, English only' }, + { id: 'tiny', name: 'Tiny', size: 75, lang: 'multi', description: 'Fastest, 99 languages' }, + { id: 'base', name: 'Base', size: 142, lang: 'multi', description: 'Better accuracy, 99 languages' }, + { id: 'small', name: 'Small', size: 466, lang: 'multi', description: 'High accuracy, 99 languages' }, + { id: 'medium', name: 'Medium', size: 1500, lang: 'multi', description: 'Near human-level, 99 languages' }, ], })); +jest.mock('../../../src/services/huggingface', () => ({ + huggingFaceService: { + searchWhisperRepos: jest.fn().mockResolvedValue([]), + getWhisperFiles: jest.fn().mockResolvedValue([]), + }, +})); + import { VoiceSettingsScreen } from '../../../src/screens/VoiceSettingsScreen'; const mockGoBack = jest.fn(); @@ -134,6 +147,7 @@ describe('VoiceSettingsScreen', () => { isDownloading: false, downloadProgress: 0, downloadModel: mockDownloadModel, + downloadFromUrl: mockDownloadFromUrl, deleteModel: mockDeleteModel, error: null, clearError: mockClearError, @@ -149,19 +163,16 @@ describe('VoiceSettingsScreen', () => { expect(getByText('Voice Transcription')).toBeTruthy(); }); - it('shows description text about Whisper', () => { + it('shows privacy note about on-device transcription', () => { const { getByText } = render(); expect( - getByText(/Download a Whisper model to enable on-device voice input/), + getByText(/All transcription runs on-device/), ).toBeTruthy(); }); - it('shows privacy card', () => { - const { getByText } = render(); - expect(getByText('Privacy First')).toBeTruthy(); - expect( - getByText(/Voice transcription happens entirely on your device/), - ).toBeTruthy(); + it('shows search bar', () => { + const { getByPlaceholderText } = render(); + expect(getByPlaceholderText('Search models or HuggingFace...')).toBeTruthy(); }); it('back button calls goBack', () => { @@ -178,48 +189,46 @@ describe('VoiceSettingsScreen', () => { // No Model Downloaded - Download Options // 
============================================================================ describe('download options (no model)', () => { - it('shows download options when no model is downloaded', () => { + it('shows English model section', () => { const { getByText } = render(); - expect(getByText('Whisper Tiny')).toBeTruthy(); - expect(getByText('Whisper Base')).toBeTruthy(); - expect(getByText('Whisper Small')).toBeTruthy(); + expect(getByText('ENGLISH ONLY')).toBeTruthy(); }); - it('shows only first 3 models (slice(0, 3))', () => { - const { queryByText } = render(); - // 4th model (medium) should NOT be shown due to .slice(0, 3) - expect(queryByText('Whisper Medium')).toBeNull(); + it('shows Multilingual model section', () => { + const { getByText } = render(); + expect(getByText(/MULTILINGUAL/)).toBeTruthy(); }); - it('shows "Select a model to download" label', () => { - const { getByText } = render(); - expect(getByText('Select a model to download:')).toBeTruthy(); + it('shows model names in English section', () => { + const { getAllByText } = render(); + // "Tiny" appears in both English and Multilingual sections + expect(getAllByText('Tiny').length).toBeGreaterThanOrEqual(1); }); - it('shows model size for each option', () => { - const { getByText } = render(); - expect(getByText('75 MB')).toBeTruthy(); - expect(getByText('141 MB')).toBeTruthy(); - expect(getByText('461 MB')).toBeTruthy(); + it('shows model size for options', () => { + const { getAllByText } = render(); + // Sizes appear in both English and Multilingual sections + expect(getAllByText('75 MB').length).toBeGreaterThanOrEqual(1); + expect(getAllByText('142 MB').length).toBeGreaterThanOrEqual(1); + expect(getAllByText('466 MB').length).toBeGreaterThanOrEqual(1); }); - it('shows model description for each option', () => { + it('shows model description for options', () => { const { getByText } = render(); - expect(getByText('Fastest, lower accuracy')).toBeTruthy(); - expect(getByText('Good 
accuracy')).toBeTruthy(); - expect(getByText('Better accuracy')).toBeTruthy(); + expect(getByText('Fastest, English only')).toBeTruthy(); + expect(getByText('Better accuracy, English only')).toBeTruthy(); }); it('calls downloadModel when a model option is pressed', () => { - const { getByText } = render(); - fireEvent.press(getByText('Whisper Base')); - expect(mockDownloadModel).toHaveBeenCalledWith('base'); + const { getByTestId } = render(); + fireEvent.press(getByTestId('model-download-base.en')); + expect(mockDownloadModel).toHaveBeenCalledWith('base.en'); }); it('calls downloadModel with correct id for tiny model', () => { - const { getByText } = render(); - fireEvent.press(getByText('Whisper Tiny')); - expect(mockDownloadModel).toHaveBeenCalledWith('tiny'); + const { getByTestId } = render(); + fireEvent.press(getByTestId('model-download-tiny.en')); + expect(mockDownloadModel).toHaveBeenCalledWith('tiny.en'); }); }); @@ -230,28 +239,28 @@ describe('VoiceSettingsScreen', () => { beforeEach(() => { mockWhisperStoreValues = { ...mockWhisperStoreValues, - downloadedModelId: 'base', + downloadedModelId: 'base.en', }; }); - it('shows downloaded model name', () => { + it('shows active model section label', () => { const { getByText } = render(); - expect(getByText('Whisper Base')).toBeTruthy(); + expect(getByText('ACTIVE MODEL')).toBeTruthy(); }); - it('shows "Downloaded" status badge', () => { + it('shows downloaded model name with language', () => { const { getByText } = render(); - expect(getByText('Downloaded')).toBeTruthy(); + expect(getByText(/Base — English/)).toBeTruthy(); }); - it('shows "Remove Model" button', () => { + it('shows "Active" status badge', () => { const { getByText } = render(); - expect(getByText('Remove Model')).toBeTruthy(); + expect(getByText('Active')).toBeTruthy(); }); - it('does not show download options when model is downloaded', () => { - const { queryByText } = render(); - expect(queryByText('Select a model to 
download:')).toBeNull(); + it('shows "Remove" button', () => { + const { getByText } = render(); + expect(getByText('Remove')).toBeTruthy(); }); it('shows model id as fallback when model not found in WHISPER_MODELS', () => { @@ -263,11 +272,11 @@ describe('VoiceSettingsScreen', () => { expect(getByText('unknown-model')).toBeTruthy(); }); - it('pressing Remove Model shows confirmation alert', () => { + it('pressing Remove shows confirmation alert', () => { const { getByText } = render(); - fireEvent.press(getByText('Remove Model')); + fireEvent.press(getByText('Remove')); expect(mockShowAlert).toHaveBeenCalledWith( - 'Remove Whisper Model', + 'Remove Voice Model', 'This will disable voice input until you download a model again.', expect.arrayContaining([ expect.objectContaining({ text: 'Cancel', style: 'cancel' }), @@ -294,11 +303,6 @@ describe('VoiceSettingsScreen', () => { expect(getByText('Downloading... 45%')).toBeTruthy(); }); - it('does not show download options during download', () => { - const { queryByText } = render(); - expect(queryByText('Select a model to download:')).toBeNull(); - }); - it('shows 0% at start of download', () => { mockWhisperStoreValues = { ...mockWhisperStoreValues, @@ -334,13 +338,13 @@ describe('VoiceSettingsScreen', () => { // Error State // ============================================================================ describe('error state', () => { - it('shows error message when whisperError is set', () => { + it('shows error message with tap to dismiss when whisperError is set', () => { mockWhisperStoreValues = { ...mockWhisperStoreValues, error: 'Download failed: network error', }; const { getByText } = render(); - expect(getByText('Download failed: network error')).toBeTruthy(); + expect(getByText('Download failed: network error (tap to dismiss)')).toBeTruthy(); }); it('calls clearError when error is tapped', () => { @@ -349,7 +353,7 @@ describe('VoiceSettingsScreen', () => { error: 'Download failed', }; const { getByText } = 
render(); - fireEvent.press(getByText('Download failed')); + fireEvent.press(getByText('Download failed (tap to dismiss)')); expect(mockClearError).toHaveBeenCalled(); }); diff --git a/__tests__/unit/hooks/useKeyboardAwarePopover.test.ts b/__tests__/unit/hooks/useKeyboardAwarePopover.test.ts index 0e37e3e3..727880ba 100644 --- a/__tests__/unit/hooks/useKeyboardAwarePopover.test.ts +++ b/__tests__/unit/hooks/useKeyboardAwarePopover.test.ts @@ -126,12 +126,12 @@ describe('useKeyboardAwarePopover', () => { expect(mockKeyboardDismiss).not.toHaveBeenCalled(); }); - it('measures trigger position with custom offsetX', () => { + it('measures trigger position from button coords', () => { const mockMeasureInWindow = jest.fn((callback) => { callback(10, 100, 50, 30); }); - const { result } = renderHook(() => useKeyboardAwarePopover(20)); + const { result } = renderHook(() => useKeyboardAwarePopover()); // Set up mock ref (result.current.triggerRef as any).current = { @@ -143,9 +143,9 @@ describe('useKeyboardAwarePopover', () => { }); expect(mockMeasureInWindow).toHaveBeenCalled(); - // anchor.y = screenH - y = 800 - 100 = 700 - // anchor.x = offsetX = 20 - expect(result.current.anchor).toEqual({ y: 700, x: 20 }); + // anchor.y = screenH - btnY = 800 - 100 = 700 + // anchor.x = screenW - (btnX + btnW) = 400 - (10 + 50) = 340 + expect(result.current.anchor).toEqual({ y: 700, x: 340 }); }); it('handles missing measureInWindow gracefully', () => { @@ -175,7 +175,8 @@ describe('useKeyboardAwarePopover', () => { }); // y = screenH - (undefined ?? 
0) = 800 - 0 = 800 - expect(result.current.anchor).toEqual({ y: 800, x: 12 }); // SPACING.md = 12 + // x = screenW - (btnX + btnW) = 400 - (10 + 50) = 340 + expect(result.current.anchor).toEqual({ y: 800, x: 340 }); }); }); @@ -361,8 +362,8 @@ describe('useKeyboardAwarePopover', () => { }); }); - describe('offsetX parameter', () => { - it('uses default SPACING.md when offsetX not provided', () => { + describe('button position measurement', () => { + it('computes anchorX as right-edge distance from screen right', () => { const mockMeasureInWindow = jest.fn((callback) => { callback(10, 100, 50, 30); }); @@ -377,16 +378,16 @@ describe('useKeyboardAwarePopover', () => { result.current.show(); }); - // SPACING.md = 12 - expect(result.current.anchor.x).toBe(12); + // screenW=400, btnX=10, btnW=50 → x = 400 - (10+50) = 340 + expect(result.current.anchor.x).toBe(340); }); - it('uses custom offsetX when provided', () => { + it('computes anchorY as distance from button top to screen bottom', () => { const mockMeasureInWindow = jest.fn((callback) => { callback(10, 100, 50, 30); }); - const { result } = renderHook(() => useKeyboardAwarePopover(50)); + const { result } = renderHook(() => useKeyboardAwarePopover()); (result.current.triggerRef as any).current = { measureInWindow: mockMeasureInWindow, @@ -396,7 +397,8 @@ describe('useKeyboardAwarePopover', () => { result.current.show(); }); - expect(result.current.anchor.x).toBe(50); + // screenH=800, btnY=100 → y = 800 - 100 = 700 + expect(result.current.anchor.y).toBe(700); }); }); }); \ No newline at end of file diff --git a/__tests__/unit/services/ttsService.test.ts b/__tests__/unit/services/ttsService.test.ts new file mode 100644 index 00000000..4e46d45b --- /dev/null +++ b/__tests__/unit/services/ttsService.test.ts @@ -0,0 +1,302 @@ +/** + * TTS Service Unit Tests + * + * Tests for backbone/vocoder download, model lifecycle, audio generation, + * file persistence, and playback control. 
+ * Priority: P1 - Core TTS functionality. + */ + +jest.mock('llama.rn', () => ({ + initLlama: jest.fn(), +})); + +jest.mock('react-native-fs', () => ({ + DocumentDirectoryPath: '/mock/docs', + exists: jest.fn(), + mkdir: jest.fn(), + unlink: jest.fn(), + downloadFile: jest.fn(), + writeFile: jest.fn(), + readFile: jest.fn(), + stat: jest.fn(), + readDir: jest.fn(), +})); + +jest.mock('react-native-audio-api', () => ({ + AudioContext: jest.fn().mockImplementation(() => ({ + createBuffer: jest.fn().mockReturnValue({ copyToChannel: jest.fn() }), + createBufferSource: jest.fn().mockReturnValue({ + connect: jest.fn(), + start: jest.fn(), + stop: jest.fn(), + playbackRate: { value: 1.0 }, + onended: null, + buffer: null, + }), + destination: {}, + close: jest.fn(), + })), +})); + +jest.mock('../../../src/utils/logger', () => ({ + __esModule: true, + default: { log: jest.fn(), error: jest.fn(), warn: jest.fn() }, +})); + +import RNFS from 'react-native-fs'; +import { initLlama } from 'llama.rn'; +import { ttsService } from '../../../src/services/ttsService'; +import { TTS_BACKBONE_MODEL } from '../../../src/constants/ttsModels'; + +const mockRNFS = RNFS as jest.Mocked; +const mockInitLlama = initLlama as jest.Mock; + +const makeMockContext = (vocoderEnabled = true) => ({ + initVocoder: jest.fn().mockResolvedValue(undefined), + isVocoderEnabled: jest.fn().mockResolvedValue(vocoderEnabled), + releaseVocoder: jest.fn().mockResolvedValue(undefined), + release: jest.fn().mockResolvedValue(undefined), + getFormattedAudioCompletion: jest.fn().mockResolvedValue({ prompt: 'p', grammar: 'g' }), + getAudioCompletionGuideTokens: jest.fn().mockResolvedValue([1, 2, 3]), + completion: jest.fn().mockResolvedValue({ audio_tokens: [10, 20, 30] }), + decodeAudioTokens: jest.fn().mockResolvedValue(new Array(2400).fill(0.1)), +}); + +describe('ttsService', () => { + beforeEach(() => { + jest.clearAllMocks(); + // Reset internal state between tests + (ttsService as any).context = null; + 
(ttsService as any).isVocoderReady = false; + (ttsService as any).isSpeakingFlag = false; + (ttsService as any).contextLoadPromise = Promise.resolve(); + }); + + // ─── Paths ──────────────────────────────────────────────────────────────── + + describe('paths', () => { + it('backbone path uses tts-models directory', () => { + expect(ttsService.getBackbonePath()).toBe( + `/mock/docs/tts-models/${TTS_BACKBONE_MODEL.backboneFile}`, + ); + }); + + it('vocoder path uses tts-models directory', () => { + expect(ttsService.getVocoderPath()).toBe( + `/mock/docs/tts-models/${TTS_BACKBONE_MODEL.vocoderFile}`, + ); + }); + + it('audio file path scoped to conversationId and messageId', () => { + expect(ttsService.getAudioFilePath('conv1', 'msg1')).toBe( + '/mock/docs/audio-cache/conv1/msg1.pcm', + ); + }); + }); + + // ─── Download ──────────────────────────────────────────────────────────── + + describe('downloadBackbone', () => { + it('returns existing path without downloading if already present', async () => { + mockRNFS.exists.mockResolvedValueOnce(true) // ensureDir + .mockResolvedValueOnce(true); // file exists + const path = await ttsService.downloadBackbone(); + expect(mockRNFS.downloadFile).not.toHaveBeenCalled(); + expect(path).toBe(ttsService.getBackbonePath()); + }); + + it('downloads and returns path on success', async () => { + mockRNFS.exists.mockResolvedValueOnce(false) // dir missing + .mockResolvedValueOnce(false); // file missing + mockRNFS.mkdir.mockResolvedValueOnce(undefined); + mockRNFS.downloadFile.mockReturnValue({ jobId: 1, promise: Promise.resolve({ statusCode: 200, jobId: 1, bytesWritten: 0 }) }); + + const onProgress = jest.fn(); + const path = await ttsService.downloadBackbone(onProgress); + + expect(mockRNFS.downloadFile).toHaveBeenCalledWith( + expect.objectContaining({ fromUrl: TTS_BACKBONE_MODEL.backboneUrl }), + ); + expect(path).toBe(ttsService.getBackbonePath()); + }); + + it('throws and removes partial file on non-200 response', async () => 
{ + mockRNFS.exists.mockResolvedValue(false); + mockRNFS.mkdir.mockResolvedValueOnce(undefined); + mockRNFS.downloadFile.mockReturnValue({ jobId: 1, promise: Promise.resolve({ statusCode: 404, jobId: 1, bytesWritten: 0 }) }); + mockRNFS.unlink.mockResolvedValue(undefined); + + await expect(ttsService.downloadBackbone()).rejects.toThrow('HTTP 404'); + expect(mockRNFS.unlink).toHaveBeenCalled(); + }); + }); + + describe('downloadVocoder', () => { + it('downloads vocoder to correct path', async () => { + mockRNFS.exists.mockResolvedValue(false); + mockRNFS.mkdir.mockResolvedValueOnce(undefined); + mockRNFS.downloadFile.mockReturnValue({ jobId: 1, promise: Promise.resolve({ statusCode: 200, jobId: 1, bytesWritten: 0 }) }); + + const path = await ttsService.downloadVocoder(); + expect(mockRNFS.downloadFile).toHaveBeenCalledWith( + expect.objectContaining({ fromUrl: TTS_BACKBONE_MODEL.vocoderUrl }), + ); + expect(path).toBe(ttsService.getVocoderPath()); + }); + }); + + // ─── Model Lifecycle ───────────────────────────────────────────────────── + + describe('loadModels', () => { + it('calls initLlama with backbone path then initVocoder', async () => { + const ctx = makeMockContext(); + mockInitLlama.mockResolvedValue(ctx); + + await ttsService.loadModels(); + + expect(mockInitLlama).toHaveBeenCalledWith( + expect.objectContaining({ model: ttsService.getBackbonePath() }), + ); + expect(ctx.initVocoder).toHaveBeenCalledWith( + expect.objectContaining({ path: ttsService.getVocoderPath() }), + ); + }); + + it('throws if isVocoderEnabled returns false', async () => { + const ctx = makeMockContext(false); + mockInitLlama.mockResolvedValue(ctx); + + await expect(ttsService.loadModels()).rejects.toThrow('Vocoder failed to initialize'); + }); + + it('is idempotent — does not double-init if already loaded', async () => { + const ctx = makeMockContext(); + mockInitLlama.mockResolvedValue(ctx); + + await ttsService.loadModels(); + await ttsService.loadModels(); + + 
expect(mockInitLlama).toHaveBeenCalledTimes(1); + }); + }); + + describe('unloadModels', () => { + it('calls releaseVocoder and release', async () => { + const ctx = makeMockContext(); + mockInitLlama.mockResolvedValue(ctx); + await ttsService.loadModels(); + + await ttsService.unloadModels(); + + expect(ctx.releaseVocoder).toHaveBeenCalled(); + expect(ctx.release).toHaveBeenCalled(); + expect(ttsService.isLoaded()).toBe(false); + }); + }); + + // ─── Generation ────────────────────────────────────────────────────────── + + describe('generate', () => { + it('calls completion pipeline in correct order and returns GeneratedAudio', async () => { + const ctx = makeMockContext(); + mockInitLlama.mockResolvedValue(ctx); + await ttsService.loadModels(); + + const audio = await ttsService.generate('hello world'); + + expect(ctx.getFormattedAudioCompletion).toHaveBeenCalled(); + expect(ctx.getAudioCompletionGuideTokens).toHaveBeenCalledWith('hello world'); + expect(ctx.completion).toHaveBeenCalled(); + expect(ctx.decodeAudioTokens).toHaveBeenCalled(); + + expect(audio.samples).toBeInstanceOf(Float32Array); + expect(audio.waveformData).toHaveLength(200); + expect(audio.durationSeconds).toBeGreaterThan(0); + expect(audio.sampleRate).toBe(TTS_BACKBONE_MODEL.sampleRate); + }); + + it('throws if models not loaded', async () => { + await expect(ttsService.generate('test')).rejects.toThrow('TTS models not loaded'); + }); + }); + + describe('saveToFile', () => { + it('writes base64-encoded PCM to correct path', async () => { + mockRNFS.exists.mockResolvedValue(false); + mockRNFS.mkdir.mockResolvedValueOnce(undefined); + mockRNFS.writeFile.mockResolvedValueOnce(undefined); + + const audio = { + samples: new Float32Array([0.1, 0.2, 0.3]), + durationSeconds: 0.01, + sampleRate: 24000, + waveformData: new Array(200).fill(0.1), + }; + + const path = await ttsService.saveToFile(audio, 'conv1', 'msg1'); + + expect(path).toBe('/mock/docs/audio-cache/conv1/msg1.pcm'); + 
expect(mockRNFS.writeFile).toHaveBeenCalledWith( + '/mock/docs/audio-cache/conv1/msg1.pcm', + expect.any(String), + 'base64', + ); + }); + }); + + // ─── Stop ──────────────────────────────────────────────────────────────── + + describe('stop', () => { + it('sets isSpeakingFlag to false', () => { + (ttsService as any).isSpeakingFlag = true; + ttsService.stop(); + expect(ttsService.isSpeaking()).toBe(false); + }); + + it('calls stop on currentSource', () => { + const mockSource = { stop: jest.fn() }; + (ttsService as any).currentSource = mockSource; + ttsService.stop(); + expect(mockSource.stop).toHaveBeenCalled(); + }); + }); + + // ─── Cache ──────────────────────────────────────────────────────────────── + + describe('getAudioCacheSizeMB', () => { + it('returns 0 if cache directory does not exist', async () => { + mockRNFS.exists.mockResolvedValueOnce(false); + const size = await ttsService.getAudioCacheSizeMB(); + expect(size).toBe(0); + }); + + it('returns size in MB by summing individual file sizes', async () => { + mockRNFS.exists.mockResolvedValueOnce(true); + // readDir(cacheRoot) → one conversation directory + (mockRNFS as any).readDir + .mockResolvedValueOnce([{ isDirectory: () => true, path: '/mock/docs/audio-cache/conv1' }]) + // readDir(conv1) → two .pcm files, each 2.5 MB + .mockResolvedValueOnce([ + { isDirectory: () => false, size: 2.5 * 1024 * 1024 }, + { isDirectory: () => false, size: 2.5 * 1024 * 1024 }, + ]); + const size = await ttsService.getAudioCacheSizeMB(); + expect(size).toBeCloseTo(5); + }); + }); + + describe('clearAudioCache', () => { + it('unlinks the cache root if it exists', async () => { + mockRNFS.exists.mockResolvedValueOnce(true); + mockRNFS.unlink.mockResolvedValueOnce(undefined); + await ttsService.clearAudioCache(); + expect(mockRNFS.unlink).toHaveBeenCalledWith('/mock/docs/audio-cache'); + }); + + it('does nothing if cache does not exist', async () => { + mockRNFS.exists.mockResolvedValueOnce(false); + await 
ttsService.clearAudioCache(); + expect(mockRNFS.unlink).not.toHaveBeenCalled(); + }); + }); +}); diff --git a/__tests__/unit/stores/ttsStore.test.ts b/__tests__/unit/stores/ttsStore.test.ts new file mode 100644 index 00000000..568fd9c0 --- /dev/null +++ b/__tests__/unit/stores/ttsStore.test.ts @@ -0,0 +1,276 @@ +/** + * TTS Store Unit Tests + * + * Tests for download state, model lifecycle, Chat Mode speak/stop, + * Audio Mode generateAndSave/playMessage, and settings persistence. + * Priority: P1 - Core TTS state management. + */ + +jest.mock('../../../src/services/ttsService', () => ({ + ttsService: { + isBackboneDownloaded: jest.fn(), + isVocoderDownloaded: jest.fn(), + downloadBackbone: jest.fn(), + downloadVocoder: jest.fn(), + deleteModels: jest.fn(), + loadModels: jest.fn(), + unloadModels: jest.fn(), + speak: jest.fn(), + stop: jest.fn(), + generateAndSave: jest.fn(), + playFromFile: jest.fn(), + getAudioCacheSizeMB: jest.fn(), + clearAudioCache: jest.fn(), + }, +})); + +jest.mock('../../../src/utils/logger', () => ({ + __esModule: true, + default: { log: jest.fn(), error: jest.fn(), warn: jest.fn() }, +})); + +import { useTTSStore } from '../../../src/stores/ttsStore'; +import { ttsService } from '../../../src/services/ttsService'; + +const mockTTSService = ttsService as jest.Mocked; +const getState = () => useTTSStore.getState(); + +const resetState = () => { + useTTSStore.setState({ + isBackboneDownloaded: false, + isVocoderDownloaded: false, + isDownloadingBackbone: false, + isDownloadingVocoder: false, + backboneDownloadProgress: 0, + vocoderDownloadProgress: 0, + isModelLoading: false, + isModelLoaded: false, + isSpeaking: false, + currentMessageId: null, + audioCacheSizeMB: 0, + settings: { + interfaceMode: 'chat', + enabled: true, + autoPlay: false, + speed: 1.0, + voiceId: '0', + kokoroVoiceId: 'af_heart', + }, + error: null, + }); +}; + +describe('ttsStore', () => { + beforeEach(() => { + resetState(); + jest.clearAllMocks(); + }); + + // ─── 
Download ───────────────────────────────────────────────────────────── + + describe('checkDownloadStatus', () => { + it('reflects backbone and vocoder download state', async () => { + mockTTSService.isBackboneDownloaded.mockResolvedValue(true); + mockTTSService.isVocoderDownloaded.mockResolvedValue(false); + + await getState().checkDownloadStatus(); + + expect(getState().isBackboneDownloaded).toBe(true); + expect(getState().isVocoderDownloaded).toBe(false); + }); + }); + + describe('downloadModels', () => { + it('sets progress states and marks both downloaded on success', async () => { + mockTTSService.downloadBackbone.mockImplementation(async (onProgress) => { + onProgress?.(0.5); + onProgress?.(1.0); + return '/path/backbone'; + }); + mockTTSService.downloadVocoder.mockImplementation(async (onProgress) => { + onProgress?.(1.0); + return '/path/vocoder'; + }); + + await getState().downloadModels(); + + const state = getState(); + expect(state.isBackboneDownloaded).toBe(true); + expect(state.isVocoderDownloaded).toBe(true); + expect(state.isDownloadingBackbone).toBe(false); + expect(state.isDownloadingVocoder).toBe(false); + expect(state.error).toBeNull(); + }); + + it('sets error and resets downloading flags on failure', async () => { + mockTTSService.downloadBackbone.mockRejectedValue(new Error('network error')); + + await getState().downloadModels(); + + const state = getState(); + expect(state.error).toBe('network error'); + expect(state.isDownloadingBackbone).toBe(false); + expect(state.isDownloadingVocoder).toBe(false); + }); + }); + + // ─── Model lifecycle ───────────────────────────────────────────────────── + + describe('loadModels', () => { + it('sets isModelLoaded on success', async () => { + mockTTSService.loadModels.mockResolvedValue(undefined); + await getState().loadModels(); + expect(getState().isModelLoaded).toBe(true); + expect(getState().isModelLoading).toBe(false); + }); + + it('sets error on failure', async () => { + 
mockTTSService.loadModels.mockRejectedValue(new Error('OOM')); + await getState().loadModels(); + expect(getState().error).toBe('OOM'); + expect(getState().isModelLoaded).toBe(false); + }); + + it('is a no-op if already loaded', async () => { + useTTSStore.setState({ isModelLoaded: true }); + await getState().loadModels(); + expect(mockTTSService.loadModels).not.toHaveBeenCalled(); + }); + }); + + // ─── Chat Mode ──────────────────────────────────────────────────────────── + + describe('speak', () => { + beforeEach(() => { + useTTSStore.setState({ isModelLoaded: true }); + }); + + it('sets isSpeaking true then false after completion', async () => { + mockTTSService.speak.mockResolvedValue(undefined); + mockTTSService.stop.mockReturnValue(undefined); + + const speaking: boolean[] = []; + const unsubscribe = useTTSStore.subscribe((s) => speaking.push(s.isSpeaking)); + + await getState().speak('hello', 'msg1'); + + unsubscribe(); + expect(speaking).toContain(true); + expect(getState().isSpeaking).toBe(false); + }); + + it('stops speaking the same message when called again', async () => { + useTTSStore.setState({ isSpeaking: true, currentMessageId: 'msg1' }); + mockTTSService.stop.mockReturnValue(undefined); + + await getState().speak('hello', 'msg1'); + + expect(mockTTSService.stop).toHaveBeenCalled(); + expect(mockTTSService.speak).not.toHaveBeenCalled(); + }); + + it('does nothing if TTS disabled', async () => { + useTTSStore.setState({ settings: { ...getState().settings, enabled: false } }); + await getState().speak('hello', 'msg1'); + expect(mockTTSService.speak).not.toHaveBeenCalled(); + }); + + it('does nothing if model not loaded', async () => { + useTTSStore.setState({ isModelLoaded: false }); + await getState().speak('hello', 'msg1'); + expect(mockTTSService.speak).not.toHaveBeenCalled(); + }); + }); + + // ─── Audio Mode ─────────────────────────────────────────────────────────── + + describe('generateAndSave', () => { + it('returns path, waveformData, 
durationSeconds and refreshes cache', async () => { + const mockAudio = { + samples: new Float32Array(100), + durationSeconds: 2.5, + sampleRate: 24000, + waveformData: new Array(200).fill(0.1), + }; + mockTTSService.generateAndSave.mockResolvedValue({ + path: '/cache/conv1/msg1.pcm', + audio: mockAudio, + }); + mockTTSService.getAudioCacheSizeMB.mockResolvedValue(3.2); + + const result = await getState().generateAndSave('hello', 'conv1', 'msg1'); + + expect(result.path).toBe('/cache/conv1/msg1.pcm'); + expect(result.waveformData).toHaveLength(200); + expect(result.durationSeconds).toBe(2.5); + expect(getState().audioCacheSizeMB).toBeCloseTo(3.2); + }); + }); + + describe('playMessage', () => { + it('sets isSpeaking true during playback then false after', async () => { + mockTTSService.stop.mockReturnValue(undefined); + mockTTSService.playFromFile.mockResolvedValue(undefined); + + const speaking: boolean[] = []; + const unsubscribe = useTTSStore.subscribe((s) => speaking.push(s.isSpeaking)); + + await getState().playMessage('msg1', '/cache/conv1/msg1.pcm'); + + unsubscribe(); + expect(speaking).toContain(true); + expect(getState().isSpeaking).toBe(false); + }); + + it('stops if same message is already playing', async () => { + useTTSStore.setState({ isSpeaking: true, currentMessageId: 'msg1' }); + mockTTSService.stop.mockReturnValue(undefined); + + await getState().playMessage('msg1', '/cache/conv1/msg1.pcm'); + + expect(mockTTSService.stop).toHaveBeenCalled(); + expect(mockTTSService.playFromFile).not.toHaveBeenCalled(); + }); + }); + + // ─── Settings ───────────────────────────────────────────────────────────── + + describe('updateSettings', () => { + it('merges partial settings correctly', () => { + getState().updateSettings({ speed: 1.5, autoPlay: true }); + const { settings } = getState(); + expect(settings.speed).toBe(1.5); + expect(settings.autoPlay).toBe(true); + // Other fields untouched + expect(settings.enabled).toBe(true); + 
expect(settings.voiceId).toBe('0'); + }); + + it('can switch interfaceMode', () => { + getState().updateSettings({ interfaceMode: 'audio' }); + expect(getState().settings.interfaceMode).toBe('audio'); + }); + }); + + describe('clearError', () => { + it('clears the error field', () => { + useTTSStore.setState({ error: 'something went wrong' }); + getState().clearError(); + expect(getState().error).toBeNull(); + }); + }); + + // ─── Cache ──────────────────────────────────────────────────────────────── + + describe('clearAudioCache', () => { + it('calls ttsService.clearAudioCache and resets size', async () => { + useTTSStore.setState({ audioCacheSizeMB: 10 }); + mockTTSService.clearAudioCache.mockResolvedValue(undefined); + + await getState().clearAudioCache(); + + expect(mockTTSService.clearAudioCache).toHaveBeenCalled(); + expect(getState().audioCacheSizeMB).toBe(0); + }); + }); +}); diff --git a/__tests__/unit/utils/messageContent.test.ts b/__tests__/unit/utils/messageContent.test.ts index b35b0181..5f79afef 100644 --- a/__tests__/unit/utils/messageContent.test.ts +++ b/__tests__/unit/utils/messageContent.test.ts @@ -118,8 +118,8 @@ describe('stripControlTokens', () => { expect(stripControlTokens('<|im_start|>assistant\n<|im_end|>')).toBe(''); }); - it('preserves whitespace in content', () => { - expect(stripControlTokens(' Hello World ')).toBe(' Hello World '); + it('trims leading/trailing whitespace in content', () => { + expect(stripControlTokens(' Hello World ')).toBe('Hello World'); }); it('preserves HTML-like tags that are not control tokens', () => { diff --git a/android/build.gradle b/android/build.gradle index dad99b02..984e5bed 100644 --- a/android/build.gradle +++ b/android/build.gradle @@ -19,3 +19,4 @@ buildscript { } apply plugin: "com.facebook.react.rootproject" + diff --git a/docs/PERSONAS_IMPLEMENTATION_PLAN.md b/docs/PERSONAS_IMPLEMENTATION_PLAN.md index 93ccdd5d..dd1225fc 100644 --- a/docs/PERSONAS_IMPLEMENTATION_PLAN.md +++ 
b/docs/PERSONAS_IMPLEMENTATION_PLAN.md @@ -31,7 +31,8 @@ export type Capability = | 'voice' // STT + TTS | 'vision' // image understanding | 'image-gen' // image generation - | 'rag'; // knowledge base search + | 'rag' // knowledge base search (user-uploaded documents) + | 'memory-rag'; // cross-conversation RAG — past messages indexed and retrieved export type SkillTriggerEvent = | 'message_received' // new message in connected app @@ -109,8 +110,9 @@ export interface Persona { capabilities: Capability[]; // What this persona knows - knowledgeBaseIds: string[]; // attached RAG knowledge bases (use projectId as KB id) - memoryFacts: PersonaMemoryFact[]; // persistent learned facts + knowledgeBaseIds: string[]; // attached RAG knowledge bases (user-uploaded documents) + conversationMemoryEnabled: boolean; // true = all past conversations for this persona are embedded + searchable + memoryFacts: PersonaMemoryFact[]; // persistent learned facts (LLM-extracted, concise) // What this persona does automatically skills: Skill[]; @@ -227,8 +229,9 @@ export const DEFAULT_PERSONAS: Omit[] = [ systemPrompt: 'You are Jarvis, a capable and concise personal assistant. You help with anything — questions, tasks, planning, thinking. You are direct, warm, and never verbose unless asked.', icon: 'cpu', accentColor: '#6366F1', - capabilities: ['text', 'voice', 'vision'], + capabilities: ['text', 'voice', 'vision', 'memory-rag'], knowledgeBaseIds: [], + conversationMemoryEnabled: true, // Jarvis indexes all past conversations — gives it cross-chat intelligence memoryFacts: [], skills: [], integrationIds: [], @@ -418,6 +421,113 @@ export function buildMemoryContext(facts: PersonaMemoryFact[]): string { } ``` +### conversationRagService.ts (new — cross-conversation memory) + +This is what makes Jarvis actually intelligent across sessions. 
Rather than relying only on extracted `memoryFacts` (brief summaries) or the current context window, Jarvis embeds every conversation message into a per-persona vector store. When a new message arrives, relevant past exchanges are retrieved and injected as context — so Jarvis remembers "we discussed your onboarding last Tuesday" without you having to repeat it. + +**How it's different from document KB:** + +| | Document KB (`knowledgeBaseIds`) | Conversation RAG (`conversationMemoryEnabled`) | +|---|---|---| +| Source | User-uploaded PDFs, notes | Past conversation messages | +| Indexed when | User uploads a file | After each assistant response | +| Retrieved by | User explicitly asking about docs | Automatically on every message | +| Scoped to | Attached knowledge bases | All conversations for this persona | + +```typescript +// src/services/conversationRagService.ts + +/** + * Indexes completed conversation messages into the persona's vector store. + * Called after each assistant turn completes (streaming done). + * + * Each chunk stored = ~4–6 messages grouped by semantic coherence, not + * arbitrary token windows. This preserves conversational context. + */ +export async function indexConversationTurn( + personaId: string, + conversationId: string, + messages: Message[], // recent messages to embed (typically last 4–6) +): Promise<void> { + const chunks = chunkMessagesForEmbedding(messages); + for (const chunk of chunks) { + const embedding = await embeddingService.embed(chunk.text); + await vectorStore.upsert({ + id: `${conversationId}:${chunk.startIndex}`, + embedding, + metadata: { + personaId, + conversationId, + timestamp: chunk.timestamp, + preview: chunk.text.slice(0, 120), + }, + }); + } +} + +/** + * Retrieves the most relevant past conversation context for the current message. + * Returns plain text ready to inject into the system prompt.
+ */ +export async function retrieveRelevantHistory( + personaId: string, + currentMessage: string, + topK = 3, +): Promise<string> { + const queryEmbedding = await embeddingService.embed(currentMessage); + const results = await vectorStore.search({ + embedding: queryEmbedding, + filter: { personaId }, + topK, + minScore: 0.72, // only inject if meaningfully relevant + }); + + if (results.length === 0) return ''; + + const snippets = results.map(r => + `[${formatRelativeDate(r.metadata.timestamp)}]\n${r.metadata.preview}` + ); + return `\n\nRelevant context from past conversations:\n${snippets.join('\n\n---\n\n')}`; +} + +/** + * Groups messages into semantically coherent chunks for embedding. + * Avoids splitting a user question from its assistant answer. + */ +function chunkMessagesForEmbedding(messages: Message[]): EmbeddingChunk[] { + // Pair each user message with its following assistant response + // Output: chunks of ~300–400 tokens each +} +``` + +**System prompt injection** (in `llm.ts` or wherever the prompt is assembled): + +```typescript +// When conversationMemoryEnabled is true for the active persona: +if (persona.conversationMemoryEnabled) { + const history = await conversationRagService.retrieveRelevantHistory( + persona.id, + latestUserMessage, + ); + systemPrompt += history; +} +``` + +**Indexing trigger** (after streaming completes, in chatStore or the streaming callback): + +```typescript +// After assistant response is done streaming: +if (persona.conversationMemoryEnabled) { + conversationRagService.indexConversationTurn( + persona.id, + conversationId, + recentMessages.slice(-6), + ).catch(() => {}); // fire-and-forget, non-blocking +} +``` + +**Storage:** Uses the existing `ragService` vector store, namespaced by `personaId`. No new storage layer needed — just a new indexing source. + --- ## Screens @@ -926,6 +1036,11 @@ export interface Message { 18. Memory injection into system prompt 19. `PersonaMemoryScreen` 20.
Memory bar in chat (new fact notification) +21. `conversationRagService.ts` — cross-conversation RAG for `memory-rag` capability + - Index each conversation turn after streaming completes (fire-and-forget) + - Retrieve relevant history and inject into system prompt before each LLM call + - Jarvis has `conversationMemoryEnabled: true` by default; other personas opt in via PersonaEditScreen + - Reuses existing `ragService` vector store, namespaced by `personaId` ### Phase 5 — Integrations in Chat (tool calls) 21. Wire integration tool registry entries diff --git a/docs/TTS_IMPLEMENTATION_PLAN.md b/docs/TTS_IMPLEMENTATION_PLAN.md index 19b6942c..41f548f4 100644 --- a/docs/TTS_IMPLEMENTATION_PLAN.md +++ b/docs/TTS_IMPLEMENTATION_PLAN.md @@ -2,1075 +2,275 @@ ## Product Vision -Two first-class interface modes, switchable from Settings: +Two first-class interface modes, switchable from Chat Settings or TTS Settings: | Mode | Primary output | TTS role | Text | |---|---|---|---| | **Chat Mode** | Text bubbles | Add-on — play button per message | Default visible | -| **Audio Mode** | Waveform bubbles | Core — auto-generated at completion | Hidden by default, expandable | +| **Audio Mode** | Waveform bubbles (both sides) | Core — auto-generated at completion | Hidden by default, expandable | -**Audio Mode is the target product experience.** Messages feel like voice note exchanges — not a chat app that also speaks. The user has full per-message audio controls: scrub to position, adjust playback speed, change voice/tone. Text is always available as a "Show transcript" expand. +**Audio Mode is the target product experience.** Both the user's voice recordings AND the AI's responses appear as waveform audio bubbles — a full voice-note conversation. No text is shown by default; transcript is always accessible via "Show transcript" expand. -Chat Mode is the fallback for devices that can't run TTS models, or users who prefer it. 
+- User voice recordings: right-aligned audio bubbles (recorded WAV, played back locally) +- AI responses: left-aligned audio bubbles (OuteTTS-generated, with 40-bar waveform visualization) + +Chat Mode is the fallback for devices that can't run TTS models, or users who prefer text. --- ## Decision Log -### Engine -**OuteTTS 0.3 (500M) + WavTokenizer** via `llama.rn`. +### Engine (updated) + +**Two-tier TTS architecture:** + +| Tier | Engine | Use case | Speed | Size | +|---|---|---|---|---| +| **Tier 1 — Speak (Chat Mode)** | Kokoro via `react-native-executorch` | On-demand speak button, long-press Speak action | ~1s (streaming) | ~100MB | +| **Tier 2 — Generate+Save (Audio Mode)** | OuteTTS 0.3 + WavTokenizer via `llama.rn` | Auto-generate waveform bubble after streaming | ~30–120s | ~527MB | + +**Why two tiers:** +- Kokoro via ExecuTorch is fast enough for interactive use (streaming starts < 1s) but outputs raw PCM chunks — no way to write to disk for waveform scrubbing without custom buffering +- OuteTTS via llama.rn generates the full audio up front, returns `Float32Array` + waveform data + duration in one call — ideal for the saved-file + waveform visualisation pattern Audio Mode requires +- OuteTTS is NOT suitable for the speak button (too slow, ~30–120s per sentence) +- Kokoro is NOT currently available as a GGUF via llama.cpp (feature request opened Jan 2025, closed stale Oct 2025, never merged) + +**Previous decision (superseded):** +OuteTTS only via llama.rn for both modes. Superseded because ~1 minute to speak a single sentence is not acceptable for interactive use. + +### Platform constraint -- OuteTTS 1.0 (Qwen3 0.6B) is blocked: the DAC vocoder has no GGUF, and llama.cpp PR#12794 is an open draft. The backbone exists on HuggingFace but the decoder is not implemented upstream. -- OuteTTS 0.3 with WavTokenizer is the **only fully working path** through llama.rn today (confirmed via TTSScreen.tsx in mybigday/llama.rn example app). 
-- Upgrade to OuteTTS 1.0 will be a model swap with no architecture change once PR#12794 and llama.rn PR#300 land. +`react-native-executorch` requires **Android 13 (API 33)** minimum and **iOS 17** minimum. + +Current app `minSdkVersion` is **24 (Android 7)**. + +**Resolution:** Kokoro speak is available only on Android 13+ / iOS 17+. On older devices, the speak button falls back to OuteTTS (slow but functional). This is detected at runtime — no code path is dead, just slower on older OS. + +`minSdkVersion` stays at 24. No breaking change for existing users. ### Playback -**react-native-audio-api** (Software Mansion). Implements the Web Audio API spec for React Native. `decodeAudioTokens()` returns `number[]` (Float32 PCM at 24kHz mono) which feeds directly into an `AudioBuffer`. +**react-native-audio-api** (Software Mansion, already installed). Implements the Web Audio API spec for React Native. Both Kokoro (streaming `Float32Array` chunks) and OuteTTS (full `Float32Array`) pipe through the same `AudioContext → AudioBufferSourceNode` path at 24kHz mono. ### Audio Persistence (Audio Mode only) -In Audio Mode, generated PCM is written to disk as a WAV file per message so scrubbing works without re-generating. Files live at: +In Audio Mode, generated PCM is written to disk as a raw PCM file per message so scrubbing works without re-generating. Files live at: ``` -${RNFS.DocumentDirectoryPath}/audio-cache/{conversationId}/{messageId}.wav +${RNFS.DocumentDirectoryPath}/audio-cache/{conversationId}/{messageId}.pcm ``` Cache eviction strategy: - Keep the last 50 messages worth of audio per conversation - User can wipe audio cache from Settings ("Clear audio cache — X MB") -- Estimated size: ~1–4 MB per message (24kHz mono, varies by length) +- Estimated size: ~1–4 MB per message (24kHz mono Float32, varies by length) -In Chat Mode, audio is generated on demand, played, then discarded (no disk write). 
+In Chat Mode, audio is generated (via Kokoro) on demand, played, then discarded (no disk write). ### Voice Selection -OuteTTS 0.3 supports multiple speaker profiles. Expose as a voice picker in TTSSettingsScreen. Store selected voice ID in `ttsStore` settings (persisted). Default: speaker 0 (natural female). +- **Kokoro voices (Chat Mode speak):** 8 built-in voices (US/GB English, male/female). Stored as `kokoroVoiceId` in `ttsStore` settings. Default: `af_heart`. +- **OuteTTS voices (Audio Mode waveform):** Single profile (`speaker 0`) — OuteTTS 0.3 multi-speaker not confirmed working via llama.rn. Will expand when OuteTTS 1.0 lands. ### Device Gate -Require **flagship tier (8GB+ RAM)**. The memory stack: -``` -LLM (3B Q4) ~2.0 GB -Whisper base ~150 MB -OuteTTS backbone ~454 MB -WavTokenizer ~ 73 MB -OS + app ~2.0 GB -───────────────────────── -Total: ~4.7 GB → fits 8GB devices, tight on 6GB -``` -Show a warning (not a hard block) for 6–8GB devices. Hard block below 6GB. If device is blocked, Audio Mode is unavailable — app defaults to Chat Mode and hides the Audio Mode option. - ---- - -## Model Files - -| Role | HuggingFace Repo | File | Size | -|---|---|---|---| -| TTS Backbone | `OuteAI/OuteTTS-0.3-500M-GGUF` | `OuteTTS-0.3-500M-Q4_K_M.gguf` | 454 MB | -| Vocoder | `ggml-org/WavTokenizer` | `WavTokenizer-Large-75-Q5_1.gguf` | 73 MB | +Show a warning (not a hard block) for 6–8GB devices. Hard block below 6GB for Audio Mode (OuteTTS only). Kokoro speak has no RAM gate. 
-Direct download URLs (HuggingFace resolve): +Memory stack (worst case — both models loaded simultaneously): ``` -https://huggingface.co/OuteAI/OuteTTS-0.3-500M-GGUF/resolve/main/OuteTTS-0.3-500M-Q4_K_M.gguf -https://huggingface.co/ggml-org/WavTokenizer/resolve/main/WavTokenizer-Large-75-Q5_1.gguf +LLM (3B Q4) ~2.0 GB +Whisper base ~150 MB +OuteTTS backbone ~454 MB +WavTokenizer ~ 73 MB +Kokoro (XNNPACK .pte) ~100 MB ← new +OS + app ~2.0 GB +────────────────────────────── +Total: ~4.8 GB → fits 8GB devices ``` -Storage directories: -``` -${RNFS.DocumentDirectoryPath}/tts-models/ ← model weights -${RNFS.DocumentDirectoryPath}/audio-cache/ ← per-message WAV files (Audio Mode only) -``` +In practice Kokoro and OuteTTS are never loaded simultaneously — Kokoro handles Chat Mode speak (OuteTTS not loaded), OuteTTS handles Audio Mode generation (Kokoro not involved) — so the worst-case total above is an upper bound, not the expected footprint. --- -## New Package - -```bash -npm install react-native-audio-api -``` - -iOS: run `pod install` after. -Android: auto-linked. - ---- - -## Interface Mode Setting - -### Where it lives -`ttsStore` settings object gains: +## Model Files -```typescript -export type InterfaceMode = 'chat' | 'audio'; - -export interface TTSSettings { - interfaceMode: InterfaceMode; // default: 'chat' until TTS models downloaded, then user can switch - enabled: boolean; - autoPlay: boolean; // Chat Mode only — auto-speak after completion - speed: number; // 0.5–2.0, default 1.0 - voiceId: string; // OuteTTS speaker profile, default '0' -} -``` +### Tier 1 — Kokoro (react-native-executorch) -### Mode switching rules -- If TTS models not downloaded → `interfaceMode` locked to `'chat'` -- If device RAM < 6GB → `interfaceMode` locked to `'chat'`, Audio Mode option hidden -- Switching mode takes effect immediately for new messages; existing messages render in whatever mode they were generated in (Chat Mode messages have no audio file, Audio Mode messages have one) -- A banner appears at the top of the chat on first switch: "Audio mode on — 
responses will play as voice notes." +Downloaded automatically by `react-native-executorch` to its internal cache (`react-native-executorch/` in document directory). No manual download management needed. ---- +| File | Source | Size (approx) | +|---|---|---| +| `duration_predictor.pte` | HuggingFace: `software-mansion/react-native-executorch-kokoro` | ~10 MB | +| `synthesizer.pte` | same | ~80 MB | +| Voice `.bin` files (per voice) | same repo | ~3–5 MB each | +| Phonemizer data (tagger + lexicon) | same repo | ~5 MB | -## Audio Mode: Message Bubble +Total cold download: ~100–120 MB. Subsequent launches use cached files. -### Layout (replaces text bubble for assistant messages) +### Tier 2 — OuteTTS (llama.rn, audio mode only) -``` -┌─────────────────────────────────────────────┐ -│ [avatar] ●━━━━━━━━━━━━━━━━━━━ 0:42 1x │ -│ [waveform visualization] │ -│ [Show transcript ▾] │ -└─────────────────────────────────────────────┘ -``` - -- **Waveform bar** — static amplitude visualization drawn from PCM data at generation time (no real-time animation needed, just a static shape like WhatsApp) -- **Scrubber** — draggable progress indicator -- **Timestamp** — elapsed / total duration -- **Speed chip** — tappable, cycles 0.5x → 1x → 1.5x → 2x -- **Show transcript** — expands inline to full text, collapses again - -User messages (voice input via Whisper) show the same bubble layout but with the transcript as primary since we have no TTS for user messages. +| Role | HuggingFace Repo | File | Size | +|---|---|---|---| +| TTS Backbone | `OuteAI/OuteTTS-0.3-500M-GGUF` | `OuteTTS-0.3-500M-Q4_K_M.gguf` | 454 MB | +| Vocoder | `ggml-org/WavTokenizer` | `WavTokenizer-Large-75-Q5_1.gguf` | 73 MB | -### Per-message controls (long press → action sheet) -- Change voice (re-generates audio with new speaker profile, overwrites cached file) -- Regenerate audio -- Copy text -- Delete message +Stored at: `${RNFS.DocumentDirectoryPath}/tts-models/` --- -## Files to Create - -### 1. 
`src/constants/ttsModels.ts` +## New Packages -```typescript -export const TTS_BACKBONE_MODEL = { - id: 'outetts-0.3-500m-q4', - name: 'OuteTTS 0.3', - backboneFile: 'OuteTTS-0.3-500M-Q4_K_M.gguf', - backboneUrl: 'https://huggingface.co/OuteAI/OuteTTS-0.3-500M-GGUF/resolve/main/OuteTTS-0.3-500M-Q4_K_M.gguf', - backboneSizeMB: 454, - vocoderFile: 'WavTokenizer-Large-75-Q5_1.gguf', - vocoderUrl: 'https://huggingface.co/ggml-org/WavTokenizer/resolve/main/WavTokenizer-Large-75-Q5_1.gguf', - vocoderSizeMB: 73, - sampleRate: 24000, - description: 'Natural-sounding on-device speech. Requires ~530 MB storage.', -}; - -export const TTS_SPEAKER_PROFILES = [ - { id: '0', label: 'Default' }, - // Add more as OuteTTS 0.3 speaker profiles are confirmed -]; - -export const TTS_MIN_RAM_GB = 6; // warn below 8, hard block below 6 -export const TTS_BLOCK_RAM_GB = 6; // hard block -export const TTS_WARN_RAM_GB = 8; // show warning card -export const AUDIO_CACHE_MAX_MESSAGES = 50; // per conversation +```bash +npm install react-native-executorch +npm install react-native-executorch-bare-resource-fetcher +npm install @dr.pogodin/react-native-fs @kesha-antonov/react-native-background-downloader ``` ---- - -### 2. `src/services/ttsService.ts` - -Mirror `whisperService.ts` pattern exactly. 
- -```typescript -import { initLlama, LlamaContext } from 'llama.rn'; -import RNFS from 'react-native-fs'; -import { AudioContext } from 'react-native-audio-api'; -import logger from '../utils/logger'; -import { TTS_BACKBONE_MODEL } from '../constants/ttsModels'; - -export interface TTSOptions { - speed?: number; // 0.5–2.0, default 1.0 - voiceId?: string; // speaker profile id, default '0' -} - -export interface GeneratedAudio { - samples: Float32Array; - durationSeconds: number; - sampleRate: number; - /** Amplitude envelope (downsampled to ~200 points) for waveform visualization */ - waveformData: number[]; -} - -class TTSService { - private context: LlamaContext | null = null; - private isVocoderReady: boolean = false; - private isSpeakingFlag: boolean = false; - private audioCtx: AudioContext | null = null; - private currentSource: AudioBufferSourceNode | null = null; - private contextLoadPromise: Promise = Promise.resolve(); - - // ─── Directories & Paths ──────────────────────────────────────────────── - - getModelsDir(): string { - return `${RNFS.DocumentDirectoryPath}/tts-models`; - } - - getAudioCacheDir(conversationId: string): string { - return `${RNFS.DocumentDirectoryPath}/audio-cache/${conversationId}`; - } - - getAudioFilePath(conversationId: string, messageId: string): string { - return `${this.getAudioCacheDir(conversationId)}/${messageId}.wav`; - } - - async ensureModelsDirExists(): Promise { - const dir = this.getModelsDir(); - if (!await RNFS.exists(dir)) await RNFS.mkdir(dir); - } - - async ensureAudioCacheDirExists(conversationId: string): Promise { - const dir = this.getAudioCacheDir(conversationId); - if (!await RNFS.exists(dir)) await RNFS.mkdir(dir); - } - - getBackbonePath(): string { - return `${this.getModelsDir()}/${TTS_BACKBONE_MODEL.backboneFile}`; - } - - getVocoderPath(): string { - return `${this.getModelsDir()}/${TTS_BACKBONE_MODEL.vocoderFile}`; - } - - async isBackboneDownloaded(): Promise { - return 
RNFS.exists(this.getBackbonePath()); - } - - async isVocoderDownloaded(): Promise { - return RNFS.exists(this.getVocoderPath()); - } - - async areBothModelsDownloaded(): Promise { - return (await this.isBackboneDownloaded()) && (await this.isVocoderDownloaded()); - } - - async isAudioCached(conversationId: string, messageId: string): Promise { - return RNFS.exists(this.getAudioFilePath(conversationId, messageId)); - } - - async getAudioCacheSizeMB(): Promise { - const cacheRoot = `${RNFS.DocumentDirectoryPath}/audio-cache`; - if (!await RNFS.exists(cacheRoot)) return 0; - const stat = await RNFS.stat(cacheRoot); - return stat.size / (1024 * 1024); - } - - async clearAudioCache(): Promise { - const cacheRoot = `${RNFS.DocumentDirectoryPath}/audio-cache`; - if (await RNFS.exists(cacheRoot)) await RNFS.unlink(cacheRoot); - } - - // ─── Download ──────────────────────────────────────────────────────────── - - async downloadBackbone(onProgress?: (p: number) => void): Promise { - await this.ensureModelsDirExists(); - const dest = this.getBackbonePath(); - if (await RNFS.exists(dest)) return dest; - const dl = RNFS.downloadFile({ - fromUrl: TTS_BACKBONE_MODEL.backboneUrl, - toFile: dest, - progressDivider: 1, - progress: (res) => onProgress?.(res.bytesWritten / res.contentLength), - }); - const result = await dl.promise; - if (result.statusCode !== 200) { - await RNFS.unlink(dest).catch(() => {}); - throw new Error(`Backbone download failed: HTTP ${result.statusCode}`); - } - return dest; - } - - async downloadVocoder(onProgress?: (p: number) => void): Promise { - await this.ensureModelsDirExists(); - const dest = this.getVocoderPath(); - if (await RNFS.exists(dest)) return dest; - const dl = RNFS.downloadFile({ - fromUrl: TTS_BACKBONE_MODEL.vocoderUrl, - toFile: dest, - progressDivider: 1, - progress: (res) => onProgress?.(res.bytesWritten / res.contentLength), - }); - const result = await dl.promise; - if (result.statusCode !== 200) { - await RNFS.unlink(dest).catch(() 
=> {}); - throw new Error(`Vocoder download failed: HTTP ${result.statusCode}`); - } - return dest; - } - - async deleteModels(): Promise { - await this.unloadModels(); - const bp = this.getBackbonePath(); - const vp = this.getVocoderPath(); - if (await RNFS.exists(bp)) await RNFS.unlink(bp); - if (await RNFS.exists(vp)) await RNFS.unlink(vp); - } - - // ─── Model Lifecycle ───────────────────────────────────────────────────── - - async loadModels(): Promise { - if (this.context && this.isVocoderReady) return; - - this.contextLoadPromise = this.contextLoadPromise.then(async () => { - if (this.context && this.isVocoderReady) return; - - logger.log('[TTS] Loading backbone...'); - this.context = await initLlama({ - model: this.getBackbonePath(), - n_ctx: 8192, - n_threads: 4, - }); - - logger.log('[TTS] Loading vocoder...'); - await this.context.initVocoder({ - path: this.getVocoderPath(), - n_batch: 4096, - }); +iOS: `pod install` after. - this.isVocoderReady = await this.context.isVocoderEnabled(); - if (!this.isVocoderReady) { - throw new Error('Vocoder failed to initialize — check model files.'); - } - - logger.log('[TTS] Ready.'); - }); - - return this.contextLoadPromise; - } - - async unloadModels(): Promise { - this.stop(); - if (this.context) { - await this.context.releaseVocoder().catch(() => {}); - await this.context.release().catch(() => {}); - this.context = null; - } - this.isVocoderReady = false; - this.audioCtx?.close().catch(() => {}); - this.audioCtx = null; - } - - isLoaded(): boolean { - return this.context !== null && this.isVocoderReady; - } - - // ─── Audio Generation ──────────────────────────────────────────────────── - - /** - * Generate PCM audio for `text`. Does NOT play it. - * Returns samples + metadata needed for waveform rendering and playback. 
- */ - async generate(text: string, options: TTSOptions = {}): Promise { - if (!this.context || !this.isVocoderReady) { - throw new Error('TTS models not loaded.'); - } - - const speakerId = options.voiceId ?? '0'; - const { prompt, grammar } = await this.context.getFormattedAudioCompletion( - speakerId === '0' ? null : speakerId, - text, - ); - const guideTokens = await this.context.getAudioCompletionGuideTokens(text); - - const result = await this.context.completion({ - prompt, - grammar, - guide_tokens: guideTokens, - n_predict: 4096, - temperature: 0.7, - top_p: 0.9, - stop: ['<|im_end|>'], - }); - - const pcmArray = await this.context.decodeAudioTokens(result.audio_tokens); - const samples = new Float32Array(pcmArray); - const sampleRate = TTS_BACKBONE_MODEL.sampleRate; - const durationSeconds = samples.length / sampleRate; - const waveformData = this.downsampleForWaveform(samples, 200); - - return { samples, durationSeconds, sampleRate, waveformData }; - } - - /** - * Write PCM samples to a WAV file on disk. - * Used in Audio Mode to persist audio per message. - */ - async saveToFile(audio: GeneratedAudio, conversationId: string, messageId: string): Promise { - await this.ensureAudioCacheDirExists(conversationId); - const path = this.getAudioFilePath(conversationId, messageId); - const wavBuffer = this.encodeWAV(audio.samples, audio.sampleRate); - await RNFS.writeFile(path, wavBuffer, 'base64'); - return path; - } - - /** - * Generate + save in one step (Audio Mode convenience). 
- */ - async generateAndSave( - text: string, - conversationId: string, - messageId: string, - options: TTSOptions = {}, - ): Promise<{ path: string; audio: GeneratedAudio }> { - const audio = await this.generate(text, options); - const path = await this.saveToFile(audio, conversationId, messageId); - return { path, audio }; - } - - // ─── Playback ──────────────────────────────────────────────────────────── - - async playFromSamples(samples: Float32Array, speed: number = 1.0, startOffset: number = 0): Promise { - const sampleRate = TTS_BACKBONE_MODEL.sampleRate; - - this.audioCtx?.close().catch(() => {}); - this.audioCtx = new AudioContext({ sampleRate }); - - const buffer = this.audioCtx.createBuffer(1, samples.length, sampleRate); - buffer.copyToChannel(samples, 0); - - const source = this.audioCtx.createBufferSource(); - source.buffer = buffer; - source.playbackRate.value = speed; - source.connect(this.audioCtx.destination); - - this.currentSource = source; - this.isSpeakingFlag = true; - - return new Promise((resolve) => { - source.onended = () => { - this.currentSource = null; - this.isSpeakingFlag = false; - resolve(); - }; - source.start(0, startOffset); - }); - } - - async playFromFile(filePath: string, speed: number = 1.0, startOffset: number = 0): Promise { - const base64 = await RNFS.readFile(filePath, 'base64'); - const samples = this.decodeWAV(base64); - return this.playFromSamples(samples, speed, startOffset); - } - - /** - * Chat Mode convenience: generate + play + discard (no disk write). - */ - async speak(text: string, options: TTSOptions = {}): Promise { - if (this.isSpeakingFlag) this.stop(); - const audio = await this.generate(text, options); - if (!this.isSpeakingFlag) { // may have been stopped during generation - await this.playFromSamples(audio.samples, options.speed ?? 
1.0); - } - } - - stop(): void { - this.isSpeakingFlag = false; - try { - this.currentSource?.stop(); - } catch { - // already stopped - } - this.currentSource = null; - } - - isSpeaking(): boolean { - return this.isSpeakingFlag; - } - - // ─── Utilities ─────────────────────────────────────────────────────────── - - private downsampleForWaveform(samples: Float32Array, points: number): number[] { - const blockSize = Math.floor(samples.length / points); - const result: number[] = []; - for (let i = 0; i < points; i++) { - let sum = 0; - for (let j = 0; j < blockSize; j++) { - sum += Math.abs(samples[i * blockSize + j]); - } - result.push(sum / blockSize); - } - return result; - } - - private encodeWAV(samples: Float32Array, sampleRate: number): string { - // Standard 16-bit PCM WAV encoding → base64 - // Implementation: write RIFF header + PCM data - const buffer = new ArrayBuffer(44 + samples.length * 2); - const view = new DataView(buffer); - const writeString = (offset: number, s: string) => { - for (let i = 0; i < s.length; i++) view.setUint8(offset + i, s.charCodeAt(i)); - }; - writeString(0, 'RIFF'); - view.setUint32(4, 36 + samples.length * 2, true); - writeString(8, 'WAVE'); - writeString(12, 'fmt '); - view.setUint32(16, 16, true); - view.setUint16(20, 1, true); - view.setUint16(22, 1, true); - view.setUint32(24, sampleRate, true); - view.setUint32(28, sampleRate * 2, true); - view.setUint16(32, 2, true); - view.setUint16(34, 16, true); - writeString(36, 'data'); - view.setUint32(40, samples.length * 2, true); - for (let i = 0; i < samples.length; i++) { - view.setInt16(44 + i * 2, Math.max(-32768, Math.min(32767, samples[i] * 32768)), true); - } - return Buffer.from(buffer).toString('base64'); - } - - private decodeWAV(base64: string): Float32Array { - const buffer = Buffer.from(base64, 'base64'); - const view = new DataView(buffer.buffer); - const sampleCount = (buffer.length - 44) / 2; - const samples = new Float32Array(sampleCount); - for (let i = 0; i 
< sampleCount; i++) { - samples[i] = view.getInt16(44 + i * 2, true) / 32768; - } - return samples; - } -} - -export const ttsService = new TTSService(); -``` +**Note:** `react-native-executorch-bare-resource-fetcher` requires its own RNFS fork (`@dr.pogodin/react-native-fs`) alongside the existing `react-native-fs`. Both can coexist. --- -### 3. `src/stores/ttsStore.ts` +## Architecture -Mirror `whisperStore.ts` pattern, using Zustand with `persist`. +### Initialization (`App.tsx`) ```typescript -import { create } from 'zustand'; -import { persist, createJSONStorage } from 'zustand/middleware'; -import AsyncStorage from '@react-native-async-storage/async-storage'; -import { ttsService } from '../services/ttsService'; -import logger from '../utils/logger'; - -export type InterfaceMode = 'chat' | 'audio'; - -export interface TTSSettings { - interfaceMode: InterfaceMode; - enabled: boolean; - autoPlay: boolean; // Chat Mode only - speed: number; // 0.5–2.0 - voiceId: string; // OuteTTS speaker profile -} +import { initExecutorch } from 'react-native-executorch'; +import { BareResourceFetcher } from 'react-native-executorch-bare-resource-fetcher'; -export interface TTSState { - // Download state - isBackboneDownloaded: boolean; - isVocoderDownloaded: boolean; - isDownloadingBackbone: boolean; - isDownloadingVocoder: boolean; - backboneDownloadProgress: number; - vocoderDownloadProgress: number; - - // Model lifecycle - isModelLoading: boolean; - isModelLoaded: boolean; - - // Playback - isSpeaking: boolean; - currentMessageId: string | null; - playbackPosition: number; // seconds, for scrubber - - // Cache - audioCacheSizeMB: number; - - // Settings (persisted) - settings: TTSSettings; - - error: string | null; - - // Actions - checkDownloadStatus: () => Promise; - downloadModels: () => Promise; - deleteModels: () => Promise; - loadModels: () => Promise; - unloadModels: () => Promise; - - // Chat Mode - speak: (text: string, messageId: string) => Promise; - stop: () 
=> void; - - // Audio Mode - generateAndSave: (text: string, conversationId: string, messageId: string) => Promise<{ path: string; waveformData: number[]; durationSeconds: number }>; - playMessage: (messageId: string, filePath: string, startOffset?: number) => Promise; - stopPlayback: () => void; - - // Cache management - refreshCacheSize: () => Promise; - clearAudioCache: () => Promise; - - updateSettings: (patch: Partial) => void; - clearError: () => void; -} - -export const useTTSStore = create()( - persist( - (set, get) => ({ - isBackboneDownloaded: false, - isVocoderDownloaded: false, - isDownloadingBackbone: false, - isDownloadingVocoder: false, - backboneDownloadProgress: 0, - vocoderDownloadProgress: 0, - isModelLoading: false, - isModelLoaded: false, - isSpeaking: false, - currentMessageId: null, - playbackPosition: 0, - audioCacheSizeMB: 0, - settings: { - interfaceMode: 'chat', - enabled: true, - autoPlay: false, - speed: 1.0, - voiceId: '0', - }, - error: null, - - checkDownloadStatus: async () => { - const [backbone, vocoder] = await Promise.all([ - ttsService.isBackboneDownloaded(), - ttsService.isVocoderDownloaded(), - ]); - set({ isBackboneDownloaded: backbone, isVocoderDownloaded: vocoder }); - }, - - downloadModels: async () => { - set({ error: null }); - try { - set({ isDownloadingBackbone: true, backboneDownloadProgress: 0 }); - await ttsService.downloadBackbone((p) => set({ backboneDownloadProgress: p })); - set({ isDownloadingBackbone: false, isBackboneDownloaded: true }); - - set({ isDownloadingVocoder: true, vocoderDownloadProgress: 0 }); - await ttsService.downloadVocoder((p) => set({ vocoderDownloadProgress: p })); - set({ isDownloadingVocoder: false, isVocoderDownloaded: true }); - } catch (err) { - const msg = err instanceof Error ? 
err.message : 'Download failed'; - logger.error('[TTS Store] Download error:', msg); - set({ isDownloadingBackbone: false, isDownloadingVocoder: false, error: msg }); - } - }, - - deleteModels: async () => { - await ttsService.deleteModels(); - set({ isBackboneDownloaded: false, isVocoderDownloaded: false, isModelLoaded: false }); - }, - - loadModels: async () => { - if (get().isModelLoaded || get().isModelLoading) return; - set({ isModelLoading: true, error: null }); - try { - await ttsService.loadModels(); - set({ isModelLoaded: true }); - } catch (err) { - const msg = err instanceof Error ? err.message : 'Failed to load TTS models'; - logger.error('[TTS Store] Load error:', msg); - set({ error: msg }); - } finally { - set({ isModelLoading: false }); - } - }, - - unloadModels: async () => { - await ttsService.unloadModels(); - set({ isModelLoaded: false, isSpeaking: false, currentMessageId: null }); - }, - - // ── Chat Mode ────────────────────────────────────────────────────────── - - speak: async (text: string, messageId: string) => { - const { isModelLoaded, settings } = get(); - if (!settings.enabled) return; - if (!isModelLoaded) return; - - if (get().currentMessageId === messageId && get().isSpeaking) { - get().stop(); - return; - } - - ttsService.stop(); - set({ isSpeaking: true, currentMessageId: messageId, error: null }); - - try { - await ttsService.speak(text, { speed: settings.speed, voiceId: settings.voiceId }); - } catch (err) { - const msg = err instanceof Error ? 
err.message : 'Speech failed'; - logger.error('[TTS Store] Speak error:', msg); - set({ error: msg }); - } finally { - set({ isSpeaking: false, currentMessageId: null }); - } - }, - - stop: () => { - ttsService.stop(); - set({ isSpeaking: false, currentMessageId: null }); - }, - - // ── Audio Mode ───────────────────────────────────────────────────────── - - generateAndSave: async (text: string, conversationId: string, messageId: string) => { - const { settings } = get(); - const { path, audio } = await ttsService.generateAndSave( - text, - conversationId, - messageId, - { voiceId: settings.voiceId }, - ); - await get().refreshCacheSize(); - return { path, waveformData: audio.waveformData, durationSeconds: audio.durationSeconds }; - }, - - playMessage: async (messageId: string, filePath: string, startOffset: number = 0) => { - const { settings } = get(); - - if (get().currentMessageId === messageId && get().isSpeaking) { - get().stopPlayback(); - return; - } - - ttsService.stop(); - set({ isSpeaking: true, currentMessageId: messageId, playbackPosition: startOffset }); - - try { - await ttsService.playFromFile(filePath, settings.speed, startOffset); - } catch (err) { - const msg = err instanceof Error ? 
err.message : 'Playback failed'; - logger.error('[TTS Store] Playback error:', msg); - set({ error: msg }); - } finally { - set({ isSpeaking: false, currentMessageId: null, playbackPosition: 0 }); - } - }, - - stopPlayback: () => { - ttsService.stop(); - set({ isSpeaking: false, currentMessageId: null, playbackPosition: 0 }); - }, - - // ── Cache ────────────────────────────────────────────────────────────── - - refreshCacheSize: async () => { - const mb = await ttsService.getAudioCacheSizeMB(); - set({ audioCacheSizeMB: mb }); - }, - - clearAudioCache: async () => { - await ttsService.clearAudioCache(); - set({ audioCacheSizeMB: 0 }); - }, - - updateSettings: (patch) => { - set((state) => ({ settings: { ...state.settings, ...patch } })); - }, - - clearError: () => set({ error: null }), - }), - { - name: 'tts-store', - storage: createJSONStorage(() => AsyncStorage), - partialize: (state) => ({ settings: state.settings }), - } - ) -); +// Called once at startup, before any model hook is used +initExecutorch({ resourceFetcher: BareResourceFetcher }); ``` ---- +### KokoroTTSManager component -### 4. `src/hooks/useTTS.ts` +`react-native-executorch`'s `useTextToSpeech` is a React hook — it must live in a component. A `KokoroTTSManager` component mounts near the root, holds the hook instance, and exposes its methods via a module-level ref (`kokoroRef`). 
-```typescript -import { useEffect, useCallback } from 'react'; -import { useTTSStore } from '../stores/ttsStore'; -import { hardwareService } from '../services/hardware'; -import { TTS_BLOCK_RAM_GB, TTS_WARN_RAM_GB } from '../constants/ttsModels'; - -export function useTTS() { - const store = useTTSStore(); - - useEffect(() => { - store.checkDownloadStatus(); - }, []); - - const canRunOnDevice = useCallback(async (): Promise<{ allowed: boolean; warning: boolean }> => { - const ramGB = await hardwareService.getTotalMemoryGB(); - return { - allowed: ramGB >= TTS_BLOCK_RAM_GB, - warning: ramGB < TTS_WARN_RAM_GB, - }; - }, []); - - const speakMessage = useCallback( - (text: string, messageId: string) => { - if (!store.isModelLoaded && store.isBackboneDownloaded && store.isVocoderDownloaded) { - store.loadModels().then(() => store.speak(text, messageId)); - return; - } - store.speak(text, messageId); - }, - [store] - ); - - return { - ...store, - speakMessage, - canRunOnDevice, - areBothDownloaded: store.isBackboneDownloaded && store.isVocoderDownloaded, - isDownloading: store.isDownloadingBackbone || store.isDownloadingVocoder, - overallDownloadProgress: - store.backboneDownloadProgress * 0.86 + store.vocoderDownloadProgress * 0.14, - isAudioMode: store.settings.interfaceMode === 'audio', - isChatMode: store.settings.interfaceMode === 'chat', - }; -} ``` - ---- - -### 5. `src/components/AudioMessageBubble/index.tsx` *(Audio Mode only)* - -Replaces `ChatMessage` assistant bubble when `interfaceMode === 'audio'`. 
- -```typescript -interface AudioMessageBubbleProps { - messageId: string; - conversationId: string; - audioPath: string; // path to WAV on disk - waveformData: number[]; // 200-point amplitude array - durationSeconds: number; - isGenerating?: boolean; // true while TTS is still running -} +App +└── KokoroTTSManager ← mounts useTextToSpeech, wires to kokoroRef + └── AppNavigator + └── ChatScreen + └── TTSButton ← calls kokoroRef.stream(text, callbacks) ``` -**Layout:** -- Static waveform bar (200 rect bars, amplitude-scaled, filled up to scrubber position) -- Draggable scrubber thumb -- `MM:SS` elapsed / total -- Speed chip (cycles 0.5x → 1x → 1.5x → 2x, persists to store) -- "Show transcript" collapse/expand -- Long press → action sheet (Change voice, Regenerate, Copy text, Delete) - ---- - -### 6. `src/components/TTSButton/index.tsx` *(Chat Mode only)* - -Play/stop button that appears on each assistant message bubble. Unchanged from original plan — only rendered when `interfaceMode === 'chat'`. +### Speak flow (Chat Mode — Kokoro, fast) -```typescript -// Don't render in Audio Mode or if TTS disabled/not downloaded -if (settings.interfaceMode === 'audio' || !settings.enabled || !areBothDownloaded) return null; ``` - ---- - -### 7. `src/screens/TTSSettingsScreen/index.tsx` - -Accessible from SettingsScreen → "Text to Speech" row. - -**Sections:** -1. **Header** — back button + "Text to Speech" title -2. **Interface Mode card** — segmented control: `Chat` / `Audio` - - If device RAM < `TTS_BLOCK_RAM_GB`: Audio option is greyed out with "Requires 6GB+ RAM" - - If RAM is between block and warn thresholds: yellow warning under the control -3. **Master toggle card** — enable/disable TTS (Chat Mode only — in Audio Mode, TTS is always on) -4. **Model download card** — download status for both files with separate progress bars; "Download (527 MB)" / "Remove" buttons -5. **Voice card** (shown when downloaded) — voice picker from `TTS_SPEAKER_PROFILES` -6. 
**Playback card** (shown when downloaded) — Speed slider (0.5–2.0x), Auto-play toggle (Chat Mode only) -7. **Audio cache card** (Audio Mode only) — "Audio cache: X MB" + "Clear cache" button -8. **Device compatibility card** — RAM check with status -9. **Privacy card** — "All speech generated on your device. Nothing is sent to any server." - ---- - -### 8. `src/stores/index.ts` - -Add: -```typescript -export { useTTSStore } from './ttsStore'; +TTSButton tap + → kokoroRef.stream({ text, onNext: playChunk, onBegin, onEnd }) + → AudioContext buffers played as Float32Array chunks arrive + → Streaming: audio starts < 1s after tap ``` -### 9. `src/services/index.ts` +### Voice input flow (Audio Mode — user side) -Add: -```typescript -export { ttsService } from './ttsService'; ``` - -### 10. `src/navigation/types.ts` - -Add `TTSSettings: undefined` to `RootStackParamList`. - -### 11. `src/navigation/AppNavigator.tsx` - -```tsx - +User taps mic button + → audioRecorderService.startRecording() — records WAV to disk + → User releases mic + → audioRecorderService.stopRecording() → { path, durationSeconds } + → whisperService.transcribeFile(path) — file-based STT + → onAutoSend(transcript, { uri: path, format: 'wav', durationSeconds }) + → ChatInput builds MediaAttachment { type: 'audio', uri, durationSeconds } + → onSend(transcript, [audioAttachment]) — content = transcript, attachment = WAV + → MessageRenderer: user message with audio attachment → right-aligned AudioMessageBubble + → LLM receives transcript as text input (standard text generation) ``` -### 12. `src/screens/index.ts` - -Export `TTSSettingsScreen` and `AudioMessageBubble`. +For models that natively support audio input (e.g. Qwen2-Audio): WAV is passed directly as `input_audio` to the model — Whisper is bypassed entirely. -### 13. 
`src/screens/SettingsScreen.tsx` +### Generate+Save flow (Audio Mode — AI side) -Add nav row pointing to `TTSSettings` (after the Voice row): -```tsx - navigation.navigate('TTSSettings')}> - - Text to Speech - - ``` - -### 14. `src/components/ChatMessage/index.tsx` - -Mode-branch the assistant message render path: - -```tsx -import { AudioMessageBubble } from '../AudioMessageBubble'; -import { TTSButton } from '../TTSButton'; - -// In assistant message render: -const { settings } = useTTSStore(); - -if (settings.interfaceMode === 'audio' && message.audioPath) { - return ( - - ); -} - -// Chat Mode: existing text bubble + TTSButton +Streaming LLM response ends + → triggerAudioModeGeneration(conversationId, messageId, content) + (reads fresh message from useChatStore.getState() — not stale closure) + → ttsService.generateAndSave(text, ctx, options) + → OuteTTS runs inference → Float32Array + waveformData + duration + → Write PCM to disk → update message { audioPath, waveformData, audioDurationSeconds } + → MessageRenderer shows left-aligned AudioMessageBubble ``` -This requires adding `audioPath`, `waveformData`, `audioDurationSeconds`, and `isGeneratingAudio` fields to the message model. +--- -### 15. Message model update (`src/types/` or wherever `Message` is defined) +## ttsStore additions ```typescript -export interface Message { - // ... existing fields ... 
- audioPath?: string; // Audio Mode: path to WAV on disk - waveformData?: number[]; // Audio Mode: 200-point amplitude envelope - audioDurationSeconds?: number; // Audio Mode: total duration - isGeneratingAudio?: boolean; // true while TTS is running for this message -} +// Kokoro state +kokoroReady: boolean; // useTextToSpeech.isReady +kokoroDownloadProgress: number; // 0–1, during initial model download +kokoroVoiceId: KokoroVoiceId; // persisted setting + +// Actions +setKokoroReady: (ready: boolean, progress: number) => void; +kokoroSpeak: (text: string, messageId: string) => void; // delegates to kokoroRef +kokoroStop: () => void; ``` -### 16. Chat completion flow - -**Chat Mode (autoPlay):** unchanged from original plan — call `speak()` after streaming completes when `autoPlay: true`. - -**Audio Mode:** after streaming completes, immediately trigger `generateAndSave()` and update the message record with the returned `audioPath`, `waveformData`, `durationSeconds`. Set `isGeneratingAudio: true` on the message while generation runs so the bubble shows a loading state. 
- +The existing `speak()` action becomes: ```typescript -// After streaming completes, if Audio Mode: -if (settings.interfaceMode === 'audio') { - updateMessage(lastMessage.id, { isGeneratingAudio: true }); - const { path, waveformData, durationSeconds } = await ttsStore.generateAndSave( - stripControlTokens(lastMessage.content), - conversationId, - lastMessage.id, - ); - updateMessage(lastMessage.id, { - audioPath: path, - waveformData, - audioDurationSeconds: durationSeconds, - isGeneratingAudio: false, - }); +speak: (text, messageId) => { + if (kokoroReady) { + kokoroSpeak(text, messageId); // fast path + } else { + // OuteTTS fallback (slow, Android <13 or first launch before Kokoro loads) + outeTTSSpeak(text, messageId); + } } ``` --- -## Tests to Write - -### `__tests__/unit/services/ttsService.test.ts` -- `generate` calls `getFormattedAudioCompletion`, `getAudioCompletionGuideTokens`, `completion`, `decodeAudioTokens` in order -- `generate` returns correct `durationSeconds` and 200-point `waveformData` -- `saveToFile` writes a valid WAV file to the correct path -- `generateAndSave` calls both and returns path + audio -- `playFromFile` reads WAV, decodes, and calls `playFromSamples` -- `stop` sets `isSpeakingFlag` to false and calls `currentSource.stop()` -- `encodeWAV` / `decodeWAV` round-trip preserves samples (within 16-bit quantization error) -- `getAudioCacheSizeMB` returns correct value -- `clearAudioCache` removes the cache directory - -### `__tests__/unit/stores/ttsStore.test.ts` -- `generateAndSave` sets correct waveformData and calls `refreshCacheSize` -- `playMessage` sets `isSpeaking: true`, then `false` after completion -- `playMessage` on same messageId while playing → calls `stopPlayback` -- `updateSettings` merges partial settings correctly -- Settings persisted: `interfaceMode`, `speed`, `voiceId`, `enabled` survive re-hydration - -### `__tests__/integration/tts.test.ts` -- **Chat Mode full flow:** download → load → speak → stop -- **Audio 
Mode full flow:** download → load → generateAndSave → playMessage → stop -- **Auto-play:** Chat Mode with `autoPlay: true`, streaming completes → `speak` called -- **Audio Mode post-completion:** streaming completes → `generateAndSave` called → message updated with `audioPath` -- **Mode switch:** switching `interfaceMode` from `'chat'` to `'audio'` takes effect for next message +## Kokoro Voice IDs ---- - -## Implementation Order - -1. `src/constants/ttsModels.ts` -2. `src/services/ttsService.ts` (with WAV encode/decode + `generate`/`generateAndSave`/`playFromFile`) -3. `src/stores/ttsStore.ts` (with Audio Mode actions) -4. `src/hooks/useTTS.ts` -5. `src/stores/index.ts` — add export -6. `src/services/index.ts` — add export -7. `src/navigation/types.ts` — add route -8. Message model — add `audioPath`, `waveformData`, `audioDurationSeconds`, `isGeneratingAudio` -9. `src/components/AudioMessageBubble/index.tsx` -10. `src/components/TTSButton/index.tsx` (Chat Mode only, unchanged) -11. `src/screens/TTSSettingsScreen/index.tsx` (with Interface Mode section) -12. `src/screens/index.ts` — add exports -13. `src/navigation/AppNavigator.tsx` — add screen -14. `src/screens/SettingsScreen.tsx` — add nav row -15. `src/components/ChatMessage/index.tsx` — mode-branch render -16. Wire Audio Mode generation into chat completion flow -17. Write all tests -18. 
`npm install react-native-audio-api` + `pod install` +| ID | Label | Accent | Gender | +|---|---|---|---| +| `af_heart` | Heart | US English | Female | +| `af_river` | River | US English | Female | +| `af_sarah` | Sarah | US English | Female | +| `am_adam` | Adam | US English | Male | +| `am_michael` | Michael | US English | Male | +| `am_santa` | Santa | US English | Male | +| `bf_emma` | Emma | British English | Female | +| `bm_daniel` | Daniel | British English | Male | --- -## Memory Safety +## Files to Create / Modify -Before calling `loadModels()`, check available memory: +### New files +- `src/components/KokoroTTSManager.tsx` — mounts the hook, exposes via ref +- `src/constants/kokoroModels.ts` — voice/model constants mirroring executorch exports -```typescript -const available = await hardwareService.getAvailableMemoryGB(); -if (available < 1.0) { - throw new Error('Not enough free memory. Try closing image generation first.'); -} -``` +### Modified files +- `App.tsx` — add `initExecutorch()` call + mount `<KokoroTTSManager />` +- `src/stores/ttsStore.ts` — add Kokoro state + `kokoroVoiceId` setting +- `src/services/ttsService.ts` — no change to OuteTTS path +- `src/components/TTSButton/index.tsx` — use Kokoro speak when available +- `src/screens/TTSSettingsScreen/index.tsx` — add voice picker (8 Kokoro voices) -This check belongs in `useTTSStore.loadModels()` before calling `ttsService.loadModels()`. +### android/build.gradle +- `minSdkVersion`: **leave at 24** — do not bump it for executorch; guard Kokoro at runtime via `Platform.Version >= 33` --- -## Future: Upgrade to OuteTTS 1.0 - -When llama.cpp PR#12794 (DAC decoder) merges and llama.rn PR#300 (codec.cpp integration) ships: - -1. Add `TTS_BACKBONE_MODEL_V2` to `ttsModels.ts` (backbone + DAC vocoder GGUF) -2. `ttsService.ts` API is unchanged — model-agnostic -3.
Store gets a `modelVersion` setting; 0.3 and 1.0 can coexist on disk +## Status + +| Task | Status | +|---|---| +| OuteTTS speak (Chat Mode) | ✅ Implemented (slow, functional) | +| OuteTTS generate+save (Audio Mode — AI side) | ✅ Implemented | +| Stale-closure bug fix (reads fresh store state) | ✅ Fixed | +| TTSButton + Speak long-press action | ✅ Implemented | +| Generation vs playback state (spinner) | ✅ Implemented | +| 300-char text truncation | ✅ Implemented | +| checkDownloadStatus on app start | ✅ Implemented | +| User voice recording → audio bubble (Audio Mode) | ✅ Implemented | +| Auto-send on voice stop in Audio Mode | ✅ Implemented | +| User audio bubble right-aligned | ✅ Implemented | +| TTS section in Chat Settings modal | ✅ Implemented | +| Chat Settings modal: TTS Settings deep link | ✅ Implemented | +| Multimodal audio input (bypass Whisper for audio-capable models) | ✅ Implemented | +| Kokoro via react-native-executorch | 🔲 Not started | +| KokoroTTSManager component | 🔲 Not started | +| Voice picker in TTSSettingsScreen | 🔲 Not started | +| Kokoro → OuteTTS fallback for Android <13 | 🔲 Not started | diff --git a/ios/Podfile.lock b/ios/Podfile.lock index a076829d..3f58a70e 100644 --- a/ios/Podfile.lock +++ b/ios/Podfile.lock @@ -2797,6 +2797,121 @@ PODS: - React-perflogger (= 0.83.1) - React-utils (= 0.83.1) - SocketRocket + - RNAudioAPI (0.11.7): + - boost + - DoubleConversion + - fast_float + - fmt + - glog + - hermes-engine + - RCT-Folly + - RCT-Folly/Fabric + - RCTRequired + - RCTTypeSafety + - React-Core + - React-debug + - React-Fabric + - React-featureflags + - React-graphics + - React-ImageManager + - React-jsi + - React-NativeModulesApple + - React-RCTFabric + - React-renderercss + - React-rendererdebug + - React-utils + - ReactCodegen + - ReactCommon/turbomodule/bridging + - ReactCommon/turbomodule/core + - RNAudioAPI/audioapi (= 0.11.7) + - SocketRocket + - Yoga + - RNAudioAPI/audioapi (0.11.7): + - boost + - DoubleConversion + - 
fast_float + - fmt + - glog + - hermes-engine + - RCT-Folly + - RCT-Folly/Fabric + - RCTRequired + - RCTTypeSafety + - React-Core + - React-debug + - React-Fabric + - React-featureflags + - React-graphics + - React-ImageManager + - React-jsi + - React-NativeModulesApple + - React-RCTFabric + - React-renderercss + - React-rendererdebug + - React-utils + - ReactCodegen + - ReactCommon/turbomodule/bridging + - ReactCommon/turbomodule/core + - RNAudioAPI/audioapi/audioapi_dsp (= 0.11.7) + - RNAudioAPI/audioapi/ios (= 0.11.7) + - SocketRocket + - Yoga + - RNAudioAPI/audioapi/audioapi_dsp (0.11.7): + - boost + - DoubleConversion + - fast_float + - fmt + - glog + - hermes-engine + - RCT-Folly + - RCT-Folly/Fabric + - RCTRequired + - RCTTypeSafety + - React-Core + - React-debug + - React-Fabric + - React-featureflags + - React-graphics + - React-ImageManager + - React-jsi + - React-NativeModulesApple + - React-RCTFabric + - React-renderercss + - React-rendererdebug + - React-utils + - ReactCodegen + - ReactCommon/turbomodule/bridging + - ReactCommon/turbomodule/core + - SocketRocket + - Yoga + - RNAudioAPI/audioapi/ios (0.11.7): + - boost + - DoubleConversion + - fast_float + - fmt + - glog + - hermes-engine + - RCT-Folly + - RCT-Folly/Fabric + - RCTRequired + - RCTTypeSafety + - React-Core + - React-debug + - React-Fabric + - React-featureflags + - React-graphics + - React-ImageManager + - React-jsi + - React-NativeModulesApple + - React-RCTFabric + - React-renderercss + - React-rendererdebug + - React-utils + - ReactCodegen + - ReactCommon/turbomodule/bridging + - ReactCommon/turbomodule/core + - SocketRocket + - Yoga - RNCAsyncStorage (2.2.0): - boost - DoubleConversion @@ -3368,6 +3483,7 @@ DEPENDENCIES: - ReactAppDependencyProvider (from `build/generated/ios/ReactAppDependencyProvider`) - ReactCodegen (from `build/generated/ios/ReactCodegen`) - ReactCommon/turbomodule/core (from `../node_modules/react-native/ReactCommon`) + - RNAudioAPI (from 
`../node_modules/react-native-audio-api`) - "RNCAsyncStorage (from `../node_modules/@react-native-async-storage/async-storage`)" - RNDeviceInfo (from `../node_modules/react-native-device-info`) - RNFS (from `../node_modules/react-native-fs`) @@ -3566,6 +3682,8 @@ EXTERNAL SOURCES: :path: build/generated/ios/ReactCodegen ReactCommon: :path: "../node_modules/react-native/ReactCommon" + RNAudioAPI: + :path: "../node_modules/react-native-audio-api" RNCAsyncStorage: :path: "../node_modules/@react-native-async-storage/async-storage" RNDeviceInfo: @@ -3684,6 +3802,7 @@ SPEC CHECKSUMS: ReactAppDependencyProvider: 0eb286cc274abb059ee601b862ebddac2e681d01 ReactCodegen: 3d48510bcef445f6403c0004047d4d9cbb915435 ReactCommon: ac934cb340aee91282ecd6f273a26d24d4c55cae + RNAudioAPI: 106257d5f3713bb667d6d74ebb3105c9cf5d60db RNCAsyncStorage: 29f0230e1a25f36c20b05f65e2eb8958d6526e82 RNDeviceInfo: 36d7f232bfe7c9b5c494cb7793230424ed32c388 RNFS: 89de7d7f4c0f6bafa05343c578f61118c8282ed8 diff --git a/jest.setup.ts b/jest.setup.ts index 15d0f8cb..af694a3d 100644 --- a/jest.setup.ts +++ b/jest.setup.ts @@ -149,6 +149,61 @@ jest.mock('whisper.rn', () => ({ }, }), { virtual: true }); +// react-native-audio-api mock +jest.mock('react-native-audio-api', () => ({ + AudioContext: jest.fn().mockImplementation(() => ({ + createBuffer: jest.fn().mockReturnValue({ copyToChannel: jest.fn() }), + createBufferSource: jest.fn().mockReturnValue({ + connect: jest.fn(), + start: jest.fn(), + stop: jest.fn(), + playbackRate: { value: 1.0 }, + onEnded: null, + buffer: null, + }), + destination: {}, + close: jest.fn(), + })), + AudioRecorder: jest.fn().mockImplementation(() => ({ + enableFileOutput: jest.fn().mockReturnValue({ status: 'success', path: '/mock/audio/input.wav' }), + start: jest.fn().mockReturnValue({ status: 'success', path: '/mock/audio/input.wav' }), + stop: jest.fn().mockReturnValue({ status: 'success', path: '/mock/audio/input.wav', size: 1024, duration: 1.0 }), + pause: jest.fn(), + resume: 
jest.fn(), + isRecording: jest.fn().mockReturnValue(false), + isPaused: jest.fn().mockReturnValue(false), + })), + FileFormat: { Wav: 0, Caf: 1, M4A: 2, Flac: 3 }, + FileDirectory: { Document: 0, Cache: 1 }, +}), { virtual: true }); + +// @react-native-community/slider mock +jest.mock('@react-native-community/slider', () => { + const { View } = require('react-native'); + return { __esModule: true, default: View }; +}); + +// react-native-executorch mock +const mockVoiceConfig = { id: 'mock_voice' }; +jest.mock('react-native-executorch', () => ({ + useTextToSpeech: jest.fn(() => ({ + isReady: true, + downloadProgress: 1, + error: null, + stream: jest.fn(() => Promise.resolve()), + streamStop: jest.fn(), + })), + KOKORO_MEDIUM: 'kokoro-medium', + KOKORO_VOICE_AF_HEART: mockVoiceConfig, + KOKORO_VOICE_AF_RIVER: mockVoiceConfig, + KOKORO_VOICE_AF_SARAH: mockVoiceConfig, + KOKORO_VOICE_AM_ADAM: mockVoiceConfig, + KOKORO_VOICE_AM_MICHAEL: mockVoiceConfig, + KOKORO_VOICE_AM_SANTA: mockVoiceConfig, + KOKORO_VOICE_BF_EMMA: mockVoiceConfig, + KOKORO_VOICE_BM_DANIEL: mockVoiceConfig, +})); + // react-native-fs mock jest.mock('react-native-fs', () => ({ DocumentDirectoryPath: '/mock/documents', diff --git a/package-lock.json b/package-lock.json index 9353548f..1d6a7f40 100644 --- a/package-lock.json +++ b/package-lock.json @@ -9,7 +9,9 @@ "version": "0.0.86", "hasInstallScript": true, "dependencies": { + "@dr.pogodin/react-native-fs": "^2.38.1", "@gorhom/bottom-sheet": "^5.2.8", + "@kesha-antonov/react-native-background-downloader": "^4.5.4", "@op-engineering/op-sqlite": "^15.2.5", "@react-native-async-storage/async-storage": "^2.2.0", "@react-native-community/blur": "^4.4.1", @@ -31,7 +33,10 @@ "patch-package": "^8.0.1", "react": "19.2.0", "react-native": "0.83.1", + "react-native-audio-api": "^0.11.7", "react-native-device-info": "^15.0.1", + "react-native-executorch": "^0.8.1", + "react-native-executorch-bare-resource-fetcher": "^0.8.0", "react-native-fs": "^2.20.0", 
"react-native-gesture-handler": "^2.30.0", "react-native-haptic-feedback": "^2.3.3", @@ -2113,6 +2118,51 @@ "devOptional": true, "license": "MIT" }, + "node_modules/@dr.pogodin/react-native-fs": { + "version": "2.38.1", + "resolved": "https://registry.npmjs.org/@dr.pogodin/react-native-fs/-/react-native-fs-2.38.1.tgz", + "integrity": "sha512-H5uxbEy61as7m5p4dNhv4a/huO8g9r4weu0FM/UjlgRd1PSYqpZaJBi2nhDGums/N+MrK8IZFOHVV5ukHWX8UQ==", + "license": "MIT", + "workspaces": [ + "example" + ], + "dependencies": { + "buffer": "^6.0.3", + "http-status-codes": "^2.3.0" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/birdofpreyru" + }, + "peerDependencies": { + "react": "*", + "react-native": "*" + } + }, + "node_modules/@dr.pogodin/react-native-fs/node_modules/buffer": { + "version": "6.0.3", + "resolved": "https://registry.npmjs.org/buffer/-/buffer-6.0.3.tgz", + "integrity": "sha512-FTiCpNxtwiZZHEZbcbTIcZjERVICn9yq/pDFkTl95/AxzD1naBctN7YO68riM/gLSDY7sdrMby8hofADYuuqOA==", + "funding": [ + { + "type": "github", + "url": "https://github.com/sponsors/feross" + }, + { + "type": "patreon", + "url": "https://www.patreon.com/feross" + }, + { + "type": "consulting", + "url": "https://feross.org/support" + } + ], + "license": "MIT", + "dependencies": { + "base64-js": "^1.3.1", + "ieee754": "^1.2.1" + } + }, "node_modules/@egjs/hammerjs": { "version": "2.0.17", "resolved": "https://registry.npmjs.org/@egjs/hammerjs/-/hammerjs-2.0.17.tgz", @@ -2559,6 +2609,15 @@ "@hapi/hoek": "^9.0.0" } }, + "node_modules/@huggingface/jinja": { + "version": "0.5.6", + "resolved": "https://registry.npmjs.org/@huggingface/jinja/-/jinja-0.5.6.tgz", + "integrity": "sha512-MyMWyLnjqo+KRJYSH7oWNbsOn5onuIvfXYPcc0WOGxU0eHUV7oAYUoQTl2BMdu7ml+ea/bu11UM+EshbeHwtIA==", + "license": "MIT", + "engines": { + "node": ">=18" + } + }, "node_modules/@humanwhocodes/config-array": { "version": "0.13.0", "resolved": 
"https://registry.npmjs.org/@humanwhocodes/config-array/-/config-array-0.13.0.tgz", @@ -3110,6 +3169,15 @@ "@jridgewell/sourcemap-codec": "^1.4.14" } }, + "node_modules/@kesha-antonov/react-native-background-downloader": { + "version": "4.5.4", + "resolved": "https://registry.npmjs.org/@kesha-antonov/react-native-background-downloader/-/react-native-background-downloader-4.5.4.tgz", + "integrity": "sha512-WH9n7Sy8MebWiVZqZYpvP4q2sJeOIiNLrbHB64ue/YYsXnWtdJ3iMQowv/QEmU2Cw9biI1d2k8LFHKV9oACLsw==", + "license": "Apache-2.0", + "peerDependencies": { + "react-native": ">=0.57.0" + } + }, "node_modules/@motionone/animation": { "version": "10.18.0", "resolved": "https://registry.npmjs.org/@motionone/animation/-/animation-10.18.0.tgz", @@ -8090,6 +8158,12 @@ "node": ">= 0.8" } }, + "node_modules/http-status-codes": { + "version": "2.3.0", + "resolved": "https://registry.npmjs.org/http-status-codes/-/http-status-codes-2.3.0.tgz", + "integrity": "sha512-RJ8XvFvpPM/Dmc5SV+dC4y5PCeOhT3x1Hq0NU3rjGeg5a/CqlhZ7uudknPwZFz4aeAXDcbAyaeP7GAo9lvngtA==", + "license": "MIT" + }, "node_modules/https-proxy-agent": { "version": "7.0.6", "resolved": "https://registry.npmjs.org/https-proxy-agent/-/https-proxy-agent-7.0.6.tgz", @@ -8146,7 +8220,6 @@ "version": "1.2.1", "resolved": "https://registry.npmjs.org/ieee754/-/ieee754-1.2.1.tgz", "integrity": "sha512-dcyqhDvX1C46lXZcVqCpK+FtMRQVdIMN6/Df5js2zouUsqG7I6sFxitIC+7KYK29KdXOLHdu9zL4sFnoVQnqaA==", - "devOptional": true, "funding": [ { "type": "github", @@ -9609,6 +9682,24 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/jsonrepair": { + "version": "3.13.3", + "resolved": "https://registry.npmjs.org/jsonrepair/-/jsonrepair-3.13.3.tgz", + "integrity": "sha512-BTznj0owIt2CBAH/LTo7+1I5pMvl1e1033LRl/HUowlZmJOIhzC0zbX5bxMngLkfT4WnzPP26QnW5wMr2g9tsQ==", + "license": "ISC", + "bin": { + "jsonrepair": "bin/cli.js" + } + }, + "node_modules/jsonschema": { + "version": "1.5.0", + "resolved": 
"https://registry.npmjs.org/jsonschema/-/jsonschema-1.5.0.tgz", + "integrity": "sha512-K+A9hhqbn0f3pJX17Q/7H6yQfD/5OXgdrR5UE12gMXCiN9D5Xq2o5mddV2QEcX/bjla99ASsAAQUyMCCRWAEhw==", + "license": "MIT", + "engines": { + "node": "*" + } + }, "node_modules/jsx-ast-utils": { "version": "3.3.5", "resolved": "https://registry.npmjs.org/jsx-ast-utils/-/jsx-ast-utils-3.3.5.tgz", @@ -11862,6 +11953,15 @@ "node": ">=8.0" } }, + "node_modules/pngjs": { + "version": "7.0.0", + "resolved": "https://registry.npmjs.org/pngjs/-/pngjs-7.0.0.tgz", + "integrity": "sha512-LKWqWJRhstyYo9pGvgor/ivk2w94eSjE3RGVuzLGlr3NmD8bf7RcYGze1mNdEHRP6TRP6rMuDHk5t44hnTRyow==", + "license": "MIT", + "engines": { + "node": ">=14.19.0" + } + }, "node_modules/popmotion": { "version": "11.0.3", "resolved": "https://registry.npmjs.org/popmotion/-/popmotion-11.0.3.tgz", @@ -12220,6 +12320,34 @@ } } }, + "node_modules/react-native-audio-api": { + "version": "0.11.7", + "resolved": "https://registry.npmjs.org/react-native-audio-api/-/react-native-audio-api-0.11.7.tgz", + "integrity": "sha512-2oIoP77Tn2nlouRVfEC3bAsuSyKU6xhGNkSnVXTLLQQZslEDoYX2cN9pVRZoWOqhFrLT8q4IZI9HaFgYL13L1A==", + "license": "MIT", + "dependencies": { + "semver": "^7.7.3" + }, + "bin": { + "setup-rn-audio-api-web": "scripts/setup-rn-audio-api-web.js" + }, + "peerDependencies": { + "react": "*", + "react-native": "*" + } + }, + "node_modules/react-native-audio-api/node_modules/semver": { + "version": "7.7.4", + "resolved": "https://registry.npmjs.org/semver/-/semver-7.7.4.tgz", + "integrity": "sha512-vFKC2IEtQnVhpT78h1Yp8wzwrf8CM+MzKMHGJZfBtzhZNycRFnXsHk6E5TxIkkMsgNS7mdX3AGB7x2QM2di4lA==", + "license": "ISC", + "bin": { + "semver": "bin/semver.js" + }, + "engines": { + "node": ">=10" + } + }, "node_modules/react-native-device-info": { "version": "15.0.1", "resolved": "https://registry.npmjs.org/react-native-device-info/-/react-native-device-info-15.0.1.tgz", @@ -12229,6 +12357,38 @@ "react-native": "*" } }, + 
"node_modules/react-native-executorch": { + "version": "0.8.1", + "resolved": "https://registry.npmjs.org/react-native-executorch/-/react-native-executorch-0.8.1.tgz", + "integrity": "sha512-DEVWs+Ki7p1C8mEgsHiabZizO/kDM0zELlJ+JFCfNCb2RrraMUXBTZIARWHPUbxpG17nqFswIZmwjUoNK5V36g==", + "license": "MIT", + "workspaces": [ + "example" + ], + "dependencies": { + "@huggingface/jinja": "^0.5.0", + "jsonrepair": "^3.12.0", + "jsonschema": "^1.5.0", + "pngjs": "^7.0.0", + "zod": "^4.3.6" + }, + "peerDependencies": { + "react": "*", + "react-native": "*" + } + }, + "node_modules/react-native-executorch-bare-resource-fetcher": { + "version": "0.8.0", + "resolved": "https://registry.npmjs.org/react-native-executorch-bare-resource-fetcher/-/react-native-executorch-bare-resource-fetcher-0.8.0.tgz", + "integrity": "sha512-PzSzK31qnKmwW06+JCbpQML24u3XiqYcWKQG0Y1cwPmkOqz0VppI0ZOeCZh03/03SMyuvwwEgteJtgO0uSP8sg==", + "license": "MIT", + "peerDependencies": { + "@dr.pogodin/react-native-fs": "^2.0.0", + "@kesha-antonov/react-native-background-downloader": "^4.0.0", + "react-native": "*", + "react-native-executorch": "*" + } + }, "node_modules/react-native-fit-image": { "version": "1.5.5", "resolved": "https://registry.npmjs.org/react-native-fit-image/-/react-native-fit-image-1.5.5.tgz", @@ -14716,7 +14876,6 @@ "version": "4.3.6", "resolved": "https://registry.npmjs.org/zod/-/zod-4.3.6.tgz", "integrity": "sha512-rftlrkhHZOcjDwkGlnUtZZkvaPHCsDATp4pGpuOOMDaTdDDXF91wuVDJoWoPsKX/3YPQ5fHuF3STjcYyKr+Qhg==", - "dev": true, "license": "MIT", "funding": { "url": "https://github.com/sponsors/colinhacks" diff --git a/package.json b/package.json index 0650d784..54ceb6a2 100644 --- a/package.json +++ b/package.json @@ -20,7 +20,9 @@ "postinstall": "patch-package" }, "dependencies": { + "@dr.pogodin/react-native-fs": "^2.38.1", "@gorhom/bottom-sheet": "^5.2.8", + "@kesha-antonov/react-native-background-downloader": "^4.5.4", "@op-engineering/op-sqlite": "^15.2.5", 
"@react-native-async-storage/async-storage": "^2.2.0", "@react-native-community/blur": "^4.4.1", @@ -42,7 +44,10 @@ "patch-package": "^8.0.1", "react": "19.2.0", "react-native": "0.83.1", + "react-native-audio-api": "^0.11.7", "react-native-device-info": "^15.0.1", + "react-native-executorch": "^0.8.1", + "react-native-executorch-bare-resource-fetcher": "^0.8.0", "react-native-fs": "^2.20.0", "react-native-gesture-handler": "^2.30.0", "react-native-haptic-feedback": "^2.3.3", diff --git a/src/components/AudioMessageBubble/PlaybackControls.tsx b/src/components/AudioMessageBubble/PlaybackControls.tsx new file mode 100644 index 00000000..1972a263 --- /dev/null +++ b/src/components/AudioMessageBubble/PlaybackControls.tsx @@ -0,0 +1,264 @@ +import React, { useState, useCallback, useEffect, useRef } from 'react'; +import { + View, + Text, + TouchableOpacity, + ActivityIndicator, +} from 'react-native'; +import { ScrollView } from 'react-native-gesture-handler'; +import Slider from '@react-native-community/slider'; +import { stripMarkdownForSpeech } from '../../utils/messageContent'; +import { MarkdownText } from '../MarkdownText'; +import Icon from 'react-native-vector-icons/Feather'; +import { useTTSStore } from '../../stores/ttsStore'; +import type { ThemeColors } from '../../theme'; + +const SPEED_STEPS: number[] = [0.5, 0.8, 0.9, 1.0, 1.1, 1.2, 1.5, 2.0]; + +function formatDuration(seconds: number): string { + const m = Math.floor(seconds / 60); + const s = Math.floor(seconds % 60); + return `${m}:${s.toString().padStart(2, '0')}`; +} + +interface PlaybackState { + isThisPlaying: boolean; + isThisPaused: boolean; + isThisAudible: boolean; + isThisLoading: boolean; +} + +/** Derives playback state for a given messageId from TTS store selectors */ +export function usePlaybackState(messageId: string): PlaybackState { + const isSpeaking = useTTSStore((s) => s.isSpeaking); + const isPaused = useTTSStore((s) => s.isPaused); + const isAudioPlaying = useTTSStore((s) => 
s.isAudioPlaying); + const currentMessageId = useTTSStore((s) => s.currentMessageId); + + const isThisPlaying = isSpeaking && currentMessageId === messageId && !isPaused; + const isThisPaused = isSpeaking && currentMessageId === messageId && isPaused; + const isThisAudible = isAudioPlaying && currentMessageId === messageId && !isPaused; + const isThisLoading = isThisPlaying && !isThisAudible; + + return { isThisPlaying, isThisPaused, isThisAudible, isThisLoading }; +} + +/** Hook for wall-clock elapsed timer */ +export function useElapsedTimer( + playback: { isThisAudible: boolean; isThisPaused: boolean }, + seekOffsetRef: React.MutableRefObject, +) { + const { isThisAudible, isThisPaused } = playback; + // playSessionId is a monotonic counter that increments on every new play — + // guarantees the effect re-runs even if boolean deps appear unchanged. + const playSessionId = useTTSStore((s) => s.playSessionId); + const [localElapsed, setLocalElapsed] = useState(0); + const startTimeRef = useRef(0); + const pausedAtRef = useRef(0); + + useEffect(() => { + console.log('[Timer] effect: isThisAudible=', isThisAudible, 'isThisPaused=', isThisPaused, 'playSessionId=', playSessionId); + if (!isThisAudible && !isThisPaused) { + if (seekOffsetRef.current === 0) { + setLocalElapsed(0); + pausedAtRef.current = 0; + } + console.log('[Timer] not audible, not paused — resetting'); + return; + } + if (isThisPaused) { + pausedAtRef.current = localElapsed; + console.log('[Timer] paused at', localElapsed); + return; + } + const offset = seekOffsetRef.current || pausedAtRef.current; + seekOffsetRef.current = 0; + startTimeRef.current = Date.now() - offset * 1000; + console.log('[Timer] STARTING interval, offset=', offset); + const id = setInterval(() => { + setLocalElapsed((Date.now() - startTimeRef.current) / 1000); + }, 50); + return () => { console.log('[Timer] CLEARING interval'); clearInterval(id); }; + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [isThisAudible, 
isThisPaused, playSessionId]); + + return { localElapsed, setLocalElapsed }; +} + +/** Play/pause button with loading states */ +export const PlayButton: React.FC<{ + isLoading: boolean; + isThisLoading: boolean; + isThisPlaying: boolean; + onPlayPause: () => void; + colors: ThemeColors; + styles: any; +}> = ({ isLoading, isThisLoading, isThisPlaying, onPlayPause, colors, styles }) => { + if (isLoading) { + return ( + + + + ); + } + if (isThisLoading) { + return ( + + + + ); + } + return ( + + + + ); +}; + +/** Speed cycle chip */ +export const SpeedChip: React.FC<{ + styles: any; +}> = ({ styles }) => { + const speed = useTTSStore((s) => s.settings.speed); + const updateSettings = useTTSStore((s) => s.updateSettings); + + const handleSpeedCycle = useCallback(() => { + let idx = SPEED_STEPS.indexOf(speed); + if (idx < 0) { + idx = SPEED_STEPS.findIndex((s) => s > speed) - 1; + if (idx < 0) idx = 0; + } + const next = (idx + 1) % SPEED_STEPS.length; + updateSettings({ speed: SPEED_STEPS[next] }); + }, [speed, updateSettings]); + + return ( + + {speed}x + + ); +}; + +/** Duration display */ +export const DurationText: React.FC<{ + isLoading: boolean; + totalDuration: number; + styles: any; +}> = ({ isLoading, totalDuration, styles }) => ( + + {isLoading ? 
'—' : formatDuration(totalDuration)} + +); + +/** Seekable progress bar using native Slider component */ +export const SeekBar: React.FC<{ + displayProgress: number; + colors: ThemeColors; + styles: any; + onSeek: (fraction: number) => void; +}> = ({ displayProgress, colors, styles, onSeek }) => { + const [isSeeking, setIsSeeking] = useState(false); + const [seekValue, setSeekValue] = useState(0); + + return ( + { setIsSeeking(true); setSeekValue(val); }} + onValueChange={(val) => { if (isSeeking) setSeekValue(val); }} + onSlidingComplete={(val) => { setIsSeeking(false); onSeek(val); }} + /> + ); +}; + +/** Transcript toggle and content */ +export const TranscriptToggle: React.FC<{ + transcript?: string; + colors: ThemeColors; + styles: any; + isOpen: boolean; + onToggle: (v: boolean) => void; +}> = ({ transcript, colors, styles, isOpen, onToggle }) => { + if (!transcript) return null; + + return ( + onToggle(!isOpen)} + style={styles.transcriptToggle} + > + + {isOpen ? 'Hide transcript' : 'Show transcript'} + + + + ); +}; + +export const TranscriptContent: React.FC<{ + transcript: string; + styles: any; +}> = ({ transcript, styles }) => ( + + + {transcript} + + +); + +/** Hook for seek logic */ +interface SeekHandlerParams { + transcript: string | undefined; + audioPath: string; + messageId: string; + totalDurationRef: React.MutableRefObject; + seekOffsetRef: React.MutableRefObject; + setLocalElapsed: (v: number) => void; + setIsSeeking: (v: boolean) => void; +} + +export function useSeekHandler({ + transcript, audioPath, messageId, + totalDurationRef, seekOffsetRef, setLocalElapsed, setIsSeeking, +}: SeekHandlerParams) { + const stop = useTTSStore((s) => s.stop); + const speak = useTTSStore((s) => s.speak); + + return useCallback((fraction: number) => { + if (!transcript || audioPath) return; + const text = stripMarkdownForSpeech(transcript); + const charOffset = Math.floor(fraction * text.length); + const seekPoint = text.lastIndexOf('. 
', charOffset) + 2 || charOffset; + const remaining = text.slice(seekPoint).trim(); + console.log(`[AudioBubble] seeking to ${Math.round(fraction * 100)}%`, 'charOffset:', charOffset, 'remaining:', remaining.length, 'chars'); + if (!remaining) return; + const seekSeconds = Math.floor(fraction * totalDurationRef.current); + seekOffsetRef.current = seekSeconds; + setLocalElapsed(seekSeconds); + setIsSeeking(true); + stop(); + setTimeout(() => { + speak(remaining, messageId).finally(() => setIsSeeking(false)); + }, 200); + }, [transcript, audioPath, stop, speak, messageId, totalDurationRef, seekOffsetRef, setLocalElapsed, setIsSeeking]); +} diff --git a/src/components/AudioMessageBubble/index.tsx b/src/components/AudioMessageBubble/index.tsx new file mode 100644 index 00000000..c18cfa6c --- /dev/null +++ b/src/components/AudioMessageBubble/index.tsx @@ -0,0 +1,390 @@ +import React, { useState, useCallback, useEffect, useRef, useMemo } from 'react'; +import { + View, + Text, + TouchableOpacity, + StyleSheet, + Animated, +} from 'react-native'; +import { stripMarkdownForSpeech } from '../../utils/messageContent'; +import { useTheme, useThemedStyles } from '../../theme'; +import { useTTSStore } from '../../stores/ttsStore'; +import { triggerHaptic } from '../../utils/haptics'; +import { TYPOGRAPHY, SPACING } from '../../constants'; +import type { ThemeColors, ThemeShadows } from '../../theme'; +import { ActionMenuSheet } from '../ChatMessage/components/ActionMenuSheet'; +import { createStyles as createChatStyles } from '../ChatMessage/styles'; +import { + usePlaybackState, + useElapsedTimer, + useSeekHandler, + PlayButton, + SpeedChip, + DurationText, + SeekBar, + TranscriptToggle, + TranscriptContent, +} from './PlaybackControls'; + +const WAVEFORM_BARS = 48; + +interface AudioMessageBubbleProps { + messageId: string; + audioPath: string; + waveformData: number[]; + durationSeconds: number; + transcript?: string; + isUser?: boolean; + isLoading?: boolean; + 
_reasoningContent?: string; + onCopy?: (content: string) => void; + onRetry?: () => void; + onEdit?: (newContent: string) => void; +} + +function subsample(data: number[], count: number): number[] { + if (data.length === 0) { + return Array.from({ length: count }, (_, i) => 0.25 + 0.25 * Math.sin((i / count) * Math.PI * 4)); + } + const step = data.length / count; + const result: number[] = []; + for (let i = 0; i < count; i++) { + result.push(data[Math.floor(i * step)] ?? 0.1); + } + return result; +} + +function normalize(data: number[]): number[] { + const max = Math.max(...data, 0.001); + return data.map((v) => v / max); +} + +/** WhatsApp-style waveform — bars tint as the playhead passes over them. + * Played bars are full color, unplayed bars are muted. */ +const WaveformBars: React.FC<{ + data: number[]; + colors: ThemeColors; + /** 0–1 playback progress — bars behind the playhead are tinted */ + progress?: number; +}> = ({ data, colors, progress = 0 }) => { + const bars = useMemo(() => normalize(subsample(data, WAVEFORM_BARS)), [data]); + + return ( + + {bars.map((shape, i) => { + const played = progress > 0 && (i / bars.length) < progress; + return ( + + ); + })} + + ); +}; + +const barStyles = StyleSheet.create({ + container: { + flex: 1, + flexDirection: 'row', + alignItems: 'center', + gap: 1.5, + height: 40, + overflow: 'hidden', + }, + bar: { + flex: 1, + borderRadius: 2, + }, +}); + +/** Three pulsing dots shown while the LLM is generating */ +const ThinkingDots: React.FC<{ colors: ThemeColors }> = ({ colors }) => { + const dots = useRef([new Animated.Value(0.3), new Animated.Value(0.3), new Animated.Value(0.3)]).current; + + useEffect(() => { + const anims = dots.map((v, i) => + Animated.loop( + Animated.sequence([ + Animated.delay(i * 150), + Animated.timing(v, { toValue: 1, duration: 300, useNativeDriver: false }), + Animated.timing(v, { toValue: 0.3, duration: 300, useNativeDriver: false }), + ]), + ), + ); + anims.forEach((a) => a.start()); + 
return () => anims.forEach((a) => a.stop()); + }, [dots]); + + return ( + + {dots.map((v, i) => ( + + ))} + + ); +}; + +const dotStyles = StyleSheet.create({ + container: { + flex: 1, + flexDirection: 'row', + alignItems: 'center', + gap: 6, + paddingHorizontal: 4, + height: 32, + }, + dot: { + width: 7, + height: 7, + borderRadius: 4, + }, +}); + +export const AudioMessageBubble: React.FC = ({ + messageId, + audioPath, + waveformData, + durationSeconds, + transcript, + isUser = false, + isLoading = false, + _reasoningContent, + onCopy, + onRetry, + onEdit, +}) => { + const { colors } = useTheme(); + const styles = useThemedStyles(createStyles); + const chatStyles = useThemedStyles(createChatStyles); + const [showActionMenu, setShowActionMenu] = useState(false); + const speed = useTTSStore((s) => s.settings.speed); + const playMessage = useTTSStore((s) => s.playMessage); + const speak = useTTSStore((s) => s.speak); + + const { isThisPlaying, isThisPaused, isThisAudible, isThisLoading } = usePlaybackState(messageId); + const currentMessageId = useTTSStore((s) => s.currentMessageId); + + useEffect(() => { + console.log('[AudioBubble] state: messageId=', messageId, 'currentMessageId=', currentMessageId, 'isThisAudible=', isThisAudible, 'isThisPlaying=', isThisPlaying); + }, [messageId, currentMessageId, isThisAudible, isThisPlaying]); + const [showTranscript, setShowTranscript] = useState(false); + const [isSeeking, setIsSeeking] = useState(false); + const seekOffsetRef = useRef(0); + const { localElapsed, setLocalElapsed } = useElapsedTimer({ isThisAudible, isThisPaused }, seekOffsetRef); + + const handlePlayPause = useCallback(() => { + const { pause, resume } = useTTSStore.getState(); + if (isThisPaused) { resume(); return; } + if (isThisPlaying) { pause(); return; } + if (audioPath) { + playMessage(messageId, audioPath); + } else { + const text = stripMarkdownForSpeech(transcript ?? 
''); + speak(text, messageId); + } + }, [isThisPlaying, isThisPaused, playMessage, speak, messageId, audioPath, transcript]); + + const totalDurationRef = useRef(0); + const totalDuration = useMemo(() => { + if (!audioPath && transcript) { + const wordCount = transcript.trim().split(/\s+/).filter(Boolean).length; + return Math.max(1, wordCount / (2.5 * speed)); + } + return durationSeconds; + }, [audioPath, transcript, speed, durationSeconds]); + totalDurationRef.current = totalDuration; + + const handleSeek = useSeekHandler({ + transcript, audioPath, messageId, + totalDurationRef, seekOffsetRef, setLocalElapsed, setIsSeeking, + }); + + const isThisActive = ((isThisPlaying || isThisPaused) && currentMessageId === messageId) || isSeeking; + const progress = isThisActive ? Math.min(1, localElapsed / Math.max(1, totalDuration)) : 0; + + // Waveform + seekbar overlay — seekbar sits on top of the waveform, centered vertically + const waveformWithSeek = ( + + {isLoading && !isUser + ? + : } + {!isLoading && ( + + + + )} + + ); + + const handleLongPress = useCallback(() => { + if (isLoading) return; + triggerHaptic('impactMedium'); + setShowActionMenu(true); + }, [isLoading]); + + const showActions = !!(onCopy || onRetry || onEdit); + + return ( + + + + + {waveformWithSeek} + + + + + + + + {showActions && !isLoading && ( + { triggerHaptic('impactLight'); setShowActionMenu(true); }}> + ••• + + )} + + + + + {showTranscript && transcript ? ( + + ) : null} + + setShowActionMenu(false)} + isUser={isUser} + canEdit={isUser && !!onEdit} + canRetry={!!onRetry} + canGenerateImage={false} + canSpeak={false} + styles={chatStyles} + onCopy={() => { onCopy?.(transcript ?? 
''); setShowActionMenu(false); }} + onEdit={() => setShowActionMenu(false)} + onRetry={() => { onRetry?.(); setShowActionMenu(false); }} + onGenerateImage={() => setShowActionMenu(false)} + onSpeak={() => setShowActionMenu(false)} + /> + + ); +}; + +const createStyles = (colors: ThemeColors, _shadows: ThemeShadows) => ({ + bubble: { + backgroundColor: colors.surface, + borderRadius: 12, + borderWidth: 1, + borderColor: colors.border, + padding: SPACING.md, + width: '88%' as const, + alignSelf: 'flex-start' as const, + gap: SPACING.sm, + overflow: 'hidden' as const, + }, + bubbleUser: { + alignSelf: 'flex-end' as const, + backgroundColor: `${colors.primary}18`, + borderColor: `${colors.primary}40`, + }, + playRow: { + flexDirection: 'row' as const, + alignItems: 'center' as const, + gap: SPACING.xs, + }, + metaRow: { + flexDirection: 'row' as const, + alignItems: 'center' as const, + justifyContent: 'space-between' as const, + }, + metaRight: { + flexDirection: 'row' as const, + alignItems: 'center' as const, + gap: SPACING.sm, + }, + playButton: { + width: 28, + height: 28, + borderRadius: 14, + backgroundColor: `${colors.primary}20`, + alignItems: 'center' as const, + justifyContent: 'center' as const, + }, + playButtonDisabled: { + opacity: 0.35, + }, + duration: { + ...TYPOGRAPHY.meta, + color: colors.textMuted, + minWidth: 32, + textAlign: 'right' as const, + }, + speedChip: { + backgroundColor: colors.surfaceLight, + borderRadius: 10, + paddingHorizontal: SPACING.sm, + paddingVertical: SPACING.xs, + borderWidth: 1, + borderColor: colors.border, + }, + speedText: { + ...TYPOGRAPHY.metaSmall, + color: colors.textSecondary, + }, + waveformSeekContainer: { + flex: 1, + position: 'relative' as const, + marginLeft: SPACING.sm, + }, + seekOverlay: { + position: 'absolute' as const, + top: 0, + left: -16, + right: -16, + bottom: 0, + justifyContent: 'center' as const, + }, + seekSlider: { + height: 40, + }, + transcriptToggle: { + flexDirection: 'row' as const, + 
alignItems: 'center' as const, + gap: SPACING.xs, + }, + transcriptToggleText: { + ...TYPOGRAPHY.meta, + color: colors.textMuted, + }, + transcriptContent: { + paddingTop: SPACING.xs, + }, + transcriptScroll: { + maxHeight: 120, + }, + transcriptText: { + ...TYPOGRAPHY.bodySmall, + lineHeight: 20, + }, + actionHint: { + padding: 4, + }, + actionHintText: { + ...TYPOGRAPHY.bodySmall, + color: colors.textMuted, + letterSpacing: 1, + }, +}); diff --git a/src/components/ChatInput/Attachments.tsx b/src/components/ChatInput/Attachments.tsx index bdf90cdf..b96e3b53 100644 --- a/src/components/ChatInput/Attachments.tsx +++ b/src/components/ChatInput/Attachments.tsx @@ -101,9 +101,21 @@ export function useAttachments(setAlertState: (state: AlertState) => void) { } }; + const addAudioAttachment = (uri: string, audioFormat: 'wav' | 'mp3', audioDurationSeconds?: number) => { + const attachment: MediaAttachment = { + id: nextAttachmentId(), + type: 'audio', + uri, + audioFormat, + audioDurationSeconds, + fileName: uri.split('/').pop(), + }; + setAttachments(prev => [...prev, attachment]); + }; + const clearAttachments = () => setAttachments([]); - return { attachments, removeAttachment, clearAttachments, handlePickImage, handlePickDocument }; + return { attachments, removeAttachment, clearAttachments, handlePickImage, handlePickDocument, addAudioAttachment }; } // ─── AttachmentPreview component ───────────────────────────────────────────── @@ -135,6 +147,11 @@ export const AttachmentPreview: React.FC = ({ attachment source={{ uri: attachment.uri }} style={styles.attachmentImage} /> + ) : attachment.type === 'audio' ? 
( + + + Voice + ) : ( diff --git a/src/components/ChatInput/AudioModeLayout.tsx b/src/components/ChatInput/AudioModeLayout.tsx new file mode 100644 index 00000000..ee2a1a4c --- /dev/null +++ b/src/components/ChatInput/AudioModeLayout.tsx @@ -0,0 +1,239 @@ +import React from 'react'; +import { View, TouchableOpacity, Text, ActivityIndicator } from 'react-native'; +import Icon from 'react-native-vector-icons/Feather'; +import { useTheme } from '../../theme'; +import { ImageModeState, MediaAttachment } from '../../types'; +import { VoiceRecordButton } from '../VoiceRecordButton'; +import { triggerHaptic } from '../../utils/haptics'; +import { CustomAlert, hideAlert, AlertState } from '../CustomAlert'; +import { QueueRow } from './Toolbar'; +import { AttachmentPreview } from './Attachments'; +import { AttachPickerPopover, VoicePickerPopover, QuickSettingsPopover } from './Popovers'; +import { useTTSStore } from '../../stores/ttsStore'; +import type { KOKORO_VOICES } from '../../constants/kokoroModels'; + +interface AudioModeLayoutProps { + styles: any; + disabled?: boolean; + isGenerating?: boolean; + imageMode: ImageModeState; + imageModelLoaded: boolean; + supportsThinking: boolean; + supportsToolCalling: boolean; + enabledToolCount: number; + thinkingEnabled: boolean; + currentVoice: typeof KOKORO_VOICES[number]; + // Attachments + attachments: MediaAttachment[]; + onRemoveAttachment: (id: string) => void; + // Queue + queueCount: number; + queuedTexts: string[]; + onClearQueue?: () => void; + // Voice recording + isRecording: boolean; + voiceAvailable: boolean; + isModelLoading: boolean; + isTranscribing: boolean; + partialResult: string; + error: string | null; + onStartRecording: () => void; + onStopRecording: () => void; + onCancelRecording: () => void; + // Handlers + onStop?: () => void; + onImageModeToggle: () => void; + onThinkingToggle: () => void; + onToolsPress?: () => void; + onVisionPress: () => void; + onPickDocument: () => void; + // Popovers + 
attachPicker: any; + voicePicker: any; + quickSettings: any; + supportsVision: boolean; + // Alert + alertState: AlertState; + setAlertState: (s: AlertState) => void; +} + +export const AudioModeLayout: React.FC = ({ + styles, + disabled, + isGenerating, + imageMode, + imageModelLoaded, + supportsThinking, + supportsToolCalling, + enabledToolCount, + thinkingEnabled, + currentVoice, + attachments, + onRemoveAttachment, + queueCount, + queuedTexts, + onClearQueue, + isRecording, + voiceAvailable, + isModelLoading, + isTranscribing, + partialResult, + error, + onStartRecording, + onStopRecording, + onCancelRecording, + onStop, + onImageModeToggle, + onThinkingToggle, + onToolsPress, + onVisionPress, + onPickDocument, + attachPicker, + voicePicker, + quickSettings, + supportsVision, + alertState, + setAlertState, +}) => { + const { colors } = useTheme(); + const isChangingVoice = useTTSStore((s) => s.settings.kokoroVoiceId !== s.kokoroActiveVoiceId); + + const handleStop = () => { + if (onStop && isGenerating) { + triggerHaptic('impactLight'); + onStop(); + } + }; + + const audioStopButton = isGenerating && onStop ? ( + + + + ) : null; + + return ( + + + + + attachPicker.show()} + disabled={disabled} + hitSlop={{ top: 4, bottom: 4, left: 8, right: 8 }} + > + + + { + triggerHaptic('impactLight'); + useTTSStore.getState().updateSettings({ interfaceMode: 'chat' }); + }} + hitSlop={{ top: 4, bottom: 4, left: 8, right: 8 }} + > + + + + + + {supportsThinking && ( + + + + )} + { triggerHaptic('impactLight'); onToolsPress?.(); }} + disabled={disabled || !supportsToolCalling} + hitSlop={{ top: 4, bottom: 4, left: 8, right: 8 }} + > + 0 ? colors.primary : !supportsToolCalling ? colors.textMuted : colors.textSecondary} /> + + voicePicker.show()} + hitSlop={{ top: 4, bottom: 4, left: 8, right: 8 }} + > + {isChangingVoice + ? + : } + {currentVoice.label} + + + {isGenerating && onStop ? 
( + audioStopButton + ) : ( + + )} + + + + + + setAlertState(hideAlert())} + /> + + ); +}; diff --git a/src/components/ChatInput/Popovers.tsx b/src/components/ChatInput/Popovers.tsx index 52a61b69..aaa27521 100644 --- a/src/components/ChatInput/Popovers.tsx +++ b/src/components/ChatInput/Popovers.tsx @@ -1,11 +1,16 @@ import React from 'react'; -import { View, TouchableOpacity, Text, StyleSheet, Modal, TouchableWithoutFeedback } from 'react-native'; +import { View, TouchableOpacity, Text, StyleSheet, Modal, TouchableWithoutFeedback, ActivityIndicator } from 'react-native'; import Icon from 'react-native-vector-icons/Feather'; +import { useNavigation } from '@react-navigation/native'; import { useTheme } from '../../theme'; import { ImageModeState } from '../../types'; -import { useAppStore } from '../../stores'; +import { useAppStore, useTTSStore } from '../../stores'; import { triggerHaptic } from '../../utils/haptics'; -import { FONTS } from '../../constants'; +import { FONTS, TYPOGRAPHY } from '../../constants'; +import { KOKORO_VOICES } from '../../constants/kokoroModels'; +import type { KokoroVoiceId } from '../../constants/kokoroModels'; +import type { NativeStackNavigationProp } from '@react-navigation/native-stack'; +import type { RootStackParamList } from '../../navigation/types'; // ─── Shared Styles ────────────────────────────────────────────────────────── @@ -100,11 +105,30 @@ export const QuickSettingsPopover: React.FC = ({ }) => { const { colors } = useTheme(); const { settings, updateSettings } = useAppStore(); + const { settings: ttsSettings, isBackboneDownloaded, isVocoderDownloaded, isModelLoaded, loadModels, unloadModels, updateSettings: updateTTSSettings } = useTTSStore(); + const navigation = useNavigation>(); if (!visible) return null; const imgBadge = getImageModeBadge(imageMode, colors); const tools = getToolsStyle(supportsToolCalling, enabledToolCount, colors); + const ttsAvailable = isBackboneDownloaded && isVocoderDownloaded; + const 
ttsMode = ttsSettings.interfaceMode; + const ttsBadge = !ttsAvailable + ? { label: 'N/A', bg: colors.textMuted } + : ttsMode === 'audio' + ? { label: 'Audio', bg: colors.primary } + : { label: 'Chat', bg: `${colors.textMuted}80` }; + + const handleTTSToggle = () => { + triggerHaptic('impactLight'); + if (!ttsAvailable) { onClose(); navigation.navigate('TTSSettings'); return; } + onClose(); + const next = ttsMode === 'audio' ? 'chat' : 'audio'; + updateTTSSettings({ interfaceMode: next }); + if (next === 'audio' && !isModelLoaded) { loadModels(); } + if (next === 'chat' && isModelLoaded) { unloadModels(); } + }; return ( @@ -150,6 +174,18 @@ export const QuickSettingsPopover: React.FC = ({ )} + + + Voice + + {ttsBadge.label} + + + = ({ ); }; + +// ─── Voice Picker Popover ────────────────────────────────────────────────── + +interface VoicePickerPopoverProps { + visible: boolean; + onClose: () => void; + anchorY: number; + anchorX: number; +} + +export const VoicePickerPopover: React.FC = ({ + visible, onClose, anchorY, anchorX, +}) => { + const { colors } = useTheme(); + const kokoroVoiceId = useTTSStore((s) => s.settings.kokoroVoiceId); + const isChangingVoice = useTTSStore((s) => s.settings.kokoroVoiceId !== s.kokoroActiveVoiceId); + const { isSpeaking, stop, updateSettings } = useTTSStore(); + + if (!visible) return null; + + const handleSelect = (voice: typeof KOKORO_VOICES[number]) => { + triggerHaptic('impactLight'); + // Stop playback first — KokoroTTSManager defers voice config changes + // until isSpeaking is false, so no native crash + if (isSpeaking) { stop(); } + updateSettings({ kokoroVoiceId: voice.id as KokoroVoiceId, speed: voice.defaultSpeed }); + onClose(); + }; + + return ( + + + + + + {KOKORO_VOICES.map((voice) => { + const isActive = voice.id === kokoroVoiceId; + return ( + handleSelect(voice)} + > + + + + {voice.label} + + + {voice.persona} + + + {isActive && ( + isChangingVoice + ? 
+ : + )} + + ); + })} + + + + + + ); +}; + +const voicePickerStyles = StyleSheet.create({ + popover: { + minWidth: 200, + }, + labelCol: { + flex: 1, + }, + accent: { + ...TYPOGRAPHY.meta, + marginTop: 1, + }, +}); diff --git a/src/components/ChatInput/Voice.ts b/src/components/ChatInput/Voice.ts index 1cc66a19..616b6bca 100644 --- a/src/components/ChatInput/Voice.ts +++ b/src/components/ChatInput/Voice.ts @@ -1,35 +1,195 @@ -import { useEffect, useRef } from 'react'; +import { useEffect, useRef, useState } from 'react'; import { useWhisperTranscription } from '../../hooks/useWhisperTranscription'; -import { useWhisperStore } from '../../stores'; +import { useWhisperStore, useChatStore } from '../../stores'; +import { useTTSStore } from '../../stores/ttsStore'; +import { llmService } from '../../services/llm'; +import { audioRecorderService } from '../../services/audioRecorderService'; +import { whisperService } from '../../services/whisperService'; +import logger from '../../utils/logger'; interface UseVoiceInputParams { conversationId?: string | null; onTranscript: (text: string) => void; + onAudioAttachment?: (uri: string, format: 'wav' | 'mp3', durationSeconds?: number) => void; + /** Called in Audio Mode to auto-send. Includes audio info so caller can build attachment atomically. 
*/ + onAutoSend?: (text: string, audio: { uri: string; format: 'wav' | 'mp3'; durationSeconds: number }) => void; } -export function useVoiceInput({ conversationId, onTranscript }: UseVoiceInputParams) { +export function useVoiceInput({ conversationId, onTranscript, onAudioAttachment, onAutoSend }: UseVoiceInputParams) { const recordingConversationIdRef = useRef(null); const onTranscriptRef = useRef(onTranscript); onTranscriptRef.current = onTranscript; + const onAudioAttachmentRef = useRef(onAudioAttachment); + onAudioAttachmentRef.current = onAudioAttachment; + const onAutoSendRef = useRef(onAutoSend); + onAutoSendRef.current = onAutoSend; const { downloadedModelId } = useWhisperStore(); + const [isDirectRecording, setIsDirectRecording] = useState(false); + const [isAudioModeRecording, setIsAudioModeRecording] = useState(false); + const [isTranscribingFile, setIsTranscribingFile] = useState(false); + const [directError, setDirectError] = useState(null); const { - isRecording, + isRecording: isWhisperRecording, isModelLoading, - isTranscribing, + isTranscribing: isWhisperTranscribing, partialResult, finalResult, - error, - startRecording: startRecordingBase, - stopRecording, + error: whisperError, + startRecording: startWhisperRecording, + stopRecording: stopWhisperRecording, clearResult, } = useWhisperTranscription(); - const voiceAvailable = !!downloadedModelId; + const supportsDirectAudio = (): boolean => { + const support = llmService.getMultimodalSupport(); + return Boolean(support?.audio) && audioRecorderService.supportsDirectAudioInput(); + }; + + const isInAudioInterfaceMode = (): boolean => + useTTSStore.getState().settings.interfaceMode === 'audio'; + + // Use file-based transcription path when: Audio Mode + Whisper available + not direct audio model + const shouldUseFilePath = (): boolean => + isInAudioInterfaceMode() && !!downloadedModelId && !supportsDirectAudio(); + + const isTranscribing = isWhisperTranscribing || isTranscribingFile; + const 
isRecording = isDirectRecording || isAudioModeRecording || isWhisperRecording; + const error = directError ?? whisperError; + + // voiceAvailable: direct audio OR whisper downloaded + const voiceAvailable = supportsDirectAudio() || !!downloadedModelId; const startRecording = async () => { recordingConversationIdRef.current = conversationId || null; - await startRecordingBase(); + setDirectError(null); + // Stop any TTS playback before recording — mic and speaker shouldn't overlap + const tts = useTTSStore.getState(); + if (tts.isSpeaking) { tts.stop(); } + + if (supportsDirectAudio()) { + try { + setIsDirectRecording(true); + await audioRecorderService.startRecording(); + } catch (err) { + setIsDirectRecording(false); + const msg = err instanceof Error ? err.message : 'Recording failed'; + logger.error('[Voice] Direct audio recording error:', err); + setDirectError(msg); + } + return; + } + + if (shouldUseFilePath()) { + try { + setIsAudioModeRecording(true); + await audioRecorderService.startRecording(); + } catch (err) { + setIsAudioModeRecording(false); + const msg = err instanceof Error ? 
err.message : 'Recording failed'; + logger.error('[Voice] Audio mode recording error:', err); + setDirectError(msg); + } + return; + } + + await startWhisperRecording(); + }; + + const stopRecording = async () => { + if (isDirectRecording) { + try { + const { path, durationSeconds } = await audioRecorderService.stopRecording(); + setIsDirectRecording(false); + if (!recordingConversationIdRef.current || recordingConversationIdRef.current === conversationId) { + const format = audioRecorderService.getFormat(); + // In Audio Mode, auto-send directly — no transcription needed for multimodal models + if (onAutoSendRef.current && isInAudioInterfaceMode()) { + onAutoSendRef.current('', { uri: path, format, durationSeconds }); + + // Parallel transcription: send audio to model immediately, transcribe in background + // so the voice bubble gets a transcript for display/playback review + if (downloadedModelId) { + const convId = conversationId; + whisperService.transcribeFile(path).then(text => { + if (!text?.trim() || !convId) return; + const conv = useChatStore.getState().conversations.find(c => c.id === convId); + const msg = conv?.messages.find(m => + m.role === 'user' && m.attachments?.some(a => a.uri === path), + ); + if (msg) { + useChatStore.getState().updateMessageContent(convId, msg.id, text.trim()); + } + }).catch(err => logger.error('[Voice] Background transcription error:', err)); + } + } else { + onAudioAttachmentRef.current?.(path, format, durationSeconds); + } + } + recordingConversationIdRef.current = null; + } catch (err) { + setIsDirectRecording(false); + logger.error('[Voice] Failed to stop direct recording:', err); + } + return; + } + + if (isAudioModeRecording) { + try { + const { path, durationSeconds } = await audioRecorderService.stopRecording(); + setIsAudioModeRecording(false); + if (recordingConversationIdRef.current && recordingConversationIdRef.current !== conversationId) { + recordingConversationIdRef.current = null; + return; + } + 
setIsTranscribingFile(true); + let text = ''; + try { + text = await whisperService.transcribeFile(path); + } catch (transcribeErr) { + logger.error('[Voice] File transcription error:', transcribeErr); + } + setIsTranscribingFile(false); + recordingConversationIdRef.current = null; + if (text.trim()) { + if (onAutoSendRef.current) { + onAutoSendRef.current(text.trim(), { uri: path, format: 'wav', durationSeconds }); + } else { + onAudioAttachmentRef.current?.(path, 'wav', durationSeconds); + onTranscriptRef.current(text.trim()); + } + } else { + // Transcription returned nothing — clip too short or too quiet + setDirectError("Couldn't hear that — try again"); + setTimeout(() => setDirectError(null), 3000); + } + } catch (err) { + setIsAudioModeRecording(false); + setIsTranscribingFile(false); + logger.error('[Voice] Failed to stop audio mode recording:', err); + } + return; + } + + await stopWhisperRecording(); + }; + + const cancelRecording = () => { + if (isDirectRecording) { + audioRecorderService.cancelRecording(); + setIsDirectRecording(false); + recordingConversationIdRef.current = null; + return; + } + if (isAudioModeRecording) { + audioRecorderService.cancelRecording(); + setIsAudioModeRecording(false); + recordingConversationIdRef.current = null; + return; + } + stopWhisperRecording(); + clearResult(); + recordingConversationIdRef.current = null; }; useEffect(() => { @@ -49,5 +209,20 @@ export function useVoiceInput({ conversationId, onTranscript }: UseVoiceInputPar } }, [finalResult, clearResult, conversationId]); - return { isRecording, isModelLoading, isTranscribing, partialResult, error, voiceAvailable, startRecording, stopRecording, clearResult }; + return { + isRecording, + isModelLoading, + isTranscribing, + partialResult, + error, + voiceAvailable, + startRecording, + stopRecording, + cancelRecording, + clearResult, + /** True when model accepts audio directly (no Whisper needed) */ + isDirectAudioMode: supportsDirectAudio(), + /** True when 
recording in Audio Mode for file-based transcription */ + isAudioModeRecording, + }; } diff --git a/src/components/ChatInput/index.tsx b/src/components/ChatInput/index.tsx index 1ebbb496..0f2a97a1 100644 --- a/src/components/ChatInput/index.tsx +++ b/src/components/ChatInput/index.tsx @@ -1,4 +1,4 @@ -import React, { useState, useRef, useEffect } from 'react'; +import React, { useState, useRef, useEffect, useMemo } from 'react'; import { View, TextInput, TouchableOpacity, Animated, StyleSheet } from 'react-native'; import Icon from 'react-native-vector-icons/Feather'; import { useTheme, useThemedStyles } from '../../theme'; @@ -13,6 +13,10 @@ import { AttachmentPreview, useAttachments } from './Attachments'; import { useVoiceInput } from './Voice'; import { QuickSettingsPopover, AttachPickerPopover } from './Popovers'; import { useKeyboardAwarePopover } from './useKeyboardAwarePopover'; +import { useTTSStore } from '../../stores/ttsStore'; +import { useAppStore } from '../../stores'; +import { KOKORO_VOICES } from '../../constants/kokoroModels'; +import { AudioModeLayout } from './AudioModeLayout'; interface ChatInputProps { onSend: (message: string, attachments?: MediaAttachment[], imageMode?: ImageModeState) => void; @@ -33,7 +37,6 @@ interface ChatInputProps { supportsToolCalling?: boolean; supportsThinking?: boolean; onRepairVision?: () => void; - /** When set, mounts a single AttachStep for that index. Only one at a time to avoid waypoint dots. 
*/ activeSpotlight?: number | null; } @@ -69,7 +72,9 @@ export const ChatInput: React.FC = ({ const [alertState, setAlertState] = useState(initialAlertState); const quickSettings = useKeyboardAwarePopover(); const attachPicker = useKeyboardAwarePopover(); + const voicePicker = useKeyboardAwarePopover(); const inputRef = useRef(null); + const attachmentsRef = useRef([]); const hasText = message.length > 0; const iconsAnim = useRef(new Animated.Value(0)).current; @@ -81,9 +86,17 @@ export const ChatInput: React.FC = ({ }).start(); }, [hasText, iconsAnim]); - const { attachments, removeAttachment, clearAttachments, handlePickImage, handlePickDocument } = useAttachments(setAlertState); + const { attachments, removeAttachment, clearAttachments, handlePickImage, handlePickDocument, addAudioAttachment } = useAttachments(setAlertState); + attachmentsRef.current = attachments; + const ttsInterfaceMode = useTTSStore((s) => s.settings.interfaceMode); + const kokoroVoiceId = useTTSStore((s) => s.settings.kokoroVoiceId); + const isAudioMode = ttsInterfaceMode === 'audio'; + const currentVoice = useMemo( + () => KOKORO_VOICES.find((v) => v.id === kokoroVoiceId) ?? KOKORO_VOICES[0], + [kokoroVoiceId], + ); - const { isRecording, isModelLoading, isTranscribing, partialResult, error, voiceAvailable, startRecording, stopRecording, clearResult } = useVoiceInput({ + const { isRecording, isModelLoading, isTranscribing, partialResult, error, voiceAvailable, startRecording, stopRecording, cancelRecording } = useVoiceInput({ conversationId, onTranscript: (text) => { setMessage(prev => { @@ -91,8 +104,33 @@ export const ChatInput: React.FC = ({ return prefix + text; }); }, + onAudioAttachment: (uri, format, durationSeconds) => { + addAudioAttachment(uri, format, durationSeconds); + }, + onAutoSend: isAudioMode ? 
(text, audio) => { + const audioAttachment: MediaAttachment = { + id: `audio-${Date.now()}`, + type: 'audio', + uri: audio.uri, + audioFormat: audio.format, + audioDurationSeconds: audio.durationSeconds, + fileName: audio.uri.split('/').pop(), + }; + triggerHaptic('impactMedium'); + const all = [...attachmentsRef.current, audioAttachment]; + onSend(text, all, imageMode); + clearAttachments(); + } : undefined, }); + const { settings: appSettings, updateSettings: updateAppSettings } = useAppStore(); + const thinkingEnabled = appSettings.thinkingEnabled; + + const handleThinkingToggle = () => { + triggerHaptic('impactLight'); + updateAppSettings({ thinkingEnabled: !thinkingEnabled }); + }; + const canSend = (message.trim().length > 0 || attachments.length > 0) && !disabled; const handleSend = () => { @@ -137,9 +175,49 @@ export const ChatInput: React.FC = ({ } }; - const handleQuickSettingsPress = () => quickSettings.show(); - - const handleAttachPress = () => attachPicker.show(); + // ─── Audio mode: simplified mic-only layout ───────────────────────────────── + if (isAudioMode) { + return ( + + ); + } const actionButton = canSend ? 
( = ({ disabled={disabled} onStartRecording={startRecording} onStopRecording={stopRecording} - onCancelRecording={() => { stopRecording(); clearResult(); }} + onCancelRecording={cancelRecording} asSendButton /> ); - const content = ( + return ( = ({ onClearQueue={onClearQueue} /> - {/* Pill: text input + right icons */} = ({ blurOnSubmit={false} returnKeyType="default" /> - {/* Icons collapse when user starts typing, reappear when input is empty */} = ({ overflow: 'hidden' as const, }]} > - {/* Attach button — opens picker for image or document */} attachPicker.show()} disabled={disabled} hitSlop={{ top: 4, bottom: 4, left: 4, right: 4 }} > - + - - {/* Quick settings button */} + {supportsThinking && ( + + + + )} quickSettings.show()} disabled={disabled} hitSlop={{ top: 4, bottom: 4, left: 4, right: 4 }} > - - {/* Circular action button — conditionally wrapped with AttachStep */} {activeSpotlight === 12 ? ( {actionButton} ) : actionButton} @@ -253,7 +331,6 @@ export const ChatInput: React.FC = ({ onPhoto={handleVisionPress} onDocument={handlePickDocument} /> - = ({ enabledToolCount={enabledToolCount} onToolsPress={onToolsPress} /> - = ({ /> ); - - return content; }; const spotlightStyles = StyleSheet.create({ centered: { alignSelf: 'center' }, }); - diff --git a/src/components/ChatInput/styles.ts b/src/components/ChatInput/styles.ts index a9f8df69..7aab9a88 100644 --- a/src/components/ChatInput/styles.ts +++ b/src/components/ChatInput/styles.ts @@ -1,5 +1,5 @@ import type { ThemeColors, ThemeShadows } from '../../theme'; -import { FONTS } from '../../constants'; +import { FONTS, TYPOGRAPHY, SPACING } from '../../constants'; import { Platform } from 'react-native'; export const PILL_ICON_SIZE = 32; @@ -208,4 +208,25 @@ export const createStyles = (colors: ThemeColors, _shadows: ThemeShadows) => ({ fontWeight: '500' as const, color: colors.primary, }, + // Audio mode layout + audioModeRow: { + flexDirection: 'row' as const, + alignItems: 'center' as const, + 
justifyContent: 'center' as const, + gap: SPACING.md, + paddingVertical: SPACING.xs, + }, + // Voice cycle button — shows icon + voice name + audioVoiceButton: { + flexDirection: 'row' as const, + alignItems: 'center' as const, + gap: 4, + paddingHorizontal: SPACING.sm, + height: 32, + borderRadius: 16, + }, + audioVoiceLabel: { + ...TYPOGRAPHY.meta, + color: colors.textSecondary, + }, }); diff --git a/src/components/ChatInput/useKeyboardAwarePopover.ts b/src/components/ChatInput/useKeyboardAwarePopover.ts index 13cdfaa4..dc4f0b7b 100644 --- a/src/components/ChatInput/useKeyboardAwarePopover.ts +++ b/src/components/ChatInput/useKeyboardAwarePopover.ts @@ -1,13 +1,15 @@ import { useRef, useEffect, useState, useCallback } from 'react'; import { Keyboard, Dimensions, Platform, StatusBar, TouchableOpacity } from 'react-native'; -import { SPACING } from '../../constants'; /** * Hook that manages keyboard-aware popover positioning. * When the keyboard is visible, dismisses it and waits for `keyboardDidHide` * before measuring position to ensure correct coordinates. + * + * anchorY → distance from screen bottom to trigger top (popover sits above trigger) + * anchorX → distance from screen right to trigger right edge (popover right-aligns with trigger) */ -export function useKeyboardAwarePopover(offsetX: number = SPACING.md) { +export function useKeyboardAwarePopover() { const [anchor, setAnchor] = useState({ y: 0, x: 0 }); const [visible, setVisible] = useState(false); const triggerRef = useRef>(null); @@ -27,13 +29,15 @@ export function useKeyboardAwarePopover(offsetX: number = SPACING.md) { const show = useCallback(() => { const measureAndShow = () => { - triggerRef.current?.measureInWindow?.((...args: number[]) => { - const screenH = Dimensions.get('window').height; - // On Android, measureInWindow Y includes the status bar but - // Dimensions.get('window').height may not — subtract the offset - // so the popover sits snugly above the trigger button. 
+ triggerRef.current?.measureInWindow?.((btnX: number, btnY: number, btnW: number) => { + const { height: screenH, width: screenW } = Dimensions.get('window'); + // On Android, measureInWindow Y includes the status bar height. const statusBarOffset = Platform.OS === 'android' ? (StatusBar.currentHeight ?? 0) : 0; - setAnchor({ y: screenH - (args[1] ?? 0) - statusBarOffset, x: offsetX }); + // bottom: how far the popover bottom sits above the screen bottom (= above the trigger) + const y = screenH - (btnY ?? 0) - statusBarOffset; + // right: align popover's right edge with the trigger button's right edge + const x = screenW - ((btnX ?? 0) + (btnW ?? 0)); + setAnchor({ y, x }); }); setVisible(true); }; @@ -54,7 +58,7 @@ export function useKeyboardAwarePopover(offsetX: number = SPACING.md) { } else { measureAndShow(); } - }, [offsetX]); + }, []); const hide = useCallback(() => setVisible(false), []); diff --git a/src/components/ChatMessage/components/ActionMenuSheet.tsx b/src/components/ChatMessage/components/ActionMenuSheet.tsx index 1f380fe2..802bc5db 100644 --- a/src/components/ChatMessage/components/ActionMenuSheet.tsx +++ b/src/components/ChatMessage/components/ActionMenuSheet.tsx @@ -12,11 +12,13 @@ interface ActionMenuSheetProps { canEdit: boolean; canRetry: boolean; canGenerateImage: boolean; + canSpeak: boolean; styles: any; onCopy: () => void; onEdit: () => void; onRetry: () => void; onGenerateImage: () => void; + onSpeak: () => void; } export function ActionMenuSheet({ @@ -26,11 +28,13 @@ export function ActionMenuSheet({ canEdit, canRetry, canGenerateImage, + canSpeak, styles, onCopy, onEdit, onRetry, onGenerateImage, + onSpeak, }: ActionMenuSheetProps) { const { colors } = useTheme(); @@ -89,6 +93,18 @@ export function ActionMenuSheet({ Generate Image )} + + {!isUser && canSpeak && ( + + + Speak + + )} ); diff --git a/src/components/ChatMessage/components/MessageAttachments.tsx b/src/components/ChatMessage/components/MessageAttachments.tsx index 
adead2c9..b798a2fc 100644 --- a/src/components/ChatMessage/components/MessageAttachments.tsx +++ b/src/components/ChatMessage/components/MessageAttachments.tsx @@ -78,7 +78,22 @@ export function MessageAttachments({ return ( {attachments.map((attachment, index) => - attachment.type === 'document' ? ( + attachment.type === 'audio' ? ( + + + + Voice message + + + ) : attachment.type === 'document' ? ( ); } + // No content but may have thinking — render ThinkingBlock alone (audio mode above-bubble use case) + if (parsedContent.thinking) { + return ( + + ); + } return null; } diff --git a/src/components/ChatMessage/index.tsx b/src/components/ChatMessage/index.tsx index d80310b7..6a6a20e4 100644 --- a/src/components/ChatMessage/index.tsx +++ b/src/components/ChatMessage/index.tsx @@ -1,6 +1,7 @@ import React, { useState } from 'react'; import { View, Text, TouchableOpacity, Clipboard } from 'react-native'; import { useTheme, useThemedStyles } from '../../theme'; +import { useTTSStore } from '../../stores/ttsStore'; import Icon from 'react-native-vector-icons/Feather'; import { stripControlTokens } from '../../utils/messageContent'; import { CustomAlert, showAlert, hideAlert, AlertState, initialAlertState } from '../CustomAlert'; @@ -133,14 +134,16 @@ type MetaRowProps = { isStreaming?: boolean; showActions: boolean; onMenuOpen: () => void; + metaExtra?: React.ReactNode; }; -const MessageMetaRow: React.FC = ({ message, styles, isStreaming, showActions, onMenuOpen }) => ( +const MessageMetaRow: React.FC = ({ message, styles, isStreaming, showActions, onMenuOpen, metaExtra }) => ( {formatTime(message.timestamp)} {message.generationTimeMs != null && message.role === 'assistant' && ( {formatDuration(message.generationTimeMs)} )} + {metaExtra} {showActions && !isStreaming && ( ••• @@ -157,7 +160,9 @@ const ToolCallWithThinking: React.FC<{ return ( {!!tc?.thinking && ( - + + + )} {hasText && ( @@ -179,11 +184,17 @@ export const ChatMessage: React.FC = ({ onGenerateImage, 
showActions = true, canGenerateImage = false, + canSpeak: canSpeakProp = false, + onSpeak: onSpeakProp, showGenerationDetails = false, animateEntry = false, + metaExtra, }) => { const { colors } = useTheme(); const styles = useThemedStyles(createStyles); + const ttsCanSpeak = useTTSStore( + s => s.settings.enabled && s.isBackboneDownloaded && s.isVocoderDownloaded, + ); const [showActionMenu, setShowActionMenu] = useState(false); const [isEditing, setIsEditing] = useState(false); const [editedContent, setEditedContent] = useState(message.content); @@ -242,6 +253,22 @@ export const ChatMessage: React.FC = ({ setShowActionMenu(false); }; + const canSpeak = !isUser && !isStreaming && (canSpeakProp || ttsCanSpeak); + + const handleSpeak = () => { + setShowActionMenu(false); + if (onSpeakProp) { + onSpeakProp(); + return; + } + const tts = useTTSStore.getState(); + if (!tts.isModelLoaded) { + tts.loadModels().then(() => useTTSStore.getState().speak(displayContent, message.id)); + } else { + tts.speak(displayContent, message.id); + } + }; + if (message.isSystemInfo) { return setAlertState(hideAlert())} />; @@ -291,6 +318,7 @@ export const ChatMessage: React.FC = ({ isStreaming={isStreaming} showActions={showActions} onMenuOpen={() => setShowActionMenu(true)} + metaExtra={metaExtra} /> {showGenerationDetails && !isUser && message.generationMeta && ( @@ -310,11 +338,13 @@ export const ChatMessage: React.FC = ({ canEdit={!!onEdit} canRetry={!!onRetry} canGenerateImage={canGenerateImage && !!onGenerateImage} + canSpeak={canSpeak} styles={styles} onCopy={handleCopy} onEdit={handleEdit} onRetry={handleRetry} onGenerateImage={handleGenerateImage} + onSpeak={handleSpeak} /> ({ overflow: 'hidden' as const, width: '100%' as const, }, + /** Constrains the ThinkingBlock when rendered outside a message bubble (e.g. 
ToolCallWithThinking) */ + thinkingBlockWrapper: { + width: '88%' as const, + alignSelf: 'flex-start' as const, + }, thinkingHeader: { flexDirection: 'row' as const, alignItems: 'flex-start' as const, diff --git a/src/components/ChatMessage/types.ts b/src/components/ChatMessage/types.ts index f93ef8ec..becd367a 100644 --- a/src/components/ChatMessage/types.ts +++ b/src/components/ChatMessage/types.ts @@ -10,8 +10,12 @@ export interface ChatMessageProps { onGenerateImage?: (prompt: string) => void; showActions?: boolean; canGenerateImage?: boolean; + canSpeak?: boolean; + onSpeak?: () => void; showGenerationDetails?: boolean; animateEntry?: boolean; + /** Extra element rendered at the end of the meta row (e.g. TTSButton) */ + metaExtra?: React.ReactNode; } export interface ParsedContent { diff --git a/src/components/GenerationSettingsModal/ImageQualitySliders.tsx b/src/components/GenerationSettingsModal/ImageQualitySliders.tsx index f1e0544d..2feac93a 100644 --- a/src/components/GenerationSettingsModal/ImageQualitySliders.tsx +++ b/src/components/GenerationSettingsModal/ImageQualitySliders.tsx @@ -1,6 +1,6 @@ import React from 'react'; import { View, Text, Switch, Platform, TouchableOpacity } from 'react-native'; -import Slider from '@react-native-community/slider'; +import { NumericStepper } from '../NumericStepper'; import { useTheme, useThemedStyles } from '../../theme'; import { useAppStore } from '../../stores'; import { useClearGpuCache } from '../../hooks/useImageGenerationSettings'; @@ -24,70 +24,38 @@ const ClearGPUCacheButton: React.FC = () => { ); }; -/** Basic sliders: Image Steps + Image Size */ +/** Basic controls: Image Steps + Image Size */ export const ImageQualityBasicSliders: React.FC = () => { - const { colors } = useTheme(); const styles = useThemedStyles(createStyles); const { settings, updateSettings } = useAppStore(); return ( <> - - Image Steps - {settings.imageSteps || 8} - - - 4-8 steps for speed, 20-50 for quality - - Image Steps + 4-8 
steps for speed, 20-50 for quality + updateSettings({ imageSteps: value })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surfaceLight} - thumbTintColor={colors.primary} + min={4} max={50} step={1} + onChange={(value) => updateSettings({ imageSteps: value })} /> - - 4 - 50 - - - Image Size - - {settings.imageWidth ?? 256}x{settings.imageHeight ?? 256} - - - - Output resolution (smaller = faster, larger = more detail) - - Image Size + Output resolution (smaller = faster, larger = more detail) + updateSettings({ imageWidth: value, imageHeight: value })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surfaceLight} - thumbTintColor={colors.primary} + min={128} max={512} step={64} + formatValue={(v) => `${v}x${v}`} + onChange={(value) => updateSettings({ imageWidth: value, imageHeight: value })} /> - - 128 - 512 - ); }; -/** Advanced sliders: Guidance Scale, Image Threads, GPU Acceleration */ +/** Advanced controls: Guidance Scale, Image Threads, GPU Acceleration */ export const ImageQualityAdvancedSliders: React.FC = () => { const { colors } = useTheme(); const styles = useThemedStyles(createStyles); @@ -96,53 +64,23 @@ export const ImageQualityAdvancedSliders: React.FC = () => { return ( <> - - Guidance Scale - {(settings.imageGuidanceScale || 7.5).toFixed(1)} - - - Higher = follows prompt more strictly (5-15 range) - - Guidance Scale + Higher = follows prompt more strictly (5-15 range) + updateSettings({ imageGuidanceScale: value })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surfaceLight} - thumbTintColor={colors.primary} + min={1} max={20} step={0.5} decimals={1} + onChange={(value) => updateSettings({ imageGuidanceScale: value })} /> - - 1 - 20 - - - Image Threads - {settings.imageThreads ?? 4} - - - CPU threads used for image generation. Takes effect next time the image model loads. - - Image Threads + CPU threads used for image generation. 
Takes effect next time the image model loads. + updateSettings({ imageThreads: value })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surfaceLight} - thumbTintColor={colors.primary} + min={1} max={8} step={1} + onChange={(value) => updateSettings({ imageThreads: value })} /> - - 1 - 8 - {Platform.OS === 'android' && ( @@ -157,7 +95,7 @@ export const ImageQualityAdvancedSliders: React.FC = () => { /> - Use GPU for faster image generation. First run may be slower while optimizing for your device. For best performance, use NPU models on supported Snapdragon devices. + Use GPU for faster image generation. First run may be slower while optimizing for your device. {(settings.imageUseOpenCL ?? true) && } diff --git a/src/components/GenerationSettingsModal/TTSSection.tsx b/src/components/GenerationSettingsModal/TTSSection.tsx new file mode 100644 index 00000000..a4a7af8d --- /dev/null +++ b/src/components/GenerationSettingsModal/TTSSection.tsx @@ -0,0 +1,250 @@ +import React from 'react'; +import { View, Text, Switch, TouchableOpacity, ActivityIndicator } from 'react-native'; +import Icon from 'react-native-vector-icons/Feather'; +import { NumericStepper } from '../NumericStepper'; +import { useTheme, useThemedStyles } from '../../theme'; +import type { ThemeColors, ThemeShadows } from '../../theme'; +import { SPACING } from '../../constants'; +import { useTTSStore } from '../../stores/ttsStore'; +import { KOKORO_VOICES, isExecutorchSupported } from '../../constants/kokoroModels'; +import type { KokoroVoiceId } from '../../constants/kokoroModels'; +import { createStyles as createModalStyles } from './styles'; + +const createLocalStyles = (colors: ThemeColors, _shadows: ThemeShadows) => ({ + modeChipDisabled: { opacity: 0.4 as const }, + linkButton: { + alignSelf: 'flex-start' as const, + paddingHorizontal: SPACING.md, + paddingVertical: SPACING.sm, + borderRadius: 8, + borderWidth: 1, + borderColor: colors.border, + marginTop: SPACING.sm, + }, + 
linkButtonRow: { flexDirection: 'row' as const, alignItems: 'center' as const, gap: SPACING.xs }, + flex1: { flex: 1 }, + toggleRow: { + flexDirection: 'row' as const, + alignItems: 'center' as const, + justifyContent: 'space-between' as const, + marginBottom: SPACING.lg, + }, + toggleInfo: { flex: 1 }, + noBottomMargin: { marginBottom: 0 }, + divider: { height: 1, backgroundColor: colors.border, marginBottom: SPACING.lg }, + voiceRow: { + flexDirection: 'row' as const, + alignItems: 'center' as const, + justifyContent: 'space-between' as const, + paddingVertical: SPACING.sm, + }, + voiceRowBorder: { borderTopWidth: 1, borderTopColor: colors.border }, + voiceInfo: { flex: 1 }, + voiceName: { fontSize: 13, color: colors.text }, + voiceMeta: { fontSize: 11, color: colors.textMuted, marginTop: 2 }, + voiceSectionHeader: { + flexDirection: 'row' as const, + alignItems: 'center' as const, + justifyContent: 'space-between' as const, + marginBottom: SPACING.sm, + }, + voiceSectionLabel: { fontSize: 11, color: colors.textMuted, textTransform: 'uppercase' as const, letterSpacing: 0.3 }, + downloadRow: { flexDirection: 'row' as const, alignItems: 'center' as const, gap: SPACING.sm, marginBottom: SPACING.md }, + downloadText: { fontSize: 12, color: colors.textSecondary, flex: 1 }, +}); + +// ─── Mode Picker ────────────────────────────────────────────────────────────── + +const ModePicker: React.FC<{ areBothDownloaded: boolean }> = ({ areBothDownloaded }) => { + const modal = useThemedStyles(createModalStyles); + const local = useThemedStyles(createLocalStyles); + const { + settings, updateSettings, + isModelLoaded, loadModels, unloadModels, + kokoroReady, + } = useTTSStore(); + const mode = settings.interfaceMode; + // Audio mode needs OuteTTS (waveform generation) + const audioEnabled = areBothDownloaded; + + const handleModeChange = (next: 'chat' | 'audio') => { + if (next === 'audio' && !audioEnabled) { return; } + updateSettings({ interfaceMode: next }); + if (next === 
'audio' && !isModelLoaded && areBothDownloaded) { loadModels(); } + if (next === 'chat' && isModelLoaded && !kokoroReady) { unloadModels(); } + }; + + return ( + + + Interface Mode + + {mode === 'audio' + ? 'Audio Mode — responses rendered as voice notes' + : 'Chat Mode — play button added to text messages'} + + + + {(['chat', 'audio'] as const).map((m) => { + const active = mode === m; + const disabled = m === 'audio' && !audioEnabled; + return ( + handleModeChange(m)} + disabled={disabled} + > + + {m === 'chat' ? 'Chat' : 'Audio'} + + + ); + })} + + + ); +}; + +// ─── Voice Picker ───────────────────────────────────────────────────────────── + +const VoicePicker: React.FC = () => { + const { colors } = useTheme(); + const local = useThemedStyles(createLocalStyles); + const { settings, updateSettings, kokoroReady, kokoroDownloadProgress, kokoroActiveVoiceId } = useTTSStore(); + const isChangingVoice = settings.kokoroVoiceId !== kokoroActiveVoiceId; + const supported = isExecutorchSupported(); + + return ( + + + Voice + {supported && !kokoroReady && ( + kokoroDownloadProgress > 0 + ? {Math.round(kokoroDownloadProgress * 100)}% + : + )} + {supported && kokoroReady && ( + + )} + {!supported && ( + Android 13+ only + )} + + + {KOKORO_VOICES.map((voice, i) => { + const active = settings.kokoroVoiceId === voice.id; + return ( + 0 && local.voiceRowBorder]} + onPress={() => updateSettings({ kokoroVoiceId: voice.id as KokoroVoiceId })} + disabled={!supported} + > + + + {voice.label} + + {voice.accent} · {voice.gender} + + {active && ( + isChangingVoice + ? 
+ : + )} + + ); + })} + + + + ); +}; + +// ─── Main TTS Section ───────────────────────────────────────────────────────── + +interface TTSSectionProps { + onNavigateToTTSSettings?: () => void; +} + +export const TTSSection: React.FC = ({ onNavigateToTTSSettings }) => { + const { colors } = useTheme(); + const modal = useThemedStyles(createModalStyles); + const local = useThemedStyles(createLocalStyles); + const { + settings, updateSettings, + isBackboneDownloaded, isVocoderDownloaded, + kokoroReady, + } = useTTSStore(); + + const areBothDownloaded = isBackboneDownloaded && isVocoderDownloaded; + const hasAnySpeech = kokoroReady || areBothDownloaded; + const trackColor = { false: colors.surfaceLight, true: `${colors.primary}80` }; + const isChatMode = settings.interfaceMode === 'chat'; + + if (!hasAnySpeech) { + return ( + + + No voice models downloaded. Go to TTS Settings to download them. + + {onNavigateToTTSSettings && ( + + + + TTS Settings + + + )} + + ); + } + + return ( + + + + {isChatMode && ( + + + Enable TTS + Show play buttons on assistant messages + + updateSettings({ enabled: v })} + trackColor={trackColor} + thumbColor={settings.enabled ? colors.primary : colors.textMuted} + /> + + )} + + + + + Speed + `${v.toFixed(1)}x`} + onChange={(v) => updateSettings({ speed: v })} + /> + + + {isChatMode && ( + + + Auto-play + Speak AI responses automatically + + updateSettings({ autoPlay: v })} + trackColor={trackColor} + thumbColor={settings.autoPlay ? 
colors.primary : colors.textMuted} + /> + + )} + + ); +}; diff --git a/src/components/GenerationSettingsModal/TextGenerationAdvanced.tsx b/src/components/GenerationSettingsModal/TextGenerationAdvanced.tsx index 0b017e57..3d44a999 100644 --- a/src/components/GenerationSettingsModal/TextGenerationAdvanced.tsx +++ b/src/components/GenerationSettingsModal/TextGenerationAdvanced.tsx @@ -1,7 +1,7 @@ import React from 'react'; import { View, Text, TouchableOpacity } from 'react-native'; -import Slider from '@react-native-community/slider'; -import { useTheme, useThemedStyles } from '../../theme'; +import { NumericStepper } from '../NumericStepper'; +import { useThemedStyles } from '../../theme'; import { useAppStore } from '../../stores'; import { CacheType } from '../../types'; import { @@ -15,7 +15,6 @@ import { createStyles } from './styles'; // ─── GPU Acceleration ───────────────────────────────────────────────────────── export const GpuAccelerationToggle: React.FC = () => { - const { colors } = useTheme(); const styles = useThemedStyles(createStyles); const { settings, updateSettings } = useAppStore(); const { gpuLayersEffective, handleGpuToggle } = useTextGenerationAdvanced(); @@ -51,24 +50,15 @@ export const GpuAccelerationToggle: React.FC = () => { {settings.enableGpu && ( - - GPU Layers - {gpuLayersEffective} - + GPU Layers Layers offloaded to GPU. Higher = faster but may crash on low-VRAM devices. Requires model reload. 
- updateSettings({ gpuLayers: value })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surfaceLight} - thumbTintColor={colors.primary} + min={1} max={GPU_LAYERS_MAX} step={1} + onChange={(value) => updateSettings({ gpuLayers: value })} /> )} @@ -199,56 +189,34 @@ export const ModelLoadingStrategyToggle: React.FC = () => { // ─── CPU Threads & Batch Size ──────────────────────────────────────────────── export const CpuThreadsSlider: React.FC = () => { - const { colors } = useTheme(); const styles = useThemedStyles(createStyles); const { settings, updateSettings } = useAppStore(); - const value = settings.nThreads ?? 6; return ( - - CPU Threads - {value} - + CPU Threads Parallel threads for inference - updateSettings({ nThreads: v })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surfaceLight} - thumbTintColor={colors.primary} + updateSettings({ nThreads: v })} /> ); }; export const BatchSizeSlider: React.FC = () => { - const { colors } = useTheme(); const styles = useThemedStyles(createStyles); const { settings, updateSettings } = useAppStore(); - const value = settings.nBatch ?? 
512; return ( - - Batch Size - {value} - + Batch Size Tokens processed per batch - updateSettings({ nBatch: v })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surfaceLight} - thumbTintColor={colors.primary} + updateSettings({ nBatch: v })} /> ); diff --git a/src/components/GenerationSettingsModal/TextGenerationSection.tsx b/src/components/GenerationSettingsModal/TextGenerationSection.tsx index 18ed0c03..9ef8070d 100644 --- a/src/components/GenerationSettingsModal/TextGenerationSection.tsx +++ b/src/components/GenerationSettingsModal/TextGenerationSection.tsx @@ -1,6 +1,6 @@ import React, { useState } from 'react'; import { View, Text, TouchableOpacity, Platform } from 'react-native'; -import Slider from '@react-native-community/slider'; +import { NumericStepper } from '../NumericStepper'; import { AdvancedToggle } from '../AdvancedToggle'; import { useTheme, useThemedStyles } from '../../theme'; import { useAppStore } from '../../stores'; @@ -103,35 +103,23 @@ const SettingSlider: React.FC = ({ config }) => { const rawValue = (settings as Record)[config.key]; const value = (rawValue ?? DEFAULT_SETTINGS[config.key]) as number; const warningText = config.warning?.(value) ?? null; + const decimals = config.step < 1 ? 
2 : 0; return ( - - {config.label} - {config.format(value)} - + {config.label} {config.description && ( {config.description} )} {warningText && ( {warningText} )} - updateSettings({ [config.key]: v })} - onSlidingComplete={() => {}} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surfaceLight} - thumbTintColor={colors.primary} + min={config.min} max={config.max} step={config.step} decimals={decimals} + formatValue={config.format} + onChange={(v) => updateSettings({ [config.key]: v })} /> - - {config.format(config.min)} - {config.format(config.max)} - ); }; diff --git a/src/components/GenerationSettingsModal/index.tsx b/src/components/GenerationSettingsModal/index.tsx index b23a3b74..fa54ea96 100644 --- a/src/components/GenerationSettingsModal/index.tsx +++ b/src/components/GenerationSettingsModal/index.tsx @@ -9,6 +9,7 @@ import { createStyles } from './styles'; import { ConversationActionsSection } from './ConversationActionsSection'; import { ImageGenerationSection } from './ImageGenerationSection'; import { TextGenerationSection } from './TextGenerationSection'; +import { TTSSection } from './TTSSection'; const DEFAULT_SETTINGS = { temperature: 0.7, @@ -26,6 +27,7 @@ interface GenerationSettingsModalProps { onOpenProject?: () => void; onOpenGallery?: () => void; onDeleteConversation?: () => void; + onOpenTTSSettings?: () => void; conversationImageCount?: number; activeProjectName?: string | null; isRemote?: boolean; @@ -37,6 +39,7 @@ export const GenerationSettingsModal: React.FC = ( onOpenProject, onOpenGallery, onDeleteConversation, + onOpenTTSSettings, conversationImageCount = 0, activeProjectName, isRemote, @@ -48,6 +51,7 @@ export const GenerationSettingsModal: React.FC = ( const [performanceStats, setPerformanceStats] = useState(llmService.getPerformanceStats()); const [imageSettingsOpen, setImageSettingsOpen] = useState(false); const [textSettingsOpen, setTextSettingsOpen] = useState(false); + const [ttsSettingsOpen, 
setTtsSettingsOpen] = useState(false); useEffect(() => { if (visible) { @@ -144,6 +148,23 @@ export const GenerationSettingsModal: React.FC = ( )} + {/* TTS SETTINGS */} + setTtsSettingsOpen(!ttsSettingsOpen)} + activeOpacity={0.7} + > + TEXT TO SPEECH + + + {ttsSettingsOpen && ( + + )} + Reset to Defaults diff --git a/src/components/KokoroTTSManager.tsx b/src/components/KokoroTTSManager.tsx new file mode 100644 index 00000000..77799daf --- /dev/null +++ b/src/components/KokoroTTSManager.tsx @@ -0,0 +1,171 @@ +/** + * KokoroTTSManager + * + * Mounts the react-native-executorch useTextToSpeech hook and exposes its + * speak/stop methods via module-level refs so they can be called from the + * ttsStore without a React context dependency. + * + * Mount exactly once, near the root (App.tsx), only on supported platforms. + * On Android <26 / iOS <17 this component should not be rendered at all. + * + * Voice changes use a key-based remount strategy: the outer component manages + * voice switching with a cooldown, then remounts the inner component with a new + * key so executorch gets a clean teardown/init cycle (avoids native SIGSEGV). 
+ */ +import React, { useEffect, useRef } from 'react'; +import { useTextToSpeech } from 'react-native-executorch'; +import { AudioContext } from 'react-native-audio-api'; +import { useTTSStore } from '../stores/ttsStore'; +import { KOKORO_MEDIUM, getKokoroVoiceConfig } from '../constants/kokoroModels'; +import type { KokoroVoiceId } from '../constants/kokoroModels'; +import logger from '../utils/logger'; + +// ─── Module-level refs (callable from ttsStore without React context) ───────── + +let _streamFn: ((text: string, speed: number) => Promise) | null = null; +let _stopFn: ((instant?: boolean) => void) | null = null; +let _audioCtxRef: { current: AudioContext | null } = { current: null }; +// Pending onNext resolvers — force-resolved on stop so isSpeaking is always cleared +const _pendingResolvers: Set<() => void> = new Set(); +// When true, onEnd skips ctx.suspend() so the next chunk can start cleanly +let _skipSuspendOnEnd = false; +/** Timestamp of the last stream completion/stop — used by voice change cooldown */ +let _lastStreamEndTime = 0; + +export const kokoroRef = { + speak: (text: string, speed = 1.0): Promise => + _streamFn ? 
_streamFn(text, speed) : Promise.resolve(), + /** Call before sequential chunks to prevent AudioContext suspension between them */ + setKeepAlive: (keepAlive: boolean) => { _skipSuspendOnEnd = keepAlive; }, + stop: (instant = true) => { + _pendingResolvers.forEach((resolve) => resolve()); + _pendingResolvers.clear(); + _stopFn?.(instant); + _lastStreamEndTime = Date.now(); + }, + /** Pause playback — suspends AudioContext, Kokoro waits for onNext to resolve */ + pause: () => { _audioCtxRef.current?.suspend().catch(() => {}); }, + /** Resume playback — AudioContext resumes, current chunk finishes, Kokoro continues */ + resume: () => { _audioCtxRef.current?.resume().catch(() => {}); }, +}; + +// ─── Inner component — holds the useTextToSpeech hook for a single voice ───── + +const KokoroTTSInner: React.FC<{ voiceId: KokoroVoiceId }> = ({ voiceId }) => { + const audioCtxRef = useRef(null); + _audioCtxRef = audioCtxRef; + + const tts = useTextToSpeech({ + model: KOKORO_MEDIUM, + voice: getKokoroVoiceConfig(voiceId), + }); + + // Sync isReady + downloadProgress into ttsStore + useEffect(() => { + logger.log('[Kokoro] isReady=', tts.isReady, 'downloadProgress=', tts.downloadProgress, 'voiceId=', voiceId); + useTTSStore.getState().setKokoroState(tts.isReady, tts.downloadProgress); + if (tts.isReady) { + logger.log('[Kokoro] Setting kokoroActiveVoiceId to', voiceId); + useTTSStore.getState().setKokoroActiveVoiceId(voiceId); + } + }, [tts.isReady, tts.downloadProgress, voiceId]); + + useEffect(() => { + if (tts.error) { + logger.warn('[Kokoro] Runtime error — falling back to OuteTTS:', tts.error); + useTTSStore.getState().setKokoroState(false, 0); + } + }, [tts.error]); + + // Keep module refs pointing to the latest hook functions on every render + _streamFn = async (text: string, speed: number) => { + if (!audioCtxRef.current || audioCtxRef.current.state === 'closed') { + audioCtxRef.current = new AudioContext({ sampleRate: 24000 }); + } else if (audioCtxRef.current.state 
=== 'suspended') { + await audioCtxRef.current.resume().catch(() => {}); + } + const ctx = audioCtxRef.current; + + try { + await tts.stream({ + text, + speed, + onNext: (chunk: Float32Array) => + new Promise((resolve) => { + _pendingResolvers.add(resolve); + const done = () => { _pendingResolvers.delete(resolve); resolve(); }; + useTTSStore.getState().setAudioPlaying(true); + const currentSpeed = useTTSStore.getState().settings.speed; + const buffer = ctx.createBuffer(1, chunk.length, 24000); + buffer.copyToChannel(chunk, 0); + const source = ctx.createBufferSource(); + source.buffer = buffer; + source.playbackRate.value = currentSpeed; + source.connect(ctx.destination); + source.onEnded = done; + source.start(); + }), + onEnd: async () => { + if (!_skipSuspendOnEnd) { + await ctx.suspend().catch(() => {}); + } + }, + }); + } catch (err) { + logger.error('[Kokoro] stream error:', err); + throw err; + } + }; + + _stopFn = (instant = true) => { + tts.streamStop(instant); + audioCtxRef.current?.close().catch(() => {}); + audioCtxRef.current = null; + }; + + // Clear refs on unmount so stale closures don't fire during voice switch + useEffect(() => { + return () => { + logger.log('[Kokoro] Inner unmounting, clearing refs'); + _streamFn = null; + _stopFn = null; + }; + }, []); + + return null; +}; + +// ─── Outer component — manages voice switching via key-based remount ───────── + +export const KokoroTTSManager: React.FC = () => { + const kokoroVoiceId = useTTSStore(s => s.settings.kokoroVoiceId) as KokoroVoiceId; + const isSpeaking = useTTSStore(s => s.isSpeaking); + + // activeVoiceId controls which voice the inner component is mounted with. + // Changed only after a cooldown to give executorch time to clean up. 
+ const [activeVoiceId, setActiveVoiceId] = React.useState(kokoroVoiceId); + const cooldownRef = useRef | null>(null); + + React.useEffect(() => { + logger.log('[Kokoro] Voice effect: kokoroVoiceId=', kokoroVoiceId, 'activeVoiceId=', activeVoiceId, 'isSpeaking=', isSpeaking); + if (isSpeaking || kokoroVoiceId === activeVoiceId) { + if (cooldownRef.current) { clearTimeout(cooldownRef.current); cooldownRef.current = null; } + return; + } + const elapsed = Date.now() - _lastStreamEndTime; + const waitMs = Math.max(100, 2000 - elapsed); + logger.log('[Kokoro] Starting voice change cooldown:', waitMs, 'ms'); + // Mark Kokoro as not ready during the switch so UI shows loader + useTTSStore.getState().setKokoroState(false, 0); + cooldownRef.current = setTimeout(() => { + logger.log('[Kokoro] Cooldown done, remounting with voice', kokoroVoiceId); + setActiveVoiceId(kokoroVoiceId); + cooldownRef.current = null; + }, waitMs); + return () => { if (cooldownRef.current) { clearTimeout(cooldownRef.current); cooldownRef.current = null; } }; + }, [kokoroVoiceId, isSpeaking, activeVoiceId]); + + // Key-based remount: when activeVoiceId changes, the inner component + // fully unmounts (executorch teardown) then remounts (fresh init). 
+ return ; +}; diff --git a/src/components/MarkdownText.tsx b/src/components/MarkdownText.tsx index 78d6c9ae..233a606a 100644 --- a/src/components/MarkdownText.tsx +++ b/src/components/MarkdownText.tsx @@ -1,5 +1,5 @@ import React, { useCallback, useMemo } from 'react'; -import { Linking, Pressable, Text, StyleSheet } from 'react-native'; +import { Linking, Text } from 'react-native'; import Markdown from '@ronradtke/react-native-markdown-display'; import { useTheme } from '../theme'; import type { ThemeColors } from '../theme'; @@ -14,21 +14,17 @@ export function preprocessMarkdown(text: string): string { return text.replaceAll(/(\d)\*(?=\d)/g, String.raw`$1\*`); } -const linkWrapperStyles = StyleSheet.create({ - pressable: { flexShrink: 1, paddingBottom: 6 }, -}); - -/** Custom link rule that constrains the Pressable wrapper width */ +/** Custom link rule — renders as inline Text so it wraps correctly inside list items */ function createLinkRule(onPress: (url: string) => void) { - return (node: any, renderChildren: any, _parent: any) => ( - ( + onPress(node.attributes?.href ?? 
'')} > - {renderChildren} - + {children} + ); } diff --git a/src/components/NumericStepper.tsx b/src/components/NumericStepper.tsx new file mode 100644 index 00000000..342cc669 --- /dev/null +++ b/src/components/NumericStepper.tsx @@ -0,0 +1,105 @@ +import React from 'react'; +import { View, Text, TouchableOpacity, StyleSheet } from 'react-native'; +import Icon from 'react-native-vector-icons/Feather'; +import { useTheme } from '../theme'; +import { TYPOGRAPHY, SPACING } from '../constants'; + +interface NumericStepperProps { + value: number; + min: number; + max: number; + step: number; + decimals?: number; + onChange: (value: number) => void; + formatValue?: (value: number) => string; + testID?: string; +} + +export const NumericStepper: React.FC = ({ + value, + min, + max, + step, + decimals = 0, + onChange, + formatValue, + testID, +}) => { + const { colors } = useTheme(); + + const round = (v: number) => Math.round(v / step) * step; + + const decrement = () => { + const next = round(value - step); + if (next >= min) onChange(parseFloat(next.toFixed(decimals))); + }; + + const increment = () => { + const next = round(value + step); + if (next <= max) onChange(parseFloat(next.toFixed(decimals))); + }; + + const display = formatValue ? 
formatValue(value) : value.toFixed(decimals); + const canDecrement = value > min; + const canIncrement = value < max; + + return ( + + + + + + + {display} + + + + + + + ); +}; + +const styles = StyleSheet.create({ + row: { + flexDirection: 'row', + alignItems: 'center', + justifyContent: 'center', + gap: SPACING.sm, + marginTop: SPACING.sm, + }, + button: { + width: 32, + height: 32, + borderRadius: 8, + borderWidth: 1, + alignItems: 'center', + justifyContent: 'center', + }, + buttonDisabled: { + opacity: 0.35, + }, + value: { + ...TYPOGRAPHY.body, + fontWeight: '400', + minWidth: 72, + textAlign: 'center', + paddingHorizontal: SPACING.sm, + paddingVertical: SPACING.xs, + borderRadius: 8, + borderWidth: 1, + overflow: 'hidden', + }, +}); diff --git a/src/components/TTSButton/index.tsx b/src/components/TTSButton/index.tsx new file mode 100644 index 00000000..c33a18b7 --- /dev/null +++ b/src/components/TTSButton/index.tsx @@ -0,0 +1,117 @@ +import React, { useEffect } from 'react'; +import { TouchableOpacity, ActivityIndicator, StyleSheet } from 'react-native'; +import Animated, { + useSharedValue, + useAnimatedStyle, + withRepeat, + withSequence, + withTiming, +} from 'react-native-reanimated'; +import Icon from 'react-native-vector-icons/Feather'; +import { useTheme } from '../../theme'; +import { useTTSStore } from '../../stores/ttsStore'; +import { SPACING } from '../../constants'; + +interface TTSButtonProps { + text: string; + messageId: string; +} + +export const TTSButton: React.FC = ({ text, messageId }) => { + const { colors } = useTheme(); + const { + speak, + stop, + isSpeaking, + isGeneratingAudio, + isModelLoading, + isModelLoaded, + currentMessageId, + settings, + isBackboneDownloaded, + isVocoderDownloaded, + kokoroReady, + loadModels, + } = useTTSStore(); + + const areBothDownloaded = isBackboneDownloaded && isVocoderDownloaded; + const isThisMessage = currentMessageId === messageId; + // Kokoro streams so no separate generation phase — only OuteTTS 
sets isGeneratingAudio + const isThisMessageGenerating = isGeneratingAudio && isThisMessage; + const isThisMessageSpeaking = isSpeaking && !isGeneratingAudio && isThisMessage; + + // Button is usable if Kokoro is ready (fast path) OR OuteTTS is downloaded (slow path) + const canSpeak = kokoroReady || areBothDownloaded; + + const opacity = useSharedValue(1); + useEffect(() => { + if (isThisMessageSpeaking) { + opacity.value = withRepeat( + withSequence( + withTiming(0.4, { duration: 600 }), + withTiming(1, { duration: 600 }), + ), + -1, + false, + ); + } else { + opacity.value = withTiming(1, { duration: 200 }); + } + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [isThisMessageSpeaking]); + + const animatedStyle = useAnimatedStyle(() => ({ opacity: opacity.value })); + + // Don't render if TTS disabled or no model is usable (Kokoro or OuteTTS) + if (!settings.enabled || !canSpeak) { + return null; + } + + // Show spinner while model is loading for this message, or while generating audio tokens + if ((isModelLoading && isThisMessage) || isThisMessageGenerating) { + return ; + } + + const handlePress = () => { + if (isThisMessageSpeaking || isThisMessageGenerating) { + stop(); + return; + } + // Kokoro: ready immediately, no model loading step needed + if (kokoroReady) { + speak(text, messageId); + return; + } + // OuteTTS fallback: load models on first press if needed + if (!isModelLoaded) { + loadModels().then(() => { + useTTSStore.getState().speak(text, messageId); + }); + return; + } + speak(text, messageId); + }; + + return ( + + + + + + ); +}; + +const styles = StyleSheet.create({ + button: { + padding: SPACING.xs, + }, +}); diff --git a/src/components/VoiceRecordButton/index.tsx b/src/components/VoiceRecordButton/index.tsx index bd1cca73..6844c05f 100644 --- a/src/components/VoiceRecordButton/index.tsx +++ b/src/components/VoiceRecordButton/index.tsx @@ -9,6 +9,7 @@ import { PanResponderGestureState, Vibration, } from 'react-native'; +import Icon 
from 'react-native-vector-icons/Feather'; import ReanimatedAnimated, { useSharedValue, useAnimatedStyle, @@ -16,15 +17,16 @@ import ReanimatedAnimated, { withTiming, Easing, } from 'react-native-reanimated'; -import { useNavigation } from '@react-navigation/native'; -import { NativeStackNavigationProp } from '@react-navigation/native-stack'; import { useThemedStyles } from '../../theme'; import { CustomAlert, showAlert, hideAlert, AlertState, initialAlertState } from '../CustomAlert'; import { createStyles } from './styles'; import { LoadingState, TranscribingState, UnavailableButton, ButtonIcon } from './states'; -import { RootStackParamList } from '../../navigation/types'; +import { useWhisperStore } from '../../stores'; import logger from '../../utils/logger'; +const DOWNLOAD_MODEL_ID = 'small.en'; +const DOWNLOAD_MODEL_SIZE_MB = 466; + interface VoiceRecordButtonProps { isRecording: boolean; isAvailable: boolean; @@ -95,7 +97,7 @@ export const VoiceRecordButton: React.FC = ({ isModelLoading, isTranscribing, partialResult, - error, + error: _error, disabled, onStartRecording, onStopRecording, @@ -103,7 +105,7 @@ export const VoiceRecordButton: React.FC = ({ asSendButton = false, }) => { const styles = useThemedStyles(createStyles); - const navigation = useNavigation>(); + const { downloadModel, isDownloading, downloadProgress } = useWhisperStore(); const pulseAnim = useRef(new Animated.Value(1)).current; const loadingAnim = useRef(new Animated.Value(0)).current; @@ -125,6 +127,7 @@ export const VoiceRecordButton: React.FC = ({ rippleOpacity.value = 0; } + // eslint-disable-next-line react-hooks/exhaustive-deps }, [isRecording]); const rippleStyle = useAnimatedStyle(() => ({ @@ -161,15 +164,20 @@ export const VoiceRecordButton: React.FC = ({ const panResponder = useRef(buildPanResponder({ isDraggingToCancel, cancelOffsetX, callbacksRef })).current; const handleUnavailableTap = () => { - const errorDetail = error || 'No transcription model downloaded'; + if 
(isDownloading) { return; } setAlertState(showAlert( - 'Voice Input Unavailable', - `${errorDetail}\n\nDownload a Whisper model to enable on-device voice input.`, + 'Download Voice Model', + `Download Whisper Small to enable voice input? (${DOWNLOAD_MODEL_SIZE_MB} MB)`, [ - { text: 'Cancel' }, + { text: 'Cancel', style: 'cancel' }, { - text: 'Go to Voice Settings', - onPress: () => navigation.navigate('VoiceSettings'), + text: 'Download', + onPress: () => { + setAlertState(hideAlert()); + downloadModel(DOWNLOAD_MODEL_ID).catch((err) => { + logger.error('[VoiceRecordButton] Download failed:', err); + }); + }, }, ], )); @@ -206,8 +214,8 @@ export const VoiceRecordButton: React.FC = ({ if (!isAvailable) { return ( - - + + {alert} @@ -221,6 +229,42 @@ export const VoiceRecordButton: React.FC = ({ disabled && styles.buttonDisabled, ]; + // ── Audio mode: tap-to-toggle (tap to start, tap to stop & send) ─────────── + if (!asSendButton) { + const handleToggle = () => { + if (disabled) return; + Vibration.vibrate(50); + if (isRecording) { + onStopRecording(); + } else { + onStartRecording(); + } + }; + + return ( + + {isRecording && } + + + + {isRecording + ? + : } + + + + {alert} + + ); + } + + // ── Chat mode: hold-to-record with slide-to-cancel ───────────────────────── return ( {isRecording && ( diff --git a/src/components/VoiceRecordButton/states.tsx b/src/components/VoiceRecordButton/states.tsx index d0ba1ab2..889a820c 100644 --- a/src/components/VoiceRecordButton/states.tsx +++ b/src/components/VoiceRecordButton/states.tsx @@ -43,7 +43,6 @@ export const TranscribingState: React.FC = ({ asSendButt {asSendButton ? 
: } - {!asSendButton && Transcribing...} ); }; @@ -52,16 +51,30 @@ export const TranscribingState: React.FC = ({ asSendButt interface UnavailableButtonProps { asSendButton: boolean; + /** 0–1 while downloading, undefined when idle */ + downloadProgress?: number; } -export const UnavailableButton: React.FC = ({ asSendButton }) => { +export const UnavailableButton: React.FC = ({ asSendButton, downloadProgress }) => { const { colors } = useTheme(); const styles = useThemedStyles(createStyles); + const isDownloading = downloadProgress !== undefined; + + if (asSendButton) { + return ( + + + + ); + } return ( - - {asSendButton ? ( - + + {isDownloading ? ( + <> + + {Math.round(downloadProgress * 100)}% + ) : ( <> diff --git a/src/constants/kokoroModels.ts b/src/constants/kokoroModels.ts new file mode 100644 index 00000000..9cf90b6e --- /dev/null +++ b/src/constants/kokoroModels.ts @@ -0,0 +1,62 @@ +import { Platform } from 'react-native'; +import { + KOKORO_MEDIUM, + KOKORO_VOICE_AF_HEART, + KOKORO_VOICE_AF_RIVER, + KOKORO_VOICE_AF_SARAH, + KOKORO_VOICE_AM_ADAM, + KOKORO_VOICE_AM_MICHAEL, + KOKORO_VOICE_AM_SANTA, + KOKORO_VOICE_BF_EMMA, + KOKORO_VOICE_BM_DANIEL, +} from 'react-native-executorch'; +import type { VoiceConfig } from 'react-native-executorch'; + +export { KOKORO_MEDIUM }; + +export type KokoroVoiceId = + | 'af_heart' + | 'af_river' + | 'af_sarah' + | 'am_adam' + | 'am_michael' + | 'am_santa' + | 'bf_emma' + | 'bm_daniel'; + +export const KOKORO_VOICES: { + id: KokoroVoiceId; + label: string; + persona: string; + accent: string; + gender: 'Female' | 'Male'; + /** Recommended playback speed for this persona's mood */ + defaultSpeed: number; + config: VoiceConfig; +}[] = [ + { id: 'af_heart', label: 'Warm', persona: 'Friendly and approachable', accent: 'US', gender: 'Female', defaultSpeed: 1.0, config: KOKORO_VOICE_AF_HEART }, + { id: 'af_river', label: 'Calm', persona: 'Relaxed and soothing', accent: 'US', gender: 'Female', defaultSpeed: 0.9, config: 
KOKORO_VOICE_AF_RIVER }, + { id: 'af_sarah', label: 'Clear', persona: 'Crisp and professional', accent: 'US', gender: 'Female', defaultSpeed: 1.0, config: KOKORO_VOICE_AF_SARAH }, + { id: 'am_adam', label: 'Steady', persona: 'Composed and reliable', accent: 'US', gender: 'Male', defaultSpeed: 1.0, config: KOKORO_VOICE_AM_ADAM }, + { id: 'am_michael', label: 'Bold', persona: 'Confident and direct', accent: 'US', gender: 'Male', defaultSpeed: 1.1, config: KOKORO_VOICE_AM_MICHAEL }, + { id: 'am_santa', label: 'Cheerful', persona: 'Upbeat and energetic', accent: 'US', gender: 'Male', defaultSpeed: 1.2, config: KOKORO_VOICE_AM_SANTA }, + { id: 'bf_emma', label: 'Gentle', persona: 'Soft and thoughtful', accent: 'British', gender: 'Female', defaultSpeed: 0.9, config: KOKORO_VOICE_BF_EMMA }, + { id: 'bm_daniel', label: 'Refined', persona: 'Polished and articulate', accent: 'British', gender: 'Male', defaultSpeed: 1.0, config: KOKORO_VOICE_BM_DANIEL }, +]; + +export const DEFAULT_KOKORO_VOICE_ID: KokoroVoiceId = 'af_heart'; + +export function getKokoroVoiceConfig(id: KokoroVoiceId): VoiceConfig { + return KOKORO_VOICES.find(v => v.id === id)?.config ?? 
KOKORO_VOICE_AF_HEART; +} + +/** Runtime check for react-native-executorch support: Android API level >= 26 (executorch gradle.properties sets minSdkVersion=26; README says 33 but that's conservative), iOS major version >= 17, false on any other platform */ +export function isExecutorchSupported(): boolean { + if (Platform.OS === 'android') { + return (Platform.Version as number) >= 26; + } + if (Platform.OS === 'ios') { + return parseInt(Platform.Version as string, 10) >= 17; + } + return false; +} diff --git a/src/constants/ttsModels.ts b/src/constants/ttsModels.ts new file mode 100644 index 00000000..f93dfe85 --- /dev/null +++ b/src/constants/ttsModels.ts @@ -0,0 +1,25 @@ +export const TTS_BACKBONE_MODEL = { + id: 'outetts-0.3-500m-q4', + name: 'OuteTTS 0.3', + backboneFile: 'OuteTTS-0.3-500M-Q4_K_M.gguf', + backboneUrl: + 'https://huggingface.co/OuteAI/OuteTTS-0.3-500M-GGUF/resolve/main/OuteTTS-0.3-500M-Q4_K_M.gguf', + backboneSizeMB: 454, + vocoderFile: 'WavTokenizer-Large-75-Q5_1.gguf', + vocoderUrl: + 'https://huggingface.co/ggml-org/WavTokenizer/resolve/main/WavTokenizer-Large-75-Q5_1.gguf', + vocoderSizeMB: 73, + sampleRate: 24000, + description: 'Natural-sounding on-device speech. 
Requires ~530 MB storage.', +}; + +export const TTS_SPEAKER_PROFILES = [ + { id: '0', label: 'Default' }, +]; + +/** Warn user if device RAM is below this threshold */ +export const TTS_WARN_RAM_GB = 8; +/** Hard-block TTS on devices below this threshold */ +export const TTS_BLOCK_RAM_GB = 6; +/** Max cached audio messages per conversation before eviction */ +export const AUDIO_CACHE_MAX_MESSAGES = 50; diff --git a/src/hooks/useTTS.ts b/src/hooks/useTTS.ts new file mode 100644 index 00000000..5ad948a3 --- /dev/null +++ b/src/hooks/useTTS.ts @@ -0,0 +1,48 @@ +import { useEffect, useCallback } from 'react'; +import { useTTSStore } from '../stores/ttsStore'; +import { hardwareService } from '../services/hardware'; +import { TTS_BLOCK_RAM_GB, TTS_WARN_RAM_GB } from '../constants/ttsModels'; + +export function useTTS() { + const store = useTTSStore(); + + useEffect(() => { + store.checkDownloadStatus(); + // eslint-disable-next-line react-hooks/exhaustive-deps + }, []); + + const canRunOnDevice = useCallback((): { allowed: boolean; warning: boolean } => { + const ramGB = hardwareService.getTotalMemoryGB(); + return { + allowed: ramGB >= TTS_BLOCK_RAM_GB, + warning: ramGB < TTS_WARN_RAM_GB, + }; + }, []); + + const speakMessage = useCallback( + (text: string, messageId: string) => { + if (!store.isModelLoaded && store.isBackboneDownloaded && store.isVocoderDownloaded) { + store.loadModels().then(() => store.speak(text, messageId)); + return; + } + store.speak(text, messageId); + }, + // eslint-disable-next-line react-hooks/exhaustive-deps + [store.isModelLoaded, store.isBackboneDownloaded, store.isVocoderDownloaded], + ); + + const areBothDownloaded = store.isBackboneDownloaded && store.isVocoderDownloaded; + + return { + ...store, + speakMessage, + canRunOnDevice, + areBothDownloaded, + isDownloading: store.isDownloadingBackbone || store.isDownloadingVocoder, + // weighted by file size (454 MB backbone, 73 MB vocoder → 86% / 14%) + overallDownloadProgress: + 
store.backboneDownloadProgress * 0.86 + store.vocoderDownloadProgress * 0.14, + isAudioMode: store.settings.interfaceMode === 'audio', + isChatMode: store.settings.interfaceMode === 'chat', + }; +} diff --git a/src/navigation/AppNavigator.tsx b/src/navigation/AppNavigator.tsx index 1d15b73a..517357a2 100644 --- a/src/navigation/AppNavigator.tsx +++ b/src/navigation/AppNavigator.tsx @@ -32,6 +32,7 @@ import { DownloadManagerScreen, ModelSettingsScreen, VoiceSettingsScreen, + TTSSettingsScreen, DeviceInfoScreen, StorageSettingsScreen, SecuritySettingsScreen, @@ -229,6 +230,7 @@ export const AppNavigator: React.FC = () => { + diff --git a/src/navigation/types.ts b/src/navigation/types.ts index 21b876da..b58d03c1 100644 --- a/src/navigation/types.ts +++ b/src/navigation/types.ts @@ -16,6 +16,7 @@ export type RootStackParamList = { ModelSettings: undefined; RemoteServers: undefined; VoiceSettings: undefined; + TTSSettings: undefined; DeviceInfo: undefined; StorageSettings: undefined; SecuritySettings: undefined; diff --git a/src/screens/ChatScreen/ChatMessageArea.tsx b/src/screens/ChatScreen/ChatMessageArea.tsx index f7611cc0..374c80bc 100644 --- a/src/screens/ChatScreen/ChatMessageArea.tsx +++ b/src/screens/ChatScreen/ChatMessageArea.tsx @@ -1,5 +1,6 @@ import React, { useState, useMemo } from 'react'; -import { View, FlatList, Text, Keyboard, ActivityIndicator, Platform } from 'react-native'; +import { View, FlatList, Text, Keyboard, ActivityIndicator, Platform, StyleSheet } from 'react-native'; +import { useTTSStore } from '../../stores/ttsStore'; import Icon from 'react-native-vector-icons/Feather'; import Animated, { FadeIn } from 'react-native-reanimated'; import { AttachStep } from 'react-native-spotlight-tour'; @@ -28,6 +29,10 @@ export type ChatMessageAreaProps = { export const ChatMessageArea: React.FC = ({ flatListRef, isNearBottomRef, chat, styles, colors, handleScroll, renderItem, chatSpotlight, }) => { + // Hide FlatList until initial layout + scroll is 
complete to prevent visible scroll jump + const [isListReady, setIsListReady] = useState(false); + const hasScrolledRef = React.useRef(false); + const interfaceMode = useTTSStore((s) => s.settings.interfaceMode); const tabNav = useNavigation>(); const [inputHeight, setInputHeight] = useState(84); const activeModelRepoId = chat.activeModelId?.split('/').slice(0, 2).join('/'); @@ -52,12 +57,26 @@ export const ChatMessageArea: React.FC = ({ ) : ( item.id} + extraData={interfaceMode} contentContainerStyle={styles.messageList} onScroll={handleScroll} - onContentSizeChange={(_w, _h) => { if (isNearBottomRef.current) flatListRef.current?.scrollToEnd({ animated: false }); }} + onContentSizeChange={(_w, h) => { + if (!hasScrolledRef.current && h > 0) { + // Initial layout: force scroll to bottom regardless of isNearBottom + flatListRef.current?.scrollToEnd({ animated: false }); + hasScrolledRef.current = true; + // Reveal after a frame so the scroll position settles + requestAnimationFrame(() => { + requestAnimationFrame(() => setIsListReady(true)); + }); + } else if (isNearBottomRef.current) { + flatListRef.current?.scrollToEnd({ animated: false }); + } + }} onLayout={() => { }} scrollEventThrottle={16} keyboardDismissMode="on-drag" @@ -140,3 +159,7 @@ export const ChatMessageArea: React.FC = ({ ); }; + +const hiddenStyle = StyleSheet.create({ + hidden: { opacity: 0 }, +}); diff --git a/src/screens/ChatScreen/ChatModalSection.tsx b/src/screens/ChatScreen/ChatModalSection.tsx index 301b3bdc..76f90703 100644 --- a/src/screens/ChatScreen/ChatModalSection.tsx +++ b/src/screens/ChatScreen/ChatModalSection.tsx @@ -83,6 +83,7 @@ export const ChatModalSection: React.FC = ({ onOpenProject={() => setShowProjectSelector(true)} onOpenGallery={imageCount > 0 ? () => navigation.navigate('Gallery', { conversationId: activeConversationId }) : undefined} onDeleteConversation={activeConversation ? 
handleDeleteConversation : undefined} + onOpenTTSSettings={() => { setShowSettingsPanel(false); navigation.navigate('TTSSettings'); }} conversationImageCount={imageCount} activeProjectName={activeProject?.name || null} isRemote={isRemote} diff --git a/src/screens/ChatScreen/MessageRenderer.tsx b/src/screens/ChatScreen/MessageRenderer.tsx index 5cf4a0cc..e5511441 100644 --- a/src/screens/ChatScreen/MessageRenderer.tsx +++ b/src/screens/ChatScreen/MessageRenderer.tsx @@ -1,7 +1,18 @@ -import React from 'react'; +import React, { useState } from 'react'; +import { View, StyleSheet } from 'react-native'; import { ChatMessage } from '../../components'; +import { AudioMessageBubble } from '../../components/AudioMessageBubble'; +import { TTSButton } from '../../components/TTSButton'; +import { AnimatedEntry } from '../../components/AnimatedEntry'; +import { useTTSStore } from '../../stores/ttsStore'; +import { stripControlTokens } from '../../utils/messageContent'; import { Message } from '../../types'; +import '../../types/tts'; import { ChatMessageItem } from './useChatScreen'; +import { parseThinkingContent, buildMessageData } from '../../components/ChatMessage/utils'; +import { ThinkingBlock } from '../../components/ChatMessage/components/ThinkingBlock'; +import { createStyles as createChatStyles } from '../../components/ChatMessage/styles'; +import { useThemedStyles } from '../../theme'; type MessageRendererProps = { item: Message | ChatMessageItem; @@ -19,31 +30,215 @@ type MessageRendererProps = { onImagePress: (uri: string) => void; }; -export const MessageRenderer: React.FC = ({ - item, - index, - displayMessagesLength, - animateLastN, - imageModelLoaded, - isStreaming, - isGeneratingImage, - showGenerationDetails, - onCopy, - onRetry, - onEdit, - onGenerateImage, - onImagePress, -}) => ( - 0 && index >= displayMessagesLength - animateLastN} - /> -); +/** Renders the thinking/reasoning block for audio mode without the ChatMessage bubble wrapper */ +const 
AudioModeThinkingBlock: React.FC<{ msg: Message }> = ({ msg }) => { + const chatStyles = useThemedStyles(createChatStyles); + const [showThinking, setShowThinking] = useState(false); + const { parsedContent } = buildMessageData(msg); + if (!parsedContent.thinking) return null; + return ( + + setShowThinking((v) => !v)} + styles={chatStyles} + /> + + ); +}; + +interface AudioBubbleProps { + messageId: string; + audioPath: string; + waveformData: number[]; + durationSeconds: number; + transcript: string; + _reasoningContent?: string; +} + +function buildAudioBubbleProps(msg: Message): AudioBubbleProps { + const transcript = stripControlTokens(msg.content); + console.log('[AudioBubble] buildProps: msgId=', msg.id, 'contentLen=', msg.content.length, 'transcriptLen=', transcript.length); + return { + messageId: msg.id, + audioPath: msg.audioPath ?? '', + waveformData: msg.waveformData ?? [], + durationSeconds: msg.audioDurationSeconds ?? 0, + transcript, + _reasoningContent: msg.reasoningContent, + }; +} + +/** Wraps content with AnimatedEntry if needed */ +function wrapAnimated(content: React.ReactElement, shouldAnimate: boolean): React.ReactElement { + return shouldAnimate ? 
{content} : content; +} + +/** Renders a user voice message as an audio bubble */ +function renderUserAudioBubble( + opts: { msg: Message; audioAtt: any; shouldAnimate: boolean }, + props: MessageRendererProps, +): React.ReactElement { + const { msg, audioAtt, shouldAnimate } = opts; + const bubble = ( + + props.onRetry(msg)} + /> + + ); + return wrapAnimated(bubble, shouldAnimate); +} + +/** Renders a streaming/thinking assistant message in audio mode as a ChatMessage */ +function renderAudioStreamingMessage( + msg: Message, + isStreamingThis: boolean, + props: MessageRendererProps, +): React.ReactElement { + return ( + + ); +} + +/** Renders a completed assistant audio bubble, with optional tool call UI */ +function renderAudioAssistantBubble( + msg: Message, + shouldAnimate: boolean, + props: MessageRendererProps, +): React.ReactElement { + const hasThinking = !!msg.reasoningContent || !!parseThinkingContent(msg.content).thinking; + const hasToolCalls = !!msg.toolCalls?.length; + + // For messages with tool calls, render as a regular ChatMessage (has proper tool call UI) + // followed by the audio bubble for the spoken text + if (hasToolCalls) { + const element = ( + + + + ); + return wrapAnimated(element, shouldAnimate); + } + + const bubble = ( + + {hasThinking && } + props.onRetry(msg)} + /> + + ); + return wrapAnimated(bubble, shouldAnimate); +} + +export const MessageRenderer: React.FC = (props) => { + const { + item, + index, + displayMessagesLength, + animateLastN, + imageModelLoaded, + isStreaming, + isGeneratingImage, + showGenerationDetails, + onCopy, + onRetry, + onEdit, + onGenerateImage, + onImagePress, + } = props; + + const ttsMode = useTTSStore((s) => s.settings.interfaceMode); + const msg = item as Message; + const animateEntry = animateLastN > 0 && index >= displayMessagesLength - animateLastN; + const isStreamingThis = item.id === 'streaming'; + + // User voice message: always show as audio bubble + if (msg.role === 'user') { + const audioAtt 
= msg.attachments?.find((a) => a.type === 'audio'); + if (audioAtt) { + return renderUserAudioBubble({ msg, audioAtt, shouldAnimate: animateEntry }, props); + } + } + + const isAudioAssistant = msg.role === 'assistant' && !msg.isSystemInfo; + + // Thinking placeholder + audio streaming + const isThinkingItem = !!(msg as any).isThinking; + if (isAudioAssistant && ttsMode === 'audio' && (isStreamingThis || isThinkingItem)) { + return renderAudioStreamingMessage(msg, isStreamingThis, props); + } + + // Audio Mode: show assistant messages as audio bubbles after streaming ends + if (isAudioAssistant && ttsMode === 'audio' && !isStreamingThis) { + return renderAudioAssistantBubble(msg, animateEntry, props); + } + + // Chat Mode: TTSButton lives in the meta row + const isPlainAssistant = msg.role === 'assistant' && !msg.isSystemInfo && !msg.toolCalls?.length; + const ttsMeta = isPlainAssistant && !isStreamingThis + ? + : undefined; + + return ( + + ); +}; + +const audioStyles = StyleSheet.create({ + userContainer: { + paddingRight: 16, + marginVertical: 8, + alignItems: 'flex-end', + }, + assistantContainer: { + paddingHorizontal: 16, + marginVertical: 8, + alignItems: 'flex-start', + }, +}); diff --git a/src/screens/ChatScreen/index.tsx b/src/screens/ChatScreen/index.tsx index 2be6468e..bdf0c138 100644 --- a/src/screens/ChatScreen/index.tsx +++ b/src/screens/ChatScreen/index.tsx @@ -1,5 +1,6 @@ import React, { useCallback, useEffect, useRef, useState } from 'react'; import { FlatList, KeyboardAvoidingView, InteractionManager } from 'react-native'; +import { useTTSStore } from '../../stores/ttsStore'; import { SafeAreaView } from 'react-native-safe-area-context'; import { useFocusEffect } from '@react-navigation/native'; import { useSpotlightTour } from 'react-native-spotlight-tour'; @@ -101,6 +102,22 @@ export const ChatScreen: React.FC = () => { setTimeout(() => { flatListRef.current?.scrollToEnd({ animated: true }); }, 100); } }, 
[chat.activeConversation?.messages.length]); + + // Reset scroll when switching between chat/audio interface modes + const interfaceMode = useTTSStore((s) => s.settings.interfaceMode); + const prevModeRef = React.useRef(interfaceMode); + React.useEffect(() => { + if (prevModeRef.current !== interfaceMode) { + prevModeRef.current = interfaceMode; + isNearBottomRef.current = true; + chat.setShowScrollToBottom(false); + // FlatList re-renders via extraData; onContentSizeChange fires and scrolls. + // Backup: scroll after items have had time to re-measure. + setTimeout(() => { flatListRef.current?.scrollToEnd({ animated: false }); }, 300); + } + // eslint-disable-next-line react-hooks/exhaustive-deps + }, [interfaceMode]); + const alertEl = ( = Dispatch>; const FALLBACK_RECENT_MESSAGE_COUNT = 2; + +/** + * Appended to the system prompt when TTS audio mode is active. + * Guides the model to respond conversationally for voice output. + */ +const AUDIO_MODE_PROMPT_HINT = ` + +[VOICE MODE ACTIVE — your response will be spoken aloud via text-to-speech] +Respond as if you are speaking to the user in a natural conversation: +- Be concise and conversational — talk like a person, not a document +- Never use markdown formatting (no headers, bullets, bold, code blocks, tables) +- Never use special characters, symbols, or emoji that sound awkward when read aloud +- Use short sentences and natural spoken transitions ("So,", "Basically,", "Here's the thing —") +- If summarizing research or long content, give the key takeaways in a few spoken paragraphs, not an essay +- Numbers: say "about two thousand" not "~2,000" +- Keep responses under 2-3 paragraphs unless the user explicitly asks for detail +- Use expressive punctuation for natural prosody: exclamation marks for emphasis!, question marks for curiosity?, ellipses for pauses..., and vary sentence length for rhythm`; export type GenerationDeps = { activeModelId: string | null; activeModel: DownloadedModel | null | undefined; @@ 
-248,7 +265,13 @@ export async function startGenerationFn(deps: GenerationDeps, call: StartGenerat } const conversation = useChatStore.getState().conversations.find(c => c.id === targetConversationId); const { enabledTools, rawPrompt } = resolveToolsAndPrompt(deps, conversation); - const basePrompt = await injectRagContext(conversation?.projectId, messageText, rawPrompt); + let basePrompt = await injectRagContext(conversation?.projectId, messageText, rawPrompt); + + // In audio mode, append instructions for conversational voice-friendly responses + if (useTTSStore.getState().settings.interfaceMode === 'audio') { + basePrompt += AUDIO_MODE_PROMPT_HINT; + } + const isRemote = !!useRemoteServerStore.getState().activeRemoteTextModelId; const activeTools = enabledTools; const systemPrompt = applyGemma4ThinkToken( diff --git a/src/screens/ChatScreen/useChatMessageHandlers.ts b/src/screens/ChatScreen/useChatMessageHandlers.ts index c9ff7f1c..f20d8237 100644 --- a/src/screens/ChatScreen/useChatMessageHandlers.ts +++ b/src/screens/ChatScreen/useChatMessageHandlers.ts @@ -1,6 +1,7 @@ import { Dispatch, SetStateAction } from 'react'; import { showAlert, AlertState } from '../../components'; import { Message } from '../../types'; +import { useTTSStore } from '../../stores/ttsStore'; import { regenerateResponseFn, executeDeleteConversationFn, handleImageGenerationFn, } from './useChatGenerationActions'; @@ -20,6 +21,8 @@ export async function handleRetryMessageFn( message: Message, genDeps: GenerationDeps, p: RetryParams, ): Promise { if (!p.activeConversationId || !p.hasActiveModel) return; + // Stop any in-flight TTS before deleting messages + useTTSStore.getState().stop(); const msgs = p.activeConversation?.messages || []; if (message.role === 'user') { const idx = msgs.findIndex((m: Message) => m.id === message.id); diff --git a/src/screens/ChatScreen/useChatScreen.ts b/src/screens/ChatScreen/useChatScreen.ts index e543b7e5..b9e7683e 100644 --- 
a/src/screens/ChatScreen/useChatScreen.ts +++ b/src/screens/ChatScreen/useChatScreen.ts @@ -1,7 +1,9 @@ import { useCallback, useEffect, useMemo, useRef, useState } from 'react'; +import { AppState } from 'react-native'; import { useNavigation, useRoute, RouteProp } from '@react-navigation/native'; import { AlertState, initialAlertState } from '../../components'; -import { useAppStore, useChatStore, useProjectStore, useRemoteServerStore } from '../../stores'; +import { useAppStore, useChatStore, useProjectStore, useRemoteServerStore, useTTSStore } from '../../stores'; +import '../../types/tts'; import logger from '../../utils/logger'; import { llmService, generationService, imageGenerationService, activeModelService, @@ -15,10 +17,16 @@ import { startGenerationFn, handleSendFn, handleStopFn, handleSelectProjectFn } import { handleRetryMessageFn, handleEditMessageFn, handleDeleteConversationFn, handleGenerateImageFromMsgFn } from './useChatMessageHandlers'; import { getDisplayMessages, getPlaceholderText, ChatMessageItem, StreamingState } from './types'; import { saveImageToGallery } from './useSaveImage'; +import { stripControlTokens, stripMarkdownForSpeech } from '../../utils/messageContent'; export type { AlertState, ChatMessageItem, StreamingState }; export { getDisplayMessages, getPlaceholderText }; +function _triggerAudioModeGeneration(conversationId: string, messageId: string, content: string) { + useChatStore.getState().updateMessageAudio(conversationId, messageId, { isAudioModeMessage: true }); + useTTSStore.getState().speak(stripMarkdownForSpeech(stripControlTokens(content)), messageId); +} + type ChatScreenRouteProp = RouteProp; type ActiveModelInfo = { @@ -53,6 +61,26 @@ export const useChatScreen = () => { const [isCompacting, setIsCompacting] = useState(false); const lastMessageCountRef = useRef(0); const generatingForConversationRef = useRef(null); + + // Stop TTS when navigating away, app backgrounded, or screen locked + useEffect(() => { + const 
unsubBlur = navigation.addListener('blur', () => { + useTTSStore.getState().stop(); + }); + // beforeRemove fires on back button — more reliable than blur for native-stack + const unsubRemove = navigation.addListener('beforeRemove', () => { + useTTSStore.getState().stop(); + }); + const appStateSub = AppState.addEventListener('change', (nextState) => { + const tts = useTTSStore.getState(); + if (nextState !== 'active') { + if (tts.isSpeaking && !tts.isPaused) { tts.pause(); } + } else { + if (tts.isSpeaking && tts.isPaused) { tts.resume(); } + } + }); + return () => { unsubBlur(); unsubRemove(); appStateSub.remove(); }; + }, [navigation]); const modelLoadStartTimeRef = useRef(null); const startGenerationRef = useRef<(id: string, text: string) => Promise>(null as any); const addMessageRef = useRef(null as any); @@ -193,6 +221,95 @@ export const useChatScreen = () => { lastMessageCountRef.current = curr; }, [displayMessages.length]); useEffect(() => { lastMessageCountRef.current = 0; setAnimateLastN(0); }, [activeConversationId]); + const prevStreamingRef = useRef(false); + const ttsStreamRef = useRef<{ nextPos: number; pending: string[]; isPlaying: boolean }>({ + nextPos: 0, pending: [], isPlaying: false, + }); + + // Buffer-based streaming TTS: feed text to Kokoro as soon as enough runway accumulates. + // No sentence detection — just split at word boundaries when buffer exceeds threshold. + // Works even at low tok/sec because the threshold is much smaller than a full sentence. 
+ useEffect(() => { + if (!isStreamingForThisConversation) return; + const tts = useTTSStore.getState(); + if (tts.settings.interfaceMode !== 'audio') return; + if (!tts.kokoroReady && !tts.isModelLoaded) return; + if (!streamingMessage) return; + + const ref = ttsStreamRef.current; + const stripped = stripControlTokens(streamingMessage); + const buffered = stripped.slice(ref.nextPos); + + // Need enough chars for Kokoro to have meaningful speech (~2-3 seconds worth) + const MIN_CHARS = 50; + if (buffered.length < MIN_CHARS) return; + + // Split at the last word boundary so we don't cut mid-word + const lastSpace = buffered.lastIndexOf(' '); + if (lastSpace <= 0) return; + + const chunk = buffered.slice(0, lastSpace).trim(); + ref.nextPos += lastSpace + 1; + if (!chunk) return; + + ref.pending.push(stripMarkdownForSpeech(chunk)); + logger.log('[StreamTTS] chunk queued, pending=', ref.pending.length, 'isPlaying=', ref.isPlaying); + + if (!ref.isPlaying) { + const playNext = () => { + // If another message took over playback (e.g. 
user tapped a recording), stop the chain + const currentId = useTTSStore.getState().currentMessageId; + if (currentId !== null && currentId !== 'streaming') { + logger.log('[StreamTTS] chain interrupted, currentId=', currentId); + ref.pending = []; + ref.isPlaying = false; + return; + } + const next = ref.pending.shift(); + if (!next) { ref.isPlaying = false; logger.log('[StreamTTS] chain done, no more pending'); return; } + ref.isPlaying = true; + logger.log('[StreamTTS] playing next chunk, remaining=', ref.pending.length); + useTTSStore.getState().speak(next, 'streaming').finally(playNext); + }; + playNext(); + } + }, [streamingMessage, isStreamingForThisConversation]); + + useEffect(() => { + const was = prevStreamingRef.current; + prevStreamingRef.current = isStreamingForThisConversation; + if (!was || isStreamingForThisConversation || !activeConversationId) return; + const { nextPos: alreadySpoken } = ttsStreamRef.current; + ttsStreamRef.current = { nextPos: 0, pending: [], isPlaying: false }; + const tts = useTTSStore.getState(); + if (tts.settings.interfaceMode !== 'audio') return; + const conv = useChatStore.getState().conversations.find((c) => c.id === activeConversationId); + const last = (conv?.messages ?? []).at(-1); + if (!last || last.role !== 'assistant' || last.isSystemInfo || last.toolCalls?.length || last.audioPath) return; + // Stamp as audio-mode. 
Estimate duration from word count (avg 2.5 words/sec) + const wordCount = last.content.split(/\s+/).filter(Boolean).length; + const speed = useTTSStore.getState().settings.speed || 1; + const estDuration = Math.max(1, wordCount / (2.5 * speed)); + logger.log('[StreamTTS] post-stream: messageId=', last.id, 'alreadySpoken=', alreadySpoken, 'wordCount=', wordCount, 'estDuration=', estDuration); + useChatStore.getState().updateMessageAudio(activeConversationId, last.id, { + isAudioModeMessage: true, + audioDurationSeconds: estDuration, + }); + // Only speak if a TTS engine is available + if (!tts.kokoroReady && !tts.isModelLoaded) { logger.log('[StreamTTS] post-stream: no TTS engine available'); return; } + // Strip thinking/control tokens, then slice off what streaming already spoke. NOTE(review): alreadySpoken was advanced against stripControlTokens() output only, but cleanContent is additionally passed through stripMarkdownForSpeech(); if that changes the text length the offset is misaligned and text may be skipped or repeated — confirm + const cleanContent = stripMarkdownForSpeech(stripControlTokens(last.content)); + const remaining = cleanContent.slice(alreadySpoken).trim(); + logger.log('[StreamTTS] post-stream: remaining chars=', remaining.length, 'isSpeaking=', tts.isSpeaking, 'currentMessageId=', tts.currentMessageId); + if (remaining) { + useTTSStore.getState().speak(remaining, last.id); + } else if (useTTSStore.getState().currentMessageId === 'streaming') { + // All text was already spoken by streaming chunks — transfer ownership + // to the real message ID so the AudioMessageBubble's seekbar works. 
+ logger.log('[StreamTTS] post-stream: transferring ownership from streaming to', last.id); + useTTSStore.setState({ currentMessageId: last.id }); + } + }, [isStreamingForThisConversation]); // eslint-disable-line react-hooks/exhaustive-deps const startGeneration = async (targetConversationId: string, messageText: string) => { await startGenerationFn(genDeps, { setDebugInfo, targetConversationId, messageText }); diff --git a/src/screens/DownloadManagerScreen/index.tsx b/src/screens/DownloadManagerScreen/index.tsx index 3829299f..46c2312f 100644 --- a/src/screens/DownloadManagerScreen/index.tsx +++ b/src/screens/DownloadManagerScreen/index.tsx @@ -1,5 +1,5 @@ -import React from 'react'; -import { View, Text, FlatList, TouchableOpacity, RefreshControl } from 'react-native'; +import React, { useState, useCallback } from 'react'; +import { View, Text, FlatList, TouchableOpacity, RefreshControl, ScrollView } from 'react-native'; import { SafeAreaView } from 'react-native-safe-area-context'; import Icon from 'react-native-vector-icons/Feather'; import { Card } from '../../components'; @@ -7,13 +7,35 @@ import { CustomAlert, hideAlert } from '../../components/CustomAlert'; import { useTheme, useThemedStyles } from '../../theme'; import { useNavigation } from '@react-navigation/native'; import { createStyles } from './styles'; -import { ActiveDownloadCard, CompletedDownloadCard, formatBytes } from './items'; +import { ActiveDownloadCard, CompletedDownloadCard, formatBytes, type DownloadItem } from './items'; import { useDownloadManager } from './useDownloadManager'; +type FilterType = 'all' | 'text' | 'vision' | 'image' | 'tts' | 'stt'; + +const FILTERS: { id: FilterType; label: string }[] = [ + { id: 'all', label: 'All' }, + { id: 'text', label: 'Text' }, + { id: 'vision', label: 'Vision' }, + { id: 'image', label: 'Image Gen' }, + { id: 'tts', label: 'Text to Speech' }, + { id: 'stt', label: 'Speech to Text' }, +]; + +function matchesFilter(item: DownloadItem, filter: 
FilterType): boolean { + if (filter === 'all') return true; + if (filter === 'vision') return item.modelType === 'text' && !!item.isVisionModel; + if (filter === 'text') return item.modelType === 'text' && !item.isVisionModel; + if (filter === 'image') return item.modelType === 'image'; + if (filter === 'tts') return item.modelType === 'tts'; + if (filter === 'stt') return item.modelType === 'stt'; + return true; +} + export const DownloadManagerScreen: React.FC = () => { const navigation = useNavigation(); const { colors } = useTheme(); const styles = useThemedStyles(createStyles); + const [activeFilter, setActiveFilter] = useState('all'); const { isRefreshing, activeItems, @@ -27,6 +49,30 @@ export const DownloadManagerScreen: React.FC = () => { totalStorageUsed, } = useDownloadManager(); + const filteredActive = activeItems.filter(item => matchesFilter(item, activeFilter)); + const filteredCompleted = completedItems.filter(item => matchesFilter(item, activeFilter)); + + const renderHeader = useCallback(() => ( + + {FILTERS.map(f => { + const active = activeFilter === f.id; + return ( + setActiveFilter(f.id)} + > + {f.label} + + ); + })} + + ), [activeFilter, colors, styles]); + return ( @@ -39,52 +85,47 @@ export const DownloadManagerScreen: React.FC = () => { ( - {/* Active Downloads */} - - - - Active Downloads - - {activeItems.length} + {/* Active Downloads — only show when there are active items */} + {filteredActive.length > 0 && ( + + + + Active Downloads + + {filteredActive.length} + - - {activeItems.length > 0 ? ( - activeItems.map(item => ( + {filteredActive.map(item => ( - )) - ) : ( - - - No active downloads - - )} - + ))} + + )} - {/* Completed Downloads */} + {/* Downloaded Models */} - + Downloaded Models - {completedItems.length} + {filteredCompleted.length} - {completedItems.length > 0 ? ( - completedItems.map(item => ( + {filteredCompleted.length > 0 ? 
( + filteredCompleted.map(item => ( )) ) : ( - - No models downloaded yet - - Go to the Models tab to browse and download models + + + {activeFilter === 'all' ? 'No models downloaded yet' : `No ${FILTERS.find(f => f.id === activeFilter)?.label ?? ''} models`} )} diff --git a/src/screens/DownloadManagerScreen/items.tsx b/src/screens/DownloadManagerScreen/items.tsx index f2d20d80..8cc45992 100644 --- a/src/screens/DownloadManagerScreen/items.tsx +++ b/src/screens/DownloadManagerScreen/items.tsx @@ -12,7 +12,7 @@ import { createStyles } from './styles'; export type DownloadItem = { type: 'active' | 'completed'; - modelType: 'text' | 'image'; + modelType: 'text' | 'image' | 'tts' | 'stt'; downloadId?: number; modelId: string; fileName: string; @@ -222,9 +222,9 @@ export const CompletedDownloadCard: React.FC = ({ it diff --git a/src/screens/DownloadManagerScreen/styles.ts b/src/screens/DownloadManagerScreen/styles.ts index 39120fa0..8f40c283 100644 --- a/src/screens/DownloadManagerScreen/styles.ts +++ b/src/screens/DownloadManagerScreen/styles.ts @@ -33,17 +33,17 @@ export const createStyles = (colors: ThemeColors, shadows: ThemeShadows) => ({ flex: 1, }, listContent: { - paddingTop: SPACING.lg, + paddingTop: SPACING.md, paddingBottom: SPACING.xxl, }, section: { - marginBottom: SPACING.xl, + marginBottom: SPACING.md, }, sectionHeader: { flexDirection: 'row' as const, alignItems: 'center' as const, paddingHorizontal: SPACING.lg, - marginBottom: SPACING.md, + marginBottom: SPACING.sm, gap: SPACING.sm, }, sectionTitle: { @@ -63,7 +63,7 @@ export const createStyles = (colors: ThemeColors, shadows: ThemeShadows) => ({ }, downloadCard: { marginHorizontal: SPACING.lg, - marginBottom: SPACING.md, + marginBottom: SPACING.sm, }, downloadHeader: { flexDirection: 'row' as const, @@ -160,19 +160,47 @@ export const createStyles = (colors: ThemeColors, shadows: ThemeShadows) => ({ emptyCard: { marginHorizontal: SPACING.lg, alignItems: 'center' as const, - paddingVertical: SPACING.xxl, 
- gap: SPACING.sm, + paddingVertical: SPACING.xl, + gap: SPACING.xs, }, emptyText: { - ...TYPOGRAPHY.body, - color: colors.textSecondary, - marginTop: SPACING.sm, + ...TYPOGRAPHY.bodySmall, + color: colors.textMuted, + marginTop: SPACING.xs, }, emptySubtext: { - ...TYPOGRAPHY.bodySmall, + ...TYPOGRAPHY.meta, color: colors.textMuted, textAlign: 'center' as const, }, + filterBarContent: { + flexDirection: 'row' as const, + alignItems: 'center' as const, + paddingHorizontal: SPACING.lg, + paddingVertical: SPACING.sm, + gap: SPACING.xs, + }, + filterChip: { + flexDirection: 'row' as const, + alignItems: 'center' as const, + paddingHorizontal: SPACING.sm + 2, + paddingVertical: 5, + borderRadius: 12, + borderWidth: 1, + borderColor: colors.border, + backgroundColor: colors.background, + }, + filterChipActive: { + borderColor: colors.primary, + backgroundColor: `${colors.primary}15`, + }, + filterChipText: { + ...TYPOGRAPHY.meta, + color: colors.textSecondary, + }, + filterChipTextActive: { + color: colors.primary, + }, storageSection: { paddingHorizontal: SPACING.lg, }, diff --git a/src/screens/ModelSettingsScreen/ImageGenerationSection.tsx b/src/screens/ModelSettingsScreen/ImageGenerationSection.tsx index ea7c9306..4d84b130 100644 --- a/src/screens/ModelSettingsScreen/ImageGenerationSection.tsx +++ b/src/screens/ModelSettingsScreen/ImageGenerationSection.tsx @@ -1,7 +1,7 @@ import React, { useState } from 'react'; import { View, Text, Switch, Platform, TouchableOpacity } from 'react-native'; -import Slider from '@react-native-community/slider'; import { AdvancedToggle, Card } from '../../components'; +import { NumericStepper } from '../../components/NumericStepper'; import { Button } from '../../components/Button'; import { useTheme, useThemedStyles } from '../../theme'; import { useAppStore } from '../../stores'; @@ -114,49 +114,28 @@ const DetectionMethodRow: React.FC = () => { // ─── Advanced Section ──────────────────────────────────────────────────────── const 
ImageAdvancedSection: React.FC = () => { - const { colors } = useTheme(); const styles = useThemedStyles(createStyles); const { settings, updateSettings } = useAppStore(); return ( <> - - Guidance Scale - {(settings?.imageGuidanceScale || 7.5).toFixed(1)} - + Guidance Scale Higher = follows prompt more strictly - updateSettings({ imageGuidanceScale: value })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surface} - thumbTintColor={colors.primary} + min={1} max={20} step={0.5} decimals={1} + onChange={(value) => updateSettings({ imageGuidanceScale: value })} /> - - Image Threads - {settings?.imageThreads ?? 4} - - - CPU threads used for image generation (applies on next image model load) - - Image Threads + CPU threads used for image generation (applies on next image model load) + updateSettings({ imageThreads: value })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surface} - thumbTintColor={colors.primary} + min={1} max={8} step={1} + onChange={(value) => updateSettings({ imageThreads: value })} /> @@ -212,40 +191,23 @@ export const ImageGenerationSection: React.FC = () => { - - Image Steps - {settings?.imageSteps || 8} - + Image Steps More steps = better quality but slower (4-8 fast, 20-50 high quality) - updateSettings({ imageSteps: value })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surface} - thumbTintColor={colors.primary} + min={4} max={50} step={1} + onChange={(value) => updateSettings({ imageSteps: value })} /> - - Image Size - {settings?.imageWidth ?? 256}x{settings?.imageHeight ?? 
256} - + Image Size Output resolution (smaller = faster, larger = more detail) - updateSettings({ imageWidth: value, imageHeight: value })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surface} - thumbTintColor={colors.primary} + min={128} max={512} step={64} + formatValue={(v) => `${v}x${v}`} + onChange={(value) => updateSettings({ imageWidth: value, imageHeight: value })} /> diff --git a/src/screens/ModelSettingsScreen/TextGenerationAdvanced.tsx b/src/screens/ModelSettingsScreen/TextGenerationAdvanced.tsx index 33faa229..e1387488 100644 --- a/src/screens/ModelSettingsScreen/TextGenerationAdvanced.tsx +++ b/src/screens/ModelSettingsScreen/TextGenerationAdvanced.tsx @@ -1,7 +1,7 @@ import React from 'react'; import { View, Text, Switch, Platform } from 'react-native'; -import Slider from '@react-native-community/slider'; import { Button } from '../../components/Button'; +import { NumericStepper } from '../../components/NumericStepper'; import { useTheme, useThemedStyles } from '../../theme'; import { useAppStore } from '../../stores'; import { CacheType } from '../../types'; @@ -52,24 +52,15 @@ const GpuSection: React.FC = ({ {isGpuEnabled && ( - - GPU Layers - {gpuLayersEffective} - + GPU Layers Layers offloaded to GPU. Higher = faster but may crash on low-VRAM devices. 
- updateSettings({ gpuLayers: value })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surface} - thumbTintColor={colors.primary} + min={1} max={GPU_LAYERS_MAX} step={1} + onChange={(value) => updateSettings({ gpuLayers: value })} /> )} @@ -207,78 +198,42 @@ export const TextGenerationAdvanced: React.FC = () => { return ( <> - - Top P - {(settings?.topP || 0.9).toFixed(2)} - + Top P Nucleus sampling threshold - updateSettings({ topP: value })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surface} - thumbTintColor={colors.primary} + min={0.1} max={1.0} step={0.05} decimals={2} + onChange={(value) => updateSettings({ topP: value })} /> - - Repeat Penalty - {(settings?.repeatPenalty || 1.1).toFixed(2)} - + Repeat Penalty Penalize repeated tokens - updateSettings({ repeatPenalty: value })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surface} - thumbTintColor={colors.primary} + min={1.0} max={2.0} step={0.05} decimals={2} + onChange={(value) => updateSettings({ repeatPenalty: value })} /> - - CPU Threads - {settings?.nThreads || 6} - + CPU Threads Parallel threads for inference - updateSettings({ nThreads: value })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surface} - thumbTintColor={colors.primary} + min={1} max={12} step={1} + onChange={(value) => updateSettings({ nThreads: value })} /> - - Batch Size - {settings?.nBatch || 256} - + Batch Size Tokens processed per batch - updateSettings({ nBatch: value })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surface} - thumbTintColor={colors.primary} + min={32} max={512} step={32} + onChange={(value) => updateSettings({ nBatch: value })} /> diff --git a/src/screens/ModelSettingsScreen/TextGenerationSection.tsx b/src/screens/ModelSettingsScreen/TextGenerationSection.tsx index 5b1d9099..3ae132f4 100644 --- a/src/screens/ModelSettingsScreen/TextGenerationSection.tsx +++ 
b/src/screens/ModelSettingsScreen/TextGenerationSection.tsx @@ -1,7 +1,7 @@ import React, { useState } from 'react'; import { View, Text, Switch } from 'react-native'; -import Slider from '@react-native-community/slider'; import { AdvancedToggle, Card } from '../../components'; +import { NumericStepper } from '../../components/NumericStepper'; import { useTheme, useThemedStyles } from '../../theme'; import { useAppStore } from '../../stores'; import { createStyles } from './styles'; @@ -26,56 +26,40 @@ export const TextGenerationSection: React.FC = () => { const contextLengthLabel = contextLength >= 1024 ? `${(contextLength / 1024).toFixed(0)}K` : String(contextLength); - const ctxSliderMax = modelMaxContext || FALLBACK_MAX_CONTEXT; + const ctxMax = modelMaxContext || FALLBACK_MAX_CONTEXT; return ( Configure LLM behavior for text responses. - {/* ── Basic Settings ── */} - Temperature - {(settings?.temperature || 0.7).toFixed(2)} Higher = more creative, Lower = more focused - updateSettings({ temperature: value })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surface} - thumbTintColor={colors.primary} + min={0} max={2} step={0.05} decimals={2} + onChange={(value) => updateSettings({ temperature: value })} /> Max Tokens - {maxTokensLabel} Maximum response length - updateSettings({ maxTokens: value })} - minimumTrackTintColor={colors.primary} - maximumTrackTintColor={colors.surface} - thumbTintColor={colors.primary} + min={64} max={8192} step={64} + formatValue={() => maxTokensLabel} + onChange={(value) => updateSettings({ maxTokens: value })} /> Context Length - {contextLengthLabel} KV cache size — larger uses more RAM (requires reload) {contextLength > HIGH_CONTEXT_THRESHOLD && ( @@ -83,16 +67,11 @@ export const TextGenerationSection: React.FC = () => { High context uses significant RAM and may crash on some devices )} - updateSettings({ contextLength: value })} - minimumTrackTintColor={colors.primary} - 
maximumTrackTintColor={colors.surface} - thumbTintColor={colors.primary} + min={512} max={ctxMax} step={1024} + formatValue={() => contextLengthLabel} + onChange={(value) => updateSettings({ contextLength: value })} /> diff --git a/src/screens/ModelSettingsScreen/index.tsx b/src/screens/ModelSettingsScreen/index.tsx index e0aefc79..319c9302 100644 --- a/src/screens/ModelSettingsScreen/index.tsx +++ b/src/screens/ModelSettingsScreen/index.tsx @@ -33,6 +33,7 @@ export const ModelSettingsScreen: React.FC = () => { const task = InteractionManager.runAfterInteractions(() => goTo(pending)); return () => task.cancel(); } + // eslint-disable-next-line react-hooks/exhaustive-deps }, []); const handleReset = () => { diff --git a/src/screens/SettingsScreen.tsx b/src/screens/SettingsScreen.tsx index f1cd721a..353c9b23 100644 --- a/src/screens/SettingsScreen.tsx +++ b/src/screens/SettingsScreen.tsx @@ -151,6 +151,7 @@ export const SettingsScreen: React.FC = () => { { icon: 'wifi', title: 'Remote Servers', desc: 'Connect to Ollama, LM Studio, and more', screen: 'RemoteServers' as const }, // { icon: 'search', title: 'Web Search', desc: 'Configure search API key for reliable results', screen: 'WebSearchSettings' as const }, { icon: 'mic', title: 'Voice Transcription', desc: 'On-device speech to text', screen: 'VoiceSettings' as const }, + { icon: 'volume-2', title: 'Text to Speech', desc: 'On-device voice responses', screen: 'TTSSettings' as const }, { icon: 'lock', title: 'Security', desc: 'Passphrase and app lock', screen: 'SecuritySettings' as const }, { icon: 'smartphone', title: 'Device Information', desc: 'Hardware and compatibility', screen: 'DeviceInfo' as const }, { icon: 'hard-drive', title: 'Storage', desc: 'Models and data usage', screen: 'StorageSettings' as const }, diff --git a/src/screens/TTSSettingsScreen/index.tsx b/src/screens/TTSSettingsScreen/index.tsx new file mode 100644 index 00000000..b982c6f7 --- /dev/null +++ b/src/screens/TTSSettingsScreen/index.tsx @@ 
-0,0 +1,412 @@ +import React, { useEffect, useState } from 'react'; +import { View, Text, ScrollView, TouchableOpacity, Switch, ActivityIndicator } from 'react-native'; +import { SafeAreaView } from 'react-native-safe-area-context'; +import Icon from 'react-native-vector-icons/Feather'; +import { NumericStepper } from '../../components/NumericStepper'; +import { useNavigation } from '@react-navigation/native'; +import { Card, Button } from '../../components'; +import { CustomAlert, showAlert, hideAlert, AlertState, initialAlertState } from '../../components/CustomAlert'; +import { useTheme, useThemedStyles } from '../../theme'; +import type { ThemeColors, ThemeShadows } from '../../theme'; +import { TYPOGRAPHY, SPACING } from '../../constants'; +import { useTTSStore } from '../../stores/ttsStore'; +import { hardwareService } from '../../services/hardware'; +import { TTS_BACKBONE_MODEL, TTS_WARN_RAM_GB, TTS_BLOCK_RAM_GB } from '../../constants/ttsModels'; +import { KOKORO_VOICES, isExecutorchSupported } from '../../constants/kokoroModels'; +import type { KokoroVoiceId } from '../../constants/kokoroModels'; +import type { InterfaceMode } from '../../stores/ttsStore'; + +// ─── Sub-components ─────────────────────────────────────────────────────────── + +type Styles = ReturnType; + +const ProgressRow: React.FC<{ + label: string; + sizeMB: number; + downloaded: boolean; + downloading: boolean; + progress: number; + styles: Styles; + colors: ThemeColors; + border?: boolean; +}> = ({ label, sizeMB, downloaded, downloading, progress, styles, colors, border }) => ( + + + + {label} + {sizeMB} MB + + {downloaded && } + {downloading && {Math.round(progress * 100)}%} + {!downloaded && !downloading && } + + {downloading && ( + + + + )} + +); + +const InterfaceModeCard: React.FC<{ + mode: InterfaceMode; + deviceBlocked: boolean; + areBothDownloaded: boolean; + onModeChange: (m: InterfaceMode) => void; + styles: Styles; +}> = ({ mode, deviceBlocked, areBothDownloaded, 
onModeChange, styles }) => ( + + Interface Mode + + Audio Mode renders responses as voice notes. Chat Mode adds a play button to text bubbles. + + + {(['chat', 'audio'] as InterfaceMode[]).map((m) => { + const active = mode === m; + const blocked = m === 'audio' && (deviceBlocked || !areBothDownloaded); + return ( + onModeChange(m)} + disabled={blocked} + > + + {m === 'chat' ? 'Chat' : 'Audio'} + + + ); + })} + + {!areBothDownloaded && ( + Download models below to enable Audio Mode. + )} + +); + +const PlaybackCard: React.FC<{ + settings: ReturnType['settings']; + onUpdate: (patch: Partial['settings']>) => void; + colors: ThemeColors; + styles: Styles; +}> = ({ settings, onUpdate, colors, styles }) => ( + + Playback + Speed + `${v.toFixed(1)}x`} + onChange={(v) => onUpdate({ speed: v })} + /> + {settings.interfaceMode === 'chat' && ( + + + Auto-play + Speak AI responses automatically + + onUpdate({ autoPlay: v })} + trackColor={{ true: colors.primary }} + /> + + )} + +); + +const CompatibilityCard: React.FC<{ + ramGB: number; + deviceBlocked: boolean; + deviceWarning: boolean; + styles: Styles; + colors: ThemeColors; +}> = ({ ramGB, deviceBlocked, deviceWarning, styles, colors }) => { + if (!deviceWarning && !deviceBlocked) { return null; } + return ( + + + + + {deviceBlocked + ? `TTS requires at least ${TTS_BLOCK_RAM_GB} GB RAM. Your device has ${ramGB.toFixed(1)} GB.` + : `Your device (${ramGB.toFixed(1)} GB RAM) may run TTS but performance could be slow. 
8 GB recommended.`} + + + + ); +}; + +const KokoroCard: React.FC<{ + kokoroReady: boolean; + kokoroDownloadProgress: number; + selectedVoiceId: KokoroVoiceId; + isChangingVoice: boolean; + onVoiceChange: (id: KokoroVoiceId) => void; + styles: Styles; + colors: ThemeColors; +}> = ({ kokoroReady, kokoroDownloadProgress, selectedVoiceId, isChangingVoice, onVoiceChange, styles, colors }) => { + const supported = isExecutorchSupported(); + return ( + + + Voice + {!supported && ( + Requires Android 13+ / iOS 17 + )} + {supported && !kokoroReady && kokoroDownloadProgress > 0 && ( + {Math.round(kokoroDownloadProgress * 100)}% + )} + {supported && !kokoroReady && kokoroDownloadProgress === 0 && ( + + )} + {supported && kokoroReady && ( + + )} + + + Fast on-device voice synthesis. Used for the speak button in Chat Mode. + + {KOKORO_VOICES.map((voice, i) => { + const active = selectedVoiceId === voice.id; + return ( + 0 && styles.voiceRowBorder]} + onPress={() => onVoiceChange(voice.id)} + disabled={!supported} + > + + {voice.label} + {voice.accent} · {voice.gender} + + {active && ( + isChangingVoice + ? 
+ : + )} + + ); + })} + + ); +}; + +// ─── Main screen ────────────────────────────────────────────────────────────── + +export const TTSSettingsScreen: React.FC = () => { + const navigation = useNavigation(); + const { colors } = useTheme(); + const styles = useThemedStyles(createStyles); + const [alertState, setAlertState] = useState(initialAlertState); + const [ramGB, setRamGB] = useState(8); + + const { + isBackboneDownloaded, isVocoderDownloaded, + isDownloadingBackbone, isDownloadingVocoder, + backboneDownloadProgress, vocoderDownloadProgress, + isModelLoaded, isModelLoading, + audioCacheSizeMB, settings, error, + kokoroReady, kokoroDownloadProgress, kokoroActiveVoiceId, + downloadModels, deleteModels, loadModels, unloadModels, + checkDownloadStatus, refreshCacheSize, clearAudioCache, updateSettings, clearError, + } = useTTSStore(); + + useEffect(() => { + setRamGB(hardwareService.getTotalMemoryGB()); + checkDownloadStatus(); + refreshCacheSize(); + // eslint-disable-next-line react-hooks/exhaustive-deps + }, []); + + const areBothDownloaded = isBackboneDownloaded && isVocoderDownloaded; + const isDownloading = isDownloadingBackbone || isDownloadingVocoder; + const deviceBlocked = ramGB < TTS_BLOCK_RAM_GB; + const deviceWarning = !deviceBlocked && ramGB < TTS_WARN_RAM_GB; + const totalSizeMB = TTS_BACKBONE_MODEL.backboneSizeMB + TTS_BACKBONE_MODEL.vocoderSizeMB; + + const handleDelete = () => { + setAlertState( + showAlert('Remove TTS Models', 'This will delete both model files and disable text-to-speech.', [ + { text: 'Cancel', style: 'cancel' }, + { text: 'Remove', style: 'destructive', onPress: () => { setAlertState(hideAlert()); deleteModels(); } }, + ]), + ); + }; + + const handleClearCache = () => { + setAlertState( + showAlert('Clear Audio Cache', `This will delete ${audioCacheSizeMB.toFixed(1)} MB of cached audio.`, [ + { text: 'Cancel', style: 'cancel' }, + { text: 'Clear', style: 'destructive', onPress: () => { setAlertState(hideAlert()); 
clearAudioCache(); } }, + ]), + ); + }; + + const handleModeChange = (mode: InterfaceMode) => { + if (mode === 'audio' && deviceBlocked) { return; } + updateSettings({ interfaceMode: mode }); + if (mode === 'audio' && !isModelLoaded && areBothDownloaded) { loadModels(); } + if (mode === 'chat' && isModelLoaded) { unloadModels(); } + }; + + return ( + + + navigation.goBack()}> + + + Text to Speech + {isModelLoading && } + + + + + + + {settings.interfaceMode === 'chat' && ( + + + + Enable TTS + Show play buttons on assistant messages + + updateSettings({ enabled: v })} trackColor={{ true: colors.primary }} /> + + + )} + + + Models ({totalSizeMB} MB total) + + + + {areBothDownloaded + ?