feat: support language-based speech recognition model switching

lsustc · lsustc · commit 47adf9c5f893 · 2025-12-02T08:49:37.000+08:00
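- Add English speech model (vosk-model-small-en-us-0.15.zip) as the `small-en` model type; remove the `standard-cn` model type from `MODEL_CONFIG` and `SpeechRecognitionConfig`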
- Auto-switch models when language changes (small-cn for zh-CN, small-en for every other language); the Vosk fallback when no modelType is configured is now small-en instead of small-cn
- Add language-specific text processing (strip all whitespace for the small-cn model; otherwise only trim leading/trailing whitespace so word spacing is kept)
- Fix race condition by serializing cleanup and initialization operations
- Clean up debug logging while preserving dynamic prompts
diff --git a/public/models/vosk-model-small-en-us-0.15.zip b/public/models/vosk-model-small-en-us-0.15.zip
diff --git a/src/hooks/useVoiceInput.ts b/src/hooks/useVoiceInput.ts
@@ -7,6 +7,7 @@ import {
 } from '@/services/speech-recognition';
 import { SpeechRecognitionConfig } from '@/models/speech-recognition/speech-recognition-base';
 import { logger } from '@/utils/logger';
+import { useLanguageStore } from '@/stores/languageStore';
 
 export type VoiceInputStatus = 'idle' | 'recording' | 'error';
 
@@ -25,27 +26,37 @@ export const useVoiceInput = ({ onTextRecognized, onError }: UseVoiceInputOption
   const isRecordingRef = useRef(false); // Track actual recording state
   const onTextRecognizedRef = useRef(onTextRecognized);
   const onErrorRef = useRef(onError);
+  const { language } = useLanguageStore();
 
   // Update refs when callbacks change
   useEffect(() => {
     onTextRecognizedRef.current = onTextRecognized;
     onErrorRef.current = onError;
   }, [onTextRecognized, onError]);
 
-  // Initialize speech recognition on mount (only once)
+  // Initialize speech recognition and reinitialize when language changes
   useEffect(() => {
-    if (isInitialized.current) return;
-
-    const config: SpeechRecognitionConfig = {
-      provider: 'vosk',
-      modelType: 'small-cn'
-    };
-
     let mounted = true;
+    let initializationStarted = false;
 
     // Async initialization
     (async () => {
       try {
+        // Cleanup previous instance and wait for it to complete
+        await cleanupSpeechRecognition();
+
+        // Check if still mounted after cleanup
+        if (!mounted) return;
+
+        // Select model based on current language
+        const modelType = language === 'zh-CN' ? 'small-cn' : 'small-en';
+
+        const config: SpeechRecognitionConfig = {
+          provider: 'vosk',
+          modelType
+        };
+
+        initializationStarted = true;
         await initSpeechRecognitionWithProvider(config, (text: string) => {
           if (onTextRecognizedRef.current) {
             onTextRecognizedRef.current(text);
@@ -63,14 +74,15 @@ export const useVoiceInput = ({ onTextRecognized, onError }: UseVoiceInputOption
       }
     })();
 
-    // Cleanup on unmount
+    // Cleanup on unmount or language change
     return () => {
       mounted = false;
       isRecordingRef.current = false;
-      cleanupSpeechRecognition();
-      isInitialized.current = false;
+      if (initializationStarted) {
+        cleanupSpeechRecognition();
+      }
     };
-  }, []); // Empty dependency array - only run once
+  }, [language]); // Reinitialize when language changes
 
   /**
    * Start voice recording
diff --git a/src/models/speech-recognition/speech-recognition-base.ts b/src/models/speech-recognition/speech-recognition-base.ts
@@ -9,7 +9,7 @@ export interface SpeechRecognitionConfig {
     appId?: string;
     apiSecret?: string;
     xfApiKey?: string;
-    modelType?: 'small-cn' | 'standard-cn';
+    modelType?: 'small-cn' | 'small-en';
 }
 
 export interface SpeechRecognitionError {
diff --git a/src/models/speech-recognition/speech-recognition-vosk.ts b/src/models/speech-recognition/speech-recognition-vosk.ts
@@ -3,7 +3,7 @@ import { SpeechRecognitionBase, SpeechRecognitionConfig } from "./speech-recogni
 // Model configuration
 const MODEL_CONFIG = {
     'small-cn': '/models/vosk-model-small-cn-0.22.tar.gz',
-    'standard-cn': '/models/vosk-model-cn-0.22.tar.gz'
+    'small-en': '/models/vosk-model-small-en-us-0.15.zip'
 } as const;
 
 // Audio configuration
@@ -51,9 +51,8 @@ export class SpeechRecognitionVosk implements SpeechRecognitionBase {
                 return;
             }
 
-            // 3. Load model
-            const modelPath = MODEL_CONFIG[this.config.modelType || 'small-cn'];
-            console.log(`🎤 Loading speech model: ${modelPath}`);
+            // Load model based on config
+            const modelPath = MODEL_CONFIG[this.config.modelType || 'small-en'];
 
             // Ensure Vosk is globally available
             const Vosk = (window as any).Vosk;
@@ -121,12 +120,11 @@ export class SpeechRecognitionVosk implements SpeechRecognitionBase {
             // Set recognition result callback
             try {
                 this.recognizer.on('result', (message: any) => {
-                    console.log('🎤 Received recognition result event:', message);
                     const text = message.result?.text;
                     if (text && text.trim()) {
-                        // Remove all spaces for Chinese text
-                        const cleanedText = text.replace(/\s+/g, '');
-                        console.log('🎤 Speech recognition result:', cleanedText);
+                        // Remove spaces only for Chinese, keep spaces for English
+                        const isChinese = this.config.modelType === 'small-cn';
+                        const cleanedText = isChinese ? text.replace(/\s+/g, '') : text.trim();
                         if (cleanedText && this.onRecognizedCallback) {
                             this.onRecognizedCallback(cleanedText);
                         }
diff --git a/src/services/speech-recognition.ts b/src/services/speech-recognition.ts
@@ -7,12 +7,13 @@ import { logger } from "@/utils/logger";
 
 let speechRecognition: SpeechRecognitionBase | null = null;
 let initializationPromise: Promise<void> | null = null;
+let currentCleanup: Promise<void> | null = null;
 
 // New initialization function, supports multiple providers (async to wait for Vosk model loading)
 export async function initSpeechRecognitionWithProvider(config: SpeechRecognitionConfig, onRecognized?: (text: string) => void): Promise<void> {
-  // Return existing initialization promise if already initializing
-  if (initializationPromise) {
-    return initializationPromise;
+  // Wait for ongoing cleanup to complete
+  if (currentCleanup) {
+    await currentCleanup;
   }
 
   initializationPromise = (async () => {
@@ -78,7 +79,18 @@ export async function stopSpeechRecognition() {
 
 // Cleanup resources
 export async function cleanupSpeechRecognition() {
-  await speechRecognition?.cleanup();
-  speechRecognition = null;
-  initializationPromise = null;
+  if (currentCleanup) {
+    return currentCleanup;
+  }
+
+  currentCleanup = (async () => {
+    if (speechRecognition) {
+      await speechRecognition.cleanup();
+      speechRecognition = null;
+    }
+    initializationPromise = null;
+  })();
+
+  await currentCleanup;
+  currentCleanup = null;
 }

Original file line number	Diff line number	Diff line change
`@@ -9,7 +9,7 @@ export interface SpeechRecognitionConfig {`
`9`	`9`	`appId?: string;`
`10`	`10`	`apiSecret?: string;`
`11`	`11`	`xfApiKey?: string;`
`12`		`- modelType?: 'small-cn' \| 'standard-cn';`
	`12`	`+ modelType?: 'small-cn' \| 'small-en';`
`13`	`13`	`}`
`14`	`14`
`15`	`15`	`export interface SpeechRecognitionError {`