Update app to use Direct CSM by default

toddllm · toddllm · commit 1449c555c9ac · 2025-03-17T01:34:57.000-04:00
diff --git a/app/api/voice_generator.py b/app/api/voice_generator.py
@@ -130,6 +130,9 @@ def load_model(self, device: str = None) -> bool:
                     return True
                 except DirectCSMError as e:
                     logger.warning(f"Failed to load Direct CSM implementation: {e}")
+                    if not config.DIRECT_CSM_FALLBACK_TO_STANDARD:
+                        logger.error("Direct CSM fallback is disabled, failing")
+                        raise RuntimeError(f"Failed to load Direct CSM and fallback is disabled: {e}")
                     logger.info("Falling back to standard CSM model")
                     self.direct_csm = None
             
@@ -161,6 +164,9 @@ def load_model(self, device: str = None) -> bool:
                             return True
                         except DirectCSMError as e:
                             logger.warning(f"Failed to load Direct CSM implementation on CPU: {e}")
+                            if not config.DIRECT_CSM_FALLBACK_TO_STANDARD:
+                                logger.error("Direct CSM fallback is disabled, failing")
+                                raise RuntimeError(f"Failed to load Direct CSM on CPU and fallback is disabled: {e}")
                             logger.info("Falling back to standard CSM model on CPU")
                             self.direct_csm = None
                     
@@ -252,6 +258,7 @@ def generate(
             if self.direct_csm is not None:
                 try:
                     # Generate speech using direct CSM
+                    logger.info("Using Direct CSM for voice generation")
                     audio, sample_rate = self.direct_csm.generate_speech(
                         text=text,
                         speaker_id=speaker_id,
@@ -277,12 +284,16 @@ def generate(
                     return output_path, url
                     
                 except DirectCSMError as e:
-                    logger.warning(f"Direct CSM failed: {e}, falling back to standard CSM model")
-                    # Fall back to standard CSM model
+                    logger.warning(f"Direct CSM failed: {e}")
+                    if not config.DIRECT_CSM_FALLBACK_TO_STANDARD:
+                        logger.error("Direct CSM fallback is disabled, failing")
+                        return None, None
+                    logger.info("Falling back to standard CSM model")
             
             # If direct CSM failed or is not available, use the standard CSM model
             if self.model is not None:
                 # Generate speech
+                logger.info("Using standard CSM model for voice generation")
                 audio, sample_rate = self.model.generate_speech(
                     text=text,
                     speaker_id=speaker_id,
@@ -309,77 +320,9 @@ def generate(
             else:
                 logger.error("No model available for voice generation")
                 return None, None
-            
+                
         except Exception as e:
-            logger.error(f"Error generating voice: {e}")
-            
-            # Try again with CPU if we were using CUDA and it failed
-            if device == "cuda" or device == "auto":
-                logger.info("Attempting to fall back to CPU after error")
-                try:
-                    # Try direct CSM on CPU if available
-                    if self.direct_csm is not None:
-                        try:
-                            # Generate speech using direct CSM on CPU
-                            audio, sample_rate = self.direct_csm.generate_speech(
-                                text=text,
-                                speaker_id=speaker_id,
-                                temperature=temperature,
-                                top_k=top_k,
-                                device="cpu"
-                            )
-                            
-                            # Save the audio
-                            self.direct_csm.save_audio(audio, sample_rate, output_path)
-                            
-                            # Check if file was created
-                            if not os.path.exists(output_path):
-                                logger.error(f"Output file not created: {output_path}")
-                                raise DirectCSMError("Output file not created")
-                            
-                            # Create URL for accessing the file
-                            relative_path = os.path.relpath(output_path, self.output_dir)
-                            url = f"/voices/{relative_path}"
-                            
-                            logger.info(f"Voice generated successfully with Direct CSM on CPU: {output_path}")
-                            return output_path, url
-                            
-                        except DirectCSMError as cpu_e:
-                            logger.warning(f"Direct CSM on CPU failed: {cpu_e}, falling back to standard CSM model on CPU")
-                    
-                    # If direct CSM failed or is not available, use the standard CSM model on CPU
-                    if self.model is not None:
-                        # Generate speech on CPU
-                        audio, sample_rate = self.model.generate_speech(
-                            text=text,
-                            speaker_id=speaker_id,
-                            temperature=temperature,
-                            top_k=top_k,
-                            device="cpu"
-                        )
-                        
-                        # Save the audio
-                        self.model.save_audio(audio, sample_rate, output_path)
-                        
-                        # Check if file was created
-                        if not os.path.exists(output_path):
-                            logger.error(f"Output file not created: {output_path}")
-                            return None, None
-                        
-                        # Create URL for accessing the file
-                        relative_path = os.path.relpath(output_path, self.output_dir)
-                        url = f"/voices/{relative_path}"
-                        
-                        logger.info(f"Voice generated successfully with standard CSM on CPU: {output_path}")
-                        return output_path, url
-                    else:
-                        logger.error("No model available for voice generation on CPU")
-                        return None, None
-                    
-                except Exception as cpu_e:
-                    logger.error(f"Error generating voice on CPU: {cpu_e}")
-                    return None, None
-            
+            logger.error(f"Error generating voice: {str(e)}")
             return None, None
     
     def list_available_voices(self) -> List[Dict[str, Any]]:
diff --git a/app/core/config.py b/app/core/config.py
@@ -39,8 +39,9 @@
 MODEL_PATH = "/home/tdeshane/.cache/huggingface/hub/models--sesame--csm-1b/snapshots/03ab46ff5cfdcc783cc76fcf9ea6fd0838503093/ckpt.pt"
 
 # Direct CSM settings
-USE_DIRECT_CSM = os.environ.get("USE_DIRECT_CSM", "true").lower() == "true"
+USE_DIRECT_CSM = os.environ.get("USE_DIRECT_CSM", "true").lower() == "true"  # Enable by default
 DIRECT_CSM_PATH = os.environ.get("DIRECT_CSM_PATH", "/home/tdeshane/tts_poc/voice_poc/csm")
+DIRECT_CSM_FALLBACK_TO_STANDARD = os.environ.get("DIRECT_CSM_FALLBACK_TO_STANDARD", "true").lower() == "true"
 
 # Output settings
 OUTPUT_DIR = os.environ.get("OUTPUT_DIR", "/tmp/echoforge/voices")
diff --git a/main.py b/main.py
@@ -69,6 +69,26 @@ async def startup_event():
     voices_dir.mkdir(parents=True, exist_ok=True)
     
     logger.info(f"Initialized data directories: {data_dir}")
+    
+    # Initialize voice generator with Direct CSM
+    from app.api.voice_generator import voice_generator
+    try:
+        # Get device from environment or use auto
+        device = os.environ.get("ECHOFORGE_DEVICE", "auto")
+        logger.info(f"Initializing voice generator with device: {device}")
+        
+        # Initialize the voice generator
+        voice_generator.initialize(device=device)
+        
+        # Log whether we're using Direct CSM
+        if voice_generator.direct_csm is not None:
+            logger.info("Voice generator initialized with Direct CSM")
+        elif voice_generator.model is not None:
+            logger.info("Voice generator initialized with standard CSM model")
+        else:
+            logger.warning("Voice generator initialized but no model is loaded")
+    except Exception as e:
+        logger.error(f"Failed to initialize voice generator: {e}")
 
 
 def parse_args():
@@ -98,6 +118,18 @@ def parse_args():
         default="auto", 
         help="Device to use for TTS (auto, cuda, cpu)"
     )
+    parser.add_argument(
+        "--direct-csm", 
+        action="store_true", 
+        default=True,
+        help="Use Direct CSM implementation (default: True)"
+    )
+    parser.add_argument(
+        "--no-direct-csm", 
+        action="store_false", 
+        dest="direct_csm",
+        help="Disable Direct CSM implementation"
+    )
     parser.add_argument(
         "--debug", 
         action="store_true", 
@@ -115,13 +147,16 @@ def parse_args():
         os.environ["ECHOFORGE_MODEL_PATH"] = args.model_path
     os.environ["ECHOFORGE_DEVICE"] = args.device
     
+    # Set Direct CSM environment variable
+    os.environ["USE_DIRECT_CSM"] = str(args.direct_csm).lower()
+    
     # Configure logging level based on debug mode
     if args.debug:
         logging.getLogger().setLevel(logging.DEBUG)
         logger.debug("Debug mode enabled")
     
     # Start the server
-    logger.info(f"Starting server on {args.host}:{args.port}")
+    logger.info(f"Starting server on {args.host}:{args.port} with Direct CSM {'enabled' if args.direct_csm else 'disabled'}")
     uvicorn.run(
         "main:app",
         host=args.host,
diff --git a/run.py b/run.py
@@ -101,6 +101,25 @@ def parse_arguments():
         help=f"Serve on {config.PUBLIC_HOST} to make the app publicly accessible"
     )
     
+    # Direct CSM arguments (on unless --no-direct-csm is passed)
+    parser.add_argument(
+        "--direct-csm",
+        action="store_true", default=True,  # first action on a shared dest supplies its default
+        help="Enable Direct CSM implementation (default: enabled)"
+    )
+    
+    parser.add_argument(
+        "--no-direct-csm", 
+        action="store_false",
+        dest="direct_csm",
+        help="Disable Direct CSM implementation"
+    )
+    
+    parser.add_argument(
+        "--direct-csm-path",
+        help=f"Path to Direct CSM implementation (default: {config.DIRECT_CSM_PATH})"
+    )
+    
     # Auth arguments - support both styles for compatibility
     parser.add_argument(
         "--auth", 
@@ -158,6 +177,15 @@ def main():
     if args.password or args.auth_pass:
         os.environ["AUTH_PASSWORD"] = args.auth_pass or args.password
     
+    # Direct CSM settings
+    if hasattr(args, 'direct_csm'):
+        os.environ["USE_DIRECT_CSM"] = str(args.direct_csm).lower()
+        logger.info(f"Direct CSM is {'enabled' if args.direct_csm else 'disabled'}")
+    
+    if args.direct_csm_path:
+        os.environ["DIRECT_CSM_PATH"] = args.direct_csm_path
+        logger.info(f"Using Direct CSM path: {args.direct_csm_path}")
+    
     # Set appropriate environment variables based on arguments
     if public_serving:
         os.environ["ALLOW_PUBLIC_SERVING"] = "true"