From ae0aa7beddcb36d6e8b480e81c292fb290d3d262 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 28 Jan 2026 23:43:46 +0000
Subject: [PATCH 1/3] Initial plan

From 0dd1e648251bdad27c62b5c1b79ddeb430cc5a74 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 28 Jan 2026 23:48:50 +0000
Subject: [PATCH 2/3] Add Apple Metal (MPS) GPU support for macOS systems

Co-authored-by: lmangani <1423657+lmangani@users.noreply.github.com>
---
 backend/app/services/music_service.py | 101 ++++++++++++++++++++++++--
 1 file changed, 93 insertions(+), 8 deletions(-)

diff --git a/backend/app/services/music_service.py b/backend/app/services/music_service.py
index 9defe0e..155c9c7 100644
--- a/backend/app/services/music_service.py
+++ b/backend/app/services/music_service.py
@@ -94,6 +94,7 @@ def detect_optimal_gpu_config() -> dict:
         - gpu_info: dict - info about each GPU (name, vram, compute capability)
         - config_name: str - human-readable name of the selected configuration
         - warning: str or None - any warnings about the configuration
+        - device_type: str - type of device to use ("cuda", "mps", or "cpu")
     """
     result = {
         "use_quantization": True,  # Default to quantization for safety
@@ -102,10 +103,38 @@
         "gpu_info": {},
         "config_name": "CPU Only",
         "warning": None,
+        "device_type": "cpu",
     }

-    if not torch.cuda.is_available():
-        result["warning"] = "No CUDA GPU detected. Running on CPU will be very slow."
+    # Check for CUDA GPUs first
+    if torch.cuda.is_available():
+        result["device_type"] = "cuda"
+        # Continue with existing CUDA logic below
+    # Check for Apple Metal (MPS) on macOS
+    elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+        result["device_type"] = "mps"
+        result["num_gpus"] = 1
+        result["use_quantization"] = False  # MPS works better with full precision
+        result["use_sequential_offload"] = False  # Unified memory architecture
+        result["config_name"] = "Apple Metal (MPS)"
+        result["gpu_info"] = {
+            0: {
+                "name": "Apple Metal Performance Shaders",
+                "vram_gb": "Unified Memory",
+                "compute_capability": "MPS",
+                "supports_flash_attention": False,
+            }
+        }
+        print(f"\n[Auto-Config] Using Apple Metal (MPS) device", flush=True)
+        print(f"[Auto-Config] MPS uses unified memory - no VRAM limits", flush=True)
+        return result
+    # No GPU available - fall back to CPU
+    else:
+        result["warning"] = "No CUDA GPU or Metal GPU detected. Running on CPU will be very slow."
+        return result
+
+    # Continue with CUDA-specific logic if CUDA is available
+    if result["device_type"] != "cuda":
         return result

     num_gpus = torch.cuda.device_count()
@@ -433,6 +462,11 @@ def configure_flash_attention_for_gpu(device_id: int):
     - NVIDIA SM 6.x and older: Disables Flash Attention, uses math backend
     - AMD ROCm: Conservatively disables Flash Attention (compatibility varies)
     """
+    # Skip if MPS is being used
+    if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available() and not torch.cuda.is_available():
+        logger.info("[GPU Config] Apple Metal (MPS) device - skipping Flash Attention configuration")
+        return
+
     if not torch.cuda.is_available():
         logger.info("[GPU Config] CUDA not available - skipping Flash Attention configuration")
         return
@@ -662,9 +696,12 @@ def _pad_audio_token(token):
             pipeline.mula.reset_caches()
             pipeline._mula.to("cpu")
             gc.collect()
-            torch.cuda.empty_cache()
-            torch.cuda.synchronize()
-            print(f"[Sequential Offload] VRAM after offload: {torch.cuda.memory_allocated()/1024**3:.2f}GB", flush=True)
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+                torch.cuda.synchronize()
+                print(f"[Sequential Offload] VRAM after offload: {torch.cuda.memory_allocated()/1024**3:.2f}GB", flush=True)
+            elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+                torch.mps.empty_cache()
         else:
             pipeline._unload()
@@ -683,7 +720,8 @@ def _pad_audio_token(token):
                 device_map=pipeline.codec_device,
                 dtype=torch.float32,
             )
-            print(f"[Lazy Loading] HeartCodec loaded. VRAM: {torch.cuda.memory_allocated()/1024**3:.2f}GB", flush=True)
+            if torch.cuda.is_available():
+                print(f"[Lazy Loading] HeartCodec loaded. VRAM: {torch.cuda.memory_allocated()/1024**3:.2f}GB", flush=True)
         else:
             raise RuntimeError("Cannot load HeartCodec: codec_path not available")
@@ -696,8 +734,11 @@ def _pad_audio_token(token):
             del pipeline._codec
             pipeline._codec = None
             gc.collect()
-            torch.cuda.empty_cache()
-            torch.cuda.synchronize()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+                torch.cuda.synchronize()
+            elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+                torch.mps.empty_cache()

         if pipeline._sequential_offload:
             # Move HeartMuLa back to GPU for next generation
@@ -726,6 +767,9 @@ def cleanup_gpu_memory():
         torch.cuda.empty_cache()
         torch.cuda.synchronize()
         logger.info("GPU memory cleaned up")
+    elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+        torch.mps.empty_cache()
+        logger.info("MPS memory cleaned up")


 def get_gpu_memory(device_id):
@@ -1036,6 +1080,9 @@ def _unload_all_models(self):
                 with torch.cuda.device(i):
                     torch.cuda.empty_cache()
                     torch.cuda.synchronize()
+        elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+            # MPS memory cleanup
+            torch.mps.empty_cache()

         logger.info("All models unloaded")
@@ -1043,11 +1090,24 @@ def get_gpu_info(self) -> dict:
         """Get GPU hardware information."""
         result = {
             "cuda_available": torch.cuda.is_available(),
+            "mps_available": hasattr(torch.backends, 'mps') and torch.backends.mps.is_available(),
             "num_gpus": 0,
             "gpus": [],
             "total_vram_gb": 0
         }

+        # Check for MPS (Apple Metal) first
+        if result["mps_available"] and not result["cuda_available"]:
+            result["num_gpus"] = 1
+            result["gpus"].append({
+                "index": 0,
+                "name": "Apple Metal Performance Shaders",
+                "vram_gb": "Unified Memory",
+                "compute_capability": "MPS",
+                "supports_flash_attention": False
+            })
+            return result
+
         if not torch.cuda.is_available():
             return result
@@ -1157,8 +1217,31 @@ def _load_pipeline_multi_gpu(self, model_path: str, version: str):
         # Store the detected config for reference
         self.gpu_config = auto_config
+        device_type = auto_config.get("device_type", "cpu")
         num_gpus = auto_config["num_gpus"]

+        # Handle Apple Metal (MPS) devices
+        if device_type == "mps":
+            logger.info("Using Apple Metal (MPS) for GPU acceleration")
+            self.gpu_mode = "single"
+            print("[Apple Metal] Using MPS device for inference", flush=True)
+            print("[Apple Metal] Note: MPS uses unified memory architecture", flush=True)
+
+            # MPS doesn't support bfloat16, use float32 instead
+            pipeline = HeartMuLaGenPipeline.from_pretrained(
+                model_path,
+                device={
+                    "mula": torch.device("mps"),
+                    "codec": torch.device("mps"),
+                },
+                dtype={
+                    "mula": torch.float32,
+                    "codec": torch.float32,
+                },
+                version=version,
+            )
+            return patch_pipeline_with_callback(pipeline, sequential_offload=False)
+
         if use_quantization:
             print(f"[Quantization] 4-bit quantization ENABLED - model will use ~3GB instead of ~11GB", flush=True)
         else:
@@ -1640,6 +1723,8 @@ async def update_compile_progress():
                 if torch.cuda.is_available():
                     torch.cuda.empty_cache()
                     torch.cuda.synchronize()
+                elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+                    torch.mps.empty_cache()
                 logger.info("GPU memory cleaned up after generation")
             except Exception as cleanup_err:
                 logger.warning(f"Memory cleanup warning: {cleanup_err}")

From 576a90b1d333bb1596690b61bb9e6588c7529e71 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 28 Jan 2026 23:53:16 +0000
Subject: [PATCH 3/3] Address code review feedback: Add CPU fallback, quantization warning, and helper function

Co-authored-by: lmangani <1423657+lmangani@users.noreply.github.com>
---
 backend/app/services/music_service.py | 53 +++++++++++++++++++++++----
 1 file changed, 45 insertions(+), 8 deletions(-)

diff --git a/backend/app/services/music_service.py b/backend/app/services/music_service.py
index 155c9c7..8c1c280 100644
--- a/backend/app/services/music_service.py
+++ b/backend/app/services/music_service.py
@@ -83,6 +83,16 @@
 SETTINGS_FILE = os.path.join(os.environ.get("HEARTMULA_DB_PATH", _default_db_dir).replace("jobs.db", ""), "settings.json")


+def is_mps_available() -> bool:
+    """
+    Check if Apple Metal Performance Shaders (MPS) is available.
+
+    Returns:
+        bool: True if MPS is available, False otherwise
+    """
+    return hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()
+
+
 def detect_optimal_gpu_config() -> dict:
     """
     Auto-detect the optimal GPU configuration based on available VRAM.
@@ -111,7 +121,7 @@ def detect_optimal_gpu_config() -> dict:
         result["device_type"] = "cuda"
         # Continue with existing CUDA logic below
     # Check for Apple Metal (MPS) on macOS
-    elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+    elif is_mps_available():
         result["device_type"] = "mps"
         result["num_gpus"] = 1
         result["use_quantization"] = False  # MPS works better with full precision
@@ -463,7 +473,7 @@ def configure_flash_attention_for_gpu(device_id: int):
     - AMD ROCm: Conservatively disables Flash Attention (compatibility varies)
     """
     # Skip if MPS is being used
-    if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available() and not torch.cuda.is_available():
+    if is_mps_available() and not torch.cuda.is_available():
         logger.info("[GPU Config] Apple Metal (MPS) device - skipping Flash Attention configuration")
         return

@@ -700,7 +710,7 @@ def _pad_audio_token(token):
                 torch.cuda.empty_cache()
                 torch.cuda.synchronize()
                 print(f"[Sequential Offload] VRAM after offload: {torch.cuda.memory_allocated()/1024**3:.2f}GB", flush=True)
-            elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+            elif is_mps_available():
                 torch.mps.empty_cache()
         else:
             pipeline._unload()
@@ -737,7 +747,7 @@ def _pad_audio_token(token):
             if torch.cuda.is_available():
                 torch.cuda.empty_cache()
                 torch.cuda.synchronize()
-            elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+            elif is_mps_available():
                 torch.mps.empty_cache()

         if pipeline._sequential_offload:
@@ -767,7 +777,7 @@ def cleanup_gpu_memory():
         torch.cuda.empty_cache()
         torch.cuda.synchronize()
         logger.info("GPU memory cleaned up")
-    elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+    elif is_mps_available():
         torch.mps.empty_cache()
         logger.info("MPS memory cleaned up")

@@ -1080,7 +1090,7 @@ def _unload_all_models(self):
                 with torch.cuda.device(i):
                     torch.cuda.empty_cache()
                     torch.cuda.synchronize()
-        elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+        elif is_mps_available():
             # MPS memory cleanup
             torch.mps.empty_cache()

@@ -1090,7 +1100,7 @@ def get_gpu_info(self) -> dict:
         """Get GPU hardware information."""
         result = {
             "cuda_available": torch.cuda.is_available(),
-            "mps_available": hasattr(torch.backends, 'mps') and torch.backends.mps.is_available(),
+            "mps_available": is_mps_available(),
             "num_gpus": 0,
             "gpus": [],
             "total_vram_gb": 0
@@ -1227,6 +1237,11 @@ def _load_pipeline_multi_gpu(self, model_path: str, version: str):
             print("[Apple Metal] Using MPS device for inference", flush=True)
             print("[Apple Metal] Note: MPS uses unified memory architecture", flush=True)

+            # Check if quantization is manually enabled
+            if use_quantization:
+                logger.warning("4-bit quantization is not supported on MPS. Using full precision instead.")
+                print("[Apple Metal] WARNING: 4-bit quantization not supported on MPS, using full precision", flush=True)
+
             # MPS doesn't support bfloat16, use float32 instead
             pipeline = HeartMuLaGenPipeline.from_pretrained(
                 model_path,
@@ -1242,6 +1257,28 @@ def _load_pipeline_multi_gpu(self, model_path: str, version: str):
             )
             return patch_pipeline_with_callback(pipeline, sequential_offload=False)

+        # Handle CPU-only mode (no CUDA or MPS available)
+        if device_type == "cpu":
+            logger.warning("No GPU detected - running on CPU will be very slow")
+            self.gpu_mode = "cpu"
+            print("[CPU Mode] No GPU detected, using CPU for inference", flush=True)
+            print("[CPU Mode] WARNING: This will be extremely slow. Consider using a system with GPU support.", flush=True)
+
+            pipeline = HeartMuLaGenPipeline.from_pretrained(
+                model_path,
+                device={
+                    "mula": torch.device("cpu"),
+                    "codec": torch.device("cpu"),
+                },
+                dtype={
+                    "mula": torch.float32,
+                    "codec": torch.float32,
+                },
+                version=version,
+            )
+            return patch_pipeline_with_callback(pipeline, sequential_offload=False)
+
+        # At this point, device_type must be "cuda"
         if use_quantization:
             print(f"[Quantization] 4-bit quantization ENABLED - model will use ~3GB instead of ~11GB", flush=True)
         else:
@@ -1723,7 +1760,7 @@ async def update_compile_progress():
                 if torch.cuda.is_available():
                     torch.cuda.empty_cache()
                     torch.cuda.synchronize()
-                elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+                elif is_mps_available():
                     torch.mps.empty_cache()
                 logger.info("GPU memory cleaned up after generation")
             except Exception as cleanup_err:
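
For reference, the device-selection behaviour this series introduces (CUDA first, then Apple Metal via MPS, then CPU fallback) can be exercised outside the service. The sketch below is a minimal standalone example assuming only a recent PyTorch build; is_mps_available mirrors the helper added in music_service.py, while pick_device is a hypothetical wrapper used purely for illustration and is not part of the patches.

import torch


def is_mps_available() -> bool:
    # Older PyTorch builds may not expose torch.backends.mps at all, so guard with hasattr.
    return hasattr(torch.backends, "mps") and torch.backends.mps.is_available()


def pick_device() -> torch.device:
    # Hypothetical helper (not in the patches): same priority order as detect_optimal_gpu_config().
    if torch.cuda.is_available():
        return torch.device("cuda")
    if is_mps_available():
        return torch.device("mps")
    return torch.device("cpu")


if __name__ == "__main__":
    device = pick_device()
    # The patches keep the MPS and CPU paths in float32, so this smoke test does the same.
    x = torch.randn(4, 4, device=device, dtype=torch.float32)
    print(f"Selected device: {device}; tensor lives on: {x.device}")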