From ae0aa7beddcb36d6e8b480e81c292fb290d3d262 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 28 Jan 2026 23:43:46 +0000
Subject: [PATCH 1/3] Initial plan

From 0dd1e648251bdad27c62b5c1b79ddeb430cc5a74 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 28 Jan 2026 23:48:50 +0000
Subject: [PATCH 2/3] Add Apple Metal (MPS) GPU support for macOS systems

Co-authored-by: lmangani <1423657+lmangani@users.noreply.github.com>
---
 backend/app/services/music_service.py | 101 ++++++++++++++++++++++++--
 1 file changed, 93 insertions(+), 8 deletions(-)

diff --git a/backend/app/services/music_service.py b/backend/app/services/music_service.py
index 9defe0e..155c9c7 100644
--- a/backend/app/services/music_service.py
+++ b/backend/app/services/music_service.py
@@ -94,6 +94,7 @@ def detect_optimal_gpu_config() -> dict:
         - gpu_info: dict - info about each GPU (name, vram, compute capability)
         - config_name: str - human-readable name of the selected configuration
         - warning: str or None - any warnings about the configuration
+        - device_type: str - type of device to use ("cuda", "mps", or "cpu")
     """
     result = {
         "use_quantization": True,  # Default to quantization for safety
@@ -102,10 +103,38 @@
         "gpu_info": {},
         "config_name": "CPU Only",
         "warning": None,
+        "device_type": "cpu",
     }

-    if not torch.cuda.is_available():
-        result["warning"] = "No CUDA GPU detected. Running on CPU will be very slow."
+    # Check for CUDA GPUs first
+    if torch.cuda.is_available():
+        result["device_type"] = "cuda"
+        # Continue with existing CUDA logic below
+    # Check for Apple Metal (MPS) on macOS
+    elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+        result["device_type"] = "mps"
+        result["num_gpus"] = 1
+        result["use_quantization"] = False  # MPS works better with full precision
+        result["use_sequential_offload"] = False  # Unified memory architecture
+        result["config_name"] = "Apple Metal (MPS)"
+        result["gpu_info"] = {
+            0: {
+                "name": "Apple Metal Performance Shaders",
+                "vram_gb": "Unified Memory",
+                "compute_capability": "MPS",
+                "supports_flash_attention": False,
+            }
+        }
+        print(f"\n[Auto-Config] Using Apple Metal (MPS) device", flush=True)
+        print(f"[Auto-Config] MPS uses unified memory - no VRAM limits", flush=True)
+        return result
+    # No GPU available - fall back to CPU
+    else:
+        result["warning"] = "No CUDA GPU or Metal GPU detected. Running on CPU will be very slow."
+        return result
+
+    # Continue with CUDA-specific logic if CUDA is available
+    if result["device_type"] != "cuda":
         return result

     num_gpus = torch.cuda.device_count()
@@ -433,6 +462,11 @@ def configure_flash_attention_for_gpu(device_id: int):
     - NVIDIA SM 6.x and older: Disables Flash Attention, uses math backend
     - AMD ROCm: Conservatively disables Flash Attention (compatibility varies)
     """
+    # Skip if MPS is being used
+    if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available() and not torch.cuda.is_available():
+        logger.info("[GPU Config] Apple Metal (MPS) device - skipping Flash Attention configuration")
+        return
+
     if not torch.cuda.is_available():
         logger.info("[GPU Config] CUDA not available - skipping Flash Attention configuration")
         return
@@ -662,9 +696,12 @@ def _pad_audio_token(token):
             pipeline.mula.reset_caches()
             pipeline._mula.to("cpu")
             gc.collect()
-            torch.cuda.empty_cache()
-            torch.cuda.synchronize()
-            print(f"[Sequential Offload] VRAM after offload: {torch.cuda.memory_allocated()/1024**3:.2f}GB", flush=True)
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+                torch.cuda.synchronize()
+                print(f"[Sequential Offload] VRAM after offload: {torch.cuda.memory_allocated()/1024**3:.2f}GB", flush=True)
+            elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+                torch.mps.empty_cache()
         else:
             pipeline._unload()
@@ -683,7 +720,8 @@ def _pad_audio_token(token):
                 device_map=pipeline.codec_device,
                 dtype=torch.float32,
             )
-            print(f"[Lazy Loading] HeartCodec loaded. VRAM: {torch.cuda.memory_allocated()/1024**3:.2f}GB", flush=True)
+            if torch.cuda.is_available():
+                print(f"[Lazy Loading] HeartCodec loaded. VRAM: {torch.cuda.memory_allocated()/1024**3:.2f}GB", flush=True)
         else:
             raise RuntimeError("Cannot load HeartCodec: codec_path not available")
@@ -696,8 +734,11 @@ def _pad_audio_token(token):
             del pipeline._codec
             pipeline._codec = None
             gc.collect()
-            torch.cuda.empty_cache()
-            torch.cuda.synchronize()
+            if torch.cuda.is_available():
+                torch.cuda.empty_cache()
+                torch.cuda.synchronize()
+            elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+                torch.mps.empty_cache()

         if pipeline._sequential_offload:
             # Move HeartMuLa back to GPU for next generation
@@ -726,6 +767,9 @@ def cleanup_gpu_memory():
         torch.cuda.empty_cache()
         torch.cuda.synchronize()
         logger.info("GPU memory cleaned up")
+    elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+        torch.mps.empty_cache()
+        logger.info("MPS memory cleaned up")


 def get_gpu_memory(device_id):
@@ -1036,6 +1080,9 @@ def _unload_all_models(self):
                 with torch.cuda.device(i):
                     torch.cuda.empty_cache()
                     torch.cuda.synchronize()
+        elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+            # MPS memory cleanup
+            torch.mps.empty_cache()

         logger.info("All models unloaded")
@@ -1043,11 +1090,24 @@ def get_gpu_info(self) -> dict:
         """Get GPU hardware information."""
         result = {
             "cuda_available": torch.cuda.is_available(),
+            "mps_available": hasattr(torch.backends, 'mps') and torch.backends.mps.is_available(),
             "num_gpus": 0,
             "gpus": [],
             "total_vram_gb": 0
         }

+        # Check for MPS (Apple Metal) first
+        if result["mps_available"] and not result["cuda_available"]:
+            result["num_gpus"] = 1
+            result["gpus"].append({
+                "index": 0,
+                "name": "Apple Metal Performance Shaders",
+                "vram_gb": "Unified Memory",
+                "compute_capability": "MPS",
+                "supports_flash_attention": False
+            })
+            return result
+
         if not torch.cuda.is_available():
             return result
@@ -1157,8 +1217,31 @@ def _load_pipeline_multi_gpu(self, model_path: str, version: str):
         # Store the detected config for reference
         self.gpu_config = auto_config
+        device_type = auto_config.get("device_type", "cpu")
         num_gpus = auto_config["num_gpus"]

+        # Handle Apple Metal (MPS) devices
+        if device_type == "mps":
+            logger.info("Using Apple Metal (MPS) for GPU acceleration")
+            self.gpu_mode = "single"
+            print("[Apple Metal] Using MPS device for inference", flush=True)
+            print("[Apple Metal] Note: MPS uses unified memory architecture", flush=True)
+
+            # MPS doesn't support bfloat16, use float32 instead
+            pipeline = HeartMuLaGenPipeline.from_pretrained(
+                model_path,
+                device={
+                    "mula": torch.device("mps"),
+                    "codec": torch.device("mps"),
+                },
+                dtype={
+                    "mula": torch.float32,
+                    "codec": torch.float32,
+                },
+                version=version,
+            )
+            return patch_pipeline_with_callback(pipeline, sequential_offload=False)
+
         if use_quantization:
             print(f"[Quantization] 4-bit quantization ENABLED - model will use ~3GB instead of ~11GB", flush=True)
         else:
@@ -1640,6 +1723,8 @@ async def update_compile_progress():
                 if torch.cuda.is_available():
                     torch.cuda.empty_cache()
                     torch.cuda.synchronize()
+                elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+                    torch.mps.empty_cache()
                 logger.info("GPU memory cleaned up after generation")
             except Exception as cleanup_err:
                 logger.warning(f"Memory cleanup warning: {cleanup_err}")

From 576a90b1d333bb1596690b61bb9e6588c7529e71 Mon Sep 17 00:00:00 2001
From: "copilot-swe-agent[bot]" <198982749+Copilot@users.noreply.github.com>
Date: Wed, 28 Jan 2026 23:53:16 +0000
Subject: [PATCH 3/3] Address code review feedback: Add CPU fallback, quantization warning, and helper function

Co-authored-by: lmangani <1423657+lmangani@users.noreply.github.com>
---
 backend/app/services/music_service.py | 53 +++++++++++++++++++++++----
 1 file changed, 45 insertions(+), 8 deletions(-)

diff --git a/backend/app/services/music_service.py b/backend/app/services/music_service.py
index 155c9c7..8c1c280 100644
--- a/backend/app/services/music_service.py
+++ b/backend/app/services/music_service.py
@@ -83,6 +83,16 @@
 SETTINGS_FILE = os.path.join(os.environ.get("HEARTMULA_DB_PATH", _default_db_dir).replace("jobs.db", ""), "settings.json")


+def is_mps_available() -> bool:
+    """
+    Check if Apple Metal Performance Shaders (MPS) is available.
+
+    Returns:
+        bool: True if MPS is available, False otherwise
+    """
+    return hasattr(torch.backends, 'mps') and torch.backends.mps.is_available()
+
+
 def detect_optimal_gpu_config() -> dict:
     """
     Auto-detect the optimal GPU configuration based on available VRAM.
@@ -111,7 +121,7 @@ def detect_optimal_gpu_config() -> dict:
         result["device_type"] = "cuda"
         # Continue with existing CUDA logic below
     # Check for Apple Metal (MPS) on macOS
-    elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+    elif is_mps_available():
         result["device_type"] = "mps"
         result["num_gpus"] = 1
         result["use_quantization"] = False  # MPS works better with full precision
@@ -463,7 +473,7 @@ def configure_flash_attention_for_gpu(device_id: int):
     - AMD ROCm: Conservatively disables Flash Attention (compatibility varies)
     """
     # Skip if MPS is being used
-    if hasattr(torch.backends, 'mps') and torch.backends.mps.is_available() and not torch.cuda.is_available():
+    if is_mps_available() and not torch.cuda.is_available():
         logger.info("[GPU Config] Apple Metal (MPS) device - skipping Flash Attention configuration")
         return

@@ -700,7 +710,7 @@ def _pad_audio_token(token):
                 torch.cuda.empty_cache()
                 torch.cuda.synchronize()
                 print(f"[Sequential Offload] VRAM after offload: {torch.cuda.memory_allocated()/1024**3:.2f}GB", flush=True)
-            elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+            elif is_mps_available():
                 torch.mps.empty_cache()
         else:
             pipeline._unload()
@@ -737,7 +747,7 @@ def _pad_audio_token(token):
             if torch.cuda.is_available():
                 torch.cuda.empty_cache()
                 torch.cuda.synchronize()
-            elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+            elif is_mps_available():
                 torch.mps.empty_cache()

         if pipeline._sequential_offload:
@@ -767,7 +777,7 @@ def cleanup_gpu_memory():
         torch.cuda.empty_cache()
         torch.cuda.synchronize()
         logger.info("GPU memory cleaned up")
-    elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+    elif is_mps_available():
         torch.mps.empty_cache()
         logger.info("MPS memory cleaned up")

@@ -1080,7 +1090,7 @@ def _unload_all_models(self):
                 with torch.cuda.device(i):
                     torch.cuda.empty_cache()
                     torch.cuda.synchronize()
-        elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+        elif is_mps_available():
             # MPS memory cleanup
             torch.mps.empty_cache()

@@ -1090,7 +1100,7 @@ def get_gpu_info(self) -> dict:
         """Get GPU hardware information."""
         result = {
             "cuda_available": torch.cuda.is_available(),
-            "mps_available": hasattr(torch.backends, 'mps') and torch.backends.mps.is_available(),
+            "mps_available": is_mps_available(),
             "num_gpus": 0,
             "gpus": [],
             "total_vram_gb": 0
@@ -1227,6 +1237,11 @@ def _load_pipeline_multi_gpu(self, model_path: str, version: str):
             print("[Apple Metal] Using MPS device for inference", flush=True)
             print("[Apple Metal] Note: MPS uses unified memory architecture", flush=True)

+            # Check if quantization is manually enabled
+            if use_quantization:
+                logger.warning("4-bit quantization is not supported on MPS. Using full precision instead.")
+                print("[Apple Metal] WARNING: 4-bit quantization not supported on MPS, using full precision", flush=True)
+
             # MPS doesn't support bfloat16, use float32 instead
             pipeline = HeartMuLaGenPipeline.from_pretrained(
                 model_path,
@@ -1242,6 +1257,28 @@ def _load_pipeline_multi_gpu(self, model_path: str, version: str):
             )
             return patch_pipeline_with_callback(pipeline, sequential_offload=False)

+        # Handle CPU-only mode (no CUDA or MPS available)
+        if device_type == "cpu":
+            logger.warning("No GPU detected - running on CPU will be very slow")
+            self.gpu_mode = "cpu"
+            print("[CPU Mode] No GPU detected, using CPU for inference", flush=True)
+            print("[CPU Mode] WARNING: This will be extremely slow. Consider using a system with GPU support.", flush=True)
+
+            pipeline = HeartMuLaGenPipeline.from_pretrained(
+                model_path,
+                device={
+                    "mula": torch.device("cpu"),
+                    "codec": torch.device("cpu"),
+                },
+                dtype={
+                    "mula": torch.float32,
+                    "codec": torch.float32,
+                },
+                version=version,
+            )
+            return patch_pipeline_with_callback(pipeline, sequential_offload=False)
+
+        # At this point, device_type must be "cuda"
         if use_quantization:
             print(f"[Quantization] 4-bit quantization ENABLED - model will use ~3GB instead of ~11GB", flush=True)
         else:
@@ -1723,7 +1760,7 @@ async def update_compile_progress():
                 if torch.cuda.is_available():
                     torch.cuda.empty_cache()
                     torch.cuda.synchronize()
-                elif hasattr(torch.backends, 'mps') and torch.backends.mps.is_available():
+                elif is_mps_available():
                     torch.mps.empty_cache()
                 logger.info("GPU memory cleaned up after generation")
             except Exception as cleanup_err:
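
For reference, the device-selection behaviour this series introduces (CUDA first, then Apple Metal via MPS, then CPU fallback) can be exercised outside the service. The sketch below is a minimal standalone example assuming only a recent PyTorch build; is_mps_available mirrors the helper added in music_service.py, while pick_device is a hypothetical wrapper used purely for illustration and is not part of the patches.

import torch


def is_mps_available() -> bool:
    # Older PyTorch builds may not expose torch.backends.mps at all, so guard with hasattr.
    return hasattr(torch.backends, "mps") and torch.backends.mps.is_available()


def pick_device() -> torch.device:
    # Hypothetical helper (not in the patches): same priority order as detect_optimal_gpu_config().
    if torch.cuda.is_available():
        return torch.device("cuda")
    if is_mps_available():
        return torch.device("mps")
    return torch.device("cpu")


if __name__ == "__main__":
    device = pick_device()
    # The patches keep the MPS and CPU paths in float32, so this smoke test does the same.
    x = torch.randn(4, 4, device=device, dtype=torch.float32)
    print(f"Selected device: {device}; tensor lives on: {x.device}")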