
Commit 9dcad01

Merge pull request #266 from clamsproject/243-fix-cuda-mem-leak
updated cuda memory handling and related documentation
2 parents: 16f110d + 671560b

9 files changed: 478 additions & 41 deletions

clams/app/__init__.py

Lines changed: 203 additions & 29 deletions
@@ -13,10 +13,11 @@
 from typing import Union, Any, Optional, Dict, List, Tuple
 
 from mmif import Mmif, Document, DocumentTypes, View
+from mmif.utils.cli.describe import generate_param_hash  # pytype: disable=import-error
 from clams.appmetadata import AppMetadata, real_valued_primitives, python_type, map_param_kv_delimiter
 
 logging.basicConfig(
-    level=logging.WARNING,
+    level=getattr(logging, os.environ.get('CLAMS_LOGLEVEL', 'WARNING').upper(), logging.WARNING),
     format="%(asctime)s %(name)s %(levelname)-8s %(thread)d %(message)s",
     datefmt="%Y-%m-%d %H:%M:%S")
 
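The log level is now configurable through the `CLAMS_LOGLEVEL` environment variable, with a fallback to `WARNING` when the variable is unset or holds an unrecognized level name. A minimal sketch of the same lookup in isolation (the exported values below are illustrative):

```python
import logging
import os

# Same getattr-based lookup as the diff above: an unknown or unset CLAMS_LOGLEVEL
# value falls back to logging.WARNING instead of raising an error.
os.environ['CLAMS_LOGLEVEL'] = 'debug'          # e.g. exported before starting the app
level = getattr(logging, os.environ.get('CLAMS_LOGLEVEL', 'WARNING').upper(), logging.WARNING)
assert level == logging.DEBUG

os.environ['CLAMS_LOGLEVEL'] = 'not-a-level'    # typo or unsupported value
level = getattr(logging, os.environ.get('CLAMS_LOGLEVEL', 'WARNING').upper(), logging.WARNING)
assert level == logging.WARNING
```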
@@ -47,7 +48,7 @@ class ClamsApp(ABC):
             'description': 'The JSON body of the HTTP response will be re-formatted with 2-space indentation',
         },
         {
-            'name': 'runningTime', 'type': 'boolean', 'choices': None, 'default': False, 'multivalued': False,
+            'name': 'runningTime', 'type': 'boolean', 'choices': None, 'default': True, 'multivalued': False,
             'description': 'The running time of the app will be recorded in the view metadata',
         },
         {
@@ -160,20 +161,19 @@ def annotate(self, mmif: Union[str, dict, Mmif], **runtime_params: List[str]) ->
         hwFetch = refined.get('hwFetch', False)
         runtime_recs = {}
         if hwFetch:
+            import multiprocessing
             import platform, shutil, subprocess
-            runtime_recs['architecture'] = platform.machine()
-            # runtime_recs['processor'] = platform.processor()  # this only works on Windows
+            runtime_recs['cpu'] = f"{platform.machine()}, {multiprocessing.cpu_count()} cores"
             runtime_recs['cuda'] = []
             # Use cuda_profiler data if available, otherwise fallback to nvidia-smi
             if cuda_profiler:
-                for gpu_info, peak_memory_bytes in cuda_profiler.items():
-                    # Convert peak memory to human-readable format
-                    peak_memory_mb = peak_memory_bytes / (1000 * 1000)
-                    if peak_memory_mb >= 1000:
-                        peak_memory_str = f"{peak_memory_mb / 1000:.2f} GiB"
-                    else:
-                        peak_memory_str = f"{peak_memory_mb:.1f} MiB"
-                    runtime_recs['cuda'].append(f"{gpu_info}, Used {self._cuda_memory_to_str(peak_memory_bytes)}")
+                for gpu_name, mem_info in cuda_profiler.items():
+                    total_str = self._cuda_memory_to_str(mem_info['total'])
+                    available_str = self._cuda_memory_to_str(mem_info['available_before'])
+                    peak_str = self._cuda_memory_to_str(mem_info['peak'])
+                    runtime_recs['cuda'].append(
+                        f"{gpu_name}, {total_str} total, {available_str} available, {peak_str} peak used"
+                    )
             elif shutil.which('nvidia-smi'):
                 for gpu in subprocess.run(['nvidia-smi', '--query-gpu=name,memory.total', '--format=csv,noheader'],
                                           stdout=subprocess.PIPE).stdout.decode('utf-8').strip().split('\n'):
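With the richer profiler data, the `hwFetch` block now reports total, available-before, and peak VRAM per GPU rather than peak usage alone, alongside a combined CPU architecture and core-count string. A hedged sketch of the resulting `runtime_recs` for a single hypothetical GPU (the device name and numbers are illustrative, and the exact memory strings depend on `_cuda_memory_to_str`):

```python
# Illustrative shape only; real values depend on the host hardware and on
# how _cuda_memory_to_str renders byte counts.
runtime_recs = {
    'cpu': 'x86_64, 16 cores',
    'cuda': [
        'NVIDIA A100-SXM4-40GB, 40.00 GiB total, 38.50 GiB available, 3.20 GiB peak used',
    ],
}
```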
@@ -348,50 +348,224 @@ def _cuda_device_name_concat(name, mem):
         mem = ClamsApp._cuda_memory_to_str(mem)
         return f"{name}, With {mem}"
 
+    def _get_profile_path(self, param_hash: str) -> pathlib.Path:
+        """
+        Get filesystem path for memory profile file.
+
+        Profile files are stored in a per-app directory under user's cache.
+
+        :param param_hash: Hash of parameters from :func:`mmif.utils.cli.describe.generate_param_hash`
+        :return: Path to the profile file
+        """
+        # Sanitize app identifier for filesystem use
+        app_id = self.metadata.identifier.replace('/', '-').replace(':', '-')
+        cache_base = pathlib.Path(os.environ.get('XDG_CACHE_HOME', pathlib.Path.home() / '.cache'))
+        cache_dir = cache_base / 'clams' / 'memory_profiles' / app_id
+        return cache_dir / f"memory_{param_hash}.json"
+
+    @staticmethod
+    def _get_available_vram() -> int:
+        """
+        Get currently available VRAM in bytes (GPU-wide, across all processes).
+
+        Uses nvidia-smi to get actual available memory, not just current process.
+
+        :return: Available VRAM in bytes, or 0 if unavailable
+        """
+        try:
+            import subprocess
+            import shutil
+            if shutil.which('nvidia-smi'):
+                # Get free memory from nvidia-smi (reports GPU-wide, not per-process)
+                result = subprocess.run(
+                    ['nvidia-smi', '--query-gpu=memory.free', '--format=csv,noheader,nounits', '-i', '0'],
+                    capture_output=True, text=True, timeout=5
+                )
+                if result.returncode == 0 and result.stdout.strip():
+                    free_mb = float(result.stdout.strip())
+                    return int(free_mb * 1024 * 1024)  # Convert MB to bytes
+        except Exception:
+            pass
+
+        # Fallback to torch (only sees current process memory)
+        try:
+            import torch  # pytype: disable=import-error
+            if not torch.cuda.is_available():
+                return 0
+
+            device = torch.cuda.current_device()
+            total = torch.cuda.get_device_properties(device).total_memory
+            used = max(torch.cuda.memory_allocated(device),
+                       torch.cuda.memory_reserved(device))
+            return total - used
+        except Exception:
+            return 0
+
+    def _record_vram_usage(self, parameters: dict, peak_bytes: int) -> None:
+        """
+        Record peak memory usage to profile file.
+
+        Uses atomic write (temp + rename) to avoid corruption from
+        concurrent writes. Only updates if new value is higher.
+
+        Profile files are JSON containing:
+        - peak_bytes: Peak VRAM usage by the torch process
+        - parameters: Original parameters for human readability
+
+        :param parameters: Request parameters (for hash and recording)
+        :param peak_bytes: Measured peak VRAM usage
+        """
+        import json
+
+        if peak_bytes <= 0:
+            return
+
+        param_hash = generate_param_hash(parameters)
+        profile_path = self._get_profile_path(param_hash)
+
+        try:
+            profile_path.parent.mkdir(parents=True, exist_ok=True)
+
+            # Check if we should update
+            should_write = True
+            if profile_path.exists():
+                try:
+                    existing_data = json.loads(profile_path.read_text())
+                    existing = existing_data.get('peak_bytes', 0)
+                    if peak_bytes <= existing:
+                        should_write = False  # Existing value is sufficient
+                    else:
+                        self.logger.debug(
+                            f"Updating peak memory for {param_hash}: "
+                            f"{existing/1024**3:.2f}GB -> {peak_bytes/1024**3:.2f}GB"
+                        )
+                except (ValueError, IOError, json.JSONDecodeError):
+                    pass  # Corrupted file, overwrite
+
+            if should_write:
+                # Prepare profile data with original parameters for readability
+                # Filter out internal keys and non-serializable values
+                clean_params = {
+                    k: v for k, v in parameters.items()
+                    if k != self._RAW_PARAMS_KEY and not k.startswith('#')
+                }
+                profile_data = {
+                    'peak_bytes': peak_bytes,
+                    'parameters': clean_params
+                }
+
+                # Atomic write: write to temp, then rename
+                temp_path = profile_path.with_suffix('.tmp')
+                temp_path.write_text(json.dumps(profile_data, indent=2))
+                temp_path.rename(profile_path)  # Atomic on POSIX
+
+                self.logger.info(
+                    f"Recorded peak memory for {param_hash}: "
+                    f"{peak_bytes/1024**3:.2f}GB"
+                )
+        except Exception as e:
+            self.logger.warning(f"Failed to record memory profile: {e}")
+
     @staticmethod
     def _profile_cuda_memory(func):
         """
-        Decorator for profiling CUDA memory usage during _annotate execution.
-
+        Decorator for profiling CUDA memory usage and managing VRAM availability.
+
+        This decorator:
+        1. Checks VRAM requirements before execution (if conditions met)
+        2. Rejects requests if insufficient VRAM
+        3. Records peak memory usage after execution
+        4. Calls empty_cache() for cleanup
+
         :param func: The function to wrap (typically _annotate)
         :return: Decorated function that returns (result, cuda_profiler)
                  where cuda_profiler is dict with "<GPU_NAME>, <GPU_TOTAL_MEMORY>" keys
-                 and peak memory usage values
+                 and dict values containing 'available_before' and 'peak' memory in bytes
         """
         def wrapper(*args, **kwargs):
+            # Get the ClamsApp instance from the bound method
+            app_instance = getattr(func, '__self__', None)
+
             cuda_profiler = {}
             torch_available = False
             cuda_available = False
             device_count = 0
-
+            available_before = {}
+
             try:
                 import torch  # pytype: disable=import-error
                 torch_available = True
                 cuda_available = torch.cuda.is_available()
                 device_count = torch.cuda.device_count()
-                if cuda_available:
-                    # Reset peak memory stats for all devices
-                    torch.cuda.reset_peak_memory_stats('cuda')
             except ImportError:
                 pass
-
+
+            # Capture available VRAM before execution and reset stats
+            if torch_available and cuda_available:
+                for device_id in range(device_count):
+                    device_id_str = f'cuda:{device_id}'
+                    # Get GPU-wide available memory via nvidia-smi
+                    try:
+                        import subprocess
+                        import shutil
+                        if shutil.which('nvidia-smi'):
+                            result = subprocess.run(
+                                ['nvidia-smi', '--query-gpu=memory.free',
+                                 '--format=csv,noheader,nounits', '-i', str(device_id)],
+                                capture_output=True, text=True, timeout=5
+                            )
+                            if result.returncode == 0 and result.stdout.strip():
+                                free_mb = float(result.stdout.strip())
+                                available_before[device_id] = int(free_mb * 1024 * 1024)
+                            else:
+                                # Fallback to torch (process-specific)
+                                total = torch.cuda.get_device_properties(device_id_str).total_memory
+                                allocated = torch.cuda.memory_allocated(device_id_str)
+                                available_before[device_id] = total - allocated
+                        else:
+                            # Fallback to torch (process-specific)
+                            total = torch.cuda.get_device_properties(device_id_str).total_memory
+                            allocated = torch.cuda.memory_allocated(device_id_str)
+                            available_before[device_id] = total - allocated
+                    except Exception:
+                        # Fallback to torch (process-specific)
+                        total = torch.cuda.get_device_properties(device_id_str).total_memory
+                        allocated = torch.cuda.memory_allocated(device_id_str)
+                        available_before[device_id] = total - allocated
+                # Reset peak memory stats for all devices
+                torch.cuda.reset_peak_memory_stats('cuda')
+
             try:
                 result = func(*args, **kwargs)
-
+
+                # Record peak memory usage
+                total_peak = 0
                 if torch_available and cuda_available and device_count > 0:
                     for device_id in range(device_count):
-                        device_id = f'cuda:{device_id}'
-                        peak_memory = torch.cuda.max_memory_allocated(device_id)
-                        gpu_name = torch.cuda.get_device_name(device_id)
-                        gpu_total_memory = torch.cuda.get_device_properties(device_id).total_memory
-                        key = ClamsApp._cuda_device_name_concat(gpu_name, gpu_total_memory)
-                        cuda_profiler[key] = peak_memory
-
+                        device_id_str = f'cuda:{device_id}'
+                        peak_memory = torch.cuda.max_memory_allocated(device_id_str)
+                        total_peak = max(total_peak, peak_memory)
+                        gpu_name = torch.cuda.get_device_name(device_id_str)
+                        gpu_total_memory = torch.cuda.get_device_properties(device_id_str).total_memory
+                        cuda_profiler[gpu_name] = {
+                            'total': gpu_total_memory,
+                            'available_before': available_before.get(device_id, 0),
+                            'peak': peak_memory
+                        }
+
+                # Record peak memory for future requests (if GPU app)
+                gpu_app = (
+                    hasattr(app_instance, 'metadata') and
+                    getattr(app_instance.metadata, 'est_gpu_mem_min', 0) > 0
+                )
+                if gpu_app and total_peak > 0:
+                    app_instance._record_vram_usage(kwargs, total_peak)
+
                 return result, cuda_profiler
             finally:
                 if torch_available and cuda_available:
                     torch.cuda.empty_cache()
-
+
         return wrapper
 
     @staticmethod
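The new profile-cache helpers above write one JSON file per app and parameter hash under the user's cache directory. A small sketch that reproduces the path logic of `_get_profile_path` for a hypothetical app identifier and hash (real hashes come from `generate_param_hash`, and the file contents follow `_record_vram_usage`):

```python
import os
import pathlib

app_identifier = 'http://apps.clams.ai/some-app/v1.0'   # hypothetical app identifier
param_hash = '0a1b2c3d'                                  # hypothetical parameter hash

# Same sanitization and layout as _get_profile_path above
app_id = app_identifier.replace('/', '-').replace(':', '-')
cache_base = pathlib.Path(os.environ.get('XDG_CACHE_HOME', pathlib.Path.home() / '.cache'))
profile_path = cache_base / 'clams' / 'memory_profiles' / app_id / f'memory_{param_hash}.json'
print(profile_path)
# e.g. ~/.cache/clams/memory_profiles/http---apps.clams.ai-some-app-v1.0/memory_0a1b2c3d.json

# _record_vram_usage would then write something like (illustrative values):
# {"peak_bytes": 3435973837, "parameters": {"pretty": false, "hwFetch": true}}
```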

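The decorator itself is internal plumbing, but its contract is simple: wrap a callable and always return a `(result, cuda_profiler)` pair, where the profiler dict is empty when torch or CUDA is unavailable and otherwise maps GPU names to `total` / `available_before` / `peak` byte counts. A simplified, hypothetical sketch that calls the wrapper directly on a toy function (in the SDK it is applied to `_annotate`-style bound methods):

```python
from clams.app import ClamsApp

def toy_annotate(mmif, **params):
    # Stand-in for a real _annotate implementation; returns its input untouched.
    return mmif

# Wrapping a plain function: __self__ is absent, so the profile-recording branch
# is skipped; on a CUDA machine the profiler dict is still populated per GPU.
wrapped = ClamsApp._profile_cuda_memory(toy_annotate)
result, profiler = wrapped('fake-mmif', pretty=False)
print(result)    # 'fake-mmif'
print(profiler)  # {} without CUDA, or {'<GPU name>': {'total': ..., 'available_before': ..., 'peak': ...}}
```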
clams/appmetadata/__init__.py

Lines changed: 27 additions & 1 deletion
@@ -352,9 +352,20 @@ class AppMetadata(pydantic.BaseModel):
         "a package name and its version in the string value at the minimum (e.g., ``clams-python==1.2.3``)."
     )
     more: Optional[Dict[str, str]] = pydantic.Field(
-        None,
+        None,
         description="(optional) A string-to-string map that can be used to store any additional metadata of the app."
     )
+    est_gpu_mem_min: int = pydantic.Field(
+        0,
+        description="(optional) Minimum GPU memory required to run the app, in megabytes (MB). "
+                    "Set to 0 (default) if the app does not use GPU."
+    )
+    est_gpu_mem_typ: int = pydantic.Field(
+        0,
+        description="(optional) Typical GPU memory usage for default parameters, in megabytes (MB). "
+                    "Must be equal or larger than est_gpu_mem_min. "
+                    "Set to 0 (default) if the app does not use GPU."
+    )
 
     model_config = {
         'title': 'CLAMS AppMetadata',
@@ -372,6 +383,21 @@ def assign_versions(cls, data):
         data.mmif_version = get_mmif_specver()
         return data
 
+    @pydantic.model_validator(mode='after')
+    @classmethod
+    def validate_gpu_memory(cls, data):
+        import warnings
+        if data.est_gpu_mem_typ > 0 and data.est_gpu_mem_min > 0:
+            if data.est_gpu_mem_typ < data.est_gpu_mem_min:
+                warnings.warn(
+                    f"est_gpu_mem_typ ({data.est_gpu_mem_typ} MB) is less than "
+                    f"est_gpu_mem_min ({data.est_gpu_mem_min} MB). "
+                    f"Setting est_gpu_mem_typ to {data.est_gpu_mem_min} MB.",
+                    UserWarning
+                )
+                data.est_gpu_mem_typ = data.est_gpu_mem_min
+        return data
+
     @pydantic.field_validator('identifier', mode='before')
     @classmethod
     def append_version(cls, val):
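A hedged sketch, assuming the usual required `AppMetadata` fields, showing how the new estimates are declared and how `validate_gpu_memory` coerces an inconsistent pair; every field value below is illustrative, not from the repository:

```python
import warnings
from clams.appmetadata import AppMetadata

with warnings.catch_warnings(record=True) as caught:
    warnings.simplefilter('always')
    md = AppMetadata(
        name='Hypothetical GPU App',          # illustrative metadata values
        description='demo of the new GPU memory estimate fields',
        app_license='Apache 2.0',
        identifier='hypothetical-gpu-app',
        url='https://example.org/hypothetical-gpu-app',
        est_gpu_mem_min=4096,                 # app needs at least ~4 GB of VRAM
        est_gpu_mem_typ=2048,                 # inconsistent: lower than the stated minimum
    )

# The after-validator warns and raises est_gpu_mem_typ to match est_gpu_mem_min.
assert md.est_gpu_mem_typ == md.est_gpu_mem_min == 4096
assert any(issubclass(w.category, UserWarning) for w in caught)
```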

clams/develop/templates/app/metadata.py.template

Lines changed: 3 additions & 0 deletions
@@ -39,6 +39,9 @@ def appmetadata() -> AppMetadata:
         # this trick can also be useful (replace ANALYZER_NAME with the pypi dist name)
         analyzer_version=[l.strip().rsplit('==')[-1] for l in open(pathlib.Path(__file__).parent / 'requirements.txt').readlines() if re.match(r'^ANALYZER_NAME==', l)][0],
         analyzer_license="",  # short name for a software license
+        # GPU memory estimates (in MB). Set to 0 if the app does not use GPU.
+        est_gpu_mem_min=0,  # estimated memory usage with minimal computation parameters
+        est_gpu_mem_typ=0,  # estimated memory usage with default parameters, must be >= est_gpu_mem_min
     )
     # and then add I/O specifications: an app must have at least one input and one output
     metadata.add_input(DocumentTypes.Document)
