diff --git a/parsec/data.c b/parsec/data.c
index fa4fe9d2a..ffeb78636 100644
--- a/parsec/data.c
+++ b/parsec/data.c
@@ -96,7 +96,6 @@ static void parsec_data_destruct(parsec_data_t* obj )
                              copy, copy->original, i);
             }
 #endif /* defined(PARSEC_DEBUG_PARANOID) */
-            assert(obj->super.obj_reference_count > 1);
             parsec_data_copy_detach( obj, copy, i );
             if ( !(device->type & PARSEC_DEV_CUDA) &&
                  !(device->type & PARSEC_DEV_HIP) ) {
@@ -200,7 +199,11 @@ int parsec_data_copy_detach(parsec_data_t* data,
     copy->original     = NULL;
     copy->older        = NULL;

-    PARSEC_OBJ_RELEASE(data);
+    /* if the host copy is discarded it has already released its reference so
+     * we do not release the data again */
+    if (!(copy->flags & PARSEC_DATA_FLAG_DISCARDED)) {
+        PARSEC_OBJ_RELEASE(data);
+    }

     return PARSEC_SUCCESS;
 }
@@ -565,3 +568,61 @@ parsec_data_destroy( parsec_data_t *data )
 #endif
     PARSEC_OBJ_RELEASE(data);
 }
+
+void
+parsec_data_discard( parsec_data_t *data )
+{
+    /* defensive */
+    if (NULL == data) return;
+
+    /* lock the data so it's safe to touch the flags */
+    parsec_atomic_lock( &data->lock );
+
+    /**
+     * Mark the host copy as discarded
+     *
+     * We mark the host copy as having given up its reference to the data_t
+     * so when the data_t is destroyed (parsec_data_destruct) and
+     * the host copy is being detached we don't release the copy's reference
+     * on the data_t again. We have to release the copy's reference here
+     * to break the cyclic dependency between the copy and the data_t.
+     * We cannot release the copy immediately as there may be device management
+     * threads working with it, e.g., evicting data into it.
+     * */
+    parsec_data_copy_t *cpu_copy = data->device_copies[0];
+    if (NULL != cpu_copy) {
+        cpu_copy->flags |= PARSEC_DATA_FLAG_DISCARDED;
+
+        /* release the reference that the host copy had on the data_t to break
+         * the circular reference. */
+        PARSEC_OBJ_RELEASE(data);
+    }
+
+    /**
+     * Tell the devices that they have discarded data.
+     */
+    for (uint32_t i = 1; i < parsec_nb_devices; i++) {
+        if (parsec_mca_device_is_gpu(i)) {
+            parsec_data_copy_t *device_copy = data->device_copies[i];
+            if (NULL != device_copy) {
+                parsec_device_module_t* device = parsec_mca_device_get(i);
+                if (NULL != device) {
+                    parsec_atomic_fetch_inc_int64(&device->nb_discarded);
+                }
+            }
+        }
+    }
+
+    /* unlock before releasing our references */
+    parsec_atomic_unlock( &data->lock );
+
+    /* release the reference the application held */
+    PARSEC_OBJ_RELEASE(data);
+
+    /* From here, any device copy that is still attached to the data_t
+     * can continue to use the host copy and once all device copies are
+     * detached the data_t and the host copy are destroyed.
+     * If there were no device copies then the release above will
+     * have destroyed the data_t already. */
+
+}
diff --git a/parsec/data.h b/parsec/data.h
index b4989fac2..22f6456ba 100644
--- a/parsec/data.h
+++ b/parsec/data.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2012-2021 The University of Tennessee and The University
+ * Copyright (c) 2012-2025 The University of Tennessee and The University
  *                         of Tennessee Research Foundation.  All rights
  *                         reserved.
  */
@@ -55,6 +55,7 @@ typedef uint8_t parsec_data_status_t;
 typedef uint8_t parsec_data_flag_t;
 #define PARSEC_DATA_FLAG_ARENA           ((parsec_data_flag_t)1<<0)
 #define PARSEC_DATA_FLAG_TRANSIT         ((parsec_data_flag_t)1<<1)
+#define PARSEC_DATA_FLAG_DISCARDED       ((parsec_data_flag_t)1<<2)
 #define PARSEC_DATA_FLAG_EVICTED         ((parsec_data_flag_t)1<<5)
 #define PARSEC_DATA_FLAG_PARSEC_MANAGED  ((parsec_data_flag_t)1<<6)
 #define PARSEC_DATA_FLAG_PARSEC_OWNED    ((parsec_data_flag_t)1<<7)
@@ -148,6 +149,17 @@ parsec_data_create_with_type( parsec_data_collection_t *desc,
 PARSEC_DECLSPEC void
 parsec_data_destroy( parsec_data_t *holder );

+/**
+ * Mark the parsec_data_t and its host copy as discarded.
+ * Any host-side copies will remain allocated as long as
+ * there are potential device copies referencing it.
+ * Once all device copies have been released the host
+ * copy and the data_t are destroyed.
+ * The parsec_data_t must not be used after this call.
+ */
+PARSEC_DECLSPEC void
+parsec_data_discard( parsec_data_t *data );
+
 END_C_DECLS

 /** @} */
diff --git a/parsec/mca/device/cuda/device_cuda_component.c b/parsec/mca/device/cuda/device_cuda_component.c
index 3529917b0..fd9788f94 100644
--- a/parsec/mca/device/cuda/device_cuda_component.c
+++ b/parsec/mca/device/cuda/device_cuda_component.c
@@ -161,6 +161,9 @@ static int device_cuda_component_register(void)
     (void)parsec_mca_param_reg_int_name("device_cuda", "max_number_of_ejected_data",
                                         "Sets up the maximum number of blocks that can be ejected from GPU memory",
                                         false, false, MAX_PARAM_COUNT, &parsec_gpu_d2h_max_flows);
+    (void)parsec_mca_param_reg_int_name("device_cuda", "max_number_of_discarded_data",
+                                        "Sets up the maximum number of discarded blocks to be collected at once",
+                                        false, false, MAX_PARAM_COUNT, &parsec_gpu_d2h_max_discarded);
     (void)parsec_mca_param_reg_int_name("device_cuda", "max_streams",
                                         "Maximum number of Streams to use for the GPU engine; 2 streams are used for communication between host and device, so the minimum is 3",
                                         false, false, PARSEC_GPU_MAX_STREAMS, &parsec_cuda_max_streams);
diff --git a/parsec/mca/device/device.h b/parsec/mca/device/device.h
index 48ee9eab0..3c4201fea 100644
--- a/parsec/mca/device/device.h
+++ b/parsec/mca/device/device.h
@@ -166,6 +166,7 @@ struct parsec_device_module_s {
     uint64_t executed_tasks;
     uint64_t nb_data_faults;
     uint64_t nb_evictions;
+    volatile int64_t nb_discarded;  /**< Track number of discarded data copies on this device */
     /* We provide the compute capacity of the device in GFlop/s so that conversion to #nanosec in load estimates is straightforward */
     /* These compute capacities can be useful for users when providing their own
      * time_estimate functions: the user can divide the number of flops for the
diff --git a/parsec/mca/device/device_gpu.c b/parsec/mca/device/device_gpu.c
index 67903d51f..1691c9621 100644
--- a/parsec/mca/device/device_gpu.c
+++ b/parsec/mca/device/device_gpu.c
@@ -400,6 +400,34 @@ static parsec_task_class_t parsec_device_data_prefetch_tc = {
     .fini = NULL
 };

+/**
+ * Release a gpu copy to the zone allocator.
+ */
+void parsec_device_release_gpu_copy(parsec_device_gpu_module_t* gpu_device, parsec_data_copy_t *gpu_elem)
+{
+#if !defined(PARSEC_GPU_ALLOC_PER_TILE)
+#if defined(PARSEC_PROF_TRACE)
+    if((gpu_device->trackable_events & PARSEC_PROFILE_GPU_TRACK_MEM_USE) &&
+       (gpu_device->exec_stream[0]->prof_event_track_enable ||
+        gpu_device->exec_stream[1]->prof_event_track_enable)) {
+        parsec_profiling_trace_flags(gpu_device->exec_stream[0]->profiling,
+                                     parsec_gpu_free_memory_key, (int64_t)gpu_elem->device_private,
+                                     gpu_device->super.device_index,
+                                     NULL, PARSEC_PROFILING_EVENT_COUNTER);
+        parsec_profiling_trace_flags(gpu_device->exec_stream[0]->profiling,
+                                     parsec_gpu_use_memory_key_end,
+                                     (uint64_t)gpu_elem->device_private,
+                                     gpu_device->super.device_index, NULL, 0);
+    }
+#endif // PARSEC_PROF_TRACE
+    assert( 0 != (gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) );
+    zone_free( gpu_device->memory, (void*)(gpu_elem->device_private) );
+    gpu_elem->device_private = NULL;
+    PARSEC_OBJ_RELEASE(gpu_elem);
+    assert( NULL == gpu_elem );
+#endif // PARSEC_GPU_ALLOC_PER_TILE
+}
+
 static int
 parsec_device_release_resources_prefetch_task(parsec_device_gpu_module_t* gpu_device,
                                               parsec_gpu_task_t** out_task)
@@ -719,6 +747,49 @@ parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device,
     return PARSEC_SUCCESS;
 }

+/**
+ * Release discarded data copies from the LRU list.
+ * Returns the number of discarded items released.
+ */
+static int parsec_device_memory_release_discarded(parsec_device_gpu_module_t* gpu_device,
+                                                  parsec_list_t* list)
+{
+    parsec_list_item_t* item;
+    parsec_list_item_t* ring = NULL;
+    int count = 0;
+
+    if (gpu_device->super.nb_discarded == 0) {
+        return 0;
+    }
+
+    while (NULL != (item = parsec_list_pop_front(list))) {
+        parsec_gpu_data_copy_t* gpu_copy = (parsec_gpu_data_copy_t*)item;
+        parsec_data_t* original = gpu_copy->original;
+        if (NULL != original) {
+            parsec_data_copy_t *cpu_copy = original->device_copies[0];
+            parsec_list_item_ring_chop((parsec_list_item_t*)gpu_copy);
+            PARSEC_LIST_ITEM_SINGLETON(item);
+
+            if (cpu_copy->flags & PARSEC_DATA_FLAG_DISCARDED) {
+                count++;
+                parsec_device_release_gpu_copy(gpu_device, gpu_copy);
+                PARSEC_DEBUG_VERBOSE(30, parsec_gpu_output_stream,
+                                     "Releasing discarded GPU copy %p from data %p", gpu_copy, original);
+            } else if (ring == NULL) {
+                ring = item;
+            } else {
+                parsec_list_item_ring_push(ring, item);
+            }
+        }
+    }
+    /* put the ring back into the list */
+    if (NULL != ring) {
+        parsec_list_chain_front(list, ring);
+    }
+    parsec_atomic_fetch_sub_int64(&gpu_device->super.nb_discarded, count);
+    return count;
+}
+
 static void parsec_device_memory_release_list(parsec_device_gpu_module_t* gpu_device,
                                               parsec_list_t* list)
 {
@@ -733,8 +804,8 @@ static void parsec_device_memory_release_list(parsec_device_gpu_module_t* gpu_de
                              gpu_device->super.device_index, gpu_device->super.name, gpu_copy, gpu_copy->device_private, gpu_copy->super.super.obj_reference_count,
                              original, (NULL != original ? original->dc : NULL));
         assert( gpu_copy->device_index == gpu_device->super.device_index );
-
-        if( PARSEC_DATA_COHERENCY_OWNED == gpu_copy->coherency_state ) {
+        /* warn about device data that has not been pushed back to the host or was discarded */
+        if( PARSEC_DATA_COHERENCY_OWNED == gpu_copy->coherency_state) {
             parsec_warning("GPU[%d:%s] still OWNS the master memory copy for data %d and it is discarding it!",
                            gpu_device->super.device_index, gpu_device->super.name, original->key);
         }
@@ -742,24 +813,6 @@ static void parsec_device_memory_release_list(parsec_device_gpu_module_t* gpu_de

 #if defined(PARSEC_GPU_ALLOC_PER_TILE)
         gpu_device->memory_free( gpu_copy->device_private );
-#else
-
-#if defined(PARSEC_PROF_TRACE)
-        if((gpu_device->trackable_events & PARSEC_PROFILE_GPU_TRACK_MEM_USE) &&
-           (gpu_device->exec_stream[0]->prof_event_track_enable ||
-            gpu_device->exec_stream[1]->prof_event_track_enable)) {
-            parsec_profiling_trace_flags(gpu_device->exec_stream[0]->profiling,
-                                         parsec_gpu_free_memory_key, (int64_t)gpu_copy->device_private,
-                                         gpu_device->super.device_index,
-                                         NULL, PARSEC_PROFILING_EVENT_COUNTER);
-            parsec_profiling_trace_flags(gpu_device->exec_stream[0]->profiling,
-                                         parsec_gpu_use_memory_key_end,
-                                         (uint64_t)gpu_copy->device_private,
-                                         gpu_device->super.device_index, NULL, 0);
-        }
-#endif
-        zone_free( gpu_device->memory, (void*)gpu_copy->device_private );
-#endif
         gpu_copy->device_private = NULL;

         /* At this point the data copies should have no attachment to a data_t. Thus,
@@ -767,6 +820,9 @@ static void parsec_device_memory_release_list(parsec_device_gpu_module_t* gpu_de
          * collection must have been called, releasing all the copies. */
         PARSEC_OBJ_RELEASE(gpu_copy);
         assert(NULL == gpu_copy);
+#else
+        parsec_device_release_gpu_copy(gpu_device, gpu_copy);
+#endif
     }
 }

@@ -778,7 +834,11 @@ parsec_device_flush_lru( parsec_device_module_t *device )
 {
     size_t in_use;
     parsec_device_gpu_module_t *gpu_device = (parsec_device_gpu_module_t*)device;
-    /* Free all memory on GPU */
+    /* Remove discarded data copies */
+    parsec_device_memory_release_discarded(gpu_device, &gpu_device->gpu_mem_lru);
+    parsec_device_memory_release_discarded(gpu_device, &gpu_device->gpu_mem_owned_lru);
+    assert(gpu_device->super.nb_discarded == 0);
+    /* Free all remaining memory on GPU */
     parsec_device_memory_release_list(gpu_device, &gpu_device->gpu_mem_lru);
     parsec_device_memory_release_list(gpu_device, &gpu_device->gpu_mem_owned_lru);
     parsec_device_free_workspace(gpu_device);
@@ -941,6 +1001,12 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device,
 #endif
                 parsec_atomic_unlock(&master->lock);
                 return PARSEC_HOOK_RETURN_NEXT;
+            } else if (NULL != lru_gpu_elem->original) {
+                /* account for discarded data */
+                parsec_data_copy_t* cpu_copy = lru_gpu_elem->original->device_copies[0];
+                if (cpu_copy->flags & PARSEC_DATA_FLAG_DISCARDED) {
+                    parsec_atomic_fetch_dec_int64(&gpu_device->super.nb_discarded);
+                }
             }

             PARSEC_LIST_ITEM_SINGLETON(lru_gpu_elem);
@@ -1069,34 +1135,12 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device,
 #if !defined(PARSEC_GPU_ALLOC_PER_TILE)
                 /* Let's free this space, and try again to malloc some space */
                 PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
-                                     "GPU[%d:%s] Release GPU copy %p (device_ptr %p) [ref_count %d: must be 1], attached to %p",
-                                     gpu_device->super.device_index, gpu_device->super.name,
+                                     "GPU[%d:%s]:%s Release GPU copy %p (device_ptr %p) [ref_count %d: must be 1], attached to %p",
+                                     gpu_device->super.device_index, gpu_device->super.name, task_name,
                                      lru_gpu_elem, lru_gpu_elem->device_private,
                                      lru_gpu_elem->super.super.obj_reference_count, oldmaster);
-#if defined(PARSEC_PROF_TRACE)
-                if((gpu_device->trackable_events & PARSEC_PROFILE_GPU_TRACK_MEM_USE) &&
-                   (gpu_device->exec_stream[0]->prof_event_track_enable ||
-                    gpu_device->exec_stream[1]->prof_event_track_enable)) {
-                    parsec_profiling_trace_flags(gpu_device->exec_stream[0]->profiling,
-                                                 parsec_gpu_free_memory_key, (int64_t)lru_gpu_elem->device_private,
-                                                 gpu_device->super.device_index,
-                                                 NULL, PARSEC_PROFILING_EVENT_COUNTER);
-                    parsec_profiling_trace_flags(gpu_device->exec_stream[0]->profiling,
-                                                 parsec_gpu_use_memory_key_end,
-                                                 (uint64_t)lru_gpu_elem->device_private,
-                                                 gpu_device->super.device_index, NULL, 0);
-                }
-#endif
-                assert( 0 != (lru_gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) );
-                zone_free( gpu_device->memory, (void*)(lru_gpu_elem->device_private) );
-                lru_gpu_elem->device_private = NULL;
+                parsec_device_release_gpu_copy(gpu_device, lru_gpu_elem);
                 data_avail_epoch++;
-                PARSEC_DEBUG_VERBOSE(30, parsec_gpu_output_stream,
-                                     "GPU[%d:%s]:%s: Release LRU-retrieved GPU copy %p [ref_count %d: must be 1]",
-                                     gpu_device->super.device_index, gpu_device->super.name, task_name,
-                                     lru_gpu_elem, lru_gpu_elem->super.super.obj_reference_count);
-                PARSEC_OBJ_RELEASE(lru_gpu_elem);
-                assert( NULL == lru_gpu_elem );
                 goto malloc_data;
             }
             PARSEC_DEBUG_VERBOSE(30, parsec_gpu_output_stream,
@@ -2388,7 +2432,18 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device,

         assert( 0 <= gpu_copy->readers );

-        if( gpu_task->pushout & (1 << i) ) {
+        if (cpu_copy->flags & PARSEC_DATA_FLAG_DISCARDED) {
+            PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
+                                 "GPU[%d:%s] Release GPU copy %p (device_ptr %p) [ref_count %d: must be 1], attached to %p",
+                                 gpu_device->super.device_index, gpu_device->super.name,
+                                 gpu_copy, gpu_copy->device_private, gpu_copy->super.super.obj_reference_count,
+                                 original);
+            parsec_list_item_ring_chop((parsec_list_item_t*)gpu_copy);
+            PARSEC_LIST_ITEM_SINGLETON(gpu_copy);
+            /* release the gpu copy and account for the discarded data */
+            parsec_device_release_gpu_copy(gpu_device, gpu_copy);
+            parsec_atomic_fetch_dec_int64(&gpu_device->super.nb_discarded);
+        } else if( gpu_task->pushout & (1 << i)) {
             PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
                                  "GPU copy %p [ref_count %d] moved to the read LRU in %s",
                                  gpu_copy, gpu_copy->super.super.obj_reference_count, __func__);
@@ -2574,6 +2629,11 @@ parsec_device_kernel_scheduler( parsec_device_module_t *module,
     }
     assert(NULL == progress_task);

+    /* try to release all discarded copies and try again if successful */
+    if (0 < parsec_device_memory_release_discarded(gpu_device, &gpu_device->gpu_mem_owned_lru)) {
+        goto check_in_deps;
+    }
+
     /* TODO: check this */
     /* If we can extract data go for it, otherwise try to drain the pending tasks */
     gpu_task = parsec_gpu_create_w2r_task(gpu_device, es);
diff --git a/parsec/mca/device/device_gpu.h b/parsec/mca/device/device_gpu.h
index fa25b87a3..74cab4e65 100644
--- a/parsec/mca/device/device_gpu.h
+++ b/parsec/mca/device/device_gpu.h
@@ -279,6 +279,7 @@ typedef struct parsec_gpu_workspace_s {
 PARSEC_DECLSPEC extern int parsec_gpu_output_stream;
 PARSEC_DECLSPEC extern int parsec_gpu_verbosity;
 PARSEC_DECLSPEC extern int32_t parsec_gpu_d2h_max_flows;
+PARSEC_DECLSPEC extern int32_t parsec_gpu_d2h_max_discarded;

 /**
  * Debugging functions.
@@ -352,6 +353,12 @@ int parsec_device_data_advise(parsec_device_module_t *dev, parsec_data_t *data,
 int parsec_device_flush_lru( parsec_device_module_t *device );
 int parsec_device_memory_release( parsec_device_gpu_module_t* gpu_device );

+
+/**
+ * Release a gpu copy and return its memory to the zone allocator of the device.
+ */
+void parsec_device_release_gpu_copy(parsec_device_gpu_module_t* gpu_device, parsec_data_copy_t *gpu_elem);
+
 /**
  * This version is based on 4 streams: one for transfers from the memory to
  * the GPU, 2 for kernel executions and one for transfers from the GPU into
diff --git a/parsec/mca/device/level_zero/device_level_zero_component.c b/parsec/mca/device/level_zero/device_level_zero_component.c
index f50f2a817..4407cfa31 100644
--- a/parsec/mca/device/level_zero/device_level_zero_component.c
+++ b/parsec/mca/device/level_zero/device_level_zero_component.c
@@ -271,6 +271,9 @@ static int device_level_zero_component_register(void)
     (void)parsec_mca_param_reg_int_name("device_level_zero", "max_number_of_ejected_data",
                                         "Sets up the maximum number of blocks that can be ejected from GPU memory",
                                         false, false, MAX_PARAM_COUNT, &parsec_gpu_d2h_max_flows);
+    (void)parsec_mca_param_reg_int_name("device_level_zero", "max_number_of_discarded_data",
+                                        "Sets up the maximum number of discarded blocks to be collected at once",
+                                        false, false, MAX_PARAM_COUNT, &parsec_gpu_d2h_max_discarded);
     (void)parsec_mca_param_reg_int_name("device_level_zero", "max_streams",
                                         "Maximum number of Streams to use for the GPU engine; 2 streams are used for communication between host and device, so the minimum is 3",
                                         false, false, PARSEC_GPU_MAX_STREAMS, &parsec_level_zero_max_streams);
diff --git a/parsec/mca/device/transfer_gpu.c b/parsec/mca/device/transfer_gpu.c
index 50b0d886e..1e90366c5 100644
--- a/parsec/mca/device/transfer_gpu.c
+++ b/parsec/mca/device/transfer_gpu.c
@@ -179,6 +179,7 @@ static const parsec_symbol_t symb_gpu_d2h_task_param = {
 };

 int32_t parsec_gpu_d2h_max_flows = 0;
+int32_t parsec_gpu_d2h_max_discarded = 0;

 static const parsec_task_class_t parsec_gpu_d2h_task_class = {
     .name = "GPU D2H data transfer",
@@ -215,6 +216,16 @@ static const parsec_task_class_t parsec_gpu_d2h_task_class = {
 #endif
 };

+static inline void release_discarded_data(parsec_device_gpu_module_t *gpu_device, parsec_gpu_data_copy_t* gpu_copy)
+{
+    parsec_list_item_ring_chop((parsec_list_item_t*)gpu_copy);
+    PARSEC_LIST_ITEM_SINGLETON(gpu_copy);
+    PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,
+                         "D2H[%d:%s] GPU data copy %p of discarded data %p will be released",
+                         gpu_device->super.device_index, gpu_device->super.name, gpu_copy, gpu_copy->original);
+    parsec_device_release_gpu_copy(gpu_device, gpu_copy);
+}
+
 /**
  * Transfer at most the MAX_PARAM_COUNT oldest data from the GPU back
@@ -227,7 +238,7 @@ parsec_gpu_create_w2r_task(parsec_device_gpu_module_t *gpu_device,
 {
     parsec_gpu_task_t *w2r_task = NULL;
     parsec_gpu_d2h_task_t *d2h_task = NULL;
-    parsec_gpu_data_copy_t *gpu_copy;
+    parsec_gpu_data_copy_t *gpu_copy, *cpu_copy;
     parsec_list_item_t* item = (parsec_list_item_t*)gpu_device->gpu_mem_owned_lru.ghost_element.list_next;
     int nb_cleaned = 0;

@@ -239,10 +250,19 @@ parsec_gpu_create_w2r_task(parsec_device_gpu_module_t *gpu_device,
             break;
         }
         gpu_copy = (parsec_gpu_data_copy_t*)item;
+        cpu_copy = gpu_copy->original->device_copies[0];
         parsec_atomic_lock( &gpu_copy->original->lock );
         /* get the next item before altering the next pointer */
         item = (parsec_list_item_t*)item->list_next;  /* conversion needed for volatile */
-        if( 0 == gpu_copy->readers ) {
+        if (cpu_copy->flags & PARSEC_DATA_FLAG_DISCARDED) {
+            parsec_list_item_ring_chop((parsec_list_item_t*)gpu_copy);
+            PARSEC_LIST_ITEM_SINGLETON(gpu_copy);
+            PARSEC_DEBUG_VERBOSE(10, parsec_gpu_output_stream,
+                                 "D2H[%d:%s] GPU data copy %p of discarded data %p will be released",
+                                 gpu_device->super.device_index, gpu_device->super.name, gpu_copy, gpu_copy->original);
+            parsec_atomic_unlock( &gpu_copy->original->lock );
+            parsec_device_release_gpu_copy(gpu_device, gpu_copy);
+        } else if( 0 == gpu_copy->readers ) {
             if( PARSEC_UNLIKELY(NULL == d2h_task) ) { /* allocate on-demand */
                 d2h_task = (parsec_gpu_d2h_task_t*)parsec_thread_mempool_allocate(es->context_mempool);
                 if( PARSEC_UNLIKELY(NULL == d2h_task) ) { /* we're running out of memory. Bail out. */