Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
65 changes: 63 additions & 2 deletions parsec/data.c
Original file line number Diff line number Diff line change
Expand Up @@ -96,7 +96,6 @@ static void parsec_data_destruct(parsec_data_t* obj )
copy, copy->original, i);
}
#endif /* defined(PARSEC_DEBUG_PARANOID) */
assert(obj->super.obj_reference_count > 1);
parsec_data_copy_detach( obj, copy, i );
if ( !(device->type & PARSEC_DEV_CUDA)
&& !(device->type & PARSEC_DEV_HIP) ) {
Expand Down Expand Up @@ -200,7 +199,11 @@ int parsec_data_copy_detach(parsec_data_t* data,

copy->original = NULL;
copy->older = NULL;
PARSEC_OBJ_RELEASE(data);
/* if the host copy is discarded it has already released its reference so
* we do not release the data again */
if (!(copy->flags & PARSEC_DATA_FLAG_DISCARDED)) {
PARSEC_OBJ_RELEASE(data);
}

return PARSEC_SUCCESS;
}
Expand Down Expand Up @@ -565,3 +568,61 @@ parsec_data_destroy( parsec_data_t *data )
#endif
PARSEC_OBJ_RELEASE(data);
}

/**
 * Mark a parsec_data_t as discarded by the application.
 *
 * The host (CPU) copy is flagged PARSEC_DATA_FLAG_DISCARDED and its
 * reference on the data_t is dropped here, which breaks the cyclic
 * reference between the copy and the data_t. Each GPU device that still
 * holds a copy of this data has its nb_discarded counter incremented so
 * the device management code can later reclaim those copies.
 *
 * @param data the data to discard; may be NULL (no-op).
 *
 * NOTE(review): the caller's own reference is released at the end of this
 * function — the parsec_data_t must not be used after this call.
 */
void
parsec_data_discard( parsec_data_t *data )
{
    /* defensive: discarding a NULL data is a no-op */
    if (NULL == data) return;

    /* lock the data so it's safe to touch the flags */
    parsec_atomic_lock( &data->lock );

    /**
     * Mark the host copy as discarded
     *
     * We mark the host copy as having given up its reference to the data_t
     * so when the data_t is destroyed (parsec_data_destruct) and
     * the host copy is being detached we don't release the copy's reference
     * on the data_t again. We have to release the copy's reference here
     * to break the cyclic dependency between the copy and the data_t.
     * We cannot release the copy immediately as there may be device management
     * threads working with it, e.g., evicting data into it.
     * */
    parsec_data_copy_t *cpu_copy = data->device_copies[0];
    if (NULL != cpu_copy) {
        cpu_copy->flags |= PARSEC_DATA_FLAG_DISCARDED;

        /* release the reference that the host copy had on the data_t to break
         * the circular reference. */
        PARSEC_OBJ_RELEASE(data);
    }

    /**
     * Tell the devices that they have discarded data.
     * Device index 0 is the host; only GPU devices track discarded copies.
     */
    for (uint32_t i = 1; i < parsec_nb_devices; i++) {
        if (parsec_mca_device_is_gpu(i)) {
            parsec_data_copy_t *device_copy = data->device_copies[i];
            if (NULL != device_copy) {
                parsec_device_module_t* device = parsec_mca_device_get(i);
                if (NULL != device) {
                    /* counter is decremented again when the device copy is
                     * actually released (reserve_space / epilog / LRU sweep) */
                    parsec_atomic_fetch_inc_int64(&device->nb_discarded);
                }
            }
        }
    }

    /* unlock before releasing our references */
    parsec_atomic_unlock( &data->lock );

    /* release the reference the application held */
    PARSEC_OBJ_RELEASE(data);

    /* From here, any device copy that is still attached to the data_t
     * can continue to use the host copy and once all device copies are
     * detached the data_t and the host copy are destroyed.
     * If there were no device copies then the release above will
     * have destroyed the data_t already. */

}
14 changes: 13 additions & 1 deletion parsec/data.h
Original file line number Diff line number Diff line change
@@ -1,5 +1,5 @@
/*
* Copyright (c) 2012-2021 The University of Tennessee and The University
* Copyright (c) 2012-2025 The University of Tennessee and The University
* of Tennessee Research Foundation. All rights
* reserved.
*/
Expand Down Expand Up @@ -55,6 +55,7 @@ typedef uint8_t parsec_data_status_t;
typedef uint8_t parsec_data_flag_t;
#define PARSEC_DATA_FLAG_ARENA ((parsec_data_flag_t)1<<0)
#define PARSEC_DATA_FLAG_TRANSIT ((parsec_data_flag_t)1<<1)
#define PARSEC_DATA_FLAG_DISCARDED ((parsec_data_flag_t)1<<2)
#define PARSEC_DATA_FLAG_EVICTED ((parsec_data_flag_t)1<<5)
#define PARSEC_DATA_FLAG_PARSEC_MANAGED ((parsec_data_flag_t)1<<6)
#define PARSEC_DATA_FLAG_PARSEC_OWNED ((parsec_data_flag_t)1<<7)
Expand Down Expand Up @@ -148,6 +149,17 @@ parsec_data_create_with_type( parsec_data_collection_t *desc,
PARSEC_DECLSPEC void
parsec_data_destroy( parsec_data_t *holder );

/**
* Mark the parsec_data_t and its host copy as discarded.
* Any host-side copies will remain allocated as long as
* there are potential device copies referencing it.
* Once all device copies have been released the host
* copy and the data_t are destroyed.
* The parsec_data_t must not be used after this call.
*/
PARSEC_DECLSPEC void
parsec_data_discard( parsec_data_t *data );

END_C_DECLS

/** @} */
Expand Down
3 changes: 3 additions & 0 deletions parsec/mca/device/cuda/device_cuda_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -161,6 +161,9 @@ static int device_cuda_component_register(void)
(void)parsec_mca_param_reg_int_name("device_cuda", "max_number_of_ejected_data",
"Sets up the maximum number of blocks that can be ejected from GPU memory",
false, false, MAX_PARAM_COUNT, &parsec_gpu_d2h_max_flows);
(void)parsec_mca_param_reg_int_name("device_cuda", "max_number_of_discarded_data",
"Sets up the maximum number of discarded blocks to be collected at once",
false, false, MAX_PARAM_COUNT, &parsec_gpu_d2h_max_discarded);
(void)parsec_mca_param_reg_int_name("device_cuda", "max_streams",
"Maximum number of Streams to use for the GPU engine; 2 streams are used for communication between host and device, so the minimum is 3",
false, false, PARSEC_GPU_MAX_STREAMS, &parsec_cuda_max_streams);
Expand Down
1 change: 1 addition & 0 deletions parsec/mca/device/device.h
Original file line number Diff line number Diff line change
Expand Up @@ -166,6 +166,7 @@ struct parsec_device_module_s {
uint64_t executed_tasks;
uint64_t nb_data_faults;
uint64_t nb_evictions;
volatile int64_t nb_discarded; /**< Track number of discarded data copies on this device */
/* We provide the compute capacity of the device in GFlop/s so that conversion to #nanosec in load estimates is straightforward */
/* These compute capacities can be useful for users when providing their own
* time_estimate functions: the user can divide the number of flops for the
Expand Down
154 changes: 107 additions & 47 deletions parsec/mca/device/device_gpu.c
Original file line number Diff line number Diff line change
Expand Up @@ -400,6 +400,34 @@ static parsec_task_class_t parsec_device_data_prefetch_tc = {
.fini = NULL
};

/**
* Release a gpu copy to the zone allocator.
*/
/**
 * Release a gpu copy to the zone allocator.
 *
 * Emits the memory-tracking profiling events (if enabled), returns the
 * device memory backing @gpu_elem to the device's zone allocator, and
 * releases the copy object itself.
 *
 * @param gpu_device the device owning the memory zone of the copy.
 * @param gpu_elem   the GPU data copy to release; must be PARSEC_OWNED.
 *
 * NOTE(review): under PARSEC_GPU_ALLOC_PER_TILE this function compiles to
 * an empty body — per-tile allocations are presumably freed through
 * gpu_device->memory_free by the callers instead; confirm call sites.
 */
void parsec_device_release_gpu_copy(parsec_device_gpu_module_t* gpu_device, parsec_data_copy_t *gpu_elem)
{
#if !defined(PARSEC_GPU_ALLOC_PER_TILE)
#if defined(PARSEC_PROF_TRACE)
    /* record that this device memory is no longer in use */
    if((gpu_device->trackable_events & PARSEC_PROFILE_GPU_TRACK_MEM_USE) &&
       (gpu_device->exec_stream[0]->prof_event_track_enable ||
        gpu_device->exec_stream[1]->prof_event_track_enable)) {
        parsec_profiling_trace_flags(gpu_device->exec_stream[0]->profiling,
                                     parsec_gpu_free_memory_key, (int64_t)gpu_elem->device_private,
                                     gpu_device->super.device_index,
                                     NULL, PARSEC_PROFILING_EVENT_COUNTER);
        parsec_profiling_trace_flags(gpu_device->exec_stream[0]->profiling,
                                     parsec_gpu_use_memory_key_end,
                                     (uint64_t)gpu_elem->device_private,
                                     gpu_device->super.device_index, NULL, 0);
    }
#endif // PARSEC_PROF_TRACE
    /* only memory owned by the runtime may be returned to the zone allocator */
    assert( 0 != (gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) );
    zone_free( gpu_device->memory, (void*)(gpu_elem->device_private) );
    gpu_elem->device_private = NULL;
    /* in debug builds PARSEC_OBJ_RELEASE NULLs the pointer, hence the assert */
    PARSEC_OBJ_RELEASE(gpu_elem);
    assert( NULL == gpu_elem );
#endif // PARSEC_GPU_ALLOC_PER_TILE
}

static int
parsec_device_release_resources_prefetch_task(parsec_device_gpu_module_t* gpu_device,
parsec_gpu_task_t** out_task)
Expand Down Expand Up @@ -719,6 +747,49 @@ parsec_device_memory_reserve( parsec_device_gpu_module_t* gpu_device,
return PARSEC_SUCCESS;
}

/**
* Release discarded data copies from the LRU list.
* Returns the number of discarded items released.
*/
/**
 * Release discarded data copies from the LRU list.
 *
 * Scans @list and frees every GPU copy whose host copy carries the
 * PARSEC_DATA_FLAG_DISCARDED flag (set by parsec_data_discard()); all
 * other copies — including detached copies with a NULL original — are
 * chained back into the list in their encounter order.
 *
 * @param gpu_device the device whose LRU list is scanned.
 * @param list       the LRU list (gpu_mem_lru or gpu_mem_owned_lru).
 * @return the number of discarded items released.
 */
static int parsec_device_memory_release_discarded(parsec_device_gpu_module_t* gpu_device,
                                                  parsec_list_t* list)
{
    parsec_list_item_t* item;
    parsec_list_item_t* ring = NULL;   /* survivors to put back into the list */
    int count = 0;

    /* fast path: nothing has been discarded on this device */
    if (gpu_device->super.nb_discarded == 0) {
        return 0;
    }

    while (NULL != (item = parsec_list_pop_front(list))) {
        parsec_gpu_data_copy_t* gpu_copy = (parsec_gpu_data_copy_t*)item;
        parsec_data_t* original = gpu_copy->original;
        parsec_list_item_ring_chop(item);
        PARSEC_LIST_ITEM_SINGLETON(item);

        if (NULL != original) {
            parsec_data_copy_t *cpu_copy = original->device_copies[0];
            if (NULL != cpu_copy && (cpu_copy->flags & PARSEC_DATA_FLAG_DISCARDED)) {
                count++;
                /* log before the copy is released, while the pointer is still valid */
                PARSEC_DEBUG_VERBOSE(30, parsec_gpu_output_stream,
                                     "Releasing discarded GPU copy %p from data %p", gpu_copy, original);
                parsec_device_release_gpu_copy(gpu_device, gpu_copy);
                continue;
            }
        }
        /* not discarded (or detached, i.e. NULL original): keep the copy.
         * The original code dropped NULL-original copies here, leaking them
         * out of LRU management. */
        if (NULL == ring) {
            ring = item;
        } else {
            parsec_list_item_ring_push(ring, item);
        }
    }
    /* put the surviving ring back into the list */
    if (NULL != ring) {
        parsec_list_chain_front(list, ring);
    }
    parsec_atomic_fetch_sub_int64(&gpu_device->super.nb_discarded, count);
    return count;
}

static void parsec_device_memory_release_list(parsec_device_gpu_module_t* gpu_device,
parsec_list_t* list)
{
Expand All @@ -733,40 +804,25 @@ static void parsec_device_memory_release_list(parsec_device_gpu_module_t* gpu_de
gpu_device->super.device_index, gpu_device->super.name, gpu_copy, gpu_copy->device_private, gpu_copy->super.super.obj_reference_count,
original, (NULL != original ? original->dc : NULL));
assert( gpu_copy->device_index == gpu_device->super.device_index );

if( PARSEC_DATA_COHERENCY_OWNED == gpu_copy->coherency_state ) {
/* warn about device data that has not been pushed back to the host or was discarded */
if( PARSEC_DATA_COHERENCY_OWNED == gpu_copy->coherency_state) {
parsec_warning("GPU[%d:%s] still OWNS the master memory copy for data %d and it is discarding it!",
gpu_device->super.device_index, gpu_device->super.name, original->key);
}
assert(0 != (gpu_copy->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) );

#if defined(PARSEC_GPU_ALLOC_PER_TILE)
gpu_device->memory_free( gpu_copy->device_private );
#else

#if defined(PARSEC_PROF_TRACE)
if((gpu_device->trackable_events & PARSEC_PROFILE_GPU_TRACK_MEM_USE) &&
(gpu_device->exec_stream[0]->prof_event_track_enable ||
gpu_device->exec_stream[1]->prof_event_track_enable)) {
parsec_profiling_trace_flags(gpu_device->exec_stream[0]->profiling,
parsec_gpu_free_memory_key, (int64_t)gpu_copy->device_private,
gpu_device->super.device_index,
NULL, PARSEC_PROFILING_EVENT_COUNTER);
parsec_profiling_trace_flags(gpu_device->exec_stream[0]->profiling,
parsec_gpu_use_memory_key_end,
(uint64_t)gpu_copy->device_private,
gpu_device->super.device_index, NULL, 0);
}
#endif
zone_free( gpu_device->memory, (void*)gpu_copy->device_private );
#endif
gpu_copy->device_private = NULL;

/* At this point the data copies should have no attachment to a data_t. Thus,
* before we get here (aka below parsec_fini), the destructor of the data
* collection must have been called, releasing all the copies.
*/
PARSEC_OBJ_RELEASE(gpu_copy); assert(NULL == gpu_copy);
#else
parsec_device_release_gpu_copy(gpu_device, gpu_copy);
#endif
}
}

Expand All @@ -778,7 +834,11 @@ parsec_device_flush_lru( parsec_device_module_t *device )
{
size_t in_use;
parsec_device_gpu_module_t *gpu_device = (parsec_device_gpu_module_t*)device;
/* Free all memory on GPU */
/* Remove discarded data copies */
parsec_device_memory_release_discarded(gpu_device, &gpu_device->gpu_mem_lru);
parsec_device_memory_release_discarded(gpu_device, &gpu_device->gpu_mem_owned_lru);
assert(gpu_device->super.nb_discarded == 0);
/* Free all remaining memory on GPU */
parsec_device_memory_release_list(gpu_device, &gpu_device->gpu_mem_lru);
parsec_device_memory_release_list(gpu_device, &gpu_device->gpu_mem_owned_lru);
parsec_device_free_workspace(gpu_device);
Expand Down Expand Up @@ -941,6 +1001,12 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device,
#endif
parsec_atomic_unlock(&master->lock);
return PARSEC_HOOK_RETURN_NEXT;
} else if (NULL != lru_gpu_elem->original) {
/* account for discarded data */
parsec_data_copy_t* cpu_copy = lru_gpu_elem->original->device_copies[0];
if (cpu_copy->flags & PARSEC_DATA_FLAG_DISCARDED) {
parsec_atomic_fetch_dec_int64(&gpu_device->super.nb_discarded);
}
}

PARSEC_LIST_ITEM_SINGLETON(lru_gpu_elem);
Expand Down Expand Up @@ -1069,34 +1135,12 @@ parsec_device_data_reserve_space( parsec_device_gpu_module_t* gpu_device,
#if !defined(PARSEC_GPU_ALLOC_PER_TILE)
/* Let's free this space, and try again to malloc some space */
PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
"GPU[%d:%s] Release GPU copy %p (device_ptr %p) [ref_count %d: must be 1], attached to %p",
gpu_device->super.device_index, gpu_device->super.name,
"GPU[%d:%s]:%s Release GPU copy %p (device_ptr %p) [ref_count %d: must be 1], attached to %p",
gpu_device->super.device_index, gpu_device->super.name, task_name,
lru_gpu_elem, lru_gpu_elem->device_private, lru_gpu_elem->super.super.obj_reference_count,
oldmaster);
#if defined(PARSEC_PROF_TRACE)
if((gpu_device->trackable_events & PARSEC_PROFILE_GPU_TRACK_MEM_USE) &&
(gpu_device->exec_stream[0]->prof_event_track_enable ||
gpu_device->exec_stream[1]->prof_event_track_enable)) {
parsec_profiling_trace_flags(gpu_device->exec_stream[0]->profiling,
parsec_gpu_free_memory_key, (int64_t)lru_gpu_elem->device_private,
gpu_device->super.device_index,
NULL, PARSEC_PROFILING_EVENT_COUNTER);
parsec_profiling_trace_flags(gpu_device->exec_stream[0]->profiling,
parsec_gpu_use_memory_key_end,
(uint64_t)lru_gpu_elem->device_private,
gpu_device->super.device_index, NULL, 0);
}
#endif
assert( 0 != (lru_gpu_elem->flags & PARSEC_DATA_FLAG_PARSEC_OWNED) );
zone_free( gpu_device->memory, (void*)(lru_gpu_elem->device_private) );
lru_gpu_elem->device_private = NULL;
parsec_device_release_gpu_copy(gpu_device, lru_gpu_elem);
data_avail_epoch++;
PARSEC_DEBUG_VERBOSE(30, parsec_gpu_output_stream,
"GPU[%d:%s]:%s: Release LRU-retrieved GPU copy %p [ref_count %d: must be 1]",
gpu_device->super.device_index, gpu_device->super.name, task_name,
lru_gpu_elem, lru_gpu_elem->super.super.obj_reference_count);
PARSEC_OBJ_RELEASE(lru_gpu_elem);
assert( NULL == lru_gpu_elem );
goto malloc_data;
}
PARSEC_DEBUG_VERBOSE(30, parsec_gpu_output_stream,
Expand Down Expand Up @@ -2388,7 +2432,18 @@ parsec_device_kernel_epilog( parsec_device_gpu_module_t *gpu_device,

assert( 0 <= gpu_copy->readers );

if( gpu_task->pushout & (1 << i) ) {
if (cpu_copy->flags & PARSEC_DATA_FLAG_DISCARDED) {
PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
"GPU[%d:%s] Release GPU copy %p (device_ptr %p) [ref_count %d: must be 1], attached to %p",
gpu_device->super.device_index, gpu_device->super.name,
gpu_copy, gpu_copy->device_private, gpu_copy->super.super.obj_reference_count,
original);
parsec_list_item_ring_chop((parsec_list_item_t*)gpu_copy);
PARSEC_LIST_ITEM_SINGLETON(gpu_copy);
/* release the GPU copy and return its device memory to the zone allocator */
parsec_device_release_gpu_copy(gpu_device, gpu_copy);
parsec_atomic_fetch_dec_int64(&gpu_device->super.nb_discarded);
} else if( gpu_task->pushout & (1 << i)) {
PARSEC_DEBUG_VERBOSE(20, parsec_gpu_output_stream,
"GPU copy %p [ref_count %d] moved to the read LRU in %s",
gpu_copy, gpu_copy->super.super.obj_reference_count, __func__);
Expand Down Expand Up @@ -2574,6 +2629,11 @@ parsec_device_kernel_scheduler( parsec_device_module_t *module,
}
assert(NULL == progress_task);

/* try to release all discarded copies and try again if successful */
if (0 < parsec_device_memory_release_discarded(gpu_device, &gpu_device->gpu_mem_owned_lru)) {
goto check_in_deps;
}

/* TODO: check this */
/* If we can extract data go for it, otherwise try to drain the pending tasks */
gpu_task = parsec_gpu_create_w2r_task(gpu_device, es);
Expand Down
7 changes: 7 additions & 0 deletions parsec/mca/device/device_gpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -279,6 +279,7 @@ typedef struct parsec_gpu_workspace_s {
PARSEC_DECLSPEC extern int parsec_gpu_output_stream;
PARSEC_DECLSPEC extern int parsec_gpu_verbosity;
PARSEC_DECLSPEC extern int32_t parsec_gpu_d2h_max_flows;
PARSEC_DECLSPEC extern int32_t parsec_gpu_d2h_max_discarded;

/**
* Debugging functions.
Expand Down Expand Up @@ -352,6 +353,12 @@ int parsec_device_data_advise(parsec_device_module_t *dev, parsec_data_t *data,
int parsec_device_flush_lru( parsec_device_module_t *device );
int parsec_device_memory_release( parsec_device_gpu_module_t* gpu_device );


/**
* Release a gpu copy and return its memory to the zone allocator of the device.
*/
void parsec_device_release_gpu_copy(parsec_device_gpu_module_t* gpu_device, parsec_data_copy_t *gpu_elem);

/**
* This version is based on 4 streams: one for transfers from the memory to
* the GPU, 2 for kernel executions and one for transfers from the GPU into
Expand Down
3 changes: 3 additions & 0 deletions parsec/mca/device/level_zero/device_level_zero_component.c
Original file line number Diff line number Diff line change
Expand Up @@ -271,6 +271,9 @@ static int device_level_zero_component_register(void)
(void)parsec_mca_param_reg_int_name("device_level_zero", "max_number_of_ejected_data",
"Sets up the maximum number of blocks that can be ejected from GPU memory",
false, false, MAX_PARAM_COUNT, &parsec_gpu_d2h_max_flows);
(void)parsec_mca_param_reg_int_name("device_level_zero", "max_number_of_discarded_data",
"Sets up the maximum number of discarded blocks to be collected at once",
false, false, MAX_PARAM_COUNT, &parsec_gpu_d2h_max_discarded);
(void)parsec_mca_param_reg_int_name("device_level_zero", "max_streams",
"Maximum number of Streams to use for the GPU engine; 2 streams are used for communication between host and device, so the minimum is 3",
false, false, PARSEC_GPU_MAX_STREAMS, &parsec_level_zero_max_streams);
Expand Down
Loading