Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
3 changes: 3 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu.h
Original file line number Diff line number Diff line change
Expand Up @@ -238,6 +238,9 @@ extern int amdgpu_mtype_local;
extern int amdgpu_enforce_isolation;
#ifdef CONFIG_HSA_AMD
extern int sched_policy;
extern unsigned int dmabuf_pin_max_mb;
extern int amdgpu_dmabuf_reject_new_pins;
extern int amdgpu_rdma_pin_debug;
extern bool debug_evictions;
extern bool no_system_mem_limit;
extern int halt_if_hws_hang;
Expand Down
1 change: 1 addition & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd.h
Original file line number Diff line number Diff line change
Expand Up @@ -111,6 +111,7 @@ struct amdgpu_kfd_dev {
int64_t vram_used[MAX_XCP];
uint64_t vram_used_aligned[MAX_XCP];
atomic64_t vram_pinned;
atomic64_t rdma_pinned_bytes;
bool init_complete;
struct work_struct reset_work;

Expand Down
60 changes: 54 additions & 6 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_amdkfd_gpuvm.c
Original file line number Diff line number Diff line change
Expand Up @@ -1591,10 +1591,37 @@ static int init_kfd_vm(struct amdgpu_vm *vm, void **process_info,
int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain)
{
int ret = 0;
struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
u64 bo_size = amdgpu_bo_size(bo);
bool rdma_accounted = false;

/* Pin limit: reject new RDMA/P2P pins when global kill switch is on */
if (unlikely(amdgpu_dmabuf_reject_new_pins)) {
dev_info_ratelimited(adev->dev,
"amdgpu: KFD RDMA pin rejected (dmabuf_reject_new_pins=1)\n");
return -ENOSPC;
}

/* Pin limit: enforce per-GPU max pinned VRAM for RDMA/P2P */
if (dmabuf_pin_max_mb && (domain & AMDGPU_GEM_DOMAIN_VRAM)) {
u64 limit = (u64)dmabuf_pin_max_mb << 20;
u64 new_total = atomic64_add_return((s64)bo_size,
&adev->kfd.rdma_pinned_bytes);

if ((u64)new_total > limit) {
atomic64_sub((s64)bo_size, &adev->kfd.rdma_pinned_bytes);
dev_info_ratelimited(adev->dev,
"KFD RDMA pin rejected: pinned=%lluMB + new=%lluMB > max=%uMB\n",
(u64)(new_total - bo_size) >> 20,
bo_size >> 20, dmabuf_pin_max_mb);
return -ENOSPC;
}
rdma_accounted = true;
}

ret = amdgpu_bo_reserve(bo, false);
if (unlikely(ret))
return ret;
goto err_accounting;

if (bo->flags & AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS) {
/*
Expand All @@ -1620,14 +1647,31 @@ int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain)
amdgpu_bo_sync_wait(bo, AMDGPU_FENCE_OWNER_KFD, false);

if (!ret && bo->tbo.resource->mem_type == TTM_PL_VRAM)
atomic64_add(amdgpu_bo_size(bo),
&amdgpu_ttm_adev(bo->tbo.bdev)->kfd.vram_pinned);
atomic64_add(amdgpu_bo_size(bo), &adev->kfd.vram_pinned);
else if (!ret && rdma_accounted &&
bo->tbo.resource->mem_type != TTM_PL_VRAM) {
/*
* Quota was reserved for a VRAM-domain pin; if the BO did not end
* up in VRAM, roll back rdma_pinned_bytes (unpin only decrements
* when mem_type == TTM_PL_VRAM).
*/
atomic64_sub((s64)bo_size, &adev->kfd.rdma_pinned_bytes);
rdma_accounted = false;
}

out:
amdgpu_bo_unreserve(bo);
if (ret)
goto err_accounting;
return 0;

err_accounting:
if (rdma_accounted)
atomic64_sub((s64)bo_size, &adev->kfd.rdma_pinned_bytes);
return ret;
}


/**
* amdgpu_amdkfd_gpuvm_unpin_bo() - Unpins BO using following criteria
* @bo: Handle of buffer object being unpinned
Expand All @@ -1639,16 +1683,20 @@ int amdgpu_amdkfd_gpuvm_pin_bo(struct amdgpu_bo *bo, u32 domain)
void amdgpu_amdkfd_gpuvm_unpin_bo(struct amdgpu_bo *bo)
{
	struct amdgpu_device *adev = amdgpu_ttm_adev(bo->tbo.bdev);
	int ret;

	/* Accounting and unpin both happen under the BO reservation. */
	ret = amdgpu_bo_reserve(bo, false);
	if (unlikely(ret))
		return;

	amdgpu_bo_unpin(bo);

	/*
	 * Drop the pin accounting exactly once. The counters are only
	 * adjusted for BOs currently placed in VRAM, mirroring the
	 * mem_type == TTM_PL_VRAM test used when the pin was accounted.
	 */
	if (bo->tbo.resource->mem_type == TTM_PL_VRAM) {
		atomic64_sub(amdgpu_bo_size(bo), &adev->kfd.vram_pinned);

		/*
		 * NOTE(review): dmabuf_pin_max_mb is sampled here at unpin
		 * time, while the pin path samples it at pin time. The
		 * parameter is runtime-writable, so a change between pin and
		 * unpin could let rdma_pinned_bytes drift - confirm whether
		 * that needs per-BO tracking.
		 */
		if (dmabuf_pin_max_mb)
			atomic64_sub((s64)amdgpu_bo_size(bo),
				     &adev->kfd.rdma_pinned_bytes);
	}

	amdgpu_bo_unreserve(bo);
}
Expand Down
22 changes: 22 additions & 0 deletions drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c
Original file line number Diff line number Diff line change
Expand Up @@ -802,6 +802,28 @@ module_param(max_num_of_queues_per_device, int, 0444);
MODULE_PARM_DESC(max_num_of_queues_per_device,
"Maximum number of supported queues per device (1 = Minimum, 4096 = default)");

/**
 * DOC: dmabuf_pin_max_mb (uint)
 * Upper bound, in MB, on the amount of VRAM that may be pinned for
 * RDMA/PeerDirect on each GPU. 0 (the default) means unlimited.
 * The parameter is registered with mode 0644.
 */
unsigned int dmabuf_pin_max_mb;
module_param_named(dmabuf_pin_max_mb, dmabuf_pin_max_mb, uint, 0644);
MODULE_PARM_DESC(dmabuf_pin_max_mb,
"Max VRAM pinned for RDMA/PeerDirect per GPU in MB (0 = unlimited (default))");

/**
 * DOC: dmabuf_reject_new_pins (int)
 * Global kill switch for new RDMA/PeerDirect pins. 0 (the default) allows
 * new pins; 1 rejects them with -ENOSPC.
 * The parameter is registered with mode 0644.
 */
int amdgpu_dmabuf_reject_new_pins;
module_param_named(dmabuf_reject_new_pins, amdgpu_dmabuf_reject_new_pins, int, 0644);
MODULE_PARM_DESC(dmabuf_reject_new_pins,
"Reject new RDMA pins (0 = allow (default), 1 = reject with -ENOSPC)");

/**
 * DOC: rdma_pin_debug (int)
 * Log RDMA pin/unpin events. 0 = off, 1 = on.
 * The parameter is registered with mode 0644.
 */
int amdgpu_rdma_pin_debug;
module_param_named(rdma_pin_debug, amdgpu_rdma_pin_debug, int, 0644);
MODULE_PARM_DESC(rdma_pin_debug, "Log RDMA pin/unpin events (0=off, 1=on)");

/**
* DOC: send_sigterm (int)
* Send sigterm to HSA process on unhandled exceptions. Default is not to send sigterm
Expand Down
3 changes: 2 additions & 1 deletion drivers/gpu/drm/amd/amdgpu/amdgpu_ttm.c
Original file line number Diff line number Diff line change
Expand Up @@ -331,7 +331,8 @@ static int amdgpu_ttm_copy_mem_to_mem(struct amdgpu_device *adev,
amdgpu_res_first(src->mem, src->offset, size, &src_mm);
amdgpu_res_first(dst->mem, dst->offset, size, &dst_mm);

mutex_lock(&adev->mman.gtt_window_lock);
if (mutex_lock_interruptible(&adev->mman.gtt_window_lock))
return -ERESTARTSYS;
while (src_mm.remaining) {
uint64_t from, to, cur_size, tiling_flags;
uint32_t num_type, data_format, max_com, write_compress_disable;
Expand Down
6 changes: 5 additions & 1 deletion drivers/gpu/drm/amd/amdkfd/kfd_peerdirect.c
Original file line number Diff line number Diff line change
Expand Up @@ -285,7 +285,11 @@ static int amd_get_pages(unsigned long addr, size_t size, int write, int force,
ret = amdgpu_amdkfd_gpuvm_pin_bo(mem_context->bo,
mem_context->bo->kfd_bo->domain);
if (ret) {
pr_err("Pinning of buffer failed.\n");
if (ret == -ENOSPC)
pr_info("RDMA pin rejected by quota (addr=%#llx size=%#llx)\n",
mem_context->va, mem_context->size);
else
pr_err("Pinning of buffer failed: %d\n", ret);
return ret;
}

Expand Down