Commit c46ebb6a authored by Philip Yang, committed by Alex Deucher

drm/amdkfd: set memory limit to avoid OOM with HMM enabled

HMM migration allocates sizeof(struct page) of system memory for each
VRAM page, which amounts to 1GB of system memory reserved for 64GB of
VRAM. To avoid application OOM, increase the system memory used size
based on the VRAM size of all GPUs, so that application memory
allocations fail once system memory usage reaches the limit.
Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Oak Zeng <Oak.Zeng@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 814ab993
...@@ -275,6 +275,7 @@ void amdgpu_amdkfd_gpuvm_init_mem_limits(void); ...@@ -275,6 +275,7 @@ void amdgpu_amdkfd_gpuvm_init_mem_limits(void);
void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev, void amdgpu_amdkfd_gpuvm_destroy_cb(struct amdgpu_device *adev,
struct amdgpu_vm *vm); struct amdgpu_vm *vm);
void amdgpu_amdkfd_unreserve_memory_limit(struct amdgpu_bo *bo); void amdgpu_amdkfd_unreserve_memory_limit(struct amdgpu_bo *bo);
/* Add size bytes to the KFD accounting of system memory already in use. */
void amdgpu_amdkfd_reserve_system_mem(uint64_t size);
#else #else
static inline static inline
void amdgpu_amdkfd_gpuvm_init_mem_limits(void) void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
......
...@@ -108,6 +108,11 @@ void amdgpu_amdkfd_gpuvm_init_mem_limits(void) ...@@ -108,6 +108,11 @@ void amdgpu_amdkfd_gpuvm_init_mem_limits(void)
(kfd_mem_limit.max_ttm_mem_limit >> 20)); (kfd_mem_limit.max_ttm_mem_limit >> 20));
} }
/*
 * Add @size bytes to the KFD system-memory usage counter
 * (kfd_mem_limit.system_mem_used).
 *
 * The counter is only increased here; no lock is taken in this function
 * and nothing is returned to the caller.
 */
void amdgpu_amdkfd_reserve_system_mem(uint64_t size)
{
	kfd_mem_limit.system_mem_used = kfd_mem_limit.system_mem_used + size;
}
/* Estimate page table size needed to represent a given memory size /* Estimate page table size needed to represent a given memory size
* *
* With 4KB pages, we need one 8 byte PTE for each 4KB of memory * With 4KB pages, we need one 8 byte PTE for each 4KB of memory
......
...@@ -57,6 +57,9 @@ static const struct dev_pagemap_ops svm_migrate_pgmap_ops = { ...@@ -57,6 +57,9 @@ static const struct dev_pagemap_ops svm_migrate_pgmap_ops = {
.migrate_to_ram = svm_migrate_to_ram, .migrate_to_ram = svm_migrate_to_ram,
}; };
/*
 * Each VRAM page uses sizeof(struct page) on system memory.
 *
 * SVM_HMM_PAGE_STRUCT_SIZE(size) evaluates to the number of PAGE_SIZE units
 * in size multiplied by sizeof(struct page). The division is performed
 * first, so with an integer size any trailing partial page is not counted.
 */
#define SVM_HMM_PAGE_STRUCT_SIZE(size) ((size)/PAGE_SIZE * sizeof(struct page))
int svm_migrate_init(struct amdgpu_device *adev) int svm_migrate_init(struct amdgpu_device *adev)
{ {
struct kfd_dev *kfddev = adev->kfd.dev; struct kfd_dev *kfddev = adev->kfd.dev;
...@@ -93,6 +96,11 @@ int svm_migrate_init(struct amdgpu_device *adev) ...@@ -93,6 +96,11 @@ int svm_migrate_init(struct amdgpu_device *adev)
return PTR_ERR(r); return PTR_ERR(r);
} }
pr_debug("reserve %ldMB system memory for VRAM pages struct\n",
SVM_HMM_PAGE_STRUCT_SIZE(size) >> 20);
amdgpu_amdkfd_reserve_system_mem(SVM_HMM_PAGE_STRUCT_SIZE(size));
pr_info("HMM registered %ldMB device memory\n", size >> 20); pr_info("HMM registered %ldMB device memory\n", size >> 20);
return 0; return 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment