Commit 1a799c4c authored by Philip Yang, committed by Alex Deucher

drm/amdkfd: Fix double release compute pasid

If kfd_process_device_init_vm returns failure after the vm is converted to
a compute vm and vm->pasid is set to the compute pasid, KFD will not take
the pdd->drm_file reference. As a result, the drm close file handler may be
called to release the compute pasid before the KFD process destroy worker
releases the same pasid and sets vm->pasid to zero. This generates the
WARNING backtrace and NULL pointer access shown below.

Add helper amdgpu_amdkfd_gpuvm_set_vm_pasid and call it as the last step
of kfd_process_device_init_vm. This ensures the vm pasid is either the
original pasid (if acquiring the vm failed) or the compute pasid with the
pdd->drm_file reference taken, which avoids releasing the same pasid twice.

 amdgpu: Failed to create process VM object
 ida_free called for id=32770 which is not allocated.
 WARNING: CPU: 57 PID: 72542 at ../lib/idr.c:522 ida_free+0x96/0x140
 RIP: 0010:ida_free+0x96/0x140
 Call Trace:
  amdgpu_pasid_free_delayed+0xe1/0x2a0 [amdgpu]
  amdgpu_driver_postclose_kms+0x2d8/0x340 [amdgpu]
  drm_file_free.part.13+0x216/0x270 [drm]
  drm_close_helper.isra.14+0x60/0x70 [drm]
  drm_release+0x6e/0xf0 [drm]
  __fput+0xcc/0x280
  ____fput+0xe/0x20
  task_work_run+0x96/0xc0
  do_exit+0x3d0/0xc10

 BUG: kernel NULL pointer dereference, address: 0000000000000000
 RIP: 0010:ida_free+0x76/0x140
 Call Trace:
  amdgpu_pasid_free_delayed+0xe1/0x2a0 [amdgpu]
  amdgpu_driver_postclose_kms+0x2d8/0x340 [amdgpu]
  drm_file_free.part.13+0x216/0x270 [drm]
  drm_close_helper.isra.14+0x60/0x70 [drm]
  drm_release+0x6e/0xf0 [drm]
  __fput+0xcc/0x280
  ____fput+0xe/0x20
  task_work_run+0x96/0xc0
  do_exit+0x3d0/0xc10
Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 29d48b87
...@@ -270,8 +270,10 @@ int amdgpu_amdkfd_get_pcie_bandwidth_mbytes(struct amdgpu_device *adev, bool is_ ...@@ -270,8 +270,10 @@ int amdgpu_amdkfd_get_pcie_bandwidth_mbytes(struct amdgpu_device *adev, bool is_
(&((struct amdgpu_fpriv *) \ (&((struct amdgpu_fpriv *) \
((struct drm_file *)(drm_priv))->driver_priv)->vm) ((struct drm_file *)(drm_priv))->driver_priv)->vm)
int amdgpu_amdkfd_gpuvm_set_vm_pasid(struct amdgpu_device *adev,
struct file *filp, u32 pasid);
int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev, int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev,
struct file *filp, u32 pasid, struct file *filp,
void **process_info, void **process_info,
struct dma_fence **ef); struct dma_fence **ef);
void amdgpu_amdkfd_gpuvm_release_process_vm(struct amdgpu_device *adev, void amdgpu_amdkfd_gpuvm_release_process_vm(struct amdgpu_device *adev,
......
...@@ -1429,10 +1429,9 @@ static void amdgpu_amdkfd_gpuvm_unpin_bo(struct amdgpu_bo *bo) ...@@ -1429,10 +1429,9 @@ static void amdgpu_amdkfd_gpuvm_unpin_bo(struct amdgpu_bo *bo)
amdgpu_bo_unreserve(bo); amdgpu_bo_unreserve(bo);
} }
int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev, int amdgpu_amdkfd_gpuvm_set_vm_pasid(struct amdgpu_device *adev,
struct file *filp, u32 pasid, struct file *filp, u32 pasid)
void **process_info,
struct dma_fence **ef)
{ {
struct amdgpu_fpriv *drv_priv; struct amdgpu_fpriv *drv_priv;
struct amdgpu_vm *avm; struct amdgpu_vm *avm;
...@@ -1443,10 +1442,6 @@ int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev, ...@@ -1443,10 +1442,6 @@ int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev,
return ret; return ret;
avm = &drv_priv->vm; avm = &drv_priv->vm;
/* Already a compute VM? */
if (avm->process_info)
return -EINVAL;
/* Free the original amdgpu allocated pasid, /* Free the original amdgpu allocated pasid,
* will be replaced with kfd allocated pasid. * will be replaced with kfd allocated pasid.
*/ */
...@@ -1455,14 +1450,36 @@ int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev, ...@@ -1455,14 +1450,36 @@ int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev,
amdgpu_vm_set_pasid(adev, avm, 0); amdgpu_vm_set_pasid(adev, avm, 0);
} }
/* Convert VM into a compute VM */ ret = amdgpu_vm_set_pasid(adev, avm, pasid);
ret = amdgpu_vm_make_compute(adev, avm);
if (ret) if (ret)
return ret; return ret;
ret = amdgpu_vm_set_pasid(adev, avm, pasid); return 0;
}
int amdgpu_amdkfd_gpuvm_acquire_process_vm(struct amdgpu_device *adev,
struct file *filp,
void **process_info,
struct dma_fence **ef)
{
struct amdgpu_fpriv *drv_priv;
struct amdgpu_vm *avm;
int ret;
ret = amdgpu_file_to_fpriv(filp, &drv_priv);
if (ret) if (ret)
return ret; return ret;
avm = &drv_priv->vm;
/* Already a compute VM? */
if (avm->process_info)
return -EINVAL;
/* Convert VM into a compute VM */
ret = amdgpu_vm_make_compute(adev, avm);
if (ret)
return ret;
/* Initialize KFD part of the VM and process info */ /* Initialize KFD part of the VM and process info */
ret = init_kfd_vm(avm, process_info, ef); ret = init_kfd_vm(avm, process_info, ef);
if (ret) if (ret)
......
...@@ -1576,9 +1576,9 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd, ...@@ -1576,9 +1576,9 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd,
p = pdd->process; p = pdd->process;
dev = pdd->dev; dev = pdd->dev;
ret = amdgpu_amdkfd_gpuvm_acquire_process_vm( ret = amdgpu_amdkfd_gpuvm_acquire_process_vm(dev->adev, drm_file,
dev->adev, drm_file, p->pasid, &p->kgd_process_info,
&p->kgd_process_info, &p->ef); &p->ef);
if (ret) { if (ret) {
pr_err("Failed to create process VM object\n"); pr_err("Failed to create process VM object\n");
return ret; return ret;
...@@ -1593,10 +1593,16 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd, ...@@ -1593,10 +1593,16 @@ int kfd_process_device_init_vm(struct kfd_process_device *pdd,
if (ret) if (ret)
goto err_init_cwsr; goto err_init_cwsr;
ret = amdgpu_amdkfd_gpuvm_set_vm_pasid(dev->adev, drm_file, p->pasid);
if (ret)
goto err_set_pasid;
pdd->drm_file = drm_file; pdd->drm_file = drm_file;
return 0; return 0;
err_set_pasid:
kfd_process_device_destroy_cwsr_dgpu(pdd);
err_init_cwsr: err_init_cwsr:
kfd_process_device_destroy_ib_mem(pdd); kfd_process_device_destroy_ib_mem(pdd);
err_reserve_ib_mem: err_reserve_ib_mem:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment