Commit 25c01191, authored by Yunxiang Li, committed by Alex Deucher

drm/amdgpu: Add reset_context flag for host FLR

There are other reset sources that pass NULL as the job pointer, such as
amdgpu_amdkfd_reset_work. Therefore, using the job pointer to check if
the FLR comes from the host does not work.

Add a flag in reset_context to explicitly mark host triggered reset, and
set this flag when we receive host reset notification.
Signed-off-by: Yunxiang Li <Yunxiang.Li@amd.com>
Reviewed-by: Emily Deng <Emily.Deng@amd.com>
Reviewed-by: Zhigang Luo <zhigang.luo@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent f4322b9f
...@@ -5055,13 +5055,13 @@ static int amdgpu_device_recover_vram(struct amdgpu_device *adev) ...@@ -5055,13 +5055,13 @@ static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
* amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
* *
* @adev: amdgpu_device pointer * @adev: amdgpu_device pointer
* @from_hypervisor: request from hypervisor * @reset_context: amdgpu reset context pointer
* *
* do VF FLR and reinitialize Asic * do VF FLR and reinitialize Asic
* return 0 means succeeded otherwise failed * return 0 means succeeded otherwise failed
*/ */
static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
bool from_hypervisor) struct amdgpu_reset_context *reset_context)
{ {
int r; int r;
struct amdgpu_hive_info *hive = NULL; struct amdgpu_hive_info *hive = NULL;
...@@ -5070,12 +5070,15 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev, ...@@ -5070,12 +5070,15 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
retry: retry:
amdgpu_amdkfd_pre_reset(adev); amdgpu_amdkfd_pre_reset(adev);
if (from_hypervisor) if (test_bit(AMDGPU_HOST_FLR, &reset_context->flags)) {
clear_bit(AMDGPU_HOST_FLR, &reset_context->flags);
r = amdgpu_virt_request_full_gpu(adev, true); r = amdgpu_virt_request_full_gpu(adev, true);
else } else {
r = amdgpu_virt_reset_gpu(adev); r = amdgpu_virt_reset_gpu(adev);
}
if (r) if (r)
return r; return r;
amdgpu_ras_set_fed(adev, false); amdgpu_ras_set_fed(adev, false);
amdgpu_irq_gpu_reset_resume_helper(adev); amdgpu_irq_gpu_reset_resume_helper(adev);
...@@ -5826,7 +5829,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev, ...@@ -5826,7 +5829,7 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
/* Actual ASIC resets if needed.*/ /* Actual ASIC resets if needed.*/
/* Host driver will handle XGMI hive reset for SRIOV */ /* Host driver will handle XGMI hive reset for SRIOV */
if (amdgpu_sriov_vf(adev)) { if (amdgpu_sriov_vf(adev)) {
r = amdgpu_device_reset_sriov(adev, job ? false : true); r = amdgpu_device_reset_sriov(adev, reset_context);
if (r) if (r)
adev->asic_reset_res = r; adev->asic_reset_res = r;
......
...@@ -33,6 +33,7 @@ enum AMDGPU_RESET_FLAGS { ...@@ -33,6 +33,7 @@ enum AMDGPU_RESET_FLAGS {
AMDGPU_NEED_FULL_RESET = 0, AMDGPU_NEED_FULL_RESET = 0,
AMDGPU_SKIP_HW_RESET = 1, AMDGPU_SKIP_HW_RESET = 1,
AMDGPU_SKIP_COREDUMP = 2, AMDGPU_SKIP_COREDUMP = 2,
AMDGPU_HOST_FLR = 3,
}; };
struct amdgpu_reset_context { struct amdgpu_reset_context {
......
...@@ -292,6 +292,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work) ...@@ -292,6 +292,7 @@ static void xgpu_ai_mailbox_flr_work(struct work_struct *work)
reset_context.method = AMD_RESET_METHOD_NONE; reset_context.method = AMD_RESET_METHOD_NONE;
reset_context.reset_req_dev = adev; reset_context.reset_req_dev = adev;
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
set_bit(AMDGPU_HOST_FLR, &reset_context.flags);
amdgpu_device_gpu_recover(adev, NULL, &reset_context); amdgpu_device_gpu_recover(adev, NULL, &reset_context);
} }
......
...@@ -328,6 +328,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work) ...@@ -328,6 +328,7 @@ static void xgpu_nv_mailbox_flr_work(struct work_struct *work)
reset_context.method = AMD_RESET_METHOD_NONE; reset_context.method = AMD_RESET_METHOD_NONE;
reset_context.reset_req_dev = adev; reset_context.reset_req_dev = adev;
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
set_bit(AMDGPU_HOST_FLR, &reset_context.flags);
amdgpu_device_gpu_recover(adev, NULL, &reset_context); amdgpu_device_gpu_recover(adev, NULL, &reset_context);
} }
......
...@@ -529,6 +529,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work) ...@@ -529,6 +529,7 @@ static void xgpu_vi_mailbox_flr_work(struct work_struct *work)
reset_context.method = AMD_RESET_METHOD_NONE; reset_context.method = AMD_RESET_METHOD_NONE;
reset_context.reset_req_dev = adev; reset_context.reset_req_dev = adev;
clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags); clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
set_bit(AMDGPU_HOST_FLR, &reset_context.flags);
amdgpu_device_gpu_recover(adev, NULL, &reset_context); amdgpu_device_gpu_recover(adev, NULL, &reset_context);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment