Commit 5e984b0a, authored by Hawking Zhang, committed by Alex Deucher

drm/amdgpu: Use driver mode reset for data poison

Mode-2 reset is the only reliable method to recover
GC/SDMA after poison is consumed. MMHUB requires
mode-1 reset.
Signed-off-by: Hawking Zhang <Hawking.Zhang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 5adcd78f
...@@ -144,7 +144,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, ...@@ -144,7 +144,7 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
uint16_t pasid, uint16_t client_id) uint16_t pasid, uint16_t client_id)
{ {
enum amdgpu_ras_block block = 0; enum amdgpu_ras_block block = 0;
int old_poison, ret = -EINVAL; int old_poison;
uint32_t reset = 0; uint32_t reset = 0;
struct kfd_process *p = kfd_lookup_process_by_pasid(pasid); struct kfd_process *p = kfd_lookup_process_by_pasid(pasid);
...@@ -163,17 +163,13 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, ...@@ -163,17 +163,13 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
case SOC15_IH_CLIENTID_SE2SH: case SOC15_IH_CLIENTID_SE2SH:
case SOC15_IH_CLIENTID_SE3SH: case SOC15_IH_CLIENTID_SE3SH:
case SOC15_IH_CLIENTID_UTCL2: case SOC15_IH_CLIENTID_UTCL2:
ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
block = AMDGPU_RAS_BLOCK__GFX; block = AMDGPU_RAS_BLOCK__GFX;
if (ret) reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break; break;
case SOC15_IH_CLIENTID_VMC: case SOC15_IH_CLIENTID_VMC:
case SOC15_IH_CLIENTID_VMC1: case SOC15_IH_CLIENTID_VMC1:
ret = kfd_dqm_evict_pasid(dev->dqm, pasid);
block = AMDGPU_RAS_BLOCK__MMHUB; block = AMDGPU_RAS_BLOCK__MMHUB;
if (ret) reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
reset = AMDGPU_RAS_GPU_RESET_MODE1_RESET;
break; break;
case SOC15_IH_CLIENTID_SDMA0: case SOC15_IH_CLIENTID_SDMA0:
case SOC15_IH_CLIENTID_SDMA1: case SOC15_IH_CLIENTID_SDMA1:
...@@ -184,22 +180,15 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev, ...@@ -184,22 +180,15 @@ static void event_interrupt_poison_consumption_v9(struct kfd_node *dev,
reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET; reset = AMDGPU_RAS_GPU_RESET_MODE2_RESET;
break; break;
default: default:
break; dev_warn(dev->adev->dev,
"client %d does not support poison consumption\n", client_id);
return;
} }
kfd_signal_poison_consumed_event(dev, pasid); kfd_signal_poison_consumed_event(dev, pasid);
/* resetting queue passes, do page retirement without gpu reset dev_warn(dev->adev->dev,
* resetting queue fails, fallback to gpu reset solution "poison is consumed by client %d, kick off gpu reset flow\n", client_id);
*/
if (!ret)
dev_warn(dev->adev->dev,
"RAS poison consumption, unmap queue flow succeeded: client id %d\n",
client_id);
else
dev_warn(dev->adev->dev,
"RAS poison consumption, fall back to gpu reset flow: client id %d\n",
client_id);
amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset); amdgpu_amdkfd_ras_poison_consumption_handler(dev->adev, block, reset);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment