Commit 950d6425 authored by Stanley.Yang, committed by Alex Deucher

drm/amdgpu: support ras on SRIOV

Support UMC/GFX/SDMA RAS on the guest side.

Changes from V1:
    move the SRIOV check into amdgpu_ras_interrupt_fatal_error_handler
Signed-off-by: Stanley.Yang <Stanley.Yang@amd.com>
Reviewed-by: Tao Zhou <tao.zhou1@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 2c270d3e
...@@ -5219,6 +5219,10 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev, ...@@ -5219,6 +5219,10 @@ int amdgpu_device_gpu_recover_imp(struct amdgpu_device *adev,
r = amdgpu_device_reset_sriov(adev, job ? false : true); r = amdgpu_device_reset_sriov(adev, job ? false : true);
if (r) if (r)
adev->asic_reset_res = r; adev->asic_reset_res = r;
/* Aldebaran supports ras in SRIOV, so need resume ras during reset */
if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2))
amdgpu_ras_resume(adev);
} else { } else {
r = amdgpu_do_asic_reset(device_list_handle, &reset_context); r = amdgpu_do_asic_reset(device_list_handle, &reset_context);
if (r && r == -EAGAIN) if (r && r == -EAGAIN)
......
...@@ -726,7 +726,9 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev, ...@@ -726,7 +726,9 @@ int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
/* Do not enable if it is not allowed. */ /* Do not enable if it is not allowed. */
WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head)); WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
if (!amdgpu_ras_intr_triggered()) { /* Only enable ras feature operation handle on host side */
if (!amdgpu_sriov_vf(adev) &&
!amdgpu_ras_intr_triggered()) {
ret = psp_ras_enable_features(&adev->psp, info, enable); ret = psp_ras_enable_features(&adev->psp, info, enable);
if (ret) { if (ret) {
dev_err(adev->dev, "ras %s %s failed poison:%d ret:%d\n", dev_err(adev->dev, "ras %s %s failed poison:%d ret:%d\n",
...@@ -1523,7 +1525,9 @@ static int amdgpu_ras_fs_fini(struct amdgpu_device *adev) ...@@ -1523,7 +1525,9 @@ static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
*/ */
void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev) void amdgpu_ras_interrupt_fatal_error_handler(struct amdgpu_device *adev)
{ {
if (!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF)) /* Fatal error events are handled on host side */
if (amdgpu_sriov_vf(adev) ||
!amdgpu_ras_is_supported(adev, AMDGPU_RAS_BLOCK__PCIE_BIF))
return; return;
if (adev->nbio.ras && if (adev->nbio.ras &&
...@@ -2270,10 +2274,14 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) ...@@ -2270,10 +2274,14 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
{ {
adev->ras_hw_enabled = adev->ras_enabled = 0; adev->ras_hw_enabled = adev->ras_enabled = 0;
if (amdgpu_sriov_vf(adev) || !adev->is_atom_fw || if (!adev->is_atom_fw ||
!amdgpu_ras_asic_supported(adev)) !amdgpu_ras_asic_supported(adev))
return; return;
if (!(amdgpu_sriov_vf(adev) &&
(adev->ip_versions[MP1_HWIP][0] == IP_VERSION(13, 0, 2))))
return;
if (!adev->gmc.xgmi.connected_to_cpu) { if (!adev->gmc.xgmi.connected_to_cpu) {
if (amdgpu_atomfirmware_mem_ecc_supported(adev)) { if (amdgpu_atomfirmware_mem_ecc_supported(adev)) {
dev_info(adev->dev, "MEM ECC is active.\n"); dev_info(adev->dev, "MEM ECC is active.\n");
...@@ -2285,15 +2293,21 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev) ...@@ -2285,15 +2293,21 @@ static void amdgpu_ras_check_supported(struct amdgpu_device *adev)
if (amdgpu_atomfirmware_sram_ecc_supported(adev)) { if (amdgpu_atomfirmware_sram_ecc_supported(adev)) {
dev_info(adev->dev, "SRAM ECC is active.\n"); dev_info(adev->dev, "SRAM ECC is active.\n");
adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC | if (!amdgpu_sriov_vf(adev)) {
1 << AMDGPU_RAS_BLOCK__DF); adev->ras_hw_enabled |= ~(1 << AMDGPU_RAS_BLOCK__UMC |
1 << AMDGPU_RAS_BLOCK__DF);
if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0))
adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN | if (adev->ip_versions[VCN_HWIP][0] == IP_VERSION(2, 6, 0))
1 << AMDGPU_RAS_BLOCK__JPEG); adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__VCN |
else 1 << AMDGPU_RAS_BLOCK__JPEG);
adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN | else
1 << AMDGPU_RAS_BLOCK__JPEG); adev->ras_hw_enabled &= ~(1 << AMDGPU_RAS_BLOCK__VCN |
1 << AMDGPU_RAS_BLOCK__JPEG);
} else {
adev->ras_hw_enabled |= (1 << AMDGPU_RAS_BLOCK__PCIE_BIF |
1 << AMDGPU_RAS_BLOCK__SDMA |
1 << AMDGPU_RAS_BLOCK__GFX);
}
} else { } else {
dev_info(adev->dev, "SRAM ECC is not presented.\n"); dev_info(adev->dev, "SRAM ECC is not presented.\n");
} }
...@@ -2637,6 +2651,10 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev) ...@@ -2637,6 +2651,10 @@ int amdgpu_ras_late_init(struct amdgpu_device *adev)
struct amdgpu_ras_block_object *obj; struct amdgpu_ras_block_object *obj;
int r; int r;
/* Guest side doesn't need init ras feature */
if (amdgpu_sriov_vf(adev))
return 0;
list_for_each_entry_safe(node, tmp, &adev->ras_list, node) { list_for_each_entry_safe(node, tmp, &adev->ras_list, node) {
if (!node->ras_obj) { if (!node->ras_obj) {
dev_warn(adev->dev, "Warning: abnormal ras list node.\n"); dev_warn(adev->dev, "Warning: abnormal ras list node.\n");
......
...@@ -124,6 +124,10 @@ int amdgpu_sdma_process_ras_data_cb(struct amdgpu_device *adev, ...@@ -124,6 +124,10 @@ int amdgpu_sdma_process_ras_data_cb(struct amdgpu_device *adev,
struct amdgpu_iv_entry *entry) struct amdgpu_iv_entry *entry)
{ {
kgd2kfd_set_sram_ecc_flag(adev->kfd.dev); kgd2kfd_set_sram_ecc_flag(adev->kfd.dev);
if (amdgpu_sriov_vf(adev))
return AMDGPU_RAS_SUCCESS;
amdgpu_ras_reset_gpu(adev); amdgpu_ras_reset_gpu(adev);
return AMDGPU_RAS_SUCCESS; return AMDGPU_RAS_SUCCESS;
......
...@@ -85,9 +85,12 @@ static int psp_v13_0_init_microcode(struct psp_context *psp) ...@@ -85,9 +85,12 @@ static int psp_v13_0_init_microcode(struct psp_context *psp)
err = psp_init_sos_microcode(psp, chip_name); err = psp_init_sos_microcode(psp, chip_name);
if (err) if (err)
return err; return err;
err = psp_init_ta_microcode(&adev->psp, chip_name); /* It's not necessary to load ras ta on Guest side */
if (err) if (!amdgpu_sriov_vf(adev)) {
return err; err = psp_init_ta_microcode(&adev->psp, chip_name);
if (err)
return err;
}
break; break;
case IP_VERSION(13, 0, 1): case IP_VERSION(13, 0, 1):
case IP_VERSION(13, 0, 3): case IP_VERSION(13, 0, 3):
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment