Commit 1a6fc071 authored by Tao Zhou, committed by Alex Deucher

drm/amdgpu: move the call of ras recovery_init and bad page reserve to proper place

ras recovery_init should be called after ttm init, and the bad page
reservation should be moved ahead of gpu reset since i2c may be
unstable during gpu reset.
Add cleanup for recovery_init and recovery_fini.

v2: add more comments and prints.
    remove cancel_work_sync in recovery_init.
Signed-off-by: Tao Zhou <tao.zhou1@amd.com>
Reviewed-by: Guchun Chen <guchun.chen@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 87d2b92f
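To make the intended ordering easier to follow before reading the diff, here is a rough, purely illustrative sketch in plain C (the *_stub names are hypothetical, not driver APIs): ttm comes up first, then ras recovery_init, and bad pages are reserved before any gpu reset is kicked off.

#include <stdio.h>

/* Hypothetical stubs standing in for the real driver stages. */
static void ttm_init_stub(void)          { puts("ttm init: VRAM manager ready"); }
static void ras_recovery_init_stub(void) { puts("ras recovery_init: load and reserve retired pages"); }
static void reserve_bad_pages_stub(void) { puts("reserve/save bad pages while i2c is still stable"); }
static void gpu_reset_stub(void)         { puts("gpu reset"); }

int main(void)
{
	/* recovery_init needs ttm, because reserving retired pages creates BOs. */
	ttm_init_stub();
	ras_recovery_init_stub();

	/* Later, when a RAS error triggers recovery: persist the bad pages
	 * first, since i2c may be unstable once the gpu reset is under way. */
	reserve_bad_pages_stub();
	gpu_reset_stub();
	return 0;
}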
@@ -3630,11 +3630,6 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
 				break;
 			}
 		}
-
-		list_for_each_entry(tmp_adev, device_list_handle,
-				gmc.xgmi.head) {
-			amdgpu_ras_reserve_bad_pages(tmp_adev);
-		}
 	}
 }
...
@@ -1493,16 +1493,17 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
 	return 0;
 }
 
-static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
+int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 {
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data **data = &con->eh_data;
 	int ret;
 
-	*data = kmalloc(sizeof(**data),
-			GFP_KERNEL|__GFP_ZERO);
-	if (!*data)
-		return -ENOMEM;
+	*data = kmalloc(sizeof(**data), GFP_KERNEL | __GFP_ZERO);
+	if (!*data) {
+		ret = -ENOMEM;
+		goto out;
+	}
 
 	mutex_init(&con->recovery_lock);
 	INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
@@ -1511,18 +1512,30 @@ static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
 	ret = amdgpu_ras_eeprom_init(&adev->psp.ras.ras->eeprom_control);
 	if (ret)
-		return ret;
+		goto free;
 
 	if (adev->psp.ras.ras->eeprom_control.num_recs) {
 		ret = amdgpu_ras_load_bad_pages(adev);
 		if (ret)
-			return ret;
+			goto free;
 
 		ret = amdgpu_ras_reserve_bad_pages(adev);
 		if (ret)
-			return ret;
+			goto release;
 	}
 
 	return 0;
+
+release:
+	amdgpu_ras_release_bad_pages(adev);
+free:
+	con->eh_data = NULL;
+	kfree((*data)->bps);
+	kfree((*data)->bps_bo);
+	kfree(*data);
+out:
+	DRM_WARN("Failed to initialize ras recovery!\n");
+
+	return ret;
 }
 
 static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
@@ -1530,12 +1543,17 @@ static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
 	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 	struct ras_err_handler_data *data = con->eh_data;
 
+	/* recovery_init failed to init it, fini is useless */
+	if (!data)
+		return 0;
+
 	cancel_work_sync(&con->recovery_work);
 	amdgpu_ras_release_bad_pages(adev);
 
 	mutex_lock(&con->recovery_lock);
 	con->eh_data = NULL;
 	kfree(data->bps);
+	kfree(data->bps_bo);
 	kfree(data);
 	mutex_unlock(&con->recovery_lock);
@@ -1627,9 +1645,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 		return r;
 	}
 
-	if (amdgpu_ras_recovery_init(adev))
-		goto recovery_out;
-
 	amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;
 
 	if (amdgpu_ras_fs_init(adev))
@@ -1644,8 +1659,6 @@ int amdgpu_ras_init(struct amdgpu_device *adev)
 			con->hw_supported, con->supported);
 	return 0;
 fs_out:
-	amdgpu_ras_recovery_fini(adev);
-recovery_out:
 	amdgpu_ras_set_context(adev, NULL);
 	kfree(con);
...
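The release/free/out labels added to amdgpu_ras_recovery_init above follow the usual goto-unwind style. As a minimal, self-contained userspace sketch of that shape only (hypothetical names such as recovery_init_sketch and err_data_sketch; this is not the driver's code), each failure jumps to a label that releases whatever had been set up before it:

#include <stdio.h>
#include <stdlib.h>

/* Stand-ins for the driver's bad-page bookkeeping (hypothetical). */
struct err_data_sketch {
	int *bps;	/* bad-page records */
	int *bps_bo;	/* reserved buffer objects */
};

static int recovery_init_sketch(struct err_data_sketch **slot)
{
	struct err_data_sketch *data;
	int ret = 0;

	/* Zeroed allocation, analogous to kmalloc(..., GFP_KERNEL | __GFP_ZERO). */
	data = calloc(1, sizeof(*data));
	if (!data) {
		ret = -1;
		goto out;
	}

	data->bps = malloc(16 * sizeof(*data->bps));
	if (!data->bps) {
		ret = -1;
		goto free;	/* nothing else to undo yet */
	}

	data->bps_bo = malloc(16 * sizeof(*data->bps_bo));
	if (!data->bps_bo) {
		ret = -1;
		goto free;	/* free(NULL) is a no-op, so one label suffices here */
	}

	*slot = data;
	return 0;

free:
	free(data->bps_bo);
	free(data->bps);
	free(data);
out:
	fprintf(stderr, "recovery_init_sketch failed: %d\n", ret);
	return ret;
}

int main(void)
{
	struct err_data_sketch *data = NULL;

	if (recovery_init_sketch(&data) == 0) {
		printf("init ok\n");
		free(data->bps_bo);
		free(data->bps);
		free(data);
	}
	return 0;
}

Like the driver's version, the failure paths free everything allocated so far and report the error, so a failed init leaves nothing behind for fini to clean up.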
@@ -480,6 +480,7 @@ static inline int amdgpu_ras_is_supported(struct amdgpu_device *adev,
 	return ras && (ras->supported & (1 << block));
 }
 
+int amdgpu_ras_recovery_init(struct amdgpu_device *adev);
 int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
 		unsigned int block);
@@ -500,6 +501,10 @@ static inline int amdgpu_ras_reset_gpu(struct amdgpu_device *adev,
 {
 	struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
 
+	/* save bad page to eeprom before gpu reset,
+	 * i2c may be unstable in gpu reset
+	 */
+	amdgpu_ras_reserve_bad_pages(adev);
 	if (atomic_cmpxchg(&ras->in_recovery, 0, 1) == 0)
 		schedule_work(&ras->recovery_work);
 	return 0;
...
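The atomic_cmpxchg(&ras->in_recovery, 0, 1) check above lets only the first caller queue the recovery work, while the bad-page reservation runs unconditionally before it, so it always happens ahead of the reset. A rough userspace analogue of that guard using C11 atomics (hypothetical names, not the driver's code):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int in_recovery;	/* 0 = idle, 1 = recovery already queued */

/* Hypothetical stand-ins for the driver's helpers. */
static void reserve_bad_pages(void) { puts("reserve/save bad pages (i2c still stable)"); }
static void schedule_recovery(void) { puts("schedule gpu recovery work"); }

static int reset_gpu_sketch(void)
{
	int expected = 0;

	/* Always persist the bad pages first; i2c may be unstable once
	 * the gpu reset actually starts. */
	reserve_bad_pages();

	/* Only the caller that flips 0 -> 1 queues the recovery work. */
	if (atomic_compare_exchange_strong(&in_recovery, &expected, 1))
		schedule_recovery();

	return 0;
}

int main(void)
{
	reset_gpu_sketch();	/* reserves and queues recovery */
	reset_gpu_sketch();	/* reserves again, but does not queue twice */
	return 0;
}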
@@ -54,6 +54,7 @@
 #include "amdgpu_trace.h"
 #include "amdgpu_amdkfd.h"
 #include "amdgpu_sdma.h"
+#include "amdgpu_ras.h"
 #include "bif/bif_4_1_d.h"
 
 static int amdgpu_map_buffer(struct ttm_buffer_object *bo,
@@ -1777,6 +1778,17 @@ int amdgpu_ttm_init(struct amdgpu_device *adev)
 		 adev->gmc.visible_vram_size);
 #endif
 
+	/*
+	 * retired pages will be loaded from eeprom and reserved here,
+	 * it should be called after ttm init since new bo may be created,
+	 * recovery_init may fail, but it can free all resources allocated by
+	 * itself and its failure should not stop amdgpu init process.
+	 *
+	 * Note: theoretically, this should be called before all vram allocations
+	 * to protect retired page from abusing
+	 */
+	amdgpu_ras_recovery_init(adev);
+
 	/*
 	 *The reserved vram for firmware must be pinned to the specified
 	 *place on the VRAM, so reserve it early.
...