Commit 9dc23a63 authored by Tao Zhou's avatar Tao Zhou Committed by Alex Deucher

drm/amdgpu: change ras bps type to eeprom table record structure

change bps type from retired page to eeprom table record, prepare for
saving umc error records to eeprom
Signed-off-by: default avatarTao Zhou <tao.zhou1@amd.com>
Reviewed-by: default avatarGuchun Chen <guchun.chen@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent 4bc22340
...@@ -1205,14 +1205,14 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev, ...@@ -1205,14 +1205,14 @@ static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
for (; i < data->count; i++) { for (; i < data->count; i++) {
(*bps)[i] = (struct ras_badpage){ (*bps)[i] = (struct ras_badpage){
.bp = data->bps[i].bp, .bp = data->bps[i].retired_page,
.size = AMDGPU_GPU_PAGE_SIZE, .size = AMDGPU_GPU_PAGE_SIZE,
.flags = 0, .flags = 0,
}; };
if (data->last_reserved <= i) if (data->last_reserved <= i)
(*bps)[i].flags = 1; (*bps)[i].flags = 1;
else if (data->bps[i].bo == NULL) else if (data->bps_bo[i] == NULL)
(*bps)[i].flags = 2; (*bps)[i].flags = 2;
} }
...@@ -1306,30 +1306,40 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev, ...@@ -1306,30 +1306,40 @@ static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
{ {
unsigned int old_space = data->count + data->space_left; unsigned int old_space = data->count + data->space_left;
unsigned int new_space = old_space + pages; unsigned int new_space = old_space + pages;
unsigned int align_space = ALIGN(new_space, 1024); unsigned int align_space = ALIGN(new_space, 512);
void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL); void *bps = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
struct amdgpu_bo **bps_bo =
if (!tmp) kmalloc(align_space * sizeof(*data->bps_bo), GFP_KERNEL);
if (!bps || !bps_bo) {
kfree(bps);
kfree(bps_bo);
return -ENOMEM; return -ENOMEM;
}
if (data->bps) { if (data->bps) {
memcpy(tmp, data->bps, memcpy(bps, data->bps,
data->count * sizeof(*data->bps)); data->count * sizeof(*data->bps));
kfree(data->bps); kfree(data->bps);
} }
if (data->bps_bo) {
memcpy(bps_bo, data->bps_bo,
data->count * sizeof(*data->bps_bo));
kfree(data->bps_bo);
}
data->bps = tmp; data->bps = bps;
data->bps_bo = bps_bo;
data->space_left += align_space - old_space; data->space_left += align_space - old_space;
return 0; return 0;
} }
/* it deal with vram only. */ /* it deal with vram only. */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
unsigned long *bps, int pages) struct eeprom_table_record *bps, int pages)
{ {
struct amdgpu_ras *con = amdgpu_ras_get_context(adev); struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
struct ras_err_handler_data *data; struct ras_err_handler_data *data;
int i = pages;
int ret = 0; int ret = 0;
if (!con || !con->eh_data || !bps || pages <= 0) if (!con || !con->eh_data || !bps || pages <= 0)
...@@ -1346,10 +1356,10 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, ...@@ -1346,10 +1356,10 @@ int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
goto out; goto out;
} }
while (i--) memcpy(&data->bps[data->count], bps, pages * sizeof(*data->bps));
data->bps[data->count++].bp = bps[i]; data->count += pages;
data->space_left -= pages; data->space_left -= pages;
out: out:
mutex_unlock(&con->recovery_lock); mutex_unlock(&con->recovery_lock);
...@@ -1374,13 +1384,13 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev) ...@@ -1374,13 +1384,13 @@ int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
goto out; goto out;
/* reserve vram at driver post stage. */ /* reserve vram at driver post stage. */
for (i = data->last_reserved; i < data->count; i++) { for (i = data->last_reserved; i < data->count; i++) {
bp = data->bps[i].bp; bp = data->bps[i].retired_page;
if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT, if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT,
PAGE_SIZE, &bo)) PAGE_SIZE, &bo))
DRM_ERROR("RAS ERROR: reserve vram %llx fail\n", bp); DRM_ERROR("RAS ERROR: reserve vram %llx fail\n", bp);
data->bps[i].bo = bo; data->bps_bo[i] = bo;
data->last_reserved = i + 1; data->last_reserved = i + 1;
} }
out: out:
...@@ -1405,11 +1415,11 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev) ...@@ -1405,11 +1415,11 @@ static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
goto out; goto out;
for (i = data->last_reserved - 1; i >= 0; i--) { for (i = data->last_reserved - 1; i >= 0; i--) {
bo = data->bps[i].bo; bo = data->bps_bo[i];
amdgpu_ras_release_vram(adev, &bo); amdgpu_ras_release_vram(adev, &bo);
data->bps[i].bo = bo; data->bps_bo[i] = bo;
data->last_reserved = i; data->last_reserved = i;
} }
out: out:
...@@ -1425,12 +1435,19 @@ static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev) ...@@ -1425,12 +1435,19 @@ static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
return 0; return 0;
} }
/*
* read error record array in eeprom and reserve enough space for
* storing new bad pages
*/
static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev) static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
{ {
/* TODO struct eeprom_table_record *bps = NULL;
* read the array to eeprom when SMU disabled. int ret;
*/
return 0; ret = amdgpu_ras_add_bad_pages(adev, bps,
adev->umc.max_ras_err_cnt_per_query);
return ret;
} }
static int amdgpu_ras_recovery_init(struct amdgpu_device *adev) static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
......
...@@ -351,11 +351,10 @@ struct ras_err_data { ...@@ -351,11 +351,10 @@ struct ras_err_data {
}; };
struct ras_err_handler_data { struct ras_err_handler_data {
/* point to bad pages array */ /* point to bad page records array */
struct { struct eeprom_table_record *bps;
unsigned long bp; /* point to reserved bo array */
struct amdgpu_bo *bo; struct amdgpu_bo **bps_bo;
} *bps;
/* the count of entries */ /* the count of entries */
int count; int count;
/* the space can place new entries */ /* the space can place new entries */
...@@ -492,7 +491,7 @@ unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev, ...@@ -492,7 +491,7 @@ unsigned long amdgpu_ras_query_error_count(struct amdgpu_device *adev,
/* error handling functions */ /* error handling functions */
int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev, int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
unsigned long *bps, int pages); struct eeprom_table_record *bps, int pages);
int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev); int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment