Commit f77c7109 authored by Alex Deucher

drm/amdgpu/ras: fix and update the documentation for RAS

Add new sections to amdgpu.rst, fix up formatting issues,
add additional documentation to each section.
Acked-by: Christian König <christian.koenig@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent a667b75c
...@@ -79,12 +79,32 @@ AMDGPU XGMI Support ...@@ -79,12 +79,32 @@ AMDGPU XGMI Support
.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c .. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_xgmi.c
:internal: :internal:
AMDGPU RAS debugfs control interface AMDGPU RAS Support
==================================== ==================
RAS debugfs/sysfs Control and Error Injection Interfaces
--------------------------------------------------------
.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c .. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
:doc: AMDGPU RAS debugfs control interface :doc: AMDGPU RAS debugfs control interface
RAS Error Count sysfs Interface
-------------------------------
.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
:doc: AMDGPU RAS sysfs Error Count Interface
RAS EEPROM debugfs Interface
----------------------------
.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
:doc: AMDGPU RAS debugfs EEPROM table reset interface
RAS VRAM Bad Pages sysfs Interface
----------------------------------
.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
:doc: AMDGPU RAS sysfs gpu_vram_bad_pages Interface
.. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c .. kernel-doc:: drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c
:internal: :internal:
......
...@@ -310,7 +310,18 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user * ...@@ -310,7 +310,18 @@ static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *
/** /**
* DOC: AMDGPU RAS debugfs EEPROM table reset interface * DOC: AMDGPU RAS debugfs EEPROM table reset interface
* *
* Usage: echo 1 > ../ras/ras_eeprom_reset will reset EEPROM table to 0 entries. * Some boards contain an EEPROM which is used to persistently store a list of
* bad pages containing ECC errors detected in vram. This interface provides
* a way to reset the EEPROM, e.g., after testing error injection.
*
* Usage:
*
* .. code-block:: bash
*
* echo 1 > ../ras/ras_eeprom_reset
*
* will reset EEPROM table to 0 entries.
*
*/ */
static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user *buf, static ssize_t amdgpu_ras_debugfs_eeprom_write(struct file *f, const char __user *buf,
size_t size, loff_t *pos) size_t size, loff_t *pos)
...@@ -337,6 +348,27 @@ static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = { ...@@ -337,6 +348,27 @@ static const struct file_operations amdgpu_ras_debugfs_eeprom_ops = {
.llseek = default_llseek .llseek = default_llseek
}; };
/**
* DOC: AMDGPU RAS sysfs Error Count Interface
*
* It allows user to read the error count for each IP block on the gpu through
* /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
*
* It outputs multiple lines that report the uncorrected (ue) and corrected
* (ce) error counts.
*
* The format of one line is below,
*
* [ce|ue]: count
*
* Example:
*
* .. code-block:: bash
*
* ue: 0
* ce: 1
*
*/
static ssize_t amdgpu_ras_sysfs_read(struct device *dev, static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
struct device_attribute *attr, char *buf) struct device_attribute *attr, char *buf)
{ {
...@@ -781,8 +813,8 @@ static char *amdgpu_ras_badpage_flags_str(unsigned int flags) ...@@ -781,8 +813,8 @@ static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
}; };
} }
/* /**
* DOC: ras sysfs gpu_vram_bad_pages interface * DOC: AMDGPU RAS sysfs gpu_vram_bad_pages Interface
* *
* It allows user to read the bad pages of vram on the gpu through * It allows user to read the bad pages of vram on the gpu through
* /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages
...@@ -794,14 +826,21 @@ static char *amdgpu_ras_badpage_flags_str(unsigned int flags) ...@@ -794,14 +826,21 @@ static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
* *
* gpu pfn and gpu page size are printed in hex format. * gpu pfn and gpu page size are printed in hex format.
* flags can be one of below character, * flags can be one of below character,
*
* R: reserved, this gpu page is reserved and not able to use. * R: reserved, this gpu page is reserved and not able to use.
*
* P: pending for reserve, this gpu page is marked as bad, will be reserved * P: pending for reserve, this gpu page is marked as bad, will be reserved
* in next window of page_reserve. * in next window of page_reserve.
*
* F: unable to reserve. this gpu page can't be reserved due to some reasons. * F: unable to reserve. this gpu page can't be reserved due to some reasons.
* *
* examples: * Examples:
*
* .. code-block:: bash
*
* 0x00000001 : 0x00001000 : R * 0x00000001 : 0x00001000 : R
* 0x00000002 : 0x00001000 : P * 0x00000002 : 0x00001000 : P
*
*/ */
static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f, static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment