Commit b265bdbd authored by Evan Quan's avatar Evan Quan Committed by Alex Deucher

drm/amdgpu: added a sysfs interface for thermal throttling related V4

User can check and set the enablement of throttling logging and
the interval between each logging.

V2: simplify the sysfs interface(no string parsing)
V3: add proper lock protection on updating throttling_logging_rs.interval
V4: documentation cosmetic per Luben's suggestion
Signed-off-by: default avatarEvan Quan <evan.quan@amd.com>
Reviewed-by: default avatarChristian König <christian.koenig@amd.com>
Reviewed-by: default avatarLuben Tuikov <luben.tuikov@amd.com>
Signed-off-by: default avatarAlex Deucher <alexander.deucher@amd.com>
parent bcdc7c05
...@@ -993,6 +993,9 @@ struct amdgpu_device { ...@@ -993,6 +993,9 @@ struct amdgpu_device {
char serial[16]; char serial[16];
struct amdgpu_autodump autodump; struct amdgpu_autodump autodump;
atomic_t throttling_logging_enabled;
struct ratelimit_state throttling_logging_rs;
}; };
static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev) static inline struct amdgpu_device *amdgpu_ttm_adev(struct ttm_bo_device *bdev)
......
...@@ -3035,6 +3035,17 @@ int amdgpu_device_init(struct amdgpu_device *adev, ...@@ -3035,6 +3035,17 @@ int amdgpu_device_init(struct amdgpu_device *adev,
adev->gfx.gfx_off_req_count = 1; adev->gfx.gfx_off_req_count = 1;
adev->pm.ac_power = power_supply_is_system_supplied() > 0; adev->pm.ac_power = power_supply_is_system_supplied() > 0;
atomic_set(&adev->throttling_logging_enabled, 1);
/*
* If throttling continues, logging will be performed every minute
* to avoid log flooding. "-1" is subtracted since the thermal
* throttling interrupt comes every second. Thus, the total logging
* interval is 59 seconds(retelimited printk interval) + 1(waiting
* for throttling interrupt) = 60 seconds.
*/
ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
/* Registers mapping */ /* Registers mapping */
/* TODO: block userspace mapping of io register */ /* TODO: block userspace mapping of io register */
if (adev->asic_type >= CHIP_BONAIRE) { if (adev->asic_type >= CHIP_BONAIRE) {
......
...@@ -1808,6 +1808,73 @@ static ssize_t amdgpu_get_unique_id(struct device *dev, ...@@ -1808,6 +1808,73 @@ static ssize_t amdgpu_get_unique_id(struct device *dev,
return 0; return 0;
} }
/**
* DOC: thermal_throttling_logging
*
* Thermal throttling pulls down the clock frequency and thus the performance.
* It's an useful mechanism to protect the chip from overheating. Since it
* impacts performance, the user controls whether it is enabled and if so,
* the log frequency.
*
* Reading back the file shows you the status(enabled or disabled) and
* the interval(in seconds) between each thermal logging.
*
* Writing an integer to the file, sets a new logging interval, in seconds.
* The value should be between 1 and 3600. If the value is less than 1,
* thermal logging is disabled. Values greater than 3600 are ignored.
*/
static ssize_t amdgpu_get_thermal_throttling_logging(struct device *dev,
struct device_attribute *attr,
char *buf)
{
struct drm_device *ddev = dev_get_drvdata(dev);
struct amdgpu_device *adev = ddev->dev_private;
return snprintf(buf, PAGE_SIZE, "%s: thermal throttling logging %s, with interval %d seconds\n",
adev->ddev->unique,
atomic_read(&adev->throttling_logging_enabled) ? "enabled" : "disabled",
adev->throttling_logging_rs.interval / HZ + 1);
}
static ssize_t amdgpu_set_thermal_throttling_logging(struct device *dev,
struct device_attribute *attr,
const char *buf,
size_t count)
{
struct drm_device *ddev = dev_get_drvdata(dev);
struct amdgpu_device *adev = ddev->dev_private;
long throttling_logging_interval;
unsigned long flags;
int ret = 0;
ret = kstrtol(buf, 0, &throttling_logging_interval);
if (ret)
return ret;
if (throttling_logging_interval > 3600)
return -EINVAL;
if (throttling_logging_interval > 0) {
raw_spin_lock_irqsave(&adev->throttling_logging_rs.lock, flags);
/*
* Reset the ratelimit timer internals.
* This can effectively restart the timer.
*/
adev->throttling_logging_rs.interval =
(throttling_logging_interval - 1) * HZ;
adev->throttling_logging_rs.begin = 0;
adev->throttling_logging_rs.printed = 0;
adev->throttling_logging_rs.missed = 0;
raw_spin_unlock_irqrestore(&adev->throttling_logging_rs.lock, flags);
atomic_set(&adev->throttling_logging_enabled, 1);
} else {
atomic_set(&adev->throttling_logging_enabled, 0);
}
return count;
}
static struct amdgpu_device_attr amdgpu_device_attrs[] = { static struct amdgpu_device_attr amdgpu_device_attrs[] = {
AMDGPU_DEVICE_ATTR_RW(power_dpm_state, ATTR_FLAG_BASIC|ATTR_FLAG_ONEVF), AMDGPU_DEVICE_ATTR_RW(power_dpm_state, ATTR_FLAG_BASIC|ATTR_FLAG_ONEVF),
AMDGPU_DEVICE_ATTR_RW(power_dpm_force_performance_level, ATTR_FLAG_BASIC|ATTR_FLAG_ONEVF), AMDGPU_DEVICE_ATTR_RW(power_dpm_force_performance_level, ATTR_FLAG_BASIC|ATTR_FLAG_ONEVF),
...@@ -1830,6 +1897,7 @@ static struct amdgpu_device_attr amdgpu_device_attrs[] = { ...@@ -1830,6 +1897,7 @@ static struct amdgpu_device_attr amdgpu_device_attrs[] = {
AMDGPU_DEVICE_ATTR_RO(pcie_bw, ATTR_FLAG_BASIC), AMDGPU_DEVICE_ATTR_RO(pcie_bw, ATTR_FLAG_BASIC),
AMDGPU_DEVICE_ATTR_RW(pp_features, ATTR_FLAG_BASIC), AMDGPU_DEVICE_ATTR_RW(pp_features, ATTR_FLAG_BASIC),
AMDGPU_DEVICE_ATTR_RO(unique_id, ATTR_FLAG_BASIC), AMDGPU_DEVICE_ATTR_RO(unique_id, ATTR_FLAG_BASIC),
AMDGPU_DEVICE_ATTR_RW(thermal_throttling_logging, ATTR_FLAG_BASIC),
}; };
static int default_attr_update(struct amdgpu_device *adev, struct amdgpu_device_attr *attr, static int default_attr_update(struct amdgpu_device *adev, struct amdgpu_device_attr *attr,
......
...@@ -1533,11 +1533,6 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev, ...@@ -1533,11 +1533,6 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev,
*/ */
uint32_t ctxid = entry->src_data[0]; uint32_t ctxid = entry->src_data[0];
uint32_t data; uint32_t data;
/*
* if the throttling continues, the logging will be performed every
* minute to avoid log flooding.
*/
static DEFINE_RATELIMIT_STATE(ratelimit_state, 60 * HZ, 1);
if (client_id == SOC15_IH_CLIENTID_THM) { if (client_id == SOC15_IH_CLIENTID_THM) {
switch (src_id) { switch (src_id) {
...@@ -1582,7 +1577,10 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev, ...@@ -1582,7 +1577,10 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev,
smu_v11_0_ack_ac_dc_interrupt(&adev->smu); smu_v11_0_ack_ac_dc_interrupt(&adev->smu);
break; break;
case 0x7: case 0x7:
if (__ratelimit(&ratelimit_state)) if (!atomic_read(&adev->throttling_logging_enabled))
return 0;
if (__ratelimit(&adev->throttling_logging_rs))
smu_log_thermal_throttling(smu); smu_log_thermal_throttling(smu);
break; break;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment