Commit 2c2b0d88, authored by Mukul Joshi, committed by Alex Deucher

drm/amdkfd: Add thermal throttling SMI event

Add support for reporting thermal throttling events through SMI.
Also, add a counter to count the number of throttling interrupts
observed and report the count in the SMI event message.
Signed-off-by: Mukul Joshi <mukul.joshi@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent df9c8d1a
...@@ -789,4 +789,8 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry) ...@@ -789,4 +789,8 @@ void kgd2kfd_interrupt(struct kfd_dev *kfd, const void *ih_ring_entry)
void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd) void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd)
{ {
} }
/*
 * No-op variant of kgd2kfd_smi_event_throttle(): takes the KFD device and
 * the throttle bitmask and does nothing with them.
 *
 * NOTE(review): presumably this is the stub built when KFD support is
 * compiled out - the guarding #if is not visible here; confirm.
 */
void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask)
{
}
#endif #endif
...@@ -270,5 +270,6 @@ int kgd2kfd_resume_mm(struct mm_struct *mm); ...@@ -270,5 +270,6 @@ int kgd2kfd_resume_mm(struct mm_struct *mm);
int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm, int kgd2kfd_schedule_evict_and_restore_process(struct mm_struct *mm,
struct dma_fence *fence); struct dma_fence *fence);
void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd); void kgd2kfd_set_sram_ecc_flag(struct kfd_dev *kfd);
void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask);
#endif /* AMDGPU_AMDKFD_H_INCLUDED */ #endif /* AMDGPU_AMDKFD_H_INCLUDED */
...@@ -29,6 +29,7 @@ ...@@ -29,6 +29,7 @@
#include "cwsr_trap_handler.h" #include "cwsr_trap_handler.h"
#include "kfd_iommu.h" #include "kfd_iommu.h"
#include "amdgpu_amdkfd.h" #include "amdgpu_amdkfd.h"
#include "kfd_smi_events.h"
#define MQD_SIZE_ALIGNED 768 #define MQD_SIZE_ALIGNED 768
...@@ -1245,6 +1246,12 @@ void kfd_dec_compute_active(struct kfd_dev *kfd) ...@@ -1245,6 +1246,12 @@ void kfd_dec_compute_active(struct kfd_dev *kfd)
WARN_ONCE(count < 0, "Compute profile ref. count error"); WARN_ONCE(count < 0, "Compute profile ref. count error");
} }
/*
 * Entry point for reporting a thermal throttling event to KFD.
 *
 * @kfd: KFD device, may be NULL
 * @throttle_bitmask: throttler status bits to report
 *
 * Hands the bitmask to kfd_smi_event_update_thermal_throttling(); a NULL
 * @kfd is silently ignored.
 */
void kgd2kfd_smi_event_throttle(struct kfd_dev *kfd, uint32_t throttle_bitmask)
{
	if (!kfd)
		return;

	kfd_smi_event_update_thermal_throttling(kfd, throttle_bitmask);
}
#if defined(CONFIG_DEBUG_FS) #if defined(CONFIG_DEBUG_FS)
/* This function will send a package to HIQ to hang the HWS /* This function will send a package to HIQ to hang the HWS
......
...@@ -24,6 +24,7 @@ ...@@ -24,6 +24,7 @@
#include <linux/wait.h> #include <linux/wait.h>
#include <linux/anon_inodes.h> #include <linux/anon_inodes.h>
#include <uapi/linux/kfd_ioctl.h> #include <uapi/linux/kfd_ioctl.h>
#include "amdgpu.h"
#include "amdgpu_vm.h" #include "amdgpu_vm.h"
#include "kfd_priv.h" #include "kfd_priv.h"
#include "kfd_smi_events.h" #include "kfd_smi_events.h"
...@@ -148,6 +149,54 @@ static int kfd_smi_ev_release(struct inode *inode, struct file *filep) ...@@ -148,6 +149,54 @@ static int kfd_smi_ev_release(struct inode *inode, struct file *filep)
return 0; return 0;
} }
/*
 * Deliver one formatted SMI event message to every interested client.
 *
 * @dev: KFD device whose smi_clients list is walked
 * @smi_event: event bit; a client is served only if this bit is set in
 *             its events mask
 * @event_msg: message bytes to queue
 * @len: number of bytes of @event_msg to queue
 *
 * The client list is traversed under rcu_read_lock(). For each matching
 * client the fifo is updated under the client's spinlock: the message is
 * queued and the client's waiters are woken only when the whole message
 * fits; otherwise the message is dropped for that client and a debug line
 * is logged.
 */
static void add_event_to_kfifo(struct kfd_dev *dev, unsigned long long smi_event,
			       char *event_msg, int len)
{
	struct kfd_smi_client *subscriber;

	rcu_read_lock();

	list_for_each_entry_rcu(subscriber, &dev->smi_clients, list) {
		/* Only clients that enabled this event get the message. */
		if (READ_ONCE(subscriber->events) & smi_event) {
			spin_lock(&subscriber->lock);

			if (kfifo_avail(&subscriber->fifo) < len) {
				pr_debug("smi_event(EventID: %llu): no space left\n",
					 smi_event);
			} else {
				kfifo_in(&subscriber->fifo, event_msg, len);
				wake_up_all(&subscriber->wait_queue);
			}

			spin_unlock(&subscriber->lock);
		}
	}

	rcu_read_unlock();
}
/*
 * Report a thermal throttling event to the SMI clients of @dev.
 *
 * @dev: KFD device whose SMI clients are notified
 * @throttle_bitmask: throttler status bits, emitted in hex
 *
 * The queued message has the form
 *   "<event id> <throttle_bitmask>:<throttle interrupt count>\n"
 * with every field printed in hex; the count is read from
 * adev->smu.throttle_int_counter. Nothing is formatted or queued when the
 * device has no SMI clients.
 */
void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
					     uint32_t throttle_bitmask)
{
	struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd;
	/*
	 * ThermalThrottle msg = throttle_bitmask(8):
	 *			 thermal_interrupt_count(16):
	 * 16 bytes event + 1 byte space + 8 byte throttle_bitmask +
	 * 1 byte : + 16 byte thermal_interrupt_counter + 1 byte \n +
	 * 1 byte \0 = 44
	 */
	char fifo_in[44];
	int len;

	if (list_empty(&dev->smi_clients))
		return;

	/* Bound the write by the buffer itself so the two cannot drift apart. */
	len = snprintf(fifo_in, sizeof(fifo_in), "%x %x:%llx\n",
		       KFD_SMI_EVENT_THERMAL_THROTTLE, throttle_bitmask,
		       atomic64_read(&adev->smu.throttle_int_counter));

	/*
	 * snprintf() returns the length the output would have had without
	 * truncation. Never hand the fifo a length larger than what was
	 * actually written into fifo_in.
	 */
	if (len >= (int)sizeof(fifo_in))
		len = sizeof(fifo_in) - 1;

	add_event_to_kfifo(dev, KFD_SMI_EVENT_THERMAL_THROTTLE, fifo_in, len);
}
void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid) void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
{ {
struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd; struct amdgpu_device *adev = (struct amdgpu_device *)dev->kgd;
...@@ -156,7 +205,6 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid) ...@@ -156,7 +205,6 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
/* 16 bytes event + 1 byte space + 25 bytes msg + 1 byte \n = 43 /* 16 bytes event + 1 byte space + 25 bytes msg + 1 byte \n = 43
*/ */
char fifo_in[43]; char fifo_in[43];
struct kfd_smi_client *client;
int len; int len;
if (list_empty(&dev->smi_clients)) if (list_empty(&dev->smi_clients))
...@@ -171,22 +219,7 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid) ...@@ -171,22 +219,7 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
len = snprintf(fifo_in, 43, "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT, len = snprintf(fifo_in, 43, "%x %x:%s\n", KFD_SMI_EVENT_VMFAULT,
task_info.pid, task_info.task_name); task_info.pid, task_info.task_name);
rcu_read_lock(); add_event_to_kfifo(dev, KFD_SMI_EVENT_VMFAULT, fifo_in, len);
list_for_each_entry_rcu(client, &dev->smi_clients, list) {
if (!(READ_ONCE(client->events) & KFD_SMI_EVENT_VMFAULT))
continue;
spin_lock(&client->lock);
if (kfifo_avail(&client->fifo) >= len) {
kfifo_in(&client->fifo, fifo_in, len);
wake_up_all(&client->wait_queue);
}
else
pr_debug("smi_event(vmfault): no space left\n");
spin_unlock(&client->lock);
}
rcu_read_unlock();
} }
int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd) int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)
......
...@@ -25,5 +25,7 @@ ...@@ -25,5 +25,7 @@
int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd); int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd);
void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid); void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
uint32_t throttle_bitmask);
#endif #endif
...@@ -640,6 +640,7 @@ static int smu_sw_init(void *handle) ...@@ -640,6 +640,7 @@ static int smu_sw_init(void *handle)
mutex_init(&smu->message_lock); mutex_init(&smu->message_lock);
INIT_WORK(&smu->throttling_logging_work, smu_throttling_logging_work_fn); INIT_WORK(&smu->throttling_logging_work, smu_throttling_logging_work_fn);
atomic64_set(&smu->throttle_int_counter, 0);
smu->watermarks_bitmap = 0; smu->watermarks_bitmap = 0;
smu->power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT; smu->power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT;
smu->default_power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT; smu->default_power_profile_mode = PP_SMC_POWER_PROFILE_BOOTUP_DEFAULT;
......
...@@ -2251,6 +2251,7 @@ static void arcturus_log_thermal_throttling_event(struct smu_context *smu) ...@@ -2251,6 +2251,7 @@ static void arcturus_log_thermal_throttling_event(struct smu_context *smu)
dev_warn(adev->dev, "WARN: GPU thermal throttling temperature reached, expect performance decrease. %s.\n", dev_warn(adev->dev, "WARN: GPU thermal throttling temperature reached, expect performance decrease. %s.\n",
log_buf); log_buf);
kgd2kfd_smi_event_throttle(smu->adev->kfd.dev, throttler_status);
} }
static const struct pptable_funcs arcturus_ppt_funcs = { static const struct pptable_funcs arcturus_ppt_funcs = {
......
...@@ -446,6 +446,7 @@ struct smu_context ...@@ -446,6 +446,7 @@ struct smu_context
bool dc_controlled_by_gpio; bool dc_controlled_by_gpio;
struct work_struct throttling_logging_work; struct work_struct throttling_logging_work;
atomic64_t throttle_int_counter;
}; };
struct i2c_adapter; struct i2c_adapter;
......
...@@ -1311,6 +1311,11 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev, ...@@ -1311,6 +1311,11 @@ static int smu_v11_0_irq_process(struct amdgpu_device *adev,
smu_v11_0_ack_ac_dc_interrupt(&adev->smu); smu_v11_0_ack_ac_dc_interrupt(&adev->smu);
break; break;
case 0x7: case 0x7:
/*
* Increment the throttle interrupt counter
*/
atomic64_inc(&smu->throttle_int_counter);
if (!atomic_read(&adev->throttling_logging_enabled)) if (!atomic_read(&adev->throttling_logging_enabled))
return 0; return 0;
......
...@@ -451,6 +451,7 @@ struct kfd_ioctl_import_dmabuf_args { ...@@ -451,6 +451,7 @@ struct kfd_ioctl_import_dmabuf_args {
*/ */
/* Event type (defined by bitmask) */ /* Event type (defined by bitmask) */
#define KFD_SMI_EVENT_VMFAULT 0x0000000000000001 #define KFD_SMI_EVENT_VMFAULT 0x0000000000000001
#define KFD_SMI_EVENT_THERMAL_THROTTLE 0x0000000000000002
struct kfd_ioctl_smi_events_args { struct kfd_ioctl_smi_events_args {
__u32 gpuid; /* to KFD */ __u32 gpuid; /* to KFD */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment