Commit e0f1e65b authored by Philip Yang, committed by Alex Deucher

drm/amdkfd: Add GPU recoverable fault SMI event

Use ktime_get_boottime_ns() as the timestamp so that events can be
correlated with other APIs. Output a timestamp when recovery of a GPU
recoverable fault starts and when it ends, together with the fault
address, whether it was a read or a write fault, and whether the fault
was recovered by migration or only by updating the GPU page table.
Signed-off-by: Philip Yang <Philip.Yang@amd.com>
Reviewed-by: Felix Kuehling <Felix.Kuehling@amd.com>
Signed-off-by: Alex Deucher <alexander.deucher@amd.com>
parent 163a5a58
...@@ -244,6 +244,23 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid) ...@@ -244,6 +244,23 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid)
task_info.pid, task_info.task_name); task_info.pid, task_info.task_name);
} }
/*
 * Emit a KFD_SMI_EVENT_PAGE_FAULT_START SMI event for a GPU page fault.
 *
 * The event text carries, in order: @ts converted to nanoseconds, the
 * process id @pid, the faulting @address, the device id (dev->id) and a
 * one-character access tag: 'W' when @write_fault is true, 'R' otherwise.
 */
void kfd_smi_event_page_fault_start(struct kfd_dev *dev, pid_t pid,
				    unsigned long address, bool write_fault,
				    ktime_t ts)
{
	char access_tag = 'R';

	if (write_fault)
		access_tag = 'W';

	kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_PAGE_FAULT_START,
			  "%lld -%d @%lx(%x) %c\n",
			  ktime_to_ns(ts), pid, address, dev->id, access_tag);
}
/*
 * Emit a KFD_SMI_EVENT_PAGE_FAULT_END SMI event for a GPU page fault.
 *
 * The event is stamped with the value of ktime_get_boottime_ns() taken at
 * the time of the call, followed by the process id @pid, the faulting
 * @address, the device id (dev->id) and a one-character tag: 'M' when
 * @migration is true, 'U' otherwise.
 */
void kfd_smi_event_page_fault_end(struct kfd_dev *dev, pid_t pid,
				  unsigned long address, bool migration)
{
	char recovery_tag = 'U';

	if (migration)
		recovery_tag = 'M';

	kfd_smi_event_add(pid, dev, KFD_SMI_EVENT_PAGE_FAULT_END,
			  "%lld -%d @%lx(%x) %c\n",
			  ktime_get_boottime_ns(), pid, address, dev->id,
			  recovery_tag);
}
int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd) int kfd_smi_event_open(struct kfd_dev *dev, uint32_t *fd)
{ {
struct kfd_smi_client *client; struct kfd_smi_client *client;
......
...@@ -29,5 +29,9 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid); ...@@ -29,5 +29,9 @@ void kfd_smi_event_update_vmfault(struct kfd_dev *dev, uint16_t pasid);
void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev, void kfd_smi_event_update_thermal_throttling(struct kfd_dev *dev,
uint64_t throttle_bitmask); uint64_t throttle_bitmask);
void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset); void kfd_smi_event_update_gpu_reset(struct kfd_dev *dev, bool post_reset);
/*
 * Report a KFD_SMI_EVENT_PAGE_FAULT_START event for process pid on dev.
 * ts is logged as the event timestamp (in nanoseconds); write_fault selects
 * the 'W' or 'R' tag in the event text.
 */
void kfd_smi_event_page_fault_start(struct kfd_dev *dev, pid_t pid,
unsigned long address, bool write_fault,
ktime_t ts);
/*
 * Report a KFD_SMI_EVENT_PAGE_FAULT_END event for process pid on dev,
 * stamped with ktime_get_boottime_ns() at the time of the call; migration
 * selects the 'M' or 'U' tag in the event text.
 */
void kfd_smi_event_page_fault_end(struct kfd_dev *dev, pid_t pid,
unsigned long address, bool migration);
#endif #endif
...@@ -32,6 +32,7 @@ ...@@ -32,6 +32,7 @@
#include "kfd_priv.h" #include "kfd_priv.h"
#include "kfd_svm.h" #include "kfd_svm.h"
#include "kfd_migrate.h" #include "kfd_migrate.h"
#include "kfd_smi_events.h"
#ifdef dev_fmt #ifdef dev_fmt
#undef dev_fmt #undef dev_fmt
...@@ -43,7 +44,7 @@ ...@@ -43,7 +44,7 @@
/* Long enough to ensure no retry fault comes after svm range is restored and /* Long enough to ensure no retry fault comes after svm range is restored and
* page table is updated. * page table is updated.
*/ */
#define AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING 2000 #define AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING (2UL * NSEC_PER_MSEC)
struct criu_svm_metadata { struct criu_svm_metadata {
struct list_head list; struct list_head list;
...@@ -1617,7 +1618,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm, ...@@ -1617,7 +1618,7 @@ static int svm_range_validate_and_map(struct mm_struct *mm,
svm_range_unreserve_bos(&ctx); svm_range_unreserve_bos(&ctx);
if (!r) if (!r)
prange->validate_timestamp = ktime_to_us(ktime_get()); prange->validate_timestamp = ktime_get_boottime();
return r; return r;
} }
...@@ -2694,11 +2695,12 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, ...@@ -2694,11 +2695,12 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
struct svm_range_list *svms; struct svm_range_list *svms;
struct svm_range *prange; struct svm_range *prange;
struct kfd_process *p; struct kfd_process *p;
uint64_t timestamp; ktime_t timestamp = ktime_get_boottime();
int32_t best_loc; int32_t best_loc;
int32_t gpuidx = MAX_GPU_INSTANCE; int32_t gpuidx = MAX_GPU_INSTANCE;
bool write_locked = false; bool write_locked = false;
struct vm_area_struct *vma; struct vm_area_struct *vma;
bool migration = false;
int r = 0; int r = 0;
if (!KFD_IS_SVM_API_SUPPORTED(adev->kfd.dev)) { if (!KFD_IS_SVM_API_SUPPORTED(adev->kfd.dev)) {
...@@ -2775,9 +2777,9 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, ...@@ -2775,9 +2777,9 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
goto out_unlock_range; goto out_unlock_range;
} }
timestamp = ktime_to_us(ktime_get()) - prange->validate_timestamp;
/* skip duplicate vm fault on different pages of same range */ /* skip duplicate vm fault on different pages of same range */
if (timestamp < AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING) { if (ktime_before(timestamp, ktime_add_ns(prange->validate_timestamp,
AMDGPU_SVM_RANGE_RETRY_FAULT_PENDING))) {
pr_debug("svms 0x%p [0x%lx %lx] already restored\n", pr_debug("svms 0x%p [0x%lx %lx] already restored\n",
svms, prange->start, prange->last); svms, prange->start, prange->last);
r = 0; r = 0;
...@@ -2813,7 +2815,11 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, ...@@ -2813,7 +2815,11 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
svms, prange->start, prange->last, best_loc, svms, prange->start, prange->last, best_loc,
prange->actual_loc); prange->actual_loc);
kfd_smi_event_page_fault_start(adev->kfd.dev, p->lead_thread->pid, addr,
write_fault, timestamp);
if (prange->actual_loc != best_loc) { if (prange->actual_loc != best_loc) {
migration = true;
if (best_loc) { if (best_loc) {
r = svm_migrate_to_vram(prange, best_loc, mm); r = svm_migrate_to_vram(prange, best_loc, mm);
if (r) { if (r) {
...@@ -2842,6 +2848,9 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid, ...@@ -2842,6 +2848,9 @@ svm_range_restore_pages(struct amdgpu_device *adev, unsigned int pasid,
pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n", pr_debug("failed %d to map svms 0x%p [0x%lx 0x%lx] to gpus\n",
r, svms, prange->start, prange->last); r, svms, prange->start, prange->last);
kfd_smi_event_page_fault_end(adev->kfd.dev, p->lead_thread->pid, addr,
migration);
out_unlock_range: out_unlock_range:
mutex_unlock(&prange->migrate_mutex); mutex_unlock(&prange->migrate_mutex);
out_unlock_svms: out_unlock_svms:
......
...@@ -125,7 +125,7 @@ struct svm_range { ...@@ -125,7 +125,7 @@ struct svm_range {
uint32_t actual_loc; uint32_t actual_loc;
uint8_t granularity; uint8_t granularity;
atomic_t invalid; atomic_t invalid;
uint64_t validate_timestamp; ktime_t validate_timestamp;
struct mmu_interval_notifier notifier; struct mmu_interval_notifier notifier;
struct svm_work_list_item work_item; struct svm_work_list_item work_item;
struct list_head deferred_list; struct list_head deferred_list;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment