Commit 27d19268 authored by Jacek Lawrynowicz

accel/ivpu: Improve recovery and reset support

  - Synchronize job submission with reset/recovery using reset_lock (see the sketch below)
  - Always print the recovery reason and call diagnose_failure()
  - Don't allow autosuspend during recovery
  - Prevent immediate autosuspend after reset/recovery
  - Prevent force_recovery from issuing TDR when the device is suspended
  - Reset the VPU instead of triggering recovery after changing debugfs params
Signed-off-by: Jacek Lawrynowicz <jacek.lawrynowicz@linux.intel.com>
Reviewed-by: Wachowski, Karol <karol.wachowski@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20240122120945.1150728-4-jacek.lawrynowicz@linux.intel.com
parent 264b271d
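The first bullet is the core of the change: job submission takes the new
reset_lock shared while reset/recovery takes it exclusive, so no job can
enter the hardware while a reset is in flight. As a rough illustration only,
here is a minimal userspace sketch of that pattern using a POSIX rwlock in
place of the kernel rw_semaphore; the job_submit()/device_reset() names are
hypothetical stand-ins, not driver functions.

#include <pthread.h>
#include <stdio.h>

static pthread_rwlock_t reset_lock = PTHREAD_RWLOCK_INITIALIZER;

/* Submission path: many submitters may hold the lock concurrently,
 * mirroring down_read(&vdev->pm->reset_lock) in ivpu_submit_ioctl(). */
static int job_submit(int job_id)
{
        pthread_rwlock_rdlock(&reset_lock);
        printf("submitting job %d\n", job_id);
        pthread_rwlock_unlock(&reset_lock);
        return 0;
}

/* Reset path: exclusive ownership, mirroring down_write() in
 * ivpu_pm_reset_prepare_cb() and up_write() in ivpu_pm_reset_done_cb(). */
static void device_reset(void)
{
        pthread_rwlock_wrlock(&reset_lock);
        printf("resetting device; no job can be submitted now\n");
        pthread_rwlock_unlock(&reset_lock);
}

int main(void)
{
        job_submit(1);
        device_reset();
        job_submit(2);
        return 0;
}

Build with "cc sketch.c -lpthread". A plain mutex would also serialize
submissions against each other, which the driver does not need; a rwsem lets
concurrent submissions proceed and only excludes them during reset.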
--- a/drivers/accel/ivpu/ivpu_debugfs.c
+++ b/drivers/accel/ivpu/ivpu_debugfs.c
@@ -102,7 +102,7 @@ static int reset_pending_show(struct seq_file *s, void *v)
 {
         struct ivpu_device *vdev = seq_to_ivpu(s);
 
-        seq_printf(s, "%d\n", atomic_read(&vdev->pm->in_reset));
+        seq_printf(s, "%d\n", atomic_read(&vdev->pm->reset_pending));
         return 0;
 }
@@ -130,7 +130,9 @@ dvfs_mode_fops_write(struct file *file, const char __user *user_buf, size_t size
 
         fw->dvfs_mode = dvfs_mode;
 
-        ivpu_pm_schedule_recovery(vdev);
+        ret = pci_try_reset_function(to_pci_dev(vdev->drm.dev));
+        if (ret)
+                return ret;
 
         return size;
 }
@@ -190,7 +192,10 @@ fw_profiling_freq_fops_write(struct file *file, const char __user *user_buf,
                 return ret;
 
         ivpu_hw_profiling_freq_drive(vdev, enable);
-        ivpu_pm_schedule_recovery(vdev);
+
+        ret = pci_try_reset_function(to_pci_dev(vdev->drm.dev));
+        if (ret)
+                return ret;
 
         return size;
 }
@@ -301,11 +306,18 @@ static ssize_t
 ivpu_force_recovery_fn(struct file *file, const char __user *user_buf, size_t size, loff_t *pos)
 {
         struct ivpu_device *vdev = file->private_data;
+        int ret;
 
         if (!size)
                 return -EINVAL;
 
-        ivpu_pm_schedule_recovery(vdev);
+        ret = ivpu_rpm_get(vdev);
+        if (ret)
+                return ret;
+
+        ivpu_pm_trigger_recovery(vdev, "debugfs");
+        flush_work(&vdev->pm->recovery_work);
+        ivpu_rpm_put(vdev);
         return size;
 }
--- a/drivers/accel/ivpu/ivpu_hw_37xx.c
+++ b/drivers/accel/ivpu/ivpu_hw_37xx.c
@@ -875,24 +875,18 @@ static void ivpu_hw_37xx_irq_disable(struct ivpu_device *vdev)
 
 static void ivpu_hw_37xx_irq_wdt_nce_handler(struct ivpu_device *vdev)
 {
-        ivpu_err_ratelimited(vdev, "WDT NCE irq\n");
-
-        ivpu_pm_schedule_recovery(vdev);
+        ivpu_pm_trigger_recovery(vdev, "WDT NCE IRQ");
 }
 
 static void ivpu_hw_37xx_irq_wdt_mss_handler(struct ivpu_device *vdev)
 {
-        ivpu_err_ratelimited(vdev, "WDT MSS irq\n");
-
         ivpu_hw_wdt_disable(vdev);
-        ivpu_pm_schedule_recovery(vdev);
+        ivpu_pm_trigger_recovery(vdev, "WDT MSS IRQ");
 }
 
 static void ivpu_hw_37xx_irq_noc_firewall_handler(struct ivpu_device *vdev)
 {
-        ivpu_err_ratelimited(vdev, "NOC Firewall irq\n");
-
-        ivpu_pm_schedule_recovery(vdev);
+        ivpu_pm_trigger_recovery(vdev, "NOC Firewall IRQ");
 }
 
 /* Handler for IRQs from VPU core (irqV) */
@@ -970,7 +964,7 @@ static bool ivpu_hw_37xx_irqb_handler(struct ivpu_device *vdev, int irq)
         REGB_WR32(VPU_37XX_BUTTRESS_INTERRUPT_STAT, status);
 
         if (schedule_recovery)
-                ivpu_pm_schedule_recovery(vdev);
+                ivpu_pm_trigger_recovery(vdev, "Buttress IRQ");
 
         return true;
 }
--- a/drivers/accel/ivpu/ivpu_hw_40xx.c
+++ b/drivers/accel/ivpu/ivpu_hw_40xx.c
@@ -1049,18 +1049,18 @@ static void ivpu_hw_40xx_irq_disable(struct ivpu_device *vdev)
 static void ivpu_hw_40xx_irq_wdt_nce_handler(struct ivpu_device *vdev)
 {
         /* TODO: For LNN hang consider engine reset instead of full recovery */
-        ivpu_pm_schedule_recovery(vdev);
+        ivpu_pm_trigger_recovery(vdev, "WDT NCE IRQ");
 }
 
 static void ivpu_hw_40xx_irq_wdt_mss_handler(struct ivpu_device *vdev)
 {
         ivpu_hw_wdt_disable(vdev);
-        ivpu_pm_schedule_recovery(vdev);
+        ivpu_pm_trigger_recovery(vdev, "WDT MSS IRQ");
 }
 
 static void ivpu_hw_40xx_irq_noc_firewall_handler(struct ivpu_device *vdev)
 {
-        ivpu_pm_schedule_recovery(vdev);
+        ivpu_pm_trigger_recovery(vdev, "NOC Firewall IRQ");
 }
 
 /* Handler for IRQs from VPU core (irqV) */
@@ -1154,7 +1154,7 @@ static bool ivpu_hw_40xx_irqb_handler(struct ivpu_device *vdev, int irq)
         REGB_WR32(VPU_40XX_BUTTRESS_INTERRUPT_STAT, status);
 
         if (schedule_recovery)
-                ivpu_pm_schedule_recovery(vdev);
+                ivpu_pm_trigger_recovery(vdev, "Buttress IRQ");
 
         return true;
 }
--- a/drivers/accel/ivpu/ivpu_ipc.c
+++ b/drivers/accel/ivpu/ivpu_ipc.c
@@ -343,10 +343,8 @@ int ivpu_ipc_send_receive_active(struct ivpu_device *vdev, struct vpu_jsm_msg *r
         hb_ret = ivpu_ipc_send_receive_internal(vdev, &hb_req, VPU_JSM_MSG_QUERY_ENGINE_HB_DONE,
                                                 &hb_resp, VPU_IPC_CHAN_ASYNC_CMD,
                                                 vdev->timeout.jsm);
-        if (hb_ret == -ETIMEDOUT) {
-                ivpu_hw_diagnose_failure(vdev);
-                ivpu_pm_schedule_recovery(vdev);
-        }
+        if (hb_ret == -ETIMEDOUT)
+                ivpu_pm_trigger_recovery(vdev, "IPC timeout");
 
         return ret;
 }
--- a/drivers/accel/ivpu/ivpu_job.c
+++ b/drivers/accel/ivpu/ivpu_job.c
@@ -515,7 +515,9 @@ int ivpu_submit_ioctl(struct drm_device *dev, void *data, struct drm_file *file)
                 goto err_destroy_job;
         }
 
+        down_read(&vdev->pm->reset_lock);
         ret = ivpu_job_submit(job);
+        up_read(&vdev->pm->reset_lock);
         if (ret)
                 goto err_signal_fence;
--- a/drivers/accel/ivpu/ivpu_mmu.c
+++ b/drivers/accel/ivpu/ivpu_mmu.c
@@ -887,7 +887,6 @@ static u32 *ivpu_mmu_get_event(struct ivpu_device *vdev)
 
 void ivpu_mmu_irq_evtq_handler(struct ivpu_device *vdev)
 {
-        bool schedule_recovery = false;
         u32 *event;
         u32 ssid;
 
@@ -897,14 +896,13 @@ void ivpu_mmu_irq_evtq_handler(struct ivpu_device *vdev)
                 ivpu_mmu_dump_event(vdev, event);
 
                 ssid = FIELD_GET(IVPU_MMU_EVT_SSID_MASK, event[0]);
-                if (ssid == IVPU_GLOBAL_CONTEXT_MMU_SSID)
-                        schedule_recovery = true;
-                else
-                        ivpu_mmu_user_context_mark_invalid(vdev, ssid);
-        }
+                if (ssid == IVPU_GLOBAL_CONTEXT_MMU_SSID) {
+                        ivpu_pm_trigger_recovery(vdev, "MMU event");
+                        return;
+                }
 
-        if (schedule_recovery)
-                ivpu_pm_schedule_recovery(vdev);
+                ivpu_mmu_user_context_mark_invalid(vdev, ssid);
+        }
 }
 
 void ivpu_mmu_evtq_dump(struct ivpu_device *vdev)
--- a/drivers/accel/ivpu/ivpu_pm.c
+++ b/drivers/accel/ivpu/ivpu_pm.c
@@ -112,6 +112,14 @@ static void ivpu_pm_recovery_work(struct work_struct *work)
         char *evt[2] = {"IVPU_PM_EVENT=IVPU_RECOVER", NULL};
         int ret;
 
+        ivpu_err(vdev, "Recovering the VPU (reset #%d)\n", atomic_read(&vdev->pm->reset_counter));
+
+        ret = pm_runtime_resume_and_get(vdev->drm.dev);
+        if (ret)
+                ivpu_err(vdev, "Failed to resume VPU: %d\n", ret);
+
+        ivpu_fw_log_dump(vdev);
+
 retry:
         ret = pci_try_reset_function(to_pci_dev(vdev->drm.dev));
         if (ret == -EAGAIN && !drm_dev_is_unplugged(&vdev->drm)) {
@@ -123,11 +131,13 @@ static void ivpu_pm_recovery_work(struct work_struct *work)
                 ivpu_err(vdev, "Failed to reset VPU: %d\n", ret);
 
         kobject_uevent_env(&vdev->drm.dev->kobj, KOBJ_CHANGE, evt);
+        pm_runtime_mark_last_busy(vdev->drm.dev);
+        pm_runtime_put_autosuspend(vdev->drm.dev);
 }
 
-void ivpu_pm_schedule_recovery(struct ivpu_device *vdev)
+void ivpu_pm_trigger_recovery(struct ivpu_device *vdev, const char *reason)
 {
-        struct ivpu_pm_info *pm = vdev->pm;
+        ivpu_err(vdev, "Recovery triggered by %s\n", reason);
 
         if (ivpu_disable_recovery) {
                 ivpu_err(vdev, "Recovery not available when disable_recovery param is set\n");
@@ -139,10 +149,11 @@ void ivpu_pm_schedule_recovery(struct ivpu_device *vdev)
                 return;
         }
 
-        /* Schedule recovery if it's not in progress */
-        if (atomic_cmpxchg(&pm->in_reset, 0, 1) == 0) {
-                ivpu_hw_irq_disable(vdev);
-                queue_work(system_long_wq, &pm->recovery_work);
+        /* Trigger recovery if it's not in progress */
+        if (atomic_cmpxchg(&vdev->pm->reset_pending, 0, 1) == 0) {
+                ivpu_hw_diagnose_failure(vdev);
+                ivpu_hw_irq_disable(vdev); /* Disable IRQ early to protect from IRQ storm */
+                queue_work(system_long_wq, &vdev->pm->recovery_work);
         }
 }
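The atomic_cmpxchg() on reset_pending above implements a "first caller wins"
trigger: any number of IRQ handlers and timeout paths may report a failure
concurrently, but only the caller that flips the flag from 0 to 1 queues the
recovery work. A minimal userspace sketch of the same idea with C11 atomics
(illustrative only; this trigger_recovery() is a hypothetical stand-in, not
the driver function):

#include <stdatomic.h>
#include <stdio.h>

static atomic_int reset_pending;

static void trigger_recovery(const char *reason)
{
        int expected = 0;

        printf("recovery triggered by %s\n", reason);
        /* Like: if (atomic_cmpxchg(&pm->reset_pending, 0, 1) == 0) */
        if (atomic_compare_exchange_strong(&reset_pending, &expected, 1))
                printf("queueing recovery work (first trigger)\n");
        else
                printf("recovery already pending, skipping\n");
}

int main(void)
{
        trigger_recovery("TDR");          /* wins, queues the work */
        trigger_recovery("Buttress IRQ"); /* deduplicated */
        return 0;
}

The flag is cleared again in ivpu_pm_reset_done_cb() once the reset has
completed, re-arming the trigger for the next failure.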
@@ -150,12 +161,8 @@ static void ivpu_job_timeout_work(struct work_struct *work)
 {
         struct ivpu_pm_info *pm = container_of(work, struct ivpu_pm_info, job_timeout_work.work);
         struct ivpu_device *vdev = pm->vdev;
-        unsigned long timeout_ms = ivpu_tdr_timeout_ms ? ivpu_tdr_timeout_ms : vdev->timeout.tdr;
 
-        ivpu_err(vdev, "TDR detected, timeout %lu ms", timeout_ms);
-        ivpu_hw_diagnose_failure(vdev);
-
-        ivpu_pm_schedule_recovery(vdev);
+        ivpu_pm_trigger_recovery(vdev, "TDR");
 }
 
 void ivpu_start_job_timeout_detection(struct ivpu_device *vdev)
@@ -228,6 +235,9 @@ int ivpu_pm_runtime_suspend_cb(struct device *dev)
         bool hw_is_idle = true;
         int ret;
 
+        drm_WARN_ON(&vdev->drm, !xa_empty(&vdev->submitted_jobs_xa));
+        drm_WARN_ON(&vdev->drm, work_pending(&vdev->pm->recovery_work));
+
         ivpu_dbg(vdev, PM, "Runtime suspend..\n");
 
         if (!ivpu_hw_is_idle(vdev) && vdev->pm->suspend_reschedule_counter) {
@@ -310,11 +320,12 @@ void ivpu_pm_reset_prepare_cb(struct pci_dev *pdev)
 {
         struct ivpu_device *vdev = pci_get_drvdata(pdev);
 
+        pm_runtime_get_sync(vdev->drm.dev);
         ivpu_dbg(vdev, PM, "Pre-reset..\n");
         atomic_inc(&vdev->pm->reset_counter);
-        atomic_set(&vdev->pm->in_reset, 1);
-        pm_runtime_get_sync(vdev->drm.dev);
+        atomic_set(&vdev->pm->reset_pending, 1);
+        down_write(&vdev->pm->reset_lock);
         ivpu_prepare_for_reset(vdev);
         ivpu_hw_reset(vdev);
         ivpu_pm_prepare_cold_boot(vdev);
@@ -331,9 +342,11 @@ void ivpu_pm_reset_done_cb(struct pci_dev *pdev)
         ret = ivpu_resume(vdev);
         if (ret)
                 ivpu_err(vdev, "Failed to set RESUME state: %d\n", ret);
-        atomic_set(&vdev->pm->in_reset, 0);
+        up_write(&vdev->pm->reset_lock);
+        atomic_set(&vdev->pm->reset_pending, 0);
         ivpu_dbg(vdev, PM, "Post-reset done.\n");
 
+        pm_runtime_mark_last_busy(vdev->drm.dev);
         pm_runtime_put_autosuspend(vdev->drm.dev);
 }
@@ -346,7 +359,10 @@ void ivpu_pm_init(struct ivpu_device *vdev)
         pm->vdev = vdev;
         pm->suspend_reschedule_counter = PM_RESCHEDULE_LIMIT;
 
-        atomic_set(&pm->in_reset, 0);
+        init_rwsem(&pm->reset_lock);
+        atomic_set(&pm->reset_pending, 0);
+        atomic_set(&pm->reset_counter, 0);
+
         INIT_WORK(&pm->recovery_work, ivpu_pm_recovery_work);
         INIT_DELAYED_WORK(&pm->job_timeout_work, ivpu_job_timeout_work);
--- a/drivers/accel/ivpu/ivpu_pm.h
+++ b/drivers/accel/ivpu/ivpu_pm.h
@@ -6,6 +6,7 @@
 #ifndef __IVPU_PM_H__
 #define __IVPU_PM_H__
 
+#include <linux/rwsem.h>
 #include <linux/types.h>
 
 struct ivpu_device;
@@ -14,8 +15,9 @@ struct ivpu_pm_info {
         struct ivpu_device *vdev;
         struct delayed_work job_timeout_work;
         struct work_struct recovery_work;
-        atomic_t in_reset;
+        struct rw_semaphore reset_lock;
         atomic_t reset_counter;
+        atomic_t reset_pending;
         bool is_warmboot;
         u32 suspend_reschedule_counter;
 };
@@ -37,7 +39,7 @@ int __must_check ivpu_rpm_get(struct ivpu_device *vdev);
 int __must_check ivpu_rpm_get_if_active(struct ivpu_device *vdev);
 void ivpu_rpm_put(struct ivpu_device *vdev);
 
-void ivpu_pm_schedule_recovery(struct ivpu_device *vdev);
+void ivpu_pm_trigger_recovery(struct ivpu_device *vdev, const char *reason);
 void ivpu_start_job_timeout_detection(struct ivpu_device *vdev);
 void ivpu_stop_job_timeout_detection(struct ivpu_device *vdev);