Commit 6d7a20c0 authored by Lucas Stach

drm/etnaviv: replace hangcheck with scheduler timeout

This replaces the etnaviv internal hangcheck logic with the job timeout
handling provided by the DRM scheduler. This simplifies the driver further
and allows jobs to be replayed after a GPU reset, so only minimal state is lost.

This introduces a user-visible change: jobs are no longer allowed to run
indefinitely as long as they make progress, as this causes quality of
service issues when multiple processes are using the GPU.
Userspace is now responsible for flushing jobs in a way that they finish in a
reasonable time, where reasonable is currently defined as less than 500ms.
Signed-off-by: Lucas Stach <l.stach@pengutronix.de>
parent e0580254
...@@ -20,9 +20,13 @@ ...@@ -20,9 +20,13 @@
#include "etnaviv_gem.h" #include "etnaviv_gem.h"
#include "etnaviv_gpu.h" #include "etnaviv_gpu.h"
#include "etnaviv_mmu.h" #include "etnaviv_mmu.h"
#include "etnaviv_sched.h"
#include "state.xml.h" #include "state.xml.h"
#include "state_hi.xml.h" #include "state_hi.xml.h"
static bool etnaviv_dump_core = true;
module_param_named(dump_core, etnaviv_dump_core, bool, 0600);
struct core_dump_iterator { struct core_dump_iterator {
void *start; void *start;
struct etnaviv_dump_object_header *hdr; struct etnaviv_dump_object_header *hdr;
...@@ -121,10 +125,16 @@ void etnaviv_core_dump(struct etnaviv_gpu *gpu) ...@@ -121,10 +125,16 @@ void etnaviv_core_dump(struct etnaviv_gpu *gpu)
struct etnaviv_vram_mapping *vram; struct etnaviv_vram_mapping *vram;
struct etnaviv_gem_object *obj; struct etnaviv_gem_object *obj;
struct etnaviv_gem_submit *submit; struct etnaviv_gem_submit *submit;
struct drm_sched_job *s_job;
unsigned int n_obj, n_bomap_pages; unsigned int n_obj, n_bomap_pages;
size_t file_size, mmu_size; size_t file_size, mmu_size;
__le64 *bomap, *bomap_start; __le64 *bomap, *bomap_start;
/* Only catch the first event, or when manually re-armed */
if (!etnaviv_dump_core)
return;
etnaviv_dump_core = false;
mmu_size = etnaviv_iommu_dump_size(gpu->mmu); mmu_size = etnaviv_iommu_dump_size(gpu->mmu);
/* We always dump registers, mmu, ring and end marker */ /* We always dump registers, mmu, ring and end marker */
...@@ -135,10 +145,13 @@ void etnaviv_core_dump(struct etnaviv_gpu *gpu) ...@@ -135,10 +145,13 @@ void etnaviv_core_dump(struct etnaviv_gpu *gpu)
mmu_size + gpu->buffer.size; mmu_size + gpu->buffer.size;
/* Add in the active command buffers */ /* Add in the active command buffers */
list_for_each_entry(submit, &gpu->active_submit_list, node) { spin_lock(&gpu->sched.job_list_lock);
list_for_each_entry(s_job, &gpu->sched.ring_mirror_list, node) {
submit = to_etnaviv_submit(s_job);
file_size += submit->cmdbuf.size; file_size += submit->cmdbuf.size;
n_obj++; n_obj++;
} }
spin_unlock(&gpu->sched.job_list_lock);
/* Add in the active buffer objects */ /* Add in the active buffer objects */
list_for_each_entry(vram, &gpu->mmu->mappings, mmu_node) { list_for_each_entry(vram, &gpu->mmu->mappings, mmu_node) {
...@@ -180,10 +193,14 @@ void etnaviv_core_dump(struct etnaviv_gpu *gpu) ...@@ -180,10 +193,14 @@ void etnaviv_core_dump(struct etnaviv_gpu *gpu)
gpu->buffer.size, gpu->buffer.size,
etnaviv_cmdbuf_get_va(&gpu->buffer)); etnaviv_cmdbuf_get_va(&gpu->buffer));
list_for_each_entry(submit, &gpu->active_submit_list, node) spin_lock(&gpu->sched.job_list_lock);
list_for_each_entry(s_job, &gpu->sched.ring_mirror_list, node) {
submit = to_etnaviv_submit(s_job);
etnaviv_core_dump_mem(&iter, ETDUMP_BUF_CMD, etnaviv_core_dump_mem(&iter, ETDUMP_BUF_CMD,
submit->cmdbuf.vaddr, submit->cmdbuf.size, submit->cmdbuf.vaddr, submit->cmdbuf.size,
etnaviv_cmdbuf_get_va(&submit->cmdbuf)); etnaviv_cmdbuf_get_va(&submit->cmdbuf));
}
spin_unlock(&gpu->sched.job_list_lock);
/* Reserve space for the bomap */ /* Reserve space for the bomap */
if (n_bomap_pages) { if (n_bomap_pages) {
......
...@@ -542,7 +542,6 @@ int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data, ...@@ -542,7 +542,6 @@ int etnaviv_ioctl_gem_submit(struct drm_device *dev, void *data,
goto err_submit_objects; goto err_submit_objects;
memcpy(submit->cmdbuf.vaddr, stream, args->stream_size); memcpy(submit->cmdbuf.vaddr, stream, args->stream_size);
submit->cmdbuf.user_size = ALIGN(args->stream_size, 8);
ret = submit_lock_objects(submit, &ticket); ret = submit_lock_objects(submit, &ticket);
if (ret) if (ret)
......
...@@ -41,9 +41,6 @@ static const struct platform_device_id gpu_ids[] = { ...@@ -41,9 +41,6 @@ static const struct platform_device_id gpu_ids[] = {
{ }, { },
}; };
static bool etnaviv_dump_core = true;
module_param_named(dump_core, etnaviv_dump_core, bool, 0600);
/* /*
* Driver functions: * Driver functions:
*/ */
...@@ -919,38 +916,24 @@ int etnaviv_gpu_debugfs(struct etnaviv_gpu *gpu, struct seq_file *m) ...@@ -919,38 +916,24 @@ int etnaviv_gpu_debugfs(struct etnaviv_gpu *gpu, struct seq_file *m)
} }
#endif #endif
/* void etnaviv_gpu_recover_hang(struct etnaviv_gpu *gpu)
* Hangcheck detection for locked gpu:
*/
static void recover_worker(struct work_struct *work)
{ {
struct etnaviv_gpu *gpu = container_of(work, struct etnaviv_gpu,
recover_work);
unsigned long flags; unsigned long flags;
unsigned int i = 0; unsigned int i = 0;
dev_err(gpu->dev, "hangcheck recover!\n"); dev_err(gpu->dev, "recover hung GPU!\n");
if (pm_runtime_get_sync(gpu->dev) < 0) if (pm_runtime_get_sync(gpu->dev) < 0)
return; return;
mutex_lock(&gpu->lock); mutex_lock(&gpu->lock);
/* Only catch the first event, or when manually re-armed */
if (etnaviv_dump_core) {
etnaviv_core_dump(gpu);
etnaviv_dump_core = false;
}
etnaviv_hw_reset(gpu); etnaviv_hw_reset(gpu);
/* complete all events, the GPU won't do it after the reset */ /* complete all events, the GPU won't do it after the reset */
spin_lock_irqsave(&gpu->event_spinlock, flags); spin_lock_irqsave(&gpu->event_spinlock, flags);
for_each_set_bit_from(i, gpu->event_bitmap, ETNA_NR_EVENTS) { for_each_set_bit_from(i, gpu->event_bitmap, ETNA_NR_EVENTS)
dma_fence_signal(gpu->event[i].fence);
gpu->event[i].fence = NULL;
complete(&gpu->event_free); complete(&gpu->event_free);
}
bitmap_zero(gpu->event_bitmap, ETNA_NR_EVENTS); bitmap_zero(gpu->event_bitmap, ETNA_NR_EVENTS);
spin_unlock_irqrestore(&gpu->event_spinlock, flags); spin_unlock_irqrestore(&gpu->event_spinlock, flags);
gpu->completed_fence = gpu->active_fence; gpu->completed_fence = gpu->active_fence;
...@@ -964,53 +947,6 @@ static void recover_worker(struct work_struct *work) ...@@ -964,53 +947,6 @@ static void recover_worker(struct work_struct *work)
pm_runtime_put_autosuspend(gpu->dev); pm_runtime_put_autosuspend(gpu->dev);
} }
static void hangcheck_timer_reset(struct etnaviv_gpu *gpu)
{
DBG("%s", dev_name(gpu->dev));
mod_timer(&gpu->hangcheck_timer,
round_jiffies_up(jiffies + DRM_ETNAVIV_HANGCHECK_JIFFIES));
}
static void hangcheck_handler(struct timer_list *t)
{
struct etnaviv_gpu *gpu = from_timer(gpu, t, hangcheck_timer);
u32 fence = gpu->completed_fence;
bool progress = false;
if (fence != gpu->hangcheck_fence) {
gpu->hangcheck_fence = fence;
progress = true;
}
if (!progress) {
u32 dma_addr = gpu_read(gpu, VIVS_FE_DMA_ADDRESS);
int change = dma_addr - gpu->hangcheck_dma_addr;
if (change < 0 || change > 16) {
gpu->hangcheck_dma_addr = dma_addr;
progress = true;
}
}
if (!progress && fence_after(gpu->active_fence, fence)) {
dev_err(gpu->dev, "hangcheck detected gpu lockup!\n");
dev_err(gpu->dev, " completed fence: %u\n", fence);
dev_err(gpu->dev, " active fence: %u\n",
gpu->active_fence);
queue_work(gpu->wq, &gpu->recover_work);
}
/* if still more pending work, reset the hangcheck timer: */
if (fence_after(gpu->active_fence, gpu->hangcheck_fence))
hangcheck_timer_reset(gpu);
}
static void hangcheck_disable(struct etnaviv_gpu *gpu)
{
del_timer_sync(&gpu->hangcheck_timer);
cancel_work_sync(&gpu->recover_work);
}
/* fence object management */ /* fence object management */
struct etnaviv_fence { struct etnaviv_fence {
struct etnaviv_gpu *gpu; struct etnaviv_gpu *gpu;
...@@ -1286,10 +1222,12 @@ struct dma_fence *etnaviv_gpu_submit(struct etnaviv_gem_submit *submit) ...@@ -1286,10 +1222,12 @@ struct dma_fence *etnaviv_gpu_submit(struct etnaviv_gem_submit *submit)
unsigned int i, nr_events = 1, event[3]; unsigned int i, nr_events = 1, event[3];
int ret; int ret;
if (!submit->runtime_resumed) {
ret = pm_runtime_get_sync(gpu->dev); ret = pm_runtime_get_sync(gpu->dev);
if (ret < 0) if (ret < 0)
return NULL; return NULL;
submit->runtime_resumed = true; submit->runtime_resumed = true;
}
/* /*
* if there are performance monitor requests we need to have * if there are performance monitor requests we need to have
...@@ -1327,6 +1265,7 @@ struct dma_fence *etnaviv_gpu_submit(struct etnaviv_gem_submit *submit) ...@@ -1327,6 +1265,7 @@ struct dma_fence *etnaviv_gpu_submit(struct etnaviv_gem_submit *submit)
} }
gpu->event[event[0]].fence = gpu_fence; gpu->event[event[0]].fence = gpu_fence;
submit->cmdbuf.user_size = submit->cmdbuf.size - 8;
etnaviv_buffer_queue(gpu, submit->exec_state, event[0], etnaviv_buffer_queue(gpu, submit->exec_state, event[0],
&submit->cmdbuf); &submit->cmdbuf);
...@@ -1337,8 +1276,6 @@ struct dma_fence *etnaviv_gpu_submit(struct etnaviv_gem_submit *submit) ...@@ -1337,8 +1276,6 @@ struct dma_fence *etnaviv_gpu_submit(struct etnaviv_gem_submit *submit)
etnaviv_sync_point_queue(gpu, event[2]); etnaviv_sync_point_queue(gpu, event[2]);
} }
hangcheck_timer_reset(gpu);
out_unlock: out_unlock:
mutex_unlock(&gpu->lock); mutex_unlock(&gpu->lock);
...@@ -1626,13 +1563,9 @@ static int etnaviv_gpu_bind(struct device *dev, struct device *master, ...@@ -1626,13 +1563,9 @@ static int etnaviv_gpu_bind(struct device *dev, struct device *master,
idr_init(&gpu->fence_idr); idr_init(&gpu->fence_idr);
spin_lock_init(&gpu->fence_spinlock); spin_lock_init(&gpu->fence_spinlock);
INIT_LIST_HEAD(&gpu->active_submit_list);
INIT_WORK(&gpu->sync_point_work, sync_point_worker); INIT_WORK(&gpu->sync_point_work, sync_point_worker);
INIT_WORK(&gpu->recover_work, recover_worker);
init_waitqueue_head(&gpu->fence_event); init_waitqueue_head(&gpu->fence_event);
timer_setup(&gpu->hangcheck_timer, hangcheck_handler, TIMER_DEFERRABLE);
priv->gpu[priv->num_gpus++] = gpu; priv->gpu[priv->num_gpus++] = gpu;
pm_runtime_mark_last_busy(gpu->dev); pm_runtime_mark_last_busy(gpu->dev);
...@@ -1660,8 +1593,6 @@ static void etnaviv_gpu_unbind(struct device *dev, struct device *master, ...@@ -1660,8 +1593,6 @@ static void etnaviv_gpu_unbind(struct device *dev, struct device *master,
DBG("%s", dev_name(gpu->dev)); DBG("%s", dev_name(gpu->dev));
hangcheck_disable(gpu);
flush_workqueue(gpu->wq); flush_workqueue(gpu->wq);
destroy_workqueue(gpu->wq); destroy_workqueue(gpu->wq);
......
...@@ -123,9 +123,6 @@ struct etnaviv_gpu { ...@@ -123,9 +123,6 @@ struct etnaviv_gpu {
struct completion event_free; struct completion event_free;
spinlock_t event_spinlock; spinlock_t event_spinlock;
/* list of currently in-flight command buffers */
struct list_head active_submit_list;
u32 idle_mask; u32 idle_mask;
/* Fencing support */ /* Fencing support */
...@@ -153,13 +150,6 @@ struct etnaviv_gpu { ...@@ -153,13 +150,6 @@ struct etnaviv_gpu {
struct clk *clk_core; struct clk *clk_core;
struct clk *clk_shader; struct clk *clk_shader;
/* Hang Detction: */
#define DRM_ETNAVIV_HANGCHECK_PERIOD 500 /* in ms */
#define DRM_ETNAVIV_HANGCHECK_JIFFIES msecs_to_jiffies(DRM_ETNAVIV_HANGCHECK_PERIOD)
struct timer_list hangcheck_timer;
u32 hangcheck_fence;
u32 hangcheck_dma_addr;
struct work_struct recover_work;
unsigned int freq_scale; unsigned int freq_scale;
unsigned long base_rate_core; unsigned long base_rate_core;
unsigned long base_rate_shader; unsigned long base_rate_shader;
...@@ -188,6 +178,7 @@ int etnaviv_gpu_init(struct etnaviv_gpu *gpu); ...@@ -188,6 +178,7 @@ int etnaviv_gpu_init(struct etnaviv_gpu *gpu);
int etnaviv_gpu_debugfs(struct etnaviv_gpu *gpu, struct seq_file *m); int etnaviv_gpu_debugfs(struct etnaviv_gpu *gpu, struct seq_file *m);
#endif #endif
void etnaviv_gpu_recover_hang(struct etnaviv_gpu *gpu);
void etnaviv_gpu_retire(struct etnaviv_gpu *gpu); void etnaviv_gpu_retire(struct etnaviv_gpu *gpu);
int etnaviv_gpu_wait_fence_interruptible(struct etnaviv_gpu *gpu, int etnaviv_gpu_wait_fence_interruptible(struct etnaviv_gpu *gpu,
u32 fence, struct timespec *timeout); u32 fence, struct timespec *timeout);
......
...@@ -14,24 +14,19 @@ ...@@ -14,24 +14,19 @@
* this program. If not, see <http://www.gnu.org/licenses/>. * this program. If not, see <http://www.gnu.org/licenses/>.
*/ */
#include <drm/gpu_scheduler.h>
#include <linux/kthread.h> #include <linux/kthread.h>
#include "etnaviv_drv.h" #include "etnaviv_drv.h"
#include "etnaviv_dump.h"
#include "etnaviv_gem.h" #include "etnaviv_gem.h"
#include "etnaviv_gpu.h" #include "etnaviv_gpu.h"
#include "etnaviv_sched.h"
static int etnaviv_job_hang_limit = 0; static int etnaviv_job_hang_limit = 0;
module_param_named(job_hang_limit, etnaviv_job_hang_limit, int , 0444); module_param_named(job_hang_limit, etnaviv_job_hang_limit, int , 0444);
static int etnaviv_hw_jobs_limit = 2; static int etnaviv_hw_jobs_limit = 2;
module_param_named(hw_job_limit, etnaviv_hw_jobs_limit, int , 0444); module_param_named(hw_job_limit, etnaviv_hw_jobs_limit, int , 0444);
static inline
struct etnaviv_gem_submit *to_etnaviv_submit(struct drm_sched_job *sched_job)
{
return container_of(sched_job, struct etnaviv_gem_submit, sched_job);
}
struct dma_fence *etnaviv_sched_dependency(struct drm_sched_job *sched_job, struct dma_fence *etnaviv_sched_dependency(struct drm_sched_job *sched_job,
struct drm_sched_entity *entity) struct drm_sched_entity *entity)
{ {
...@@ -86,34 +81,38 @@ struct dma_fence *etnaviv_sched_dependency(struct drm_sched_job *sched_job, ...@@ -86,34 +81,38 @@ struct dma_fence *etnaviv_sched_dependency(struct drm_sched_job *sched_job,
struct dma_fence *etnaviv_sched_run_job(struct drm_sched_job *sched_job) struct dma_fence *etnaviv_sched_run_job(struct drm_sched_job *sched_job)
{ {
struct etnaviv_gem_submit *submit = to_etnaviv_submit(sched_job); struct etnaviv_gem_submit *submit = to_etnaviv_submit(sched_job);
struct dma_fence *fence; struct dma_fence *fence = NULL;
mutex_lock(&submit->gpu->lock);
list_add_tail(&submit->node, &submit->gpu->active_submit_list);
mutex_unlock(&submit->gpu->lock);
if (likely(!sched_job->s_fence->finished.error))
fence = etnaviv_gpu_submit(submit); fence = etnaviv_gpu_submit(submit);
if (!fence) { else
etnaviv_submit_put(submit); dev_dbg(submit->gpu->dev, "skipping bad job\n");
return NULL;
}
return fence; return fence;
} }
static void etnaviv_sched_timedout_job(struct drm_sched_job *sched_job) static void etnaviv_sched_timedout_job(struct drm_sched_job *sched_job)
{ {
/* this replaces the hangcheck */ struct etnaviv_gem_submit *submit = to_etnaviv_submit(sched_job);
struct etnaviv_gpu *gpu = submit->gpu;
/* block scheduler */
kthread_park(gpu->sched.thread);
drm_sched_hw_job_reset(&gpu->sched, sched_job);
/* get the GPU back into the init state */
etnaviv_core_dump(gpu);
etnaviv_gpu_recover_hang(gpu);
/* restart scheduler after GPU is usable again */
drm_sched_job_recovery(&gpu->sched);
kthread_unpark(gpu->sched.thread);
} }
static void etnaviv_sched_free_job(struct drm_sched_job *sched_job) static void etnaviv_sched_free_job(struct drm_sched_job *sched_job)
{ {
struct etnaviv_gem_submit *submit = to_etnaviv_submit(sched_job); struct etnaviv_gem_submit *submit = to_etnaviv_submit(sched_job);
mutex_lock(&submit->gpu->lock);
list_del(&submit->node);
mutex_unlock(&submit->gpu->lock);
etnaviv_submit_put(submit); etnaviv_submit_put(submit);
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment