Commit a4be3dca authored by John Harrison's avatar John Harrison

drm/i915: Fix up locking around dumping requests lists

The debugfs dump of requests was confused about what state requires
the execlist lock versus the GuC lock. There was also a bunch of
duplicated messy code between it and the error capture code.

So refactor the hung request search into a re-usable function. And
reduce the span of the execlist state lock to only the execlist
specific code paths. In order to do that, also move the report of hold
count (which is an execlist only concept) from the top level dump
function to the lower level execlist specific function. Also, move the
execlist specific code into the execlist source file.

v2: Rename some functions and move to more appropriate files (Daniele).
v3: Rename new execlist dump function (Daniele)

Fixes: dc0dad36 ("drm/i915/guc: Fix for error capture after full GPU reset with GuC")
Signed-off-by: default avatarJohn Harrison <John.C.Harrison@Intel.com>
Reviewed-by: default avatarDaniele Ceraolo Spurio <daniele.ceraolospurio@intel.com>
Acked-by: default avatarTvrtko Ursulin <tvrtko.ursulin@intel.com>
Cc: Matthew Brost <matthew.brost@intel.com>
Cc: Jani Nikula <jani.nikula@linux.intel.com>
Cc: Joonas Lahtinen <joonas.lahtinen@linux.intel.com>
Cc: Rodrigo Vivi <rodrigo.vivi@intel.com>
Cc: Matt Roper <matthew.d.roper@intel.com>
Cc: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Cc: Michael Cheng <michael.cheng@intel.com>
Cc: Lucas De Marchi <lucas.demarchi@intel.com>
Cc: Bruce Chang <yu.bruce.chang@intel.com>
Cc: Alan Previn <alan.previn.teres.alexis@intel.com>
Cc: Matthew Auld <matthew.auld@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20230127002842.3169194-4-John.C.Harrison@Intel.com
parent 3700e353
...@@ -250,8 +250,8 @@ void intel_engine_dump_active_requests(struct list_head *requests, ...@@ -250,8 +250,8 @@ void intel_engine_dump_active_requests(struct list_head *requests,
ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine, ktime_t intel_engine_get_busy_time(struct intel_engine_cs *engine,
ktime_t *now); ktime_t *now);
struct i915_request * void intel_engine_get_hung_entity(struct intel_engine_cs *engine,
intel_engine_execlist_find_hung_request(struct intel_engine_cs *engine); struct intel_context **ce, struct i915_request **rq);
u32 intel_engine_context_size(struct intel_gt *gt, u8 class); u32 intel_engine_context_size(struct intel_gt *gt, u8 class);
struct intel_context * struct intel_context *
......
...@@ -2114,17 +2114,6 @@ static void print_request_ring(struct drm_printer *m, struct i915_request *rq) ...@@ -2114,17 +2114,6 @@ static void print_request_ring(struct drm_printer *m, struct i915_request *rq)
} }
} }
static unsigned long list_count(struct list_head *list)
{
struct list_head *pos;
unsigned long count = 0;
list_for_each(pos, list)
count++;
return count;
}
static unsigned long read_ul(void *p, size_t x) static unsigned long read_ul(void *p, size_t x)
{ {
return *(unsigned long *)(p + x); return *(unsigned long *)(p + x);
...@@ -2216,11 +2205,11 @@ void intel_engine_dump_active_requests(struct list_head *requests, ...@@ -2216,11 +2205,11 @@ void intel_engine_dump_active_requests(struct list_head *requests,
} }
} }
static void engine_dump_active_requests(struct intel_engine_cs *engine, struct drm_printer *m) static void engine_dump_active_requests(struct intel_engine_cs *engine,
struct drm_printer *m)
{ {
struct intel_context *hung_ce = NULL;
struct i915_request *hung_rq = NULL; struct i915_request *hung_rq = NULL;
struct intel_context *ce;
bool guc;
/* /*
* No need for an engine->irq_seqno_barrier() before the seqno reads. * No need for an engine->irq_seqno_barrier() before the seqno reads.
...@@ -2229,29 +2218,20 @@ static void engine_dump_active_requests(struct intel_engine_cs *engine, struct d ...@@ -2229,29 +2218,20 @@ static void engine_dump_active_requests(struct intel_engine_cs *engine, struct d
* But the intention here is just to report an instantaneous snapshot * But the intention here is just to report an instantaneous snapshot
* so that's fine. * so that's fine.
*/ */
lockdep_assert_held(&engine->sched_engine->lock); intel_engine_get_hung_entity(engine, &hung_ce, &hung_rq);
drm_printf(m, "\tRequests:\n"); drm_printf(m, "\tRequests:\n");
guc = intel_uc_uses_guc_submission(&engine->gt->uc);
if (guc) {
ce = intel_engine_get_hung_context(engine);
if (ce)
hung_rq = intel_context_get_active_request(ce);
} else {
hung_rq = intel_engine_execlist_find_hung_request(engine);
if (hung_rq)
hung_rq = i915_request_get_rcu(hung_rq);
}
if (hung_rq) if (hung_rq)
engine_dump_request(hung_rq, m, "\t\thung"); engine_dump_request(hung_rq, m, "\t\thung");
else if (hung_ce)
drm_printf(m, "\t\tGot hung ce but no hung rq!\n");
if (guc) if (intel_uc_uses_guc_submission(&engine->gt->uc))
intel_guc_dump_active_requests(engine, hung_rq, m); intel_guc_dump_active_requests(engine, hung_rq, m);
else else
intel_engine_dump_active_requests(&engine->sched_engine->requests, intel_execlists_dump_active_requests(engine, hung_rq, m);
hung_rq, m);
if (hung_rq) if (hung_rq)
i915_request_put(hung_rq); i915_request_put(hung_rq);
} }
...@@ -2263,7 +2243,6 @@ void intel_engine_dump(struct intel_engine_cs *engine, ...@@ -2263,7 +2243,6 @@ void intel_engine_dump(struct intel_engine_cs *engine,
struct i915_gpu_error * const error = &engine->i915->gpu_error; struct i915_gpu_error * const error = &engine->i915->gpu_error;
struct i915_request *rq; struct i915_request *rq;
intel_wakeref_t wakeref; intel_wakeref_t wakeref;
unsigned long flags;
ktime_t dummy; ktime_t dummy;
if (header) { if (header) {
...@@ -2300,13 +2279,8 @@ void intel_engine_dump(struct intel_engine_cs *engine, ...@@ -2300,13 +2279,8 @@ void intel_engine_dump(struct intel_engine_cs *engine,
i915_reset_count(error)); i915_reset_count(error));
print_properties(engine, m); print_properties(engine, m);
spin_lock_irqsave(&engine->sched_engine->lock, flags);
engine_dump_active_requests(engine, m); engine_dump_active_requests(engine, m);
drm_printf(m, "\tOn hold?: %lu\n",
list_count(&engine->sched_engine->hold));
spin_unlock_irqrestore(&engine->sched_engine->lock, flags);
drm_printf(m, "\tMMIO base: 0x%08x\n", engine->mmio_base); drm_printf(m, "\tMMIO base: 0x%08x\n", engine->mmio_base);
wakeref = intel_runtime_pm_get_if_in_use(engine->uncore->rpm); wakeref = intel_runtime_pm_get_if_in_use(engine->uncore->rpm);
if (wakeref) { if (wakeref) {
...@@ -2352,8 +2326,7 @@ intel_engine_create_virtual(struct intel_engine_cs **siblings, ...@@ -2352,8 +2326,7 @@ intel_engine_create_virtual(struct intel_engine_cs **siblings,
return siblings[0]->cops->create_virtual(siblings, count, flags); return siblings[0]->cops->create_virtual(siblings, count, flags);
} }
struct i915_request * static struct i915_request *engine_execlist_find_hung_request(struct intel_engine_cs *engine)
intel_engine_execlist_find_hung_request(struct intel_engine_cs *engine)
{ {
struct i915_request *request, *active = NULL; struct i915_request *request, *active = NULL;
...@@ -2405,6 +2378,33 @@ intel_engine_execlist_find_hung_request(struct intel_engine_cs *engine) ...@@ -2405,6 +2378,33 @@ intel_engine_execlist_find_hung_request(struct intel_engine_cs *engine)
return active; return active;
} }
void intel_engine_get_hung_entity(struct intel_engine_cs *engine,
struct intel_context **ce, struct i915_request **rq)
{
unsigned long flags;
*ce = intel_engine_get_hung_context(engine);
if (*ce) {
intel_engine_clear_hung_context(engine);
*rq = intel_context_get_active_request(*ce);
return;
}
/*
* Getting here with GuC enabled means it is a forced error capture
* with no actual hang. So, no need to attempt the execlist search.
*/
if (intel_uc_uses_guc_submission(&engine->gt->uc))
return;
spin_lock_irqsave(&engine->sched_engine->lock, flags);
*rq = engine_execlist_find_hung_request(engine);
if (*rq)
*rq = i915_request_get_rcu(*rq);
spin_unlock_irqrestore(&engine->sched_engine->lock, flags);
}
void xehp_enable_ccs_engines(struct intel_engine_cs *engine) void xehp_enable_ccs_engines(struct intel_engine_cs *engine)
{ {
/* /*
......
...@@ -4150,6 +4150,33 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine, ...@@ -4150,6 +4150,33 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine,
spin_unlock_irqrestore(&sched_engine->lock, flags); spin_unlock_irqrestore(&sched_engine->lock, flags);
} }
static unsigned long list_count(struct list_head *list)
{
struct list_head *pos;
unsigned long count = 0;
list_for_each(pos, list)
count++;
return count;
}
void intel_execlists_dump_active_requests(struct intel_engine_cs *engine,
struct i915_request *hung_rq,
struct drm_printer *m)
{
unsigned long flags;
spin_lock_irqsave(&engine->sched_engine->lock, flags);
intel_engine_dump_active_requests(&engine->sched_engine->requests, hung_rq, m);
drm_printf(m, "\tOn hold?: %lu\n",
list_count(&engine->sched_engine->hold));
spin_unlock_irqrestore(&engine->sched_engine->lock, flags);
}
#if IS_ENABLED(CONFIG_DRM_I915_SELFTEST) #if IS_ENABLED(CONFIG_DRM_I915_SELFTEST)
#include "selftest_execlists.c" #include "selftest_execlists.c"
#endif #endif
...@@ -32,6 +32,10 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine, ...@@ -32,6 +32,10 @@ void intel_execlists_show_requests(struct intel_engine_cs *engine,
int indent), int indent),
unsigned int max); unsigned int max);
void intel_execlists_dump_active_requests(struct intel_engine_cs *engine,
struct i915_request *hung_rq,
struct drm_printer *m);
bool bool
intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine); intel_engine_in_execlists_submission_mode(const struct intel_engine_cs *engine);
......
...@@ -1596,36 +1596,16 @@ capture_engine(struct intel_engine_cs *engine, ...@@ -1596,36 +1596,16 @@ capture_engine(struct intel_engine_cs *engine,
{ {
struct intel_engine_capture_vma *capture = NULL; struct intel_engine_capture_vma *capture = NULL;
struct intel_engine_coredump *ee; struct intel_engine_coredump *ee;
struct intel_context *ce; struct intel_context *ce = NULL;
struct i915_request *rq = NULL; struct i915_request *rq = NULL;
unsigned long flags;
ee = intel_engine_coredump_alloc(engine, ALLOW_FAIL, dump_flags); ee = intel_engine_coredump_alloc(engine, ALLOW_FAIL, dump_flags);
if (!ee) if (!ee)
return NULL; return NULL;
ce = intel_engine_get_hung_context(engine); intel_engine_get_hung_entity(engine, &ce, &rq);
if (ce) {
intel_engine_clear_hung_context(engine);
rq = intel_context_get_active_request(ce);
if (!rq || !i915_request_started(rq)) if (!rq || !i915_request_started(rq))
goto no_request_capture; goto no_request_capture;
} else {
/*
* Getting here with GuC enabled means it is a forced error capture
* with no actual hang. So, no need to attempt the execlist search.
*/
if (!intel_uc_uses_guc_submission(&engine->gt->uc)) {
spin_lock_irqsave(&engine->sched_engine->lock, flags);
rq = intel_engine_execlist_find_hung_request(engine);
if (rq)
rq = i915_request_get_rcu(rq);
spin_unlock_irqrestore(&engine->sched_engine->lock,
flags);
}
}
if (!rq)
goto no_request_capture;
capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL); capture = intel_engine_coredump_add_request(ee, rq, ATOMIC_MAYFAIL);
if (!capture) if (!capture)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment