Commit f5718a72 authored by Alan Previn's avatar Alan Previn Committed by Lucas De Marchi

drm/i915/guc: Extract GuC error capture lists on G2H notification.

- Upon the G2H Notify-Err-Capture event, parse through the
  GuC Log Buffer (error-capture-subregion) and generate one or
  more capture-nodes. A single node represents a single "engine-
  instance-capture-dump" and contains at least 3 register lists:
  global, engine-class and engine-instance. An internal link
  list is maintained to store one or more nodes.
- Because the linked-list node generation happens before the call
  to i915_gpu_coredump, duplicate the global and engine-class register
  lists for each engine-instance register dump if we find
  dependent-engine resets in an engine-capture-group.
- When i915_gpu_coredump calls into capture_engine (in a
  subsequent patch), we detach the matching node (guc-id,
  LRCA, etc.) from the linked list above and attach it to
  i915_gpu_coredump's intel_engine_coredump structure when we have
  a matching LRCA/guc-id/engine-instance.

Additional notes to be aware of:
- GuC generates the error capture dump into the GuC log buffer, but
  this buffer is one big log buffer with 3 independent subregions
  within it. Each subregion is populated with different content
  and used in different ways and timings, but all regions
  behave as independent ring buffers. Each guc-log subregion
  (general-logs, crash-dump and error-capture) has its own
  guc_log_buffer_state that contains independent read and write
  pointers.
Signed-off-by: Alan Previn <alan.previn.teres.alexis@intel.com>
Reviewed-by: Umesh Nerlige Ramappa <umesh.nerlige.ramappa@intel.com>
Signed-off-by: Lucas De Marchi <lucas.demarchi@intel.com>
Link: https://patchwork.freedesktop.org/patch/msgid/20220321164527.2500062-11-alan.previn.teres.alexis@intel.com
parent d7c15d76
...@@ -172,4 +172,11 @@ enum intel_guc_sleep_state_status { ...@@ -172,4 +172,11 @@ enum intel_guc_sleep_state_status {
#define GUC_LOG_CONTROL_VERBOSITY_MASK (0xF << GUC_LOG_CONTROL_VERBOSITY_SHIFT) #define GUC_LOG_CONTROL_VERBOSITY_MASK (0xF << GUC_LOG_CONTROL_VERBOSITY_SHIFT)
#define GUC_LOG_CONTROL_DEFAULT_LOGGING (1 << 8) #define GUC_LOG_CONTROL_DEFAULT_LOGGING (1 << 8)
/*
 * Status values compared against the masked first payload dword of the
 * G2H error-capture notification.
 */
enum intel_guc_state_capture_event_status {
	/* Value 0. */
	INTEL_GUC_STATE_CAPTURE_EVENT_STATUS_SUCCESS = 0,
	/* Value 1; the driver warns "no space" when it sees this status. */
	INTEL_GUC_STATE_CAPTURE_EVENT_STATUS_NOSPACE = 1,
};

/* Only the low 8 bits of the payload dword carry the status value. */
#define INTEL_GUC_STATE_CAPTURE_EVENT_STATUS_MASK 0xFF
#endif /* _ABI_GUC_ACTIONS_ABI_H */ #endif /* _ABI_GUC_ACTIONS_ABI_H */
...@@ -12,6 +12,52 @@ ...@@ -12,6 +12,52 @@
struct intel_guc; struct intel_guc;
struct file; struct file;
/**
* struct __guc_capture_bufstate - cursor over the error-capture region
*
* Book-keeping structure used to track read and write pointers
* as we extract error capture data from the GuC-log-buffer's
* error-capture region as a stream of dwords.
*/
struct __guc_capture_bufstate {
/* NOTE(review): presumably the size of the region @data refers to; units are not visible here - confirm */
u32 size;
/* NOTE(review): presumably the start of the error-capture region being parsed - confirm */
void *data;
/* Read pointer. NOTE(review): byte vs. dword offset is not visible here - confirm */
u32 rd;
/* Write pointer. NOTE(review): byte vs. dword offset is not visible here - confirm */
u32 wr;
};
/**
* struct __guc_capture_parsed_output - extracted error capture node
*
* A single unit of extracted error-capture output data grouped together
* at an engine-instance level. We keep these nodes in a linked list.
* See outlist in struct intel_guc_state_capture.
*/
struct __guc_capture_parsed_output {
/*
* A single set of 3 capture lists: a global-list,
* an engine-class-list and an engine-instance list.
* outlist in struct intel_guc_state_capture keeps
* a linked list of these nodes that will eventually
* be detached from outlist and attached to
* i915_gpu_coredump in response to a context reset.
*/
struct list_head link;
/* NOTE(review): presumably set when the captured group was incomplete - confirm against the parser */
bool is_partial;
/* Engine class and engine instance this node was captured for */
u32 eng_class;
u32 eng_inst;
/* NOTE(review): guc_id and lrca are presumably the keys used to match a node to a reset context - confirm */
u32 guc_id;
u32 lrca;
/* One register list per capture list type; the array is sized by GUC_CAPTURE_LIST_TYPE_MAX */
struct gcap_reg_list_info {
u32 vfid;
/* NOTE(review): presumably the number of entries in @regs - confirm */
u32 num_regs;
struct guc_mmio_reg *regs;
} reginfo[GUC_CAPTURE_LIST_TYPE_MAX];
/*
* Despite the "INDEX" in their names, these expand to BIT() masks
* (one bit per capture list type), not to array indices into reginfo.
*/
#define GCAP_PARSED_REGLIST_INDEX_GLOBAL BIT(GUC_CAPTURE_LIST_TYPE_GLOBAL)
#define GCAP_PARSED_REGLIST_INDEX_ENGCLASS BIT(GUC_CAPTURE_LIST_TYPE_ENGINE_CLASS)
#define GCAP_PARSED_REGLIST_INDEX_ENGINST BIT(GUC_CAPTURE_LIST_TYPE_ENGINE_INSTANCE)
};
/** /**
* struct guc_debug_capture_list_header / struct guc_debug_capture_list * struct guc_debug_capture_list_header / struct guc_debug_capture_list
* *
...@@ -142,6 +188,16 @@ struct intel_guc_state_capture { ...@@ -142,6 +188,16 @@ struct intel_guc_state_capture {
[GUC_CAPTURE_LIST_TYPE_MAX] [GUC_CAPTURE_LIST_TYPE_MAX]
[GUC_MAX_ENGINE_CLASSES]; [GUC_MAX_ENGINE_CLASSES];
void *ads_null_cache; void *ads_null_cache;
/**
* @outlist: allocated nodes with parsed engine-instance error capture data
*
* A linked list of parsed GuC error-capture output data, held until it is
* formatted and reported via i915_gpu_coredump. Each node in this linked list
* shall contain a single engine-capture including global, engine-class and
* engine-instance register dumps as per struct __guc_capture_parsed_output.
*/
struct list_head outlist;
}; };
#endif /* _INTEL_GUC_CAPTURE_FWIF_H */ #endif /* _INTEL_GUC_CAPTURE_FWIF_H */
...@@ -11,6 +11,7 @@ ...@@ -11,6 +11,7 @@
struct guc_gt_system_info; struct guc_gt_system_info;
struct intel_guc; struct intel_guc;
void intel_guc_capture_process(struct intel_guc *guc);
int intel_guc_capture_output_min_size_est(struct intel_guc *guc); int intel_guc_capture_output_min_size_est(struct intel_guc *guc);
int intel_guc_capture_getlist(struct intel_guc *guc, u32 owner, u32 type, u32 classid, int intel_guc_capture_getlist(struct intel_guc *guc, u32 owner, u32 type, u32 classid,
void **outptr); void **outptr);
......
...@@ -158,7 +158,7 @@ static void *guc_get_write_buffer(struct intel_guc_log *log) ...@@ -158,7 +158,7 @@ static void *guc_get_write_buffer(struct intel_guc_log *log)
return relay_reserve(log->relay.channel, 0); return relay_reserve(log->relay.channel, 0);
} }
static bool guc_check_log_buf_overflow(struct intel_guc_log *log, bool intel_guc_check_log_buf_overflow(struct intel_guc_log *log,
enum guc_log_buffer_type type, enum guc_log_buffer_type type,
unsigned int full_cnt) unsigned int full_cnt)
{ {
...@@ -183,7 +183,7 @@ static bool guc_check_log_buf_overflow(struct intel_guc_log *log, ...@@ -183,7 +183,7 @@ static bool guc_check_log_buf_overflow(struct intel_guc_log *log,
return overflow; return overflow;
} }
static unsigned int guc_get_log_buffer_size(enum guc_log_buffer_type type) unsigned int intel_guc_get_log_buffer_size(enum guc_log_buffer_type type)
{ {
switch (type) { switch (type) {
case GUC_DEBUG_LOG_BUFFER: case GUC_DEBUG_LOG_BUFFER:
...@@ -199,6 +199,20 @@ static unsigned int guc_get_log_buffer_size(enum guc_log_buffer_type type) ...@@ -199,6 +199,20 @@ static unsigned int guc_get_log_buffer_size(enum guc_log_buffer_type type)
return 0; return 0;
} }
size_t intel_guc_get_log_buffer_offset(enum guc_log_buffer_type type)
{
enum guc_log_buffer_type i;
size_t offset = PAGE_SIZE;/* for the log_buffer_states */
for (i = GUC_DEBUG_LOG_BUFFER; i < GUC_MAX_LOG_BUFFER; ++i) {
if (i == type)
break;
offset += intel_guc_get_log_buffer_size(i);
}
return offset;
}
static void _guc_log_copy_debuglogs_for_relay(struct intel_guc_log *log) static void _guc_log_copy_debuglogs_for_relay(struct intel_guc_log *log)
{ {
unsigned int buffer_size, read_offset, write_offset, bytes_to_copy, full_cnt; unsigned int buffer_size, read_offset, write_offset, bytes_to_copy, full_cnt;
...@@ -244,14 +258,14 @@ static void _guc_log_copy_debuglogs_for_relay(struct intel_guc_log *log) ...@@ -244,14 +258,14 @@ static void _guc_log_copy_debuglogs_for_relay(struct intel_guc_log *log)
*/ */
memcpy(&log_buf_state_local, log_buf_state, memcpy(&log_buf_state_local, log_buf_state,
sizeof(struct guc_log_buffer_state)); sizeof(struct guc_log_buffer_state));
buffer_size = guc_get_log_buffer_size(type); buffer_size = intel_guc_get_log_buffer_size(type);
read_offset = log_buf_state_local.read_ptr; read_offset = log_buf_state_local.read_ptr;
write_offset = log_buf_state_local.sampled_write_ptr; write_offset = log_buf_state_local.sampled_write_ptr;
full_cnt = log_buf_state_local.buffer_full_cnt; full_cnt = log_buf_state_local.buffer_full_cnt;
/* Bookkeeping stuff */ /* Bookkeeping stuff */
log->stats[type].flush += log_buf_state_local.flush_to_file; log->stats[type].flush += log_buf_state_local.flush_to_file;
new_overflow = guc_check_log_buf_overflow(log, type, full_cnt); new_overflow = intel_guc_check_log_buf_overflow(log, type, full_cnt);
/* Update the state of shared log buffer */ /* Update the state of shared log buffer */
log_buf_state->read_ptr = write_offset; log_buf_state->read_ptr = write_offset;
......
...@@ -67,6 +67,10 @@ struct intel_guc_log { ...@@ -67,6 +67,10 @@ struct intel_guc_log {
}; };
void intel_guc_log_init_early(struct intel_guc_log *log); void intel_guc_log_init_early(struct intel_guc_log *log);
bool intel_guc_check_log_buf_overflow(struct intel_guc_log *log, enum guc_log_buffer_type type,
unsigned int full_cnt);
unsigned int intel_guc_get_log_buffer_size(enum guc_log_buffer_type type);
size_t intel_guc_get_log_buffer_offset(enum guc_log_buffer_type type);
int intel_guc_log_create(struct intel_guc_log *log); int intel_guc_log_create(struct intel_guc_log *log);
void intel_guc_log_destroy(struct intel_guc_log *log); void intel_guc_log_destroy(struct intel_guc_log *log);
......
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#include "gt/intel_ring.h" #include "gt/intel_ring.h"
#include "intel_guc_ads.h" #include "intel_guc_ads.h"
#include "intel_guc_capture.h"
#include "intel_guc_submission.h" #include "intel_guc_submission.h"
#include "i915_drv.h" #include "i915_drv.h"
...@@ -4095,17 +4096,18 @@ int intel_guc_context_reset_process_msg(struct intel_guc *guc, ...@@ -4095,17 +4096,18 @@ int intel_guc_context_reset_process_msg(struct intel_guc *guc,
int intel_guc_error_capture_process_msg(struct intel_guc *guc, int intel_guc_error_capture_process_msg(struct intel_guc *guc,
const u32 *msg, u32 len) const u32 *msg, u32 len)
{ {
int status; u32 status;
if (unlikely(len != 1)) { if (unlikely(len != 1)) {
drm_dbg(&guc_to_gt(guc)->i915->drm, "Invalid length %u", len); drm_dbg(&guc_to_gt(guc)->i915->drm, "Invalid length %u", len);
return -EPROTO; return -EPROTO;
} }
status = msg[0]; status = msg[0] & INTEL_GUC_STATE_CAPTURE_EVENT_STATUS_MASK;
drm_info(&guc_to_gt(guc)->i915->drm, "Got error capture: status = %d", status); if (status == INTEL_GUC_STATE_CAPTURE_EVENT_STATUS_NOSPACE)
drm_warn(&guc_to_gt(guc)->i915->drm, "G2H-Error capture no space");
/* FIXME: Do something with the capture */ intel_guc_capture_process(guc);
return 0; return 0;
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment