Commit ae303d88 authored by Tomer Tayar, committed by Oded Gabbay

accel/habanalabs/gaudi2: get the correct QM CQ info upon an error

Upon a QM error, the address/size from both the CQ and the ARC_CQ are
printed, although the instruction that led to the error was received
from only one of them.

Moreover, in case of a QM undefined opcode, only one of these
address/size sets is captured, chosen based on the value of ARC_CQ_PTR.
However, this value can be non-zero even when the CQ is the one currently
in use, if the CQ and ARC_CQ are used alternately.

Assuming a stop-on-error configuration, use the CP_STS.CUR_CQ field to
determine which CQ is relevant to the QM error.

Signed-off-by: Tomer Tayar <ttayar@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
parent 4b0b1fbc
...@@ -7860,36 +7860,36 @@ static bool gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type, ...@@ -7860,36 +7860,36 @@ static bool gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type,
static void handle_lower_qman_data_on_err(struct hl_device *hdev, u64 qman_base, u64 event_mask) static void handle_lower_qman_data_on_err(struct hl_device *hdev, u64 qman_base, u64 event_mask)
{ {
u32 lo, hi, cq_ptr_size, arc_cq_ptr_size; u32 lo, hi, cq_ptr_size, cp_sts;
u64 cq_ptr, arc_cq_ptr, cp_current_inst; u64 cq_ptr, cp_current_inst;
bool is_arc_cq;
lo = RREG32(qman_base + QM_CQ_PTR_LO_4_OFFSET);
hi = RREG32(qman_base + QM_CQ_PTR_HI_4_OFFSET); cp_sts = RREG32(qman_base + QM_CP_STS_4_OFFSET);
cq_ptr = ((u64) hi) << 32 | lo; is_arc_cq = FIELD_GET(PDMA0_QM_CP_STS_CUR_CQ_MASK, cp_sts); /* 0 - legacy CQ, 1 - ARC_CQ */
cq_ptr_size = RREG32(qman_base + QM_CQ_TSIZE_4_OFFSET);
if (is_arc_cq) {
lo = RREG32(qman_base + QM_ARC_CQ_PTR_LO_OFFSET); lo = RREG32(qman_base + QM_ARC_CQ_PTR_LO_OFFSET);
hi = RREG32(qman_base + QM_ARC_CQ_PTR_HI_OFFSET); hi = RREG32(qman_base + QM_ARC_CQ_PTR_HI_OFFSET);
arc_cq_ptr = ((u64) hi) << 32 | lo; cq_ptr = ((u64) hi) << 32 | lo;
arc_cq_ptr_size = RREG32(qman_base + QM_ARC_CQ_TSIZE_OFFSET); cq_ptr_size = RREG32(qman_base + QM_ARC_CQ_TSIZE_OFFSET);
} else {
lo = RREG32(qman_base + QM_CQ_PTR_LO_4_OFFSET);
hi = RREG32(qman_base + QM_CQ_PTR_HI_4_OFFSET);
cq_ptr = ((u64) hi) << 32 | lo;
cq_ptr_size = RREG32(qman_base + QM_CQ_TSIZE_4_OFFSET);
}
lo = RREG32(qman_base + QM_CP_CURRENT_INST_LO_4_OFFSET); lo = RREG32(qman_base + QM_CP_CURRENT_INST_LO_4_OFFSET);
hi = RREG32(qman_base + QM_CP_CURRENT_INST_HI_4_OFFSET); hi = RREG32(qman_base + QM_CP_CURRENT_INST_HI_4_OFFSET);
cp_current_inst = ((u64) hi) << 32 | lo; cp_current_inst = ((u64) hi) << 32 | lo;
dev_info(hdev->dev, dev_info(hdev->dev,
"LowerQM. CQ: {ptr %#llx, size %u}, ARC_CQ: {ptr %#llx, size %u}, CP: {instruction %#llx}\n", "LowerQM. %sCQ: {ptr %#llx, size %u}, CP: {instruction %#llx}\n",
cq_ptr, cq_ptr_size, arc_cq_ptr, arc_cq_ptr_size, cp_current_inst); is_arc_cq ? "ARC_" : "", cq_ptr, cq_ptr_size, cp_current_inst);
if (event_mask & HL_NOTIFIER_EVENT_UNDEFINED_OPCODE) { if (event_mask & HL_NOTIFIER_EVENT_UNDEFINED_OPCODE) {
if (arc_cq_ptr) { hdev->captured_err_info.undef_opcode.cq_addr = cq_ptr;
hdev->captured_err_info.undef_opcode.cq_addr = arc_cq_ptr; hdev->captured_err_info.undef_opcode.cq_size = cq_ptr_size;
hdev->captured_err_info.undef_opcode.cq_size = arc_cq_ptr_size;
} else {
hdev->captured_err_info.undef_opcode.cq_addr = cq_ptr;
hdev->captured_err_info.undef_opcode.cq_size = cq_ptr_size;
}
hdev->captured_err_info.undef_opcode.stream_id = QMAN_STREAMS; hdev->captured_err_info.undef_opcode.stream_id = QMAN_STREAMS;
} }
} }
......
...@@ -250,6 +250,7 @@ ...@@ -250,6 +250,7 @@
#define QM_ARC_CQ_PTR_HI_OFFSET (mmPDMA0_QM_ARC_CQ_PTR_HI - mmPDMA0_QM_BASE) #define QM_ARC_CQ_PTR_HI_OFFSET (mmPDMA0_QM_ARC_CQ_PTR_HI - mmPDMA0_QM_BASE)
#define QM_ARC_CQ_TSIZE_OFFSET (mmPDMA0_QM_ARC_CQ_TSIZE - mmPDMA0_QM_BASE) #define QM_ARC_CQ_TSIZE_OFFSET (mmPDMA0_QM_ARC_CQ_TSIZE - mmPDMA0_QM_BASE)
#define QM_CP_STS_4_OFFSET (mmPDMA0_QM_CP_STS_4 - mmPDMA0_QM_BASE)
#define QM_CP_CURRENT_INST_LO_4_OFFSET (mmPDMA0_QM_CP_CURRENT_INST_LO_4 - mmPDMA0_QM_BASE) #define QM_CP_CURRENT_INST_LO_4_OFFSET (mmPDMA0_QM_CP_CURRENT_INST_LO_4 - mmPDMA0_QM_BASE)
#define QM_CP_CURRENT_INST_HI_4_OFFSET (mmPDMA0_QM_CP_CURRENT_INST_HI_4 - mmPDMA0_QM_BASE) #define QM_CP_CURRENT_INST_HI_4_OFFSET (mmPDMA0_QM_CP_CURRENT_INST_HI_4 - mmPDMA0_QM_BASE)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment