Commit 5dbd7b4d authored by Ofir Bitton, committed by Oded Gabbay

habanalabs: improve communication protocol with cpucp

Current messaging communication protocol with cpucp can get out
of sync due to coherency issues. In order to improve the protocol
reliability, we modify the protocol to expect a different
acknowledgment for every packet sent to cpucp.
Signed-off-by: Ofir Bitton <obitton@habana.ai>
Reviewed-by: Oded Gabbay <ogabbay@kernel.org>
Signed-off-by: Oded Gabbay <ogabbay@kernel.org>
parent 6c1e3f92
...@@ -90,9 +90,10 @@ int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode) ...@@ -90,9 +90,10 @@ int hl_fw_send_pci_access_msg(struct hl_device *hdev, u32 opcode)
int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg, int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
u16 len, u32 timeout, u64 *result) u16 len, u32 timeout, u64 *result)
{ {
struct hl_hw_queue *queue = &hdev->kernel_queues[hw_queue_id];
struct cpucp_packet *pkt; struct cpucp_packet *pkt;
dma_addr_t pkt_dma_addr; dma_addr_t pkt_dma_addr;
u32 tmp; u32 tmp, expected_ack_val;
int rc = 0; int rc = 0;
pkt = hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, len, pkt = hdev->asic_funcs->cpu_accessible_dma_pool_alloc(hdev, len,
...@@ -115,14 +116,22 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg, ...@@ -115,14 +116,22 @@ int hl_fw_send_cpu_message(struct hl_device *hdev, u32 hw_queue_id, u32 *msg,
goto out; goto out;
} }
/* set fence to a non valid value */
pkt->fence = UINT_MAX;
rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, len, pkt_dma_addr); rc = hl_hw_queue_send_cb_no_cmpl(hdev, hw_queue_id, len, pkt_dma_addr);
if (rc) { if (rc) {
dev_err(hdev->dev, "Failed to send CB on CPU PQ (%d)\n", rc); dev_err(hdev->dev, "Failed to send CB on CPU PQ (%d)\n", rc);
goto out; goto out;
} }
if (hdev->asic_prop.fw_cpucp_ack_with_pi)
expected_ack_val = queue->pi;
else
expected_ack_val = CPUCP_PACKET_FENCE_VAL;
rc = hl_poll_timeout_memory(hdev, &pkt->fence, tmp, rc = hl_poll_timeout_memory(hdev, &pkt->fence, tmp,
(tmp == CPUCP_PACKET_FENCE_VAL), 1000, (tmp == expected_ack_val), 1000,
timeout, true); timeout, true);
hl_hw_queue_inc_ci_kernel(hdev, hw_queue_id); hl_hw_queue_inc_ci_kernel(hdev, hw_queue_id);
...@@ -777,6 +786,10 @@ int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg, ...@@ -777,6 +786,10 @@ int hl_fw_init_cpu(struct hl_device *hdev, u32 cpu_boot_status_reg,
CPU_BOOT_DEV_STS0_FW_HARD_RST_EN) CPU_BOOT_DEV_STS0_FW_HARD_RST_EN)
prop->hard_reset_done_by_fw = true; prop->hard_reset_done_by_fw = true;
if (prop->fw_boot_cpu_security_map &
CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN)
prop->fw_cpucp_ack_with_pi = true;
dev_dbg(hdev->dev, dev_dbg(hdev->dev,
"Firmware boot CPU security status %#x\n", "Firmware boot CPU security status %#x\n",
prop->fw_boot_cpu_security_map); prop->fw_boot_cpu_security_map);
......
...@@ -419,6 +419,8 @@ struct hl_mmu_properties { ...@@ -419,6 +419,8 @@ struct hl_mmu_properties {
* from BOOT_DEV_STS0 * from BOOT_DEV_STS0
* @dram_supports_virtual_memory: is there an MMU towards the DRAM * @dram_supports_virtual_memory: is there an MMU towards the DRAM
* @hard_reset_done_by_fw: true if firmware is handling hard reset flow * @hard_reset_done_by_fw: true if firmware is handling hard reset flow
* @fw_cpucp_ack_with_pi: true if cpucp is acking messages with the PQ PI
* instead of a magic number
* @num_functional_hbms: number of functional HBMs in each DCORE. * @num_functional_hbms: number of functional HBMs in each DCORE.
*/ */
struct asic_fixed_properties { struct asic_fixed_properties {
...@@ -479,6 +481,7 @@ struct asic_fixed_properties { ...@@ -479,6 +481,7 @@ struct asic_fixed_properties {
u8 fw_security_status_valid; u8 fw_security_status_valid;
u8 dram_supports_virtual_memory; u8 dram_supports_virtual_memory;
u8 hard_reset_done_by_fw; u8 hard_reset_done_by_fw;
u8 fw_cpucp_ack_with_pi;
u8 num_functional_hbms; u8 num_functional_hbms;
}; };
......
...@@ -533,6 +533,7 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev) ...@@ -533,6 +533,7 @@ static int gaudi_get_fixed_properties(struct hl_device *hdev)
prop->fw_security_disabled = true; prop->fw_security_disabled = true;
prop->fw_security_status_valid = false; prop->fw_security_status_valid = false;
prop->hard_reset_done_by_fw = false; prop->hard_reset_done_by_fw = false;
prop->fw_cpucp_ack_with_pi = false;
return 0; return 0;
} }
...@@ -4438,9 +4439,12 @@ static void gaudi_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi) ...@@ -4438,9 +4439,12 @@ static void gaudi_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi)
/* ring the doorbell */ /* ring the doorbell */
WREG32(db_reg_offset, db_value); WREG32(db_reg_offset, db_value);
if (hw_queue_id == GAUDI_QUEUE_ID_CPU_PQ) if (hw_queue_id == GAUDI_QUEUE_ID_CPU_PQ) {
/* make sure device CPU will read latest data from host */
mb();
WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR, WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR,
GAUDI_EVENT_PI_UPDATE); GAUDI_EVENT_PI_UPDATE);
}
} }
static void gaudi_pqe_write(struct hl_device *hdev, __le64 *pqe, static void gaudi_pqe_write(struct hl_device *hdev, __le64 *pqe,
......
...@@ -461,6 +461,7 @@ int goya_get_fixed_properties(struct hl_device *hdev) ...@@ -461,6 +461,7 @@ int goya_get_fixed_properties(struct hl_device *hdev)
prop->fw_security_disabled = true; prop->fw_security_disabled = true;
prop->fw_security_status_valid = false; prop->fw_security_status_valid = false;
prop->hard_reset_done_by_fw = false; prop->hard_reset_done_by_fw = false;
prop->fw_cpucp_ack_with_pi = false;
return 0; return 0;
} }
...@@ -2806,9 +2807,12 @@ void goya_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi) ...@@ -2806,9 +2807,12 @@ void goya_ring_doorbell(struct hl_device *hdev, u32 hw_queue_id, u32 pi)
/* ring the doorbell */ /* ring the doorbell */
WREG32(db_reg_offset, db_value); WREG32(db_reg_offset, db_value);
if (hw_queue_id == GOYA_QUEUE_ID_CPU_PQ) if (hw_queue_id == GOYA_QUEUE_ID_CPU_PQ) {
/* make sure device CPU will read latest data from host */
mb();
WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR, WREG32(mmGIC_DISTRIBUTOR__5_GICD_SETSPI_NSR,
GOYA_ASYNC_EVENT_ID_PI_UPDATE); GOYA_ASYNC_EVENT_ID_PI_UPDATE);
}
} }
void goya_pqe_write(struct hl_device *hdev, __le64 *pqe, struct hl_bd *bd) void goya_pqe_write(struct hl_device *hdev, __le64 *pqe, struct hl_bd *bd)
......
...@@ -166,6 +166,10 @@ ...@@ -166,6 +166,10 @@
* FW handles HBM ECC indications. * FW handles HBM ECC indications.
* Initialized in: linux * Initialized in: linux
* *
* CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN Packets ack value used in the armcpd
* is set to the PI counter.
* Initialized in: linux
*
* CPU_BOOT_DEV_STS0_ENABLED Device status register enabled. * CPU_BOOT_DEV_STS0_ENABLED Device status register enabled.
* This is a main indication that the * This is a main indication that the
* running FW populates the device status * running FW populates the device status
...@@ -190,6 +194,7 @@ ...@@ -190,6 +194,7 @@
#define CPU_BOOT_DEV_STS0_SP_SRAM_EN (1 << 12) #define CPU_BOOT_DEV_STS0_SP_SRAM_EN (1 << 12)
#define CPU_BOOT_DEV_STS0_CLK_GATE_EN (1 << 13) #define CPU_BOOT_DEV_STS0_CLK_GATE_EN (1 << 13)
#define CPU_BOOT_DEV_STS0_HBM_ECC_EN (1 << 14) #define CPU_BOOT_DEV_STS0_HBM_ECC_EN (1 << 14)
#define CPU_BOOT_DEV_STS0_PKT_PI_ACK_EN (1 << 15)
#define CPU_BOOT_DEV_STS0_ENABLED (1 << 31) #define CPU_BOOT_DEV_STS0_ENABLED (1 << 31)
enum cpu_boot_status { enum cpu_boot_status {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment