Commit 92242716 authored by Dave Airlie

Merge tag 'drm-habanalabs-next-2023-12-19' of...

Merge tag 'drm-habanalabs-next-2023-12-19' of https://git.kernel.org/pub/scm/linux/kernel/git/ogabbay/linux into drm-next

This tag contains habanalabs driver changes for v6.8.

The notable changes are:

- uAPI changes:
  - Add sysfs entry to allow users to identify a device minor id with its
    debugfs path
  - Add sysfs entry to expose the device's module id as given to us from
    the f/w
  - Add signed device information retrieval through the INFO ioctl

- New features and improvements:
  - Update documentation of debugfs paths
  - Add support for Gaudi2C device (new PCI revision number)
  - Add pcie reset prepare/done hooks

- Firmware related fixes and changes:
  - Print the version numbers of all three Infineon second stage instances
  - Assume hard-reset is done by f/w upon PCIe AXI drain

- Bug fixes and code cleanups:
  - Fix information leak in sec_attest_info()
  - Avoid overriding existing undefined opcode data in Gaudi2
  - Multiple Queue Manager (QMAN) fixes for Gaudi2
  - Set hard reset flag if graceful reset is skipped
  - Remove 'get temperature' debug print
  - Fix the new Event Queue heartbeat mechanism
Signed-off-by: Dave Airlie <airlied@redhat.com>

From: Oded Gabbay <ogabbay@kernel.org>
Link: https://patchwork.freedesktop.org/patch/msgid/ZYFpihZscr/fsRRd@ogabbay-vm-u22.habana-labs.com
parents dc83fb6e a9f07790
......@@ -149,6 +149,18 @@ Contact: ogabbay@kernel.org
Description: Displays the current clock frequency, in Hz, of the MME compute
engine. This property is valid only for the Goya ASIC family
What: /sys/class/accel/accel<n>/device/module_id
Date: Nov 2023
KernelVersion: 6.8
Contact: ogabbay@kernel.org
Description: Displays the device's module id
What: /sys/class/accel/accel<n>/device/parent_device
Date: Nov 2023
KernelVersion: 6.8
Contact: ttayar@habana.ai
Description: Displays the name of the parent device of the accel device
What: /sys/class/accel/accel<n>/device/pci_addr
Date: Jan 2019
KernelVersion: 5.1
......
......@@ -853,6 +853,9 @@ static int device_early_init(struct hl_device *hdev)
gaudi2_set_asic_funcs(hdev);
strscpy(hdev->asic_name, "GAUDI2B", sizeof(hdev->asic_name));
break;
case ASIC_GAUDI2C:
gaudi2_set_asic_funcs(hdev);
strscpy(hdev->asic_name, "GAUDI2C", sizeof(hdev->asic_name));
break;
default:
dev_err(hdev->dev, "Unrecognized ASIC type %d\n",
......@@ -1041,18 +1044,21 @@ static bool is_pci_link_healthy(struct hl_device *hdev)
return (vendor_id == PCI_VENDOR_ID_HABANALABS);
}
static void hl_device_eq_heartbeat(struct hl_device *hdev)
static int hl_device_eq_heartbeat_check(struct hl_device *hdev)
{
u64 event_mask = HL_NOTIFIER_EVENT_DEVICE_RESET | HL_NOTIFIER_EVENT_DEVICE_UNAVAILABLE;
struct asic_fixed_properties *prop = &hdev->asic_prop;
if (!prop->cpucp_info.eq_health_check_supported)
return;
return 0;
if (hdev->eq_heartbeat_received)
if (hdev->eq_heartbeat_received) {
hdev->eq_heartbeat_received = false;
else
hl_device_cond_reset(hdev, HL_DRV_RESET_HARD, event_mask);
} else {
dev_err(hdev->dev, "EQ heartbeat event was not received!\n");
return -EIO;
}
return 0;
}
static void hl_device_heartbeat(struct work_struct *work)
......@@ -1069,10 +1075,9 @@ static void hl_device_heartbeat(struct work_struct *work)
/*
* For EQ health check need to check if driver received the heartbeat eq event
* in order to validate the eq is working.
* Only if both the EQ is healthy and we managed to send the next heartbeat reschedule.
*/
hl_device_eq_heartbeat(hdev);
if (!hdev->asic_funcs->send_heartbeat(hdev))
if ((!hl_device_eq_heartbeat_check(hdev)) && (!hdev->asic_funcs->send_heartbeat(hdev)))
goto reschedule;
if (hl_device_operational(hdev, NULL))
......@@ -2035,7 +2040,7 @@ int hl_device_cond_reset(struct hl_device *hdev, u32 flags, u64 event_mask)
if (ctx)
hl_ctx_put(ctx);
return hl_device_reset(hdev, flags);
return hl_device_reset(hdev, flags | HL_DRV_RESET_HARD);
}
static void hl_notifier_event_send(struct hl_notifier_event *notifier_event, u64 event_mask)
......
......@@ -646,39 +646,27 @@ int hl_fw_send_heartbeat(struct hl_device *hdev)
return rc;
}
static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val,
u32 sts_val)
static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val, u32 sts_val)
{
bool err_exists = false;
if (!(err_val & CPU_BOOT_ERR0_ENABLED))
return false;
if (err_val & CPU_BOOT_ERR0_DRAM_INIT_FAIL) {
dev_err(hdev->dev,
"Device boot error - DRAM initialization failed\n");
err_exists = true;
}
if (err_val & CPU_BOOT_ERR0_DRAM_INIT_FAIL)
dev_err(hdev->dev, "Device boot error - DRAM initialization failed\n");
if (err_val & CPU_BOOT_ERR0_FIT_CORRUPTED) {
if (err_val & CPU_BOOT_ERR0_FIT_CORRUPTED)
dev_err(hdev->dev, "Device boot error - FIT image corrupted\n");
err_exists = true;
}
if (err_val & CPU_BOOT_ERR0_TS_INIT_FAIL) {
dev_err(hdev->dev,
"Device boot error - Thermal Sensor initialization failed\n");
err_exists = true;
}
if (err_val & CPU_BOOT_ERR0_TS_INIT_FAIL)
dev_err(hdev->dev, "Device boot error - Thermal Sensor initialization failed\n");
if (err_val & CPU_BOOT_ERR0_BMC_WAIT_SKIPPED) {
if (hdev->bmc_enable) {
dev_err(hdev->dev,
"Device boot error - Skipped waiting for BMC\n");
err_exists = true;
dev_err(hdev->dev, "Device boot error - Skipped waiting for BMC\n");
} else {
dev_info(hdev->dev,
"Device boot message - Skipped waiting for BMC\n");
dev_info(hdev->dev, "Device boot message - Skipped waiting for BMC\n");
/* This is an info so we don't want it to disable the
* device
*/
......@@ -686,48 +674,29 @@ static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val,
}
}
if (err_val & CPU_BOOT_ERR0_NIC_DATA_NOT_RDY) {
dev_err(hdev->dev,
"Device boot error - Serdes data from BMC not available\n");
err_exists = true;
}
if (err_val & CPU_BOOT_ERR0_NIC_DATA_NOT_RDY)
dev_err(hdev->dev, "Device boot error - Serdes data from BMC not available\n");
if (err_val & CPU_BOOT_ERR0_NIC_FW_FAIL) {
dev_err(hdev->dev,
"Device boot error - NIC F/W initialization failed\n");
err_exists = true;
}
if (err_val & CPU_BOOT_ERR0_NIC_FW_FAIL)
dev_err(hdev->dev, "Device boot error - NIC F/W initialization failed\n");
if (err_val & CPU_BOOT_ERR0_SECURITY_NOT_RDY) {
dev_err(hdev->dev,
"Device boot warning - security not ready\n");
err_exists = true;
}
if (err_val & CPU_BOOT_ERR0_SECURITY_NOT_RDY)
dev_err(hdev->dev, "Device boot warning - security not ready\n");
if (err_val & CPU_BOOT_ERR0_SECURITY_FAIL) {
if (err_val & CPU_BOOT_ERR0_SECURITY_FAIL)
dev_err(hdev->dev, "Device boot error - security failure\n");
err_exists = true;
}
if (err_val & CPU_BOOT_ERR0_EFUSE_FAIL) {
if (err_val & CPU_BOOT_ERR0_EFUSE_FAIL)
dev_err(hdev->dev, "Device boot error - eFuse failure\n");
err_exists = true;
}
if (err_val & CPU_BOOT_ERR0_SEC_IMG_VER_FAIL) {
if (err_val & CPU_BOOT_ERR0_SEC_IMG_VER_FAIL)
dev_err(hdev->dev, "Device boot error - Failed to load preboot secondary image\n");
err_exists = true;
}
if (err_val & CPU_BOOT_ERR0_PLL_FAIL) {
if (err_val & CPU_BOOT_ERR0_PLL_FAIL)
dev_err(hdev->dev, "Device boot error - PLL failure\n");
err_exists = true;
}
if (err_val & CPU_BOOT_ERR0_TMP_THRESH_INIT_FAIL) {
if (err_val & CPU_BOOT_ERR0_TMP_THRESH_INIT_FAIL)
dev_err(hdev->dev, "Device boot error - Failed to set threshold for temperature sensor\n");
err_exists = true;
}
if (err_val & CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL) {
/* Ignore this bit, don't prevent driver loading */
......@@ -735,52 +704,32 @@ static bool fw_report_boot_dev0(struct hl_device *hdev, u32 err_val,
err_val &= ~CPU_BOOT_ERR0_DEVICE_UNUSABLE_FAIL;
}
if (err_val & CPU_BOOT_ERR0_BINNING_FAIL) {
if (err_val & CPU_BOOT_ERR0_BINNING_FAIL)
dev_err(hdev->dev, "Device boot error - binning failure\n");
err_exists = true;
}
if (sts_val & CPU_BOOT_DEV_STS0_ENABLED)
dev_dbg(hdev->dev, "Device status0 %#x\n", sts_val);
if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED)
dev_err(hdev->dev, "Device boot warning - Skipped DRAM initialization\n");
if (err_val & CPU_BOOT_ERR_ENG_ARC_MEM_SCRUB_FAIL)
dev_err(hdev->dev, "Device boot error - ARC memory scrub failed\n");
/* All warnings should go here in order not to reach the unknown error validation */
if (err_val & CPU_BOOT_ERR0_EEPROM_FAIL) {
dev_err(hdev->dev, "Device boot error - EEPROM failure detected\n");
err_exists = true;
}
/* All warnings should go here in order not to reach the unknown error validation */
if (err_val & CPU_BOOT_ERR0_DRAM_SKIPPED) {
dev_warn(hdev->dev,
"Device boot warning - Skipped DRAM initialization\n");
/* This is a warning so we don't want it to disable the
* device
*/
err_val &= ~CPU_BOOT_ERR0_DRAM_SKIPPED;
}
if (err_val & CPU_BOOT_ERR0_PRI_IMG_VER_FAIL)
dev_warn(hdev->dev, "Device boot warning - Failed to load preboot primary image\n");
if (err_val & CPU_BOOT_ERR0_PRI_IMG_VER_FAIL) {
dev_warn(hdev->dev,
"Device boot warning - Failed to load preboot primary image\n");
/* This is a warning so we don't want it to disable the
* device as we have a secondary preboot image
*/
err_val &= ~CPU_BOOT_ERR0_PRI_IMG_VER_FAIL;
}
if (err_val & CPU_BOOT_ERR0_TPM_FAIL) {
dev_warn(hdev->dev,
"Device boot warning - TPM failure\n");
/* This is a warning so we don't want it to disable the
* device
*/
err_val &= ~CPU_BOOT_ERR0_TPM_FAIL;
}
if (err_val & CPU_BOOT_ERR0_TPM_FAIL)
dev_warn(hdev->dev, "Device boot warning - TPM failure\n");
if (!err_exists && (err_val & ~CPU_BOOT_ERR0_ENABLED)) {
dev_err(hdev->dev,
"Device boot error - unknown ERR0 error 0x%08x\n", err_val);
if (err_val & CPU_BOOT_ERR_FATAL_MASK)
err_exists = true;
}
/* return error only if it's in the predefined mask */
if (err_exists && ((err_val & ~CPU_BOOT_ERR0_ENABLED) &
......@@ -3295,6 +3244,14 @@ int hl_fw_get_sec_attest_info(struct hl_device *hdev, struct cpucp_sec_attest_in
HL_CPUCP_SEC_ATTEST_INFO_TINEOUT_USEC);
}
/*
 * hl_fw_get_dev_info_signed() - ask the f/w for the signed device information.
 * @hdev: habanalabs device structure.
 * @dev_info_signed: caller-provided structure handed to hl_fw_get_sec_attest_data()
 *                   together with its size.
 * @nonce: value forwarded with the CPUCP_PACKET_INFO_SIGNED_GET request.
 *
 * Return: the value returned by hl_fw_get_sec_attest_data().
 */
int hl_fw_get_dev_info_signed(struct hl_device *hdev,
				struct cpucp_dev_info_signed *dev_info_signed, u32 nonce)
{
	int rc;

	rc = hl_fw_get_sec_attest_data(hdev, CPUCP_PACKET_INFO_SIGNED_GET, dev_info_signed,
					sizeof(*dev_info_signed), nonce,
					HL_CPUCP_SEC_ATTEST_INFO_TINEOUT_USEC);

	return rc;
}
int hl_fw_send_generic_request(struct hl_device *hdev, enum hl_passthrough_type sub_opcode,
dma_addr_t buff, u32 *size)
{
......
......@@ -1262,6 +1262,7 @@ struct hl_dec {
* @ASIC_GAUDI_SEC: Gaudi secured device (HL-2000).
* @ASIC_GAUDI2: Gaudi2 device.
* @ASIC_GAUDI2B: Gaudi2B device.
* @ASIC_GAUDI2C: Gaudi2C device.
*/
enum hl_asic_type {
ASIC_INVALID,
......@@ -1270,6 +1271,7 @@ enum hl_asic_type {
ASIC_GAUDI_SEC,
ASIC_GAUDI2,
ASIC_GAUDI2B,
ASIC_GAUDI2C,
};
struct hl_cs_parser;
......@@ -3519,6 +3521,9 @@ struct hl_device {
u8 heartbeat;
};
/*
 * Evaluates to the name of the PCI device (as returned by dev_name()) when hdev has a
 * pdev, or to the fixed string "NA-DEVICE" when it does not (e.g. when running on a
 * simulator). Note that hdev is evaluated more than once.
 */
#define HL_DEV_NAME(hdev) \
((hdev)->pdev ? dev_name(&(hdev)->pdev->dev) : "NA-DEVICE")
/**
* struct hl_cs_encaps_sig_handle - encapsulated signals handle structure
......@@ -3594,6 +3599,14 @@ static inline bool hl_is_fw_sw_ver_below(struct hl_device *hdev, u32 fw_sw_major
return false;
}
/*
 * hl_is_fw_sw_ver_equal_or_greater() - compare the device's f/w s/w version to a given one.
 * @hdev: habanalabs device structure; its fw_sw_major_ver/fw_sw_minor_ver are examined.
 * @fw_sw_major: major version to compare against.
 * @fw_sw_minor: minor version to compare against.
 *
 * Return: true if the device's (major, minor) pair is equal to or greater than the given
 *         pair, false otherwise.
 */
static inline bool hl_is_fw_sw_ver_equal_or_greater(struct hl_device *hdev, u32 fw_sw_major,
							u32 fw_sw_minor)
{
	/* A differing major version decides the comparison on its own */
	if (hdev->fw_sw_major_ver != fw_sw_major)
		return hdev->fw_sw_major_ver > fw_sw_major;

	/* Same major version - the minor version is the tie-breaker */
	return hdev->fw_sw_minor_ver >= fw_sw_minor;
}
/*
* Kernel module functions that can be accessed by entire module
*/
......@@ -3954,6 +3967,8 @@ long hl_fw_get_max_power(struct hl_device *hdev);
void hl_fw_set_max_power(struct hl_device *hdev);
int hl_fw_get_sec_attest_info(struct hl_device *hdev, struct cpucp_sec_attest_info *sec_attest_info,
u32 nonce);
int hl_fw_get_dev_info_signed(struct hl_device *hdev,
struct cpucp_dev_info_signed *dev_info_signed, u32 nonce);
int hl_set_voltage(struct hl_device *hdev, int sensor_index, u32 attr, long value);
int hl_set_current(struct hl_device *hdev, int sensor_index, u32 attr, long value);
int hl_set_power(struct hl_device *hdev, int sensor_index, u32 attr, long value);
......
......@@ -141,6 +141,9 @@ static enum hl_asic_type get_asic_type(struct hl_device *hdev)
case REV_ID_B:
asic_type = ASIC_GAUDI2B;
break;
case REV_ID_C:
asic_type = ASIC_GAUDI2C;
break;
default:
break;
}
......@@ -670,6 +673,38 @@ static pci_ers_result_t hl_pci_err_slot_reset(struct pci_dev *pdev)
return PCI_ERS_RESULT_RECOVERED;
}
/*
 * hl_pci_reset_prepare() - .reset_prepare hook of the driver's PCI error handlers.
 * @pdev: PCI device that is about to be reset.
 *
 * Marks the habanalabs device attached to @pdev (if any) as disabled.
 */
static void hl_pci_reset_prepare(struct pci_dev *pdev)
{
	struct hl_device *hdev = pci_get_drvdata(pdev);

	if (hdev)
		hdev->disabled = true;
}
/*
 * hl_pci_reset_done() - .reset_done hook of the driver's PCI error handlers.
 * @pdev: PCI device whose reset has completed.
 *
 * Does nothing if no habanalabs device is attached to @pdev.
 */
static void hl_pci_reset_done(struct pci_dev *pdev)
{
	struct hl_device *hdev = pci_get_drvdata(pdev);

	if (!hdev)
		return;

	/*
	 * Trigger a hard reset of the device.
	 * This handler exists for the rare case where FLR occurs while the driver is up.
	 * That is valid only when working with no VM, where the FW handles the FLR and
	 * resets the device. The FW then goes back to the preboot stage, so the driver
	 * has to perform a hard reset in order to load the FW fit again.
	 */
	hl_device_reset(hdev, HL_DRV_RESET_HARD | HL_DRV_RESET_BYPASS_REQ_TO_FW);
}
static const struct dev_pm_ops hl_pm_ops = {
.suspend = hl_pmops_suspend,
.resume = hl_pmops_resume,
......@@ -679,6 +714,8 @@ static const struct pci_error_handlers hl_pci_err_handler = {
.error_detected = hl_pci_err_detected,
.slot_reset = hl_pci_err_slot_reset,
.resume = hl_pci_err_resume,
.reset_prepare = hl_pci_reset_prepare,
.reset_done = hl_pci_reset_done,
};
static struct pci_driver hl_pci_driver = {
......
......@@ -19,6 +19,9 @@
#include <asm/msr.h>
/*
 * Make sure there is space for all the signed info: a whole struct cpucp_info
 * must fit within SEC_DEV_INFO_BUF_SZ bytes, otherwise the build fails.
 */
static_assert(sizeof(struct cpucp_info) <= SEC_DEV_INFO_BUF_SZ);
static u32 hl_debug_struct_size[HL_DEBUG_OP_TIMESTAMP + 1] = {
[HL_DEBUG_OP_ETR] = sizeof(struct hl_debug_params_etr),
[HL_DEBUG_OP_ETF] = sizeof(struct hl_debug_params_etf),
......@@ -685,7 +688,7 @@ static int sec_attest_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
if (!sec_attest_info)
return -ENOMEM;
info = kmalloc(sizeof(*info), GFP_KERNEL);
info = kzalloc(sizeof(*info), GFP_KERNEL);
if (!info) {
rc = -ENOMEM;
goto free_sec_attest_info;
......@@ -719,6 +722,53 @@ static int sec_attest_info(struct hl_fpriv *hpriv, struct hl_info_args *args)
return rc;
}
static int dev_info_signed(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
void __user *out = (void __user *) (uintptr_t) args->return_pointer;
struct cpucp_dev_info_signed *dev_info_signed;
struct hl_info_signed *info;
u32 max_size = args->return_size;
int rc;
if ((!max_size) || (!out))
return -EINVAL;
dev_info_signed = kzalloc(sizeof(*dev_info_signed), GFP_KERNEL);
if (!dev_info_signed)
return -ENOMEM;
info = kzalloc(sizeof(*info), GFP_KERNEL);
if (!info) {
rc = -ENOMEM;
goto free_dev_info_signed;
}
rc = hl_fw_get_dev_info_signed(hpriv->hdev,
dev_info_signed, args->sec_attest_nonce);
if (rc)
goto free_info;
info->nonce = le32_to_cpu(dev_info_signed->nonce);
info->info_sig_len = dev_info_signed->info_sig_len;
info->pub_data_len = le16_to_cpu(dev_info_signed->pub_data_len);
info->certificate_len = le16_to_cpu(dev_info_signed->certificate_len);
info->dev_info_len = sizeof(struct cpucp_info);
memcpy(&info->info_sig, &dev_info_signed->info_sig, sizeof(info->info_sig));
memcpy(&info->public_data, &dev_info_signed->public_data, sizeof(info->public_data));
memcpy(&info->certificate, &dev_info_signed->certificate, sizeof(info->certificate));
memcpy(&info->dev_info, &dev_info_signed->info, info->dev_info_len);
rc = copy_to_user(out, info, min_t(size_t, max_size, sizeof(*info))) ? -EFAULT : 0;
free_info:
kfree(info);
free_dev_info_signed:
kfree(dev_info_signed);
return rc;
}
static int eventfd_register(struct hl_fpriv *hpriv, struct hl_info_args *args)
{
int rc;
......@@ -1089,6 +1139,9 @@ static int _hl_info_ioctl(struct hl_fpriv *hpriv, void *data,
case HL_INFO_FW_GENERIC_REQ:
return send_fw_generic_request(hdev, args);
case HL_INFO_DEV_SIGNED:
return dev_info_signed(hpriv, args);
default:
dev_err(dev, "Invalid request %d\n", args->op);
rc = -EINVAL;
......
......@@ -578,10 +578,6 @@ int hl_get_temperature(struct hl_device *hdev,
CPUCP_PKT_CTL_OPCODE_SHIFT);
pkt.sensor_index = __cpu_to_le16(sensor_index);
pkt.type = __cpu_to_le16(attr);
dev_dbg(hdev->dev, "get temp, ctl 0x%x, sensor %d, type %d\n",
pkt.ctl, pkt.sensor_index, pkt.type);
rc = hdev->asic_funcs->send_cpu_message(hdev, (u32 *) &pkt, sizeof(pkt),
0, &result);
......
......@@ -955,8 +955,8 @@ static int map_phys_pg_pack(struct hl_ctx *ctx, u64 vaddr,
(i + 1) == phys_pg_pack->npages);
if (rc) {
dev_err(hdev->dev,
"map failed for handle %u, npages: %llu, mapped: %llu",
phys_pg_pack->handle, phys_pg_pack->npages,
"map failed (%d) for handle %u, npages: %llu, mapped: %llu\n",
rc, phys_pg_pack->handle, phys_pg_pack->npages,
mapped_pg_cnt);
goto err;
}
......@@ -1186,7 +1186,8 @@ static int map_device_va(struct hl_ctx *ctx, struct hl_mem_in *args, u64 *device
rc = map_phys_pg_pack(ctx, ret_vaddr, phys_pg_pack);
if (rc) {
dev_err(hdev->dev, "mapping page pack failed for handle %u\n", handle);
dev_err(hdev->dev, "mapping page pack failed (%d) for handle %u\n",
rc, handle);
mutex_unlock(&hdev->mmu_lock);
goto map_err;
}
......
......@@ -596,6 +596,7 @@ int hl_mmu_if_set_funcs(struct hl_device *hdev)
break;
case ASIC_GAUDI2:
case ASIC_GAUDI2B:
case ASIC_GAUDI2C:
/* MMUs in Gaudi2 are always host resident */
hl_mmu_v2_hr_set_funcs(hdev, &hdev->mmu_func[MMU_HR_PGT]);
break;
......
......@@ -8,6 +8,7 @@
#include "habanalabs.h"
#include <linux/pci.h>
#include <linux/types.h>
static ssize_t clk_max_freq_mhz_show(struct device *dev, struct device_attribute *attr, char *buf)
{
......@@ -80,12 +81,27 @@ static ssize_t vrm_ver_show(struct device *dev, struct device_attribute *attr, c
{
struct hl_device *hdev = dev_get_drvdata(dev);
struct cpucp_info *cpucp_info;
u32 infineon_second_stage_version;
u32 infineon_second_stage_first_instance;
u32 infineon_second_stage_second_instance;
u32 infineon_second_stage_third_instance;
u32 mask = 0xff;
cpucp_info = &hdev->asic_prop.cpucp_info;
infineon_second_stage_version = le32_to_cpu(cpucp_info->infineon_second_stage_version);
infineon_second_stage_first_instance = infineon_second_stage_version & mask;
infineon_second_stage_second_instance =
(infineon_second_stage_version >> 8) & mask;
infineon_second_stage_third_instance =
(infineon_second_stage_version >> 16) & mask;
if (cpucp_info->infineon_second_stage_version)
return sprintf(buf, "%#04x %#04x\n", le32_to_cpu(cpucp_info->infineon_version),
le32_to_cpu(cpucp_info->infineon_second_stage_version));
return sprintf(buf, "%#04x %#04x:%#04x:%#04x\n",
le32_to_cpu(cpucp_info->infineon_version),
infineon_second_stage_first_instance,
infineon_second_stage_second_instance,
infineon_second_stage_third_instance);
else
return sprintf(buf, "%#04x\n", le32_to_cpu(cpucp_info->infineon_version));
}
......@@ -251,6 +267,9 @@ static ssize_t device_type_show(struct device *dev,
case ASIC_GAUDI2B:
str = "GAUDI2B";
break;
case ASIC_GAUDI2C:
str = "GAUDI2C";
break;
default:
dev_err(hdev->dev, "Unrecognized ASIC type %d\n",
hdev->asic_type);
......@@ -383,6 +402,21 @@ static ssize_t security_enabled_show(struct device *dev,
return sprintf(buf, "%d\n", hdev->asic_prop.fw_security_enabled);
}
static ssize_t module_id_show(struct device *dev,
struct device_attribute *attr, char *buf)
{
struct hl_device *hdev = dev_get_drvdata(dev);
return sprintf(buf, "%u\n", le32_to_cpu(hdev->asic_prop.cpucp_info.card_location));
}
/*
 * parent_device_show() - sysfs show callback of the read-only "parent_device" attribute.
 *
 * Prints the string produced by HL_DEV_NAME() for the device, followed by a newline.
 */
static ssize_t parent_device_show(struct device *dev, struct device_attribute *attr, char *buf)
{
	struct hl_device *hdev = dev_get_drvdata(dev);
	const char *parent_name = HL_DEV_NAME(hdev);

	return sprintf(buf, "%s\n", parent_name);
}
static DEVICE_ATTR_RO(armcp_kernel_ver);
static DEVICE_ATTR_RO(armcp_ver);
static DEVICE_ATTR_RO(cpld_ver);
......@@ -402,6 +436,8 @@ static DEVICE_ATTR_RO(thermal_ver);
static DEVICE_ATTR_RO(uboot_ver);
static DEVICE_ATTR_RO(fw_os_ver);
static DEVICE_ATTR_RO(security_enabled);
static DEVICE_ATTR_RO(module_id);
static DEVICE_ATTR_RO(parent_device);
static struct bin_attribute bin_attr_eeprom = {
.attr = {.name = "eeprom", .mode = (0444)},
......@@ -427,6 +463,8 @@ static struct attribute *hl_dev_attrs[] = {
&dev_attr_uboot_ver.attr,
&dev_attr_fw_os_ver.attr,
&dev_attr_security_enabled.attr,
&dev_attr_module_id.attr,
&dev_attr_parent_device.attr,
NULL,
};
......
......@@ -7858,39 +7858,44 @@ static bool gaudi2_handle_ecc_event(struct hl_device *hdev, u16 event_type,
return !!ecc_data->is_critical;
}
static void handle_lower_qman_data_on_err(struct hl_device *hdev, u64 qman_base, u64 event_mask)
static void handle_lower_qman_data_on_err(struct hl_device *hdev, u64 qman_base, u32 engine_id)
{
u32 lo, hi, cq_ptr_size, arc_cq_ptr_size;
u64 cq_ptr, arc_cq_ptr, cp_current_inst;
struct undefined_opcode_info *undef_opcode = &hdev->captured_err_info.undef_opcode;
u64 cq_ptr, cp_current_inst;
u32 lo, hi, cq_size, cp_sts;
bool is_arc_cq;
lo = RREG32(qman_base + QM_CQ_PTR_LO_4_OFFSET);
hi = RREG32(qman_base + QM_CQ_PTR_HI_4_OFFSET);
cq_ptr = ((u64) hi) << 32 | lo;
cq_ptr_size = RREG32(qman_base + QM_CQ_TSIZE_4_OFFSET);
cp_sts = RREG32(qman_base + QM_CP_STS_4_OFFSET);
is_arc_cq = FIELD_GET(PDMA0_QM_CP_STS_CUR_CQ_MASK, cp_sts); /* 0 - legacy CQ, 1 - ARC_CQ */
lo = RREG32(qman_base + QM_ARC_CQ_PTR_LO_OFFSET);
hi = RREG32(qman_base + QM_ARC_CQ_PTR_HI_OFFSET);
arc_cq_ptr = ((u64) hi) << 32 | lo;
arc_cq_ptr_size = RREG32(qman_base + QM_ARC_CQ_TSIZE_OFFSET);
if (is_arc_cq) {
lo = RREG32(qman_base + QM_ARC_CQ_PTR_LO_STS_OFFSET);
hi = RREG32(qman_base + QM_ARC_CQ_PTR_HI_STS_OFFSET);
cq_ptr = ((u64) hi) << 32 | lo;
cq_size = RREG32(qman_base + QM_ARC_CQ_TSIZE_STS_OFFSET);
} else {
lo = RREG32(qman_base + QM_CQ_PTR_LO_STS_4_OFFSET);
hi = RREG32(qman_base + QM_CQ_PTR_HI_STS_4_OFFSET);
cq_ptr = ((u64) hi) << 32 | lo;
cq_size = RREG32(qman_base + QM_CQ_TSIZE_STS_4_OFFSET);
}
lo = RREG32(qman_base + QM_CP_CURRENT_INST_LO_4_OFFSET);
hi = RREG32(qman_base + QM_CP_CURRENT_INST_HI_4_OFFSET);
cp_current_inst = ((u64) hi) << 32 | lo;
dev_info(hdev->dev,
"LowerQM. CQ: {ptr %#llx, size %u}, ARC_CQ: {ptr %#llx, size %u}, CP: {instruction %#llx}\n",
cq_ptr, cq_ptr_size, arc_cq_ptr, arc_cq_ptr_size, cp_current_inst);
"LowerQM. %sCQ: {ptr %#llx, size %u}, CP: {instruction %#018llx}\n",
is_arc_cq ? "ARC_" : "", cq_ptr, cq_size, cp_current_inst);
if (event_mask & HL_NOTIFIER_EVENT_UNDEFINED_OPCODE) {
if (arc_cq_ptr) {
hdev->captured_err_info.undef_opcode.cq_addr = arc_cq_ptr;
hdev->captured_err_info.undef_opcode.cq_size = arc_cq_ptr_size;
} else {
hdev->captured_err_info.undef_opcode.cq_addr = cq_ptr;
hdev->captured_err_info.undef_opcode.cq_size = cq_ptr_size;
}
hdev->captured_err_info.undef_opcode.stream_id = QMAN_STREAMS;
if (undef_opcode->write_enable) {
memset(undef_opcode, 0, sizeof(*undef_opcode));
undef_opcode->timestamp = ktime_get();
undef_opcode->cq_addr = cq_ptr;
undef_opcode->cq_size = cq_size;
undef_opcode->engine_id = engine_id;
undef_opcode->stream_id = QMAN_STREAMS;
undef_opcode->write_enable = 0;
}
}
......@@ -7929,21 +7934,12 @@ static int gaudi2_handle_qman_err_generic(struct hl_device *hdev, u16 event_type
error_count++;
}
if (i == QMAN_STREAMS && error_count) {
/* check for undefined opcode */
if (glbl_sts_val & PDMA0_QM_GLBL_ERR_STS_CP_UNDEF_CMD_ERR_MASK &&
hdev->captured_err_info.undef_opcode.write_enable) {
memset(&hdev->captured_err_info.undef_opcode, 0,
sizeof(hdev->captured_err_info.undef_opcode));
hdev->captured_err_info.undef_opcode.write_enable = false;
hdev->captured_err_info.undef_opcode.timestamp = ktime_get();
hdev->captured_err_info.undef_opcode.engine_id =
gaudi2_queue_id_to_engine_id[qid_base];
*event_mask |= HL_NOTIFIER_EVENT_UNDEFINED_OPCODE;
}
handle_lower_qman_data_on_err(hdev, qman_base, *event_mask);
/* Check for undefined opcode error in lower QM */
if ((i == QMAN_STREAMS) &&
(glbl_sts_val & PDMA0_QM_GLBL_ERR_STS_CP_UNDEF_CMD_ERR_MASK)) {
handle_lower_qman_data_on_err(hdev, qman_base,
gaudi2_queue_id_to_engine_id[qid_base]);
*event_mask |= HL_NOTIFIER_EVENT_UNDEFINED_OPCODE;
}
}
......@@ -10007,6 +10003,8 @@ static void gaudi2_handle_eqe(struct hl_device *hdev, struct hl_eq_entry *eq_ent
error_count = gaudi2_handle_pcie_drain(hdev, &eq_entry->pcie_drain_ind_data);
reset_flags |= HL_DRV_RESET_FW_FATAL_ERR;
event_mask |= HL_NOTIFIER_EVENT_GENERAL_HW_ERR;
if (hl_is_fw_sw_ver_equal_or_greater(hdev, 1, 13))
is_critical = true;
break;
case GAUDI2_EVENT_PSOC59_RPM_ERROR_OR_DRAIN:
......
......@@ -242,14 +242,15 @@
#define QM_FENCE2_OFFSET (mmPDMA0_QM_CP_FENCE2_RDATA_0 - mmPDMA0_QM_BASE)
#define QM_SEI_STATUS_OFFSET (mmPDMA0_QM_SEI_STATUS - mmPDMA0_QM_BASE)
#define QM_CQ_PTR_LO_4_OFFSET (mmPDMA0_QM_CQ_PTR_LO_4 - mmPDMA0_QM_BASE)
#define QM_CQ_PTR_HI_4_OFFSET (mmPDMA0_QM_CQ_PTR_HI_4 - mmPDMA0_QM_BASE)
#define QM_CQ_TSIZE_4_OFFSET (mmPDMA0_QM_CQ_TSIZE_4 - mmPDMA0_QM_BASE)
#define QM_CQ_TSIZE_STS_4_OFFSET (mmPDMA0_QM_CQ_TSIZE_STS_4 - mmPDMA0_QM_BASE)
#define QM_CQ_PTR_LO_STS_4_OFFSET (mmPDMA0_QM_CQ_PTR_LO_STS_4 - mmPDMA0_QM_BASE)
#define QM_CQ_PTR_HI_STS_4_OFFSET (mmPDMA0_QM_CQ_PTR_HI_STS_4 - mmPDMA0_QM_BASE)
#define QM_ARC_CQ_PTR_LO_OFFSET (mmPDMA0_QM_ARC_CQ_PTR_LO - mmPDMA0_QM_BASE)
#define QM_ARC_CQ_PTR_HI_OFFSET (mmPDMA0_QM_ARC_CQ_PTR_HI - mmPDMA0_QM_BASE)
#define QM_ARC_CQ_TSIZE_OFFSET (mmPDMA0_QM_ARC_CQ_TSIZE - mmPDMA0_QM_BASE)
#define QM_ARC_CQ_TSIZE_STS_OFFSET (mmPDMA0_QM_ARC_CQ_TSIZE_STS - mmPDMA0_QM_BASE)
#define QM_ARC_CQ_PTR_LO_STS_OFFSET (mmPDMA0_QM_ARC_CQ_PTR_LO_STS - mmPDMA0_QM_BASE)
#define QM_ARC_CQ_PTR_HI_STS_OFFSET (mmPDMA0_QM_ARC_CQ_PTR_HI_STS - mmPDMA0_QM_BASE)
#define QM_CP_STS_4_OFFSET (mmPDMA0_QM_CP_STS_4 - mmPDMA0_QM_BASE)
#define QM_CP_CURRENT_INST_LO_4_OFFSET (mmPDMA0_QM_CP_CURRENT_INST_LO_4 - mmPDMA0_QM_BASE)
#define QM_CP_CURRENT_INST_HI_4_OFFSET (mmPDMA0_QM_CP_CURRENT_INST_HI_4 - mmPDMA0_QM_BASE)
......
......@@ -25,6 +25,7 @@ enum hl_revision_id {
REV_ID_INVALID = 0x00,
REV_ID_A = 0x01,
REV_ID_B = 0x02,
REV_ID_C = 0x03
};
#endif /* INCLUDE_PCI_GENERAL_H_ */
......@@ -659,6 +659,12 @@ enum pq_init_status {
* number (nonce) provided by the host to prevent replay attacks.
* public key and certificate also provided as part of the FW response.
*
* CPUCP_PACKET_INFO_SIGNED_GET -
* Get the device information signed by the Trusted Platform device.
* The device info data is also hashed with a unique number (nonce) provided
* by the host to prevent replay attacks. The public key and certificate are
* also provided as part of the FW response.
*
* CPUCP_PACKET_MONITOR_DUMP_GET -
* Get monitors registers dump from the CpuCP kernel.
* The CPU will put the registers dump in the a buffer allocated by the driver
......@@ -733,7 +739,7 @@ enum cpucp_packet_id {
CPUCP_PACKET_ENGINE_CORE_ASID_SET, /* internal */
CPUCP_PACKET_RESERVED2, /* not used */
CPUCP_PACKET_SEC_ATTEST_GET, /* internal */
CPUCP_PACKET_RESERVED3, /* not used */
CPUCP_PACKET_INFO_SIGNED_GET, /* internal */
CPUCP_PACKET_RESERVED4, /* not used */
CPUCP_PACKET_MONITOR_DUMP_GET, /* debugfs */
CPUCP_PACKET_RESERVED5, /* not used */
......
......@@ -846,6 +846,7 @@ enum hl_server_type {
#define HL_INFO_HW_ERR_EVENT 36
#define HL_INFO_FW_ERR_EVENT 37
#define HL_INFO_USER_ENGINE_ERR_EVENT 38
#define HL_INFO_DEV_SIGNED 40
#define HL_INFO_VERSION_MAX_LEN 128
#define HL_INFO_CARD_NAME_MAX_LEN 16
......@@ -1256,6 +1257,7 @@ struct hl_info_dev_memalloc_page_sizes {
#define SEC_SIGNATURE_BUF_SZ 255 /* (256 - 1) 1 byte used for size */
#define SEC_PUB_DATA_BUF_SZ 510 /* (512 - 2) 2 bytes used for size */
#define SEC_CERTIFICATE_BUF_SZ 2046 /* (2048 - 2) 2 bytes used for size */
#define SEC_DEV_INFO_BUF_SZ 5120
/*
* struct hl_info_sec_attest - attestation report of the boot
......@@ -1290,6 +1292,32 @@ struct hl_info_sec_attest {
__u8 pad0[2];
};
/*
 * struct hl_info_signed - device information signed by a secured device.
 * @nonce: number only used once. random number provided by host. this is also passed to
 *         the quote command as a qualifying data.
 * @pub_data_len: length of the public data (bytes)
 * @certificate_len: length of the certificate (bytes)
 * @info_sig_len: length of the info signature (bytes)
 * @public_data: public key info signed info data (outPublic + name + qualifiedName)
 * @certificate: certificate for the signing key
 * @info_sig: signature of the info + nonce data.
 * @dev_info_len: length of device info (bytes)
 * @dev_info: device info as byte array.
 * @pad: padding bytes at the end of the structure.
 */
struct hl_info_signed {
__u32 nonce;
__u16 pub_data_len;
__u16 certificate_len;
__u8 info_sig_len;
__u8 public_data[SEC_PUB_DATA_BUF_SZ];
__u8 certificate[SEC_CERTIFICATE_BUF_SZ];
__u8 info_sig[SEC_SIGNATURE_BUF_SZ];
__u16 dev_info_len;
__u8 dev_info[SEC_DEV_INFO_BUF_SZ];
__u8 pad[2];
};
/**
* struct hl_page_fault_info - page fault information.
* @timestamp: timestamp of page fault.
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment