Commit 5984be90 authored by Jack Morgenstein, committed by Roland Dreier

mlx4_core: Report thermal error events

Print an error message when a thermal error async event is reported by the HW.
Signed-off-by: Jack Morgenstein <jackm@dev.mellanox.co.il>
Signed-off-by: Dotan Barak <dotanb@mellanox.com>
Signed-off-by: Roland Dreier <roland@purestorage.com>
parent e10903b0
...@@ -79,7 +79,8 @@ enum { ...@@ -79,7 +79,8 @@ enum {
(1ull << MLX4_EVENT_TYPE_SRQ_LIMIT) | \ (1ull << MLX4_EVENT_TYPE_SRQ_LIMIT) | \
(1ull << MLX4_EVENT_TYPE_CMD) | \ (1ull << MLX4_EVENT_TYPE_CMD) | \
(1ull << MLX4_EVENT_TYPE_COMM_CHANNEL) | \ (1ull << MLX4_EVENT_TYPE_COMM_CHANNEL) | \
(1ull << MLX4_EVENT_TYPE_FLR_EVENT)) (1ull << MLX4_EVENT_TYPE_FLR_EVENT) | \
(1ull << MLX4_EVENT_TYPE_FATAL_WARNING))
static void eq_set_ci(struct mlx4_eq *eq, int req_not) static void eq_set_ci(struct mlx4_eq *eq, int req_not)
{ {
...@@ -443,6 +444,35 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq) ...@@ -443,6 +444,35 @@ static int mlx4_eq_int(struct mlx4_dev *dev, struct mlx4_eq *eq)
queue_work(priv->mfunc.master.comm_wq, queue_work(priv->mfunc.master.comm_wq,
&priv->mfunc.master.slave_flr_event_work); &priv->mfunc.master.slave_flr_event_work);
break; break;
case MLX4_EVENT_TYPE_FATAL_WARNING:
	/*
	 * Fatal-warning async event.  Only the "warming" subtype (the
	 * thermal error reported by the HW) is handled; any other subtype
	 * is logged as unhandled together with the EQ state.
	 */
	if (eqe->subtype == MLX4_FATAL_WARNING_SUBTYPE_WARMING) {
		if (mlx4_is_master(dev)) {
			for (i = 0; i < dev->num_slaves; i++) {
				/*
				 * Skip our own function before logging, so
				 * the debug trace only lists slaves that are
				 * actually handed the event.
				 */
				if (i == dev->caps.function)
					continue;
				mlx4_dbg(dev, "%s: Sending "
					 "MLX4_FATAL_WARNING_SUBTYPE_WARMING"
					 " to slave: %d\n", __func__, i);
				mlx4_slave_event(dev, i, eqe);
			}
		}
		/* Both temperature fields are __be16 in the EQE. */
		mlx4_err(dev, "Temperature Threshold was reached! "
			 "Threshold: %d celsius degrees; "
			 "Current Temperature: %d\n",
			 be16_to_cpu(eqe->event.warming.warning_threshold),
			 be16_to_cpu(eqe->event.warming.current_temperature));
	} else {
		mlx4_warn(dev, "Unhandled event FATAL WARNING (%02x), "
			  "subtype %02x on EQ %d at index %u. owner=%x, "
			  "nent=0x%x, slave=%x, ownership=%s\n",
			  eqe->type, eqe->subtype, eq->eqn,
			  eq->cons_index, eqe->owner, eq->nent,
			  eqe->slave_id,
			  !!(eqe->owner & 0x80) ^
			  !!(eq->cons_index & eq->nent) ? "HW" : "SW");
	}
	break;
case MLX4_EVENT_TYPE_EEC_CATAS_ERROR: case MLX4_EVENT_TYPE_EEC_CATAS_ERROR:
case MLX4_EVENT_TYPE_ECC_DETECT: case MLX4_EVENT_TYPE_ECC_DETECT:
default: default:
......
...@@ -363,6 +363,10 @@ struct mlx4_eqe { ...@@ -363,6 +363,10 @@ struct mlx4_eqe {
struct { struct {
__be32 slave_id; __be32 slave_id;
} __packed flr_event; } __packed flr_event;
struct {
__be16 current_temperature;
__be16 warning_threshold;
} __packed warming;
} event; } event;
u8 slave_id; u8 slave_id;
u8 reserved3[2]; u8 reserved3[2];
......
...@@ -133,6 +133,7 @@ enum mlx4_event { ...@@ -133,6 +133,7 @@ enum mlx4_event {
MLX4_EVENT_TYPE_CMD = 0x0a, MLX4_EVENT_TYPE_CMD = 0x0a,
MLX4_EVENT_TYPE_VEP_UPDATE = 0x19, MLX4_EVENT_TYPE_VEP_UPDATE = 0x19,
MLX4_EVENT_TYPE_COMM_CHANNEL = 0x18, MLX4_EVENT_TYPE_COMM_CHANNEL = 0x18,
MLX4_EVENT_TYPE_FATAL_WARNING = 0x1b,
MLX4_EVENT_TYPE_FLR_EVENT = 0x1c, MLX4_EVENT_TYPE_FLR_EVENT = 0x1c,
MLX4_EVENT_TYPE_NONE = 0xff, MLX4_EVENT_TYPE_NONE = 0xff,
}; };
...@@ -142,6 +143,10 @@ enum { ...@@ -142,6 +143,10 @@ enum {
MLX4_PORT_CHANGE_SUBTYPE_ACTIVE = 4 MLX4_PORT_CHANGE_SUBTYPE_ACTIVE = 4
}; };
/* Subtype values carried in eqe->subtype for MLX4_EVENT_TYPE_FATAL_WARNING events. */
enum {
/* Temperature-threshold warning; the EQE's event.warming fields hold the readings. */
MLX4_FATAL_WARNING_SUBTYPE_WARMING = 0,
};
enum { enum {
MLX4_PERM_LOCAL_READ = 1 << 10, MLX4_PERM_LOCAL_READ = 1 << 10,
MLX4_PERM_LOCAL_WRITE = 1 << 11, MLX4_PERM_LOCAL_WRITE = 1 << 11,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment