Commit fd1483fe authored by Moshe Shemesh, committed by Saeed Mahameed

net/mlx5: Add support for FW reporter dump

Add support for the dump callback of the mlx5 FW reporter. Once we trigger
a FW dump, the FW writes the core dump to its raw data buffer. The tracer
translates the raw data to traces and saves them to a cyclic array. Once
the dump is done, the saved traces data is filled into the dump buffer. In
case the syndrome is not zero, the health buffer content will be printed as
well.

FW dump example:
$ devlink health dump show pci/0000:82:00.0 reporter fw
 dump fw traces:
   timestamp: 509006640427 lost: false event_id: 185 msg: dump general
info GVMI=0x0000
   timestamp: 509006645474 lost: false event_id: 185 msg: GVMI
management info, gvmi_management context:
   timestamp: 509006654463 lost: false event_id: 185 msg: [000]:
00000000  00000000  00000000  00000000
   timestamp: 509006656127 lost: false event_id: 185 msg: [010]:
00000000  00000000  00000000  00000000
   timestamp: 509006656255 lost: false event_id: 185 msg: [020]:
00000000  00000000  00000000  00000000
   timestamp: 509006656511 lost: false event_id: 185 msg: [030]:
00000000  00000000  00000000  00000000
   timestamp: 509006656639 lost: false event_id: 185 msg: [040]:
00000000  00000000  00000000  00000000
   timestamp: 509006656895 lost: false event_id: 185 msg: [050]:
00000000  00000000  00000000  00000000
   timestamp: 509006657023 lost: false event_id: 185 msg: [060]:
00000000  00000000  00000000  00000000
   timestamp: 509006657180 lost: false event_id: 185 msg: [070]:
00000000  00000000  00000000  00000000
   timestamp: 509006659839 lost: false event_id: 185 msg: CMDIF dbase
from IRON: active_dbase_slots = 0x00000000
   timestamp: 509006667391 lost: false event_id: 185 msg: GVMI=0x0000
hw_toc context:
   timestamp: 509006667647 lost: false event_id: 185 msg: [000]:
00000000  00000000  00000000  fffff000
   timestamp: 509006667775 lost: false event_id: 185 msg: [010]:
00000000  00000000  00000000  80d00000
...
...
Signed-off-by: Moshe Shemesh <moshe@mellanox.com>
Signed-off-by: Eran Ben Elisha <eranbe@mellanox.com>
Signed-off-by: Saeed Mahameed <saeedm@mellanox.com>
parent 1e34f3ef
...@@ -243,6 +243,19 @@ static int mlx5_fw_tracer_allocate_strings_db(struct mlx5_fw_tracer *tracer) ...@@ -243,6 +243,19 @@ static int mlx5_fw_tracer_allocate_strings_db(struct mlx5_fw_tracer *tracer)
return -ENOMEM; return -ENOMEM;
} }
/*
 * Prepare the saved traces array for use: initialize the mutex that guards
 * it and point the write cursor at the first slot.
 */
static void
mlx5_fw_tracer_init_saved_traces_array(struct mlx5_fw_tracer *tracer)
{
	mutex_init(&tracer->st_arr.lock);
	tracer->st_arr.saved_traces_index = 0;
}
/* Tear down the mutex that guards the saved traces array. */
static void
mlx5_fw_tracer_clean_saved_traces_array(struct mlx5_fw_tracer *tracer)
{
	struct mutex *st_lock = &tracer->st_arr.lock;

	mutex_destroy(st_lock);
}
static void mlx5_tracer_read_strings_db(struct work_struct *work) static void mlx5_tracer_read_strings_db(struct work_struct *work)
{ {
struct mlx5_fw_tracer *tracer = container_of(work, struct mlx5_fw_tracer, struct mlx5_fw_tracer *tracer = container_of(work, struct mlx5_fw_tracer,
...@@ -522,6 +535,24 @@ static void mlx5_fw_tracer_clean_ready_list(struct mlx5_fw_tracer *tracer) ...@@ -522,6 +535,24 @@ static void mlx5_fw_tracer_clean_ready_list(struct mlx5_fw_tracer *tracer)
list_del(&str_frmt->list); list_del(&str_frmt->list);
} }
/*
 * Store one formatted FW trace in the saved traces array.
 *
 * The entry is written at the current saved_traces_index; the index then
 * advances by one and wraps with a (SAVED_TRACES_NUM - 1) mask, so the oldest
 * entry is overwritten once the array is full. The whole update is done
 * under st_arr.lock.
 *
 * @tracer:    tracer owning the saved traces array
 * @timestamp: timestamp recorded for the trace
 * @lost:      "lost" flag recorded for the trace
 * @event_id:  event id recorded for the trace
 * @msg:       NUL-terminated trace text; truncated to fit the
 *             TRACE_STR_MSG-byte msg buffer of the entry
 */
static void mlx5_fw_tracer_save_trace(struct mlx5_fw_tracer *tracer,
				      u64 timestamp, bool lost,
				      u8 event_id, char *msg)
{
	struct mlx5_fw_trace_data *trace_data;

	mutex_lock(&tracer->st_arr.lock);
	trace_data = &tracer->st_arr.straces[tracer->st_arr.saved_traces_index];
	trace_data->timestamp = timestamp;
	trace_data->lost = lost;
	trace_data->event_id = event_id;
	/*
	 * strncpy() does not NUL-terminate the destination when the source
	 * is TRACE_STR_MSG bytes or longer. The saved message is later passed
	 * to devlink_fmsg_string_pair_put() as a C string, so copy at most
	 * TRACE_STR_MSG - 1 bytes and terminate explicitly.
	 */
	strncpy(trace_data->msg, msg, TRACE_STR_MSG - 1);
	trace_data->msg[TRACE_STR_MSG - 1] = '\0';

	tracer->st_arr.saved_traces_index =
		(tracer->st_arr.saved_traces_index + 1) & (SAVED_TRACES_NUM - 1);
	mutex_unlock(&tracer->st_arr.lock);
}
static void mlx5_tracer_print_trace(struct tracer_string_format *str_frmt, static void mlx5_tracer_print_trace(struct tracer_string_format *str_frmt,
struct mlx5_core_dev *dev, struct mlx5_core_dev *dev,
u64 trace_timestamp) u64 trace_timestamp)
...@@ -540,6 +571,9 @@ static void mlx5_tracer_print_trace(struct tracer_string_format *str_frmt, ...@@ -540,6 +571,9 @@ static void mlx5_tracer_print_trace(struct tracer_string_format *str_frmt,
trace_mlx5_fw(dev->tracer, trace_timestamp, str_frmt->lost, trace_mlx5_fw(dev->tracer, trace_timestamp, str_frmt->lost,
str_frmt->event_id, tmp); str_frmt->event_id, tmp);
mlx5_fw_tracer_save_trace(dev->tracer, trace_timestamp,
str_frmt->lost, str_frmt->event_id, tmp);
/* remove it from hash */ /* remove it from hash */
mlx5_tracer_clean_message(str_frmt); mlx5_tracer_clean_message(str_frmt);
} }
...@@ -786,6 +820,109 @@ static void mlx5_fw_tracer_ownership_change(struct work_struct *work) ...@@ -786,6 +820,109 @@ static void mlx5_fw_tracer_ownership_change(struct work_struct *work)
mlx5_fw_tracer_start(tracer); mlx5_fw_tracer_start(tracer);
} }
/*
 * Access the MLX5_REG_CORE_DUMP register with @in (of @size_in bytes) as the
 * input buffer.
 *
 * Returns -EOPNOTSUPP when the device reports neither the core_dump_general
 * nor the core_dump_qp debug capability; otherwise returns whatever
 * mlx5_core_access_reg() returns.
 */
static int mlx5_fw_tracer_set_core_dump_reg(struct mlx5_core_dev *dev,
					    u32 *in, int size_in)
{
	u32 out[MLX5_ST_SZ_DW(core_dump_reg)] = {};

	if (MLX5_CAP_DEBUG(dev, core_dump_general) ||
	    MLX5_CAP_DEBUG(dev, core_dump_qp))
		return mlx5_core_access_reg(dev, in, size_in, out, sizeof(out),
					    MLX5_REG_CORE_DUMP, 0, 1);

	return -EOPNOTSUPP;
}
/*
 * Ask the FW for a core dump and wait until the tracer work queue has been
 * flushed.
 *
 * Returns:
 *   -EOPNOTSUPP if the core_dump_general debug capability is not set or the
 *               device has no tracer,
 *   -EPERM      if tracer->owner is not set,
 *   the error from mlx5_fw_tracer_set_core_dump_reg() if the register access
 *               fails,
 *   0           after handle_traces_work was queued and the work queue
 *               flushed.
 */
int mlx5_fw_tracer_trigger_core_dump_general(struct mlx5_core_dev *dev)
{
	u32 in[MLX5_ST_SZ_DW(core_dump_reg)] = {};
	struct mlx5_fw_tracer *tracer = dev->tracer;
	int err;

	if (!MLX5_CAP_DEBUG(dev, core_dump_general))
		return -EOPNOTSUPP;
	if (!tracer)
		return -EOPNOTSUPP;
	if (!tracer->owner)
		return -EPERM;

	/* core_dump_type 0x0 -- presumably the "general" dump type; confirm */
	MLX5_SET(core_dump_reg, in, core_dump_type, 0x0);

	err = mlx5_fw_tracer_set_core_dump_reg(dev, in, sizeof(in));
	if (!err) {
		/* Run the trace handler and wait for it to finish. */
		queue_work(tracer->work_queue, &tracer->handle_traces_work);
		flush_workqueue(tracer->work_queue);
	}

	return err;
}
/*
 * Emit one saved trace into @fmsg as a nested object holding the
 * "timestamp", "lost", "event_id" and "msg" pairs, in that order.
 *
 * Stops at the first devlink_fmsg_* call that fails and returns its error;
 * returns 0 when every call succeeded.
 */
static int
mlx5_devlink_fmsg_fill_trace(struct devlink_fmsg *fmsg,
			     struct mlx5_fw_trace_data *trace_data)
{
	int err;

	err = devlink_fmsg_obj_nest_start(fmsg);
	if (!err)
		err = devlink_fmsg_u64_pair_put(fmsg, "timestamp",
						trace_data->timestamp);
	if (!err)
		err = devlink_fmsg_bool_pair_put(fmsg, "lost",
						 trace_data->lost);
	if (!err)
		err = devlink_fmsg_u8_pair_put(fmsg, "event_id",
					       trace_data->event_id);
	if (!err)
		err = devlink_fmsg_string_pair_put(fmsg, "msg",
						   trace_data->msg);
	if (!err)
		err = devlink_fmsg_obj_nest_end(fmsg);

	return err;
}
/*
 * Fill @fmsg with the saved FW traces as an array named "dump fw traces",
 * ordered from the oldest saved entry to the newest.
 *
 * A slot whose timestamp is zero is treated as never written:
 *  - if slot 0 is unwritten, no trace has been saved and -ENOMSG is returned;
 *  - if the slot at the write cursor is written, the array has wrapped and
 *    the cursor slot holds the oldest entry; otherwise the oldest entry is
 *    slot 0.
 * The newest entry is the slot just before the write cursor.
 *
 * The array is walked under st_arr.lock so it cannot change while it is
 * being dumped.
 *
 * Returns 0 on success, -ENOMSG when there is nothing to dump, or the error
 * of the first devlink_fmsg_* call that failed.
 */
int mlx5_fw_tracer_get_saved_traces_objects(struct mlx5_fw_tracer *tracer,
					    struct devlink_fmsg *fmsg)
{
	struct mlx5_fw_trace_data *straces = tracer->st_arr.straces;
	u32 index, start_index, end_index;
	u32 saved_traces_index;
	int err;

	mutex_lock(&tracer->st_arr.lock);

	/* Check for an empty array under the lock that protects st_arr. */
	if (!straces[0].timestamp) {
		err = -ENOMSG;
		goto unlock;
	}

	saved_traces_index = tracer->st_arr.saved_traces_index;
	if (straces[saved_traces_index].timestamp)
		start_index = saved_traces_index;
	else
		start_index = 0;
	end_index = (saved_traces_index - 1) & (SAVED_TRACES_NUM - 1);

	err = devlink_fmsg_arr_pair_nest_start(fmsg, "dump fw traces");
	if (err)
		goto unlock;

	/*
	 * Walk from the oldest entry up to and including end_index, which is
	 * the most recently saved trace. Slot 0 is known to be written at
	 * this point, so there is always at least one entry to emit.
	 */
	index = start_index;
	for (;;) {
		err = mlx5_devlink_fmsg_fill_trace(fmsg, &straces[index]);
		if (err)
			goto unlock;
		if (index == end_index)
			break;
		index = (index + 1) & (SAVED_TRACES_NUM - 1);
	}

	err = devlink_fmsg_arr_pair_nest_end(fmsg);
unlock:
	mutex_unlock(&tracer->st_arr.lock);
	return err;
}
/* Create software resources (Buffers, etc ..) */ /* Create software resources (Buffers, etc ..) */
struct mlx5_fw_tracer *mlx5_fw_tracer_create(struct mlx5_core_dev *dev) struct mlx5_fw_tracer *mlx5_fw_tracer_create(struct mlx5_core_dev *dev)
{ {
...@@ -833,6 +970,7 @@ struct mlx5_fw_tracer *mlx5_fw_tracer_create(struct mlx5_core_dev *dev) ...@@ -833,6 +970,7 @@ struct mlx5_fw_tracer *mlx5_fw_tracer_create(struct mlx5_core_dev *dev)
goto free_log_buf; goto free_log_buf;
} }
mlx5_fw_tracer_init_saved_traces_array(tracer);
mlx5_core_dbg(dev, "FWTracer: Tracer created\n"); mlx5_core_dbg(dev, "FWTracer: Tracer created\n");
return tracer; return tracer;
...@@ -917,6 +1055,7 @@ void mlx5_fw_tracer_destroy(struct mlx5_fw_tracer *tracer) ...@@ -917,6 +1055,7 @@ void mlx5_fw_tracer_destroy(struct mlx5_fw_tracer *tracer)
cancel_work_sync(&tracer->read_fw_strings_work); cancel_work_sync(&tracer->read_fw_strings_work);
mlx5_fw_tracer_clean_ready_list(tracer); mlx5_fw_tracer_clean_ready_list(tracer);
mlx5_fw_tracer_clean_print_hash(tracer); mlx5_fw_tracer_clean_print_hash(tracer);
mlx5_fw_tracer_clean_saved_traces_array(tracer);
mlx5_fw_tracer_free_strings_db(tracer); mlx5_fw_tracer_free_strings_db(tracer);
mlx5_fw_tracer_destroy_log_buf(tracer); mlx5_fw_tracer_destroy_log_buf(tracer);
flush_workqueue(tracer->work_queue); flush_workqueue(tracer->work_queue);
......
...@@ -46,6 +46,9 @@ ...@@ -46,6 +46,9 @@
#define TRACER_BLOCK_SIZE_BYTE 256 #define TRACER_BLOCK_SIZE_BYTE 256
#define TRACES_PER_BLOCK 32 #define TRACES_PER_BLOCK 32
/* Size, in bytes, of the msg buffer of one saved trace entry */
#define TRACE_STR_MSG 256
/*
 * Number of entries in the saved traces array. The array index is wrapped
 * with a (SAVED_TRACES_NUM - 1) mask, so this must stay a power of two.
 */
#define SAVED_TRACES_NUM 8192
#define TRACER_MAX_PARAMS 7 #define TRACER_MAX_PARAMS 7
#define MESSAGE_HASH_BITS 6 #define MESSAGE_HASH_BITS 6
#define MESSAGE_HASH_SIZE BIT(MESSAGE_HASH_BITS) #define MESSAGE_HASH_SIZE BIT(MESSAGE_HASH_BITS)
...@@ -53,6 +56,13 @@ ...@@ -53,6 +56,13 @@
#define MASK_52_7 (0x1FFFFFFFFFFF80) #define MASK_52_7 (0x1FFFFFFFFFFF80)
#define MASK_6_0 (0x7F) #define MASK_6_0 (0x7F)
/*
 * One entry of the saved traces array.
 *
 * timestamp: timestamp of the trace; a zero value is treated as "slot not
 *            written yet" when the array is dumped
 * lost:      "lost" flag of the trace
 * event_id:  event id of the trace
 * msg:       trace text, stored in a fixed TRACE_STR_MSG-byte buffer
 */
struct mlx5_fw_trace_data {
u64 timestamp;
bool lost;
u8 event_id;
char msg[TRACE_STR_MSG];
};
struct mlx5_fw_tracer { struct mlx5_fw_tracer {
struct mlx5_core_dev *dev; struct mlx5_core_dev *dev;
struct mlx5_nb nb; struct mlx5_nb nb;
...@@ -83,6 +93,13 @@ struct mlx5_fw_tracer { ...@@ -83,6 +93,13 @@ struct mlx5_fw_tracer {
u32 consumer_index; u32 consumer_index;
} buff; } buff;
/* Saved Traces Array */
struct {
struct mlx5_fw_trace_data straces[SAVED_TRACES_NUM];
u32 saved_traces_index;
struct mutex lock; /* Protect st_arr access */
} st_arr;
u64 last_timestamp; u64 last_timestamp;
struct work_struct handle_traces_work; struct work_struct handle_traces_work;
struct hlist_head hash[MESSAGE_HASH_SIZE]; struct hlist_head hash[MESSAGE_HASH_SIZE];
...@@ -171,5 +188,8 @@ struct mlx5_fw_tracer *mlx5_fw_tracer_create(struct mlx5_core_dev *dev); ...@@ -171,5 +188,8 @@ struct mlx5_fw_tracer *mlx5_fw_tracer_create(struct mlx5_core_dev *dev);
int mlx5_fw_tracer_init(struct mlx5_fw_tracer *tracer); int mlx5_fw_tracer_init(struct mlx5_fw_tracer *tracer);
void mlx5_fw_tracer_cleanup(struct mlx5_fw_tracer *tracer); void mlx5_fw_tracer_cleanup(struct mlx5_fw_tracer *tracer);
void mlx5_fw_tracer_destroy(struct mlx5_fw_tracer *tracer); void mlx5_fw_tracer_destroy(struct mlx5_fw_tracer *tracer);
int mlx5_fw_tracer_trigger_core_dump_general(struct mlx5_core_dev *dev);
int mlx5_fw_tracer_get_saved_traces_objects(struct mlx5_fw_tracer *tracer,
struct devlink_fmsg *fmsg);
#endif #endif
...@@ -41,6 +41,7 @@ ...@@ -41,6 +41,7 @@
#include "lib/eq.h" #include "lib/eq.h"
#include "lib/mlx5.h" #include "lib/mlx5.h"
#include "lib/pci_vsc.h" #include "lib/pci_vsc.h"
#include "diag/fw_tracer.h"
enum { enum {
MLX5_HEALTH_POLL_INTERVAL = 2 * HZ, MLX5_HEALTH_POLL_INTERVAL = 2 * HZ,
...@@ -405,9 +406,119 @@ mlx5_fw_reporter_diagnose(struct devlink_health_reporter *reporter, ...@@ -405,9 +406,119 @@ mlx5_fw_reporter_diagnose(struct devlink_health_reporter *reporter,
return devlink_fmsg_string_pair_put(fmsg, "Description", hsynd_str(synd)); return devlink_fmsg_string_pair_put(fmsg, "Description", hsynd_str(synd));
} }
/*
 * Optional context handed to the FW reporter dump callback.
 *
 * err_synd:     reported in the dump as the "syndrome" pair
 * miss_counter: reported in the dump as the "fw_miss_counter" pair
 */
struct mlx5_fw_reporter_ctx {
u8 err_synd;
int miss_counter;
};
/*
 * Add the reporter context to @fmsg as two pairs: "syndrome" (u8) followed
 * by "fw_miss_counter" (u32).
 *
 * Returns 0 on success or the error of the first pair that failed.
 */
static int
mlx5_fw_reporter_ctx_pairs_put(struct devlink_fmsg *fmsg,
			       struct mlx5_fw_reporter_ctx *fw_reporter_ctx)
{
	int err = devlink_fmsg_u8_pair_put(fmsg, "syndrome",
					   fw_reporter_ctx->err_synd);

	if (err)
		return err;

	return devlink_fmsg_u32_pair_put(fmsg, "fw_miss_counter",
					 fw_reporter_ctx->miss_counter);
}
/*
 * Add the content of the device health buffer to @fmsg.
 *
 * Nothing is added (and 0 is returned) when the syndrome byte of the health
 * buffer reads as zero. Otherwise a "health buffer" pair is emitted whose
 * value is an object with the "assert_var" array followed by the
 * "assert_exit_ptr", "assert_callra", "hw_id", "irisc_index", "synd",
 * "ext_synd" and "raw_fw_ver" pairs.
 *
 * Each field is read from the health buffer right before it is put into
 * @fmsg. Returns 0 on success or the error of the first devlink_fmsg_* call
 * that failed.
 */
static int
mlx5_fw_reporter_heath_buffer_data_put(struct mlx5_core_dev *dev,
				       struct devlink_fmsg *fmsg)
{
	struct health_buffer __iomem *h = dev->priv.health.health;
	int idx = 0;
	int err;

	if (ioread8(&h->synd) == 0)
		return 0;

	err = devlink_fmsg_pair_nest_start(fmsg, "health buffer");
	if (err)
		return err;

	err = devlink_fmsg_obj_nest_start(fmsg);
	if (err)
		return err;

	err = devlink_fmsg_arr_pair_nest_start(fmsg, "assert_var");
	if (err)
		return err;

	while (idx < ARRAY_SIZE(h->assert_var)) {
		err = devlink_fmsg_u32_put(fmsg,
					   ioread32be(h->assert_var + idx));
		if (err)
			return err;
		idx++;
	}

	/* From here on each step only runs if all previous ones succeeded. */
	err = devlink_fmsg_arr_pair_nest_end(fmsg);
	if (!err)
		err = devlink_fmsg_u32_pair_put(fmsg, "assert_exit_ptr",
						ioread32be(&h->assert_exit_ptr));
	if (!err)
		err = devlink_fmsg_u32_pair_put(fmsg, "assert_callra",
						ioread32be(&h->assert_callra));
	if (!err)
		err = devlink_fmsg_u32_pair_put(fmsg, "hw_id",
						ioread32be(&h->hw_id));
	if (!err)
		err = devlink_fmsg_u8_pair_put(fmsg, "irisc_index",
					       ioread8(&h->irisc_index));
	if (!err)
		err = devlink_fmsg_u8_pair_put(fmsg, "synd",
					       ioread8(&h->synd));
	if (!err)
		err = devlink_fmsg_u32_pair_put(fmsg, "ext_synd",
						ioread16be(&h->ext_synd));
	if (!err)
		err = devlink_fmsg_u32_pair_put(fmsg, "raw_fw_ver",
						ioread32be(&h->fw_ver));
	if (!err)
		err = devlink_fmsg_obj_nest_end(fmsg);
	if (!err)
		err = devlink_fmsg_pair_nest_end(fmsg);

	return err;
}
/*
 * Dump callback of the "fw" devlink health reporter.
 *
 * Steps, each one aborting the dump with its error on failure:
 *  1. trigger a general FW core dump through the tracer;
 *  2. if @priv_ctx was supplied, add its syndrome / miss counter pairs;
 *  3. add the health buffer content (skipped internally when the syndrome
 *     is zero);
 *  4. add the traces saved by the tracer.
 */
static int
mlx5_fw_reporter_dump(struct devlink_health_reporter *reporter,
		      struct devlink_fmsg *fmsg, void *priv_ctx)
{
	struct mlx5_core_dev *dev = devlink_health_reporter_priv(reporter);
	struct mlx5_fw_reporter_ctx *fw_reporter_ctx = priv_ctx;
	int err;

	err = mlx5_fw_tracer_trigger_core_dump_general(dev);
	if (err)
		return err;

	if (fw_reporter_ctx) {
		err = mlx5_fw_reporter_ctx_pairs_put(fmsg, fw_reporter_ctx);
		if (err)
			return err;
	}

	err = mlx5_fw_reporter_heath_buffer_data_put(dev, fmsg);
	if (!err)
		err = mlx5_fw_tracer_get_saved_traces_objects(dev->tracer,
							      fmsg);

	return err;
}
static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = { static const struct devlink_health_reporter_ops mlx5_fw_reporter_ops = {
.name = "fw", .name = "fw",
.diagnose = mlx5_fw_reporter_diagnose, .diagnose = mlx5_fw_reporter_diagnose,
.dump = mlx5_fw_reporter_dump,
}; };
static void mlx5_fw_reporter_create(struct mlx5_core_dev *dev) static void mlx5_fw_reporter_create(struct mlx5_core_dev *dev)
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment