Commit 72a813a4 authored by David S. Miller

Merge branch 'mlxsw-new-reset-flow'

Petr Machata says:

====================
mlxsw: Add support for new reset flow

Ido Schimmel writes:

This patchset changes mlxsw to issue a PCI reset during probe and
devlink reload so that the PCI firmware can be upgraded without a
reboot.

Unlike the old version of this patchset [1], in this version the driver
no longer tries to toggle the PCI link on its own to trigger a reset,
but instead calls into the PCI core to issue the reset.

The PCI APIs require the device lock to be held, which is why patches
#1-#6 change devlink to acquire the device lock during reload and net
namespace dismantle, so that drivers can issue a PCI reset from their
reload callbacks.
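
For illustration only (not part of the series), here is a minimal,
hypothetical sketch of why the lock matters: once devlink invokes the
reload operations with the device lock held, a driver's reload handler
can safely call the PCI core reset helpers that expect that lock. All
"foo" names below are made up.

/* Hypothetical devlink reload_down handler that performs a PCI function
 * reset. This is only safe because devlink now calls the reload
 * operations with the device lock held (see device_lock_assert() in
 * devlink_reload() in the diff below).
 */
static int foo_devlink_reload_down(struct devlink *devlink, bool netns_change,
				   enum devlink_reload_action action,
				   enum devlink_reload_limit limit,
				   struct netlink_ext_ack *extack)
{
	struct foo_priv *priv = devlink_priv(devlink);	/* hypothetical priv */
	struct pci_dev *pdev = priv->pdev;
	int err;

	device_lock_assert(&pdev->dev);	/* held by the devlink core */

	pci_cfg_access_lock(pdev);
	pci_save_state(pdev);
	err = __pci_reset_function_locked(pdev);
	pci_restore_state(pdev);
	pci_cfg_access_unlock(pdev);

	return err;
}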

Patch #7 adds a reset method quirk for NVIDIA Spectrum devices.

Patch #8 adds a debug-level print to the PCI core so that the device
ready delay is printed even if it is shorter than one second.

Patches #9-#11 are straightforward preparations in mlxsw.

Patch #12 finally implements the new reset flow in mlxsw.

Patch #13 adds PCI reset handlers in mlxsw to prevent user space from
resetting the device underneath an unaware driver. Instead, the driver
is gracefully de-initialized before the PCI reset and initialized again
after it.
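
The mlxsw handlers are in the diff below; as a generic, hypothetical
sketch of the pattern, a PCI driver opts into coordinated resets by
filling in struct pci_error_handlers, and the PCI core then brackets a
user-requested reset with these callbacks. The "foo" names are made up.

/* Sketch only: ->reset_prepare() runs before a user-requested PCI reset
 * and ->reset_done() runs after it, so the driver can tear down and
 * rebuild its state instead of being reset underneath it.
 */
static void foo_reset_prepare(struct pci_dev *pdev)
{
	struct foo_priv *priv = pci_get_drvdata(pdev);

	foo_teardown(priv);	/* hypothetical de-initialization */
}

static void foo_reset_done(struct pci_dev *pdev)
{
	struct foo_priv *priv = pci_get_drvdata(pdev);

	foo_init(priv);		/* hypothetical re-initialization */
}

static const struct pci_error_handlers foo_err_handler = {
	.reset_prepare	= foo_reset_prepare,
	.reset_done	= foo_reset_done,
};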

Patch #14 adds a PCI reset selftest to make sure this code path does not
regress.

[1] https://lore.kernel.org/netdev/cover.1679502371.git.petrm@nvidia.com/
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 4dce97b1 af51d6bd
@@ -1484,8 +1484,8 @@ operations:
dont-validate: [ strict ]
flags: [ admin-perm ]
do:
pre: devlink-nl-pre-doit
post: devlink-nl-post-doit
pre: devlink-nl-pre-doit-dev-lock
post: devlink-nl-post-doit-dev-lock
request:
attributes:
- bus-name
@@ -130,6 +130,7 @@ struct mlxsw_pci {
const struct pci_device_id *id;
enum mlxsw_pci_cqe_v max_cqe_ver; /* Maximal supported CQE version */
u8 num_sdq_cqs; /* Number of CQs used for SDQs */
bool skip_reset;
};
static void mlxsw_pci_queue_tasklet_schedule(struct mlxsw_pci_queue *q)
@@ -1476,11 +1477,47 @@ static int mlxsw_pci_sys_ready_wait(struct mlxsw_pci *mlxsw_pci,
return -EBUSY;
}
static int mlxsw_pci_sw_reset(struct mlxsw_pci *mlxsw_pci,
const struct pci_device_id *id)
static int mlxsw_pci_reset_at_pci_disable(struct mlxsw_pci *mlxsw_pci)
{
struct pci_dev *pdev = mlxsw_pci->pdev;
char mrsr_pl[MLXSW_REG_MRSR_LEN];
int err;
mlxsw_reg_mrsr_pack(mrsr_pl,
MLXSW_REG_MRSR_COMMAND_RESET_AT_PCI_DISABLE);
err = mlxsw_reg_write(mlxsw_pci->core, MLXSW_REG(mrsr), mrsr_pl);
if (err)
return err;
device_lock_assert(&pdev->dev);
pci_cfg_access_lock(pdev);
pci_save_state(pdev);
err = __pci_reset_function_locked(pdev);
if (err)
pci_err(pdev, "PCI function reset failed with %d\n", err);
pci_restore_state(pdev);
pci_cfg_access_unlock(pdev);
return err;
}
static int mlxsw_pci_reset_sw(struct mlxsw_pci *mlxsw_pci)
{
char mrsr_pl[MLXSW_REG_MRSR_LEN];
mlxsw_reg_mrsr_pack(mrsr_pl, MLXSW_REG_MRSR_COMMAND_SOFTWARE_RESET);
return mlxsw_reg_write(mlxsw_pci->core, MLXSW_REG(mrsr), mrsr_pl);
}
static int
mlxsw_pci_reset(struct mlxsw_pci *mlxsw_pci, const struct pci_device_id *id)
{
struct pci_dev *pdev = mlxsw_pci->pdev;
char mcam_pl[MLXSW_REG_MCAM_LEN];
bool pci_reset_supported;
u32 sys_status;
int err;
@@ -1491,11 +1528,27 @@ static int mlxsw_pci_sw_reset(struct mlxsw_pci *mlxsw_pci,
return err;
}
mlxsw_reg_mrsr_pack(mrsr_pl);
err = mlxsw_reg_write(mlxsw_pci->core, MLXSW_REG(mrsr), mrsr_pl);
/* PCI core already issued a PCI reset, do not issue another reset. */
if (mlxsw_pci->skip_reset)
return 0;
mlxsw_reg_mcam_pack(mcam_pl,
MLXSW_REG_MCAM_FEATURE_GROUP_ENHANCED_FEATURES);
err = mlxsw_reg_query(mlxsw_pci->core, MLXSW_REG(mcam), mcam_pl);
if (err)
return err;
mlxsw_reg_mcam_unpack(mcam_pl, MLXSW_REG_MCAM_PCI_RESET,
&pci_reset_supported);
if (pci_reset_supported) {
pci_dbg(pdev, "Starting PCI reset flow\n");
err = mlxsw_pci_reset_at_pci_disable(mlxsw_pci);
} else {
pci_dbg(pdev, "Starting software reset flow\n");
err = mlxsw_pci_reset_sw(mlxsw_pci);
}
err = mlxsw_pci_sys_ready_wait(mlxsw_pci, id, &sys_status);
if (err) {
dev_err(&pdev->dev, "Failed to reach system ready status after reset. Status is 0x%x\n",
@@ -1537,9 +1590,9 @@ static int mlxsw_pci_init(void *bus_priv, struct mlxsw_core *mlxsw_core,
if (!mbox)
return -ENOMEM;
err = mlxsw_pci_sw_reset(mlxsw_pci, mlxsw_pci->id);
err = mlxsw_pci_reset(mlxsw_pci, mlxsw_pci->id);
if (err)
goto err_sw_reset;
goto err_reset;
err = mlxsw_pci_alloc_irq_vectors(mlxsw_pci);
if (err < 0) {
@@ -1672,7 +1725,7 @@ static int mlxsw_pci_init(void *bus_priv, struct mlxsw_core *mlxsw_core,
err_query_fw:
mlxsw_pci_free_irq_vectors(mlxsw_pci);
err_alloc_irq:
err_sw_reset:
err_reset:
mbox_put:
mlxsw_cmd_mbox_free(mbox);
return err;
@@ -2059,11 +2112,34 @@ static void mlxsw_pci_remove(struct pci_dev *pdev)
kfree(mlxsw_pci);
}
static void mlxsw_pci_reset_prepare(struct pci_dev *pdev)
{
struct mlxsw_pci *mlxsw_pci = pci_get_drvdata(pdev);
mlxsw_core_bus_device_unregister(mlxsw_pci->core, false);
}
static void mlxsw_pci_reset_done(struct pci_dev *pdev)
{
struct mlxsw_pci *mlxsw_pci = pci_get_drvdata(pdev);
mlxsw_pci->skip_reset = true;
mlxsw_core_bus_device_register(&mlxsw_pci->bus_info, &mlxsw_pci_bus,
mlxsw_pci, false, NULL, NULL);
mlxsw_pci->skip_reset = false;
}
static const struct pci_error_handlers mlxsw_pci_err_handler = {
.reset_prepare = mlxsw_pci_reset_prepare,
.reset_done = mlxsw_pci_reset_done,
};
int mlxsw_pci_driver_register(struct pci_driver *pci_driver)
{
pci_driver->probe = mlxsw_pci_probe;
pci_driver->remove = mlxsw_pci_remove;
pci_driver->shutdown = mlxsw_pci_remove;
pci_driver->err_handler = &mlxsw_pci_err_handler;
return pci_register_driver(pci_driver);
}
EXPORT_SYMBOL(mlxsw_pci_driver_register);
@@ -10122,6 +10122,15 @@ mlxsw_reg_mgir_unpack(char *payload, u32 *hw_rev, char *fw_info_psid,
MLXSW_REG_DEFINE(mrsr, MLXSW_REG_MRSR_ID, MLXSW_REG_MRSR_LEN);
enum mlxsw_reg_mrsr_command {
/* Switch soft reset, does not reset PCI firmware. */
MLXSW_REG_MRSR_COMMAND_SOFTWARE_RESET = 1,
/* Reset will be done when PCI link will be disabled.
* This command will reset PCI firmware also.
*/
MLXSW_REG_MRSR_COMMAND_RESET_AT_PCI_DISABLE = 6,
};
/* reg_mrsr_command
* Reset/shutdown command
* 0 - do nothing
@@ -10130,10 +10139,11 @@ MLXSW_REG_DEFINE(mrsr, MLXSW_REG_MRSR_ID, MLXSW_REG_MRSR_LEN);
*/
MLXSW_ITEM32(reg, mrsr, command, 0x00, 0, 4);
static inline void mlxsw_reg_mrsr_pack(char *payload)
static inline void mlxsw_reg_mrsr_pack(char *payload,
enum mlxsw_reg_mrsr_command command)
{
MLXSW_REG_ZERO(mrsr, payload);
mlxsw_reg_mrsr_command_set(payload, 1);
mlxsw_reg_mrsr_command_set(payload, command);
}
/* MLCR - Management LED Control Register
@@ -10584,6 +10594,8 @@ MLXSW_ITEM32(reg, mcam, feature_group, 0x00, 16, 8);
enum mlxsw_reg_mcam_mng_feature_cap_mask_bits {
/* If set, MCIA supports 128 bytes payloads. Otherwise, 48 bytes. */
MLXSW_REG_MCAM_MCIA_128B = 34,
/* If set, MRSR.command=6 is supported. */
MLXSW_REG_MCAM_PCI_RESET = 48,
};
#define MLXSW_REG_BYTES_PER_DWORD 0x4
@@ -1219,6 +1219,9 @@ static int pci_dev_wait(struct pci_dev *dev, char *reset_type, int timeout)
if (delay > PCI_RESET_WAIT)
pci_info(dev, "ready %dms after %s\n", delay - 1,
reset_type);
else
pci_dbg(dev, "ready %dms after %s\n", delay - 1,
reset_type);
return 0;
}
@@ -3786,6 +3786,19 @@ static void quirk_no_pm_reset(struct pci_dev *dev)
DECLARE_PCI_FIXUP_CLASS_HEADER(PCI_VENDOR_ID_ATI, PCI_ANY_ID,
PCI_CLASS_DISPLAY_VGA, 8, quirk_no_pm_reset);
/*
* Spectrum-{1,2,3,4} devices report that a D3hot->D0 transition causes a reset
* (i.e., they advertise NoSoftRst-). However, this transition does not have
* any effect on the device: It continues to be operational and network ports
* remain up. Advertising this support makes it seem as if a PM reset is viable
* for these devices. Mark it as unavailable to skip it when testing reset
* methods.
*/
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MELLANOX, 0xcb84, quirk_no_pm_reset);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MELLANOX, 0xcf6c, quirk_no_pm_reset);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MELLANOX, 0xcf70, quirk_no_pm_reset);
DECLARE_PCI_FIXUP_HEADER(PCI_VENDOR_ID_MELLANOX, 0xcf80, quirk_no_pm_reset);
/*
* Thunderbolt controllers with broken MSI hotplug signaling:
* Entire 1st generation (Light Ridge, Eagle Ridge, Light Peak) and part
@@ -503,14 +503,14 @@ static void __net_exit devlink_pernet_pre_exit(struct net *net)
* all devlink instances from this namespace into init_net.
*/
devlinks_xa_for_each_registered_get(net, index, devlink) {
devl_lock(devlink);
devl_dev_lock(devlink, true);
err = 0;
if (devl_is_registered(devlink))
err = devlink_reload(devlink, &init_net,
DEVLINK_RELOAD_ACTION_DRIVER_REINIT,
DEVLINK_RELOAD_LIMIT_UNSPEC,
&actions_performed, NULL);
devl_unlock(devlink);
devl_dev_unlock(devlink, true);
devlink_put(devlink);
if (err && err != -EOPNOTSUPP)
pr_warn("Failed to reload devlink instance into init_net\n");
@@ -4,6 +4,7 @@
* Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
*/
#include <linux/device.h>
#include <net/genetlink.h>
#include <net/sock.h>
#include "devl_internal.h"
@@ -433,6 +434,13 @@ int devlink_reload(struct devlink *devlink, struct net *dest_net,
struct net *curr_net;
int err;
/* Make sure the reload operations are invoked with the device lock
* held to allow drivers to trigger functionality that expects it
* (e.g., PCI reset) and to close possible races between these
* operations and probe/remove.
*/
device_lock_assert(devlink->dev);
memcpy(remote_reload_stats, devlink->stats.remote_reload_stats,
sizeof(remote_reload_stats));
@@ -3,6 +3,7 @@
* Copyright (c) 2016 Jiri Pirko <jiri@mellanox.com>
*/
#include <linux/device.h>
#include <linux/etherdevice.h>
#include <linux/mutex.h>
#include <linux/netdevice.h>
@@ -96,6 +97,20 @@ static inline bool devl_is_registered(struct devlink *devlink)
return xa_get_mark(&devlinks, devlink->index, DEVLINK_REGISTERED);
}
static inline void devl_dev_lock(struct devlink *devlink, bool dev_lock)
{
if (dev_lock)
device_lock(devlink->dev);
devl_lock(devlink);
}
static inline void devl_dev_unlock(struct devlink *devlink, bool dev_lock)
{
devl_unlock(devlink);
if (dev_lock)
device_unlock(devlink->dev);
}
typedef void devlink_rel_notify_cb_t(struct devlink *devlink, u32 obj_index);
typedef void devlink_rel_cleanup_cb_t(struct devlink *devlink, u32 obj_index,
u32 rel_index);
@@ -111,9 +126,6 @@ int devlink_rel_devlink_handle_put(struct sk_buff *msg, struct devlink *devlink,
bool *msg_updated);
/* Netlink */
#define DEVLINK_NL_FLAG_NEED_PORT BIT(0)
#define DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT BIT(1)
enum devlink_multicast_groups {
DEVLINK_MCGRP_CONFIG,
};
@@ -140,7 +152,8 @@ typedef int devlink_nl_dump_one_func_t(struct sk_buff *msg,
int flags);
struct devlink *
devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs);
devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs,
bool dev_lock);
int devlink_nl_dumpit(struct sk_buff *msg, struct netlink_callback *cb,
devlink_nl_dump_one_func_t *dump_one);
@@ -1151,7 +1151,8 @@ devlink_health_reporter_get_from_cb_lock(struct netlink_callback *cb)
struct nlattr **attrs = info->attrs;
struct devlink *devlink;
devlink = devlink_get_from_attrs_lock(sock_net(cb->skb->sk), attrs);
devlink = devlink_get_from_attrs_lock(sock_net(cb->skb->sk), attrs,
false);
if (IS_ERR(devlink))
return NULL;
@@ -9,6 +9,10 @@
#include "devl_internal.h"
#define DEVLINK_NL_FLAG_NEED_PORT BIT(0)
#define DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT BIT(1)
#define DEVLINK_NL_FLAG_NEED_DEV_LOCK BIT(2)
static const struct genl_multicast_group devlink_nl_mcgrps[] = {
[DEVLINK_MCGRP_CONFIG] = { .name = DEVLINK_GENL_MCGRP_CONFIG_NAME },
};
@@ -61,7 +65,8 @@ int devlink_nl_msg_reply_and_new(struct sk_buff **msg, struct genl_info *info)
}
struct devlink *
devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs)
devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs,
bool dev_lock)
{
struct devlink *devlink;
unsigned long index;
@@ -75,12 +80,12 @@ devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs)
devname = nla_data(attrs[DEVLINK_ATTR_DEV_NAME]);
devlinks_xa_for_each_registered_get(net, index, devlink) {
devl_lock(devlink);
devl_dev_lock(devlink, dev_lock);
if (devl_is_registered(devlink) &&
strcmp(devlink->dev->bus->name, busname) == 0 &&
strcmp(dev_name(devlink->dev), devname) == 0)
return devlink;
devl_unlock(devlink);
devl_dev_unlock(devlink, dev_lock);
devlink_put(devlink);
}
@@ -90,11 +95,13 @@ devlink_get_from_attrs_lock(struct net *net, struct nlattr **attrs)
static int __devlink_nl_pre_doit(struct sk_buff *skb, struct genl_info *info,
u8 flags)
{
bool dev_lock = flags & DEVLINK_NL_FLAG_NEED_DEV_LOCK;
struct devlink_port *devlink_port;
struct devlink *devlink;
int err;
devlink = devlink_get_from_attrs_lock(genl_info_net(info), info->attrs);
devlink = devlink_get_from_attrs_lock(genl_info_net(info), info->attrs,
dev_lock);
if (IS_ERR(devlink))
return PTR_ERR(devlink);
@@ -114,7 +121,7 @@ static int __devlink_nl_pre_doit(struct sk_buff *skb, struct genl_info *info,
return 0;
unlock:
devl_unlock(devlink);
devl_dev_unlock(devlink, dev_lock);
devlink_put(devlink);
return err;
}
@@ -131,6 +138,12 @@ int devlink_nl_pre_doit_port(const struct genl_split_ops *ops,
return __devlink_nl_pre_doit(skb, info, DEVLINK_NL_FLAG_NEED_PORT);
}
int devlink_nl_pre_doit_dev_lock(const struct genl_split_ops *ops,
struct sk_buff *skb, struct genl_info *info)
{
return __devlink_nl_pre_doit(skb, info, DEVLINK_NL_FLAG_NEED_DEV_LOCK);
}
int devlink_nl_pre_doit_port_optional(const struct genl_split_ops *ops,
struct sk_buff *skb,
struct genl_info *info)
@@ -138,16 +151,30 @@ int devlink_nl_pre_doit_port_optional(const struct genl_split_ops *ops,
return __devlink_nl_pre_doit(skb, info, DEVLINK_NL_FLAG_NEED_DEVLINK_OR_PORT);
}
void devlink_nl_post_doit(const struct genl_split_ops *ops,
struct sk_buff *skb, struct genl_info *info)
static void __devlink_nl_post_doit(struct sk_buff *skb, struct genl_info *info,
u8 flags)
{
bool dev_lock = flags & DEVLINK_NL_FLAG_NEED_DEV_LOCK;
struct devlink *devlink;
devlink = info->user_ptr[0];
devl_unlock(devlink);
devl_dev_unlock(devlink, dev_lock);
devlink_put(devlink);
}
void devlink_nl_post_doit(const struct genl_split_ops *ops,
struct sk_buff *skb, struct genl_info *info)
{
__devlink_nl_post_doit(skb, info, 0);
}
void
devlink_nl_post_doit_dev_lock(const struct genl_split_ops *ops,
struct sk_buff *skb, struct genl_info *info)
{
__devlink_nl_post_doit(skb, info, DEVLINK_NL_FLAG_NEED_DEV_LOCK);
}
static int devlink_nl_inst_single_dumpit(struct sk_buff *msg,
struct netlink_callback *cb, int flags,
devlink_nl_dump_one_func_t *dump_one,
@@ -156,7 +183,7 @@ static int devlink_nl_inst_single_dumpit(struct sk_buff *msg,
struct devlink *devlink;
int err;
devlink = devlink_get_from_attrs_lock(sock_net(msg->sk), attrs);
devlink = devlink_get_from_attrs_lock(sock_net(msg->sk), attrs, false);
if (IS_ERR(devlink))
return PTR_ERR(devlink);
err = dump_one(msg, devlink, cb, flags | NLM_F_DUMP_FILTERED);
@@ -846,9 +846,9 @@ const struct genl_split_ops devlink_nl_ops[73] = {
{
.cmd = DEVLINK_CMD_RELOAD,
.validate = GENL_DONT_VALIDATE_STRICT,
.pre_doit = devlink_nl_pre_doit,
.pre_doit = devlink_nl_pre_doit_dev_lock,
.doit = devlink_nl_reload_doit,
.post_doit = devlink_nl_post_doit,
.post_doit = devlink_nl_post_doit_dev_lock,
.policy = devlink_reload_nl_policy,
.maxattr = DEVLINK_ATTR_RELOAD_LIMITS,
.flags = GENL_ADMIN_PERM | GENL_CMD_CAP_DO,
@@ -22,12 +22,17 @@ int devlink_nl_pre_doit(const struct genl_split_ops *ops, struct sk_buff *skb,
struct genl_info *info);
int devlink_nl_pre_doit_port(const struct genl_split_ops *ops,
struct sk_buff *skb, struct genl_info *info);
int devlink_nl_pre_doit_dev_lock(const struct genl_split_ops *ops,
struct sk_buff *skb, struct genl_info *info);
int devlink_nl_pre_doit_port_optional(const struct genl_split_ops *ops,
struct sk_buff *skb,
struct genl_info *info);
void
devlink_nl_post_doit(const struct genl_split_ops *ops, struct sk_buff *skb,
struct genl_info *info);
void
devlink_nl_post_doit_dev_lock(const struct genl_split_ops *ops,
struct sk_buff *skb, struct genl_info *info);
int devlink_nl_get_doit(struct sk_buff *skb, struct genl_info *info);
int devlink_nl_get_dumpit(struct sk_buff *skb, struct netlink_callback *cb);
@@ -883,7 +883,8 @@ int devlink_nl_region_read_dumpit(struct sk_buff *skb,
start_offset = state->start_offset;
devlink = devlink_get_from_attrs_lock(sock_net(cb->skb->sk), attrs);
devlink = devlink_get_from_attrs_lock(sock_net(cb->skb->sk), attrs,
false);
if (IS_ERR(devlink))
return PTR_ERR(devlink);
#!/bin/bash
# SPDX-License-Identifier: GPL-2.0
#
# Test that PCI reset works correctly by verifying that only the expected reset
# methods are supported and that after issuing the reset the ifindex of the
# port changes.
lib_dir=$(dirname $0)/../../../net/forwarding
ALL_TESTS="
pci_reset_test
"
NUM_NETIFS=1
source $lib_dir/lib.sh
source $lib_dir/devlink_lib.sh
pci_reset_test()
{
RET=0
local bus=$(echo $DEVLINK_DEV | cut -d '/' -f 1)
local bdf=$(echo $DEVLINK_DEV | cut -d '/' -f 2)
if [ $bus != "pci" ]; then
check_err 1 "devlink device is not a PCI device"
log_test "pci reset"
return
fi
if [ ! -f /sys/bus/pci/devices/$bdf/reset_method ]; then
check_err 1 "reset is not supported"
log_test "pci reset"
return
fi
[[ $(cat /sys/bus/pci/devices/$bdf/reset_method) == "bus" ]]
check_err $? "only \"bus\" reset method should be supported"
local ifindex_pre=$(ip -j link show dev $swp1 | jq '.[]["ifindex"]')
echo 1 > /sys/bus/pci/devices/$bdf/reset
check_err $? "reset failed"
# Wait for udev to rename newly created netdev.
udevadm settle
local ifindex_post=$(ip -j link show dev $swp1 | jq '.[]["ifindex"]')
[[ $ifindex_pre != $ifindex_post ]]
check_err $? "reset not performed"
log_test "pci reset"
}
swp1=${NETIFS[p1]}
tests_run
exit $EXIT_STATUS