Commit 63173885 authored by Jakub Kicinski

Merge branch 'ethtool-provide-the-dim-profile-fine-tuning-channel'

Heng Qi says:

====================
ethtool: provide the dim profile fine-tuning channel

The NetDIM library provides excellent acceleration for many modern
network cards. However, the default DIM profiles limit its maximum
capabilities for different NICs, so it is necessary to provide a way
for each NIC to be custom configured.

Currently, this is done through the commonly used "ethtool -C".

For example,
on the server side, the virtio-net NIC with rx dim enabled has 8
queues and runs nginx.
The client uses the following command to send traffic to the server:
  ./wrk http://server_ip:80 -c 64 -t 5 -d 30

Then adjust the default rx-profile for server dim to

  {.usec =   1, .pkts = 256, .comps = n/a,},
  {.usec =   8, .pkts = 256, .comps = n/a,},
  {.usec =  30, .pkts = 256, .comps = n/a,},
  {.usec =  64, .pkts = 256, .comps = n/a,},
  {.usec = 128, .pkts = 256, .comps = n/a,}

The server PPS is improved by 20%+.
====================

Link: https://patch.msgid.link/20240621101353.107425-1-hengqi@linux.alibaba.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents c84f9324 dcb67f6a
......@@ -414,6 +414,26 @@ attribute-sets:
name: combined-count
type: u32
-
name: irq-moderation
attributes:
-
name: usec
type: u32
-
name: pkts
type: u32
-
name: comps
type: u32
-
name: profile
attributes:
-
name: irq-moderation
type: nest
multi-attr: true
nested-attributes: irq-moderation
-
name: coalesce
attributes:
......@@ -502,6 +522,15 @@ attribute-sets:
-
name: tx-aggr-time-usecs
type: u32
-
name: rx-profile
type: nest
nested-attributes: profile
-
name: tx-profile
type: nest
nested-attributes: profile
-
name: pause-stat
attributes:
......@@ -1325,6 +1354,8 @@ operations:
- tx-aggr-max-bytes
- tx-aggr-max-frames
- tx-aggr-time-usecs
- rx-profile
- tx-profile
dump: *coalesce-get-op
-
name: coalesce-set
......
......@@ -1033,6 +1033,8 @@ Kernel response contents:
``ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES`` u32 max aggr size, Tx
``ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES`` u32 max aggr packets, Tx
``ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS`` u32 time (us), aggr, Tx
``ETHTOOL_A_COALESCE_RX_PROFILE`` nested profile of DIM, Rx
``ETHTOOL_A_COALESCE_TX_PROFILE`` nested profile of DIM, Tx
=========================================== ====== =======================
Attributes are only included in reply if their value is not zero or the
......@@ -1062,6 +1064,10 @@ block should be sent.
This feature is mainly of interest for specific USB devices which do not cope
well with frequent small-sized URB transmissions.
``ETHTOOL_A_COALESCE_RX_PROFILE`` and ``ETHTOOL_A_COALESCE_TX_PROFILE`` refer
to DIM parameters, see `Generic Network Dynamic Interrupt Moderation (Net DIM)
<https://www.kernel.org/doc/Documentation/networking/net_dim.rst>`_.
COALESCE_SET
============
......@@ -1098,6 +1104,8 @@ Request contents:
``ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES`` u32 max aggr size, Tx
``ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES`` u32 max aggr packets, Tx
``ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS`` u32 time (us), aggr, Tx
``ETHTOOL_A_COALESCE_RX_PROFILE`` nested profile of DIM, Rx
``ETHTOOL_A_COALESCE_TX_PROFILE`` nested profile of DIM, Tx
=========================================== ====== =======================
Request is rejected if it contains attributes declared as unsupported by driver (i.e.
......
......@@ -169,6 +169,48 @@ usage is not complete but it should make the outline of the usage clear.
...
}
Tuning DIM
==========
Net DIM serves a range of network devices and delivers excellent acceleration
benefits. Yet, it has been observed that some preset configurations of DIM may
not align seamlessly with the varying specifications of network devices, and
this mismatch in profiles has been identified as a factor in the suboptimal
performance of DIM-enabled network devices.
To address this issue, Net DIM introduces a per-device control to modify and
access a device's ``rx-profile`` and ``tx-profile`` parameters:
Assume that the target network device is named ethx, and ethx only declares
support for RX profile setting and supports modification of ``usec`` field
and ``pkts`` field (See the data structure:
:c:type:`struct dim_cq_moder <dim_cq_moder>`).
You can use ethtool to modify the current RX DIM profile where all
values are 64::
$ ethtool -C ethx rx-profile 1,1,n_2,2,n_3,n,n_n,4,n_n,n,n
``n`` means do not modify this field, and ``_`` separates structure
elements of the profile array.
Query the current profiles using::
$ ethtool -c ethx
...
rx-profile:
{.usec = 1, .pkts = 1, .comps = n/a,},
{.usec = 2, .pkts = 2, .comps = n/a,},
{.usec = 3, .pkts = 64, .comps = n/a,},
{.usec = 64, .pkts = 4, .comps = n/a,},
{.usec = 64, .pkts = 64, .comps = n/a,}
tx-profile: n/a
If the network device does not support specific fields of DIM profiles,
``n/a`` is displayed for those fields. An attempt to modify an ``n/a``
field is rejected and an error message is reported.
Dynamic Interrupt Moderation (DIM) library API
==============================================
......
......@@ -2469,6 +2469,13 @@ static int virtnet_enable_queue_pair(struct virtnet_info *vi, int qp_index)
return err;
}
/*
 * Cancel the DIM work of one queue.
 *
 * The cancel is only issued when the device negotiated
 * VIRTIO_NET_F_VQ_NOTF_COAL; without that feature this is a no-op.
 */
static void virtnet_cancel_dim(struct virtnet_info *vi, struct dim *dim)
{
	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL))
		net_dim_work_cancel(dim);
}
static int virtnet_open(struct net_device *dev)
{
struct virtnet_info *vi = netdev_priv(dev);
......@@ -2495,7 +2502,7 @@ static int virtnet_open(struct net_device *dev)
for (i--; i >= 0; i--) {
virtnet_disable_queue_pair(vi, i);
cancel_work_sync(&vi->rq[i].dim.work);
virtnet_cancel_dim(vi, &vi->rq[i].dim);
}
return err;
......@@ -2671,7 +2678,7 @@ static int virtnet_rx_resize(struct virtnet_info *vi,
if (running) {
napi_disable(&rq->napi);
cancel_work_sync(&rq->dim.work);
virtnet_cancel_dim(vi, &rq->dim);
}
err = virtqueue_resize(rq->vq, ring_num, virtnet_rq_unmap_free_buf);
......@@ -2932,7 +2939,7 @@ static int virtnet_close(struct net_device *dev)
for (i = 0; i < vi->max_queue_pairs; i++) {
virtnet_disable_queue_pair(vi, i);
cancel_work_sync(&vi->rq[i].dim.work);
virtnet_cancel_dim(vi, &vi->rq[i].dim);
}
return 0;
......@@ -4458,7 +4465,7 @@ static void virtnet_rx_dim_work(struct work_struct *work)
if (!rq->dim_enabled)
goto out;
update_moder = net_dim_get_rx_moderation(dim->mode, dim->profile_ix);
update_moder = net_dim_get_rx_irq_moder(dev, dim);
if (update_moder.usec != rq->intr_coal.max_usecs ||
update_moder.pkts != rq->intr_coal.max_packets) {
err = virtnet_send_rx_ctrl_coal_vq_cmd(vi, qnum,
......@@ -5158,6 +5165,36 @@ static void virtnet_tx_timeout(struct net_device *dev, unsigned int txqueue)
jiffies_to_usecs(jiffies - READ_ONCE(txq->trans_start)));
}
/*
 * Register the device's irq moderation capabilities with the DIM library
 * and bind every receive queue's DIM context to it.
 *
 * Only the Rx profile is declared as modifiable, and within it only the
 * usec and pkts fields. No Tx worker is supplied (NULL) and the Tx mode
 * argument is passed as 0.
 *
 * Return: 0 on success, or the error returned by net_dim_init_irq_moder().
 */
static int virtnet_init_irq_moder(struct virtnet_info *vi)
{
	u8 coal_flags = DIM_COALESCE_USEC | DIM_COALESCE_PKTS;
	u8 profile_flags = DIM_PROFILE_RX;
	int err, qp;

	err = net_dim_init_irq_moder(vi->dev, profile_flags, coal_flags,
				     DIM_CQ_PERIOD_MODE_START_FROM_EQE,
				     0, virtnet_rx_dim_work, NULL);
	if (err)
		return err;

	/* is_tx == false: set up each rq's DIM context for the Rx direction. */
	for (qp = 0; qp < vi->max_queue_pairs; qp++)
		net_dim_setting(vi->dev, &vi->rq[qp].dim, false);

	return 0;
}
/*
 * Release the irq moderation state attached to the net device.
 *
 * Nothing is done unless VIRTIO_NET_F_VQ_NOTF_COAL was negotiated.
 * net_dim_free_irq_moder() is called with the RTNL lock held, which is
 * taken and dropped here.
 */
static void virtnet_free_irq_moder(struct virtnet_info *vi)
{
	if (virtio_has_feature(vi->vdev, VIRTIO_NET_F_VQ_NOTF_COAL)) {
		rtnl_lock();
		net_dim_free_irq_moder(vi->dev);
		rtnl_unlock();
	}
}
static const struct net_device_ops virtnet_netdev = {
.ndo_open = virtnet_open,
.ndo_stop = virtnet_close,
......@@ -5437,9 +5474,6 @@ static int virtnet_alloc_queues(struct virtnet_info *vi)
virtnet_poll_tx,
napi_tx ? napi_weight : 0);
INIT_WORK(&vi->rq[i].dim.work, virtnet_rx_dim_work);
vi->rq[i].dim.mode = DIM_CQ_PERIOD_MODE_START_FROM_EQE;
sg_init_table(vi->rq[i].sg, ARRAY_SIZE(vi->rq[i].sg));
ewma_pkt_len_init(&vi->rq[i].mrg_avg_pkt_len);
sg_init_table(vi->sq[i].sg, ARRAY_SIZE(vi->sq[i].sg));
......@@ -5868,6 +5902,10 @@ static int virtnet_probe(struct virtio_device *vdev)
for (i = 0; i < vi->max_queue_pairs; i++)
if (vi->sq[i].napi.weight)
vi->sq[i].intr_coal.max_packets = 1;
err = virtnet_init_irq_moder(vi);
if (err)
goto free;
}
#ifdef CONFIG_SYSFS
......@@ -6019,6 +6057,8 @@ static void virtnet_remove(struct virtio_device *vdev)
disable_rx_mode_work(vi);
flush_work(&vi->rx_mode_work);
virtnet_free_irq_moder(vi);
unregister_netdev(vi->dev);
net_failover_destroy(vi->failover);
......
......@@ -22,7 +22,7 @@ config FSL_GUTS
config FSL_MC_DPIO
tristate "QorIQ DPAA2 DPIO driver"
depends on FSL_MC_BUS
depends on FSL_MC_BUS && NET
select SOC_BUS
select FSL_GUTS
select DIMLIB
......
......@@ -10,6 +10,15 @@
#include <linux/types.h>
#include <linux/workqueue.h>
struct net_device;
/* Number of DIM profiles and period mode. */
#define NET_DIM_PARAMS_NUM_PROFILES 5
#define NET_DIM_DEFAULT_RX_CQ_PKTS_FROM_EQE 256
#define NET_DIM_DEFAULT_TX_CQ_PKTS_FROM_EQE 128
#define NET_DIM_DEF_PROFILE_CQE 1
#define NET_DIM_DEF_PROFILE_EQE 1
/*
* Number of events between DIM iterations.
* Causes a moderation of the algorithm run.
......@@ -38,12 +47,45 @@
* @pkts: CQ packet counter suggestion (by DIM)
* @comps: Completion counter
* @cq_period_mode: CQ period count mode (from CQE/EQE)
* @rcu: for asynchronous kfree_rcu
*/
struct dim_cq_moder {
/* NOTE(review): presumably the moderation time in microseconds; its kernel-doc line is outside this view. */
u16 usec;
u16 pkts; /* CQ packet counter suggestion (by DIM) */
u16 comps; /* completion counter */
u8 cq_period_mode; /* CQ period count mode (from CQE/EQE) */
/*
 * Lets a profile array be released with kfree_rcu(ptr, rcu). Because it is
 * an ordinary member, by-value copies of this struct carry it along too.
 */
struct rcu_head rcu;
};
/* Bits for dim_irq_moder.profile_flags: which direction's profile can be modified. */
#define DIM_PROFILE_RX BIT(0) /* support rx profile modification */
#define DIM_PROFILE_TX BIT(1) /* support tx profile modification */
/*
 * Bits for dim_irq_moder.coal_flags: which profile fields can be modified.
 * These are stored in a different field than DIM_PROFILE_*, so the
 * overlapping bit positions do not clash.
 */
#define DIM_COALESCE_USEC BIT(0) /* support usec field modification */
#define DIM_COALESCE_PKTS BIT(1) /* support pkts field modification */
#define DIM_COALESCE_COMPS BIT(2) /* support comps field modification */
/**
 * struct dim_irq_moder - Structure for irq moderation information.
 * Used to collect irq moderation related information. One instance is
 * allocated per net device by net_dim_init_irq_moder() and reached
 * through net_device::irq_moder.
 *
 * @profile_flags: DIM_PROFILE_*
 * @coal_flags: DIM_COALESCE_* for Rx and Tx
 * @dim_rx_mode: Rx DIM period count mode: CQE or EQE
 * @dim_tx_mode: Tx DIM period count mode: CQE or EQE
 * @rx_profile: DIM profile list for Rx; RCU-protected array of
 *	NET_DIM_PARAMS_NUM_PROFILES entries, only allocated when
 *	DIM_PROFILE_RX is set in @profile_flags
 * @tx_profile: DIM profile list for Tx; RCU-protected array of
 *	NET_DIM_PARAMS_NUM_PROFILES entries, only allocated when
 *	DIM_PROFILE_TX is set in @profile_flags
 * @rx_dim_work: Rx DIM worker scheduled by net_dim()
 * @tx_dim_work: Tx DIM worker scheduled by net_dim()
 */
struct dim_irq_moder {
u8 profile_flags;
u8 coal_flags;
u8 dim_rx_mode;
u8 dim_tx_mode;
struct dim_cq_moder __rcu *rx_profile;
struct dim_cq_moder __rcu *tx_profile;
void (*rx_dim_work)(struct work_struct *work);
void (*tx_dim_work)(struct work_struct *work);
};
/**
......@@ -191,6 +233,77 @@ enum dim_step_result {
DIM_ON_EDGE,
};
/**
 * net_dim_init_irq_moder - collect information to initialize irq moderation
 * @dev: target network device
 * @profile_flags: Rx or Tx profile modification capability (DIM_PROFILE_*)
 * @coal_flags: irq moderation params flags (DIM_COALESCE_*)
 * @rx_mode: CQ period mode for Rx; only used if DIM_PROFILE_RX is set
 * @tx_mode: CQ period mode for Tx; only used if DIM_PROFILE_TX is set
 * @rx_dim_work: Rx worker called after dim decision; only recorded if
 *	DIM_PROFILE_RX is set
 * @tx_dim_work: Tx worker called after dim decision; only recorded if
 *	DIM_PROFILE_TX is set
 *
 * Allocates the per-device moderation state with GFP_KERNEL and, for each
 * direction selected in @profile_flags, a private copy of the default
 * profile table for the given mode.
 *
 * Return: 0 on success or a negative error code (-ENOMEM if an allocation
 * fails).
 */
int net_dim_init_irq_moder(struct net_device *dev, u8 profile_flags,
u8 coal_flags, u8 rx_mode, u8 tx_mode,
void (*rx_dim_work)(struct work_struct *work),
void (*tx_dim_work)(struct work_struct *work));
/**
 * net_dim_free_irq_moder - free fields for irq moderation
 * @dev: target network device
 *
 * Does nothing if @dev has no irq moderation state. The caller must hold
 * the RTNL lock; the profile pointers are read with rtnl_dereference().
 */
void net_dim_free_irq_moder(struct net_device *dev);
/**
 * net_dim_setting - initialize DIM's cq mode and work item
 * @dev: target network device
 * @dim: DIM context
 * @is_tx: true indicates the tx direction, false indicates the rx direction
 *
 * Binds @dim's work item to the worker registered for the chosen direction
 * and copies the device's period mode for that direction into @dim. The
 * work is only initialized here, not queued. Does nothing if @dev has no
 * irq moderation state.
 */
void net_dim_setting(struct net_device *dev, struct dim *dim, bool is_tx);
/**
* net_dim_work_cancel - synchronously cancel dim's worker
* @dim: DIM context
*/
void net_dim_work_cancel(struct dim *dim);
/**
* net_dim_get_rx_irq_moder - get DIM rx results based on profile_ix
* @dev: target network device
* @dim: DIM context
*
* Return: DIM irq moderation
*/
struct dim_cq_moder
net_dim_get_rx_irq_moder(struct net_device *dev, struct dim *dim);
/**
* net_dim_get_tx_irq_moder - get DIM tx results based on profile_ix
* @dev: target network device
* @dim: DIM context
*
* Return: DIM irq moderation
*/
struct dim_cq_moder
net_dim_get_tx_irq_moder(struct net_device *dev, struct dim *dim);
/**
* net_dim_set_rx_mode - set DIM rx cq mode
* @dev: target network device
* @rx_mode: target rx cq mode
*/
void net_dim_set_rx_mode(struct net_device *dev, u8 rx_mode);
/**
* net_dim_set_tx_mode - set DIM tx cq mode
* @dev: target network device
* @tx_mode: target tx cq mode
*/
void net_dim_set_tx_mode(struct net_device *dev, u8 tx_mode);
/**
* dim_on_top - check if current state is a good place to stop (top location)
* @dim: DIM context
......
......@@ -284,7 +284,9 @@ bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32,
#define ETHTOOL_COALESCE_TX_AGGR_MAX_BYTES BIT(24)
#define ETHTOOL_COALESCE_TX_AGGR_MAX_FRAMES BIT(25)
#define ETHTOOL_COALESCE_TX_AGGR_TIME_USECS BIT(26)
#define ETHTOOL_COALESCE_ALL_PARAMS GENMASK(26, 0)
#define ETHTOOL_COALESCE_RX_PROFILE BIT(27)
#define ETHTOOL_COALESCE_TX_PROFILE BIT(28)
#define ETHTOOL_COALESCE_ALL_PARAMS GENMASK(28, 0)
#define ETHTOOL_COALESCE_USECS \
(ETHTOOL_COALESCE_RX_USECS | ETHTOOL_COALESCE_TX_USECS)
......
......@@ -2402,6 +2402,9 @@ struct net_device {
/** @page_pools: page pools created for this netdevice */
struct hlist_head page_pools;
#endif
/** @irq_moder: dim parameters used if IS_ENABLED(CONFIG_DIMLIB). */
struct dim_irq_moder *irq_moder;
};
#define to_net_dev(d) container_of(d, struct net_device, dev)
......
......@@ -415,12 +415,34 @@ enum {
ETHTOOL_A_COALESCE_TX_AGGR_MAX_BYTES, /* u32 */
ETHTOOL_A_COALESCE_TX_AGGR_MAX_FRAMES, /* u32 */
ETHTOOL_A_COALESCE_TX_AGGR_TIME_USECS, /* u32 */
/* nest - _A_PROFILE_IRQ_MODERATION */
ETHTOOL_A_COALESCE_RX_PROFILE,
/* nest - _A_PROFILE_IRQ_MODERATION */
ETHTOOL_A_COALESCE_TX_PROFILE,
/* add new constants above here */
__ETHTOOL_A_COALESCE_CNT,
ETHTOOL_A_COALESCE_MAX = (__ETHTOOL_A_COALESCE_CNT - 1)
};
/*
 * Attributes nested inside ETHTOOL_A_COALESCE_RX_PROFILE and
 * ETHTOOL_A_COALESCE_TX_PROFILE. Per the netlink spec,
 * _A_PROFILE_IRQ_MODERATION is a multi-attr nest.
 */
enum {
ETHTOOL_A_PROFILE_UNSPEC,
/* nest, _A_IRQ_MODERATION_* */
ETHTOOL_A_PROFILE_IRQ_MODERATION,
__ETHTOOL_A_PROFILE_CNT,
ETHTOOL_A_PROFILE_MAX = (__ETHTOOL_A_PROFILE_CNT - 1)
};
/* Attributes of one ETHTOOL_A_PROFILE_IRQ_MODERATION nest (usec/pkts/comps). */
enum {
ETHTOOL_A_IRQ_MODERATION_UNSPEC,
ETHTOOL_A_IRQ_MODERATION_USEC, /* u32 */
ETHTOOL_A_IRQ_MODERATION_PKTS, /* u32 */
ETHTOOL_A_IRQ_MODERATION_COMPS, /* u32 */
__ETHTOOL_A_IRQ_MODERATION_CNT,
ETHTOOL_A_IRQ_MODERATION_MAX = (__ETHTOOL_A_IRQ_MODERATION_CNT - 1)
};
/* PAUSE */
enum {
......
......@@ -623,6 +623,7 @@ config SIGNATURE
config DIMLIB
tristate
depends on NET
help
Dynamic Interrupt Moderation library.
Implements an algorithm for dynamically changing CQ moderation values
......
......@@ -4,6 +4,7 @@
*/
#include <linux/dim.h>
#include <linux/rtnetlink.h>
/*
* Net DIM profiles:
......@@ -11,12 +12,6 @@
* There are different set of profiles for RX/TX CQs.
* Each profile size must be of NET_DIM_PARAMS_NUM_PROFILES
*/
#define NET_DIM_PARAMS_NUM_PROFILES 5
#define NET_DIM_DEFAULT_RX_CQ_PKTS_FROM_EQE 256
#define NET_DIM_DEFAULT_TX_CQ_PKTS_FROM_EQE 128
#define NET_DIM_DEF_PROFILE_CQE 1
#define NET_DIM_DEF_PROFILE_EQE 1
#define NET_DIM_RX_EQE_PROFILES { \
{.usec = 1, .pkts = NET_DIM_DEFAULT_RX_CQ_PKTS_FROM_EQE,}, \
{.usec = 8, .pkts = NET_DIM_DEFAULT_RX_CQ_PKTS_FROM_EQE,}, \
......@@ -101,6 +96,143 @@ net_dim_get_def_tx_moderation(u8 cq_period_mode)
}
EXPORT_SYMBOL(net_dim_get_def_tx_moderation);
/*
 * Allocate the irq moderation state for @dev and, for each direction
 * enabled in @profile_flags, duplicate the default profile table selected
 * by @rx_mode / @tx_mode so the device owns a private, modifiable copy.
 *
 * The state is built in a local object and attached to dev->irq_moder only
 * once every allocation has succeeded. On failure dev->irq_moder is left
 * untouched, so it never points at freed memory and a later
 * net_dim_free_irq_moder() or net_dim_setting() stays safe.
 *
 * Return: 0 on success, -ENOMEM if any allocation fails.
 */
int net_dim_init_irq_moder(struct net_device *dev, u8 profile_flags,
			   u8 coal_flags, u8 rx_mode, u8 tx_mode,
			   void (*rx_dim_work)(struct work_struct *work),
			   void (*tx_dim_work)(struct work_struct *work))
{
	struct dim_cq_moder *rxp = NULL, *txp;
	struct dim_irq_moder *moder;
	int len;

	moder = kzalloc(sizeof(*moder), GFP_KERNEL);
	if (!moder)
		return -ENOMEM;

	/* Size of one whole profile table (all entries of one direction). */
	len = NET_DIM_PARAMS_NUM_PROFILES * sizeof(*moder->rx_profile);

	moder->coal_flags = coal_flags;
	moder->profile_flags = profile_flags;

	if (profile_flags & DIM_PROFILE_RX) {
		moder->rx_dim_work = rx_dim_work;
		moder->dim_rx_mode = rx_mode;
		rxp = kmemdup(rx_profile[rx_mode], len, GFP_KERNEL);
		if (!rxp)
			goto free_moder;

		rcu_assign_pointer(moder->rx_profile, rxp);
	}

	if (profile_flags & DIM_PROFILE_TX) {
		moder->tx_dim_work = tx_dim_work;
		moder->dim_tx_mode = tx_mode;
		txp = kmemdup(tx_profile[tx_mode], len, GFP_KERNEL);
		if (!txp)
			goto free_rxp;

		rcu_assign_pointer(moder->tx_profile, txp);
	}

	/* Publish only a fully initialized object. */
	dev->irq_moder = moder;

	return 0;

free_rxp:
	/* rxp is still NULL here when only the Tx direction was requested. */
	kfree(rxp);
free_moder:
	kfree(moder);
	return -ENOMEM;
}
EXPORT_SYMBOL(net_dim_init_irq_moder);
/* RTNL lock is held. */
void net_dim_free_irq_moder(struct net_device *dev)
{
struct dim_cq_moder *rxp, *txp;
if (!dev->irq_moder)
return;
rxp = rtnl_dereference(dev->irq_moder->rx_profile);
txp = rtnl_dereference(dev->irq_moder->tx_profile);
rcu_assign_pointer(dev->irq_moder->rx_profile, NULL);
rcu_assign_pointer(dev->irq_moder->tx_profile, NULL);
kfree_rcu(rxp, rcu);
kfree_rcu(txp, rcu);
kfree(dev->irq_moder);
}
EXPORT_SYMBOL(net_dim_free_irq_moder);
/*
 * Prepare @dim for one direction: bind its work item to the worker
 * registered for that direction and copy the device's period mode into it.
 * Silently returns when @dev has no irq moderation state.
 */
void net_dim_setting(struct net_device *dev, struct dim *dim, bool is_tx)
{
	struct dim_irq_moder *moder = dev->irq_moder;

	if (!moder)
		return;

	if (!is_tx) {
		INIT_WORK(&dim->work, moder->rx_dim_work);
		dim->mode = READ_ONCE(moder->dim_rx_mode);
	} else {
		INIT_WORK(&dim->work, moder->tx_dim_work);
		dim->mode = READ_ONCE(moder->dim_tx_mode);
	}
}
EXPORT_SYMBOL(net_dim_setting);
/* Synchronously cancel @dim's work item via cancel_work_sync(). */
void net_dim_work_cancel(struct dim *dim)
{
cancel_work_sync(&dim->work);
}
EXPORT_SYMBOL(net_dim_work_cancel);
struct dim_cq_moder net_dim_get_rx_irq_moder(struct net_device *dev,
struct dim *dim)
{
struct dim_cq_moder res, *profile;
rcu_read_lock();
profile = rcu_dereference(dev->irq_moder->rx_profile);
res = profile[dim->profile_ix];
rcu_read_unlock();
res.cq_period_mode = dim->mode;
return res;
}
EXPORT_SYMBOL(net_dim_get_rx_irq_moder);
struct dim_cq_moder net_dim_get_tx_irq_moder(struct net_device *dev,
struct dim *dim)
{
struct dim_cq_moder res, *profile;
rcu_read_lock();
profile = rcu_dereference(dev->irq_moder->tx_profile);
res = profile[dim->profile_ix];
rcu_read_unlock();
res.cq_period_mode = dim->mode;
return res;
}
EXPORT_SYMBOL(net_dim_get_tx_irq_moder);
/*
 * Record @rx_mode as the device's Rx DIM period mode. The store uses
 * WRITE_ONCE(); net_dim_setting() reads the field with READ_ONCE().
 * dev->irq_moder is dereferenced without a NULL check.
 */
void net_dim_set_rx_mode(struct net_device *dev, u8 rx_mode)
{
WRITE_ONCE(dev->irq_moder->dim_rx_mode, rx_mode);
}
EXPORT_SYMBOL(net_dim_set_rx_mode);
/*
 * Record @tx_mode as the device's Tx DIM period mode. The store uses
 * WRITE_ONCE(); net_dim_setting() reads the field with READ_ONCE().
 * dev->irq_moder is dereferenced without a NULL check.
 */
void net_dim_set_tx_mode(struct net_device *dev, u8 tx_mode)
{
WRITE_ONCE(dev->irq_moder->dim_tx_mode, tx_mode);
}
EXPORT_SYMBOL(net_dim_set_tx_mode);
static int net_dim_step(struct dim *dim)
{
if (dim->tired == (NET_DIM_PARAMS_NUM_PROFILES * 2))
......
......@@ -508,6 +508,7 @@ config FAILOVER
config ETHTOOL_NETLINK
bool "Netlink interface for ethtool"
select DIMLIB
default y
help
An alternative userspace interface for ethtool based on generic
......
This diff is collapsed.
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment