Commit 97680ade authored by David S. Miller's avatar David S. Miller

Merge branch 'xps-symmretric-queue-selection'

Amritha Nambiar says:

====================
Symmetric queue selection using XPS for Rx queues

This patch series implements support for Tx queue selection based on
Rx queue(s) map. This is done by configuring Rx queue(s) map per Tx-queue
using sysfs attribute. If the user configuration for Rx queues does
not apply, then the Tx queue selection falls back to XPS using CPUs and
finally to hashing.

XPS is refactored to support Tx queue selection based on either the
CPUs map or the Rx-queues map. The config option CONFIG_XPS needs to be
enabled. By default no receive queues are configured for the Tx queue.

- /sys/class/net/<dev>/queues/tx-*/xps_rxqs

A set of receive queues can be mapped to a set of transmit queues (many:many),
although the common use case is a 1:1 mapping. This will enable sending
packets on the same Tx-Rx queue association as this is useful for busy polling
multi-threaded workloads where it is not possible to pin the threads to
a CPU. This is a rework of Sridhar's patch for symmetric queueing via
socket option:
https://www.spinics.net/lists/netdev/msg453106.html

Testing Hints:
Kernel:  Linux 4.17.0-rc7+
Interface:
driver: ixgbe
version: 5.1.0-k
firmware-version: 0x00015e0b

Configuration:
ethtool -L $iface combined 16
ethtool -C $iface rx-usecs 1000
sysctl net.core.busy_poll=1000
ATR disabled:
ethtool -K $iface ntuple on

Workload:
Modified memcached that changes the thread selection policy to be based
on the incoming rx-queue of a connection using SO_INCOMING_NAPI_ID socket
option. The default is round-robin.

Default: No rxqs_map configured
Symmetric queues: Enable rxqs_map for all queues 1:1 mapped to Tx queue

System:
Architecture:          x86_64
CPU(s):                72
Model name:            Intel(R) Xeon(R) CPU E5-2699 v3 @ 2.30GHz

16 threads  400K requests/sec
=============================
-------------------------------------------------------------------------------
                                Default                 Symmetric queues
-------------------------------------------------------------------------------
RTT min/avg/max                 4/51/2215               2/30/5163
(usec)

intr/sec                        26655                   18606

contextswitch/sec               5145                    4044

insn per cycle                  0.43                    0.72

cache-misses                    6.919                   4.310
(% of all cache refs)

L1-dcache-load-                 4.49                    3.29
-misses
(% of all L1-dcache hits)

LLC-load-misses                 13.26                   8.96
(% of all LL-cache hits)

-------------------------------------------------------------------------------

32 threads  400K requests/sec
=============================
-------------------------------------------------------------------------------
                                Default                 Symmetric queues
-------------------------------------------------------------------------------
RTT min/avg/max                 10/112/5562             9/46/4637
(usec)

intr/sec                        30456                   27666

contextswitch/sec               7552                    5133

insn per cycle                  0.41                    0.49

cache-misses                    9.357                   2.769
(% of all cache refs)

L1-dcache-load-                 4.09                    3.98
-misses
(% of all L1-dcache hits)

LLC-load-misses                 12.96                   3.96
(% of all LL-cache hits)

-------------------------------------------------------------------------------

16 threads  800K requests/sec
=============================
-------------------------------------------------------------------------------
                                Default                 Symmetric queues
-------------------------------------------------------------------------------
RTT min/avg/max                  5/151/4989             9/69/2611
(usec)

intr/sec                        35686                   22907

contextswitch/sec               25522                   12281

insn per cycle                  0.67                    0.74

cache-misses                    8.652                   6.38
(% of all cache refs)

L1-dcache-load-                 3.19                    2.86
-misses
(% of all L1-dcache hits)

LLC-load-misses                 16.53                   11.99
(% of all LL-cache hits)

-------------------------------------------------------------------------------
32 threads  800K requests/sec
=============================
-------------------------------------------------------------------------------
                                Default                 Symmetric queues
-------------------------------------------------------------------------------
RTT min/avg/max                  6/163/6152             8/88/4209
(usec)

intr/sec                        47079                   26548

contextswitch/sec               42190                   39168

insn per cycle                  0.45                    0.54

cache-misses                    8.798                   4.668
(% of all cache refs)

L1-dcache-load-                 6.55                    6.29
-misses
(% of all L1-dcache hits)

LLC-load-misses                 13.91                   10.44
(% of all LL-cache hits)

-------------------------------------------------------------------------------

v6:
- Changed the names of some functions to begin with net_if.
- Cleaned up sk_tx_queue_set/sk_rx_queue_set functions.
- Added sk_rx_queue_clear to make it consistent with tx_queue_mapping
  initialization.
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 1a84d7fd a4fd1f4b
......@@ -42,6 +42,17 @@ Description:
network device transmit queue. Possible vaules depend on the
number of available CPU(s) in the system.
What: /sys/class/<iface>/queues/tx-<queue>/xps_rxqs
Date: June 2018
KernelVersion: 4.18.0
Contact: netdev@vger.kernel.org
Description:
Mask of the receive queue(s) currently enabled to participate
into the Transmit Packet Steering packet processing flow for this
network device transmit queue. Possible values depend on the
number of available receive queue(s) in the network device.
Default is disabled.
What: /sys/class/<iface>/queues/tx-<queue>/byte_queue_limits/hold_time
Date: November 2011
KernelVersion: 3.3
......
......@@ -366,8 +366,13 @@ XPS: Transmit Packet Steering
Transmit Packet Steering is a mechanism for intelligently selecting
which transmit queue to use when transmitting a packet on a multi-queue
device. To accomplish this, a mapping from CPU to hardware queue(s) is
recorded. The goal of this mapping is usually to assign queues
device. This can be accomplished by recording two kinds of maps, either
a mapping of CPU to hardware queue(s) or a mapping of receive queue(s)
to hardware transmit queue(s).
1. XPS using CPUs map
The goal of this mapping is usually to assign queues
exclusively to a subset of CPUs, where the transmit completions for
these queues are processed on a CPU within this set. This choice
provides two benefits. First, contention on the device queue lock is
......@@ -377,15 +382,40 @@ transmit queue). Secondly, cache miss rate on transmit completion is
reduced, in particular for data cache lines that hold the sk_buff
structures.
XPS is configured per transmit queue by setting a bitmap of CPUs that
may use that queue to transmit. The reverse mapping, from CPUs to
transmit queues, is computed and maintained for each network device.
When transmitting the first packet in a flow, the function
get_xps_queue() is called to select a queue. This function uses the ID
of the running CPU as a key into the CPU-to-queue lookup table. If the
2. XPS using receive queues map
This mapping is used to pick transmit queue based on the receive
queue(s) map configuration set by the administrator. A set of receive
queues can be mapped to a set of transmit queues (many:many), although
the common use case is a 1:1 mapping. This will enable sending packets
on the same queue associations for transmit and receive. This is useful for
busy polling multi-threaded workloads where there are challenges in
associating a given CPU to a given application thread. The application
threads are not pinned to CPUs and each thread handles packets
received on a single queue. The receive queue number is cached in the
socket for the connection. In this model, sending the packets on the same
transmit queue corresponding to the associated receive queue has benefits
in keeping the CPU overhead low. Transmit completion work is locked into
the same queue-association that a given application is polling on. This
avoids the overhead of triggering an interrupt on another CPU. When the
application cleans up the packets during the busy poll, transmit completion
may be processed along with it in the same thread context and so result in
reduced latency.
XPS is configured per transmit queue by setting a bitmap of
CPUs/receive-queues that may use that queue to transmit. The reverse
mapping, from CPUs to transmit queues or from receive-queues to transmit
queues, is computed and maintained for each network device. When
transmitting the first packet in a flow, the function get_xps_queue() is
called to select a queue. This function uses the ID of the receive queue
for the socket connection for a match in the receive queue-to-transmit queue
lookup table. Alternatively, this function can also use the ID of the
running CPU as a key into the CPU-to-queue lookup table. If the
ID matches a single queue, that is used for transmission. If multiple
queues match, one is selected by using the flow hash to compute an index
into the set.
into the set. When selecting the transmit queue based on receive queue(s)
map, the transmit device is not validated against the receive device as it
requires expensive lookup operation in the datapath.
The queue chosen for transmitting a particular flow is saved in the
corresponding socket structure for the flow (e.g. a TCP connection).
......@@ -404,11 +434,15 @@ acknowledged.
XPS is only available if the kconfig symbol CONFIG_XPS is enabled (on by
default for SMP). The functionality remains disabled until explicitly
configured. To enable XPS, the bitmap of CPUs that may use a transmit
queue is configured using the sysfs file entry:
configured. To enable XPS, the bitmap of CPUs/receive-queues that may
use a transmit queue is configured using the sysfs file entry:
For selection based on CPUs map:
/sys/class/net/<dev>/queues/tx-<n>/xps_cpus
For selection based on receive-queues map:
/sys/class/net/<dev>/queues/tx-<n>/xps_rxqs
== Suggested Configuration
For a network device with a single transmission queue, XPS configuration
......@@ -421,6 +455,11 @@ best CPUs to share a given queue are probably those that share the cache
with the CPU that processes transmit completions for that queue
(transmit interrupts).
For transmit queue selection based on receive queue(s), XPS has to be
explicitly configured mapping receive-queue(s) to transmit queue(s). If the
user configuration for receive-queue map does not apply, then the transmit
queue is selected based on the CPUs map.
Per TX Queue rate limitation:
=============================
......
......@@ -115,12 +115,17 @@ extern struct cpumask __cpu_active_mask;
#define cpu_active(cpu) ((cpu) == 0)
#endif
/* verify cpu argument to cpumask_* operators */
static inline unsigned int cpumask_check(unsigned int cpu)
static inline void cpu_max_bits_warn(unsigned int cpu, unsigned int bits)
{
#ifdef CONFIG_DEBUG_PER_CPU_MAPS
WARN_ON_ONCE(cpu >= nr_cpumask_bits);
WARN_ON_ONCE(cpu >= bits);
#endif /* CONFIG_DEBUG_PER_CPU_MAPS */
}
/* verify cpu argument to cpumask_* operators */
static inline unsigned int cpumask_check(unsigned int cpu)
{
cpu_max_bits_warn(cpu, nr_cpumask_bits);
return cpu;
}
......
......@@ -731,10 +731,15 @@ struct xps_map {
*/
struct xps_dev_maps {
struct rcu_head rcu;
struct xps_map __rcu *cpu_map[0];
struct xps_map __rcu *attr_map[0]; /* Either CPUs map or RXQs map */
};
#define XPS_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \
#define XPS_CPU_DEV_MAPS_SIZE(_tcs) (sizeof(struct xps_dev_maps) + \
(nr_cpu_ids * (_tcs) * sizeof(struct xps_map *)))
#define XPS_RXQ_DEV_MAPS_SIZE(_tcs, _rxqs) (sizeof(struct xps_dev_maps) +\
(_rxqs * (_tcs) * sizeof(struct xps_map *)))
#endif /* CONFIG_XPS */
#define TC_MAX_QUEUE 16
......@@ -1910,7 +1915,8 @@ struct net_device {
int watchdog_timeo;
#ifdef CONFIG_XPS
struct xps_dev_maps __rcu *xps_maps;
struct xps_dev_maps __rcu *xps_cpus_map;
struct xps_dev_maps __rcu *xps_rxqs_map;
#endif
#ifdef CONFIG_NET_CLS_ACT
struct mini_Qdisc __rcu *miniq_egress;
......@@ -3259,6 +3265,92 @@ static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
#ifdef CONFIG_XPS
int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
u16 index);
int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
u16 index, bool is_rxqs_map);
/**
* netif_attr_test_mask - Test a CPU or Rx queue set in a mask
* @j: CPU/Rx queue index
* @mask: bitmask of all cpus/rx queues
* @nr_bits: number of bits in the bitmask
*
* Test if a CPU or Rx queue index is set in a mask of all CPU/Rx queues.
*/
static inline bool netif_attr_test_mask(unsigned long j,
const unsigned long *mask,
unsigned int nr_bits)
{
cpu_max_bits_warn(j, nr_bits);
return test_bit(j, mask);
}
/**
* netif_attr_test_online - Test for online CPU/Rx queue
* @j: CPU/Rx queue index
* @online_mask: bitmask for CPUs/Rx queues that are online
* @nr_bits: number of bits in the bitmask
*
* Returns true if a CPU/Rx queue is online.
*/
static inline bool netif_attr_test_online(unsigned long j,
const unsigned long *online_mask,
unsigned int nr_bits)
{
cpu_max_bits_warn(j, nr_bits);
if (online_mask)
return test_bit(j, online_mask);
return (j < nr_bits);
}
/**
* netif_attrmask_next - get the next CPU/Rx queue in a cpu/Rx queues mask
* @n: CPU/Rx queue index
* @srcp: the cpumask/Rx queue mask pointer
* @nr_bits: number of bits in the bitmask
*
* Returns >= nr_bits if no further CPUs/Rx queues set.
*/
static inline unsigned int netif_attrmask_next(int n, const unsigned long *srcp,
unsigned int nr_bits)
{
/* -1 is a legal arg here. */
if (n != -1)
cpu_max_bits_warn(n, nr_bits);
if (srcp)
return find_next_bit(srcp, nr_bits, n + 1);
return n + 1;
}
/**
* netif_attrmask_next_and - get the next CPU/Rx queue in *src1p & *src2p
* @n: CPU/Rx queue index
* @src1p: the first CPUs/Rx queues mask pointer
* @src2p: the second CPUs/Rx queues mask pointer
* @nr_bits: number of bits in the bitmask
*
* Returns >= nr_bits if no further CPUs/Rx queues set in both.
*/
static inline int netif_attrmask_next_and(int n, const unsigned long *src1p,
const unsigned long *src2p,
unsigned int nr_bits)
{
/* -1 is a legal arg here. */
if (n != -1)
cpu_max_bits_warn(n, nr_bits);
if (src1p && src2p)
return find_next_and_bit(src1p, src2p, nr_bits, n + 1);
else if (src1p)
return find_next_bit(src1p, nr_bits, n + 1);
else if (src2p)
return find_next_bit(src2p, nr_bits, n + 1);
return n + 1;
}
#else
static inline int netif_set_xps_queue(struct net_device *dev,
const struct cpumask *mask,
......
......@@ -151,6 +151,7 @@ static inline void sk_mark_napi_id(struct sock *sk, const struct sk_buff *skb)
#ifdef CONFIG_NET_RX_BUSY_POLL
sk->sk_napi_id = skb->napi_id;
#endif
sk_rx_queue_set(sk, skb);
}
/* variant used for unconnected sockets */
......
......@@ -139,6 +139,7 @@ typedef __u64 __bitwise __addrpair;
* @skc_node: main hash linkage for various protocol lookup tables
* @skc_nulls_node: main hash linkage for TCP/UDP/UDP-Lite protocol
* @skc_tx_queue_mapping: tx queue number for this connection
* @skc_rx_queue_mapping: rx queue number for this connection
* @skc_flags: place holder for sk_flags
* %SO_LINGER (l_onoff), %SO_BROADCAST, %SO_KEEPALIVE,
* %SO_OOBINLINE settings, %SO_TIMESTAMPING settings
......@@ -214,7 +215,10 @@ struct sock_common {
struct hlist_node skc_node;
struct hlist_nulls_node skc_nulls_node;
};
int skc_tx_queue_mapping;
unsigned short skc_tx_queue_mapping;
#ifdef CONFIG_XPS
unsigned short skc_rx_queue_mapping;
#endif
union {
int skc_incoming_cpu;
u32 skc_rcv_wnd;
......@@ -326,6 +330,9 @@ struct sock {
#define sk_nulls_node __sk_common.skc_nulls_node
#define sk_refcnt __sk_common.skc_refcnt
#define sk_tx_queue_mapping __sk_common.skc_tx_queue_mapping
#ifdef CONFIG_XPS
#define sk_rx_queue_mapping __sk_common.skc_rx_queue_mapping
#endif
#define sk_dontcopy_begin __sk_common.skc_dontcopy_begin
#define sk_dontcopy_end __sk_common.skc_dontcopy_end
......@@ -1681,19 +1688,58 @@ static inline int sk_receive_skb(struct sock *sk, struct sk_buff *skb,
static inline void sk_tx_queue_set(struct sock *sk, int tx_queue)
{
/* sk_tx_queue_mapping accept only upto a 16-bit value */
if (WARN_ON_ONCE((unsigned short)tx_queue >= USHRT_MAX))
return;
sk->sk_tx_queue_mapping = tx_queue;
}
#define NO_QUEUE_MAPPING USHRT_MAX
static inline void sk_tx_queue_clear(struct sock *sk)
{
sk->sk_tx_queue_mapping = -1;
sk->sk_tx_queue_mapping = NO_QUEUE_MAPPING;
}
static inline int sk_tx_queue_get(const struct sock *sk)
{
return sk ? sk->sk_tx_queue_mapping : -1;
if (sk && sk->sk_tx_queue_mapping != NO_QUEUE_MAPPING)
return sk->sk_tx_queue_mapping;
return -1;
}
static inline void sk_rx_queue_set(struct sock *sk, const struct sk_buff *skb)
{
#ifdef CONFIG_XPS
if (skb_rx_queue_recorded(skb)) {
u16 rx_queue = skb_get_rx_queue(skb);
if (WARN_ON_ONCE(rx_queue == NO_QUEUE_MAPPING))
return;
sk->sk_rx_queue_mapping = rx_queue;
}
#endif
}
static inline void sk_rx_queue_clear(struct sock *sk)
{
#ifdef CONFIG_XPS
sk->sk_rx_queue_mapping = NO_QUEUE_MAPPING;
#endif
}
#ifdef CONFIG_XPS
static inline int sk_rx_queue_get(const struct sock *sk)
{
if (sk && sk->sk_rx_queue_mapping != NO_QUEUE_MAPPING)
return sk->sk_rx_queue_mapping;
return -1;
}
#endif
static inline void sk_set_socket(struct sock *sk, struct socket *sock)
{
sk_tx_queue_clear(sk);
......
This diff is collapsed.
......@@ -1227,13 +1227,13 @@ static ssize_t xps_cpus_show(struct netdev_queue *queue,
return -ENOMEM;
rcu_read_lock();
dev_maps = rcu_dereference(dev->xps_maps);
dev_maps = rcu_dereference(dev->xps_cpus_map);
if (dev_maps) {
for_each_possible_cpu(cpu) {
int i, tci = cpu * num_tc + tc;
struct xps_map *map;
map = rcu_dereference(dev_maps->cpu_map[tci]);
map = rcu_dereference(dev_maps->attr_map[tci]);
if (!map)
continue;
......@@ -1283,6 +1283,88 @@ static ssize_t xps_cpus_store(struct netdev_queue *queue,
static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init
= __ATTR_RW(xps_cpus);
static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf)
{
struct net_device *dev = queue->dev;
struct xps_dev_maps *dev_maps;
unsigned long *mask, index;
int j, len, num_tc = 1, tc = 0;
index = get_netdev_queue_index(queue);
if (dev->num_tc) {
num_tc = dev->num_tc;
tc = netdev_txq_to_tc(dev, index);
if (tc < 0)
return -EINVAL;
}
mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long),
GFP_KERNEL);
if (!mask)
return -ENOMEM;
rcu_read_lock();
dev_maps = rcu_dereference(dev->xps_rxqs_map);
if (!dev_maps)
goto out_no_maps;
for (j = -1; j = netif_attrmask_next(j, NULL, dev->num_rx_queues),
j < dev->num_rx_queues;) {
int i, tci = j * num_tc + tc;
struct xps_map *map;
map = rcu_dereference(dev_maps->attr_map[tci]);
if (!map)
continue;
for (i = map->len; i--;) {
if (map->queues[i] == index) {
set_bit(j, mask);
break;
}
}
}
out_no_maps:
rcu_read_unlock();
len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues);
kfree(mask);
return len < PAGE_SIZE ? len : -EINVAL;
}
static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
size_t len)
{
struct net_device *dev = queue->dev;
struct net *net = dev_net(dev);
unsigned long *mask, index;
int err;
if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
return -EPERM;
mask = kcalloc(BITS_TO_LONGS(dev->num_rx_queues), sizeof(long),
GFP_KERNEL);
if (!mask)
return -ENOMEM;
index = get_netdev_queue_index(queue);
err = bitmap_parse(buf, len, mask, dev->num_rx_queues);
if (err) {
kfree(mask);
return err;
}
err = __netif_set_xps_queue(dev, mask, index, true);
kfree(mask);
return err ? : len;
}
static struct netdev_queue_attribute xps_rxqs_attribute __ro_after_init
= __ATTR_RW(xps_rxqs);
#endif /* CONFIG_XPS */
static struct attribute *netdev_queue_default_attrs[] __ro_after_init = {
......@@ -1290,6 +1372,7 @@ static struct attribute *netdev_queue_default_attrs[] __ro_after_init = {
&queue_traffic_class.attr,
#ifdef CONFIG_XPS
&xps_cpus_attribute.attr,
&xps_rxqs_attribute.attr,
&queue_tx_maxrate.attr,
#endif
NULL
......
......@@ -2818,6 +2818,8 @@ void sock_init_data(struct socket *sock, struct sock *sk)
sk->sk_pacing_rate = ~0U;
sk->sk_pacing_shift = 10;
sk->sk_incoming_cpu = -1;
sk_rx_queue_clear(sk);
/*
* Before updating sk_refcnt, we must commit prior changes to memory
* (Documentation/RCU/rculist_nulls.txt for details)
......
......@@ -78,6 +78,7 @@
#include <linux/errqueue.h>
#include <trace/events/tcp.h>
#include <linux/static_key.h>
#include <net/busy_poll.h>
int sysctl_tcp_max_orphans __read_mostly = NR_FILE;
......@@ -5592,6 +5593,7 @@ void tcp_finish_connect(struct sock *sk, struct sk_buff *skb)
if (skb) {
icsk->icsk_af_ops->sk_rx_dst_set(sk, skb);
security_inet_conn_established(sk, skb);
sk_mark_napi_id(sk, skb);
}
tcp_init_transfer(sk, BPF_SOCK_OPS_ACTIVE_ESTABLISHED_CB);
......@@ -6420,6 +6422,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
tcp_rsk(req)->snt_isn = isn;
tcp_rsk(req)->txhash = net_tx_rndhash();
tcp_openreq_init_rwin(req, sk, dst);
sk_rx_queue_set(req_to_sk(req), skb);
if (!want_cookie) {
tcp_reqsk_record_syn(sk, req, skb);
fastopen_sk = tcp_try_fastopen(sk, skb, req, &foc, dst);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment