Commit 0a9627f2 authored by Tom Herbert's avatar Tom Herbert Committed by David S. Miller

rps: Receive Packet Steering

This patch implements software receive side packet steering (RPS).  RPS
distributes the load of received packet processing across multiple CPUs.

Problem statement: Protocol processing done in the NAPI context for received
packets is serialized per device queue and becomes a bottleneck under high
packet load.  This substantially limits pps that can be achieved on a single
queue NIC and provides no scaling with multiple cores.

This solution queues packets early on in the receive path on the backlog queues
of other CPUs.   This allows protocol processing (e.g. IP and TCP) to be
performed on packets in parallel.   For each device (or each receive queue in
a multi-queue device) a mask of CPUs is set to indicate the CPUs that can
process packets. A CPU is selected on a per packet basis by hashing contents
of the packet header (e.g. the TCP or UDP 4-tuple) and using the result to index
into the CPU mask.  The IPI mechanism is used to raise networking receive
softirqs between CPUs.  This effectively emulates in software what a multi-queue
NIC can provide, but is generic requiring no device support.

Many devices now provide a hash over the 4-tuple on a per packet basis
(e.g. the Toeplitz hash).  This patch allow drivers to set the HW reported hash
in an skb field, and that value in turn is used to index into the RPS maps.
Using the HW generated hash can avoid cache misses on the packet when
steering it to a remote CPU.

The CPU mask is set on a per device and per queue basis in the sysfs variable
/sys/class/net/<device>/queues/rx-<n>/rps_cpus.  This is a set of canonical
bit maps for receive queues in the device (numbered by <n>).  If a device
does not support multi-queue, a single variable is used for the device (rx-0).

Generally, we have found this technique increases pps capabilities of a single
queue device with good CPU utilization.  Optimal settings for the CPU mask
seem to depend on architectures and cache hierarcy.  Below are some results
running 500 instances of netperf TCP_RR test with 1 byte req. and resp.
Results show cumulative transaction rate and system CPU utilization.

e1000e on 8 core Intel
   Without RPS: 108K tps at 33% CPU
   With RPS:    311K tps at 64% CPU

forcedeth on 16 core AMD
   Without RPS: 156K tps at 15% CPU
   With RPS:    404K tps at 49% CPU

bnx2x on 16 core AMD
   Without RPS  567K tps at 61% CPU (4 HW RX queues)
   Without RPS  738K tps at 96% CPU (8 HW RX queues)
   With RPS:    854K tps at 76% CPU (4 HW RX queues)

Caveats:
- The benefits of this patch are dependent on architecture and cache hierarchy.
Tuning the masks to get best performance is probably necessary.
- This patch adds overhead in the path for processing a single packet.  In
a lightly loaded server this overhead may eliminate the advantages of
increased parallelism, and possibly cause some relative performance degradation.
We have found that masks that are cache aware (share same caches with
the interrupting CPU) mitigate much of this.
- The RPS masks can be changed dynamically, however whenever the mask is changed
this introduces the possibility of generating out of order packets.  It's
probably best not change the masks too frequently.
Signed-off-by: default avatarTom Herbert <therbert@google.com>

 include/linux/netdevice.h |   32 ++++-
 include/linux/skbuff.h    |    3 +
 net/core/dev.c            |  335 +++++++++++++++++++++++++++++++++++++--------
 net/core/net-sysfs.c      |  225 ++++++++++++++++++++++++++++++-
 net/core/skbuff.c         |    2 +
 5 files changed, 538 insertions(+), 59 deletions(-)
Signed-off-by: default avatarEric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent 768bbedf
......@@ -223,6 +223,7 @@ struct netif_rx_stats {
unsigned dropped;
unsigned time_squeeze;
unsigned cpu_collision;
unsigned received_rps;
};
DECLARE_PER_CPU(struct netif_rx_stats, netdev_rx_stat);
......@@ -530,6 +531,24 @@ struct netdev_queue {
unsigned long tx_dropped;
} ____cacheline_aligned_in_smp;
/*
* This structure holds an RPS map which can be of variable length. The
* map is an array of CPUs.
*/
struct rps_map {
unsigned int len;
struct rcu_head rcu;
u16 cpus[0];
};
#define RPS_MAP_SIZE(_num) (sizeof(struct rps_map) + (_num * sizeof(u16)))
/* This structure contains an instance of an RX queue. */
struct netdev_rx_queue {
struct rps_map *rps_map;
struct kobject kobj;
struct netdev_rx_queue *first;
atomic_t count;
} ____cacheline_aligned_in_smp;
/*
* This structure defines the management hooks for network devices.
......@@ -878,6 +897,13 @@ struct net_device {
unsigned char broadcast[MAX_ADDR_LEN]; /* hw bcast add */
struct kset *queues_kset;
struct netdev_rx_queue *_rx;
/* Number of RX queues allocated at alloc_netdev_mq() time */
unsigned int num_rx_queues;
struct netdev_queue rx_queue;
struct netdev_queue *_tx ____cacheline_aligned_in_smp;
......@@ -1311,14 +1337,16 @@ static inline int unregister_gifconf(unsigned int family)
*/
struct softnet_data {
struct Qdisc *output_queue;
struct sk_buff_head input_pkt_queue;
struct list_head poll_list;
struct sk_buff *completion_queue;
/* Elements below can be accessed between CPUs for RPS */
struct call_single_data csd ____cacheline_aligned_in_smp;
struct sk_buff_head input_pkt_queue;
struct napi_struct backlog;
};
DECLARE_PER_CPU(struct softnet_data,softnet_data);
DECLARE_PER_CPU_ALIGNED(struct softnet_data, softnet_data);
#define HAVE_NETIF_QUEUE
......
......@@ -300,6 +300,7 @@ typedef unsigned char *sk_buff_data_t;
* @nfct_reasm: netfilter conntrack re-assembly pointer
* @nf_bridge: Saved data about a bridged frame - see br_netfilter.c
* @skb_iif: ifindex of device we arrived on
* @rxhash: the packet hash computed on receive
* @queue_mapping: Queue mapping for multiqueue devices
* @tc_index: Traffic control index
* @tc_verd: traffic control verdict
......@@ -375,6 +376,8 @@ struct sk_buff {
#endif
#endif
__u32 rxhash;
kmemcheck_bitfield_begin(flags2);
__u16 queue_mapping:16;
#ifdef CONFIG_IPV6_NDISC_NODETYPE
......
This diff is collapsed.
......@@ -466,6 +466,216 @@ static struct attribute_group wireless_group = {
};
#endif
/*
* RX queue sysfs structures and functions.
*/
struct rx_queue_attribute {
struct attribute attr;
ssize_t (*show)(struct netdev_rx_queue *queue,
struct rx_queue_attribute *attr, char *buf);
ssize_t (*store)(struct netdev_rx_queue *queue,
struct rx_queue_attribute *attr, const char *buf, size_t len);
};
#define to_rx_queue_attr(_attr) container_of(_attr, \
struct rx_queue_attribute, attr)
#define to_rx_queue(obj) container_of(obj, struct netdev_rx_queue, kobj)
static ssize_t rx_queue_attr_show(struct kobject *kobj, struct attribute *attr,
char *buf)
{
struct rx_queue_attribute *attribute = to_rx_queue_attr(attr);
struct netdev_rx_queue *queue = to_rx_queue(kobj);
if (!attribute->show)
return -EIO;
return attribute->show(queue, attribute, buf);
}
static ssize_t rx_queue_attr_store(struct kobject *kobj, struct attribute *attr,
const char *buf, size_t count)
{
struct rx_queue_attribute *attribute = to_rx_queue_attr(attr);
struct netdev_rx_queue *queue = to_rx_queue(kobj);
if (!attribute->store)
return -EIO;
return attribute->store(queue, attribute, buf, count);
}
static struct sysfs_ops rx_queue_sysfs_ops = {
.show = rx_queue_attr_show,
.store = rx_queue_attr_store,
};
static ssize_t show_rps_map(struct netdev_rx_queue *queue,
struct rx_queue_attribute *attribute, char *buf)
{
struct rps_map *map;
cpumask_var_t mask;
size_t len = 0;
int i;
if (!zalloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
rcu_read_lock();
map = rcu_dereference(queue->rps_map);
if (map)
for (i = 0; i < map->len; i++)
cpumask_set_cpu(map->cpus[i], mask);
len += cpumask_scnprintf(buf + len, PAGE_SIZE, mask);
if (PAGE_SIZE - len < 3) {
rcu_read_unlock();
free_cpumask_var(mask);
return -EINVAL;
}
rcu_read_unlock();
free_cpumask_var(mask);
len += sprintf(buf + len, "\n");
return len;
}
static void rps_map_release(struct rcu_head *rcu)
{
struct rps_map *map = container_of(rcu, struct rps_map, rcu);
kfree(map);
}
ssize_t store_rps_map(struct netdev_rx_queue *queue,
struct rx_queue_attribute *attribute,
const char *buf, size_t len)
{
struct rps_map *old_map, *map;
cpumask_var_t mask;
int err, cpu, i;
static DEFINE_SPINLOCK(rps_map_lock);
if (!capable(CAP_NET_ADMIN))
return -EPERM;
if (!alloc_cpumask_var(&mask, GFP_KERNEL))
return -ENOMEM;
err = bitmap_parse(buf, len, cpumask_bits(mask), nr_cpumask_bits);
if (err) {
free_cpumask_var(mask);
return err;
}
map = kzalloc(max_t(unsigned,
RPS_MAP_SIZE(cpumask_weight(mask)), L1_CACHE_BYTES),
GFP_KERNEL);
if (!map) {
free_cpumask_var(mask);
return -ENOMEM;
}
i = 0;
for_each_cpu_and(cpu, mask, cpu_online_mask)
map->cpus[i++] = cpu;
if (i)
map->len = i;
else {
kfree(map);
map = NULL;
}
spin_lock(&rps_map_lock);
old_map = queue->rps_map;
rcu_assign_pointer(queue->rps_map, map);
spin_unlock(&rps_map_lock);
if (old_map)
call_rcu(&old_map->rcu, rps_map_release);
free_cpumask_var(mask);
return len;
}
static struct rx_queue_attribute rps_cpus_attribute =
__ATTR(rps_cpus, S_IRUGO | S_IWUSR, show_rps_map, store_rps_map);
static struct attribute *rx_queue_default_attrs[] = {
&rps_cpus_attribute.attr,
NULL
};
static void rx_queue_release(struct kobject *kobj)
{
struct netdev_rx_queue *queue = to_rx_queue(kobj);
struct rps_map *map = queue->rps_map;
struct netdev_rx_queue *first = queue->first;
if (map)
call_rcu(&map->rcu, rps_map_release);
if (atomic_dec_and_test(&first->count))
kfree(first);
}
static struct kobj_type rx_queue_ktype = {
.sysfs_ops = &rx_queue_sysfs_ops,
.release = rx_queue_release,
.default_attrs = rx_queue_default_attrs,
};
static int rx_queue_add_kobject(struct net_device *net, int index)
{
struct netdev_rx_queue *queue = net->_rx + index;
struct kobject *kobj = &queue->kobj;
int error = 0;
kobj->kset = net->queues_kset;
error = kobject_init_and_add(kobj, &rx_queue_ktype, NULL,
"rx-%u", index);
if (error) {
kobject_put(kobj);
return error;
}
kobject_uevent(kobj, KOBJ_ADD);
return error;
}
static int rx_queue_register_kobjects(struct net_device *net)
{
int i;
int error = 0;
net->queues_kset = kset_create_and_add("queues",
NULL, &net->dev.kobj);
if (!net->queues_kset)
return -ENOMEM;
for (i = 0; i < net->num_rx_queues; i++) {
error = rx_queue_add_kobject(net, i);
if (error)
break;
}
if (error)
while (--i >= 0)
kobject_put(&net->_rx[i].kobj);
return error;
}
static void rx_queue_remove_kobjects(struct net_device *net)
{
int i;
for (i = 0; i < net->num_rx_queues; i++)
kobject_put(&net->_rx[i].kobj);
kset_unregister(net->queues_kset);
}
#endif /* CONFIG_SYSFS */
#ifdef CONFIG_HOTPLUG
......@@ -529,6 +739,8 @@ void netdev_unregister_kobject(struct net_device * net)
if (!net_eq(dev_net(net), &init_net))
return;
rx_queue_remove_kobjects(net);
device_del(dev);
}
......@@ -537,6 +749,7 @@ int netdev_register_kobject(struct net_device *net)
{
struct device *dev = &(net->dev);
const struct attribute_group **groups = net->sysfs_groups;
int error = 0;
dev->class = &net_class;
dev->platform_data = net;
......@@ -563,7 +776,17 @@ int netdev_register_kobject(struct net_device *net)
if (!net_eq(dev_net(net), &init_net))
return 0;
return device_add(dev);
error = device_add(dev);
if (error)
return error;
error = rx_queue_register_kobjects(net);
if (error) {
device_del(dev);
return error;
}
return error;
}
int netdev_class_create_file(struct class_attribute *class_attr)
......
......@@ -534,6 +534,7 @@ static void __copy_skb_header(struct sk_buff *new, const struct sk_buff *old)
new->network_header = old->network_header;
new->mac_header = old->mac_header;
skb_dst_set(new, dst_clone(skb_dst(old)));
new->rxhash = old->rxhash;
#ifdef CONFIG_XFRM
new->sp = secpath_get(old->sp);
#endif
......@@ -581,6 +582,7 @@ static struct sk_buff *__skb_clone(struct sk_buff *n, struct sk_buff *skb)
C(len);
C(data_len);
C(mac_len);
C(rxhash);
n->hdr_len = skb->nohdr ? skb_headroom(skb) : skb->hdr_len;
n->cloned = 1;
n->nohdr = 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment