Commit c2ed62b9 authored by David S. Miller

Merge branch 'net-xps-improve-the-xps-maps-handling'

Antoine Tenart says:

====================
net: xps: improve the xps maps handling

This series aims at fixing various issues with the xps code, including
out-of-bound accesses and a use-after-free. While doing so, we try to
improve the maintainability and readability of the xps code.

The main change is moving dev->num_tc and the number of ids (CPUs or RX
queues) into the xps maps themselves, as num_tc and nr_ids, to avoid
out-of-bound accesses: both values can be updated after the maps have
been allocated. This opens the door to further reworks that improve the
readability of the xps code and will eventually let us stop taking the
rtnl lock when reading the maps in sysfs. The maps are also moved to an
array in net_device, which simplifies the code a lot, as sketched
below.
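
To make this concrete, the reworked layout and a bounds-safe lookup
look roughly like the sketch below (illustrative only, not the exact
kernel code; the helper name xps_lookup() is made up for this example):

    struct xps_dev_maps {
            struct rcu_head rcu;
            unsigned int nr_ids;    /* #CPUs or #RX queues at alloc time */
            s16 num_tc;             /* #traffic classes at alloc time */
            struct xps_map __rcu *attr_map[]; /* nr_ids * num_tc maps */
    };

    /* Hypothetical reader, called under rcu_read_lock(): trust only the
     * bounds stored in the map itself, never the live dev->num_tc,
     * which may have changed since the map was allocated.
     */
    static struct xps_map *xps_lookup(struct xps_dev_maps *dev_maps,
                                      unsigned int j, int tc)
    {
            if (!dev_maps || j >= dev_maps->nr_ids || tc >= dev_maps->num_tc)
                    return NULL;
            return rcu_dereference(dev_maps->attr_map[j * dev_maps->num_tc + tc]);
    }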

One future improvement may be to remove the use of xps_map_mutex from
net/core/dev.c, but that may require extra care.

Thanks!
Antoine

Since v3:
  - Removed the 3 patches about the rtnl lock and __netif_set_xps_queue,
    as there are extra issues. Those patches were not tied to the
    others, and I'll see what can be done as a separate effort.
  - One small fix in patch 12.

Since v2:
  - Patches 13-16 are new to the series.
  - Fixed another issue I found while preparing v3 (use-after-free of
    the old xps maps).
  - Kept the rtnl lock when calling netdev_get_tx_queue and
    netdev_txq_to_tc.
  - Use get_device/put_device when using the sb_dev.
  - Take the rtnl lock in mlx5 and virtio_net when calling
    netif_set_xps_queue (see the sketch after this list).
  - Fixed a coding style issue.
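
  The calling convention behind those items looks roughly like this on
  the driver side (an illustrative sketch, not any particular driver's
  code; dev, cpu, rxq_mask and txq_idx are placeholders):

    rtnl_lock();
    /* CPUs map: same wrapper as before, now called under the rtnl lock */
    netif_set_xps_queue(dev, cpumask_of(cpu), txq_idx);

    /* RXQs map: the bool argument is replaced by the new enum, and the
     * call is bracketed by cpus_read_lock() as in xps_rxqs_store() below.
     */
    cpus_read_lock();
    __netif_set_xps_queue(dev, rxq_mask, txq_idx, XPS_RXQS);
    cpus_read_unlock();
    rtnl_unlock();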

Since v1:
  - Reordered the patches to improve readability and avoid introducing
    issues in between patches.
  - Use dev_maps->nr_ids to allocate the mask in xps_queue_show, but
    still default to nr_cpu_ids/dev->num_rx_queues when dev_maps hasn't
    been allocated yet, for backward compatibility.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 6859d915 75b2758a
--- a/drivers/net/virtio_net.c
+++ b/drivers/net/virtio_net.c
@@ -2015,7 +2015,7 @@ static void virtnet_set_affinity(struct virtnet_info *vi)
 		}
 		virtqueue_set_affinity(vi->rq[i].vq, mask);
 		virtqueue_set_affinity(vi->sq[i].vq, mask);
-		__netif_set_xps_queue(vi->dev, cpumask_bits(mask), i, false);
+		__netif_set_xps_queue(vi->dev, cpumask_bits(mask), i, XPS_CPUS);
 		cpumask_clear(mask);
 	}
--- a/include/linux/netdevice.h
+++ b/include/linux/netdevice.h
@@ -754,6 +754,13 @@ struct rx_queue_attribute {
 			 const char *buf, size_t len);
 };
 
+/* XPS map type and offset of the xps map within net_device->xps_maps[]. */
+enum xps_map_type {
+	XPS_CPUS = 0,
+	XPS_RXQS,
+	XPS_MAPS_MAX,
+};
+
 #ifdef CONFIG_XPS
 /*
  * This structure holds an XPS map which can be of variable length. The
@@ -771,9 +778,19 @@ struct xps_map {
 
 /*
  * This structure holds all XPS maps for device. Maps are indexed by CPU.
+ *
+ * We keep track of the number of cpus/rxqs used when the struct is allocated,
+ * in nr_ids. This will help not accessing out-of-bound memory.
+ *
+ * We keep track of the number of traffic classes used when the struct is
+ * allocated, in num_tc. This will be used to navigate the maps, to ensure we're
+ * not crossing its upper bound, as the original dev->num_tc can be updated in
+ * the meantime.
  */
 struct xps_dev_maps {
 	struct rcu_head rcu;
+	unsigned int nr_ids;
+	s16 num_tc;
 	struct xps_map __rcu *attr_map[]; /* Either CPUs map or RXQs map */
 };
@@ -1763,8 +1780,7 @@ enum netdev_ml_priv_type {
  *	@tx_queue_len:		Max frames per queue allowed
  *	@tx_global_lock:	XXX: need comments on this one
  *	@xdp_bulkq:		XDP device bulk queue
- *	@xps_cpus_map:		all CPUs map for XPS device
- *	@xps_rxqs_map:		all RXQs map for XPS device
+ *	@xps_maps:		all CPUs/RXQs maps for XPS device
 *
 *	@xps_maps:	XXX: need comments on this one
 *	@miniq_egress:		clsact qdisc specific data for
@@ -2060,8 +2076,7 @@ struct net_device {
 	struct xdp_dev_bulk_queue __percpu *xdp_bulkq;
 
 #ifdef CONFIG_XPS
-	struct xps_dev_maps __rcu *xps_cpus_map;
-	struct xps_dev_maps __rcu *xps_rxqs_map;
+	struct xps_dev_maps __rcu *xps_maps[XPS_MAPS_MAX];
 #endif
 #ifdef CONFIG_NET_CLS_ACT
 	struct mini_Qdisc __rcu *miniq_egress;
@@ -3691,7 +3706,7 @@ static inline void netif_wake_subqueue(struct net_device *dev, u16 queue_index)
 int netif_set_xps_queue(struct net_device *dev, const struct cpumask *mask,
 			u16 index);
 int __netif_set_xps_queue(struct net_device *dev, const unsigned long *mask,
-			  u16 index, bool is_rxqs_map);
+			  u16 index, enum xps_map_type type);
 
 /**
  * netif_attr_test_mask - Test a CPU or Rx queue set in a mask
@@ -3786,7 +3801,7 @@ static inline int netif_set_xps_queue(struct net_device *dev,
 
 static inline int __netif_set_xps_queue(struct net_device *dev,
 					const unsigned long *mask,
-					u16 index, bool is_rxqs_map)
+					u16 index, enum xps_map_type type)
 {
 	return 0;
 }
This diff is collapsed.
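
The collapsed diff is not reproduced here. The pattern the array
conversion enables (per the cover letter above) is that generic code no
longer needs separate xps_cpus_map/xps_rxqs_map branches; an
illustrative sketch, assuming the rtnl lock is held, could look like:

    enum xps_map_type type;

    for (type = 0; type < XPS_MAPS_MAX; type++) {
            struct xps_dev_maps *dev_maps;

            dev_maps = rtnl_dereference(dev->xps_maps[type]);
            if (!dev_maps)
                    continue;
            /* Walk the map using dev_maps->nr_ids and dev_maps->num_tc
             * as the authoritative bounds, not the live dev fields.
             */
    }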
--- a/net/core/net-sysfs.c
+++ b/net/core/net-sysfs.c
@@ -1361,83 +1361,94 @@ static const struct attribute_group dql_group = {
 #endif /* CONFIG_BQL */
 
 #ifdef CONFIG_XPS
-static ssize_t xps_cpus_show(struct netdev_queue *queue,
-			     char *buf)
+static ssize_t xps_queue_show(struct net_device *dev, unsigned int index,
+			      int tc, char *buf, enum xps_map_type type)
 {
-	int cpu, len, ret, num_tc = 1, tc = 0;
-	struct net_device *dev = queue->dev;
 	struct xps_dev_maps *dev_maps;
-	cpumask_var_t mask;
-	unsigned long index;
-
-	if (!netif_is_multiqueue(dev))
-		return -ENOENT;
+	unsigned long *mask;
+	unsigned int nr_ids;
+	int j, len;
 
-	index = get_netdev_queue_index(queue);
+	rcu_read_lock();
+	dev_maps = rcu_dereference(dev->xps_maps[type]);
 
-	if (!rtnl_trylock())
-		return restart_syscall();
+	/* Default to nr_cpu_ids/dev->num_rx_queues and do not just return 0
+	 * when dev_maps hasn't been allocated yet, to be backward compatible.
+	 */
+	nr_ids = dev_maps ? dev_maps->nr_ids :
+		 (type == XPS_CPUS ? nr_cpu_ids : dev->num_rx_queues);
 
-	if (dev->num_tc) {
-		/* Do not allow XPS on subordinate device directly */
-		num_tc = dev->num_tc;
-		if (num_tc < 0) {
-			ret = -EINVAL;
-			goto err_rtnl_unlock;
-		}
+	mask = bitmap_zalloc(nr_ids, GFP_KERNEL);
+	if (!mask) {
+		rcu_read_unlock();
+		return -ENOMEM;
+	}
 
-		/* If queue belongs to subordinate dev use its map */
-		dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+	if (!dev_maps || tc >= dev_maps->num_tc)
+		goto out_no_maps;
 
-		tc = netdev_txq_to_tc(dev, index);
-		if (tc < 0) {
-			ret = -EINVAL;
-			goto err_rtnl_unlock;
-		}
-	}
+	for (j = 0; j < nr_ids; j++) {
+		int i, tci = j * dev_maps->num_tc + tc;
+		struct xps_map *map;
 
-	if (!zalloc_cpumask_var(&mask, GFP_KERNEL)) {
-		ret = -ENOMEM;
-		goto err_rtnl_unlock;
-	}
+		map = rcu_dereference(dev_maps->attr_map[tci]);
+		if (!map)
+			continue;
 
-	rcu_read_lock();
-	dev_maps = rcu_dereference(dev->xps_cpus_map);
-	if (dev_maps) {
-		for_each_possible_cpu(cpu) {
-			int i, tci = cpu * num_tc + tc;
-			struct xps_map *map;
-
-			map = rcu_dereference(dev_maps->attr_map[tci]);
-			if (!map)
-				continue;
-
-			for (i = map->len; i--;) {
-				if (map->queues[i] == index) {
-					cpumask_set_cpu(cpu, mask);
-					break;
-				}
+		for (i = map->len; i--;) {
+			if (map->queues[i] == index) {
+				set_bit(j, mask);
+				break;
 			}
 		}
 	}
+out_no_maps:
 	rcu_read_unlock();
 
-	rtnl_unlock();
-
-	len = snprintf(buf, PAGE_SIZE, "%*pb\n", cpumask_pr_args(mask));
-	free_cpumask_var(mask);
+	len = bitmap_print_to_pagebuf(false, buf, mask, nr_ids);
+	bitmap_free(mask);
+
 	return len < PAGE_SIZE ? len : -EINVAL;
+}
 
-err_rtnl_unlock:
-	rtnl_unlock();
-	return ret;
+static ssize_t xps_cpus_show(struct netdev_queue *queue, char *buf)
+{
+	struct net_device *dev = queue->dev;
+	unsigned int index;
+	int len, tc;
+
+	if (!netif_is_multiqueue(dev))
+		return -ENOENT;
+
+	index = get_netdev_queue_index(queue);
+
+	if (!rtnl_trylock())
+		return restart_syscall();
+
+	/* If queue belongs to subordinate dev use its map */
+	dev = netdev_get_tx_queue(dev, index)->sb_dev ? : dev;
+
+	tc = netdev_txq_to_tc(dev, index);
+	if (tc < 0) {
+		rtnl_unlock();
+		return -EINVAL;
+	}
+
+	/* Make sure the subordinate device can't be freed */
+	get_device(&dev->dev);
+	rtnl_unlock();
+
+	len = xps_queue_show(dev, index, tc, buf, XPS_CPUS);
+
+	put_device(&dev->dev);
+	return len;
 }
 
 static ssize_t xps_cpus_store(struct netdev_queue *queue,
 			      const char *buf, size_t len)
 {
 	struct net_device *dev = queue->dev;
-	unsigned long index;
+	unsigned int index;
 	cpumask_var_t mask;
 	int err;
@@ -1476,64 +1487,21 @@ static struct netdev_queue_attribute xps_cpus_attribute __ro_after_init
 
 static ssize_t xps_rxqs_show(struct netdev_queue *queue, char *buf)
 {
-	int j, len, ret, num_tc = 1, tc = 0;
 	struct net_device *dev = queue->dev;
-	struct xps_dev_maps *dev_maps;
-	unsigned long *mask, index;
+	unsigned int index;
+	int tc;
 
 	index = get_netdev_queue_index(queue);
 
 	if (!rtnl_trylock())
 		return restart_syscall();
 
-	if (dev->num_tc) {
-		num_tc = dev->num_tc;
-		tc = netdev_txq_to_tc(dev, index);
-		if (tc < 0) {
-			ret = -EINVAL;
-			goto err_rtnl_unlock;
-		}
-	}
-
-	mask = bitmap_zalloc(dev->num_rx_queues, GFP_KERNEL);
-	if (!mask) {
-		ret = -ENOMEM;
-		goto err_rtnl_unlock;
-	}
-
-	rcu_read_lock();
-	dev_maps = rcu_dereference(dev->xps_rxqs_map);
-	if (!dev_maps)
-		goto out_no_maps;
-
-	for (j = -1; j = netif_attrmask_next(j, NULL, dev->num_rx_queues),
-	     j < dev->num_rx_queues;) {
-		int i, tci = j * num_tc + tc;
-		struct xps_map *map;
-
-		map = rcu_dereference(dev_maps->attr_map[tci]);
-		if (!map)
-			continue;
-
-		for (i = map->len; i--;) {
-			if (map->queues[i] == index) {
-				set_bit(j, mask);
-				break;
-			}
-		}
-	}
-out_no_maps:
-	rcu_read_unlock();
+	tc = netdev_txq_to_tc(dev, index);
 	rtnl_unlock();
+	if (tc < 0)
+		return -EINVAL;
 
-	len = bitmap_print_to_pagebuf(false, buf, mask, dev->num_rx_queues);
-	bitmap_free(mask);
-
-	return len < PAGE_SIZE ? len : -EINVAL;
-
-err_rtnl_unlock:
-	rtnl_unlock();
-	return ret;
+	return xps_queue_show(dev, index, tc, buf, XPS_RXQS);
 }
 
 static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
@@ -1541,7 +1509,8 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
 {
 	struct net_device *dev = queue->dev;
 	struct net *net = dev_net(dev);
-	unsigned long *mask, index;
+	unsigned long *mask;
+	unsigned int index;
 	int err;
 
 	if (!ns_capable(net->user_ns, CAP_NET_ADMIN))
@@ -1565,7 +1534,7 @@ static ssize_t xps_rxqs_store(struct netdev_queue *queue, const char *buf,
 	}
 
 	cpus_read_lock();
-	err = __netif_set_xps_queue(dev, mask, index, true);
+	err = __netif_set_xps_queue(dev, mask, index, XPS_RXQS);
 	cpus_read_unlock();
 
 	rtnl_unlock();
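
For context: the show/store handlers above back the per-tx-queue sysfs
attributes /sys/class/net/<iface>/queues/tx-<n>/xps_cpus and
/sys/class/net/<iface>/queues/tx-<n>/xps_rxqs. The series changes how
the printed masks are sized (dev_maps->nr_ids when a map exists, with
the old nr_cpu_ids/dev->num_rx_queues defaults kept for backward
compatibility), not the format the files expose.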