Merge branch 'bridge-improve-cache-utilization'

Nikolay Aleksandrov says: ==================== bridge: improve cache utilization This is the first set which begins to deal with the bad bridge cache access patterns. The first patch rearranges the bridge and port structs a little so the frequently (and closely) accessed members are in the same cache line. The second patch then moves the garbage collection to a workqueue trying to improve system responsiveness under load (many fdbs) and more importantly removes the need to check if the matched entry is expired in __br_fdb_get which was a major source of false-sharing. The third patch is a preparation for the final one which If properly configured, i.e. ports bound to CPUs (thus updating "updated" locally) then the bridge's HitM goes from 100% to 0%, but even without binding we get a win because previously every lookup that iterated over the hash chain caused false-sharing due to the first cache line being used for both mac/vid and used/updated fields. Some results from tests I've run: (note that these were run in good conditions for the baseline, everything ran on a single NUMA node and there were only 3 fdbs) 1. baseline 100% Load HitM on the fdbs (between everyone who has done lookups and hit one of the 3 hash chains of the communicating src/dst fdbs) Overall 5.06% Load HitM for the bridge, first place in the list 2. patched & ports bound to CPUs 0% Local load HitM, bridge is not even in the c2c report list Also there's 3% consistent improvement in netperf tests. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>

Merge branch 'bridge-improve-cache-utilization'
Nikolay Aleksandrov says: ==================== bridge: improve cache utilization This is the first set which begins to deal with the bad bridge cache access patterns. The first patch rearranges the bridge and port structs a little so the frequently (and closely) accessed members are in the same cache line. The second patch then moves the garbage collection to a workqueue trying to improve system responsiveness under load (many fdbs) and more importantly removes the need to check if the matched entry is expired in __br_fdb_get which was a major source of false-sharing. The third patch is a preparation for the final one which If properly configured, i.e. ports bound to CPUs (thus updating "updated" locally) then the bridge's HitM goes from 100% to 0%, but even without binding we get a win because previously every lookup that iterated over the hash chain caused false-sharing due to the first cache line being used for both mac/vid and used/updated fields. Some results from tests I've run: (note that these were run in good conditions for the baseline, everything ran on a single NUMA node and there were only 3 fdbs) 1. baseline 100% Load HitM on the fdbs (between everyone who has done lookups and hit one of the 3 hash chains of the communicating src/dst fdbs) Overall 5.06% Load HitM for the bridge, first place in the list 2. patched & ports bound to CPUs 0% Local load HitM, bridge is not even in the c2c report list Also there's 3% consistent improvement in netperf tests. ==================== Signed-off-by: David S. Miller <davem@davemloft.net>
152bff37 · David S. Miller · 63dfef75 · 83a718d6 · 152bff37 · 152bff37
Commit 152bff37 authored Feb 06, 2017 by David S. Miller
11 changed files
--- a/net/bridge/br_device.c
+++ b/net/bridge/br_device.c
@@ -411,4 +411,5 @@ void br_dev_setup(struct net_device *dev)
 	br_netfilter_rtable_init(br);
 	br_stp_timer_init(br);
 	br_multicast_init(br);
+	INIT_DELAYED_WORK(&br->gc_work, br_fdb_cleanup);
 }
--- a/net/bridge/br_fdb.c
+++ b/net/bridge/br_fdb.c
@@ -154,7 +154,7 @@ static void fdb_delete(struct net_bridge *br, struct net_bridge_fdb_entry *f)
 	if (f->added_by_external_learn)
 		fdb_del_external_learn(f);
-	hlist_del_rcu(&f->hlist);
+	hlist_del_init_rcu(&f->hlist);
 	fdb_notify(br, f, RTM_DELNEIGH);
 	call_rcu(&f->rcu, fdb_rcu_free);
 }
@@ -290,34 +290,43 @@ void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr)
 	spin_unlock_bh(&br->hash_lock);
 }
-void br_fdb_cleanup(unsigned long _data)
+void br_fdb_cleanup(struct work_struct *work)
 {
-	struct net_bridge *br = (struct net_bridge *)_data;
+	struct net_bridge *br = container_of(work, struct net_bridge,
+					     gc_work.work);
 	unsigned long delay = hold_time(br);
-	unsigned long next_timer = jiffies + br->ageing_time;
+	unsigned long work_delay = delay;
+	unsigned long now = jiffies;
 	int i;
-	spin_lock(&br->hash_lock);
 	for (i = 0; i < BR_HASH_SIZE; i++) {
 		struct net_bridge_fdb_entry *f;
 		struct hlist_node *n;
+		if (!br->hash[i].first)
+			continue;
+		spin_lock_bh(&br->hash_lock);
 		hlist_for_each_entry_safe(f, n, &br->hash[i], hlist) {
 			unsigned long this_timer;
 			if (f->is_static)
 				continue;
 			if (f->added_by_external_learn)
 				continue;
 			this_timer = f->updated + delay;
-			if (time_before_eq(this_timer, jiffies))
+			if (time_after(this_timer, now))
+				work_delay = min(work_delay, this_timer - now);
+			else
 				fdb_delete(br, f);
-			else if (time_before(this_timer, next_timer))
-				next_timer = this_timer;
 		}
+		spin_unlock_bh(&br->hash_lock);
+		cond_resched();
 	}
-	spin_unlock(&br->hash_lock);
-	mod_timer(&br->gc_timer, round_jiffies_up(next_timer));
+	/* Cleanup minimum 10 milliseconds apart */
+	work_delay = max_t(unsigned long, work_delay, msecs_to_jiffies(10));
+	mod_delayed_work(system_long_wq, &br->gc_work, work_delay);
 }
 /* Completely flush all dynamic entries in forwarding database.*/
@@ -382,8 +391,6 @@ struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br,
 				&br->hash[br_mac_hash(addr, vid)], hlist) {
 		if (ether_addr_equal(fdb->addr.addr, addr) &&
 		    fdb->vlan_id == vid) {
-			if (unlikely(has_expired(br, fdb)))
-				break;
 			return fdb;
 		}
 	}
@@ -590,7 +597,8 @@ void br_fdb_update(struct net_bridge *br, struct net_bridge_port *source,
 				fdb->dst = source;
 				fdb_modified = true;
 			}
-			fdb->updated = jiffies;
+			if (jiffies != fdb->updated)
+				fdb->updated = jiffies;
 			if (unlikely(added_by_user))
 				fdb->added_by_user = 1;
 			if (unlikely(fdb_modified))

--- a/net/bridge/br_if.c
+++ b/net/bridge/br_if.c
@@ -313,7 +313,7 @@ void br_dev_delete(struct net_device *dev, struct list_head *head)
 	br_vlan_flush(br);
 	br_multicast_dev_del(br);
-	del_timer_sync(&br->gc_timer);
+	cancel_delayed_work_sync(&br->gc_work);
 	br_sysfs_delbr(br->dev);
 	unregister_netdevice_queue(br->dev, head);

--- a/net/bridge/br_input.c
+++ b/net/bridge/br_input.c
@@ -198,7 +198,8 @@ int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb
 		if (dst->is_local)
 			return br_pass_frame_up(skb);
-		dst->used = jiffies;
+		if (jiffies != dst->used)
+			dst->used = jiffies;
 		br_forward(dst->dst, skb, local_rcv, false);
 	} else {
 		if (!mcast_hit)

--- a/net/bridge/br_ioctl.c
+++ b/net/bridge/br_ioctl.c
@@ -149,7 +149,7 @@ static int old_dev_ioctl(struct net_device *dev, struct ifreq *rq, int cmd)
 		b.hello_timer_value = br_timer_value(&br->hello_timer);
 		b.tcn_timer_value = br_timer_value(&br->tcn_timer);
 		b.topology_change_timer_value = br_timer_value(&br->topology_change_timer);
-		b.gc_timer_value = br_timer_value(&br->gc_timer);
+		b.gc_timer_value = br_timer_value(&br->gc_work.timer);
 		rcu_read_unlock();
 		if (copy_to_user((void __user *)args[1], &b, sizeof(b)))

--- a/net/bridge/br_netlink.c
+++ b/net/bridge/br_netlink.c
@@ -1250,7 +1250,7 @@ static int br_fill_info(struct sk_buff *skb, const struct net_device *brdev)
 	if (nla_put_u64_64bit(skb, IFLA_BR_TOPOLOGY_CHANGE_TIMER, clockval,
 			      IFLA_BR_PAD))
 		return -EMSGSIZE;
-	clockval = br_timer_value(&br->gc_timer);
+	clockval = br_timer_value(&br->gc_work.timer);
 	if (nla_put_u64_64bit(skb, IFLA_BR_GC_TIMER, clockval, IFLA_BR_PAD))
 		return -EMSGSIZE;

--- a/net/bridge/br_private.h
+++ b/net/bridge/br_private.h
@@ -160,19 +160,21 @@ struct net_bridge_vlan_group {
 	u16				pvid;
 };
-struct net_bridge_fdb_entry
+struct net_bridge_fdb_entry {
-{
 	struct hlist_node		hlist;
 	struct net_bridge_port		*dst;
-	unsigned long			updated;
-	unsigned long			used;
 	mac_addr			addr;
 	__u16				vlan_id;
 	unsigned char			is_local:1,
 					is_static:1,
 					added_by_user:1,
 					added_by_external_learn:1;
+	/* write-heavy members should not affect lookups */
+	unsigned long			updated ____cacheline_aligned_in_smp;
+	unsigned long			used;
 	struct rcu_head			rcu;
 };
@@ -212,12 +214,16 @@ struct net_bridge_mdb_htable
 	u32				ver;
 };
-struct net_bridge_port
+struct net_bridge_port {
-{
 	struct net_bridge		*br;
 	struct net_device		*dev;
 	struct list_head		list;
+	unsigned long			flags;
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+	struct net_bridge_vlan_group	__rcu *vlgrp;
+#endif
 	/* STP */
 	u8				priority;
 	u8				state;
@@ -238,8 +244,6 @@ struct net_bridge_port
 	struct kobject			kobj;
 	struct rcu_head			rcu;
-	unsigned long 			flags;
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 	struct bridge_mcast_own_query	ip4_own_query;
 #if IS_ENABLED(CONFIG_IPV6)
@@ -259,9 +263,6 @@ struct net_bridge_port
 #ifdef CONFIG_NET_POLL_CONTROLLER
 	struct netpoll			*np;
 #endif
-#ifdef CONFIG_BRIDGE_VLAN_FILTERING
-	struct net_bridge_vlan_group	__rcu *vlgrp;
-#endif
 #ifdef CONFIG_NET_SWITCHDEV
 	int				offload_fwd_mark;
 #endif
@@ -283,14 +284,21 @@ static inline struct net_bridge_port *br_port_get_rtnl(const struct net_device *
 		rtnl_dereference(dev->rx_handler_data) : NULL;
 }
-struct net_bridge
+struct net_bridge {
-{
 	spinlock_t			lock;
+	spinlock_t			hash_lock;
 	struct list_head		port_list;
 	struct net_device		*dev;
 	struct pcpu_sw_netstats		__percpu *stats;
-	spinlock_t			hash_lock;
+	/* These fields are accessed on each packet */
+#ifdef CONFIG_BRIDGE_VLAN_FILTERING
+	u8				vlan_enabled;
+	u8				vlan_stats_enabled;
+	__be16				vlan_proto;
+	u16				default_pvid;
+	struct net_bridge_vlan_group	__rcu *vlgrp;
+#endif
 	struct hlist_head		hash[BR_HASH_SIZE];
 #if IS_ENABLED(CONFIG_BRIDGE_NETFILTER)
 	union {
@@ -308,6 +316,9 @@ struct net_bridge
 	bridge_id			designated_root;
 	bridge_id			bridge_id;
 	u32				root_path_cost;
+	unsigned char			topology_change;
+	unsigned char			topology_change_detected;
+	u16				root_port;
 	unsigned long			max_age;
 	unsigned long			hello_time;
 	unsigned long			forward_delay;
@@ -319,7 +330,6 @@ struct net_bridge
 	u8				group_addr[ETH_ALEN];
 	bool				group_addr_set;
-	u16				root_port;
 	enum {
 		BR_NO_STP, 		/* no spanning tree */
@@ -327,9 +337,6 @@ struct net_bridge
 		BR_USER_STP,		/* new RSTP in userspace */
 	} stp_enabled;
-	unsigned char			topology_change;
-	unsigned char			topology_change_detected;
 #ifdef CONFIG_BRIDGE_IGMP_SNOOPING
 	unsigned char			multicast_router;
@@ -374,21 +381,13 @@ struct net_bridge
 	struct timer_list		hello_timer;
 	struct timer_list		tcn_timer;
 	struct timer_list		topology_change_timer;
-	struct timer_list		gc_timer;
+	struct delayed_work		gc_work;
 	struct kobject			*ifobj;
 	u32				auto_cnt;
 #ifdef CONFIG_NET_SWITCHDEV
 	int offload_fwd_mark;
 #endif
-#ifdef CONFIG_BRIDGE_VLAN_FILTERING
-	struct net_bridge_vlan_group	__rcu *vlgrp;
-	u8				vlan_enabled;
-	u8				vlan_stats_enabled;
-	__be16				vlan_proto;
-	u16				default_pvid;
-#endif
 };
 struct br_input_skb_cb {
@@ -505,7 +504,7 @@ void br_fdb_find_delete_local(struct net_bridge *br,
 			      const unsigned char *addr, u16 vid);
 void br_fdb_changeaddr(struct net_bridge_port *p, const unsigned char *newaddr);
 void br_fdb_change_mac_address(struct net_bridge *br, const u8 *newaddr);
-void br_fdb_cleanup(unsigned long arg);
+void br_fdb_cleanup(struct work_struct *work);
 void br_fdb_delete_by_port(struct net_bridge *br,
 			   const struct net_bridge_port *p, u16 vid, int do_all);
 struct net_bridge_fdb_entry *__br_fdb_get(struct net_bridge *br,

--- a/net/bridge/br_stp.c
+++ b/net/bridge/br_stp.c
@@ -602,7 +602,7 @@ int br_set_ageing_time(struct net_bridge *br, clock_t ageing_time)
 	br->ageing_time = t;
 	spin_unlock_bh(&br->lock);
-	mod_timer(&br->gc_timer, jiffies);
+	mod_delayed_work(system_long_wq, &br->gc_work, 0);
 	return 0;
 }

--- a/net/bridge/br_stp_if.c
+++ b/net/bridge/br_stp_if.c
@@ -57,7 +57,7 @@ void br_stp_enable_bridge(struct net_bridge *br)
 	spin_lock_bh(&br->lock);
 	if (br->stp_enabled == BR_KERNEL_STP)
 		mod_timer(&br->hello_timer, jiffies + br->hello_time);
-	mod_timer(&br->gc_timer, jiffies + HZ/10);
+	mod_delayed_work(system_long_wq, &br->gc_work, HZ / 10);
 	br_config_bpdu_generation(br);
@@ -88,7 +88,7 @@ void br_stp_disable_bridge(struct net_bridge *br)
 	del_timer_sync(&br->hello_timer);
 	del_timer_sync(&br->topology_change_timer);
 	del_timer_sync(&br->tcn_timer);
-	del_timer_sync(&br->gc_timer);
+	cancel_delayed_work_sync(&br->gc_work);
 }
 /* called under bridge lock */

--- a/net/bridge/br_stp_timer.c
+++ b/net/bridge/br_stp_timer.c
@@ -153,8 +153,6 @@ void br_stp_timer_init(struct net_bridge *br)
 	setup_timer(&br->topology_change_timer,
 		      br_topology_change_timer_expired,
 		      (unsigned long) br);
-	setup_timer(&br->gc_timer, br_fdb_cleanup, (unsigned long) br);
 }
 void br_stp_port_timer_init(struct net_bridge_port *p)

--- a/net/bridge/br_sysfs_br.c
+++ b/net/bridge/br_sysfs_br.c
@@ -263,7 +263,7 @@ static ssize_t gc_timer_show(struct device *d, struct device_attribute *attr,
 			     char *buf)
 {
 	struct net_bridge *br = to_bridge(d);
-	return sprintf(buf, "%ld\n", br_timer_value(&br->gc_timer));
+	return sprintf(buf, "%ld\n", br_timer_value(&br->gc_work.timer));
 }
 static DEVICE_ATTR_RO(gc_timer);