Commit f8b2990f authored by David S. Miller

Merge branch 'net-bridge-add-support-for-backup-port'

Nikolay Aleksandrov says:

====================
net: bridge: add support for backup port

This set introduces a new bridge port option that allows any port to have
any other port (in the same bridge of course) as its backup and traffic
will be forwarded to the backup port when the primary goes down. This is
mainly used in MLAG and EVPN setups where we have peerlink path which is
a backup of many (or even all) ports and is a participating bridge port
itself. There's more detailed information in patch 02. Patch 01 just
prepares the port sysfs code for options that take raw value. The main
issues that this set solves are scalability and fallback latency.

We have used similar code for over 6 months now to bring the fallback
latency of the backup peerlink down and avoid fdb notification storms.
Also, due to the nature of master devices, such a setup is currently not
possible, and last but not least, having tens of thousands of fdbs requires
thousands of calls to switch.

I've also CCed our MLAG experts that have been using similar option.

Roopa also adds:

"Two switches acting in a MLAG pair are connected by the peerlink
interface which is a bridge port.

the config on one of the switches looks like the below. The other
switch also has a similar config.
eth0 is connected to one port on the server. And the server is
connected to both switches.

br0 -- team0---eth0
      |
      -- switch-peerlink

switch-peerlink becomes the failover/backup port when, say, team0 to
the server goes down.
Today, when team0 goes down, the control plane has to withdraw all the fdb
entries pointing to team0
and re-install the fdb entries pointing to switch-peerlink...and
restore the fdb entries when team0 comes back up again.
And this is the problem we are trying to solve.

This also becomes necessary when multihoming is implemented by a
standard like E-VPN https://tools.ietf.org/html/rfc8365#section-8
where the 'switch-peerlink' is an overlay vxlan port (like nikolay
mentions in his patch commit). In these implementations, the fdb scale
can be much larger.

On why bond failover cannot be used here: the point that Nikolay was
alluding to is that switch-peerlink in the above example is a bridge port
and is a failover/backup port for more than one or all ports in the
bridge br0. And you cannot enslave switch-peerlink into a second-level
team with other bridge ports. Hence a multi-layered team device is not an
option (FWIW, switch-peerlink is also a teamed interface to the peer
switch)."

v3: Added Roopa's explanation and diagram
v2: In patch 01 use kstrdup/kfree to avoid casting the const buf. In order
to avoid using GFP_ATOMIC or always allocating I kept the spinlock inside
each branch.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 0a78c380 2756f68c
......@@ -334,6 +334,7 @@ enum {
IFLA_BRPORT_GROUP_FWD_MASK,
IFLA_BRPORT_NEIGH_SUPPRESS,
IFLA_BRPORT_ISOLATED,
IFLA_BRPORT_BACKUP_PORT,
__IFLA_BRPORT_MAX
};
#define IFLA_BRPORT_MAX (__IFLA_BRPORT_MAX - 1)
......
......@@ -142,7 +142,20 @@ static int deliver_clone(const struct net_bridge_port *prev,
void br_forward(const struct net_bridge_port *to,
struct sk_buff *skb, bool local_rcv, bool local_orig)
{
if (to && should_deliver(to, skb)) {
if (unlikely(!to))
goto out;
/* redirect to backup link if the destination port is down */
if (rcu_access_pointer(to->backup_port) && !netif_carrier_ok(to->dev)) {
struct net_bridge_port *backup_port;
backup_port = rcu_dereference(to->backup_port);
if (unlikely(!backup_port))
goto out;
to = backup_port;
}
if (should_deliver(to, skb)) {
if (local_rcv)
deliver_clone(to, skb, local_orig);
else
......@@ -150,6 +163,7 @@ void br_forward(const struct net_bridge_port *to,
return;
}
out:
if (!local_rcv)
kfree_skb(skb);
}
......
......@@ -170,6 +170,58 @@ void br_manage_promisc(struct net_bridge *br)
}
}
/* Set, replace or clear the backup port of @p.
 *
 * @p:          port whose backup is being changed
 * @backup_dev: device to use as backup, or NULL to clear the backup
 *
 * Must be called with RTNL held. Each port counts in
 * backup_redirected_cnt how many ports currently use it as their backup;
 * the counters of the old and new backup are adjusted here.
 *
 * Returns 0 on success (including when nothing changes), -ENOENT when
 * @backup_dev is not a bridge port, and -EINVAL when it is a port of a
 * different bridge or is @p itself.
 */
int nbp_backup_change(struct net_bridge_port *p,
		      struct net_device *backup_dev)
{
	struct net_bridge_port *prev_backup = rtnl_dereference(p->backup_port);
	struct net_bridge_port *new_backup = NULL;

	ASSERT_RTNL();

	if (backup_dev) {
		if (!br_port_exists(backup_dev))
			return -ENOENT;

		new_backup = br_port_get_rtnl(backup_dev);
		if (new_backup->br != p->br)
			return -EINVAL;
	}

	/* a port cannot be its own backup */
	if (new_backup == p)
		return -EINVAL;

	if (new_backup != prev_backup) {
		/* move the reference from the old backup to the new one */
		if (prev_backup)
			prev_backup->backup_redirected_cnt--;
		if (new_backup)
			new_backup->backup_redirected_cnt++;

		rcu_assign_pointer(p->backup_port, new_backup);
	}

	return 0;
}
static void nbp_backup_clear(struct net_bridge_port *p)
{
nbp_backup_change(p, NULL);
if (p->backup_redirected_cnt) {
struct net_bridge_port *cur_p;
list_for_each_entry(cur_p, &p->br->port_list, list) {
struct net_bridge_port *backup_p;
backup_p = rtnl_dereference(cur_p->backup_port);
if (backup_p == p)
nbp_backup_change(cur_p, NULL);
}
}
WARN_ON(rcu_access_pointer(p->backup_port) || p->backup_redirected_cnt);
}
static void nbp_update_port_count(struct net_bridge *br)
{
struct net_bridge_port *p;
......@@ -295,6 +347,7 @@ static void del_nbp(struct net_bridge_port *p)
nbp_vlan_flush(p);
br_fdb_delete_by_port(br, p, 0, 1);
switchdev_deferred_process();
nbp_backup_clear(p);
nbp_update_port_count(br);
......
......@@ -169,13 +169,15 @@ static inline size_t br_nlmsg_size(struct net_device *dev, u32 filter_mask)
+ nla_total_size(1) /* IFLA_OPERSTATE */
+ nla_total_size(br_port_info_size()) /* IFLA_PROTINFO */
+ nla_total_size(br_get_link_af_size_filtered(dev,
filter_mask)); /* IFLA_AF_SPEC */
filter_mask)) /* IFLA_AF_SPEC */
+ nla_total_size(4); /* IFLA_BRPORT_BACKUP_PORT */
}
static int br_port_fill_attrs(struct sk_buff *skb,
const struct net_bridge_port *p)
{
u8 mode = !!(p->flags & BR_HAIRPIN_MODE);
struct net_bridge_port *backup_p;
u64 timerval;
if (nla_put_u8(skb, IFLA_BRPORT_STATE, p->state) ||
......@@ -237,6 +239,14 @@ static int br_port_fill_attrs(struct sk_buff *skb,
return -EMSGSIZE;
#endif
/* we might be called only with br->lock */
rcu_read_lock();
backup_p = rcu_dereference(p->backup_port);
if (backup_p)
nla_put_u32(skb, IFLA_BRPORT_BACKUP_PORT,
backup_p->dev->ifindex);
rcu_read_unlock();
return 0;
}
......@@ -663,6 +673,7 @@ static const struct nla_policy br_port_policy[IFLA_BRPORT_MAX + 1] = {
[IFLA_BRPORT_GROUP_FWD_MASK] = { .type = NLA_U16 },
[IFLA_BRPORT_NEIGH_SUPPRESS] = { .type = NLA_U8 },
[IFLA_BRPORT_ISOLATED] = { .type = NLA_U8 },
[IFLA_BRPORT_BACKUP_PORT] = { .type = NLA_U32 },
};
/* Change the state of the port and notify spanning tree */
......@@ -817,6 +828,23 @@ static int br_setport(struct net_bridge_port *p, struct nlattr *tb[])
if (err)
return err;
if (tb[IFLA_BRPORT_BACKUP_PORT]) {
struct net_device *backup_dev = NULL;
u32 backup_ifindex;
backup_ifindex = nla_get_u32(tb[IFLA_BRPORT_BACKUP_PORT]);
if (backup_ifindex) {
backup_dev = __dev_get_by_index(dev_net(p->dev),
backup_ifindex);
if (!backup_dev)
return -ENOENT;
}
err = nbp_backup_change(p, backup_dev);
if (err)
return err;
}
br_port_flags_change(p, old_flags ^ p->flags);
return 0;
}
......
......@@ -237,6 +237,7 @@ struct net_bridge_port {
#ifdef CONFIG_BRIDGE_VLAN_FILTERING
struct net_bridge_vlan_group __rcu *vlgrp;
#endif
struct net_bridge_port __rcu *backup_port;
/* STP */
u8 priority;
......@@ -281,6 +282,7 @@ struct net_bridge_port {
int offload_fwd_mark;
#endif
u16 group_fwd_mask;
u16 backup_redirected_cnt;
};
#define kobj_to_brport(obj) container_of(obj, struct net_bridge_port, kobj)
......@@ -597,6 +599,7 @@ netdev_features_t br_features_recompute(struct net_bridge *br,
netdev_features_t features);
void br_port_flags_change(struct net_bridge_port *port, unsigned long mask);
void br_manage_promisc(struct net_bridge *br);
int nbp_backup_change(struct net_bridge_port *p, struct net_device *backup_dev);
/* br_input.c */
int br_handle_frame_finish(struct net *net, struct sock *sk, struct sk_buff *skb);
......
......@@ -25,6 +25,15 @@ struct brport_attribute {
struct attribute attr;
ssize_t (*show)(struct net_bridge_port *, char *);
int (*store)(struct net_bridge_port *, unsigned long);
int (*store_raw)(struct net_bridge_port *, char *);
};
/* Define a bridge port attribute whose store handler receives the written
 * text as a char * (store_raw) instead of a parsed unsigned long (store).
 * Expands to a const struct brport_attribute named brport_attr_<_name>
 * with .show = _show and .store_raw = _store; .store is left unset.
 */
#define BRPORT_ATTR_RAW(_name, _mode, _show, _store) \
const struct brport_attribute brport_attr_##_name = { \
.attr = {.name = __stringify(_name), \
.mode = _mode }, \
.show = _show, \
.store_raw = _store, \
};
#define BRPORT_ATTR(_name, _mode, _show, _store) \
......@@ -182,6 +191,38 @@ static int store_group_fwd_mask(struct net_bridge_port *p,
static BRPORT_ATTR(group_fwd_mask, 0644, show_group_fwd_mask,
store_group_fwd_mask);
/* sysfs show handler: write the name of @p's backup port device followed
 * by a newline into @buf. Returns the number of bytes written, or 0 when
 * no backup port is set. The backup pointer is read under
 * rcu_read_lock().
 */
static ssize_t show_backup_port(struct net_bridge_port *p, char *buf)
{
	struct net_bridge_port *backup;
	int len;

	rcu_read_lock();
	backup = rcu_dereference(p->backup_port);
	len = backup ? sprintf(buf, "%s\n", backup->dev->name) : 0;
	rcu_read_unlock();

	return len;
}
/* sysfs raw store handler: set @p's backup port from a device name.
 *
 * @buf is modified in place: the first newline, if any, is replaced by a
 * terminating NUL. An empty name clears the backup port. Returns -ENOENT
 * when no device with that name exists in @p's network namespace,
 * otherwise the result of nbp_backup_change().
 */
static int store_backup_port(struct net_bridge_port *p, char *buf)
{
	struct net_device *dev = NULL;
	char *eol;

	/* cut the string at the first newline */
	eol = strchr(buf, '\n');
	if (eol)
		*eol = '\0';

	if (buf[0] != '\0') {
		dev = __dev_get_by_name(dev_net(p->dev), buf);
		if (!dev)
			return -ENOENT;
	}

	return nbp_backup_change(p, dev);
}
static BRPORT_ATTR_RAW(backup_port, 0644, show_backup_port, store_backup_port);
BRPORT_ATTR_FLAG(hairpin_mode, BR_HAIRPIN_MODE);
BRPORT_ATTR_FLAG(bpdu_guard, BR_BPDU_GUARD);
BRPORT_ATTR_FLAG(root_block, BR_ROOT_BLOCK);
......@@ -245,6 +286,7 @@ static const struct brport_attribute *brport_attrs[] = {
&brport_attr_group_fwd_mask,
&brport_attr_neigh_suppress,
&brport_attr_isolated,
&brport_attr_backup_port,
NULL
};
......@@ -269,27 +311,46 @@ static ssize_t brport_store(struct kobject *kobj,
struct brport_attribute *brport_attr = to_brport_attr(attr);
struct net_bridge_port *p = kobj_to_brport(kobj);
ssize_t ret = -EINVAL;
char *endp;
unsigned long val;
char *endp;
if (!ns_capable(dev_net(p->dev)->user_ns, CAP_NET_ADMIN))
return -EPERM;
val = simple_strtoul(buf, &endp, 0);
if (endp != buf) {
if (!rtnl_trylock())
return restart_syscall();
if (p->dev && p->br && brport_attr->store) {
spin_lock_bh(&p->br->lock);
ret = brport_attr->store(p, val);
spin_unlock_bh(&p->br->lock);
if (!ret) {
br_ifinfo_notify(RTM_NEWLINK, NULL, p);
ret = count;
}
if (!rtnl_trylock())
return restart_syscall();
if (!p->dev || !p->br)
goto out_unlock;
if (brport_attr->store_raw) {
	char *buf_copy;

	/* Raw handlers may modify the string (e.g. strip the trailing
	 * newline), so hand them a private, NUL-terminated copy.
	 */
	buf_copy = kstrndup(buf, count, GFP_KERNEL);
	if (!buf_copy) {
		ret = -ENOMEM;
		goto out_unlock;
	}
	/* RTNL must stay held across the handler: nbp_backup_change()
	 * asserts it, and out_unlock releases it exactly once. Dropping
	 * it here would run the handler unlocked and unlock twice.
	 */
	spin_lock_bh(&p->br->lock);
	ret = brport_attr->store_raw(p, buf_copy);
	spin_unlock_bh(&p->br->lock);
	kfree(buf_copy);
} else if (brport_attr->store) {
	/* Numeric attributes: reject input with no leading number. */
	val = simple_strtoul(buf, &endp, 0);
	if (endp == buf)
		goto out_unlock;
	spin_lock_bh(&p->br->lock);
	ret = brport_attr->store(p, val);
	spin_unlock_bh(&p->br->lock);
}
if (!ret) {
br_ifinfo_notify(RTM_NEWLINK, NULL, p);
ret = count;
}
out_unlock:
rtnl_unlock();
return ret;
}
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment