Commit fabe7bed authored by David S. Miller's avatar David S. Miller

Merge branch 'l3_hw_offload'

Scott Feldman says:

====================
switchdev: add IPv4 routing offload

v4:

  - Add NETIF_F_NETNS_LOCAL to rocker port feature list to keep rocker
    ports in the default netns.  Rocker hardware can't be partitioned
    to support multiple namespaces, currently.  It would be interesting
    to add netns support to rocker device by basically adding another
    match field to each table to match on some unique netns ID, with
    a port knowing its netns ID.  Future work TBD.
  - Up-level the RTNH_F_EXTERNAL marking of routes installed to offload
    device from driver to switchdev common code.  Now driver can't skip
    routes.  Either it can install the route or it cannot.  Yes or No.
    If no on any route, all offloading is aborted by removing routes
    from offload device and setting ipv4.fib_offload_disabled so no more
    routes can be offloaded.  This is harsh, but it's our starting point.
    We can refine the policies in follow-up work.
  - Add new net.ipv4.fib_offload_disabled bool that is set if anything
    goes wrong with route offloading.  We can refine this later to make
    the setting per-device or per-device-port-netdev, but let's start
    here simple and refine in follow-up work.
  - Rebase against Alex's latest FIB changes.  I think I did everything
    correctly, and didn't run into any issues with testing, but I'd like
    Alex to look over the changes and maybe follow-up with any cleanups.

v3:

Changes based on v2 review comments:

  - Move check for custom rules up earlier in patch set, to keep git bisect
    safe.
  - Simplify the route add/modify failure handling to simple try until
    failure, and then on failure, undo everything.  The switchdev driver
    will return err when route can normally be installed to device, but
    the install fails for one reason or another (no space left on device,
    etc).  If a failure happens, uninstall all routes from the device,
    punting forwarding for all routes back to the kernel.
  - Scan route's full nexthop list, ensuring all nexthop devs belong
    to the same switchdev device, otherwise don't try to install route
    to device.

v2:

Changes based on v1 review comments and discussions at netconf:

  - Allow route modification, but use same ndo op used for adding route.
    Driver/device is expected to modify route in-place, if it can, to avoid
    interruption of service.
  - Add new RTNH_F_EXTERNAL flag to mark FIB entries offloaded externally.
  - Don't offload routes if using custom IP rules.  If routes are already
    offloaded, and custom IP rules are turned on, flush routes from offload
    device.  (Offloaded routes are marked with RTNH_F_EXTERNAL).
  - Use kernel's neigh resolution code to resolve route's nexthops' neigh
    MAC addrs.  (Thanks davem, works great!).
  - Use fib->fib_priority in rocker driver to give priorities to routes in
    OF-DPA unicast route table.

v1:

This patch set adds L3 routing offload support for IPv4 routes.  The idea is to
mirror routes installed in the kernel's FIB down to a hardware switch device to
offload the data forwarding path for L3.  Only the data forwarding path is
intercepted.  Control and management of the kernel's FIB remains with the
kernel.
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 24d2e4a5 c1beeef7
This diff is collapsed.
...@@ -768,6 +768,8 @@ struct netdev_phys_item_id { ...@@ -768,6 +768,8 @@ struct netdev_phys_item_id {
typedef u16 (*select_queue_fallback_t)(struct net_device *dev, typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
struct sk_buff *skb); struct sk_buff *skb);
struct fib_info;
/* /*
* This structure defines the management hooks for network devices. * This structure defines the management hooks for network devices.
* The following hooks can be defined; unless noted otherwise, they are * The following hooks can be defined; unless noted otherwise, they are
...@@ -1031,6 +1033,14 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev, ...@@ -1031,6 +1033,14 @@ typedef u16 (*select_queue_fallback_t)(struct net_device *dev,
* int (*ndo_switch_port_stp_update)(struct net_device *dev, u8 state); * int (*ndo_switch_port_stp_update)(struct net_device *dev, u8 state);
* Called to notify switch device port of bridge port STP * Called to notify switch device port of bridge port STP
* state change. * state change.
* int (*ndo_switch_fib_ipv4_add)(struct net_device *dev, __be32 dst,
* int dst_len, struct fib_info *fi,
* u8 tos, u8 type, u32 tb_id);
* Called to add/modify IPv4 route to switch device.
* int (*ndo_switch_fib_ipv4_del)(struct net_device *dev, __be32 dst,
* int dst_len, struct fib_info *fi,
* u8 tos, u8 type, u32 tb_id);
* Called to delete IPv4 route from switch device.
*/ */
struct net_device_ops { struct net_device_ops {
int (*ndo_init)(struct net_device *dev); int (*ndo_init)(struct net_device *dev);
...@@ -1192,6 +1202,18 @@ struct net_device_ops { ...@@ -1192,6 +1202,18 @@ struct net_device_ops {
struct netdev_phys_item_id *psid); struct netdev_phys_item_id *psid);
int (*ndo_switch_port_stp_update)(struct net_device *dev, int (*ndo_switch_port_stp_update)(struct net_device *dev,
u8 state); u8 state);
int (*ndo_switch_fib_ipv4_add)(struct net_device *dev,
__be32 dst,
int dst_len,
struct fib_info *fi,
u8 tos, u8 type,
u32 tb_id);
int (*ndo_switch_fib_ipv4_del)(struct net_device *dev,
__be32 dst,
int dst_len,
struct fib_info *fi,
u8 tos, u8 type,
u32 tb_id);
#endif #endif
}; };
......
...@@ -196,6 +196,7 @@ int fib_table_delete(struct fib_table *, struct fib_config *); ...@@ -196,6 +196,7 @@ int fib_table_delete(struct fib_table *, struct fib_config *);
int fib_table_dump(struct fib_table *table, struct sk_buff *skb, int fib_table_dump(struct fib_table *table, struct sk_buff *skb,
struct netlink_callback *cb); struct netlink_callback *cb);
int fib_table_flush(struct fib_table *table); int fib_table_flush(struct fib_table *table);
void fib_table_flush_external(struct fib_table *table);
void fib_free_table(struct fib_table *tb); void fib_free_table(struct fib_table *tb);
...@@ -308,6 +309,7 @@ static inline int fib_num_tclassid_users(struct net *net) ...@@ -308,6 +309,7 @@ static inline int fib_num_tclassid_users(struct net *net)
return 0; return 0;
} }
#endif #endif
void fib_flush_external(struct net *net);
/* Exported by fib_semantics.c */ /* Exported by fib_semantics.c */
int ip_fib_check_default(__be32 gw, struct net_device *dev); int ip_fib_check_default(__be32 gw, struct net_device *dev);
......
...@@ -47,6 +47,7 @@ struct netns_ipv4 { ...@@ -47,6 +47,7 @@ struct netns_ipv4 {
int fib_num_tclassid_users; int fib_num_tclassid_users;
#endif #endif
struct hlist_head *fib_table_hash; struct hlist_head *fib_table_hash;
bool fib_offload_disabled;
struct sock *fibnl; struct sock *fibnl;
struct sock * __percpu *icmp_sk; struct sock * __percpu *icmp_sk;
......
...@@ -51,6 +51,12 @@ int ndo_dflt_netdev_switch_port_bridge_dellink(struct net_device *dev, ...@@ -51,6 +51,12 @@ int ndo_dflt_netdev_switch_port_bridge_dellink(struct net_device *dev,
struct nlmsghdr *nlh, u16 flags); struct nlmsghdr *nlh, u16 flags);
int ndo_dflt_netdev_switch_port_bridge_setlink(struct net_device *dev, int ndo_dflt_netdev_switch_port_bridge_setlink(struct net_device *dev,
struct nlmsghdr *nlh, u16 flags); struct nlmsghdr *nlh, u16 flags);
int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi,
u8 tos, u8 type, u32 tb_id);
int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi,
u8 tos, u8 type, u32 tb_id);
void netdev_switch_fib_ipv4_abort(struct fib_info *fi);
#else #else
static inline int netdev_switch_parent_id_get(struct net_device *dev, static inline int netdev_switch_parent_id_get(struct net_device *dev,
...@@ -109,6 +115,24 @@ static inline int ndo_dflt_netdev_switch_port_bridge_setlink(struct net_device * ...@@ -109,6 +115,24 @@ static inline int ndo_dflt_netdev_switch_port_bridge_setlink(struct net_device *
return 0; return 0;
} }
/* No-op variant supplied by the #else branch of this header: no route is
 * handed to any device and success (0) is always reported.
 */
static inline int
netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi,
			   u8 tos, u8 type, u32 tb_id)
{
	return 0;
}
/* No-op variant supplied by the #else branch of this header: there is
 * nothing to remove, so success (0) is always reported.
 */
static inline int
netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi,
			   u8 tos, u8 type, u32 tb_id)
{
	return 0;
}
/* No-op variant supplied by the #else branch of this header.
 *
 * Must be static inline like the sibling stubs: a plain function
 * definition in a header is emitted as an external symbol by every
 * file that includes it, which breaks the link with duplicate
 * definitions.
 */
static inline void netdev_switch_fib_ipv4_abort(struct fib_info *fi)
{
}
#endif #endif
#endif /* _LINUX_SWITCHDEV_H_ */ #endif /* _LINUX_SWITCHDEV_H_ */
...@@ -334,6 +334,7 @@ struct rtnexthop { ...@@ -334,6 +334,7 @@ struct rtnexthop {
#define RTNH_F_DEAD 1 /* Nexthop is dead (used by multipath) */ #define RTNH_F_DEAD 1 /* Nexthop is dead (used by multipath) */
#define RTNH_F_PERVASIVE 2 /* Do recursive gateway lookup */ #define RTNH_F_PERVASIVE 2 /* Do recursive gateway lookup */
#define RTNH_F_ONLINK 4 /* Gateway is forced on link */ #define RTNH_F_ONLINK 4 /* Gateway is forced on link */
#define RTNH_F_EXTERNAL 8 /* Route installed externally */
/* Macros to handle hexthops */ /* Macros to handle hexthops */
......
...@@ -144,6 +144,19 @@ static void fib_flush(struct net *net) ...@@ -144,6 +144,19 @@ static void fib_flush(struct net *net)
rt_cache_flush(net); rt_cache_flush(net);
} }
/* Visit every bucket of @net's IPv4 FIB table hash and run
 * fib_table_flush_external() on each table chained there.
 */
void fib_flush_external(struct net *net)
{
	unsigned int bucket;

	for (bucket = 0; bucket < FIB_TABLE_HASHSZ; bucket++) {
		struct hlist_head *chain = &net->ipv4.fib_table_hash[bucket];
		struct fib_table *table;

		hlist_for_each_entry(table, chain, tb_hlist)
			fib_table_flush_external(table);
	}
}
/* /*
* Find address type as if only "dev" was present in the system. If * Find address type as if only "dev" was present in the system. If
* on_dev is NULL then all interfaces are taken into consideration. * on_dev is NULL then all interfaces are taken into consideration.
......
...@@ -209,6 +209,8 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb, ...@@ -209,6 +209,8 @@ static int fib4_rule_configure(struct fib_rule *rule, struct sk_buff *skb,
rule4->tos = frh->tos; rule4->tos = frh->tos;
net->ipv4.fib_has_custom_rules = true; net->ipv4.fib_has_custom_rules = true;
fib_flush_external(rule->fr_net);
err = 0; err = 0;
errout: errout:
return err; return err;
...@@ -224,6 +226,7 @@ static void fib4_rule_delete(struct fib_rule *rule) ...@@ -224,6 +226,7 @@ static void fib4_rule_delete(struct fib_rule *rule)
net->ipv4.fib_num_tclassid_users--; net->ipv4.fib_num_tclassid_users--;
#endif #endif
net->ipv4.fib_has_custom_rules = true; net->ipv4.fib_has_custom_rules = true;
fib_flush_external(rule->fr_net);
} }
static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh, static int fib4_rule_compare(struct fib_rule *rule, struct fib_rule_hdr *frh,
......
...@@ -79,6 +79,7 @@ ...@@ -79,6 +79,7 @@
#include <net/tcp.h> #include <net/tcp.h>
#include <net/sock.h> #include <net/sock.h>
#include <net/ip_fib.h> #include <net/ip_fib.h>
#include <net/switchdev.h>
#include "fib_lookup.h" #include "fib_lookup.h"
#define MAX_STAT_DEPTH 32 #define MAX_STAT_DEPTH 32
...@@ -1135,7 +1136,18 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) ...@@ -1135,7 +1136,18 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
new_fa->fa_state = state & ~FA_S_ACCESSED; new_fa->fa_state = state & ~FA_S_ACCESSED;
new_fa->fa_slen = fa->fa_slen; new_fa->fa_slen = fa->fa_slen;
err = netdev_switch_fib_ipv4_add(key, plen, fi,
new_fa->fa_tos,
cfg->fc_type,
tb->tb_id);
if (err) {
netdev_switch_fib_ipv4_abort(fi);
kmem_cache_free(fn_alias_kmem, new_fa);
goto out;
}
hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list); hlist_replace_rcu(&fa->fa_list, &new_fa->fa_list);
alias_free_mem_rcu(fa); alias_free_mem_rcu(fa);
fib_release_info(fi_drop); fib_release_info(fi_drop);
...@@ -1171,10 +1183,18 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) ...@@ -1171,10 +1183,18 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
new_fa->fa_state = 0; new_fa->fa_state = 0;
new_fa->fa_slen = slen; new_fa->fa_slen = slen;
/* (Optionally) offload fib entry to switch hardware. */
err = netdev_switch_fib_ipv4_add(key, plen, fi, tos,
cfg->fc_type, tb->tb_id);
if (err) {
netdev_switch_fib_ipv4_abort(fi);
goto out_free_new_fa;
}
/* Insert new entry to the list. */ /* Insert new entry to the list. */
err = fib_insert_alias(t, tp, l, new_fa, fa, key); err = fib_insert_alias(t, tp, l, new_fa, fa, key);
if (err) if (err)
goto out_free_new_fa; goto out_sw_fib_del;
if (!plen) if (!plen)
tb->tb_num_default++; tb->tb_num_default++;
...@@ -1185,6 +1205,8 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg) ...@@ -1185,6 +1205,8 @@ int fib_table_insert(struct fib_table *tb, struct fib_config *cfg)
succeeded: succeeded:
return 0; return 0;
out_sw_fib_del:
netdev_switch_fib_ipv4_del(key, plen, fi, tos, cfg->fc_type, tb->tb_id);
out_free_new_fa: out_free_new_fa:
kmem_cache_free(fn_alias_kmem, new_fa); kmem_cache_free(fn_alias_kmem, new_fa);
out: out:
...@@ -1456,6 +1478,9 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg) ...@@ -1456,6 +1478,9 @@ int fib_table_delete(struct fib_table *tb, struct fib_config *cfg)
if (!fa_to_delete) if (!fa_to_delete)
return -ESRCH; return -ESRCH;
netdev_switch_fib_ipv4_del(key, plen, fa_to_delete->fa_info, tos,
cfg->fc_type, tb->tb_id);
rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id, rtmsg_fib(RTM_DELROUTE, htonl(key), fa_to_delete, plen, tb->tb_id,
&cfg->fc_nlinfo, 0); &cfg->fc_nlinfo, 0);
...@@ -1536,6 +1561,67 @@ static struct tnode *leaf_walk_rcu(struct tnode **tn, t_key key) ...@@ -1536,6 +1561,67 @@ static struct tnode *leaf_walk_rcu(struct tnode **tn, t_key key)
return n; return n;
} }
/* Walk every leaf of @tb's trie and, for each alias whose fib_info is
 * flagged RTNH_F_EXTERNAL, call netdev_switch_fib_ipv4_del() for it.
 * The aliases themselves are left in the table; nothing is unlinked or
 * freed here.
 *
 * Caller must hold RTNL.
 */
void fib_table_flush_external(struct fib_table *tb)
{
	struct trie *t = (struct trie *)tb->tb_data;
	struct fib_alias *fa;
	struct tnode *n, *pn;
	unsigned long cindex;

	n = rcu_dereference(t->trie);
	if (!n)
		return;

	pn = NULL;
	cindex = 0;

	while (IS_TNODE(n)) {
		/* record pn and cindex for leaf walking */
		pn = n;
		cindex = 1ul << n->bits;
backtrace:
		/* walk trie in reverse order */
		do {
			while (!(cindex--)) {
				t_key pkey = pn->key;

				n = pn;
				pn = node_parent(n);

				/* resize completed node */
				resize(t, n);

				/* if we got the root we are done */
				if (!pn)
					return;

				cindex = get_index(pkey, pn);
			}

			/* grab the next available node */
			n = tnode_get_child(pn, cindex);
		} while (!n);
	}

	/* n is a leaf here: examine each alias hanging off it */
	hlist_for_each_entry(fa, &n->leaf, fa_list) {
		struct fib_info *fi = fa->fa_info;

		if (fi && (fi->fib_flags & RTNH_F_EXTERNAL)) {
			netdev_switch_fib_ipv4_del(n->key,
						   KEYLENGTH - fa->fa_slen,
						   fi, fa->fa_tos,
						   fa->fa_type, tb->tb_id);
		}
	}

	/* if trie is leaf only loop is completed */
	if (pn)
		goto backtrace;
}
/* Caller must hold RTNL. */ /* Caller must hold RTNL. */
int fib_table_flush(struct fib_table *tb) int fib_table_flush(struct fib_table *tb)
{ {
...@@ -1589,6 +1675,10 @@ int fib_table_flush(struct fib_table *tb) ...@@ -1589,6 +1675,10 @@ int fib_table_flush(struct fib_table *tb)
struct fib_info *fi = fa->fa_info; struct fib_info *fi = fa->fa_info;
if (fi && (fi->fib_flags & RTNH_F_DEAD)) { if (fi && (fi->fib_flags & RTNH_F_DEAD)) {
netdev_switch_fib_ipv4_del(n->key,
KEYLENGTH - fa->fa_slen,
fi, fa->fa_tos,
fa->fa_type, tb->tb_id);
hlist_del_rcu(&fa->fa_list); hlist_del_rcu(&fa->fa_list);
fib_release_info(fa->fa_info); fib_release_info(fa->fa_info);
alias_free_mem_rcu(fa); alias_free_mem_rcu(fa);
......
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
#include <linux/mutex.h> #include <linux/mutex.h>
#include <linux/notifier.h> #include <linux/notifier.h>
#include <linux/netdevice.h> #include <linux/netdevice.h>
#include <net/ip_fib.h>
#include <net/switchdev.h> #include <net/switchdev.h>
/** /**
...@@ -225,3 +226,163 @@ int ndo_dflt_netdev_switch_port_bridge_dellink(struct net_device *dev, ...@@ -225,3 +226,163 @@ int ndo_dflt_netdev_switch_port_bridge_dellink(struct net_device *dev,
return ret; return ret;
} }
EXPORT_SYMBOL(ndo_dflt_netdev_switch_port_bridge_dellink); EXPORT_SYMBOL(ndo_dflt_netdev_switch_port_bridge_dellink);
/* Recursively search @dev and the devices below it for a switch port
 * device, i.e. one that has NETIF_F_HW_SWITCH_OFFLOAD set and provides
 * ndo_switch_parent_id_get.  Returns the first such device found, or
 * NULL when there is none.
 */
static struct net_device *netdev_switch_get_lowest_dev(struct net_device *dev)
{
	struct net_device *lower;
	struct list_head *iter;

	/* @dev itself qualifies: no need to descend any further. */
	if ((dev->features & NETIF_F_HW_SWITCH_OFFLOAD) &&
	    dev->netdev_ops->ndo_switch_parent_id_get)
		return dev;

	netdev_for_each_lower_dev(dev, lower, iter) {
		struct net_device *found = netdev_switch_get_lowest_dev(lower);

		if (found)
			return found;
	}

	return NULL;
}
static struct net_device *netdev_switch_get_dev_by_nhs(struct fib_info *fi)
{
struct netdev_phys_item_id psid;
struct netdev_phys_item_id prev_psid;
struct net_device *dev = NULL;
int nhsel;
/* For this route, all nexthop devs must be on the same switch. */
for (nhsel = 0; nhsel < fi->fib_nhs; nhsel++) {
const struct fib_nh *nh = &fi->fib_nh[nhsel];
if (!nh->nh_dev)
return NULL;
dev = netdev_switch_get_lowest_dev(nh->nh_dev);
if (!dev)
return NULL;
if (netdev_switch_parent_id_get(dev, &psid))
return NULL;
if (nhsel > 0) {
if (prev_psid.id_len != psid.id_len)
return NULL;
if (memcmp(prev_psid.id, psid.id, psid.id_len))
return NULL;
}
prev_psid = psid;
}
return dev;
}
/**
 *	netdev_switch_fib_ipv4_add - Add IPv4 route entry to switch
 *
 *	@dst: route's IPv4 destination address
 *	@dst_len: destination address length (prefix length)
 *	@fi: route FIB info structure
 *	@tos: route TOS
 *	@type: route type
 *	@tb_id: route table ID
 *
 *	Add IPv4 route entry to switch device.
 *
 *	Returns 0 when the route was installed (in which case @fi is
 *	flagged RTNH_F_EXTERNAL) and also when no offload was attempted:
 *	custom rules in use, offloading disabled, no suitable switch
 *	device for the nexthops, or the device has no add hook.  Otherwise
 *	returns the error reported by the device's
 *	ndo_switch_fib_ipv4_add hook.
 */
int netdev_switch_fib_ipv4_add(u32 dst, int dst_len, struct fib_info *fi,
			       u8 tos, u8 type, u32 tb_id)
{
	struct net_device *dev;
	const struct net_device_ops *ops;
	int err = 0;

	/* Don't offload route if using custom ip rules or if
	 * IPv4 FIB offloading has been disabled completely.
	 *
	 * NOTE(review): fib_has_custom_rules may only exist when
	 * CONFIG_IP_MULTIPLE_TABLES is set - confirm against struct
	 * netns_ipv4 and guard this test if so.
	 */
	if (fi->fib_net->ipv4.fib_has_custom_rules ||
	    fi->fib_net->ipv4.fib_offload_disabled)
		return 0;

	dev = netdev_switch_get_dev_by_nhs(fi);
	if (!dev)
		return 0;
	ops = dev->netdev_ops;

	if (ops->ndo_switch_fib_ipv4_add) {
		/* @dst is converted with htonl() for the hook's __be32 argument */
		err = ops->ndo_switch_fib_ipv4_add(dev, htonl(dst), dst_len,
						   fi, tos, type, tb_id);
		if (!err)
			fi->fib_flags |= RTNH_F_EXTERNAL;
	}

	return err;
}
EXPORT_SYMBOL(netdev_switch_fib_ipv4_add);
/**
 *	netdev_switch_fib_ipv4_del - Delete IPv4 route entry from switch
 *
 *	@dst: route's IPv4 destination address
 *	@dst_len: destination address length (prefix length)
 *	@fi: route FIB info structure
 *	@tos: route TOS
 *	@type: route type
 *	@tb_id: route table ID
 *
 *	Delete IPv4 route entry from switch device.  Routes not flagged
 *	RTNH_F_EXTERNAL are ignored.  The flag is cleared only after the
 *	device's delete hook reports success.
 */
int netdev_switch_fib_ipv4_del(u32 dst, int dst_len, struct fib_info *fi,
			       u8 tos, u8 type, u32 tb_id)
{
	const struct net_device_ops *ops;
	struct net_device *dev;
	int err;

	/* Nothing to do for routes that were never offloaded. */
	if (!(fi->fib_flags & RTNH_F_EXTERNAL))
		return 0;

	dev = netdev_switch_get_dev_by_nhs(fi);
	if (!dev)
		return 0;

	ops = dev->netdev_ops;
	if (!ops->ndo_switch_fib_ipv4_del)
		return 0;

	err = ops->ndo_switch_fib_ipv4_del(dev, htonl(dst), dst_len,
					   fi, tos, type, tb_id);
	if (err)
		return err;

	fi->fib_flags &= ~RTNH_F_EXTERNAL;
	return 0;
}
EXPORT_SYMBOL(netdev_switch_fib_ipv4_del);
/**
* netdev_switch_fib_ipv4_abort - Abort an IPv4 FIB operation
*
* @fi: route FIB info structure
*/
void netdev_switch_fib_ipv4_abort(struct fib_info *fi)
{
/* There was a problem installing this route to the offload
* device. For now, until we come up with more refined
* policy handling, abruptly end IPv4 fib offloading for
* for entire net by flushing offload device(s) of all
* IPv4 routes, and mark IPv4 fib offloading broken from
* this point forward.
*/
fib_flush_external(fi->fib_net);
fi->fib_net->ipv4.fib_offload_disabled = true;
}
EXPORT_SYMBOL(netdev_switch_fib_ipv4_abort);
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment