Commit d2e42a17 authored by David S. Miller's avatar David S. Miller

Merge branch 'ndo_set_rx_headroom'

Paolo Abeni says:

====================
bridge/ovs: avoid skb head copy on frame forwarding

Currently, while when an OVS or Linux bridge is used to forward frames towards
some tunnel device, a skb_head_copy() may occur if the ingress device do not
provide enough headroom for the tx encapsulation.

This patch series tries to address the issue implementing a new ndo operation to
allow the master device to control the headroom used when allocating the skb on
frame reception.

Said operation is used by the Linux bridge to notify the bridged ports of
needed_headroom changes, and similar bookkeeping and behaviour is also added to
openvswitch, on a per datapath basis.

Finally, the operation is implemented for veth and tun device, which give
performance improvement in the 6-12% range when forwarding frames from said
devices towards a vxlan tunnel.

v2:
- fix netdev_get_fwd_headroom() behaviour
- remove some code duplication with the netdev_set_rx_headroom() and
   netdev_reset_rx_headroom() helpers
- handle headroom reset on [v]port removal/deletion
- initialize tun align to the old default value

v3:
- fix a comment typo
====================
Acked-by: default avatarPravin B Shelar <pshelar@ovn.org>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 46d5efa9 163e5292
......@@ -187,6 +187,7 @@ struct tun_struct {
#define TUN_USER_FEATURES (NETIF_F_HW_CSUM|NETIF_F_TSO_ECN|NETIF_F_TSO| \
NETIF_F_TSO6|NETIF_F_UFO)
int align;
int vnet_hdr_sz;
int sndbuf;
struct tap_filter txflt;
......@@ -934,6 +935,17 @@ static void tun_poll_controller(struct net_device *dev)
return;
}
#endif
static void tun_set_headroom(struct net_device *dev, int new_hr)
{
struct tun_struct *tun = netdev_priv(dev);
if (new_hr < NET_SKB_PAD)
new_hr = NET_SKB_PAD;
tun->align = new_hr;
}
static const struct net_device_ops tun_netdev_ops = {
.ndo_uninit = tun_net_uninit,
.ndo_open = tun_net_open,
......@@ -945,6 +957,7 @@ static const struct net_device_ops tun_netdev_ops = {
#ifdef CONFIG_NET_POLL_CONTROLLER
.ndo_poll_controller = tun_poll_controller,
#endif
.ndo_set_rx_headroom = tun_set_headroom,
};
static const struct net_device_ops tap_netdev_ops = {
......@@ -962,6 +975,7 @@ static const struct net_device_ops tap_netdev_ops = {
.ndo_poll_controller = tun_poll_controller,
#endif
.ndo_features_check = passthru_features_check,
.ndo_set_rx_headroom = tun_set_headroom,
};
static void tun_flow_init(struct tun_struct *tun)
......@@ -1086,7 +1100,7 @@ static ssize_t tun_get_user(struct tun_struct *tun, struct tun_file *tfile,
struct tun_pi pi = { 0, cpu_to_be16(ETH_P_IP) };
struct sk_buff *skb;
size_t total_len = iov_iter_count(from);
size_t len = total_len, align = NET_SKB_PAD, linear;
size_t len = total_len, align = tun->align, linear;
struct virtio_net_hdr gso = { 0 };
int good_linear;
int copylen;
......@@ -1694,6 +1708,7 @@ static int tun_set_iff(struct net *net, struct file *file, struct ifreq *ifr)
tun->txflt.count = 0;
tun->vnet_hdr_sz = sizeof(struct virtio_net_hdr);
tun->align = NET_SKB_PAD;
tun->filter_attached = false;
tun->sndbuf = tfile->socket.sk->sk_sndbuf;
......
......@@ -35,6 +35,7 @@ struct pcpu_vstats {
struct veth_priv {
struct net_device __rcu *peer;
atomic64_t dropped;
unsigned requested_headroom;
};
/*
......@@ -271,6 +272,29 @@ static int veth_get_iflink(const struct net_device *dev)
return iflink;
}
static void veth_set_rx_headroom(struct net_device *dev, int new_hr)
{
struct veth_priv *peer_priv, *priv = netdev_priv(dev);
struct net_device *peer;
if (new_hr < 0)
new_hr = 0;
rcu_read_lock();
peer = rcu_dereference(priv->peer);
if (unlikely(!peer))
goto out;
peer_priv = netdev_priv(peer);
priv->requested_headroom = new_hr;
new_hr = max(priv->requested_headroom, peer_priv->requested_headroom);
dev->needed_headroom = new_hr;
peer->needed_headroom = new_hr;
out:
rcu_read_unlock();
}
static const struct net_device_ops veth_netdev_ops = {
.ndo_init = veth_dev_init,
.ndo_open = veth_open,
......@@ -285,6 +309,7 @@ static const struct net_device_ops veth_netdev_ops = {
#endif
.ndo_get_iflink = veth_get_iflink,
.ndo_features_check = passthru_features_check,
.ndo_set_rx_headroom = veth_set_rx_headroom,
};
#define VETH_FEATURES (NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_ALL_TSO | \
......@@ -301,6 +326,7 @@ static void veth_setup(struct net_device *dev)
dev->priv_flags &= ~IFF_TX_SKB_SHARING;
dev->priv_flags |= IFF_LIVE_ADDR_CHANGE;
dev->priv_flags |= IFF_NO_QUEUE;
dev->priv_flags |= IFF_PHONY_HEADROOM;
dev->netdev_ops = &veth_netdev_ops;
dev->ethtool_ops = &veth_ethtool_ops;
......
......@@ -1093,6 +1093,12 @@ struct tc_to_netdev {
* This function is used to get egress tunnel information for given skb.
* This is useful for retrieving outer tunnel header parameters while
* sampling packet.
* void (*ndo_set_rx_headroom)(struct net_device *dev, int needed_headroom);
* This function is used to specify the headroom that the skb must
* consider when allocation skb during packet reception. Setting
* appropriate rx headroom value allows avoiding skb head copy on
* forward. Setting a negative value reset the rx headroom to the
* default value.
*
*/
struct net_device_ops {
......@@ -1278,6 +1284,8 @@ struct net_device_ops {
bool proto_down);
int (*ndo_fill_metadata_dst)(struct net_device *dev,
struct sk_buff *skb);
void (*ndo_set_rx_headroom)(struct net_device *dev,
int needed_headroom);
};
/**
......@@ -1315,6 +1323,8 @@ struct net_device_ops {
* @IFF_L3MDEV_SLAVE: device is enslaved to an L3 master device
* @IFF_TEAM: device is a team device
* @IFF_RXFH_CONFIGURED: device has had Rx Flow indirection table configured
* @IFF_PHONY_HEADROOM: the headroom value is controlled by an external
* entity (i.e. the master device for bridged veth)
*/
enum netdev_priv_flags {
IFF_802_1Q_VLAN = 1<<0,
......@@ -1343,6 +1353,7 @@ enum netdev_priv_flags {
IFF_L3MDEV_SLAVE = 1<<23,
IFF_TEAM = 1<<24,
IFF_RXFH_CONFIGURED = 1<<25,
IFF_PHONY_HEADROOM = 1<<26,
};
#define IFF_802_1Q_VLAN IFF_802_1Q_VLAN
......@@ -1937,6 +1948,26 @@ struct netdev_queue *netdev_pick_tx(struct net_device *dev,
struct sk_buff *skb,
void *accel_priv);
/* returns the headroom that the master device needs to take in account
* when forwarding to this dev
*/
static inline unsigned netdev_get_fwd_headroom(struct net_device *dev)
{
return dev->priv_flags & IFF_PHONY_HEADROOM ? 0 : dev->needed_headroom;
}
static inline void netdev_set_rx_headroom(struct net_device *dev, int new_hr)
{
if (dev->netdev_ops->ndo_set_rx_headroom)
dev->netdev_ops->ndo_set_rx_headroom(dev, new_hr);
}
/* set the device rx headroom to the dev's default */
static inline void netdev_reset_rx_headroom(struct net_device *dev)
{
netdev_set_rx_headroom(dev, -1);
}
/*
* Net namespace inlines
*/
......
......@@ -223,6 +223,31 @@ static void destroy_nbp_rcu(struct rcu_head *head)
destroy_nbp(p);
}
static unsigned get_max_headroom(struct net_bridge *br)
{
unsigned max_headroom = 0;
struct net_bridge_port *p;
list_for_each_entry(p, &br->port_list, list) {
unsigned dev_headroom = netdev_get_fwd_headroom(p->dev);
if (dev_headroom > max_headroom)
max_headroom = dev_headroom;
}
return max_headroom;
}
static void update_headroom(struct net_bridge *br, int new_hr)
{
struct net_bridge_port *p;
list_for_each_entry(p, &br->port_list, list)
netdev_set_rx_headroom(p->dev, new_hr);
br->dev->needed_headroom = new_hr;
}
/* Delete port(interface) from bridge is done in two steps.
* via RCU. First step, marks device as down. That deletes
* all the timers and stops new packets from flowing through.
......@@ -248,6 +273,9 @@ static void del_nbp(struct net_bridge_port *p)
br_ifinfo_notify(RTM_DELLINK, p);
list_del_rcu(&p->list);
if (netdev_get_fwd_headroom(dev) == br->dev->needed_headroom)
update_headroom(br, get_max_headroom(br));
netdev_reset_rx_headroom(dev);
nbp_vlan_flush(p);
br_fdb_delete_by_port(br, p, 0, 1);
......@@ -438,6 +466,7 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
{
struct net_bridge_port *p;
int err = 0;
unsigned br_hr, dev_hr;
bool changed_addr;
/* Don't allow bridging non-ethernet like devices, or DSA-enabled
......@@ -505,8 +534,12 @@ int br_add_if(struct net_bridge *br, struct net_device *dev)
netdev_update_features(br->dev);
if (br->dev->needed_headroom < dev->needed_headroom)
br->dev->needed_headroom = dev->needed_headroom;
br_hr = br->dev->needed_headroom;
dev_hr = netdev_get_fwd_headroom(dev);
if (br_hr < dev_hr)
update_headroom(br, dev_hr);
else
netdev_set_rx_headroom(dev, br_hr);
if (br_fdb_insert(br, p, dev->dev_addr, 0))
netdev_err(dev, "failed insert local address bridge forwarding table\n");
......
......@@ -1908,6 +1908,29 @@ static struct vport *lookup_vport(struct net *net,
return ERR_PTR(-EINVAL);
}
/* Called with ovs_mutex */
static void update_headroom(struct datapath *dp)
{
unsigned dev_headroom, max_headroom = 0;
struct net_device *dev;
struct vport *vport;
int i;
for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++) {
hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node) {
dev = vport->dev;
dev_headroom = netdev_get_fwd_headroom(dev);
if (dev_headroom > max_headroom)
max_headroom = dev_headroom;
}
}
dp->max_headroom = max_headroom;
for (i = 0; i < DP_VPORT_HASH_BUCKETS; i++)
hlist_for_each_entry_rcu(vport, &dp->ports[i], dp_hash_node)
netdev_set_rx_headroom(vport->dev, max_headroom);
}
static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
{
struct nlattr **a = info->attrs;
......@@ -1973,6 +1996,12 @@ static int ovs_vport_cmd_new(struct sk_buff *skb, struct genl_info *info)
err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
info->snd_seq, 0, OVS_VPORT_CMD_NEW);
if (netdev_get_fwd_headroom(vport->dev) > dp->max_headroom)
update_headroom(dp);
else
netdev_set_rx_headroom(vport->dev, dp->max_headroom);
BUG_ON(err < 0);
ovs_unlock();
......@@ -2039,8 +2068,10 @@ static int ovs_vport_cmd_set(struct sk_buff *skb, struct genl_info *info)
static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
{
bool must_update_headroom = false;
struct nlattr **a = info->attrs;
struct sk_buff *reply;
struct datapath *dp;
struct vport *vport;
int err;
......@@ -2062,7 +2093,16 @@ static int ovs_vport_cmd_del(struct sk_buff *skb, struct genl_info *info)
err = ovs_vport_cmd_fill_info(vport, reply, info->snd_portid,
info->snd_seq, 0, OVS_VPORT_CMD_DEL);
BUG_ON(err < 0);
/* the vport deletion may trigger dp headroom update */
dp = vport->dp;
if (netdev_get_fwd_headroom(vport->dev) == dp->max_headroom)
must_update_headroom = true;
netdev_reset_rx_headroom(vport->dev);
ovs_dp_detach_port(vport);
if (must_update_headroom)
update_headroom(dp);
ovs_unlock();
ovs_notify(&dp_vport_genl_family, reply, info);
......
......@@ -68,6 +68,8 @@ struct dp_stats_percpu {
* ovs_mutex and RCU.
* @stats_percpu: Per-CPU datapath statistics.
* @net: Reference to net namespace.
* @max_headroom: the maximum headroom of all vports in this datapath; it will
* be used by all the internal vports in this dp.
*
* Context: See the comment on locking at the top of datapath.c for additional
* locking information.
......@@ -89,6 +91,8 @@ struct datapath {
possible_net_t net;
u32 user_features;
u32 max_headroom;
};
/**
......
......@@ -138,6 +138,11 @@ internal_get_stats(struct net_device *dev, struct rtnl_link_stats64 *stats)
return stats;
}
void internal_set_rx_headroom(struct net_device *dev, int new_hr)
{
dev->needed_headroom = new_hr;
}
static const struct net_device_ops internal_dev_netdev_ops = {
.ndo_open = internal_dev_open,
.ndo_stop = internal_dev_stop,
......@@ -145,6 +150,7 @@ static const struct net_device_ops internal_dev_netdev_ops = {
.ndo_set_mac_address = eth_mac_addr,
.ndo_change_mtu = internal_dev_change_mtu,
.ndo_get_stats64 = internal_get_stats,
.ndo_set_rx_headroom = internal_set_rx_headroom,
};
static struct rtnl_link_ops internal_dev_link_ops __read_mostly = {
......@@ -158,7 +164,8 @@ static void do_setup(struct net_device *netdev)
netdev->netdev_ops = &internal_dev_netdev_ops;
netdev->priv_flags &= ~IFF_TX_SKB_SHARING;
netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH;
netdev->priv_flags |= IFF_LIVE_ADDR_CHANGE | IFF_OPENVSWITCH |
IFF_PHONY_HEADROOM;
netdev->destructor = internal_dev_destructor;
netdev->ethtool_ops = &internal_dev_ethtool_ops;
netdev->rtnl_link_ops = &internal_dev_link_ops;
......@@ -199,6 +206,7 @@ static struct vport *internal_dev_create(const struct vport_parms *parms)
err = -ENOMEM;
goto error_free_netdev;
}
vport->dev->needed_headroom = vport->dp->max_headroom;
dev_net_set(vport->dev, ovs_dp_get_net(vport->dp));
internal_dev = internal_dev_priv(vport->dev);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment