Commit 3cdcf133 authored by David S. Miller's avatar David S. Miller

Merge branch 'macvlan_hwaccel'

John Fastabend says:

====================
l2 hardware accelerated macvlans

This patch adds support to offload macvlan net_devices to the
hardware. With these patches packets are pushed to the macvlan
net_device directly and do not pass through the lower dev.

The patches here have made it through multiple iterations
each with a slightly different focus. First I tried to
push these as a new link type called "VMDQ". The patches
shown here,

http://comments.gmane.org/gmane.linux.network/237617

Following this implementation I renamed the link type
"VSI" and addressed various comments. Finally Neil
Horman picked up the patches and integrated the offload
into the macvlan code. Here,

http://permalink.gmane.org/gmane.linux.network/285658

The attached series is clean-up of his patches, with a
few fixes.

If folks find this series acceptable there are a few
items we can work on next. First broadcast and multicast
will use the hardware even for local traffic with this
series. It would be best (I think) to use the software
path for macvlan to macvlan traffic and save the PCIe
bus. This depends on how much you value CPU time vs
PCIE bandwidth. This will need another patch series
to flush out.

Also this series only allows for layer 2 mac forwarding
where some hardware supports more interesting forwarding
capabilities. Integrating with OVS may be useful here.

As always any comments/feedback welcome.

My basic I/O test is here but I've also done some link
testing, SRIOV/DCB with macvlans and others,

Changelog:
v2: two fixes to ixgbe when all features DCB, FCoE, SR-IOV
    are enabled with macvlans. A VMDQ_P() reference
    should have been accel->pool and do not set the offset
    of the ring index from dfwd add call. The offset is used
    by SR-IOV so clearing it can cause SR-IOV quue index's
    to go sideways. With these fixes testing macvlan's with
    SRIOV enabled was successful.
v3: addressed Neil's comments in ixgbe
    fixed error path on dfwd_add_station() in ixgbe
    fixed ixgbe to allow SRIOV and accelerated macvlans to
    coexist.
v4: Dave caught some strange indentation, fixed it here
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 1ec4864b 2a47fa45
......@@ -223,6 +223,15 @@ enum ixgbe_ring_state_t {
__IXGBE_RX_FCOE,
};
struct ixgbe_fwd_adapter {
unsigned long active_vlans[BITS_TO_LONGS(VLAN_N_VID)];
struct net_device *netdev;
struct ixgbe_adapter *real_adapter;
unsigned int tx_base_queue;
unsigned int rx_base_queue;
int pool;
};
#define check_for_tx_hang(ring) \
test_bit(__IXGBE_TX_DETECT_HANG, &(ring)->state)
#define set_check_for_tx_hang(ring) \
......@@ -240,6 +249,7 @@ struct ixgbe_ring {
struct ixgbe_q_vector *q_vector; /* backpointer to host q_vector */
struct net_device *netdev; /* netdev ring belongs to */
struct device *dev; /* device for DMA mapping */
struct ixgbe_fwd_adapter *l2_accel_priv;
void *desc; /* descriptor ring memory */
union {
struct ixgbe_tx_buffer *tx_buffer_info;
......@@ -297,6 +307,12 @@ enum ixgbe_ring_f_enum {
#define IXGBE_MAX_FCOE_INDICES 8
#define MAX_RX_QUEUES (IXGBE_MAX_FDIR_INDICES + 1)
#define MAX_TX_QUEUES (IXGBE_MAX_FDIR_INDICES + 1)
#define IXGBE_MAX_L2A_QUEUES 4
#define IXGBE_MAX_L2A_QUEUES 4
#define IXGBE_BAD_L2A_QUEUE 3
#define IXGBE_MAX_MACVLANS 31
#define IXGBE_MAX_DCBMACVLANS 8
struct ixgbe_ring_feature {
u16 limit; /* upper limit on feature indices */
u16 indices; /* current value of indices */
......@@ -766,6 +782,7 @@ struct ixgbe_adapter {
#endif /*CONFIG_DEBUG_FS*/
u8 default_up;
unsigned long fwd_bitmask; /* Bitmask indicating in use pools */
};
struct ixgbe_fdir_filter {
......@@ -939,4 +956,7 @@ void ixgbe_ptp_check_pps_event(struct ixgbe_adapter *adapter, u32 eicr);
void ixgbe_sriov_reinit(struct ixgbe_adapter *adapter);
#endif
netdev_tx_t ixgbe_xmit_frame_ring(struct sk_buff *skb,
struct ixgbe_adapter *adapter,
struct ixgbe_ring *tx_ring);
#endif /* _IXGBE_H_ */
......@@ -498,6 +498,7 @@ static bool ixgbe_set_sriov_queues(struct ixgbe_adapter *adapter)
#ifdef IXGBE_FCOE
u16 fcoe_i = 0;
#endif
bool pools = (find_first_zero_bit(&adapter->fwd_bitmask, 32) > 1);
/* only proceed if SR-IOV is enabled */
if (!(adapter->flags & IXGBE_FLAG_SRIOV_ENABLED))
......@@ -510,7 +511,7 @@ static bool ixgbe_set_sriov_queues(struct ixgbe_adapter *adapter)
vmdq_i = min_t(u16, IXGBE_MAX_VMDQ_INDICES, vmdq_i);
/* 64 pool mode with 2 queues per pool */
if ((vmdq_i > 32) || (rss_i < 4)) {
if ((vmdq_i > 32) || (rss_i < 4) || (vmdq_i > 16 && pools)) {
vmdq_m = IXGBE_82599_VMDQ_2Q_MASK;
rss_m = IXGBE_RSS_2Q_MASK;
rss_i = min_t(u16, rss_i, 2);
......@@ -852,6 +853,10 @@ static int ixgbe_alloc_q_vector(struct ixgbe_adapter *adapter,
/* apply Tx specific ring traits */
ring->count = adapter->tx_ring_count;
if (adapter->num_rx_pools > 1)
ring->queue_index =
txr_idx % adapter->num_rx_queues_per_pool;
else
ring->queue_index = txr_idx;
/* assign ring to adapter */
......@@ -895,6 +900,10 @@ static int ixgbe_alloc_q_vector(struct ixgbe_adapter *adapter,
#endif /* IXGBE_FCOE */
/* apply Rx specific ring traits */
ring->count = adapter->rx_ring_count;
if (adapter->num_rx_pools > 1)
ring->queue_index =
rxr_idx % adapter->num_rx_queues_per_pool;
else
ring->queue_index = rxr_idx;
/* assign ring to adapter */
......
......@@ -223,17 +223,19 @@ int ixgbe_disable_sriov(struct ixgbe_adapter *adapter)
IXGBE_WRITE_FLUSH(hw);
/* Disable VMDq flag so device will be set in VM mode */
if (adapter->ring_feature[RING_F_VMDQ].limit == 1)
if (adapter->ring_feature[RING_F_VMDQ].limit == 1) {
adapter->flags &= ~IXGBE_FLAG_VMDQ_ENABLED;
adapter->ring_feature[RING_F_VMDQ].offset = 0;
adapter->flags &= ~IXGBE_FLAG_SRIOV_ENABLED;
rss = min_t(int, IXGBE_MAX_RSS_INDICES, num_online_cpus());
} else {
rss = min_t(int, IXGBE_MAX_L2A_QUEUES, num_online_cpus());
}
adapter->ring_feature[RING_F_VMDQ].offset = 0;
adapter->ring_feature[RING_F_RSS].limit = rss;
/* take a breather then clean up driver data */
msleep(100);
adapter->flags &= ~IXGBE_FLAG_SRIOV_ENABLED;
return 0;
}
......@@ -298,13 +300,10 @@ static int ixgbe_pci_sriov_disable(struct pci_dev *dev)
err = ixgbe_disable_sriov(adapter);
/* Only reinit if no error and state changed */
if (!err && current_flags != adapter->flags) {
/* ixgbe_disable_sriov() doesn't clear VMDQ flag */
adapter->flags &= ~IXGBE_FLAG_VMDQ_ENABLED;
#ifdef CONFIG_PCI_IOV
if (!err && current_flags != adapter->flags)
ixgbe_sriov_reinit(adapter);
#endif
}
return err;
}
......
......@@ -297,7 +297,13 @@ netdev_tx_t macvlan_start_xmit(struct sk_buff *skb,
int ret;
const struct macvlan_dev *vlan = netdev_priv(dev);
if (vlan->fwd_priv) {
skb->dev = vlan->lowerdev;
ret = dev_hard_start_xmit(skb, skb->dev, NULL, vlan->fwd_priv);
} else {
ret = macvlan_queue_xmit(skb, dev);
}
if (likely(ret == NET_XMIT_SUCCESS || ret == NET_XMIT_CN)) {
struct macvlan_pcpu_stats *pcpu_stats;
......@@ -347,6 +353,21 @@ static int macvlan_open(struct net_device *dev)
goto hash_add;
}
if (lowerdev->features & NETIF_F_HW_L2FW_DOFFLOAD) {
vlan->fwd_priv =
lowerdev->netdev_ops->ndo_dfwd_add_station(lowerdev, dev);
/* If we get a NULL pointer back, or if we get an error
* then we should just fall through to the non accelerated path
*/
if (IS_ERR_OR_NULL(vlan->fwd_priv)) {
vlan->fwd_priv = NULL;
} else {
dev->features &= ~NETIF_F_LLTX;
return 0;
}
}
err = -EBUSY;
if (macvlan_addr_busy(vlan->port, dev->dev_addr))
goto out;
......@@ -367,6 +388,11 @@ static int macvlan_open(struct net_device *dev)
del_unicast:
dev_uc_del(lowerdev, dev->dev_addr);
out:
if (vlan->fwd_priv) {
lowerdev->netdev_ops->ndo_dfwd_del_station(lowerdev,
vlan->fwd_priv);
vlan->fwd_priv = NULL;
}
return err;
}
......@@ -375,6 +401,13 @@ static int macvlan_stop(struct net_device *dev)
struct macvlan_dev *vlan = netdev_priv(dev);
struct net_device *lowerdev = vlan->lowerdev;
if (vlan->fwd_priv) {
lowerdev->netdev_ops->ndo_dfwd_del_station(lowerdev,
vlan->fwd_priv);
vlan->fwd_priv = NULL;
return 0;
}
dev_uc_unsync(lowerdev, dev);
dev_mc_unsync(lowerdev, dev);
......@@ -833,6 +866,7 @@ int macvlan_common_newlink(struct net *src_net, struct net_device *dev,
if (err < 0)
goto destroy_port;
dev->priv_flags |= IFF_MACVLAN;
err = netdev_upper_dev_link(lowerdev, dev);
if (err)
goto destroy_port;
......
......@@ -61,6 +61,7 @@ struct macvlan_dev {
struct hlist_node hlist;
struct macvlan_port *port;
struct net_device *lowerdev;
void *fwd_priv;
struct macvlan_pcpu_stats __percpu *pcpu_stats;
DECLARE_BITMAP(mc_filter, MACVLAN_MC_FILTER_SZ);
......
......@@ -62,6 +62,7 @@ enum {
NETIF_F_HW_VLAN_STAG_TX_BIT, /* Transmit VLAN STAG HW acceleration */
NETIF_F_HW_VLAN_STAG_RX_BIT, /* Receive VLAN STAG HW acceleration */
NETIF_F_HW_VLAN_STAG_FILTER_BIT,/* Receive filtering on VLAN STAGs */
NETIF_F_HW_L2FW_DOFFLOAD_BIT, /* Allow L2 Forwarding in Hardware */
/*
* Add your fresh new feature above and remember to update
......@@ -116,6 +117,7 @@ enum {
#define NETIF_F_HW_VLAN_STAG_FILTER __NETIF_F(HW_VLAN_STAG_FILTER)
#define NETIF_F_HW_VLAN_STAG_RX __NETIF_F(HW_VLAN_STAG_RX)
#define NETIF_F_HW_VLAN_STAG_TX __NETIF_F(HW_VLAN_STAG_TX)
#define NETIF_F_HW_L2FW_DOFFLOAD __NETIF_F(HW_L2FW_DOFFLOAD)
/* Features valid for ethtool to change */
/* = all defined minus driver/device-class-related */
......
......@@ -962,6 +962,25 @@ struct netdev_phys_port_id {
* Called by vxlan to notify the driver about a UDP port and socket
* address family that vxlan is not listening to anymore. The operation
* is protected by the vxlan_net->sock_lock.
*
* void* (*ndo_dfwd_add_station)(struct net_device *pdev,
* struct net_device *dev)
* Called by upper layer devices to accelerate switching or other
* station functionality into hardware. 'pdev is the lowerdev
* to use for the offload and 'dev' is the net device that will
* back the offload. Returns a pointer to the private structure
* the upper layer will maintain.
* void (*ndo_dfwd_del_station)(struct net_device *pdev, void *priv)
* Called by upper layer device to delete the station created
* by 'ndo_dfwd_add_station'. 'pdev' is the net device backing
* the station and priv is the structure returned by the add
* operation.
* netdev_tx_t (*ndo_dfwd_start_xmit)(struct sk_buff *skb,
* struct net_device *dev,
* void *priv);
* Callback to use for xmit over the accelerated station. This
* is used in place of ndo_start_xmit on accelerated net
* devices.
*/
struct net_device_ops {
int (*ndo_init)(struct net_device *dev);
......@@ -1098,6 +1117,15 @@ struct net_device_ops {
void (*ndo_del_vxlan_port)(struct net_device *dev,
sa_family_t sa_family,
__be16 port);
void* (*ndo_dfwd_add_station)(struct net_device *pdev,
struct net_device *dev);
void (*ndo_dfwd_del_station)(struct net_device *pdev,
void *priv);
netdev_tx_t (*ndo_dfwd_start_xmit) (struct sk_buff *skb,
struct net_device *dev,
void *priv);
};
/*
......@@ -1195,6 +1223,7 @@ struct net_device {
/* Management operations */
const struct net_device_ops *netdev_ops;
const struct ethtool_ops *ethtool_ops;
const struct forwarding_accel_ops *fwd_ops;
/* Hardware header description */
const struct header_ops *header_ops;
......@@ -2388,7 +2417,7 @@ int dev_change_carrier(struct net_device *, bool new_carrier);
int dev_get_phys_port_id(struct net_device *dev,
struct netdev_phys_port_id *ppid);
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
struct netdev_queue *txq);
struct netdev_queue *txq, void *accel_priv);
int dev_forward_skb(struct net_device *dev, struct sk_buff *skb);
extern int netdev_budget;
......@@ -2967,6 +2996,11 @@ static inline void netif_set_gso_max_size(struct net_device *dev,
dev->gso_max_size = size;
}
static inline bool netif_is_macvlan(struct net_device *dev)
{
return dev->priv_flags & IFF_MACVLAN;
}
static inline bool netif_is_bond_master(struct net_device *dev)
{
return dev->flags & IFF_MASTER && dev->priv_flags & IFF_BONDING;
......
......@@ -83,6 +83,7 @@
#define IFF_SUPP_NOFCS 0x80000 /* device supports sending custom FCS */
#define IFF_LIVE_ADDR_CHANGE 0x100000 /* device supports hardware address
* change when it's running */
#define IFF_MACVLAN 0x200000 /* Macvlan device */
#define IF_GET_IFACE 0x0001 /* for querying only */
......
......@@ -2538,7 +2538,7 @@ static inline int skb_needs_linearize(struct sk_buff *skb,
}
int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
struct netdev_queue *txq)
struct netdev_queue *txq, void *accel_priv)
{
const struct net_device_ops *ops = dev->netdev_ops;
int rc = NETDEV_TX_OK;
......@@ -2604,9 +2604,13 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
dev_queue_xmit_nit(skb, dev);
skb_len = skb->len;
if (accel_priv)
rc = ops->ndo_dfwd_start_xmit(skb, dev, accel_priv);
else
rc = ops->ndo_start_xmit(skb, dev);
trace_net_dev_xmit(skb, rc, dev, skb_len);
if (rc == NETDEV_TX_OK)
if (rc == NETDEV_TX_OK && txq)
txq_trans_update(txq);
return rc;
}
......@@ -2622,6 +2626,9 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
dev_queue_xmit_nit(nskb, dev);
skb_len = nskb->len;
if (accel_priv)
rc = ops->ndo_dfwd_start_xmit(nskb, dev, accel_priv);
else
rc = ops->ndo_start_xmit(nskb, dev);
trace_net_dev_xmit(nskb, rc, dev, skb_len);
if (unlikely(rc != NETDEV_TX_OK)) {
......@@ -2647,6 +2654,7 @@ int dev_hard_start_xmit(struct sk_buff *skb, struct net_device *dev,
out:
return rc;
}
EXPORT_SYMBOL_GPL(dev_hard_start_xmit);
static void qdisc_pkt_len_init(struct sk_buff *skb)
{
......@@ -2854,7 +2862,7 @@ int dev_queue_xmit(struct sk_buff *skb)
if (!netif_xmit_stopped(txq)) {
__this_cpu_inc(xmit_recursion);
rc = dev_hard_start_xmit(skb, dev, txq);
rc = dev_hard_start_xmit(skb, dev, txq, NULL);
__this_cpu_dec(xmit_recursion);
if (dev_xmit_complete(rc)) {
HARD_TX_UNLOCK(dev, txq);
......
......@@ -96,6 +96,7 @@ static const char netdev_features_strings[NETDEV_FEATURE_COUNT][ETH_GSTRING_LEN]
[NETIF_F_LOOPBACK_BIT] = "loopback",
[NETIF_F_RXFCS_BIT] = "rx-fcs",
[NETIF_F_RXALL_BIT] = "rx-all",
[NETIF_F_HW_L2FW_DOFFLOAD_BIT] = "l2-fwd-offload",
};
static int ethtool_get_features(struct net_device *dev, void __user *useraddr)
......
......@@ -126,7 +126,7 @@ int sch_direct_xmit(struct sk_buff *skb, struct Qdisc *q,
HARD_TX_LOCK(dev, txq, smp_processor_id());
if (!netif_xmit_frozen_or_stopped(txq))
ret = dev_hard_start_xmit(skb, dev, txq);
ret = dev_hard_start_xmit(skb, dev, txq, NULL);
HARD_TX_UNLOCK(dev, txq);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment