Commit 1cb9d3b6 authored by Haiyang Zhang, committed by Jakub Kicinski

hv_netvsc: Add support for XDP_REDIRECT

Handle XDP_REDIRECT action in netvsc driver.
Also, transparently pass ndo_xdp_xmit to VF when available.
Signed-off-by: Haiyang Zhang <haiyangz@microsoft.com>
Link: https://lore.kernel.org/r/1649362894-20077-1-git-send-email-haiyangz@microsoft.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parent 2e36437f
......@@ -15,6 +15,7 @@
#include <linux/list.h>
#include <linux/hyperv.h>
#include <linux/rndis.h>
#include <linux/jhash.h>
/* RSS related */
#define OID_GEN_RECEIVE_SCALE_CAPABILITIES 0x00010203 /* query only */
......@@ -237,6 +238,7 @@ int netvsc_recv_callback(struct net_device *net,
void netvsc_channel_cb(void *context);
int netvsc_poll(struct napi_struct *napi, int budget);
void netvsc_xdp_xmit(struct sk_buff *skb, struct net_device *ndev);
u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan,
struct xdp_buff *xdp);
unsigned int netvsc_xdp_fraglen(unsigned int len);
......@@ -246,6 +248,8 @@ int netvsc_xdp_set(struct net_device *dev, struct bpf_prog *prog,
struct netvsc_device *nvdev);
int netvsc_vf_setxdp(struct net_device *vf_netdev, struct bpf_prog *prog);
int netvsc_bpf(struct net_device *dev, struct netdev_bpf *bpf);
int netvsc_ndoxdp_xmit(struct net_device *ndev, int n,
struct xdp_frame **frames, u32 flags);
int rndis_set_subchannel(struct net_device *ndev,
struct netvsc_device *nvdev,
......@@ -942,12 +946,21 @@ struct nvsc_rsc {
#define NVSC_RSC_CSUM_INFO BIT(1) /* valid/present bit for 'csum_info' */
#define NVSC_RSC_HASH_INFO BIT(2) /* valid/present bit for 'hash_info' */
struct netvsc_stats {
/* Per-channel transmit counters, stored as tx_stats in struct netvsc_channel.
 * Readers sample the fields inside a u64_stats_fetch_begin_irq()/retry loop
 * on syncp.
 */
struct netvsc_stats_tx {
/* Reported as the tx packet and byte totals by the stats readers. */
u64 packets;
u64 bytes;
/* Number of XDP frames netvsc_ndoxdp_xmit() queued on this channel. */
u64 xdp_xmit;
/* Sequence counter bracketing every update of the fields above. */
struct u64_stats_sync syncp;
};
/* Per-channel receive counters, stored as rx_stats in struct netvsc_channel.
 * Readers sample the fields inside a u64_stats_fetch_begin_irq()/retry loop
 * on syncp.
 */
struct netvsc_stats_rx {
/* Received packets and bytes; bytes grows by nvchan->rsc.pktlen per packet.
 * Successfully redirected XDP packets are counted here as well.
 */
u64 packets;
u64 bytes;
/* netvsc_get_stats64() adds these two together into its multicast total. */
u64 broadcast;
u64 multicast;
/* Bumped when the XDP verdict is neither XDP_PASS nor XDP_TX, and when
 * xdp_do_redirect() fails.
 */
u64 xdp_drop;
/* Bumped when xdp_do_redirect() succeeds. */
u64 xdp_redirect;
/* Bumped in netvsc_recv_callback() when the XDP verdict is XDP_TX. */
u64 xdp_tx;
/* Sequence counter bracketing every update of the fields above. */
struct u64_stats_sync syncp;
};
......@@ -1046,6 +1059,55 @@ struct net_device_context {
struct netvsc_device_info *saved_netvsc_dev_info;
};
/* Pick the flow hash for an outgoing skb.
 *
 * Azure hosts don't support non-TCP port numbers in hashing for fragmented
 * packets, so the L4 hash is only used for the protocol/family combinations
 * enabled in ndc->l4_hash (adjustable via ethtool, e.g. for UDP). Everything
 * else gets a hash computed over the IP addresses alone, which is also stored
 * in the skb as its software hash.
 *
 * Returns 0 when the flow cannot be dissected or is neither IPv4 nor IPv6.
 */
static inline u32 netvsc_get_hash(struct sk_buff *skb,
				  const struct net_device_context *ndc)
{
	static u32 hashrnd __read_mostly;
	struct flow_keys flow;
	u32 l4_type = 0;
	u32 hash;
	int is_v4, is_v6;

	net_get_random_once(&hashrnd, sizeof(hashrnd));

	if (!skb_flow_dissect_flow_keys(skb, &flow, 0))
		return 0;

	is_v4 = flow.basic.n_proto == htons(ETH_P_IP);
	is_v6 = flow.basic.n_proto == htons(ETH_P_IPV6);

	/* Map (L4 protocol, address family) to its HV_*_L4HASH bit. */
	if (flow.basic.ip_proto == IPPROTO_TCP) {
		if (is_v4)
			l4_type = HV_TCP4_L4HASH;
		else if (is_v6)
			l4_type = HV_TCP6_L4HASH;
	} else if (flow.basic.ip_proto == IPPROTO_UDP) {
		if (is_v4)
			l4_type = HV_UDP4_L4HASH;
		else if (is_v6)
			l4_type = HV_UDP6_L4HASH;
	}

	/* L4 hashing enabled for this kind of packet: use the skb's hash. */
	if (l4_type & ndc->l4_hash)
		return skb_get_hash(skb);

	/* Otherwise hash the addresses only: 2 words for v4, 8 for v6. */
	if (is_v4)
		hash = jhash2((u32 *)&flow.addrs.v4addrs, 2, hashrnd);
	else if (is_v6)
		hash = jhash2((u32 *)&flow.addrs.v6addrs, 8, hashrnd);
	else
		return 0;

	__skb_set_sw_hash(skb, hash, false);

	return hash;
}
/* Per channel data */
struct netvsc_channel {
struct vmbus_channel *channel;
......@@ -1060,9 +1122,10 @@ struct netvsc_channel {
struct bpf_prog __rcu *bpf_prog;
struct xdp_rxq_info xdp_rxq;
bool xdp_flush;
struct netvsc_stats tx_stats;
struct netvsc_stats rx_stats;
struct netvsc_stats_tx tx_stats;
struct netvsc_stats_rx rx_stats;
};
/* Per netvsc device */
......
......@@ -20,6 +20,7 @@
#include <linux/vmalloc.h>
#include <linux/rtnetlink.h>
#include <linux/prefetch.h>
#include <linux/filter.h>
#include <asm/sync_bitops.h>
#include <asm/mshyperv.h>
......@@ -805,7 +806,7 @@ static void netvsc_send_tx_complete(struct net_device *ndev,
struct hv_netvsc_packet *packet
= (struct hv_netvsc_packet *)skb->cb;
u32 send_index = packet->send_buf_index;
struct netvsc_stats *tx_stats;
struct netvsc_stats_tx *tx_stats;
if (send_index != NETVSC_INVALID_INDEX)
netvsc_free_send_slot(net_device, send_index);
......@@ -1670,12 +1671,17 @@ int netvsc_poll(struct napi_struct *napi, int budget)
if (!nvchan->desc)
nvchan->desc = hv_pkt_iter_first(channel);
nvchan->xdp_flush = false;
while (nvchan->desc && work_done < budget) {
work_done += netvsc_process_raw_pkt(device, nvchan, net_device,
ndev, nvchan->desc, budget);
nvchan->desc = hv_pkt_iter_next(channel, nvchan->desc);
}
if (nvchan->xdp_flush)
xdp_do_flush();
/* Send any pending receive completions */
ret = send_recv_completions(ndev, net_device, nvchan);
......
......@@ -10,6 +10,7 @@
#include <linux/netdevice.h>
#include <linux/etherdevice.h>
#include <linux/ethtool.h>
#include <linux/netpoll.h>
#include <linux/bpf.h>
#include <linux/bpf_trace.h>
#include <linux/kernel.h>
......@@ -23,11 +24,13 @@
u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan,
struct xdp_buff *xdp)
{
struct netvsc_stats_rx *rx_stats = &nvchan->rx_stats;
void *data = nvchan->rsc.data[0];
u32 len = nvchan->rsc.len[0];
struct page *page = NULL;
struct bpf_prog *prog;
u32 act = XDP_PASS;
bool drop = true;
xdp->data_hard_start = NULL;
......@@ -60,9 +63,34 @@ u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan,
switch (act) {
case XDP_PASS:
case XDP_TX:
drop = false;
break;
case XDP_DROP:
break;
case XDP_REDIRECT:
if (!xdp_do_redirect(ndev, xdp, prog)) {
nvchan->xdp_flush = true;
drop = false;
u64_stats_update_begin(&rx_stats->syncp);
rx_stats->xdp_redirect++;
rx_stats->packets++;
rx_stats->bytes += nvchan->rsc.pktlen;
u64_stats_update_end(&rx_stats->syncp);
break;
} else {
u64_stats_update_begin(&rx_stats->syncp);
rx_stats->xdp_drop++;
u64_stats_update_end(&rx_stats->syncp);
}
fallthrough;
case XDP_ABORTED:
trace_xdp_exception(ndev, prog, act);
break;
......@@ -74,7 +102,7 @@ u32 netvsc_run_xdp(struct net_device *ndev, struct netvsc_channel *nvchan,
out:
rcu_read_unlock();
if (page && act != XDP_PASS && act != XDP_TX) {
if (page && drop) {
__free_page(page);
xdp->data_hard_start = NULL;
}
......@@ -197,3 +225,68 @@ int netvsc_bpf(struct net_device *dev, struct netdev_bpf *bpf)
return -EINVAL;
}
}
/* Transmit a single XDP frame on the netvsc (synthetic) data path.
 *
 * The frame is converted into an skb, given a flow hash and tagged with
 * queue q_idx before it is passed to netvsc_xdp_xmit().
 *
 * Returns 0 once the skb has been handed over, or -ENOMEM if no skb could
 * be built from the frame.
 */
static int netvsc_ndoxdp_xmit_fm(struct net_device *ndev,
				 struct xdp_frame *frame, u16 q_idx)
{
	struct sk_buff *xmit_skb = xdp_build_skb_from_frame(frame, ndev);

	if (unlikely(!xmit_skb))
		return -ENOMEM;

	/* Called for its side effect: it sets the hash on the skb. */
	netvsc_get_hash(xmit_skb, netdev_priv(ndev));

	/* netvsc_xdp_xmit() must only run after the rx queue is recorded. */
	skb_record_rx_queue(xmit_skb, q_idx);
	netvsc_xdp_xmit(xmit_skb, ndev);

	return 0;
}
/* ndo_xdp_xmit handler: transmit up to n XDP frames.
 *
 * When a usable VF is present the whole batch is forwarded to the VF's own
 * ndo_xdp_xmit and its return value is passed back unchanged. Otherwise the
 * frames are sent one by one over the netvsc channel selected from the
 * current CPU, stopping at the first frame that fails.
 *
 * Returns the number of frames sent, or 0 if the netvsc device is missing
 * or being destroyed.
 */
int netvsc_ndoxdp_xmit(struct net_device *ndev, int n,
		       struct xdp_frame **frames, u32 flags)
{
	struct net_device_context *ctx = netdev_priv(ndev);
	struct netvsc_stats_tx *txs;
	struct netvsc_device *nvdev;
	struct net_device *vf;
	u16 queue;
	int sent;

	/* Nothing can be transmitted once the netvsc device is gone. */
	nvdev = rcu_dereference_bh(ctx->nvdev);
	if (unlikely(!nvdev || nvdev->destroy))
		return 0;

	/* Hand the batch to the VF only if it exists, is running with
	 * carrier, implements ndo_xdp_xmit and currently owns the data path.
	 * The VF is also bypassed while netpoll is transmitting on ndev.
	 */
	vf = rcu_dereference_bh(ctx->vf_netdev);
	if (vf && netif_running(vf) && netif_carrier_ok(vf) &&
	    !netpoll_tx_running(ndev) && vf->netdev_ops->ndo_xdp_xmit &&
	    ctx->data_path_is_vf)
		return vf->netdev_ops->ndo_xdp_xmit(vf, n, frames, flags);

	queue = smp_processor_id() % ndev->real_num_tx_queues;

	/* 'sent' ends up as the index of the first frame that failed. */
	for (sent = 0; sent < n; sent++) {
		if (netvsc_ndoxdp_xmit_fm(ndev, frames[sent], queue))
			break;
	}

	txs = &nvdev->chan_table[queue].tx_stats;
	u64_stats_update_begin(&txs->syncp);
	txs->xdp_xmit += sent;
	u64_stats_update_end(&txs->syncp);

	return sent;
}
......@@ -242,56 +242,6 @@ static inline void *init_ppi_data(struct rndis_message *msg,
return ppi + 1;
}
/* Azure hosts don't support non-TCP port numbers in hashing for fragmented
 * packets. We can use ethtool to change UDP hash level when necessary.
 *
 * Returns the hash chosen for skb: the skb's own hash when L4 hashing is
 * enabled in ndc->l4_hash for this protocol/family, otherwise a hash over
 * the IP addresses only. Returns 0 if the flow cannot be dissected or is
 * neither IPv4 nor IPv6.
 */
static inline u32 netvsc_get_hash(
struct sk_buff *skb,
const struct net_device_context *ndc)
{
struct flow_keys flow;
u32 hash, pkt_proto = 0;
/* Seed for jhash2(), filled in by net_get_random_once() below. */
static u32 hashrnd __read_mostly;
net_get_random_once(&hashrnd, sizeof(hashrnd));
if (!skb_flow_dissect_flow_keys(skb, &flow, 0))
return 0;
/* Translate (L4 protocol, address family) into its HV_*_L4HASH bit;
 * pkt_proto stays 0 for anything else.
 */
switch (flow.basic.ip_proto) {
case IPPROTO_TCP:
if (flow.basic.n_proto == htons(ETH_P_IP))
pkt_proto = HV_TCP4_L4HASH;
else if (flow.basic.n_proto == htons(ETH_P_IPV6))
pkt_proto = HV_TCP6_L4HASH;
break;
case IPPROTO_UDP:
if (flow.basic.n_proto == htons(ETH_P_IP))
pkt_proto = HV_UDP4_L4HASH;
else if (flow.basic.n_proto == htons(ETH_P_IPV6))
pkt_proto = HV_UDP6_L4HASH;
break;
}
if (pkt_proto & ndc->l4_hash) {
/* L4 hashing is enabled for this packet type. */
return skb_get_hash(skb);
} else {
/* Hash the addresses only: 2 words for IPv4, 8 words for IPv6. */
if (flow.basic.n_proto == htons(ETH_P_IP))
hash = jhash2((u32 *)&flow.addrs.v4addrs, 2, hashrnd);
else if (flow.basic.n_proto == htons(ETH_P_IPV6))
hash = jhash2((u32 *)&flow.addrs.v6addrs, 8, hashrnd);
else
return 0;
/* Store the computed value in the skb as its software hash. */
__skb_set_sw_hash(skb, hash, false);
}
return hash;
}
static inline int netvsc_get_tx_queue(struct net_device *ndev,
struct sk_buff *skb, int old_idx)
{
......@@ -804,7 +754,7 @@ void netvsc_linkstatus_callback(struct net_device *net,
}
/* This function should only be called after skb_record_rx_queue() */
static void netvsc_xdp_xmit(struct sk_buff *skb, struct net_device *ndev)
void netvsc_xdp_xmit(struct sk_buff *skb, struct net_device *ndev)
{
int rc;
......@@ -925,7 +875,7 @@ int netvsc_recv_callback(struct net_device *net,
struct vmbus_channel *channel = nvchan->channel;
u16 q_idx = channel->offermsg.offer.sub_channel_index;
struct sk_buff *skb;
struct netvsc_stats *rx_stats = &nvchan->rx_stats;
struct netvsc_stats_rx *rx_stats = &nvchan->rx_stats;
struct xdp_buff xdp;
u32 act;
......@@ -934,6 +884,9 @@ int netvsc_recv_callback(struct net_device *net,
act = netvsc_run_xdp(net, nvchan, &xdp);
if (act == XDP_REDIRECT)
return NVSP_STAT_SUCCESS;
if (act != XDP_PASS && act != XDP_TX) {
u64_stats_update_begin(&rx_stats->syncp);
rx_stats->xdp_drop++;
......@@ -958,6 +911,9 @@ int netvsc_recv_callback(struct net_device *net,
* statistics will not work correctly.
*/
u64_stats_update_begin(&rx_stats->syncp);
if (act == XDP_TX)
rx_stats->xdp_tx++;
rx_stats->packets++;
rx_stats->bytes += nvchan->rsc.pktlen;
......@@ -1353,28 +1309,29 @@ static void netvsc_get_pcpu_stats(struct net_device *net,
/* fetch percpu stats of netvsc */
for (i = 0; i < nvdev->num_chn; i++) {
const struct netvsc_channel *nvchan = &nvdev->chan_table[i];
const struct netvsc_stats *stats;
const struct netvsc_stats_tx *tx_stats;
const struct netvsc_stats_rx *rx_stats;
struct netvsc_ethtool_pcpu_stats *this_tot =
&pcpu_tot[nvchan->channel->target_cpu];
u64 packets, bytes;
unsigned int start;
stats = &nvchan->tx_stats;
tx_stats = &nvchan->tx_stats;
do {
start = u64_stats_fetch_begin_irq(&stats->syncp);
packets = stats->packets;
bytes = stats->bytes;
} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
start = u64_stats_fetch_begin_irq(&tx_stats->syncp);
packets = tx_stats->packets;
bytes = tx_stats->bytes;
} while (u64_stats_fetch_retry_irq(&tx_stats->syncp, start));
this_tot->tx_bytes += bytes;
this_tot->tx_packets += packets;
stats = &nvchan->rx_stats;
rx_stats = &nvchan->rx_stats;
do {
start = u64_stats_fetch_begin_irq(&stats->syncp);
packets = stats->packets;
bytes = stats->bytes;
} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
start = u64_stats_fetch_begin_irq(&rx_stats->syncp);
packets = rx_stats->packets;
bytes = rx_stats->bytes;
} while (u64_stats_fetch_retry_irq(&rx_stats->syncp, start));
this_tot->rx_bytes += bytes;
this_tot->rx_packets += packets;
......@@ -1406,27 +1363,28 @@ static void netvsc_get_stats64(struct net_device *net,
for (i = 0; i < nvdev->num_chn; i++) {
const struct netvsc_channel *nvchan = &nvdev->chan_table[i];
const struct netvsc_stats *stats;
const struct netvsc_stats_tx *tx_stats;
const struct netvsc_stats_rx *rx_stats;
u64 packets, bytes, multicast;
unsigned int start;
stats = &nvchan->tx_stats;
tx_stats = &nvchan->tx_stats;
do {
start = u64_stats_fetch_begin_irq(&stats->syncp);
packets = stats->packets;
bytes = stats->bytes;
} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
start = u64_stats_fetch_begin_irq(&tx_stats->syncp);
packets = tx_stats->packets;
bytes = tx_stats->bytes;
} while (u64_stats_fetch_retry_irq(&tx_stats->syncp, start));
t->tx_bytes += bytes;
t->tx_packets += packets;
stats = &nvchan->rx_stats;
rx_stats = &nvchan->rx_stats;
do {
start = u64_stats_fetch_begin_irq(&stats->syncp);
packets = stats->packets;
bytes = stats->bytes;
multicast = stats->multicast + stats->broadcast;
} while (u64_stats_fetch_retry_irq(&stats->syncp, start));
start = u64_stats_fetch_begin_irq(&rx_stats->syncp);
packets = rx_stats->packets;
bytes = rx_stats->bytes;
multicast = rx_stats->multicast + rx_stats->broadcast;
} while (u64_stats_fetch_retry_irq(&rx_stats->syncp, start));
t->rx_bytes += bytes;
t->rx_packets += packets;
......@@ -1515,8 +1473,8 @@ static const struct {
/* statistics per queue (rx/tx packets/bytes) */
#define NETVSC_PCPU_STATS_LEN (num_present_cpus() * ARRAY_SIZE(pcpu_stats))
/* 5 statistics per queue (rx/tx packets/bytes, rx xdp_drop) */
#define NETVSC_QUEUE_STATS_LEN(dev) ((dev)->num_chn * 5)
/* 8 statistics per queue (rx/tx packets/bytes, XDP actions) */
#define NETVSC_QUEUE_STATS_LEN(dev) ((dev)->num_chn * 8)
static int netvsc_get_sset_count(struct net_device *dev, int string_set)
{
......@@ -1543,12 +1501,16 @@ static void netvsc_get_ethtool_stats(struct net_device *dev,
struct net_device_context *ndc = netdev_priv(dev);
struct netvsc_device *nvdev = rtnl_dereference(ndc->nvdev);
const void *nds = &ndc->eth_stats;
const struct netvsc_stats *qstats;
const struct netvsc_stats_tx *tx_stats;
const struct netvsc_stats_rx *rx_stats;
struct netvsc_vf_pcpu_stats sum;
struct netvsc_ethtool_pcpu_stats *pcpu_sum;
unsigned int start;
u64 packets, bytes;
u64 xdp_drop;
u64 xdp_redirect;
u64 xdp_tx;
u64 xdp_xmit;
int i, j, cpu;
if (!nvdev)
......@@ -1562,26 +1524,32 @@ static void netvsc_get_ethtool_stats(struct net_device *dev,
data[i++] = *(u64 *)((void *)&sum + vf_stats[j].offset);
for (j = 0; j < nvdev->num_chn; j++) {
qstats = &nvdev->chan_table[j].tx_stats;
tx_stats = &nvdev->chan_table[j].tx_stats;
do {
start = u64_stats_fetch_begin_irq(&qstats->syncp);
packets = qstats->packets;
bytes = qstats->bytes;
} while (u64_stats_fetch_retry_irq(&qstats->syncp, start));
start = u64_stats_fetch_begin_irq(&tx_stats->syncp);
packets = tx_stats->packets;
bytes = tx_stats->bytes;
xdp_xmit = tx_stats->xdp_xmit;
} while (u64_stats_fetch_retry_irq(&tx_stats->syncp, start));
data[i++] = packets;
data[i++] = bytes;
data[i++] = xdp_xmit;
qstats = &nvdev->chan_table[j].rx_stats;
rx_stats = &nvdev->chan_table[j].rx_stats;
do {
start = u64_stats_fetch_begin_irq(&qstats->syncp);
packets = qstats->packets;
bytes = qstats->bytes;
xdp_drop = qstats->xdp_drop;
} while (u64_stats_fetch_retry_irq(&qstats->syncp, start));
start = u64_stats_fetch_begin_irq(&rx_stats->syncp);
packets = rx_stats->packets;
bytes = rx_stats->bytes;
xdp_drop = rx_stats->xdp_drop;
xdp_redirect = rx_stats->xdp_redirect;
xdp_tx = rx_stats->xdp_tx;
} while (u64_stats_fetch_retry_irq(&rx_stats->syncp, start));
data[i++] = packets;
data[i++] = bytes;
data[i++] = xdp_drop;
data[i++] = xdp_redirect;
data[i++] = xdp_tx;
}
pcpu_sum = kvmalloc_array(num_possible_cpus(),
......@@ -1622,9 +1590,12 @@ static void netvsc_get_strings(struct net_device *dev, u32 stringset, u8 *data)
for (i = 0; i < nvdev->num_chn; i++) {
ethtool_sprintf(&p, "tx_queue_%u_packets", i);
ethtool_sprintf(&p, "tx_queue_%u_bytes", i);
ethtool_sprintf(&p, "tx_queue_%u_xdp_xmit", i);
ethtool_sprintf(&p, "rx_queue_%u_packets", i);
ethtool_sprintf(&p, "rx_queue_%u_bytes", i);
ethtool_sprintf(&p, "rx_queue_%u_xdp_drop", i);
ethtool_sprintf(&p, "rx_queue_%u_xdp_redirect", i);
ethtool_sprintf(&p, "rx_queue_%u_xdp_tx", i);
}
for_each_present_cpu(cpu) {
......@@ -2057,6 +2028,7 @@ static const struct net_device_ops device_ops = {
.ndo_select_queue = netvsc_select_queue,
.ndo_get_stats64 = netvsc_get_stats64,
.ndo_bpf = netvsc_bpf,
.ndo_xdp_xmit = netvsc_ndoxdp_xmit,
};
/*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment