Commit 9aa1206e authored by Daniel Borkmann, committed by Alexei Starovoitov

bpf: Add redirect_peer helper

Add an efficient ingress-to-ingress netns switch that can be used from tc BPF
programs in order to redirect traffic from host ns ingress into a container
veth device ingress without having to go via the CPU backlog queue [0]. For
local containers this can also be utilized, and the path via the CPU backlog
queue then only needs to be taken once, not twice. On a high level this borrows
from ipvlan, which does a similar switch in __netif_receive_skb_core() and then
iterates via another_round. This helps to reduce latency for the mentioned use
cases.

Pod to remote pod with redirect(), TCP_RR [1]:

  # percpu_netperf 10.217.1.33
          RT_LATENCY:         122.450         (per CPU:         122.666         122.401         122.333         122.401 )
        MEAN_LATENCY:         121.210         (per CPU:         121.100         121.260         121.320         121.160 )
      STDDEV_LATENCY:         120.040         (per CPU:         119.420         119.910         125.460         115.370 )
         MIN_LATENCY:          46.500         (per CPU:          47.000          47.000          47.000          45.000 )
         P50_LATENCY:         118.500         (per CPU:         118.000         119.000         118.000         119.000 )
         P90_LATENCY:         127.500         (per CPU:         127.000         128.000         127.000         128.000 )
         P99_LATENCY:         130.750         (per CPU:         131.000         131.000         129.000         132.000 )

    TRANSACTION_RATE:       32666.400         (per CPU:        8152.200        8169.842        8174.439        8169.897 )

Pod to remote pod with redirect_peer(), TCP_RR:

  # percpu_netperf 10.217.1.33
          RT_LATENCY:          44.449         (per CPU:          43.767          43.127          45.279          45.622 )
        MEAN_LATENCY:          45.065         (per CPU:          44.030          45.530          45.190          45.510 )
      STDDEV_LATENCY:          84.823         (per CPU:          66.770          97.290          84.380          90.850 )
         MIN_LATENCY:          33.500         (per CPU:          33.000          33.000          34.000          34.000 )
         P50_LATENCY:          43.250         (per CPU:          43.000          43.000          43.000          44.000 )
         P90_LATENCY:          46.750         (per CPU:          46.000          47.000          47.000          47.000 )
         P99_LATENCY:          52.750         (per CPU:          51.000          54.000          53.000          53.000 )

    TRANSACTION_RATE:       90039.500         (per CPU:       22848.186       23187.089       22085.077       21919.130 )

  [0] https://linuxplumbersconf.org/event/7/contributions/674/attachments/568/1002/plumbers_2020_cilium_load_balancer.pdf
  [1] https://github.com/borkmann/netperf_scripts/blob/master/percpu_netperf

Signed-off-by: Daniel Borkmann <daniel@iogearbox.net>
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
Link: https://lore.kernel.org/bpf/20201010234006.7075-3-daniel@iogearbox.net
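
For illustration, a minimal sketch of a tc ingress program using the new helper.
The target ifindex is a hypothetical hard-coded value; a real loader would
resolve the host-side veth ifindex at attach time:

  // SPDX-License-Identifier: GPL-2.0
  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  /* Hypothetical ifindex of the veth whose peer sits inside the container
   * netns; in practice this would be filled in by the loader.
   */
  #define TARGET_VETH_IFINDEX 42

  SEC("tc")
  int redirect_to_pod(struct __sk_buff *skb)
  {
          /* flags must be 0; returns TC_ACT_REDIRECT on success and
           * TC_ACT_SHOT on error (e.g. no peer or peer in same netns).
           */
          return bpf_redirect_peer(TARGET_VETH_IFINDEX, 0);
  }

  char _license[] SEC("license") = "GPL";

Attached on the host-facing device's clsact ingress hook (roughly: tc qdisc add
dev <hostdev> clsact; tc filter add dev <hostdev> ingress bpf direct-action obj
prog.o sec tc), redirected packets surface directly on the peer's ingress
instead of taking the CPU backlog queue.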
parent dd2ce6a5
@@ -420,6 +420,14 @@ static int veth_select_rxq(struct net_device *dev)
 	return smp_processor_id() % dev->real_num_rx_queues;
 }
 
+static struct net_device *veth_peer_dev(struct net_device *dev)
+{
+	struct veth_priv *priv = netdev_priv(dev);
+
+	/* Callers must be under RCU read side. */
+	return rcu_dereference(priv->peer);
+}
+
 static int veth_xdp_xmit(struct net_device *dev, int n,
 			 struct xdp_frame **frames,
 			 u32 flags, bool ndo_xmit)
@@ -1224,6 +1232,7 @@ static const struct net_device_ops veth_netdev_ops = {
 	.ndo_set_rx_headroom	= veth_set_rx_headroom,
 	.ndo_bpf		= veth_xdp,
 	.ndo_xdp_xmit		= veth_ndo_xdp_xmit,
+	.ndo_get_peer_dev	= veth_peer_dev,
 };
 
 #define VETH_FEATURES	(NETIF_F_SG | NETIF_F_FRAGLIST | NETIF_F_HW_CSUM | \
......
@@ -1277,6 +1277,9 @@ struct netdev_net_notifier {
  * int (*ndo_tunnel_ctl)(struct net_device *dev, struct ip_tunnel_parm *p,
  *			  int cmd);
  *	Add, change, delete or get information on an IPv4 tunnel.
+ * struct net_device *(*ndo_get_peer_dev)(struct net_device *dev);
+ *	If a device is paired with a peer device, return the peer instance.
+ *	The caller must be under RCU read context.
  */
 struct net_device_ops {
 	int			(*ndo_init)(struct net_device *dev);
@@ -1484,6 +1487,7 @@ struct net_device_ops {
 	struct devlink_port *	(*ndo_get_devlink_port)(struct net_device *dev);
 	int			(*ndo_tunnel_ctl)(struct net_device *dev,
 						  struct ip_tunnel_parm *p, int cmd);
+	struct net_device *	(*ndo_get_peer_dev)(struct net_device *dev);
 };
 
 /**
......
@@ -3719,6 +3719,22 @@ union bpf_attr {
  * 		never return NULL.
  * 	Return
  * 		A pointer pointing to the kernel percpu variable on this cpu.
+ *
+ * long bpf_redirect_peer(u32 ifindex, u64 flags)
+ * 	Description
+ * 		Redirect the packet to another net device of index *ifindex*.
+ * 		This helper is somewhat similar to **bpf_redirect**\ (), except
+ * 		that the redirection happens to the *ifindex*' peer device and
+ * 		the netns switch takes place from ingress to ingress without
+ * 		going through the CPU's backlog queue.
+ *
+ * 		The *flags* argument is reserved and must be 0. The helper is
+ * 		currently only supported for tc BPF program types at the ingress
+ * 		hook and for veth device types. The peer device must reside in a
+ * 		different network namespace.
+ * 	Return
+ * 		The helper returns **TC_ACT_REDIRECT** on success or
+ * 		**TC_ACT_SHOT** on error.
  */
 #define __BPF_FUNC_MAPPER(FN) \
 	FN(unspec),		\
@@ -3876,6 +3892,7 @@ union bpf_attr {
 	FN(redirect_neigh),	\
 	FN(bpf_per_cpu_ptr),	\
 	FN(bpf_this_cpu_ptr),	\
+	FN(redirect_peer),	\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
......
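
Aside from the documentation above: since the helper only works from the tc
ingress hook and only for device types that expose a peer in another netns
(veth at this point), a program that also handles other targets may keep
bpf_redirect() as a fallback. A hedged sketch; the has_peer policy input is an
assumption, e.g. derived from the caller's own map lookup:

  #include <linux/bpf.h>
  #include <bpf/bpf_helpers.h>

  /* Sketch only: take the ingress-to-ingress fast path when the target is
   * known to be a veth with its peer in a different netns, otherwise fall
   * back to a regular redirect via the target's egress.
   */
  static __always_inline long redirect_fast_or_slow(__u32 ifindex, int has_peer)
  {
          if (has_peer)
                  return bpf_redirect_peer(ifindex, 0);
          return bpf_redirect(ifindex, 0);
  }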
@@ -4930,7 +4930,7 @@ EXPORT_SYMBOL_GPL(br_fdb_test_addr_hook);
 
 static inline struct sk_buff *
 sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
-		   struct net_device *orig_dev)
+		   struct net_device *orig_dev, bool *another)
 {
 #ifdef CONFIG_NET_CLS_ACT
 	struct mini_Qdisc *miniq = rcu_dereference_bh(skb->dev->miniq_ingress);
@@ -4974,7 +4974,11 @@ sch_handle_ingress(struct sk_buff *skb, struct packet_type **pt_prev, int *ret,
 		 * redirecting to another netdev
 		 */
 		__skb_push(skb, skb->mac_len);
-		skb_do_redirect(skb);
+		if (skb_do_redirect(skb) == -EAGAIN) {
+			__skb_pull(skb, skb->mac_len);
+			*another = true;
+			break;
+		}
 		return NULL;
 	case TC_ACT_CONSUMED:
 		return NULL;
@@ -5163,7 +5167,12 @@ static int __netif_receive_skb_core(struct sk_buff **pskb, bool pfmemalloc,
 skip_taps:
 #ifdef CONFIG_NET_INGRESS
 	if (static_branch_unlikely(&ingress_needed_key)) {
-		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev);
+		bool another = false;
+
+		skb = sch_handle_ingress(skb, &pt_prev, &ret, orig_dev,
+					 &another);
+		if (another)
+			goto another_round;
 		if (!skb)
 			goto out;
......
@@ -2381,7 +2381,8 @@ static int __bpf_redirect_neigh(struct sk_buff *skb, struct net_device *dev)
 /* Internal, non-exposed redirect flags. */
 enum {
 	BPF_F_NEIGH	= (1ULL << 1),
-#define BPF_F_REDIRECT_INTERNAL	(BPF_F_NEIGH)
+	BPF_F_PEER	= (1ULL << 2),
+#define BPF_F_REDIRECT_INTERNAL	(BPF_F_NEIGH | BPF_F_PEER)
 };
 
 BPF_CALL_3(bpf_clone_redirect, struct sk_buff *, skb, u32, ifindex, u64, flags)
@@ -2430,19 +2431,35 @@ EXPORT_PER_CPU_SYMBOL_GPL(bpf_redirect_info);
 int skb_do_redirect(struct sk_buff *skb)
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+	struct net *net = dev_net(skb->dev);
 	struct net_device *dev;
 	u32 flags = ri->flags;
 
-	dev = dev_get_by_index_rcu(dev_net(skb->dev), ri->tgt_index);
+	dev = dev_get_by_index_rcu(net, ri->tgt_index);
 	ri->tgt_index = 0;
-	if (unlikely(!dev)) {
-		kfree_skb(skb);
-		return -EINVAL;
-	}
+	ri->flags = 0;
+	if (unlikely(!dev))
+		goto out_drop;
+	if (flags & BPF_F_PEER) {
+		const struct net_device_ops *ops = dev->netdev_ops;
+
+		if (unlikely(!ops->ndo_get_peer_dev ||
+			     !skb_at_tc_ingress(skb)))
+			goto out_drop;
+		dev = ops->ndo_get_peer_dev(dev);
+		if (unlikely(!dev ||
+			     !is_skb_forwardable(dev, skb) ||
+			     net_eq(net, dev_net(dev))))
+			goto out_drop;
+		skb->dev = dev;
+		return -EAGAIN;
+	}
 	return flags & BPF_F_NEIGH ?
 	       __bpf_redirect_neigh(skb, dev) :
 	       __bpf_redirect(skb, dev, flags);
+out_drop:
+	kfree_skb(skb);
+	return -EINVAL;
 }
 
 BPF_CALL_2(bpf_redirect, u32, ifindex, u64, flags)
@@ -2466,6 +2483,27 @@ static const struct bpf_func_proto bpf_redirect_proto = {
 	.arg2_type      = ARG_ANYTHING,
 };
 
+BPF_CALL_2(bpf_redirect_peer, u32, ifindex, u64, flags)
+{
+	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
+
+	if (unlikely(flags))
+		return TC_ACT_SHOT;
+
+	ri->flags = BPF_F_PEER;
+	ri->tgt_index = ifindex;
+
+	return TC_ACT_REDIRECT;
+}
+
+static const struct bpf_func_proto bpf_redirect_peer_proto = {
+	.func		= bpf_redirect_peer,
+	.gpl_only	= false,
+	.ret_type	= RET_INTEGER,
+	.arg1_type	= ARG_ANYTHING,
+	.arg2_type	= ARG_ANYTHING,
+};
+
 BPF_CALL_2(bpf_redirect_neigh, u32, ifindex, u64, flags)
 {
 	struct bpf_redirect_info *ri = this_cpu_ptr(&bpf_redirect_info);
@@ -7053,6 +7091,8 @@ tc_cls_act_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog)
 		return &bpf_redirect_proto;
 	case BPF_FUNC_redirect_neigh:
 		return &bpf_redirect_neigh_proto;
+	case BPF_FUNC_redirect_peer:
+		return &bpf_redirect_peer_proto;
 	case BPF_FUNC_get_route_realm:
 		return &bpf_get_route_realm_proto;
 	case BPF_FUNC_get_hash_recalc:
......
@@ -3719,6 +3719,22 @@ union bpf_attr {
  * 		never return NULL.
  * 	Return
  * 		A pointer pointing to the kernel percpu variable on this cpu.
+ *
+ * long bpf_redirect_peer(u32 ifindex, u64 flags)
+ * 	Description
+ * 		Redirect the packet to another net device of index *ifindex*.
+ * 		This helper is somewhat similar to **bpf_redirect**\ (), except
+ * 		that the redirection happens to the *ifindex*' peer device and
+ * 		the netns switch takes place from ingress to ingress without
+ * 		going through the CPU's backlog queue.
+ *
+ * 		The *flags* argument is reserved and must be 0. The helper is
+ * 		currently only supported for tc BPF program types at the ingress
+ * 		hook and for veth device types. The peer device must reside in a
+ * 		different network namespace.
+ * 	Return
+ * 		The helper returns **TC_ACT_REDIRECT** on success or
+ * 		**TC_ACT_SHOT** on error.
  */
 #define __BPF_FUNC_MAPPER(FN) \
 	FN(unspec),		\
@@ -3876,6 +3892,7 @@ union bpf_attr {
 	FN(redirect_neigh),	\
 	FN(bpf_per_cpu_ptr),	\
 	FN(bpf_this_cpu_ptr),	\
+	FN(redirect_peer),	\
 	/* */
 
 /* integer value in 'imm' field of BPF_CALL instruction selects which helper
......