Commit 3431205e authored by Alexei Starovoitov, committed by David S. Miller

bpf: make programs see skb->data == L2 for ingress and egress

eBPF programs attached to ingress and egress qdiscs see inconsistent skb->data:
for ingress the L2 header is already pulled, whereas for egress it is still
present. This is known to program writers, who are currently forced to use the
BPF_LL_OFF workaround.
Since programs don't change skb internal pointers, it is safe to do the
pull/push right around the invocation of the program; earlier taps and the
later pt->func() will not be affected.
Multiple taps via packet_rcv() and tpacket_rcv() already do the same trick
around run_filter()/BPF_PROG_RUN, even if the skb is shared.
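As a minimal sketch of that pattern (prog, skb and the helpers are the names
used in the tcf_bpf()/cls_bpf_classify() hunks below):

	bool at_ingress = G_TC_AT(skb->tc_verd) & AT_INGRESS;
	int filter_res;

	if (at_ingress) {
		/* ingress already pulled L2; push it back so the program
		 * sees skb->data == L2, then restore the old pointers */
		__skb_push(skb, skb->mac_len);
		filter_res = BPF_PROG_RUN(prog->filter, skb);
		__skb_pull(skb, skb->mac_len);
	} else {
		filter_res = BPF_PROG_RUN(prog->filter, skb);
	}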

This fix finally allows programs to use optimized LD_ABS/IND instructions
without BPF_LL_OFF for higher performance.
tc ingress + cls_bpf + samples/bpf/tcbpf1_kern.o
         w/o JIT   w/ JIT
before    20.5      23.6   Mpps
after     21.8      26.6   Mpps

Old programs with BPF_LL_OFF will still work as-is.
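For illustration (mirroring the tcbpf1_kern sample hunk below), both forms now
read the same IP protocol byte at ingress:

	/* old form: explicit link-layer base, still valid */
	__u8 p1 = load_byte(skb, BPF_LL_OFF + ETH_HLEN +
			    offsetof(struct iphdr, protocol));

	/* new form: skb->data == L2 on ingress and egress, so the plain
	 * mac-header-relative offset maps to the optimized LD_ABS path */
	__u8 p2 = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol));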

We can now undo most of the earlier workaround commit:
a166151c ("bpf: fix bpf helpers to use skb->mac_header relative offsets")
Signed-off-by: Alexei Starovoitov <ast@plumgrid.com>
Acked-by: Jamal Hadi Salim <jhs@mojatatu.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 98da81a4
@@ -1238,21 +1238,6 @@ int sk_attach_bpf(u32 ufd, struct sock *sk)
 	return 0;
 }
 
-/**
- * bpf_skb_clone_not_writable - is the header of a clone not writable
- * @skb: buffer to check
- * @len: length up to which to write, can be negative
- *
- * Returns true if modifying the header part of the cloned buffer
- * does require the data to be copied. I.e. this version works with
- * negative lengths needed for eBPF case!
- */
-static bool bpf_skb_clone_unwritable(const struct sk_buff *skb, int len)
-{
-	return skb_header_cloned(skb) ||
-	       (int) skb_headroom(skb) + len > skb->hdr_len;
-}
-
 #define BPF_RECOMPUTE_CSUM(flags)	((flags) & 1)
 
 static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
@@ -1275,9 +1260,8 @@ static u64 bpf_skb_store_bytes(u64 r1, u64 r2, u64 r3, u64 r4, u64 flags)
 	if (unlikely((u32) offset > 0xffff || len > sizeof(buf)))
 		return -EFAULT;
 
-	offset -= skb->data - skb_mac_header(skb);
 	if (unlikely(skb_cloned(skb) &&
-		     bpf_skb_clone_unwritable(skb, offset + len)))
+		     !skb_clone_writable(skb, offset + len)))
 		return -EFAULT;
 
 	ptr = skb_header_pointer(skb, offset, len, buf);
@@ -1321,9 +1305,8 @@ static u64 bpf_l3_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 	if (unlikely((u32) offset > 0xffff))
 		return -EFAULT;
 
-	offset -= skb->data - skb_mac_header(skb);
 	if (unlikely(skb_cloned(skb) &&
-		     bpf_skb_clone_unwritable(skb, offset + sizeof(sum))))
+		     !skb_clone_writable(skb, offset + sizeof(sum))))
 		return -EFAULT;
 
 	ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
@@ -1369,9 +1352,8 @@ static u64 bpf_l4_csum_replace(u64 r1, u64 r2, u64 from, u64 to, u64 flags)
 	if (unlikely((u32) offset > 0xffff))
 		return -EFAULT;
 
-	offset -= skb->data - skb_mac_header(skb);
 	if (unlikely(skb_cloned(skb) &&
-		     bpf_skb_clone_unwritable(skb, offset + sizeof(sum))))
+		     !skb_clone_writable(skb, offset + sizeof(sum))))
 		return -EFAULT;
 
 	ptr = skb_header_pointer(skb, offset, sizeof(sum), &sum);
@@ -1425,8 +1407,6 @@ static u64 bpf_clone_redirect(u64 r1, u64 ifindex, u64 flags, u64 r4, u64 r5)
 	if (unlikely(!skb2))
 		return -ENOMEM;
 
-	skb_push(skb2, skb2->data - skb_mac_header(skb2));
-
 	if (BPF_IS_REDIRECT_INGRESS(flags))
 		return dev_forward_skb(dev, skb2);
...
@@ -37,6 +37,7 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
 {
 	struct tcf_bpf *prog = act->priv;
 	int action, filter_res;
+	bool at_ingress = G_TC_AT(skb->tc_verd) & AT_INGRESS;
 
 	if (unlikely(!skb_mac_header_was_set(skb)))
 		return TC_ACT_UNSPEC;
@@ -48,7 +49,13 @@ static int tcf_bpf(struct sk_buff *skb, const struct tc_action *act,
 
 	/* Needed here for accessing maps. */
 	rcu_read_lock();
-	filter_res = BPF_PROG_RUN(prog->filter, skb);
+	if (at_ingress) {
+		__skb_push(skb, skb->mac_len);
+		filter_res = BPF_PROG_RUN(prog->filter, skb);
+		__skb_pull(skb, skb->mac_len);
+	} else {
+		filter_res = BPF_PROG_RUN(prog->filter, skb);
+	}
 	rcu_read_unlock();
 
 	/* A BPF program may overwrite the default action opcode.
...
@@ -64,6 +64,11 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 {
 	struct cls_bpf_head *head = rcu_dereference_bh(tp->root);
 	struct cls_bpf_prog *prog;
+#ifdef CONFIG_NET_CLS_ACT
+	bool at_ingress = G_TC_AT(skb->tc_verd) & AT_INGRESS;
+#else
+	bool at_ingress = false;
+#endif
 	int ret = -1;
 
 	if (unlikely(!skb_mac_header_was_set(skb)))
@@ -72,7 +77,16 @@ static int cls_bpf_classify(struct sk_buff *skb, const struct tcf_proto *tp,
 	/* Needed here for accessing maps. */
 	rcu_read_lock();
 	list_for_each_entry_rcu(prog, &head->plist, link) {
-		int filter_res = BPF_PROG_RUN(prog->filter, skb);
+		int filter_res;
+
+		if (at_ingress) {
+			/* It is safe to push/pull even if skb_shared() */
+			__skb_push(skb, skb->mac_len);
+			filter_res = BPF_PROG_RUN(prog->filter, skb);
+			__skb_pull(skb, skb->mac_len);
+		} else {
+			filter_res = BPF_PROG_RUN(prog->filter, skb);
+		}
 
 		if (filter_res == 0)
 			continue;
...
@@ -21,7 +21,7 @@ static inline void set_dst_mac(struct __sk_buff *skb, char *mac)
 
 static inline void set_ip_tos(struct __sk_buff *skb, __u8 new_tos)
 {
-	__u8 old_tos = load_byte(skb, BPF_LL_OFF + TOS_OFF);
+	__u8 old_tos = load_byte(skb, TOS_OFF);
 
 	bpf_l3_csum_replace(skb, IP_CSUM_OFF, htons(old_tos), htons(new_tos), 2);
 	bpf_skb_store_bytes(skb, TOS_OFF, &new_tos, sizeof(new_tos), 0);
@@ -34,7 +34,7 @@ static inline void set_ip_tos(struct __sk_buff *skb, __u8 new_tos)
 
 static inline void set_tcp_ip_src(struct __sk_buff *skb, __u32 new_ip)
 {
-	__u32 old_ip = _htonl(load_word(skb, BPF_LL_OFF + IP_SRC_OFF));
+	__u32 old_ip = _htonl(load_word(skb, IP_SRC_OFF));
 
 	bpf_l4_csum_replace(skb, TCP_CSUM_OFF, old_ip, new_ip, IS_PSEUDO | sizeof(new_ip));
 	bpf_l3_csum_replace(skb, IP_CSUM_OFF, old_ip, new_ip, sizeof(new_ip));
@@ -44,7 +44,7 @@ static inline void set_tcp_ip_src(struct __sk_buff *skb, __u32 new_ip)
 #define TCP_DPORT_OFF (ETH_HLEN + sizeof(struct iphdr) + offsetof(struct tcphdr, dest))
 static inline void set_tcp_dest_port(struct __sk_buff *skb, __u16 new_port)
 {
-	__u16 old_port = htons(load_half(skb, BPF_LL_OFF + TCP_DPORT_OFF));
+	__u16 old_port = htons(load_half(skb, TCP_DPORT_OFF));
 
 	bpf_l4_csum_replace(skb, TCP_CSUM_OFF, old_port, new_port, sizeof(new_port));
 	bpf_skb_store_bytes(skb, TCP_DPORT_OFF, &new_port, sizeof(new_port), 0);
@@ -53,7 +53,7 @@ static inline void set_tcp_dest_port(struct __sk_buff *skb, __u16 new_port)
 SEC("classifier")
 int bpf_prog1(struct __sk_buff *skb)
 {
-	__u8 proto = load_byte(skb, BPF_LL_OFF + ETH_HLEN + offsetof(struct iphdr, protocol));
+	__u8 proto = load_byte(skb, ETH_HLEN + offsetof(struct iphdr, protocol));
 	long *value;
 
 	if (proto == IPPROTO_TCP) {
...