Commit b5b6ff73 authored by Alexei Starovoitov

Merge branch 'bpf-sockmap-fixes'

John Fastabend says:

====================
When I added test_sockmap to selftests I mistakenly changed the test
logic a bit. The result was that in the redirect cases we ended up
choosing the wrong sock from the BPF program and sending to a socket
that had no receive handler. The actual receive handler, running on a
different socket, would then time out and close the socket, producing
errors (-EPIPE, to be specific) on the sending side. This typically
happens when the sender does not complete the send before the receive
side times out, so depending on timing and the size of the send we may
or may not see errors. This exposed some bugs in the sockmap error
path handling.
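For context, a sockmap test of this kind attaches a small verdict
program that picks the destination socket by map index; a rough sketch
follows. This is illustrative only, not the exact test_sockmap program:
the map name, section name, and the index 1 are assumptions, although
bpf_sk_redirect_map() is the real helper used for this kind of
redirect. Choosing the wrong index here is exactly how data ends up on
a socket that nothing is reading from.

#include <linux/bpf.h>
#include "bpf_helpers.h"

struct bpf_map_def SEC("maps") sock_map = {
        .type        = BPF_MAP_TYPE_SOCKMAP,
        .key_size    = sizeof(int),
        .value_size  = sizeof(int),
        .max_entries = 2,
};

SEC("sk_skb2")
int bpf_prog_verdict(struct __sk_buff *skb)
{
        /* Deliver the data to the socket stored at index 1 of sock_map.
         * Redirecting to an index whose socket has no receive handler
         * is the failure mode described above. */
        return bpf_sk_redirect_map(skb, &sock_map, 1, 0);
}

char _license[] SEC("license") = "GPL";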

This series fixes those errors. The primary issue is that we did not do
proper memory accounting in these cases, which resulted in a missing
sk_mem_uncharge(). This happened in the redirect path and, in one case,
on the normal send path. See the three patches for the details.
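As a rough illustration of the accounting rule the patches restore,
here is a minimal userspace sketch (not kernel code; the acct, charge,
uncharge, and redirect_fails names are made up for the example): every
byte charged against a socket's memory budget must be uncharged when
the buffer carrying it is released, including on error paths.

#include <stdbool.h>
#include <stdio.h>

struct acct { long charged; };  /* stand-in for per-socket accounting */

static void charge(struct acct *a, long bytes)   { a->charged += bytes; }
static void uncharge(struct acct *a, long bytes) { a->charged -= bytes; }

/* Simulated redirect that fails: the buffer is dropped, and the charge
 * must be rolled back or the socket's accounting leaks. */
static void redirect_fails(struct acct *a, long bytes, bool do_uncharge)
{
        if (do_uncharge)
                uncharge(a, bytes);
}

int main(void)
{
        struct acct a = { 0 };

        charge(&a, 4096);
        redirect_fails(&a, 4096, true);   /* fixed error path: balances */
        printf("after fixed error path: %ld\n", a.charged);   /* 0 */

        charge(&a, 4096);
        redirect_fails(&a, 4096, false);  /* buggy error path: 4096 leaks */
        printf("after buggy error path: %ld\n", a.charged);
        return 0;
}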

The other takeaway from this is that we need to fix test_sockmap and
also add more negative test cases. That will happen in bpf-next.

Finally, I tested this using the existing test_sockmap program, the
older sockmap sample test script, and a few real use cases with
Cilium. All of these appear to be working correctly.

v2: fix compiler warning, drop iterator variable 'i' that is no longer
    used in patch 3.
====================
Signed-off-by: Alexei Starovoitov <ast@kernel.org>
parents 0f58e58e abaeb096
@@ -326,6 +326,9 @@ static int bpf_tcp_push(struct sock *sk, int apply_bytes,
 		if (ret > 0) {
 			if (apply)
 				apply_bytes -= ret;
+
+			sg->offset += ret;
+			sg->length -= ret;
 			size -= ret;
 			offset += ret;
 			if (uncharge)
@@ -333,8 +336,6 @@ static int bpf_tcp_push(struct sock *sk, int apply_bytes,
 			goto retry;
 		}
 
-		sg->length = size;
-		sg->offset = offset;
 		return ret;
 	}
 
@@ -392,7 +393,8 @@ static void return_mem_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
 	} while (i != md->sg_end);
 }
 
-static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
+static void free_bytes_sg(struct sock *sk, int bytes,
+			  struct sk_msg_buff *md, bool charge)
 {
 	struct scatterlist *sg = md->sg_data;
 	int i = md->sg_start, free;
@@ -402,10 +404,12 @@ static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
 		if (bytes < free) {
 			sg[i].length -= bytes;
 			sg[i].offset += bytes;
-			sk_mem_uncharge(sk, bytes);
+			if (charge)
+				sk_mem_uncharge(sk, bytes);
 			break;
 		}
 
-		sk_mem_uncharge(sk, sg[i].length);
+		if (charge)
+			sk_mem_uncharge(sk, sg[i].length);
 		put_page(sg_page(&sg[i]));
 		bytes -= sg[i].length;
@@ -417,6 +421,7 @@ static void free_bytes_sg(struct sock *sk, int bytes, struct sk_msg_buff *md)
 		if (i == MAX_SKB_FRAGS)
 			i = 0;
 	}
+	md->sg_start = i;
 }
 
 static int free_sg(struct sock *sk, int start, struct sk_msg_buff *md)
@@ -575,10 +580,10 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send,
 					struct sk_msg_buff *md,
 					int flags)
 {
+	bool ingress = !!(md->flags & BPF_F_INGRESS);
 	struct smap_psock *psock;
 	struct scatterlist *sg;
-	int i, err, free = 0;
-	bool ingress = !!(md->flags & BPF_F_INGRESS);
+	int err = 0;
 
 	sg = md->sg_data;
 
@@ -606,16 +611,8 @@ static int bpf_tcp_sendmsg_do_redirect(struct sock *sk, int send,
 out_rcu:
 	rcu_read_unlock();
 out:
-	i = md->sg_start;
-	while (sg[i].length) {
-		free += sg[i].length;
-		put_page(sg_page(&sg[i]));
-		sg[i].length = 0;
-		i++;
-		if (i == MAX_SKB_FRAGS)
-			i = 0;
-	}
-	return free;
+	free_bytes_sg(NULL, send, md, false);
+	return err;
 }
 
 static inline void bpf_md_init(struct smap_psock *psock)
@@ -700,19 +697,26 @@ static int bpf_exec_tx_verdict(struct smap_psock *psock,
 		err = bpf_tcp_sendmsg_do_redirect(redir, send, m, flags);
 		lock_sock(sk);
 
+		if (unlikely(err < 0)) {
+			free_start_sg(sk, m);
+			psock->sg_size = 0;
+			if (!cork)
+				*copied -= send;
+		} else {
+			psock->sg_size -= send;
+		}
+
 		if (cork) {
 			free_start_sg(sk, m);
+			psock->sg_size = 0;
 			kfree(m);
 			m = NULL;
+			err = 0;
 		}
-		if (unlikely(err))
-			*copied -= err;
-		else
-			psock->sg_size -= send;
 		break;
 	case __SK_DROP:
 	default:
-		free_bytes_sg(sk, send, m);
+		free_bytes_sg(sk, send, m, true);
 		apply_bytes_dec(psock, send);
 		*copied -= send;
 		psock->sg_size -= send;
...