Commit 85ddd9c3 authored by Daniel Borkmann's avatar Daniel Borkmann

Merge branch 'bpf-sockmap-tls-fixes'

John Fastabend says:

====================
To date our usage of sockmap/tls has been fairly simple, the BPF programs
did only well-defined pop, push, pull and apply/cork operations.

Now that we started to push more complex programs into sockmap we uncovered
a series of issues addressed here. Further OpenSSL3.0 version should be
released soon with kTLS support so its important to get any remaining
issues on BPF and kTLS support resolved.

Additionally, I have a patch under development to allow sockmap to be
enabled/disabled at runtime for Cilium endpoints. This allows us to stress
the map insert/delete with kTLS more than previously where Cilium only
added the socket to the map when it entered ESTABLISHED state and never
touched it from the control path side again relying on the sockets own
close() hook to remove it.

To test I have a set of test cases in test_sockmap.c that expose these
issues. Once we get fixes here merged and in bpf-next I'll submit the
tests to bpf-next tree to ensure we don't regress again. Also I've run
these patches in the Cilium CI with OpenSSL (master branch) this will
run tools such as netperf, ab, wrk2, curl, etc. to get a broad set of
testing.

I'm aware of two more issues that we are working to resolve in another
couple (probably two) patches. First we see an auth tag corruption in
kTLS when sending small 1byte chunks under stress. I've not pinned this
down yet. But, guessing because its under 1B stress tests it must be
some error path being triggered. And second we need to ensure BPF RX
programs are not skipped when kTLS ULP is loaded. This breaks some of the
sockmap selftests when running with kTLS. I'll send a follow up for this.

v2: I dropped a patch that added !0 size check in tls_push_record
    this originated from a panic I caught awhile ago with a trace
    in the crypto stack. But I can not reproduce it anymore so will
    dig into that and send another patch later if needed. Anyways
    after a bit of thought it would be nicer if tls/crypto/bpf didn't
    require special case handling for the !0 size.
====================
Signed-off-by: default avatarDaniel Borkmann <daniel@iogearbox.net>
parents 0af2ffc9 7361d448
......@@ -358,17 +358,22 @@ static inline void sk_psock_update_proto(struct sock *sk,
static inline void sk_psock_restore_proto(struct sock *sk,
struct sk_psock *psock)
{
sk->sk_write_space = psock->saved_write_space;
sk->sk_prot->unhash = psock->saved_unhash;
if (psock->sk_proto) {
struct inet_connection_sock *icsk = inet_csk(sk);
bool has_ulp = !!icsk->icsk_ulp_data;
if (has_ulp)
tcp_update_ulp(sk, psock->sk_proto);
else
if (has_ulp) {
tcp_update_ulp(sk, psock->sk_proto,
psock->saved_write_space);
} else {
sk->sk_prot = psock->sk_proto;
sk->sk_write_space = psock->saved_write_space;
}
psock->sk_proto = NULL;
} else {
sk->sk_write_space = psock->saved_write_space;
}
}
......
......@@ -2147,7 +2147,8 @@ struct tcp_ulp_ops {
/* initialize ulp */
int (*init)(struct sock *sk);
/* update ulp */
void (*update)(struct sock *sk, struct proto *p);
void (*update)(struct sock *sk, struct proto *p,
void (*write_space)(struct sock *sk));
/* cleanup ulp */
void (*release)(struct sock *sk);
/* diagnostic */
......@@ -2162,7 +2163,8 @@ void tcp_unregister_ulp(struct tcp_ulp_ops *type);
int tcp_set_ulp(struct sock *sk, const char *name);
void tcp_get_available_ulp(char *buf, size_t len);
void tcp_cleanup_ulp(struct sock *sk);
void tcp_update_ulp(struct sock *sk, struct proto *p);
void tcp_update_ulp(struct sock *sk, struct proto *p,
void (*write_space)(struct sock *sk));
#define MODULE_ALIAS_TCP_ULP(name) \
__MODULE_INFO(alias, alias_userspace, name); \
......
......@@ -2231,10 +2231,10 @@ BPF_CALL_4(bpf_msg_pull_data, struct sk_msg *, msg, u32, start,
/* First find the starting scatterlist element */
i = msg->sg.start;
do {
offset += len;
len = sk_msg_elem(msg, i)->length;
if (start < offset + len)
break;
offset += len;
sk_msg_iter_var_next(i);
} while (i != msg->sg.end);
......@@ -2346,7 +2346,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
u32, len, u64, flags)
{
struct scatterlist sge, nsge, nnsge, rsge = {0}, *psge;
u32 new, i = 0, l, space, copy = 0, offset = 0;
u32 new, i = 0, l = 0, space, copy = 0, offset = 0;
u8 *raw, *to, *from;
struct page *page;
......@@ -2356,11 +2356,11 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
/* First find the starting scatterlist element */
i = msg->sg.start;
do {
offset += l;
l = sk_msg_elem(msg, i)->length;
if (start < offset + l)
break;
offset += l;
sk_msg_iter_var_next(i);
} while (i != msg->sg.end);
......@@ -2415,6 +2415,7 @@ BPF_CALL_4(bpf_msg_push_data, struct sk_msg *, msg, u32, start,
sk_msg_iter_var_next(i);
sg_unmark_end(psge);
sg_unmark_end(&rsge);
sk_msg_iter_next(msg, end);
}
......@@ -2506,7 +2507,7 @@ static void sk_msg_shift_right(struct sk_msg *msg, int i)
BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
u32, len, u64, flags)
{
u32 i = 0, l, space, offset = 0;
u32 i = 0, l = 0, space, offset = 0;
u64 last = start + len;
int pop;
......@@ -2516,11 +2517,11 @@ BPF_CALL_4(bpf_msg_pop_data, struct sk_msg *, msg, u32, start,
/* First find the starting scatterlist element */
i = msg->sg.start;
do {
offset += l;
l = sk_msg_elem(msg, i)->length;
if (start < offset + l)
break;
offset += l;
sk_msg_iter_var_next(i);
} while (i != msg->sg.end);
......
......@@ -594,6 +594,8 @@ EXPORT_SYMBOL_GPL(sk_psock_destroy);
void sk_psock_drop(struct sock *sk, struct sk_psock *psock)
{
sock_owned_by_me(sk);
sk_psock_cork_free(psock);
sk_psock_zap_ingress(psock);
......
......@@ -241,8 +241,11 @@ static void sock_map_free(struct bpf_map *map)
struct sock *sk;
sk = xchg(psk, NULL);
if (sk)
if (sk) {
lock_sock(sk);
sock_map_unref(sk, psk);
release_sock(sk);
}
}
raw_spin_unlock_bh(&stab->lock);
rcu_read_unlock();
......@@ -862,7 +865,9 @@ static void sock_hash_free(struct bpf_map *map)
raw_spin_lock_bh(&bucket->lock);
hlist_for_each_entry_safe(elem, node, &bucket->head, node) {
hlist_del_rcu(&elem->node);
lock_sock(elem->sk);
sock_map_unref(elem->sk, elem);
release_sock(elem->sk);
}
raw_spin_unlock_bh(&bucket->lock);
}
......
......@@ -315,10 +315,7 @@ static int tcp_bpf_send_verdict(struct sock *sk, struct sk_psock *psock,
*/
delta = msg->sg.size;
psock->eval = sk_psock_msg_verdict(sk, psock, msg);
if (msg->sg.size < delta)
delta -= msg->sg.size;
else
delta = 0;
delta -= msg->sg.size;
}
if (msg->cork_bytes &&
......
......@@ -99,17 +99,19 @@ void tcp_get_available_ulp(char *buf, size_t maxlen)
rcu_read_unlock();
}
void tcp_update_ulp(struct sock *sk, struct proto *proto)
void tcp_update_ulp(struct sock *sk, struct proto *proto,
void (*write_space)(struct sock *sk))
{
struct inet_connection_sock *icsk = inet_csk(sk);
if (!icsk->icsk_ulp_ops) {
sk->sk_write_space = write_space;
sk->sk_prot = proto;
return;
}
if (icsk->icsk_ulp_ops->update)
icsk->icsk_ulp_ops->update(sk, proto);
icsk->icsk_ulp_ops->update(sk, proto, write_space);
}
void tcp_cleanup_ulp(struct sock *sk)
......
......@@ -732,15 +732,19 @@ static int tls_init(struct sock *sk)
return rc;
}
static void tls_update(struct sock *sk, struct proto *p)
static void tls_update(struct sock *sk, struct proto *p,
void (*write_space)(struct sock *sk))
{
struct tls_context *ctx;
ctx = tls_get_ctx(sk);
if (likely(ctx))
if (likely(ctx)) {
ctx->sk_write_space = write_space;
ctx->sk_proto = p;
else
} else {
sk->sk_prot = p;
sk->sk_write_space = write_space;
}
}
static int tls_get_info(const struct sock *sk, struct sk_buff *skb)
......
......@@ -682,12 +682,32 @@ static int tls_push_record(struct sock *sk, int flags,
split_point = msg_pl->apply_bytes;
split = split_point && split_point < msg_pl->sg.size;
if (unlikely((!split &&
msg_pl->sg.size +
prot->overhead_size > msg_en->sg.size) ||
(split &&
split_point +
prot->overhead_size > msg_en->sg.size))) {
split = true;
split_point = msg_en->sg.size;
}
if (split) {
rc = tls_split_open_record(sk, rec, &tmp, msg_pl, msg_en,
split_point, prot->overhead_size,
&orig_end);
if (rc < 0)
return rc;
/* This can happen if above tls_split_open_record allocates
* a single large encryption buffer instead of two smaller
* ones. In this case adjust pointers and continue without
* split.
*/
if (!msg_pl->sg.size) {
tls_merge_open_record(sk, rec, tmp, orig_end);
msg_pl = &rec->msg_plaintext;
msg_en = &rec->msg_encrypted;
split = false;
}
sk_msg_trim(sk, msg_en, msg_pl->sg.size +
prot->overhead_size);
}
......@@ -709,6 +729,12 @@ static int tls_push_record(struct sock *sk, int flags,
sg_mark_end(sk_msg_elem(msg_pl, i));
}
if (msg_pl->sg.end < msg_pl->sg.start) {
sg_chain(&msg_pl->sg.data[msg_pl->sg.start],
MAX_SKB_FRAGS - msg_pl->sg.start + 1,
msg_pl->sg.data);
}
i = msg_pl->sg.start;
sg_chain(rec->sg_aead_in, 2, &msg_pl->sg.data[i]);
......@@ -783,10 +809,7 @@ static int bpf_exec_tx_verdict(struct sk_msg *msg, struct sock *sk,
if (psock->eval == __SK_NONE) {
delta = msg->sg.size;
psock->eval = sk_psock_msg_verdict(sk, psock, msg);
if (delta < msg->sg.size)
delta -= msg->sg.size;
else
delta = 0;
delta -= msg->sg.size;
}
if (msg->cork_bytes && msg->cork_bytes > msg->sg.size &&
!enospc && !full_record) {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment