Commit ebfa00c5, authored by Sabrina Dubroca, committed by David S. Miller

tcp: fix refcnt leak with ebpf congestion control

There are a few bugs around refcnt handling in the new BPF congestion
control setsockopt:

 - The new ca is assigned to icsk->icsk_ca_ops even in the case where we
   cannot get a reference on it. This would lead to a use after free,
   since that ca is going away soon.

 - In the case where the congestion control is changed, the refcnt on
   the previous ca is not released.

 - In the reinit case, we first leak a reference on the old ca, then we
   call tcp_reinit_congestion_control on the ca that we have just
   assigned, leading to deinitializing the wrong ca (->release of the
   new ca on the old ca's data) and releasing the refcount on the ca
   that we actually want to use.

This is visible by building (for example) BIC as a module and setting
net.ipv4.tcp_congestion_control=bic, and using tcp_cong_kern.c from
samples/bpf.

This patch fixes the refcount issues, and moves reinit back into tcp
core to avoid passing a ca pointer back to BPF.

Fixes: 91b5b21c ("bpf: Add support for changing congestion control")
Signed-off-by: Sabrina Dubroca <sd@queasysnail.net>
Acked-by: Lawrence Brakmo <brakmo@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 36143645
...@@ -1004,9 +1004,7 @@ void tcp_get_default_congestion_control(char *name); ...@@ -1004,9 +1004,7 @@ void tcp_get_default_congestion_control(char *name);
void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_available_congestion_control(char *buf, size_t len);
void tcp_get_allowed_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len);
int tcp_set_allowed_congestion_control(char *allowed); int tcp_set_allowed_congestion_control(char *allowed);
int tcp_set_congestion_control(struct sock *sk, const char *name, bool load); int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit);
void tcp_reinit_congestion_control(struct sock *sk,
const struct tcp_congestion_ops *ca);
u32 tcp_slow_start(struct tcp_sock *tp, u32 acked); u32 tcp_slow_start(struct tcp_sock *tp, u32 acked);
void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked); void tcp_cong_avoid_ai(struct tcp_sock *tp, u32 w, u32 acked);
......
...@@ -2836,15 +2836,12 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock, ...@@ -2836,15 +2836,12 @@ BPF_CALL_5(bpf_setsockopt, struct bpf_sock_ops_kern *, bpf_sock,
sk->sk_prot->setsockopt == tcp_setsockopt) { sk->sk_prot->setsockopt == tcp_setsockopt) {
if (optname == TCP_CONGESTION) { if (optname == TCP_CONGESTION) {
char name[TCP_CA_NAME_MAX]; char name[TCP_CA_NAME_MAX];
bool reinit = bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN;
strncpy(name, optval, min_t(long, optlen, strncpy(name, optval, min_t(long, optlen,
TCP_CA_NAME_MAX-1)); TCP_CA_NAME_MAX-1));
name[TCP_CA_NAME_MAX-1] = 0; name[TCP_CA_NAME_MAX-1] = 0;
ret = tcp_set_congestion_control(sk, name, false); ret = tcp_set_congestion_control(sk, name, false, reinit);
if (!ret && bpf_sock->op > BPF_SOCK_OPS_NEEDS_ECN)
/* replacing an existing ca */
tcp_reinit_congestion_control(sk,
inet_csk(sk)->icsk_ca_ops);
} else { } else {
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
......
...@@ -2481,7 +2481,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level, ...@@ -2481,7 +2481,7 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
name[val] = 0; name[val] = 0;
lock_sock(sk); lock_sock(sk);
err = tcp_set_congestion_control(sk, name, true); err = tcp_set_congestion_control(sk, name, true, true);
release_sock(sk); release_sock(sk);
return err; return err;
} }
......
...@@ -189,7 +189,7 @@ void tcp_init_congestion_control(struct sock *sk) ...@@ -189,7 +189,7 @@ void tcp_init_congestion_control(struct sock *sk)
INET_ECN_dontxmit(sk); INET_ECN_dontxmit(sk);
} }
void tcp_reinit_congestion_control(struct sock *sk, static void tcp_reinit_congestion_control(struct sock *sk,
const struct tcp_congestion_ops *ca) const struct tcp_congestion_ops *ca)
{ {
struct inet_connection_sock *icsk = inet_csk(sk); struct inet_connection_sock *icsk = inet_csk(sk);
...@@ -338,7 +338,7 @@ int tcp_set_allowed_congestion_control(char *val) ...@@ -338,7 +338,7 @@ int tcp_set_allowed_congestion_control(char *val)
* tcp_reinit_congestion_control (if the current congestion control was * tcp_reinit_congestion_control (if the current congestion control was
* already initialized. * already initialized.
*/ */
int tcp_set_congestion_control(struct sock *sk, const char *name, bool load) int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, bool reinit)
{ {
struct inet_connection_sock *icsk = inet_csk(sk); struct inet_connection_sock *icsk = inet_csk(sk);
const struct tcp_congestion_ops *ca; const struct tcp_congestion_ops *ca;
...@@ -360,9 +360,18 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load) ...@@ -360,9 +360,18 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load)
if (!ca) { if (!ca) {
err = -ENOENT; err = -ENOENT;
} else if (!load) { } else if (!load) {
const struct tcp_congestion_ops *old_ca = icsk->icsk_ca_ops;
if (try_module_get(ca->owner)) {
if (reinit) {
tcp_reinit_congestion_control(sk, ca);
} else {
icsk->icsk_ca_ops = ca; icsk->icsk_ca_ops = ca;
if (!try_module_get(ca->owner)) module_put(old_ca->owner);
}
} else {
err = -EBUSY; err = -EBUSY;
}
} else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) || } else if (!((ca->flags & TCP_CONG_NON_RESTRICTED) ||
ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) { ns_capable(sock_net(sk)->user_ns, CAP_NET_ADMIN))) {
err = -EPERM; err = -EPERM;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment