Commit 6670e152 authored by Stephen Hemminger, committed by David S. Miller

tcp: Namespace-ify sysctl_tcp_default_congestion_control

Make the default TCP congestion control a per-namespace value. This
changes the default congestion control to a pointer to the congestion ops
(rather than it being implicitly the first element of the available list).

The congestion control setting of new namespaces is inherited
from the current setting of the root namespace.
Signed-off-by: Stephen Hemminger <sthemmin@microsoft.com>
Reviewed-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 11bf284f
...@@ -160,6 +160,7 @@ struct netns_ipv4 { ...@@ -160,6 +160,7 @@ struct netns_ipv4 {
struct inet_timewait_death_row tcp_death_row; struct inet_timewait_death_row tcp_death_row;
int sysctl_max_syn_backlog; int sysctl_max_syn_backlog;
int sysctl_tcp_fastopen; int sysctl_tcp_fastopen;
const struct tcp_congestion_ops __rcu *tcp_congestion_control;
struct tcp_fastopen_context __rcu *tcp_fastopen_ctx; struct tcp_fastopen_context __rcu *tcp_fastopen_ctx;
spinlock_t tcp_fastopen_ctx_lock; spinlock_t tcp_fastopen_ctx_lock;
unsigned int sysctl_tcp_fastopen_blackhole_timeout; unsigned int sysctl_tcp_fastopen_blackhole_timeout;
......
...@@ -1002,8 +1002,8 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *type); ...@@ -1002,8 +1002,8 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *type);
void tcp_assign_congestion_control(struct sock *sk); void tcp_assign_congestion_control(struct sock *sk);
void tcp_init_congestion_control(struct sock *sk); void tcp_init_congestion_control(struct sock *sk);
void tcp_cleanup_congestion_control(struct sock *sk); void tcp_cleanup_congestion_control(struct sock *sk);
int tcp_set_default_congestion_control(const char *name); int tcp_set_default_congestion_control(struct net *net, const char *name);
void tcp_get_default_congestion_control(char *name); void tcp_get_default_congestion_control(struct net *net, char *name);
void tcp_get_available_congestion_control(char *buf, size_t len); void tcp_get_available_congestion_control(char *buf, size_t len);
void tcp_get_allowed_congestion_control(char *buf, size_t len); void tcp_get_allowed_congestion_control(char *buf, size_t len);
int tcp_set_allowed_congestion_control(char *allowed); int tcp_set_allowed_congestion_control(char *allowed);
...@@ -1017,7 +1017,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked); ...@@ -1017,7 +1017,7 @@ void tcp_reno_cong_avoid(struct sock *sk, u32 ack, u32 acked);
extern struct tcp_congestion_ops tcp_reno; extern struct tcp_congestion_ops tcp_reno;
struct tcp_congestion_ops *tcp_ca_find_key(u32 key); struct tcp_congestion_ops *tcp_ca_find_key(u32 key);
u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca); u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca);
#ifdef CONFIG_INET #ifdef CONFIG_INET
char *tcp_ca_get_name_by_key(u32 key, char *buffer); char *tcp_ca_get_name_by_key(u32 key, char *buffer);
#else #else
......
...@@ -710,7 +710,7 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi) ...@@ -710,7 +710,7 @@ bool fib_metrics_match(struct fib_config *cfg, struct fib_info *fi)
bool ecn_ca = false; bool ecn_ca = false;
nla_strlcpy(tmp, nla, sizeof(tmp)); nla_strlcpy(tmp, nla, sizeof(tmp));
val = tcp_ca_get_key_by_name(tmp, &ecn_ca); val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca);
} else { } else {
val = nla_get_u32(nla); val = nla_get_u32(nla);
} }
...@@ -1030,7 +1030,7 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg) ...@@ -1030,7 +1030,7 @@ fib_convert_metrics(struct fib_info *fi, const struct fib_config *cfg)
char tmp[TCP_CA_NAME_MAX]; char tmp[TCP_CA_NAME_MAX];
nla_strlcpy(tmp, nla, sizeof(tmp)); nla_strlcpy(tmp, nla, sizeof(tmp));
val = tcp_ca_get_key_by_name(tmp, &ecn_ca); val = tcp_ca_get_key_by_name(fi->fib_net, tmp, &ecn_ca);
if (val == TCP_CA_UNSPEC) if (val == TCP_CA_UNSPEC)
return -EINVAL; return -EINVAL;
} else { } else {
......
...@@ -201,6 +201,8 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write, ...@@ -201,6 +201,8 @@ static int ipv4_ping_group_range(struct ctl_table *table, int write,
static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
void __user *buffer, size_t *lenp, loff_t *ppos) void __user *buffer, size_t *lenp, loff_t *ppos)
{ {
struct net *net = container_of(ctl->data, struct net,
ipv4.tcp_congestion_control);
char val[TCP_CA_NAME_MAX]; char val[TCP_CA_NAME_MAX];
struct ctl_table tbl = { struct ctl_table tbl = {
.data = val, .data = val,
...@@ -208,11 +210,11 @@ static int proc_tcp_congestion_control(struct ctl_table *ctl, int write, ...@@ -208,11 +210,11 @@ static int proc_tcp_congestion_control(struct ctl_table *ctl, int write,
}; };
int ret; int ret;
tcp_get_default_congestion_control(val); tcp_get_default_congestion_control(net, val);
ret = proc_dostring(&tbl, write, buffer, lenp, ppos); ret = proc_dostring(&tbl, write, buffer, lenp, ppos);
if (write && ret == 0) if (write && ret == 0)
ret = tcp_set_default_congestion_control(val); ret = tcp_set_default_congestion_control(net, val);
return ret; return ret;
} }
...@@ -447,12 +449,6 @@ static struct ctl_table ipv4_table[] = { ...@@ -447,12 +449,6 @@ static struct ctl_table ipv4_table[] = {
.mode = 0644, .mode = 0644,
.proc_handler = proc_dointvec .proc_handler = proc_dointvec
}, },
{
.procname = "tcp_congestion_control",
.mode = 0644,
.maxlen = TCP_CA_NAME_MAX,
.proc_handler = proc_tcp_congestion_control,
},
#ifdef CONFIG_NETLABEL #ifdef CONFIG_NETLABEL
{ {
.procname = "cipso_cache_enable", .procname = "cipso_cache_enable",
...@@ -763,6 +759,13 @@ static struct ctl_table ipv4_net_table[] = { ...@@ -763,6 +759,13 @@ static struct ctl_table ipv4_net_table[] = {
.extra1 = &one .extra1 = &one
}, },
#endif #endif
{
.procname = "tcp_congestion_control",
.data = &init_net.ipv4.tcp_congestion_control,
.mode = 0644,
.maxlen = TCP_CA_NAME_MAX,
.proc_handler = proc_tcp_congestion_control,
},
{ {
.procname = "tcp_keepalive_time", .procname = "tcp_keepalive_time",
.data = &init_net.ipv4.sysctl_tcp_keepalive_time, .data = &init_net.ipv4.sysctl_tcp_keepalive_time,
......
...@@ -33,9 +33,11 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name) ...@@ -33,9 +33,11 @@ static struct tcp_congestion_ops *tcp_ca_find(const char *name)
} }
/* Must be called with rcu lock held */ /* Must be called with rcu lock held */
static const struct tcp_congestion_ops *__tcp_ca_find_autoload(const char *name) static struct tcp_congestion_ops *tcp_ca_find_autoload(struct net *net,
const char *name)
{ {
const struct tcp_congestion_ops *ca = tcp_ca_find(name); struct tcp_congestion_ops *ca = tcp_ca_find(name);
#ifdef CONFIG_MODULES #ifdef CONFIG_MODULES
if (!ca && capable(CAP_NET_ADMIN)) { if (!ca && capable(CAP_NET_ADMIN)) {
rcu_read_unlock(); rcu_read_unlock();
...@@ -115,7 +117,7 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca) ...@@ -115,7 +117,7 @@ void tcp_unregister_congestion_control(struct tcp_congestion_ops *ca)
} }
EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control); EXPORT_SYMBOL_GPL(tcp_unregister_congestion_control);
u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca) u32 tcp_ca_get_key_by_name(struct net *net, const char *name, bool *ecn_ca)
{ {
const struct tcp_congestion_ops *ca; const struct tcp_congestion_ops *ca;
u32 key = TCP_CA_UNSPEC; u32 key = TCP_CA_UNSPEC;
...@@ -123,7 +125,7 @@ u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca) ...@@ -123,7 +125,7 @@ u32 tcp_ca_get_key_by_name(const char *name, bool *ecn_ca)
might_sleep(); might_sleep();
rcu_read_lock(); rcu_read_lock();
ca = __tcp_ca_find_autoload(name); ca = tcp_ca_find_autoload(net, name);
if (ca) { if (ca) {
key = ca->key; key = ca->key;
*ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN; *ecn_ca = ca->flags & TCP_CONG_NEEDS_ECN;
...@@ -153,23 +155,18 @@ EXPORT_SYMBOL_GPL(tcp_ca_get_name_by_key); ...@@ -153,23 +155,18 @@ EXPORT_SYMBOL_GPL(tcp_ca_get_name_by_key);
/* Assign choice of congestion control. */ /* Assign choice of congestion control. */
void tcp_assign_congestion_control(struct sock *sk) void tcp_assign_congestion_control(struct sock *sk)
{ {
struct net *net = sock_net(sk);
struct inet_connection_sock *icsk = inet_csk(sk); struct inet_connection_sock *icsk = inet_csk(sk);
struct tcp_congestion_ops *ca; const struct tcp_congestion_ops *ca;
rcu_read_lock(); rcu_read_lock();
list_for_each_entry_rcu(ca, &tcp_cong_list, list) { ca = rcu_dereference(net->ipv4.tcp_congestion_control);
if (likely(try_module_get(ca->owner))) { if (unlikely(!try_module_get(ca->owner)))
icsk->icsk_ca_ops = ca; ca = &tcp_reno;
goto out; icsk->icsk_ca_ops = ca;
}
/* Fallback to next available. The last really
* guaranteed fallback is Reno from this list.
*/
}
out:
rcu_read_unlock(); rcu_read_unlock();
memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
memset(icsk->icsk_ca_priv, 0, sizeof(icsk->icsk_ca_priv));
if (ca->flags & TCP_CONG_NEEDS_ECN) if (ca->flags & TCP_CONG_NEEDS_ECN)
INET_ECN_xmit(sk); INET_ECN_xmit(sk);
else else
...@@ -214,29 +211,27 @@ void tcp_cleanup_congestion_control(struct sock *sk) ...@@ -214,29 +211,27 @@ void tcp_cleanup_congestion_control(struct sock *sk)
} }
/* Used by sysctl to change default congestion control */ /* Used by sysctl to change default congestion control */
int tcp_set_default_congestion_control(const char *name) int tcp_set_default_congestion_control(struct net *net, const char *name)
{ {
struct tcp_congestion_ops *ca; struct tcp_congestion_ops *ca;
int ret = -ENOENT; const struct tcp_congestion_ops *prev;
int ret;
spin_lock(&tcp_cong_list_lock);
ca = tcp_ca_find(name);
#ifdef CONFIG_MODULES
if (!ca && capable(CAP_NET_ADMIN)) {
spin_unlock(&tcp_cong_list_lock);
request_module("tcp_%s", name); rcu_read_lock();
spin_lock(&tcp_cong_list_lock); ca = tcp_ca_find_autoload(net, name);
ca = tcp_ca_find(name); if (!ca) {
} ret = -ENOENT;
#endif } else if (!try_module_get(ca->owner)) {
ret = -EBUSY;
} else {
prev = xchg(&net->ipv4.tcp_congestion_control, ca);
if (prev)
module_put(prev->owner);
if (ca) { ca->flags |= TCP_CONG_NON_RESTRICTED;
ca->flags |= TCP_CONG_NON_RESTRICTED; /* default is always allowed */
list_move(&ca->list, &tcp_cong_list);
ret = 0; ret = 0;
} }
spin_unlock(&tcp_cong_list_lock); rcu_read_unlock();
return ret; return ret;
} }
...@@ -244,7 +239,8 @@ int tcp_set_default_congestion_control(const char *name) ...@@ -244,7 +239,8 @@ int tcp_set_default_congestion_control(const char *name)
/* Set default value from kernel configuration at bootup */ /* Set default value from kernel configuration at bootup */
static int __init tcp_congestion_default(void) static int __init tcp_congestion_default(void)
{ {
return tcp_set_default_congestion_control(CONFIG_DEFAULT_TCP_CONG); return tcp_set_default_congestion_control(&init_net,
CONFIG_DEFAULT_TCP_CONG);
} }
late_initcall(tcp_congestion_default); late_initcall(tcp_congestion_default);
...@@ -264,14 +260,12 @@ void tcp_get_available_congestion_control(char *buf, size_t maxlen) ...@@ -264,14 +260,12 @@ void tcp_get_available_congestion_control(char *buf, size_t maxlen)
} }
/* Get current default congestion control */ /* Get current default congestion control */
void tcp_get_default_congestion_control(char *name) void tcp_get_default_congestion_control(struct net *net, char *name)
{ {
struct tcp_congestion_ops *ca; const struct tcp_congestion_ops *ca;
/* We will always have reno... */
BUG_ON(list_empty(&tcp_cong_list));
rcu_read_lock(); rcu_read_lock();
ca = list_entry(tcp_cong_list.next, struct tcp_congestion_ops, list); ca = rcu_dereference(net->ipv4.tcp_congestion_control);
strncpy(name, ca->name, TCP_CA_NAME_MAX); strncpy(name, ca->name, TCP_CA_NAME_MAX);
rcu_read_unlock(); rcu_read_unlock();
} }
...@@ -351,12 +345,14 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, boo ...@@ -351,12 +345,14 @@ int tcp_set_congestion_control(struct sock *sk, const char *name, bool load, boo
if (!load) if (!load)
ca = tcp_ca_find(name); ca = tcp_ca_find(name);
else else
ca = __tcp_ca_find_autoload(name); ca = tcp_ca_find_autoload(sock_net(sk), name);
/* No change asking for existing value */ /* No change asking for existing value */
if (ca == icsk->icsk_ca_ops) { if (ca == icsk->icsk_ca_ops) {
icsk->icsk_ca_setsockopt = 1; icsk->icsk_ca_setsockopt = 1;
goto out; goto out;
} }
if (!ca) { if (!ca) {
err = -ENOENT; err = -ENOENT;
} else if (!load) { } else if (!load) {
......
...@@ -2430,6 +2430,8 @@ static void __net_exit tcp_sk_exit(struct net *net) ...@@ -2430,6 +2430,8 @@ static void __net_exit tcp_sk_exit(struct net *net)
{ {
int cpu; int cpu;
module_put(net->ipv4.tcp_congestion_control->owner);
for_each_possible_cpu(cpu) for_each_possible_cpu(cpu)
inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu)); inet_ctl_sock_destroy(*per_cpu_ptr(net->ipv4.tcp_sk, cpu));
free_percpu(net->ipv4.tcp_sk); free_percpu(net->ipv4.tcp_sk);
...@@ -2522,6 +2524,13 @@ static int __net_init tcp_sk_init(struct net *net) ...@@ -2522,6 +2524,13 @@ static int __net_init tcp_sk_init(struct net *net)
net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60; net->ipv4.sysctl_tcp_fastopen_blackhole_timeout = 60 * 60;
atomic_set(&net->ipv4.tfo_active_disable_times, 0); atomic_set(&net->ipv4.tfo_active_disable_times, 0);
/* Reno is always built in */
if (!net_eq(net, &init_net) &&
try_module_get(init_net.ipv4.tcp_congestion_control->owner))
net->ipv4.tcp_congestion_control = init_net.ipv4.tcp_congestion_control;
else
net->ipv4.tcp_congestion_control = &tcp_reno;
return 0; return 0;
fail: fail:
tcp_sk_exit(net); tcp_sk_exit(net);
......
...@@ -2378,6 +2378,7 @@ static int ip6_dst_gc(struct dst_ops *ops) ...@@ -2378,6 +2378,7 @@ static int ip6_dst_gc(struct dst_ops *ops)
static int ip6_convert_metrics(struct mx6_config *mxc, static int ip6_convert_metrics(struct mx6_config *mxc,
const struct fib6_config *cfg) const struct fib6_config *cfg)
{ {
struct net *net = cfg->fc_nlinfo.nl_net;
bool ecn_ca = false; bool ecn_ca = false;
struct nlattr *nla; struct nlattr *nla;
int remaining; int remaining;
...@@ -2403,7 +2404,7 @@ static int ip6_convert_metrics(struct mx6_config *mxc, ...@@ -2403,7 +2404,7 @@ static int ip6_convert_metrics(struct mx6_config *mxc,
char tmp[TCP_CA_NAME_MAX]; char tmp[TCP_CA_NAME_MAX];
nla_strlcpy(tmp, nla, sizeof(tmp)); nla_strlcpy(tmp, nla, sizeof(tmp));
val = tcp_ca_get_key_by_name(tmp, &ecn_ca); val = tcp_ca_get_key_by_name(net, tmp, &ecn_ca);
if (val == TCP_CA_UNSPEC) if (val == TCP_CA_UNSPEC)
goto err; goto err;
} else { } else {
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment