Commit 90284c2b authored by David S. Miller

Merge branch 'ecn_via_routing_table'

Florian Westphal says:

====================
net: allow setting ecn via routing table

Here is v4 of the patchset; it's exactly the same as v3 except in patch 3/3
where I added the missing 'const' qualifier to a function argument that
Eric spotted during review.

I preserved Eric's Acks so that he doesn't have to resend them.

v3 cover letter:

When using syn cookies, do not simply trust that the echoed timestamp
was not modified; this makes sure that ecn is not turned on magically when
it is disabled on the host.

The first two patches, which were not part of earlier series, prepare
the cookie code for the ecn route metrics change by allowing us to
more easily use the existing dst object for ecn validation.

The 3rd patch adds the ecn route metric feature support.
It is almost the same as in v2, except that we'll now also test the
dst_features when decoding a syn cookie timestamp that indicates ecn support.

These three patches then allow turning on explicit congestion notification
based on the destination network.

For example, assuming the default tcp_ecn sysctl '2', the following will
enable ecn (tcp_ecn=1 behaviour, i.e. request ecn to be enabled for a
tcp connection) for all connections to hosts inside the 192.168.2/24 network:

ip route change 192.168.2.0/24 dev eth0 features ecn

Having a more fine-grained per-route setting can be beneficial for
various reasons, for example 1) within data centers, or 2) local ISPs
may deploy ECN support for their own video/streaming services [1], etc.

Joint work with Daniel Borkmann, feature suggested by Hannes Frederic Sowa.

The patch to enable this in iproute2 will be posted shortly; it is currently
also available here:
http://git.breakpoint.cc/cgit/fw/iproute2.git/commit/?h=iproute_features&id=8843d2d8973fb81c78a7efe6d42e3a17d739003e

[1] http://www.ietf.org/proceedings/89/slides/slides-89-tsvarea-1.pdf, p.15
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 436f7c20 f7b3bec6
...@@ -490,17 +490,16 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th, ...@@ -490,17 +490,16 @@ u32 __cookie_v4_init_sequence(const struct iphdr *iph, const struct tcphdr *th,
u16 *mssp); u16 *mssp);
__u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb, __u32 cookie_v4_init_sequence(struct sock *sk, const struct sk_buff *skb,
__u16 *mss); __u16 *mss);
#endif
__u32 cookie_init_timestamp(struct request_sock *req); __u32 cookie_init_timestamp(struct request_sock *req);
bool cookie_check_timestamp(struct tcp_options_received *opt, struct net *net, bool cookie_timestamp_decode(struct tcp_options_received *opt);
bool *ecn_ok); bool cookie_ecn_ok(const struct tcp_options_received *opt,
const struct net *net, const struct dst_entry *dst);
/* From net/ipv6/syncookies.c */ /* From net/ipv6/syncookies.c */
int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th, int __cookie_v6_check(const struct ipv6hdr *iph, const struct tcphdr *th,
u32 cookie); u32 cookie);
struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb); struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb);
#ifdef CONFIG_SYN_COOKIES
u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph, u32 __cookie_v6_init_sequence(const struct ipv6hdr *iph,
const struct tcphdr *th, u16 *mssp); const struct tcphdr *th, u16 *mssp);
__u32 cookie_v6_init_sequence(struct sock *sk, const struct sk_buff *skb, __u32 cookie_v6_init_sequence(struct sock *sk, const struct sk_buff *skb,
......
...@@ -19,10 +19,6 @@ ...@@ -19,10 +19,6 @@
#include <net/tcp.h> #include <net/tcp.h>
#include <net/route.h> #include <net/route.h>
/* Timestamps: lowest bits store TCP options */
#define TSBITS 6
#define TSMASK (((__u32)1 << TSBITS) - 1)
extern int sysctl_tcp_syncookies; extern int sysctl_tcp_syncookies;
static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly; static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
...@@ -30,6 +26,30 @@ static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly; ...@@ -30,6 +26,30 @@ static u32 syncookie_secret[2][16-4+SHA_DIGEST_WORDS] __read_mostly;
#define COOKIEBITS 24 /* Upper bits store count */ #define COOKIEBITS 24 /* Upper bits store count */
#define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1) #define COOKIEMASK (((__u32)1 << COOKIEBITS) - 1)
/* TCP Timestamp: 6 lowest bits of timestamp sent in the cookie SYN-ACK
* stores TCP options:
*
* MSB LSB
* | 31 ... 6 | 5 | 4 | 3 2 1 0 |
* | Timestamp | ECN | SACK | WScale |
*
* When we receive a valid cookie-ACK, we look at the echoed tsval (if
* any) to figure out which TCP options we should use for the rebuilt
* connection.
*
* A WScale setting of '0xf' (which is an invalid scaling value)
* means that original syn did not include the TCP window scaling option.
*/
#define TS_OPT_WSCALE_MASK 0xf
#define TS_OPT_SACK BIT(4)
#define TS_OPT_ECN BIT(5)
/* There is no TS_OPT_TIMESTAMP:
* if ACK contains timestamp option, we already know it was
* requested/supported by the syn/synack exchange.
*/
#define TSBITS 6
#define TSMASK (((__u32)1 << TSBITS) - 1)
static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS], static DEFINE_PER_CPU(__u32 [16 + 5 + SHA_WORKSPACE_WORDS],
ipv4_cookie_scratch); ipv4_cookie_scratch);
...@@ -67,9 +87,11 @@ __u32 cookie_init_timestamp(struct request_sock *req) ...@@ -67,9 +87,11 @@ __u32 cookie_init_timestamp(struct request_sock *req)
ireq = inet_rsk(req); ireq = inet_rsk(req);
options = ireq->wscale_ok ? ireq->snd_wscale : 0xf; options = ireq->wscale_ok ? ireq->snd_wscale : TS_OPT_WSCALE_MASK;
options |= ireq->sack_ok << 4; if (ireq->sack_ok)
options |= ireq->ecn_ok << 5; options |= TS_OPT_SACK;
if (ireq->ecn_ok)
options |= TS_OPT_ECN;
ts = ts_now & ~TSMASK; ts = ts_now & ~TSMASK;
ts |= options; ts |= options;
...@@ -219,16 +241,13 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb, ...@@ -219,16 +241,13 @@ static inline struct sock *get_cookie_sock(struct sock *sk, struct sk_buff *skb,
* additional tcp options in the timestamp. * additional tcp options in the timestamp.
* This extracts these options from the timestamp echo. * This extracts these options from the timestamp echo.
* *
* The lowest 4 bits store snd_wscale. * return false if we decode a tcp option that is disabled
* next 2 bits indicate SACK and ECN support. * on the host.
*
* return false if we decode an option that should not be.
*/ */
bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, bool cookie_timestamp_decode(struct tcp_options_received *tcp_opt)
struct net *net, bool *ecn_ok)
{ {
/* echoed timestamp, lowest bits contain options */ /* echoed timestamp, lowest bits contain options */
u32 options = tcp_opt->rcv_tsecr & TSMASK; u32 options = tcp_opt->rcv_tsecr;
if (!tcp_opt->saw_tstamp) { if (!tcp_opt->saw_tstamp) {
tcp_clear_options(tcp_opt); tcp_clear_options(tcp_opt);
...@@ -238,22 +257,35 @@ bool cookie_check_timestamp(struct tcp_options_received *tcp_opt, ...@@ -238,22 +257,35 @@ bool cookie_check_timestamp(struct tcp_options_received *tcp_opt,
if (!sysctl_tcp_timestamps) if (!sysctl_tcp_timestamps)
return false; return false;
tcp_opt->sack_ok = (options & (1 << 4)) ? TCP_SACK_SEEN : 0; tcp_opt->sack_ok = (options & TS_OPT_SACK) ? TCP_SACK_SEEN : 0;
*ecn_ok = (options >> 5) & 1;
if (*ecn_ok && !net->ipv4.sysctl_tcp_ecn)
return false;
if (tcp_opt->sack_ok && !sysctl_tcp_sack) if (tcp_opt->sack_ok && !sysctl_tcp_sack)
return false; return false;
if ((options & 0xf) == 0xf) if ((options & TS_OPT_WSCALE_MASK) == TS_OPT_WSCALE_MASK)
return true; /* no window scaling */ return true; /* no window scaling */
tcp_opt->wscale_ok = 1; tcp_opt->wscale_ok = 1;
tcp_opt->snd_wscale = options & 0xf; tcp_opt->snd_wscale = options & TS_OPT_WSCALE_MASK;
return sysctl_tcp_window_scaling != 0; return sysctl_tcp_window_scaling != 0;
} }
EXPORT_SYMBOL(cookie_check_timestamp); EXPORT_SYMBOL(cookie_timestamp_decode);
/* Decide whether ECN may be enabled for a connection rebuilt from a
 * syn cookie.
 *
 * @tcp_opt: parsed TCP options of the cookie ACK; only rcv_tsecr (the
 *           echoed timestamp) is read here.
 * @net:     network namespace whose ipv4.sysctl_tcp_ecn setting is consulted.
 * @dst:     route entry checked for the RTAX_FEATURE_ECN feature.
 *
 * Returns false when the TS_OPT_ECN bit is not set in the echoed
 * timestamp. Otherwise returns true if sysctl_tcp_ecn is non-zero, and
 * falls back to the result of dst_feature(dst, RTAX_FEATURE_ECN) when
 * the sysctl is zero.
 */
bool cookie_ecn_ok(const struct tcp_options_received *tcp_opt,
const struct net *net, const struct dst_entry *dst)
{
/* ECN flag as stored in the low option bits of the echoed timestamp */
bool ecn_ok = tcp_opt->rcv_tsecr & TS_OPT_ECN;
if (!ecn_ok)
return false;
/* any non-zero tcp_ecn sysctl value accepts the echoed ECN bit */
if (net->ipv4.sysctl_tcp_ecn)
return true;
/* sysctl is zero: presumably the per-route "features ecn" setting can
 * still allow it -- confirm against the dst_feature() definition
 */
return dst_feature(dst, RTAX_FEATURE_ECN);
}
EXPORT_SYMBOL(cookie_ecn_ok);
struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
{ {
...@@ -269,7 +301,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) ...@@ -269,7 +301,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
int mss; int mss;
struct rtable *rt; struct rtable *rt;
__u8 rcv_wscale; __u8 rcv_wscale;
bool ecn_ok = false;
struct flowi4 fl4; struct flowi4 fl4;
if (!sysctl_tcp_syncookies || !th->ack || th->rst) if (!sysctl_tcp_syncookies || !th->ack || th->rst)
...@@ -290,7 +321,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) ...@@ -290,7 +321,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
memset(&tcp_opt, 0, sizeof(tcp_opt)); memset(&tcp_opt, 0, sizeof(tcp_opt));
tcp_parse_options(skb, &tcp_opt, 0, NULL); tcp_parse_options(skb, &tcp_opt, 0, NULL);
if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok)) if (!cookie_timestamp_decode(&tcp_opt))
goto out; goto out;
ret = NULL; ret = NULL;
...@@ -308,7 +339,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) ...@@ -308,7 +339,6 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
ireq->ir_loc_addr = ip_hdr(skb)->daddr; ireq->ir_loc_addr = ip_hdr(skb)->daddr;
ireq->ir_rmt_addr = ip_hdr(skb)->saddr; ireq->ir_rmt_addr = ip_hdr(skb)->saddr;
ireq->ir_mark = inet_request_mark(sk, skb); ireq->ir_mark = inet_request_mark(sk, skb);
ireq->ecn_ok = ecn_ok;
ireq->snd_wscale = tcp_opt.snd_wscale; ireq->snd_wscale = tcp_opt.snd_wscale;
ireq->sack_ok = tcp_opt.sack_ok; ireq->sack_ok = tcp_opt.sack_ok;
ireq->wscale_ok = tcp_opt.wscale_ok; ireq->wscale_ok = tcp_opt.wscale_ok;
...@@ -357,6 +387,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb) ...@@ -357,6 +387,7 @@ struct sock *cookie_v4_check(struct sock *sk, struct sk_buff *skb)
dst_metric(&rt->dst, RTAX_INITRWND)); dst_metric(&rt->dst, RTAX_INITRWND));
ireq->rcv_wscale = rcv_wscale; ireq->rcv_wscale = rcv_wscale;
ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), &rt->dst);
ret = get_cookie_sock(sk, skb, req, &rt->dst); ret = get_cookie_sock(sk, skb, req, &rt->dst);
/* ip_queue_xmit() depends on our flow being setup /* ip_queue_xmit() depends on our flow being setup
......
...@@ -5876,20 +5876,22 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family) ...@@ -5876,20 +5876,22 @@ static inline void pr_drop_req(struct request_sock *req, __u16 port, int family)
*/ */
static void tcp_ecn_create_request(struct request_sock *req, static void tcp_ecn_create_request(struct request_sock *req,
const struct sk_buff *skb, const struct sk_buff *skb,
const struct sock *listen_sk) const struct sock *listen_sk,
const struct dst_entry *dst)
{ {
const struct tcphdr *th = tcp_hdr(skb); const struct tcphdr *th = tcp_hdr(skb);
const struct net *net = sock_net(listen_sk); const struct net *net = sock_net(listen_sk);
bool th_ecn = th->ece && th->cwr; bool th_ecn = th->ece && th->cwr;
bool ect, need_ecn; bool ect, need_ecn, ecn_ok;
if (!th_ecn) if (!th_ecn)
return; return;
ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield); ect = !INET_ECN_is_not_ect(TCP_SKB_CB(skb)->ip_dsfield);
need_ecn = tcp_ca_needs_ecn(listen_sk); need_ecn = tcp_ca_needs_ecn(listen_sk);
ecn_ok = net->ipv4.sysctl_tcp_ecn || dst_feature(dst, RTAX_FEATURE_ECN);
if (!ect && !need_ecn && net->ipv4.sysctl_tcp_ecn) if (!ect && !need_ecn && ecn_ok)
inet_rsk(req)->ecn_ok = 1; inet_rsk(req)->ecn_ok = 1;
else if (ect && need_ecn) else if (ect && need_ecn)
inet_rsk(req)->ecn_ok = 1; inet_rsk(req)->ecn_ok = 1;
...@@ -5954,13 +5956,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, ...@@ -5954,13 +5956,7 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
if (security_inet_conn_request(sk, skb, req)) if (security_inet_conn_request(sk, skb, req))
goto drop_and_free; goto drop_and_free;
if (!want_cookie || tmp_opt.tstamp_ok) if (!want_cookie && !isn) {
tcp_ecn_create_request(req, skb, sk);
if (want_cookie) {
isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
req->cookie_ts = tmp_opt.tstamp_ok;
} else if (!isn) {
/* VJ's idea. We save last timestamp seen /* VJ's idea. We save last timestamp seen
* from the destination in peer table, when entering * from the destination in peer table, when entering
* state TIME-WAIT, and check against it before * state TIME-WAIT, and check against it before
...@@ -6008,6 +6004,15 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops, ...@@ -6008,6 +6004,15 @@ int tcp_conn_request(struct request_sock_ops *rsk_ops,
goto drop_and_free; goto drop_and_free;
} }
tcp_ecn_create_request(req, skb, sk, dst);
if (want_cookie) {
isn = cookie_init_sequence(af_ops, sk, skb, &req->mss);
req->cookie_ts = tmp_opt.tstamp_ok;
if (!tmp_opt.tstamp_ok)
inet_rsk(req)->ecn_ok = 0;
}
tcp_rsk(req)->snt_isn = isn; tcp_rsk(req)->snt_isn = isn;
tcp_openreq_init_rwin(req, sk, dst); tcp_openreq_init_rwin(req, sk, dst);
fastopen = !want_cookie && fastopen = !want_cookie &&
......
...@@ -333,10 +333,19 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb) ...@@ -333,10 +333,19 @@ static void tcp_ecn_send_synack(struct sock *sk, struct sk_buff *skb)
static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb)
{ {
struct tcp_sock *tp = tcp_sk(sk); struct tcp_sock *tp = tcp_sk(sk);
bool use_ecn = sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
tcp_ca_needs_ecn(sk);
if (!use_ecn) {
const struct dst_entry *dst = __sk_dst_get(sk);
if (dst && dst_feature(dst, RTAX_FEATURE_ECN))
use_ecn = true;
}
tp->ecn_flags = 0; tp->ecn_flags = 0;
if (sock_net(sk)->ipv4.sysctl_tcp_ecn == 1 ||
tcp_ca_needs_ecn(sk)) { if (use_ecn) {
TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR; TCP_SKB_CB(skb)->tcp_flags |= TCPHDR_ECE | TCPHDR_CWR;
tp->ecn_flags = TCP_ECN_OK; tp->ecn_flags = TCP_ECN_OK;
if (tcp_ca_needs_ecn(sk)) if (tcp_ca_needs_ecn(sk))
......
...@@ -166,7 +166,6 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) ...@@ -166,7 +166,6 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
int mss; int mss;
struct dst_entry *dst; struct dst_entry *dst;
__u8 rcv_wscale; __u8 rcv_wscale;
bool ecn_ok = false;
if (!sysctl_tcp_syncookies || !th->ack || th->rst) if (!sysctl_tcp_syncookies || !th->ack || th->rst)
goto out; goto out;
...@@ -186,7 +185,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) ...@@ -186,7 +185,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
memset(&tcp_opt, 0, sizeof(tcp_opt)); memset(&tcp_opt, 0, sizeof(tcp_opt));
tcp_parse_options(skb, &tcp_opt, 0, NULL); tcp_parse_options(skb, &tcp_opt, 0, NULL);
if (!cookie_check_timestamp(&tcp_opt, sock_net(sk), &ecn_ok)) if (!cookie_timestamp_decode(&tcp_opt))
goto out; goto out;
ret = NULL; ret = NULL;
...@@ -223,7 +222,6 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) ...@@ -223,7 +222,6 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
req->expires = 0UL; req->expires = 0UL;
req->num_retrans = 0; req->num_retrans = 0;
ireq->ecn_ok = ecn_ok;
ireq->snd_wscale = tcp_opt.snd_wscale; ireq->snd_wscale = tcp_opt.snd_wscale;
ireq->sack_ok = tcp_opt.sack_ok; ireq->sack_ok = tcp_opt.sack_ok;
ireq->wscale_ok = tcp_opt.wscale_ok; ireq->wscale_ok = tcp_opt.wscale_ok;
...@@ -264,6 +262,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb) ...@@ -264,6 +262,7 @@ struct sock *cookie_v6_check(struct sock *sk, struct sk_buff *skb)
dst_metric(dst, RTAX_INITRWND)); dst_metric(dst, RTAX_INITRWND));
ireq->rcv_wscale = rcv_wscale; ireq->rcv_wscale = rcv_wscale;
ireq->ecn_ok = cookie_ecn_ok(&tcp_opt, sock_net(sk), dst);
ret = get_cookie_sock(sk, skb, req, dst); ret = get_cookie_sock(sk, skb, req, dst);
out: out:
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment