Commit eb92f76e authored by David S. Miller

Merge branch 'tcp-fastopen-new-API'

Wei Wang says:

====================
net/tcp-fastopen: Add new userspace API support

This patch series adds support for a new userspace API for TCP fastopen
sockets.
In the current code, the user has to call sendto()/sendmsg() with the
special flag MSG_FASTOPEN on a TCP fastopen socket. This API is quite
different from the normal TCP socket API and can be cumbersome for
applications to use.
This series introduces a new way of using TCP fastopen sockets that is
much closer to the normal TCP socket API: a new sockopt,
TCP_FASTOPEN_CONNECT. More details are described in the third patch.
(The first 2 patches are preparations for the third patch.)
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents a9c54ad2 19f6d3f3
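
For orientation, here is a minimal userspace sketch of the flow this series
enables. It is not part of the patch set: the helper name is made up, the
client bit of the net.ipv4.tcp_fastopen sysctl (0x1) is assumed to be set,
and error handling is abbreviated.

	/* Sketch of the new TCP_FASTOPEN_CONNECT flow (illustration only). */
	#include <netinet/in.h>
	#include <netinet/tcp.h>
	#include <sys/socket.h>
	#include <unistd.h>

	#ifndef TCP_FASTOPEN_CONNECT
	#define TCP_FASTOPEN_CONNECT 30	/* sockopt added by this series */
	#endif

	static int fastopen_client(const struct sockaddr_in *addr)
	{
		int one = 1;
		int fd = socket(AF_INET, SOCK_STREAM, 0);

		if (fd < 0)
			return -1;

		/* Ask the kernel to defer the SYN until the first write(). */
		if (setsockopt(fd, IPPROTO_TCP, TCP_FASTOPEN_CONNECT,
			       &one, sizeof(one)) < 0)
			goto fail;

		/* With a cached cookie this returns 0 without sending a SYN;
		 * without one it behaves like a regular connect() whose SYN
		 * also requests a cookie.
		 */
		if (connect(fd, (const struct sockaddr *)addr,
			    sizeof(*addr)) < 0)
			goto fail;

		/* The first write() makes the kernel send the SYN, carrying
		 * the data when a valid cookie was cached (SYN+data).
		 */
		if (write(fd, "hello", 5) < 0)
			goto fail;

		return fd;
	fail:
		close(fd);
		return -1;
	}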
@@ -222,7 +222,8 @@ struct tcp_sock {
 	u32	chrono_stat[3];	/* Time in jiffies for chrono_stat stats */
 	u8	chrono_type:2,	/* current chronograph type */
 		rate_app_limited:1,  /* rate_{delivered,interval_us} limited? */
-		unused:5;
+		fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */
+		unused:4;
 	u8	nonagle     : 4,/* Disable Nagle algorithm?             */
 		thin_lto    : 1,/* Use linear timeouts for thin streams */
 		unused1     : 1,
...
@@ -206,7 +206,11 @@ struct inet_sock {
 				transparent:1,
 				mc_all:1,
 				nodefrag:1;
-	__u8			bind_address_no_port:1;
+	__u8			bind_address_no_port:1,
+				defer_connect:1; /* Indicates that fastopen_connect is set
+						  * and cookie exists so we defer connect
+						  * until first data frame is written
+						  */
 	__u8			rcv_tos;
 	__u8			convert_csum;
 	int			uc_index;
...
@@ -1493,6 +1493,9 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
 			      struct tcp_fastopen_cookie *foc,
 			      struct dst_entry *dst);
 void tcp_fastopen_init_key_once(bool publish);
+bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
+			       struct tcp_fastopen_cookie *cookie);
+bool tcp_fastopen_defer_connect(struct sock *sk, int *err);
 #define TCP_FASTOPEN_KEY_LENGTH 16

 /* Fastopen key context */
...
@@ -116,6 +116,7 @@ enum {
 #define TCP_SAVE_SYN		27	/* Record SYN headers for new connections */
 #define TCP_SAVED_SYN		28	/* Get SYN headers recorded for connection */
 #define TCP_REPAIR_WINDOW	29	/* Get/set window parameters */
+#define TCP_FASTOPEN_CONNECT	30	/* Attempt FastOpen with connect */

 struct tcp_repair_opt {
 	__u32	opt_code;
...
@@ -576,13 +576,24 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 	int err;
 	long timeo;

-	if (addr_len < sizeof(uaddr->sa_family))
-		return -EINVAL;
+	/*
+	 * uaddr can be NULL and addr_len can be 0 if:
+	 * sk is a TCP fastopen active socket and
+	 * TCP_FASTOPEN_CONNECT sockopt is set and
+	 * we already have a valid cookie for this socket.
+	 * In this case, user can call write() after connect().
+	 * write() will invoke tcp_sendmsg_fastopen() which calls
+	 * __inet_stream_connect().
+	 */
+	if (uaddr) {
+		if (addr_len < sizeof(uaddr->sa_family))
+			return -EINVAL;

-	if (uaddr->sa_family == AF_UNSPEC) {
-		err = sk->sk_prot->disconnect(sk, flags);
-		sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
-		goto out;
+		if (uaddr->sa_family == AF_UNSPEC) {
+			err = sk->sk_prot->disconnect(sk, flags);
+			sock->state = err ? SS_DISCONNECTING : SS_UNCONNECTED;
+			goto out;
+		}
 	}

 	switch (sock->state) {
@@ -593,7 +604,10 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,
 		err = -EISCONN;
 		goto out;
 	case SS_CONNECTING:
-		err = -EALREADY;
+		if (inet_sk(sk)->defer_connect)
+			err = -EINPROGRESS;
+		else
+			err = -EALREADY;
 		/* Fall out of switch with err, set for this state */
 		break;
 	case SS_UNCONNECTED:
@@ -607,6 +621,9 @@ int __inet_stream_connect(struct socket *sock, struct sockaddr *uaddr,

 		sock->state = SS_CONNECTING;

+		if (!err && inet_sk(sk)->defer_connect)
+			goto out;
+
 		/* Just entered SS_CONNECTING state; the only
 		 * difference is that return value in non-blocking
 		 * case is EINPROGRESS, rather than EALREADY.
...
@@ -533,6 +533,12 @@ unsigned int tcp_poll(struct file *file, struct socket *sock, poll_table *wait)

 		if (tp->urg_data & TCP_URG_VALID)
 			mask |= POLLPRI;
+	} else if (sk->sk_state == TCP_SYN_SENT && inet_sk(sk)->defer_connect) {
+		/* Active TCP fastopen socket with defer_connect
+		 * Return POLLOUT so application can call write()
+		 * in order for kernel to generate SYN+data
+		 */
+		mask |= POLLOUT | POLLWRNORM;
 	}
 	/* This barrier is coupled with smp_wmb() in tcp_reset() */
 	smp_rmb();
@@ -1071,6 +1077,7 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
 				int *copied, size_t size)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
+	struct inet_sock *inet = inet_sk(sk);
 	int err, flags;

 	if (!(sysctl_tcp_fastopen & TFO_CLIENT_ENABLE))
@@ -1085,9 +1092,19 @@ static int tcp_sendmsg_fastopen(struct sock *sk, struct msghdr *msg,
 	tp->fastopen_req->data = msg;
 	tp->fastopen_req->size = size;

+	if (inet->defer_connect) {
+		err = tcp_connect(sk);
+		/* Same failure procedure as in tcp_v4/6_connect */
+		if (err) {
+			tcp_set_state(sk, TCP_CLOSE);
+			inet->inet_dport = 0;
+			sk->sk_route_caps = 0;
+		}
+	}
 	flags = (msg->msg_flags & MSG_DONTWAIT) ? O_NONBLOCK : 0;
 	err = __inet_stream_connect(sk->sk_socket, msg->msg_name,
 				    msg->msg_namelen, flags);
+	inet->defer_connect = 0;
 	*copied = tp->fastopen_req->copied;
 	tcp_free_fastopen_req(tp);
 	return err;
@@ -1107,7 +1124,7 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
 	lock_sock(sk);

 	flags = msg->msg_flags;
-	if (flags & MSG_FASTOPEN) {
+	if (unlikely(flags & MSG_FASTOPEN || inet_sk(sk)->defer_connect)) {
 		err = tcp_sendmsg_fastopen(sk, msg, &copied_syn, size);
 		if (err == -EINPROGRESS && copied_syn > 0)
 			goto out;
@@ -2656,6 +2673,18 @@ static int do_tcp_setsockopt(struct sock *sk, int level,
 			err = -EINVAL;
 		}
 		break;
+	case TCP_FASTOPEN_CONNECT:
+		if (val > 1 || val < 0) {
+			err = -EINVAL;
+		} else if (sysctl_tcp_fastopen & TFO_CLIENT_ENABLE) {
+			if (sk->sk_state == TCP_CLOSE)
+				tp->fastopen_connect = val;
+			else
+				err = -EINVAL;
+		} else {
+			err = -EOPNOTSUPP;
+		}
+		break;
 	case TCP_TIMESTAMP:
 		if (!tp->repair)
 			err = -EPERM;
@@ -3016,6 +3045,10 @@ static int do_tcp_getsockopt(struct sock *sk, int level,
 		val = icsk->icsk_accept_queue.fastopenq.max_qlen;
 		break;

+	case TCP_FASTOPEN_CONNECT:
+		val = tp->fastopen_connect;
+		break;
+
 	case TCP_TIMESTAMP:
 		val = tcp_time_stamp + tp->tsoffset;
 		break;
...
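
Note from the hunks above: the new option only takes 0 or 1, is only
accepted while the socket is in TCP_CLOSE, and setsockopt() returns
EOPNOTSUPP unless the client bit of net.ipv4.tcp_fastopen (TFO_CLIENT_ENABLE,
0x1) is set. The tcp_poll() change also makes the deferred state usable on
non-blocking sockets. A rough sketch, again illustrative rather than from
the patch set, assuming fd is non-blocking with TCP_FASTOPEN_CONNECT already
enabled and connect() already called:

	/* Non-blocking variant: with defer_connect set, the socket reports
	 * POLLOUT even in TCP_SYN_SENT, so poll() returns right away and
	 * the subsequent write() triggers the actual SYN (+data).
	 */
	#include <poll.h>
	#include <sys/types.h>
	#include <unistd.h>

	static ssize_t fastopen_write_when_ready(int fd, const void *buf,
						 size_t len)
	{
		struct pollfd pfd = { .fd = fd, .events = POLLOUT };

		if (poll(&pfd, 1, -1) < 0)
			return -1;

		return write(fd, buf, len);
	}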
@@ -325,3 +325,57 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
 	*foc = valid_foc;
 	return NULL;
 }
+
+bool tcp_fastopen_cookie_check(struct sock *sk, u16 *mss,
+			       struct tcp_fastopen_cookie *cookie)
+{
+	unsigned long last_syn_loss = 0;
+	int syn_loss = 0;
+
+	tcp_fastopen_cache_get(sk, mss, cookie, &syn_loss, &last_syn_loss);
+	/* Recurring FO SYN losses: no cookie or data in SYN */
+	if (syn_loss > 1 &&
+	    time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
+		cookie->len = -1;
+		return false;
+	}
+	if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE) {
+		cookie->len = -1;
+		return true;
+	}
+	return cookie->len > 0;
+}
+
+/* This function checks if we want to defer sending SYN until the first
+ * write().  We defer under the following conditions:
+ * 1. fastopen_connect sockopt is set
+ * 2. we have a valid cookie
+ * Return value: return true if we want to defer until application writes data
+ *               return false if we want to send out SYN immediately
+ */
+bool tcp_fastopen_defer_connect(struct sock *sk, int *err)
+{
+	struct tcp_fastopen_cookie cookie = { .len = 0 };
+	struct tcp_sock *tp = tcp_sk(sk);
+	u16 mss;
+
+	if (tp->fastopen_connect && !tp->fastopen_req) {
+		if (tcp_fastopen_cookie_check(sk, &mss, &cookie)) {
+			inet_sk(sk)->defer_connect = 1;
+			return true;
+		}
+
+		/* Alloc fastopen_req in order for FO option to be included
+		 * in SYN
+		 */
+		tp->fastopen_req = kzalloc(sizeof(*tp->fastopen_req),
+					   sk->sk_allocation);
+		if (tp->fastopen_req)
+			tp->fastopen_req->cookie = cookie;
+		else
+			*err = -ENOBUFS;
+	}
+	return false;
+}
+EXPORT_SYMBOL(tcp_fastopen_defer_connect);
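
As the comment above tcp_fastopen_defer_connect() spells out, the SYN is only
deferred once a valid cookie is cached; the first connection to a destination
falls back to a regular handshake whose SYN merely requests a cookie. Reusing
the hypothetical fastopen_client() sketch from earlier:

	/* Illustration only: two connections to the same server.  The first
	 * bootstraps the cookie over a normal handshake; the second can
	 * defer connect() and send SYN+data.
	 */
	static void demo(const struct sockaddr_in *addr)
	{
		int fd;

		fd = fastopen_client(addr);	/* plain SYN; cookie requested */
		if (fd >= 0)
			close(fd);

		fd = fastopen_client(addr);	/* cookie cached: SYN carries data */
		if (fd >= 0)
			close(fd);
	}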
@@ -232,6 +232,7 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)
 	/* OK, now commit destination to socket.  */
 	sk->sk_gso_type = SKB_GSO_TCPV4;
 	sk_setup_caps(sk, &rt->dst);
+	rt = NULL;

 	if (!tp->write_seq && likely(!tp->repair))
 		tp->write_seq = secure_tcp_sequence_number(inet->inet_saddr,
@@ -242,9 +243,13 @@ int tcp_v4_connect(struct sock *sk, struct sockaddr *uaddr, int addr_len)

 	inet->inet_id = tp->write_seq ^ jiffies;

+	if (tcp_fastopen_defer_connect(sk, &err))
+		return err;
+	if (err)
+		goto failure;
+
 	err = tcp_connect(sk);

-	rt = NULL;
 	if (err)
 		goto failure;
...
@@ -3267,23 +3267,11 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
 {
 	struct tcp_sock *tp = tcp_sk(sk);
 	struct tcp_fastopen_request *fo = tp->fastopen_req;
-	int syn_loss = 0, space, err = 0;
-	unsigned long last_syn_loss = 0;
+	int space, err = 0;
 	struct sk_buff *syn_data;

 	tp->rx_opt.mss_clamp = tp->advmss;  /* If MSS is not cached */
-	tcp_fastopen_cache_get(sk, &tp->rx_opt.mss_clamp, &fo->cookie,
-			       &syn_loss, &last_syn_loss);
-
-	/* Recurring FO SYN losses: revert to regular handshake temporarily */
-	if (syn_loss > 1 &&
-	    time_before(jiffies, last_syn_loss + (60*HZ << syn_loss))) {
-		fo->cookie.len = -1;
-		goto fallback;
-	}
-
-	if (sysctl_tcp_fastopen & TFO_CLIENT_NO_COOKIE)
-		fo->cookie.len = -1;
-	else if (fo->cookie.len <= 0)
+	if (!tcp_fastopen_cookie_check(sk, &tp->rx_opt.mss_clamp, &fo->cookie))
 		goto fallback;

 	/* MSS for SYN-data is based on cached MSS and bounded by PMTU and
...
@@ -287,6 +287,11 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,
 							     inet->inet_dport,
 							     &tp->tsoffset);

+	if (tcp_fastopen_defer_connect(sk, &err))
+		return err;
+	if (err)
+		goto late_failure;
+
 	err = tcp_connect(sk);
 	if (err)
 		goto late_failure;
@@ -295,7 +300,6 @@ static int tcp_v6_connect(struct sock *sk, struct sockaddr *uaddr,

 late_failure:
 	tcp_set_state(sk, TCP_CLOSE);
-	__sk_dst_reset(sk);
 failure:
 	inet->inet_dport = 0;
 	sk->sk_route_caps = 0;
...