Commit 570d6320 authored by David S. Miller's avatar David S. Miller

Merge branch 'tcp-preempt'

Eric Dumazet says:

====================
net: make TCP preemptible

Most of TCP stack assumed it was running from BH handler.

This is great for most things, as TCP behavior is very sensitive
to scheduling artifacts.

However, the prequeue and backlog processing are problematic,
as they need to be flushed with BH being blocked.

To cope with modern needs, TCP sockets have big sk_rcvbuf values,
in the order of 16 MB, and soon 32 MB.
This means that backlog can hold thousands of packets, and things
like TCP coalescing or collapsing on this amount of packets can
lead to insane latency spikes, since BH are blocked for too long.

It is time to make UDP/TCP stacks preemptible.

Note that fast path still runs from BH handler.

v2: Added "tcp: make tcp_sendmsg() aware of socket backlog"
    to reduce latency problems of large sends.

v3: Fixed a typo in tcp_cdg.c
====================
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents 5e59c83f d41a69f1
......@@ -926,6 +926,17 @@ void sk_stream_kill_queues(struct sock *sk);
void sk_set_memalloc(struct sock *sk);
void sk_clear_memalloc(struct sock *sk);
void __sk_flush_backlog(struct sock *sk);
static inline bool sk_flush_backlog(struct sock *sk)
{
if (unlikely(READ_ONCE(sk->sk_backlog.tail))) {
__sk_flush_backlog(sk);
return true;
}
return false;
}
int sk_wait_data(struct sock *sk, long *timeo, const struct sk_buff *skb);
struct request_sock_ops;
......
......@@ -2019,33 +2019,27 @@ static void __release_sock(struct sock *sk)
__releases(&sk->sk_lock.slock)
__acquires(&sk->sk_lock.slock)
{
struct sk_buff *skb = sk->sk_backlog.head;
struct sk_buff *skb, *next;
do {
while ((skb = sk->sk_backlog.head) != NULL) {
sk->sk_backlog.head = sk->sk_backlog.tail = NULL;
bh_unlock_sock(sk);
do {
struct sk_buff *next = skb->next;
spin_unlock_bh(&sk->sk_lock.slock);
do {
next = skb->next;
prefetch(next);
WARN_ON_ONCE(skb_dst_is_noref(skb));
skb->next = NULL;
sk_backlog_rcv(sk, skb);
/*
* We are in process context here with softirqs
* disabled, use cond_resched_softirq() to preempt.
* This is safe to do because we've taken the backlog
* queue private:
*/
cond_resched_softirq();
cond_resched();
skb = next;
} while (skb != NULL);
bh_lock_sock(sk);
} while ((skb = sk->sk_backlog.head) != NULL);
spin_lock_bh(&sk->sk_lock.slock);
}
/*
* Doing the zeroing here guarantee we can not loop forever
......@@ -2054,6 +2048,13 @@ static void __release_sock(struct sock *sk)
sk->sk_backlog.len = 0;
}
void __sk_flush_backlog(struct sock *sk)
{
spin_lock_bh(&sk->sk_lock.slock);
__release_sock(sk);
spin_unlock_bh(&sk->sk_lock.slock);
}
/**
* sk_wait_data - wait for data to arrive at sk_receive_queue
* @sk: sock to wait on
......
......@@ -359,7 +359,7 @@ static int __dccp_rcv_established(struct sock *sk, struct sk_buff *skb,
goto discard;
}
__DCCP_INC_STATS(DCCP_MIB_INERRS);
DCCP_INC_STATS(DCCP_MIB_INERRS);
discard:
__kfree_skb(skb);
return 0;
......
......@@ -533,8 +533,8 @@ static void dccp_v4_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb)
bh_unlock_sock(ctl_sk);
if (net_xmit_eval(err) == 0) {
__DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
__DCCP_INC_STATS(DCCP_MIB_OUTRSTS);
DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
DCCP_INC_STATS(DCCP_MIB_OUTRSTS);
}
out:
dst_release(dst);
......
......@@ -277,8 +277,8 @@ static void dccp_v6_ctl_send_reset(const struct sock *sk, struct sk_buff *rxskb)
if (!IS_ERR(dst)) {
skb_dst_set(skb, dst);
ip6_xmit(ctl_sk, skb, &fl6, NULL, 0);
__DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
__DCCP_INC_STATS(DCCP_MIB_OUTRSTS);
DCCP_INC_STATS(DCCP_MIB_OUTSEGS);
DCCP_INC_STATS(DCCP_MIB_OUTRSTS);
return;
}
......
......@@ -253,7 +253,7 @@ int dccp_parse_options(struct sock *sk, struct dccp_request_sock *dreq,
return 0;
out_invalid_option:
__DCCP_INC_STATS(DCCP_MIB_INVALIDOPT);
DCCP_INC_STATS(DCCP_MIB_INVALIDOPT);
rc = DCCP_RESET_CODE_OPTION_ERROR;
out_featneg_failed:
DCCP_WARN("DCCP(%p): Option %d (len=%d) error=%u\n", sk, opt, len, rc);
......
......@@ -1136,11 +1136,12 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
/* This should be in poll */
sk_clear_bit(SOCKWQ_ASYNC_NOSPACE, sk);
mss_now = tcp_send_mss(sk, &size_goal, flags);
/* Ok commence sending. */
copied = 0;
restart:
mss_now = tcp_send_mss(sk, &size_goal, flags);
err = -EPIPE;
if (sk->sk_err || (sk->sk_shutdown & SEND_SHUTDOWN))
goto out_err;
......@@ -1166,6 +1167,9 @@ int tcp_sendmsg(struct sock *sk, struct msghdr *msg, size_t size)
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
if (sk_flush_backlog(sk))
goto restart;
skb = sk_stream_alloc_skb(sk,
select_size(sk, sg),
sk->sk_allocation,
......@@ -1449,12 +1453,8 @@ static void tcp_prequeue_process(struct sock *sk)
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPPREQUEUED);
/* RX process wants to run with disabled BHs, though it is not
* necessary */
local_bh_disable();
while ((skb = __skb_dequeue(&tp->ucopy.prequeue)) != NULL)
sk_backlog_rcv(sk, skb);
local_bh_enable();
/* Clear memory counter. */
tp->ucopy.memory = 0;
......@@ -3095,7 +3095,7 @@ void tcp_done(struct sock *sk)
struct request_sock *req = tcp_sk(sk)->fastopen_rsk;
if (sk->sk_state == TCP_SYN_SENT || sk->sk_state == TCP_SYN_RECV)
__TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
TCP_INC_STATS(sock_net(sk), TCP_MIB_ATTEMPTFAILS);
tcp_set_state(sk, TCP_CLOSE);
tcp_clear_xmit_timers(sk);
......
......@@ -155,9 +155,9 @@ static void tcp_cdg_hystart_update(struct sock *sk)
ca->last_ack = now_us;
if (after(now_us, ca->round_start + base_owd)) {
__NET_INC_STATS(sock_net(sk),
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPHYSTARTTRAINDETECT);
__NET_ADD_STATS(sock_net(sk),
NET_ADD_STATS(sock_net(sk),
LINUX_MIB_TCPHYSTARTTRAINCWND,
tp->snd_cwnd);
tp->snd_ssthresh = tp->snd_cwnd;
......@@ -174,9 +174,9 @@ static void tcp_cdg_hystart_update(struct sock *sk)
125U);
if (ca->rtt.min > thresh) {
__NET_INC_STATS(sock_net(sk),
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPHYSTARTDELAYDETECT);
__NET_ADD_STATS(sock_net(sk),
NET_ADD_STATS(sock_net(sk),
LINUX_MIB_TCPHYSTARTDELAYCWND,
tp->snd_cwnd);
tp->snd_ssthresh = tp->snd_cwnd;
......
......@@ -402,9 +402,9 @@ static void hystart_update(struct sock *sk, u32 delay)
ca->last_ack = now;
if ((s32)(now - ca->round_start) > ca->delay_min >> 4) {
ca->found |= HYSTART_ACK_TRAIN;
__NET_INC_STATS(sock_net(sk),
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPHYSTARTTRAINDETECT);
__NET_ADD_STATS(sock_net(sk),
NET_ADD_STATS(sock_net(sk),
LINUX_MIB_TCPHYSTARTTRAINCWND,
tp->snd_cwnd);
tp->snd_ssthresh = tp->snd_cwnd;
......@@ -423,9 +423,9 @@ static void hystart_update(struct sock *sk, u32 delay)
if (ca->curr_rtt > ca->delay_min +
HYSTART_DELAY_THRESH(ca->delay_min >> 3)) {
ca->found |= HYSTART_DELAY;
__NET_INC_STATS(sock_net(sk),
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPHYSTARTDELAYDETECT);
__NET_ADD_STATS(sock_net(sk),
NET_ADD_STATS(sock_net(sk),
LINUX_MIB_TCPHYSTARTDELAYCWND,
tp->snd_cwnd);
tp->snd_ssthresh = tp->snd_cwnd;
......
......@@ -255,9 +255,9 @@ static bool tcp_fastopen_queue_check(struct sock *sk)
spin_lock(&fastopenq->lock);
req1 = fastopenq->rskq_rst_head;
if (!req1 || time_after(req1->rsk_timer.expires, jiffies)) {
spin_unlock(&fastopenq->lock);
__NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPFASTOPENLISTENOVERFLOW);
spin_unlock(&fastopenq->lock);
return false;
}
fastopenq->rskq_rst_head = req1->dl_next;
......@@ -282,7 +282,7 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
struct sock *child;
if (foc->len == 0) /* Client requests a cookie */
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENCOOKIEREQD);
if (!((sysctl_tcp_fastopen & TFO_SERVER_ENABLE) &&
(syn_data || foc->len >= 0) &&
......@@ -311,13 +311,13 @@ struct sock *tcp_try_fastopen(struct sock *sk, struct sk_buff *skb,
child = tcp_fastopen_create_child(sk, skb, dst, req);
if (child) {
foc->len = -1;
__NET_INC_STATS(sock_net(sk),
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPFASTOPENPASSIVE);
return child;
}
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
} else if (foc->len > 0) /* Client presents an invalid cookie */
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPFASTOPENPASSIVEFAIL);
valid_foc.exp = foc->exp;
*foc = valid_foc;
......
This diff is collapsed.
......@@ -692,6 +692,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
offsetof(struct inet_timewait_sock, tw_bound_dev_if));
arg.tos = ip_hdr(skb)->tos;
preempt_disable();
ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
......@@ -699,6 +700,7 @@ static void tcp_v4_send_reset(const struct sock *sk, struct sk_buff *skb)
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
preempt_enable();
#ifdef CONFIG_TCP_MD5SIG
out:
......@@ -774,12 +776,14 @@ static void tcp_v4_send_ack(struct net *net,
if (oif)
arg.bound_dev_if = oif;
arg.tos = tos;
preempt_disable();
ip_send_unicast_reply(*this_cpu_ptr(net->ipv4.tcp_sk),
skb, &TCP_SKB_CB(skb)->header.h4.opt,
ip_hdr(skb)->saddr, ip_hdr(skb)->daddr,
&arg, arg.iov[0].iov_len);
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
preempt_enable();
}
static void tcp_v4_timewait_ack(struct sock *sk, struct sk_buff *skb)
......@@ -1151,12 +1155,12 @@ static bool tcp_v4_inbound_md5_hash(const struct sock *sk,
return false;
if (hash_expected && !hash_location) {
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
return true;
}
if (!hash_expected && hash_location) {
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
return true;
}
......@@ -1342,7 +1346,7 @@ struct sock *tcp_v4_syn_recv_sock(const struct sock *sk, struct sk_buff *skb,
return newsk;
exit_overflow:
__NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
NET_INC_STATS(sock_net(sk), LINUX_MIB_LISTENOVERFLOWS);
exit_nonewsk:
dst_release(dst);
exit:
......@@ -1432,8 +1436,8 @@ int tcp_v4_do_rcv(struct sock *sk, struct sk_buff *skb)
return 0;
csum_err:
__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
goto discard;
}
EXPORT_SYMBOL(tcp_v4_do_rcv);
......
......@@ -337,7 +337,7 @@ void tcp_time_wait(struct sock *sk, int state, int timeo)
* socket up. We've got bigger problems than
* non-graceful socket closings.
*/
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPTIMEWAITOVERFLOW);
}
tcp_update_metrics(sk);
......
......@@ -2221,13 +2221,12 @@ bool tcp_schedule_loss_probe(struct sock *sk)
/* Thanks to skb fast clones, we can detect if a prior transmit of
* a packet is still in a qdisc or driver queue.
* In this case, there is very little point doing a retransmit !
* Note: This is called from BH context only.
*/
static bool skb_still_in_host_queue(const struct sock *sk,
const struct sk_buff *skb)
{
if (unlikely(skb_fclone_busy(sk, skb))) {
__NET_INC_STATS(sock_net(sk),
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPSPURIOUS_RTX_HOSTQUEUES);
return true;
}
......@@ -2290,7 +2289,7 @@ void tcp_send_loss_probe(struct sock *sk)
tp->tlp_high_seq = tp->snd_nxt;
probe_sent:
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPLOSSPROBES);
/* Reset s.t. tcp_rearm_rto will restart timer from now */
inet_csk(sk)->icsk_pending = 0;
rearm_timer:
......@@ -2699,7 +2698,7 @@ int tcp_retransmit_skb(struct sock *sk, struct sk_buff *skb, int segs)
tp->retrans_stamp = tcp_skb_timestamp(skb);
} else if (err != -EBUSY) {
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPRETRANSFAIL);
}
if (tp->undo_retrans < 0)
......@@ -2823,7 +2822,7 @@ void tcp_xmit_retransmit_queue(struct sock *sk)
if (tcp_retransmit_skb(sk, skb, segs))
return;
__NET_INC_STATS(sock_net(sk), mib_idx);
NET_INC_STATS(sock_net(sk), mib_idx);
if (tcp_in_cwnd_reduction(sk))
tp->prr_out += tcp_skb_pcount(skb);
......
......@@ -65,7 +65,7 @@ int tcp_rack_mark_lost(struct sock *sk)
if (scb->sacked & TCPCB_SACKED_RETRANS) {
scb->sacked &= ~TCPCB_SACKED_RETRANS;
tp->retrans_out -= tcp_skb_pcount(skb);
__NET_INC_STATS(sock_net(sk),
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPLOSTRETRANSMIT);
}
} else if (!(scb->sacked & TCPCB_RETRANS)) {
......
......@@ -162,7 +162,7 @@ static int tcp_write_timeout(struct sock *sk)
if (tp->syn_fastopen || tp->syn_data)
tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
if (tp->syn_data && icsk->icsk_retransmits == 1)
__NET_INC_STATS(sock_net(sk),
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPFASTOPENACTIVEFAIL);
}
retry_until = icsk->icsk_syn_retries ? : net->ipv4.sysctl_tcp_syn_retries;
......@@ -178,7 +178,7 @@ static int tcp_write_timeout(struct sock *sk)
tp->bytes_acked <= tp->rx_opt.mss_clamp) {
tcp_fastopen_cache_set(sk, 0, NULL, true, 0);
if (icsk->icsk_retransmits == net->ipv4.sysctl_tcp_retries1)
__NET_INC_STATS(sock_net(sk),
NET_INC_STATS(sock_net(sk),
LINUX_MIB_TCPFASTOPENACTIVEFAIL);
}
/* Black hole detection */
......@@ -209,6 +209,7 @@ static int tcp_write_timeout(struct sock *sk)
return 0;
}
/* Called with BH disabled */
void tcp_delack_timer_handler(struct sock *sk)
{
struct tcp_sock *tp = tcp_sk(sk);
......@@ -493,6 +494,7 @@ void tcp_retransmit_timer(struct sock *sk)
out:;
}
/* Called with BH disabled */
void tcp_write_timer_handler(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
......
......@@ -1514,9 +1514,9 @@ static int __udp_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
/* Note that an ENOMEM error is charged twice */
if (rc == -ENOMEM)
__UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS,
UDP_INC_STATS(sock_net(sk), UDP_MIB_RCVBUFERRORS,
is_udplite);
__UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
UDP_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
kfree_skb(skb);
trace_udp_fail_queue_rcv_skb(rc, sk);
return -1;
......
......@@ -649,12 +649,12 @@ static bool tcp_v6_inbound_md5_hash(const struct sock *sk,
return false;
if (hash_expected && !hash_location) {
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5NOTFOUND);
return true;
}
if (!hash_expected && hash_location) {
__NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
NET_INC_STATS(sock_net(sk), LINUX_MIB_TCPMD5UNEXPECTED);
return true;
}
......@@ -825,9 +825,9 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
if (!IS_ERR(dst)) {
skb_dst_set(buff, dst);
ip6_xmit(ctl_sk, buff, &fl6, NULL, tclass);
__TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
TCP_INC_STATS(net, TCP_MIB_OUTSEGS);
if (rst)
__TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
TCP_INC_STATS(net, TCP_MIB_OUTRSTS);
return;
}
......@@ -1276,8 +1276,8 @@ static int tcp_v6_do_rcv(struct sock *sk, struct sk_buff *skb)
kfree_skb(skb);
return 0;
csum_err:
__TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
__TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
TCP_INC_STATS(sock_net(sk), TCP_MIB_CSUMERRORS);
TCP_INC_STATS(sock_net(sk), TCP_MIB_INERRS);
goto discard;
......
......@@ -570,9 +570,9 @@ static int __udpv6_queue_rcv_skb(struct sock *sk, struct sk_buff *skb)
/* Note that an ENOMEM error is charged twice */
if (rc == -ENOMEM)
__UDP6_INC_STATS(sock_net(sk),
UDP6_INC_STATS(sock_net(sk),
UDP_MIB_RCVBUFERRORS, is_udplite);
__UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
UDP6_INC_STATS(sock_net(sk), UDP_MIB_INERRORS, is_udplite);
kfree_skb(skb);
return -1;
}
......
......@@ -89,10 +89,12 @@ void sctp_inq_push(struct sctp_inq *q, struct sctp_chunk *chunk)
* Eventually, we should clean up inqueue to not rely
* on the BH related data structures.
*/
local_bh_disable();
list_add_tail(&chunk->list, &q->in_chunk_list);
if (chunk->asoc)
chunk->asoc->stats.ipackets++;
q->immediate.func(&q->immediate);
local_bh_enable();
}
/* Peek at the next chunk on the inqeue. */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment