Commit 75c119af authored by Eric Dumazet's avatar Eric Dumazet Committed by David S. Miller

tcp: implement rb-tree based retransmit queue

Using a linear list to store all skbs in write queue has been okay
for quite a while : O(N) is not too bad when N < 500.

Things get messy when N is the order of 100,000 : Modern TCP stacks
want 10Gbit+ of throughput even with 200 ms RTT flows.

40 ns per cache line miss means a full scan can use 4 ms,
blowing away CPU caches.

SACK processing often can use various hints to avoid parsing
whole retransmit queue. But with high packet losses and/or high
reordering, hints no longer work.

Sender has to process thousands of unfriendly SACK, accumulating
a huge socket backlog, burning a cpu and massively dropping packets.

Using an rb-tree for retransmit queue has been avoided for years
because it added complexity and overhead, but now is the time
to be more resistant and say no to quadratic behavior.

1) RTX queue is no longer part of the write queue : already sent skbs
are stored in one rb-tree.

2) Since reaching the head of write queue no longer needs
sk->sk_send_head, we added an union of sk_send_head and tcp_rtx_queue

Tested:

 On receiver :
 netem on ingress : delay 150ms 200us loss 1
 GRO disabled to force stress and SACK storms.

for f in `seq 1 10`
do
 ./netperf -H lpaa6 -l30 -- -K bbr -o THROUGHPUT|tail -1
done | awk '{print $0} {sum += $0} END {printf "%7u\n",sum}'

Before patch :

323.87
351.48
339.59
338.62
306.72
204.07
304.93
291.88
202.47
176.88
   2840

After patch:

1700.83
2207.98
2070.17
1544.26
2114.76
2124.89
1693.14
1080.91
2216.82
1299.94
  18053
Signed-off-by: default avatarEric Dumazet <edumazet@google.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parent f3319816
......@@ -60,7 +60,7 @@
#include <linux/sched.h>
#include <linux/wait.h>
#include <linux/cgroup-defs.h>
#include <linux/rbtree.h>
#include <linux/filter.h>
#include <linux/rculist_nulls.h>
#include <linux/poll.h>
......@@ -397,7 +397,10 @@ struct sock {
int sk_wmem_queued;
refcount_t sk_wmem_alloc;
unsigned long sk_tsq_flags;
struct sk_buff *sk_send_head;
union {
struct sk_buff *sk_send_head;
struct rb_root tcp_rtx_queue;
};
struct sk_buff_head sk_write_queue;
__s32 sk_peek_off;
int sk_write_pending;
......
......@@ -551,7 +551,13 @@ void tcp_xmit_retransmit_queue(struct sock *);
void tcp_simple_retransmit(struct sock *);
void tcp_enter_recovery(struct sock *sk, bool ece_ack);
int tcp_trim_head(struct sock *, struct sk_buff *, u32);
int tcp_fragment(struct sock *, struct sk_buff *, u32, unsigned int, gfp_t);
enum tcp_queue {
TCP_FRAG_IN_WRITE_QUEUE,
TCP_FRAG_IN_RTX_QUEUE,
};
int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
struct sk_buff *skb, u32 len,
unsigned int mss_now, gfp_t gfp);
void tcp_send_probe0(struct sock *);
void tcp_send_partial(struct sock *);
......@@ -1608,6 +1614,11 @@ static inline void tcp_skb_tsorted_anchor_cleanup(struct sk_buff *skb)
void tcp_write_queue_purge(struct sock *sk);
static inline struct sk_buff *tcp_rtx_queue_head(const struct sock *sk)
{
return skb_rb_first(&sk->tcp_rtx_queue);
}
static inline struct sk_buff *tcp_write_queue_head(const struct sock *sk)
{
return skb_peek(&sk->sk_write_queue);
......@@ -1630,18 +1641,12 @@ static inline struct sk_buff *tcp_write_queue_prev(const struct sock *sk,
return skb_queue_prev(&sk->sk_write_queue, skb);
}
#define tcp_for_write_queue(skb, sk) \
skb_queue_walk(&(sk)->sk_write_queue, skb)
#define tcp_for_write_queue_from(skb, sk) \
skb_queue_walk_from(&(sk)->sk_write_queue, skb)
#define tcp_for_write_queue_from_safe(skb, tmp, sk) \
skb_queue_walk_from_safe(&(sk)->sk_write_queue, skb, tmp)
static inline struct sk_buff *tcp_send_head(const struct sock *sk)
{
return sk->sk_send_head;
return skb_peek(&sk->sk_write_queue);
}
static inline bool tcp_skb_is_last(const struct sock *sk,
......@@ -1650,29 +1655,30 @@ static inline bool tcp_skb_is_last(const struct sock *sk,
return skb_queue_is_last(&sk->sk_write_queue, skb);
}
static inline void tcp_advance_send_head(struct sock *sk, const struct sk_buff *skb)
static inline bool tcp_write_queue_empty(const struct sock *sk)
{
if (tcp_skb_is_last(sk, skb))
sk->sk_send_head = NULL;
else
sk->sk_send_head = tcp_write_queue_next(sk, skb);
return skb_queue_empty(&sk->sk_write_queue);
}
static inline bool tcp_rtx_queue_empty(const struct sock *sk)
{
return RB_EMPTY_ROOT(&sk->tcp_rtx_queue);
}
static inline bool tcp_rtx_and_write_queues_empty(const struct sock *sk)
{
return tcp_rtx_queue_empty(sk) && tcp_write_queue_empty(sk);
}
static inline void tcp_check_send_head(struct sock *sk, struct sk_buff *skb_unlinked)
{
if (sk->sk_send_head == skb_unlinked) {
sk->sk_send_head = NULL;
if (tcp_write_queue_empty(sk))
tcp_chrono_stop(sk, TCP_CHRONO_BUSY);
}
if (tcp_sk(sk)->highest_sack == skb_unlinked)
tcp_sk(sk)->highest_sack = NULL;
}
static inline void tcp_init_send_head(struct sock *sk)
{
sk->sk_send_head = NULL;
}
static inline void __tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb)
{
__skb_queue_tail(&sk->sk_write_queue, skb);
......@@ -1683,8 +1689,7 @@ static inline void tcp_add_write_queue_tail(struct sock *sk, struct sk_buff *skb
__tcp_add_write_queue_tail(sk, skb);
/* Queue it, remembering where we must start sending. */
if (sk->sk_send_head == NULL) {
sk->sk_send_head = skb;
if (sk->sk_write_queue.next == skb) {
tcp_chrono_start(sk, TCP_CHRONO_BUSY);
if (tcp_sk(sk)->highest_sack == NULL)
......@@ -1697,35 +1702,32 @@ static inline void __tcp_add_write_queue_head(struct sock *sk, struct sk_buff *s
__skb_queue_head(&sk->sk_write_queue, skb);
}
/* Insert buff after skb on the write queue of sk. */
static inline void tcp_insert_write_queue_after(struct sk_buff *skb,
struct sk_buff *buff,
struct sock *sk)
{
__skb_queue_after(&sk->sk_write_queue, skb, buff);
}
/* Insert new before skb on the write queue of sk. */
static inline void tcp_insert_write_queue_before(struct sk_buff *new,
struct sk_buff *skb,
struct sock *sk)
{
__skb_queue_before(&sk->sk_write_queue, skb, new);
if (sk->sk_send_head == skb)
sk->sk_send_head = new;
}
static inline void tcp_unlink_write_queue(struct sk_buff *skb, struct sock *sk)
{
list_del(&skb->tcp_tsorted_anchor);
tcp_skb_tsorted_anchor_cleanup(skb);
__skb_unlink(skb, &sk->sk_write_queue);
}
static inline bool tcp_write_queue_empty(struct sock *sk)
void tcp_rbtree_insert(struct rb_root *root, struct sk_buff *skb);
static inline void tcp_rtx_queue_unlink(struct sk_buff *skb, struct sock *sk)
{
return skb_queue_empty(&sk->sk_write_queue);
tcp_skb_tsorted_anchor_cleanup(skb);
rb_erase(&skb->rbnode, &sk->tcp_rtx_queue);
}
static inline void tcp_rtx_queue_unlink_and_free(struct sk_buff *skb, struct sock *sk)
{
list_del(&skb->tcp_tsorted_anchor);
tcp_rtx_queue_unlink(skb, sk);
sk_wmem_free_skb(sk, skb);
}
static inline void tcp_push_pending_frames(struct sock *sk)
......@@ -1754,8 +1756,9 @@ static inline u32 tcp_highest_sack_seq(struct tcp_sock *tp)
static inline void tcp_advance_highest_sack(struct sock *sk, struct sk_buff *skb)
{
tcp_sk(sk)->highest_sack = tcp_skb_is_last(sk, skb) ? NULL :
tcp_write_queue_next(sk, skb);
struct sk_buff *next = skb_rb_next(skb);
tcp_sk(sk)->highest_sack = next ?: tcp_send_head(sk);
}
static inline struct sk_buff *tcp_highest_sack(struct sock *sk)
......@@ -1765,7 +1768,9 @@ static inline struct sk_buff *tcp_highest_sack(struct sock *sk)
static inline void tcp_highest_sack_reset(struct sock *sk)
{
tcp_sk(sk)->highest_sack = tcp_write_queue_head(sk);
struct sk_buff *skb = tcp_rtx_queue_head(sk);
tcp_sk(sk)->highest_sack = skb ?: tcp_send_head(sk);
}
/* Called when old skb is about to be deleted (to be combined with new skb) */
......@@ -1935,7 +1940,7 @@ extern void tcp_rack_reo_timeout(struct sock *sk);
/* At how many usecs into the future should the RTO fire? */
static inline s64 tcp_rto_delta_us(const struct sock *sk)
{
const struct sk_buff *skb = tcp_write_queue_head(sk);
const struct sk_buff *skb = tcp_rtx_queue_head(sk);
u32 rto = inet_csk(sk)->icsk_rto;
u64 rto_time_stamp_us = skb->skb_mstamp + jiffies_to_usecs(rto);
......
......@@ -413,6 +413,7 @@ void tcp_init_sock(struct sock *sk)
struct tcp_sock *tp = tcp_sk(sk);
tp->out_of_order_queue = RB_ROOT;
sk->tcp_rtx_queue = RB_ROOT;
tcp_init_xmit_timers(sk);
INIT_LIST_HEAD(&tp->tsq_node);
INIT_LIST_HEAD(&tp->tsorted_sent_queue);
......@@ -701,10 +702,9 @@ static void tcp_push(struct sock *sk, int flags, int mss_now,
struct tcp_sock *tp = tcp_sk(sk);
struct sk_buff *skb;
if (!tcp_send_head(sk))
return;
skb = tcp_write_queue_tail(sk);
if (!skb)
return;
if (!(flags & MSG_MORE) || forced_push(tp))
tcp_mark_push(tp, skb);
......@@ -964,14 +964,14 @@ ssize_t do_tcp_sendpages(struct sock *sk, struct page *page, int offset,
int copy, i;
bool can_coalesce;
if (!tcp_send_head(sk) || (copy = size_goal - skb->len) <= 0 ||
if (!skb || (copy = size_goal - skb->len) <= 0 ||
!tcp_skb_can_collapse_to(skb)) {
new_segment:
if (!sk_stream_memory_free(sk))
goto wait_for_sndbuf;
skb = sk_stream_alloc_skb(sk, 0, sk->sk_allocation,
skb_queue_empty(&sk->sk_write_queue));
tcp_rtx_and_write_queues_empty(sk));
if (!skb)
goto wait_for_memory;
......@@ -1199,7 +1199,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
goto out_err;
}
skb = tcp_send_head(sk) ? tcp_write_queue_tail(sk) : NULL;
skb = tcp_write_queue_tail(sk);
uarg = sock_zerocopy_realloc(sk, size, skb_zcopy(skb));
if (!uarg) {
err = -ENOBUFS;
......@@ -1275,7 +1275,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
int max = size_goal;
skb = tcp_write_queue_tail(sk);
if (tcp_send_head(sk)) {
if (skb) {
if (skb->ip_summed == CHECKSUM_NONE)
max = mss_now;
copy = max - skb->len;
......@@ -1295,7 +1295,7 @@ int tcp_sendmsg_locked(struct sock *sk, struct msghdr *msg, size_t size)
process_backlog = false;
goto restart;
}
first_skb = skb_queue_empty(&sk->sk_write_queue);
first_skb = tcp_rtx_and_write_queues_empty(sk);
skb = sk_stream_alloc_skb(sk,
select_size(sk, sg, first_skb),
sk->sk_allocation,
......@@ -1521,6 +1521,13 @@ static int tcp_peek_sndq(struct sock *sk, struct msghdr *msg, int len)
/* XXX -- need to support SO_PEEK_OFF */
skb_rbtree_walk(skb, &sk->tcp_rtx_queue) {
err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
if (err)
return err;
copied += skb->len;
}
skb_queue_walk(&sk->sk_write_queue, skb) {
err = skb_copy_datagram_msg(skb, 0, msg, skb->len);
if (err)
......@@ -2320,6 +2327,22 @@ static inline bool tcp_need_reset(int state)
TCPF_FIN_WAIT2 | TCPF_SYN_RECV);
}
static void tcp_rtx_queue_purge(struct sock *sk)
{
struct rb_node *p = rb_first(&sk->tcp_rtx_queue);
while (p) {
struct sk_buff *skb = rb_to_skb(p);
p = rb_next(p);
/* Since we are deleting whole queue, no need to
* list_del(&skb->tcp_tsorted_anchor)
*/
tcp_rtx_queue_unlink(skb, sk);
sk_wmem_free_skb(sk, skb);
}
}
void tcp_write_queue_purge(struct sock *sk)
{
struct sk_buff *skb;
......@@ -2329,6 +2352,7 @@ void tcp_write_queue_purge(struct sock *sk)
tcp_skb_tsorted_anchor_cleanup(skb);
sk_wmem_free_skb(sk, skb);
}
tcp_rtx_queue_purge(sk);
INIT_LIST_HEAD(&tcp_sk(sk)->tsorted_sent_queue);
sk_mem_reclaim(sk);
tcp_clear_all_retrans_hints(tcp_sk(sk));
......@@ -2392,7 +2416,6 @@ int tcp_disconnect(struct sock *sk, int flags)
* issue in __tcp_select_window()
*/
icsk->icsk_ack.rcv_mss = TCP_MIN_MSS;
tcp_init_send_head(sk);
memset(&tp->rx_opt, 0, sizeof(tp->rx_opt));
__sk_dst_reset(sk);
dst_release(sk->sk_rx_dst);
......
This diff is collapsed.
......@@ -480,7 +480,7 @@ void tcp_v4_err(struct sk_buff *icmp_skb, u32 info)
TCP_TIMEOUT_INIT;
icsk->icsk_rto = inet_csk_rto_backoff(icsk, TCP_RTO_MAX);
skb = tcp_write_queue_head(sk);
skb = tcp_rtx_queue_head(sk);
BUG_ON(!skb);
tcp_mstamp_refresh(tp);
......
This diff is collapsed.
......@@ -156,8 +156,13 @@ static bool retransmits_timed_out(struct sock *sk,
return false;
start_ts = tcp_sk(sk)->retrans_stamp;
if (unlikely(!start_ts))
start_ts = tcp_skb_timestamp(tcp_write_queue_head(sk));
if (unlikely(!start_ts)) {
struct sk_buff *head = tcp_rtx_queue_head(sk);
if (!head)
return false;
start_ts = tcp_skb_timestamp(head);
}
if (likely(timeout == 0)) {
linear_backoff_thresh = ilog2(TCP_RTO_MAX/rto_base);
......@@ -304,11 +309,12 @@ static void tcp_delack_timer(unsigned long data)
static void tcp_probe_timer(struct sock *sk)
{
struct inet_connection_sock *icsk = inet_csk(sk);
struct sk_buff *skb = tcp_send_head(sk);
struct tcp_sock *tp = tcp_sk(sk);
int max_probes;
u32 start_ts;
if (tp->packets_out || !tcp_send_head(sk)) {
if (tp->packets_out || !skb) {
icsk->icsk_probes_out = 0;
return;
}
......@@ -321,9 +327,9 @@ static void tcp_probe_timer(struct sock *sk)
* corresponding system limit. We also implement similar policy when
* we use RTO to probe window in tcp_retransmit_timer().
*/
start_ts = tcp_skb_timestamp(tcp_send_head(sk));
start_ts = tcp_skb_timestamp(skb);
if (!start_ts)
tcp_send_head(sk)->skb_mstamp = tp->tcp_mstamp;
skb->skb_mstamp = tp->tcp_mstamp;
else if (icsk->icsk_user_timeout &&
(s32)(tcp_time_stamp(tp) - start_ts) >
jiffies_to_msecs(icsk->icsk_user_timeout))
......@@ -408,7 +414,7 @@ void tcp_retransmit_timer(struct sock *sk)
if (!tp->packets_out)
goto out;
WARN_ON(tcp_write_queue_empty(sk));
WARN_ON(tcp_rtx_queue_empty(sk));
tp->tlp_high_seq = 0;
......@@ -441,7 +447,7 @@ void tcp_retransmit_timer(struct sock *sk)
goto out;
}
tcp_enter_loss(sk);
tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1);
tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1);
__sk_dst_reset(sk);
goto out_reset_timer;
}
......@@ -473,7 +479,7 @@ void tcp_retransmit_timer(struct sock *sk)
tcp_enter_loss(sk);
if (tcp_retransmit_skb(sk, tcp_write_queue_head(sk), 1) > 0) {
if (tcp_retransmit_skb(sk, tcp_rtx_queue_head(sk), 1) > 0) {
/* Retransmission failed because of local congestion,
* do not backoff.
*/
......@@ -647,7 +653,7 @@ static void tcp_keepalive_timer (unsigned long data)
elapsed = keepalive_time_when(tp);
/* It is alive without keepalive 8) */
if (tp->packets_out || tcp_send_head(sk))
if (tp->packets_out || !tcp_write_queue_empty(sk))
goto resched;
elapsed = keepalive_time_elapsed(tp);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment