Commit 58e0b4ab authored by David S. Miller

Merge branch 'tcp_bbr-Improving-TCP-BBR-performance-for-WiFi-and-cellular-networks'

Priyaranjan Jha says:

====================
tcp_bbr: Improving TCP BBR performance for WiFi and cellular networks

ACK aggregation is quite prevalent with wifi, cellular, and cable modem
link technologies, with ACK decimation in middleboxes, and with common
offloading techniques such as TSO and GRO at end hosts. Previously, BBR
was often cwnd-limited in the presence of severe ACK aggregation, which
resulted in low throughput due to insufficient data in flight.

To achieve good throughput for wifi and other paths with aggregation, this
patch series implements an ACK aggregation estimator for BBR, which
estimates the maximum recent degree of ACK aggregation and adapts cwnd
based on it. The algorithm is further described by the following
presentation:
https://datatracker.ietf.org/meeting/101/materials/slides-101-iccrg-an-update-on-bbr-work-at-google-00

(1) A preparatory patch that refactors bbr_target_cwnd() for generic
    inflight provisioning.

(2) A patch that implements the BBR ACK aggregation estimator and adapts
    cwnd based on the measured degree of ACK aggregation.
====================
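As a rough illustration of the approach described above, here is a
simplified user-space sketch (not the kernel code; the names aggr_est,
aggr_update() and aggr_target_cwnd() below are made up for this example):
on each ACK we compare the packets actually ACKed in the current sampling
epoch against what the estimated bandwidth predicts, keep a windowed max
of the excess, and add that excess on top of the BDP when provisioning
cwnd.

  #include <stdint.h>

  struct aggr_est {
          uint64_t epoch_start_us;  /* start of current ACK sampling epoch */
          uint32_t epoch_acked;     /* packets ACKed since epoch start */
          uint32_t extra_acked[2];  /* windowed max of excess ACKed packets */
          int      win_idx;         /* which half of the window is current */
  };

  /* Feed one ACK into the estimator; bw_pps is estimated bw in packets/sec. */
  static void aggr_update(struct aggr_est *a, uint64_t now_us,
                          uint32_t newly_acked, uint64_t bw_pps, uint32_t cwnd)
  {
          uint64_t epoch_us = now_us - a->epoch_start_us;
          uint64_t expected = bw_pps * epoch_us / 1000000;  /* expected ACKs */
          uint32_t extra;

          /* ACKs arriving no faster than expected: start a fresh epoch. */
          if (a->epoch_acked <= expected) {
                  a->epoch_acked = 0;
                  a->epoch_start_us = now_us;
                  expected = 0;
          }
          a->epoch_acked += newly_acked;

          /* Excess beyond expectation is the aggregation estimate, capped
           * by cwnd so a single outlier cannot blow up the provisioning.
           */
          extra = a->epoch_acked - (uint32_t)expected;
          if (extra > cwnd)
                  extra = cwnd;
          if (extra > a->extra_acked[a->win_idx])
                  a->extra_acked[a->win_idx] = extra;
  }

  /* Provision cwnd as BDP plus the recent max aggregation estimate. */
  static uint32_t aggr_target_cwnd(const struct aggr_est *a, uint32_t bdp)
  {
          uint32_t extra = a->extra_acked[0] > a->extra_acked[1] ?
                           a->extra_acked[0] : a->extra_acked[1];

          return bdp + extra;
  }

In the actual patches the windowed max is additionally aged out over
bbr_extra_acked_win_rtts round trips, the cwnd increment is clamped by
bw * bbr_extra_acked_max_us, and it is only applied once BBR estimates
the pipe is full; the sketch above omits those details for brevity.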
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parents a1ead2ec 78dc70eb
@@ -139,8 +139,8 @@ struct inet_connection_sock {
         } icsk_mtup;
         u32                       icsk_user_timeout;
-        u64                       icsk_ca_priv[88 / sizeof(u64)];
-#define ICSK_CA_PRIV_SIZE       (11 * sizeof(u64))
+        u64                       icsk_ca_priv[104 / sizeof(u64)];
+#define ICSK_CA_PRIV_SIZE       (13 * sizeof(u64))
 };
 
 #define ICSK_TIME_RETRANS       1       /* Retransmit timer */
...
@@ -115,6 +115,14 @@ struct bbr {
                 unused_b:5;
         u32     prior_cwnd;     /* prior cwnd upon entering loss recovery */
         u32     full_bw;        /* recent bw, to estimate if pipe is full */
+
+        /* For tracking ACK aggregation: */
+        u64     ack_epoch_mstamp;       /* start of ACK sampling epoch */
+        u16     extra_acked[2];         /* max excess data ACKed in epoch */
+        u32     ack_epoch_acked:20,     /* packets (S)ACKed in sampling epoch */
+                extra_acked_win_rtts:5, /* age of extra_acked, in round trips */
+                extra_acked_win_idx:1,  /* current index in extra_acked array */
+                unused_c:6;
 };
 
 #define CYCLE_LEN       8       /* number of phases in a pacing gain cycle */
@@ -182,6 +190,15 @@ static const u32 bbr_lt_bw_diff = 4000 / 8;
 /* If we estimate we're policed, use lt_bw for this many round trips: */
 static const u32 bbr_lt_bw_max_rtts = 48;
 
+/* Gain factor for adding extra_acked to target cwnd: */
+static const int bbr_extra_acked_gain = BBR_UNIT;
+/* Window length of extra_acked window. */
+static const u32 bbr_extra_acked_win_rtts = 5;
+/* Max allowed val for ack_epoch_acked, after which sampling epoch is reset */
+static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20;
+/* Time period for clamping cwnd increment due to ack aggregation */
+static const u32 bbr_extra_acked_max_us = 100 * 1000;
+
 static void bbr_check_probe_rtt_done(struct sock *sk);
 
 /* Do we estimate that STARTUP filled the pipe? */
@@ -208,6 +225,16 @@ static u32 bbr_bw(const struct sock *sk)
         return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk);
 }
 
+/* Return maximum extra acked in past k-2k round trips,
+ * where k = bbr_extra_acked_win_rtts.
+ */
+static u16 bbr_extra_acked(const struct sock *sk)
+{
+        struct bbr *bbr = inet_csk_ca(sk);
+
+        return max(bbr->extra_acked[0], bbr->extra_acked[1]);
+}
+
 /* Return rate in bytes per second, optionally with a gain.
  * The order here is chosen carefully to avoid overflow of u64. This should
  * work for input rates of up to 2.9Tbit/sec and gain of 2.89x.
@@ -305,6 +332,8 @@ static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
         if (event == CA_EVENT_TX_START && tp->app_limited) {
                 bbr->idle_restart = 1;
+                bbr->ack_epoch_mstamp = tp->tcp_mstamp;
+                bbr->ack_epoch_acked = 0;
                 /* Avoid pointless buffer overflows: pace at est. bw if we don't
                  * need more speed (we're restarting from idle and app-limited).
                  */
@@ -315,30 +344,19 @@ static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event)
         }
 }
 
-/* Find target cwnd. Right-size the cwnd based on min RTT and the
- * estimated bottleneck bandwidth:
+/* Calculate bdp based on min RTT and the estimated bottleneck bandwidth:
  *
- * cwnd = bw * min_rtt * gain = BDP * gain
+ * bdp = bw * min_rtt * gain
  *
  * The key factor, gain, controls the amount of queue. While a small gain
  * builds a smaller queue, it becomes more vulnerable to noise in RTT
  * measurements (e.g., delayed ACKs or other ACK compression effects). This
  * noise may cause BBR to under-estimate the rate.
- *
- * To achieve full performance in high-speed paths, we budget enough cwnd to
- * fit full-sized skbs in-flight on both end hosts to fully utilize the path:
- *   - one skb in sending host Qdisc,
- *   - one skb in sending host TSO/GSO engine
- *   - one skb being received by receiver host LRO/GRO/delayed-ACK engine
- * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because
- * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets,
- * which allows 2 outstanding 2-packet sequences, to try to keep pipe
- * full even with ACK-every-other-packet delayed ACKs.
  */
-static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain)
+static u32 bbr_bdp(struct sock *sk, u32 bw, int gain)
 {
         struct bbr *bbr = inet_csk_ca(sk);
-        u32 cwnd;
+        u32 bdp;
         u64 w;
 
         /* If we've never had a valid RTT sample, cap cwnd at the initial
@@ -353,7 +371,24 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain)
         w = (u64)bw * bbr->min_rtt_us;
 
         /* Apply a gain to the given value, then remove the BW_SCALE shift. */
-        cwnd = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
+        bdp = (((w * gain) >> BBR_SCALE) + BW_UNIT - 1) / BW_UNIT;
+
+        return bdp;
+}
+
+/* To achieve full performance in high-speed paths, we budget enough cwnd to
+ * fit full-sized skbs in-flight on both end hosts to fully utilize the path:
+ *   - one skb in sending host Qdisc,
+ *   - one skb in sending host TSO/GSO engine
+ *   - one skb being received by receiver host LRO/GRO/delayed-ACK engine
+ * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because
+ * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets,
+ * which allows 2 outstanding 2-packet sequences, to try to keep pipe
+ * full even with ACK-every-other-packet delayed ACKs.
+ */
+static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd, int gain)
+{
+        struct bbr *bbr = inet_csk_ca(sk);
 
         /* Allow enough full-sized skbs in flight to utilize end systems. */
         cwnd += 3 * bbr_tso_segs_goal(sk);
@@ -368,6 +403,17 @@ static u32 bbr_target_cwnd(struct sock *sk, u32 bw, int gain)
         return cwnd;
 }
 
+/* Find inflight based on min RTT and the estimated bottleneck bandwidth. */
+static u32 bbr_inflight(struct sock *sk, u32 bw, int gain)
+{
+        u32 inflight;
+
+        inflight = bbr_bdp(sk, bw, gain);
+        inflight = bbr_quantization_budget(sk, inflight, gain);
+
+        return inflight;
+}
+
 /* With pacing at lower layers, there's often less data "in the network" than
  * "in flight". With TSQ and departure time pacing at lower layers (e.g. fq),
  * we often have several skbs queued in the pacing layer with a pre-scheduled
@@ -401,6 +447,22 @@ static u32 bbr_packets_in_net_at_edt(struct sock *sk, u32 inflight_now)
         return inflight_at_edt - interval_delivered;
 }
 
+/* Find the cwnd increment based on estimate of ack aggregation */
+static u32 bbr_ack_aggregation_cwnd(struct sock *sk)
+{
+        u32 max_aggr_cwnd, aggr_cwnd = 0;
+
+        if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) {
+                max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us)
+                                / BW_UNIT;
+                aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk))
+                             >> BBR_SCALE;
+                aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd);
+        }
+
+        return aggr_cwnd;
+}
+
 /* An optimization in BBR to reduce losses: On the first round of recovery, we
  * follow the packet conservation principle: send P packets per P packets acked.
  * After that, we slow-start and send at most 2*P packets per P packets acked.
@@ -461,8 +523,15 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs,
         if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd))
                 goto done;
 
+        target_cwnd = bbr_bdp(sk, bw, gain);
+
+        /* Increment the cwnd to account for excess ACKed data that seems
+         * due to aggregation (of data and/or ACKs) visible in the ACK stream.
+         */
+        target_cwnd += bbr_ack_aggregation_cwnd(sk);
+        target_cwnd = bbr_quantization_budget(sk, target_cwnd, gain);
+
         /* If we're below target cwnd, slow start cwnd toward target cwnd. */
-        target_cwnd = bbr_target_cwnd(sk, bw, gain);
         if (bbr_full_bw_reached(sk))  /* only cut cwnd if we filled the pipe */
                 cwnd = min(cwnd + acked, target_cwnd);
         else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND)
@@ -503,14 +572,14 @@ static bool bbr_is_next_cycle_phase(struct sock *sk,
         if (bbr->pacing_gain > BBR_UNIT)
                 return is_full_length &&
                         (rs->losses ||  /* perhaps pacing_gain*BDP won't fit */
-                         inflight >= bbr_target_cwnd(sk, bw, bbr->pacing_gain));
+                         inflight >= bbr_inflight(sk, bw, bbr->pacing_gain));
 
         /* A pacing_gain < 1.0 tries to drain extra queue we added if bw
          * probing didn't find more bw. If inflight falls to match BDP then we
          * estimate queue is drained; persisting would underutilize the pipe.
          */
         return is_full_length ||
-                inflight <= bbr_target_cwnd(sk, bw, BBR_UNIT);
+                inflight <= bbr_inflight(sk, bw, BBR_UNIT);
 }
 
 static void bbr_advance_cycle_phase(struct sock *sk)
@@ -727,6 +796,67 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs)
         }
 }
 
+/* Estimates the windowed max degree of ack aggregation.
+ * This is used to provision extra in-flight data to keep sending during
+ * inter-ACK silences.
+ *
+ * Degree of ack aggregation is estimated as extra data acked beyond expected.
+ *
+ * max_extra_acked = "maximum recent excess data ACKed beyond max_bw * interval"
+ * cwnd += max_extra_acked
+ *
+ * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms).
+ * Max filter is an approximate sliding window of 5-10 (packet timed) round
+ * trips.
+ */
+static void bbr_update_ack_aggregation(struct sock *sk,
+                                       const struct rate_sample *rs)
+{
+        u32 epoch_us, expected_acked, extra_acked;
+        struct bbr *bbr = inet_csk_ca(sk);
+        struct tcp_sock *tp = tcp_sk(sk);
+
+        if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 ||
+            rs->delivered < 0 || rs->interval_us <= 0)
+                return;
+
+        if (bbr->round_start) {
+                bbr->extra_acked_win_rtts = min(0x1F,
+                                                bbr->extra_acked_win_rtts + 1);
+                if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) {
+                        bbr->extra_acked_win_rtts = 0;
+                        bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ?
+                                                   0 : 1;
+                        bbr->extra_acked[bbr->extra_acked_win_idx] = 0;
+                }
+        }
+
+        /* Compute how many packets we expected to be delivered over epoch. */
+        epoch_us = tcp_stamp_us_delta(tp->delivered_mstamp,
+                                      bbr->ack_epoch_mstamp);
+        expected_acked = ((u64)bbr_bw(sk) * epoch_us) / BW_UNIT;
+
+        /* Reset the aggregation epoch if ACK rate is below expected rate or
+         * significantly large no. of ack received since epoch (potentially
+         * quite old epoch).
+         */
+        if (bbr->ack_epoch_acked <= expected_acked ||
+            (bbr->ack_epoch_acked + rs->acked_sacked >=
+             bbr_ack_epoch_acked_reset_thresh)) {
+                bbr->ack_epoch_acked = 0;
+                bbr->ack_epoch_mstamp = tp->delivered_mstamp;
+                expected_acked = 0;
+        }
+
+        /* Compute excess data delivered, beyond what was expected. */
+        bbr->ack_epoch_acked = min_t(u32, 0xFFFFF,
+                                     bbr->ack_epoch_acked + rs->acked_sacked);
+        extra_acked = bbr->ack_epoch_acked - expected_acked;
+        extra_acked = min(extra_acked, tp->snd_cwnd);
+        if (extra_acked > bbr->extra_acked[bbr->extra_acked_win_idx])
+                bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked;
+}
+
 /* Estimate when the pipe is full, using the change in delivery rate: BBR
  * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by
  * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited
@@ -762,11 +892,11 @@ static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs)
         if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) {
                 bbr->mode = BBR_DRAIN;  /* drain queue we created */
                 tcp_sk(sk)->snd_ssthresh =
-                                bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT);
+                                bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT);
         }       /* fall through to check if in-flight is already small: */
         if (bbr->mode == BBR_DRAIN &&
             bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <=
-            bbr_target_cwnd(sk, bbr_max_bw(sk), BBR_UNIT))
+            bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT))
                 bbr_reset_probe_bw_mode(sk);  /* we estimate queue is drained */
 }
@@ -881,6 +1011,7 @@ static void bbr_update_gains(struct sock *sk)
 static void bbr_update_model(struct sock *sk, const struct rate_sample *rs)
 {
         bbr_update_bw(sk, rs);
+        bbr_update_ack_aggregation(sk, rs);
         bbr_update_cycle_phase(sk, rs);
         bbr_check_full_bw_reached(sk, rs);
         bbr_check_drain(sk, rs);
@@ -932,6 +1063,13 @@ static void bbr_init(struct sock *sk)
         bbr_reset_lt_bw_sampling(sk);
         bbr_reset_startup_mode(sk);
 
+        bbr->ack_epoch_mstamp = tp->tcp_mstamp;
+        bbr->ack_epoch_acked = 0;
+        bbr->extra_acked_win_rtts = 0;
+        bbr->extra_acked_win_idx = 0;
+        bbr->extra_acked[0] = 0;
+        bbr->extra_acked[1] = 0;
+
         cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED);
 }
...