Commit e3058450 authored by Eric Dumazet, committed by David S. Miller

dctcp: more accurate tracking of packets delivery

After commit e21db6f6 ("tcp: track total bytes delivered with ECN CE marks"),
the core TCP stack does a very good job of tracking ECN signals.

The "sender's best estimate of CE information" Yuchung mentioned in his
patch is indeed the best we can do.

DCTCP can use tp->delivered_ce and tp->delivered rather than duplicating this logic,
and rely on the existing best estimate.
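
As an illustration only (not part of the patch), the per-RTT fraction of
CE-marked packets falls straight out of the two stack-wide counters once a
snapshot is taken at the start of each RTT. The struct and helper names
below are invented for this sketch:

	#include <stdint.h>

	#define DCTCP_MAX_ALPHA	1024U	/* 10-bit fixed point, 1024 == 100% */

	/* Snapshot of the delivery counters at the start of the current RTT;
	 * mirrors the old_delivered/old_delivered_ce fields added to struct dctcp.
	 */
	struct ce_snapshot {
		uint32_t old_delivered;
		uint32_t old_delivered_ce;
	};

	/* F = delivered_ce / delivered for the RTT that just ended, scaled so
	 * that 100% CE marking maps to DCTCP_MAX_ALPHA. Scaling before the
	 * divide keeps everything in u32, the same trick the patch uses to
	 * drop the do_div().
	 */
	static uint32_t ce_fraction(const struct ce_snapshot *s,
				    uint32_t delivered, uint32_t delivered_ce)
	{
		uint32_t d  = delivered - s->old_delivered;
		uint32_t ce = delivered_ce - s->old_delivered_ce;

		if (!d)
			return 0;
		return (ce << 10) / d;	/* ce must stay below 2^22 per RTT */
	}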

This fixes several problems, since the current DCTCP logic does not handle losses,
GRO, or ACK aggregation very well.

This also removes a dubious use of inet_csk(sk)->icsk_ack.rcv_mss
(it should have been tp->mss_cache), and a 64-bit divide.

Finally, this shows that the DCTCP logic of calling dctcp_update_alpha() for
every ACK could be done differently, calling it only once per RTT.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Yuchung Cheng <ycheng@google.com>
Cc: Neal Cardwell <ncardwell@google.com>
Cc: Soheil Hassas Yeganeh <soheil@google.com>
Cc: Florian Westphal <fw@strlen.de>
Cc: Daniel Borkmann <daniel@iogearbox.net>
Cc: Lawrence Brakmo <brakmo@fb.com>
Cc: Abdul Kabbani <akabbani@google.com>
Acked-by: Soheil Hassas Yeganeh <soheil@google.com>
Acked-by: Neal Cardwell <ncardwell@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent a5f62298
@@ -49,9 +49,8 @@
 #define DCTCP_MAX_ALPHA	1024U
 
 struct dctcp {
-	u32 acked_bytes_ecn;
-	u32 acked_bytes_total;
-	u32 prior_snd_una;
+	u32 old_delivered;
+	u32 old_delivered_ce;
 	u32 prior_rcv_nxt;
 	u32 dctcp_alpha;
 	u32 next_seq;
@@ -73,8 +72,8 @@ static void dctcp_reset(const struct tcp_sock *tp, struct dctcp *ca)
 {
 	ca->next_seq = tp->snd_nxt;
 
-	ca->acked_bytes_ecn = 0;
-	ca->acked_bytes_total = 0;
+	ca->old_delivered = tp->delivered;
+	ca->old_delivered_ce = tp->delivered_ce;
 }
 
 static void dctcp_init(struct sock *sk)
@@ -86,7 +85,6 @@ static void dctcp_init(struct sock *sk)
 	     sk->sk_state == TCP_CLOSE)) {
 		struct dctcp *ca = inet_csk_ca(sk);
 
-		ca->prior_snd_una = tp->snd_una;
 		ca->prior_rcv_nxt = tp->rcv_nxt;
 
 		ca->dctcp_alpha = min(dctcp_alpha_on_init, DCTCP_MAX_ALPHA);
@@ -118,37 +116,25 @@ static void dctcp_update_alpha(struct sock *sk, u32 flags)
 {
 	const struct tcp_sock *tp = tcp_sk(sk);
 	struct dctcp *ca = inet_csk_ca(sk);
-	u32 acked_bytes = tp->snd_una - ca->prior_snd_una;
-
-	/* If ack did not advance snd_una, count dupack as MSS size.
-	 * If ack did update window, do not count it at all.
-	 */
-	if (acked_bytes == 0 && !(flags & CA_ACK_WIN_UPDATE))
-		acked_bytes = inet_csk(sk)->icsk_ack.rcv_mss;
-	if (acked_bytes) {
-		ca->acked_bytes_total += acked_bytes;
-		ca->prior_snd_una = tp->snd_una;
-
-		if (flags & CA_ACK_ECE)
-			ca->acked_bytes_ecn += acked_bytes;
-	}
 
 	/* Expired RTT */
 	if (!before(tp->snd_una, ca->next_seq)) {
-		u64 bytes_ecn = ca->acked_bytes_ecn;
+		u32 delivered_ce = tp->delivered_ce - ca->old_delivered_ce;
 		u32 alpha = ca->dctcp_alpha;
 
 		/* alpha = (1 - g) * alpha + g * F */
 
 		alpha -= min_not_zero(alpha, alpha >> dctcp_shift_g);
-		if (bytes_ecn) {
+		if (delivered_ce) {
+			u32 delivered = tp->delivered - ca->old_delivered;
+
 			/* If dctcp_shift_g == 1, a 32bit value would overflow
-			 * after 8 Mbytes.
+			 * after 8 M packets.
 			 */
-			bytes_ecn <<= (10 - dctcp_shift_g);
-			do_div(bytes_ecn, max(1U, ca->acked_bytes_total));
+			delivered_ce <<= (10 - dctcp_shift_g);
+			delivered_ce /= max(1U, delivered);
 
-			alpha = min(alpha + (u32)bytes_ecn, DCTCP_MAX_ALPHA);
+			alpha = min(alpha + delivered_ce, DCTCP_MAX_ALPHA);
 		}
 		/* dctcp_alpha can be read from dctcp_get_info() without
 		 * synchro, so we ask compiler to not use dctcp_alpha
@@ -200,6 +186,7 @@ static size_t dctcp_get_info(struct sock *sk, u32 ext, int *attr,
 				union tcp_cc_info *info)
 {
 	const struct dctcp *ca = inet_csk_ca(sk);
+	const struct tcp_sock *tp = tcp_sk(sk);
 
 	/* Fill it also in case of VEGASINFO due to req struct limits.
 	 * We can still correctly retrieve it later.
@@ -211,8 +198,10 @@
 		info->dctcp.dctcp_enabled = 1;
 		info->dctcp.dctcp_ce_state = (u16) ca->ce_state;
 		info->dctcp.dctcp_alpha = ca->dctcp_alpha;
-		info->dctcp.dctcp_ab_ecn = ca->acked_bytes_ecn;
-		info->dctcp.dctcp_ab_tot = ca->acked_bytes_total;
+		info->dctcp.dctcp_ab_ecn = tp->mss_cache *
+					   (tp->delivered_ce - ca->old_delivered_ce);
+		info->dctcp.dctcp_ab_tot = tp->mss_cache *
+					   (tp->delivered - ca->old_delivered);
 	}
 
 	*attr = INET_DIAG_DCTCPINFO;
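
To make the fixed-point update above easier to follow in isolation, here is a
small standalone sketch of the same arithmetic; update_alpha() and the numbers
in main() are invented for this example, and the kernel-internal helpers
(min_not_zero(), max()) are open-coded:

	#include <stdint.h>
	#include <stdio.h>

	#define DCTCP_MAX_ALPHA	1024U

	/* alpha = (1 - g) * alpha + g * F, with g = 1/2^shift_g and F the
	 * fraction of CE-marked packets in the RTT, all in units of
	 * DCTCP_MAX_ALPHA (10-bit fixed point).
	 */
	static uint32_t update_alpha(uint32_t alpha, uint32_t delivered,
				     uint32_t delivered_ce, uint32_t shift_g)
	{
		uint32_t dec = alpha >> shift_g;

		/* Mimic min_not_zero(): once alpha >> shift_g rounds to 0,
		 * decay alpha all the way to 0 rather than leaving a residue.
		 */
		alpha -= dec ? dec : alpha;

		if (delivered_ce) {
			/* g * F in the same fixed point as alpha. With
			 * shift_g == 1 the shift is by 9 bits, so delivered_ce
			 * must stay below 2^23 (~8.4 million packets per RTT)
			 * for this u32 not to overflow -- the "8 M packets"
			 * noted in the code comment above.
			 */
			uint32_t term = delivered_ce << (10 - shift_g);

			term /= delivered ? delivered : 1;	/* like max(1U, delivered) */
			alpha += term;
			if (alpha > DCTCP_MAX_ALPHA)
				alpha = DCTCP_MAX_ALPHA;
		}
		return alpha;
	}

	int main(void)
	{
		/* Example RTT: 1000 packets delivered, 250 of them CE-marked,
		 * g = 1/16 (shift_g == 4): g * F = 0.25 / 16 -> 16/1024.
		 */
		uint32_t alpha = update_alpha(0, 1000, 250, 4);

		printf("alpha = %u / %u\n", alpha, DCTCP_MAX_ALPHA);	/* 16 / 1024 */
		return 0;
	}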