Commit a1ac9c8a authored by Martin KaFai Lau, committed by David S. Miller

net: Add skb->mono_delivery_time to distinguish mono delivery_time from (rcv) timestamp

skb->tstamp was first used as the (rcv) timestamp.
The major usage is to report it to the user (e.g. SO_TIMESTAMP).

Later, skb->tstamp is also set as the (future) delivery_time (e.g. EDT in TCP)
during egress and used by the qdisc (e.g. sch_fq) to make decision on when
the skb can be passed to the dev.

Currently, there is no way to tell whether skb->tstamp holds the (rcv)
timestamp or the delivery_time, so it is always reset to 0 whenever the
skb is forwarded between egress and ingress.

While it makes sense to always clear the (rcv) timestamp in skb->tstamp
to avoid confusing sch_fq that expects the delivery_time, it is a
performance issue [0] to clear the delivery_time if the skb finally
egresses to a fq@phy-dev.  For example, when forwarding from egress to
ingress and then finally back to egress:

            tcp-sender => veth@netns => veth@hostns => fq@eth0@hostns
                                     ^              ^
                                     reset          reset

This patch adds one bit, skb->mono_delivery_time, to flag that skb->tstamp
is storing the mono delivery_time (EDT) instead of the (rcv) timestamp.

The current use case is to keep the TCP mono delivery_time (EDT) and
to be used with sch_fq.  A later patch will also allow tc-bpf@ingress
to read and change the mono delivery_time.

In the future, another bit (e.g. skb->user_delivery_time) can be added
for the SCM_TXTIME where the clock base is tracked by sk->sk_clockid.

[ This patch is a prep work.  The following patches will
  get the other parts of the stack ready first.  Then another patch
  after that will finally set the skb->mono_delivery_time. ]

skb_set_delivery_time() function is added.  It is used by the tcp_output.c
and during ip[6] fragmentation to assign the delivery_time to
the skb->tstamp and also set the skb->mono_delivery_time.

A note on the change in ip_send_unicast_reply() in ip_output.c.
It is only used by TCP to send reset/ack out of a ctl_sk.
Like the new skb_set_delivery_time(), this patch sets
the skb->mono_delivery_time to 0 for now as a
placeholder.  It will be enabled in a later patch.
A similar case in tcp_ipv6 can be done with
skb_set_delivery_time() in tcp_v6_send_response().

[0] (slide 22): https://linuxplumbersconf.org/event/11/contributions/953/attachments/867/1658/LPC_2021_BPF_Datapath_Extensions.pdf

Signed-off-by: Martin KaFai Lau <kafai@fb.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 6fb8661c
...@@ -795,6 +795,10 @@ typedef unsigned char *sk_buff_data_t; ...@@ -795,6 +795,10 @@ typedef unsigned char *sk_buff_data_t;
* @dst_pending_confirm: need to confirm neighbour * @dst_pending_confirm: need to confirm neighbour
* @decrypted: Decrypted SKB * @decrypted: Decrypted SKB
* @slow_gro: state present at GRO time, slower prepare step required * @slow_gro: state present at GRO time, slower prepare step required
* @mono_delivery_time: When set, skb->tstamp has the
* delivery_time in mono clock base (i.e. EDT). Otherwise, the
* skb->tstamp has the (rcv) timestamp at ingress and
* delivery_time at egress.
* @napi_id: id of the NAPI struct this skb came from * @napi_id: id of the NAPI struct this skb came from
* @sender_cpu: (aka @napi_id) source CPU in XPS * @sender_cpu: (aka @napi_id) source CPU in XPS
* @secmark: security marking * @secmark: security marking
...@@ -965,6 +969,7 @@ struct sk_buff { ...@@ -965,6 +969,7 @@ struct sk_buff {
__u8 decrypted:1; __u8 decrypted:1;
#endif #endif
__u8 slow_gro:1; __u8 slow_gro:1;
__u8 mono_delivery_time:1;
#ifdef CONFIG_NET_SCHED #ifdef CONFIG_NET_SCHED
__u16 tc_index; /* traffic control index */ __u16 tc_index; /* traffic control index */
...@@ -3983,6 +3988,14 @@ static inline ktime_t net_timedelta(ktime_t t) ...@@ -3983,6 +3988,14 @@ static inline ktime_t net_timedelta(ktime_t t)
return ktime_sub(ktime_get_real(), t); return ktime_sub(ktime_get_real(), t);
} }
/**
 * skb_set_delivery_time - store a delivery time in skb->tstamp
 * @skb: buffer whose tstamp is written
 * @kt: value assigned to skb->tstamp
 * @mono: whether @kt is a mono-clock delivery_time; currently ignored
 *
 * For now skb->mono_delivery_time is always cleared, regardless of @mono.
 */
static inline void skb_set_delivery_time(struct sk_buff *skb, ktime_t kt,
bool mono)
{
skb->tstamp = kt;
/* Setting mono_delivery_time will be enabled later */
skb->mono_delivery_time = 0;
}
static inline u8 skb_metadata_len(const struct sk_buff *skb) static inline u8 skb_metadata_len(const struct sk_buff *skb)
{ {
return skb_shinfo(skb)->meta_len; return skb_shinfo(skb)->meta_len;
......
...@@ -32,6 +32,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk, ...@@ -32,6 +32,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk,
struct sk_buff *)) struct sk_buff *))
{ {
int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
bool mono_delivery_time = skb->mono_delivery_time;
unsigned int hlen, ll_rs, mtu; unsigned int hlen, ll_rs, mtu;
ktime_t tstamp = skb->tstamp; ktime_t tstamp = skb->tstamp;
struct ip_frag_state state; struct ip_frag_state state;
...@@ -81,7 +82,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk, ...@@ -81,7 +82,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk,
if (iter.frag) if (iter.frag)
ip_fraglist_prepare(skb, &iter); ip_fraglist_prepare(skb, &iter);
skb->tstamp = tstamp; skb_set_delivery_time(skb, tstamp, mono_delivery_time);
err = output(net, sk, data, skb); err = output(net, sk, data, skb);
if (err || !iter.frag) if (err || !iter.frag)
break; break;
...@@ -112,7 +113,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk, ...@@ -112,7 +113,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk,
goto blackhole; goto blackhole;
} }
skb2->tstamp = tstamp; skb_set_delivery_time(skb2, tstamp, mono_delivery_time);
err = output(net, sk, data, skb2); err = output(net, sk, data, skb2);
if (err) if (err)
goto blackhole; goto blackhole;
......
...@@ -761,6 +761,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, ...@@ -761,6 +761,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
{ {
struct iphdr *iph; struct iphdr *iph;
struct sk_buff *skb2; struct sk_buff *skb2;
bool mono_delivery_time = skb->mono_delivery_time;
struct rtable *rt = skb_rtable(skb); struct rtable *rt = skb_rtable(skb);
unsigned int mtu, hlen, ll_rs; unsigned int mtu, hlen, ll_rs;
struct ip_fraglist_iter iter; struct ip_fraglist_iter iter;
...@@ -852,7 +853,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, ...@@ -852,7 +853,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
} }
} }
skb->tstamp = tstamp; skb_set_delivery_time(skb, tstamp, mono_delivery_time);
err = output(net, sk, skb); err = output(net, sk, skb);
if (!err) if (!err)
...@@ -908,7 +909,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, ...@@ -908,7 +909,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
/* /*
* Put this fragment into the sending queue. * Put this fragment into the sending queue.
*/ */
skb2->tstamp = tstamp; skb_set_delivery_time(skb2, tstamp, mono_delivery_time);
err = output(net, sk, skb2); err = output(net, sk, skb2);
if (err) if (err)
goto fail; goto fail;
...@@ -1727,6 +1728,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, ...@@ -1727,6 +1728,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb,
arg->csumoffset) = csum_fold(csum_add(nskb->csum, arg->csumoffset) = csum_fold(csum_add(nskb->csum,
arg->csum)); arg->csum));
nskb->ip_summed = CHECKSUM_NONE; nskb->ip_summed = CHECKSUM_NONE;
/* Setting mono_delivery_time will be enabled later */
nskb->mono_delivery_time = 0;
ip_push_pending_frames(sk, &fl4); ip_push_pending_frames(sk, &fl4);
} }
out: out:
......
...@@ -1253,7 +1253,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, ...@@ -1253,7 +1253,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb,
tp = tcp_sk(sk); tp = tcp_sk(sk);
prior_wstamp = tp->tcp_wstamp_ns; prior_wstamp = tp->tcp_wstamp_ns;
tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache);
skb->skb_mstamp_ns = tp->tcp_wstamp_ns; skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true);
if (clone_it) { if (clone_it) {
oskb = skb; oskb = skb;
...@@ -1589,7 +1589,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, ...@@ -1589,7 +1589,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue,
skb_split(skb, buff, len); skb_split(skb, buff, len);
buff->tstamp = skb->tstamp; skb_set_delivery_time(buff, skb->tstamp, true);
tcp_fragment_tstamp(skb, buff); tcp_fragment_tstamp(skb, buff);
old_factor = tcp_skb_pcount(skb); old_factor = tcp_skb_pcount(skb);
...@@ -2616,7 +2616,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, ...@@ -2616,7 +2616,8 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle,
if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) {
/* "skb_mstamp_ns" is used as a start point for the retransmit timer */ /* "skb_mstamp_ns" is used as a start point for the retransmit timer */
skb->skb_mstamp_ns = tp->tcp_wstamp_ns = tp->tcp_clock_cache; tp->tcp_wstamp_ns = tp->tcp_clock_cache;
skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true);
list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue);
tcp_init_tso_segs(skb, mss_now); tcp_init_tso_segs(skb, mss_now);
goto repair; /* Skip network transmission */ goto repair; /* Skip network transmission */
...@@ -3541,11 +3542,12 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, ...@@ -3541,11 +3542,12 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
now = tcp_clock_ns(); now = tcp_clock_ns();
#ifdef CONFIG_SYN_COOKIES #ifdef CONFIG_SYN_COOKIES
if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok)) if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok))
skb->skb_mstamp_ns = cookie_init_timestamp(req, now); skb_set_delivery_time(skb, cookie_init_timestamp(req, now),
true);
else else
#endif #endif
{ {
skb->skb_mstamp_ns = now; skb_set_delivery_time(skb, now, true);
if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */ if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */
tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb); tcp_rsk(req)->snt_synack = tcp_skb_timestamp_us(skb);
} }
...@@ -3594,7 +3596,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, ...@@ -3594,7 +3596,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst,
bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb, bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb,
synack_type, &opts); synack_type, &opts);
skb->skb_mstamp_ns = now; skb_set_delivery_time(skb, now, true);
tcp_add_tx_delay(skb, tp); tcp_add_tx_delay(skb, tp);
return skb; return skb;
...@@ -3771,7 +3773,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) ...@@ -3771,7 +3773,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn)
err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation); err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation);
syn->skb_mstamp_ns = syn_data->skb_mstamp_ns; skb_set_delivery_time(syn, syn_data->skb_mstamp_ns, true);
/* Now full SYN+DATA was cloned and sent (or not), /* Now full SYN+DATA was cloned and sent (or not),
* remove the SYN from the original skb (syn_data) * remove the SYN from the original skb (syn_data)
......
...@@ -813,6 +813,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, ...@@ -813,6 +813,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
struct rt6_info *rt = (struct rt6_info *)skb_dst(skb); struct rt6_info *rt = (struct rt6_info *)skb_dst(skb);
struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ?
inet6_sk(skb->sk) : NULL; inet6_sk(skb->sk) : NULL;
bool mono_delivery_time = skb->mono_delivery_time;
struct ip6_frag_state state; struct ip6_frag_state state;
unsigned int mtu, hlen, nexthdr_offset; unsigned int mtu, hlen, nexthdr_offset;
ktime_t tstamp = skb->tstamp; ktime_t tstamp = skb->tstamp;
...@@ -903,7 +904,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, ...@@ -903,7 +904,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
if (iter.frag) if (iter.frag)
ip6_fraglist_prepare(skb, &iter); ip6_fraglist_prepare(skb, &iter);
skb->tstamp = tstamp; skb_set_delivery_time(skb, tstamp, mono_delivery_time);
err = output(net, sk, skb); err = output(net, sk, skb);
if (!err) if (!err)
IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), IP6_INC_STATS(net, ip6_dst_idev(&rt->dst),
...@@ -962,7 +963,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, ...@@ -962,7 +963,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
/* /*
* Put this fragment into the sending queue. * Put this fragment into the sending queue.
*/ */
frag->tstamp = tstamp; skb_set_delivery_time(frag, tstamp, mono_delivery_time);
err = output(net, sk, frag); err = output(net, sk, frag);
if (err) if (err)
goto fail; goto fail;
......
...@@ -121,6 +121,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, ...@@ -121,6 +121,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
struct sk_buff *)) struct sk_buff *))
{ {
int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size;
bool mono_delivery_time = skb->mono_delivery_time;
ktime_t tstamp = skb->tstamp; ktime_t tstamp = skb->tstamp;
struct ip6_frag_state state; struct ip6_frag_state state;
u8 *prevhdr, nexthdr = 0; u8 *prevhdr, nexthdr = 0;
...@@ -186,7 +187,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, ...@@ -186,7 +187,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
if (iter.frag) if (iter.frag)
ip6_fraglist_prepare(skb, &iter); ip6_fraglist_prepare(skb, &iter);
skb->tstamp = tstamp; skb_set_delivery_time(skb, tstamp, mono_delivery_time);
err = output(net, sk, data, skb); err = output(net, sk, data, skb);
if (err || !iter.frag) if (err || !iter.frag)
break; break;
...@@ -219,7 +220,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, ...@@ -219,7 +220,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb,
goto blackhole; goto blackhole;
} }
skb2->tstamp = tstamp; skb_set_delivery_time(skb2, tstamp, mono_delivery_time);
err = output(net, sk, data, skb2); err = output(net, sk, data, skb2);
if (err) if (err)
goto blackhole; goto blackhole;
......
...@@ -940,7 +940,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 ...@@ -940,7 +940,7 @@ static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32
} else { } else {
mark = sk->sk_mark; mark = sk->sk_mark;
} }
buff->tstamp = tcp_transmit_time(sk); skb_set_delivery_time(buff, tcp_transmit_time(sk), true);
} }
fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark; fl6.flowi6_mark = IP6_REPLY_MARK(net, skb->mark) ?: mark;
fl6.fl6_dport = t1->dest; fl6.fl6_dport = t1->dest;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment