Commit a97f4fe6 authored by David S. Miller

Merge branch 'fc-quic-pacing'

Eric Dumazet says:

====================
net_sched: sch_fq: enable in-kernel pacing for QUIC servers

Willem added GSO support to the UDP stack, greatly improving the performance
of QUIC servers.

We also want to enable in-kernel pacing, which is possible thanks to the EDT
(Earliest Departure Time) model, since each sendmsg() can provide a timestamp
for the skbs.

We have to change sch_fq to enable feeding packets in arbitrary EDT order,
and make sure that packet classification does not trust unconnected sockets.

Note that this patch series is also a prerequisite for a future TCP change
enabling per-flow delays/reorders/losses to implement high-performance
TCP emulators.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents a55a385d 37c0aead
...@@ -54,10 +54,23 @@ ...@@ -54,10 +54,23 @@
#include <net/tcp_states.h> #include <net/tcp_states.h>
#include <net/tcp.h> #include <net/tcp.h>
/*
 * Per-skb private data used by sch_fq. It is accessed through fq_skb_cb(),
 * which maps it onto the data area of the skb's qdisc control block.
 */
struct fq_skb_cb {
	/* Departure time recorded at enqueue: skb->tstamp, or ktime_get_ns() when tstamp is 0. */
	u64 time_to_send;
};
/*
 * Return the sch_fq private area of @skb.
 *
 * The private area lives in the data[] member of the skb's qdisc control
 * block; qdisc_cb_private_validate() is handed the size we need before the
 * pointer is taken (presumably to check that it fits -- see its definition).
 */
static inline struct fq_skb_cb *fq_skb_cb(struct sk_buff *skb)
{
	struct fq_skb_cb *cb;

	qdisc_cb_private_validate(skb, sizeof(*cb));
	cb = (struct fq_skb_cb *)qdisc_skb_cb(skb)->data;

	return cb;
}
/* /*
* Per flow structure, dynamically allocated * Per flow structure, dynamically allocated.
* If packets have monotonically increasing time_to_send, they are placed in O(1)
* in the linear list (head, tail); otherwise they are placed in an rbtree (t_root).
*/ */
struct fq_flow { struct fq_flow {
struct rb_root t_root;
struct sk_buff *head; /* list of skbs for this flow : first skb */ struct sk_buff *head; /* list of skbs for this flow : first skb */
union { union {
struct sk_buff *tail; /* last skb in the list */ struct sk_buff *tail; /* last skb in the list */
...@@ -257,6 +270,17 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) ...@@ -257,6 +270,17 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
*/ */
sk = (struct sock *)((hash << 1) | 1UL); sk = (struct sock *)((hash << 1) | 1UL);
skb_orphan(skb); skb_orphan(skb);
} else if (sk->sk_state == TCP_CLOSE) {
unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
/*
* Sockets in TCP_CLOSE are non connected.
* Typical use case is UDP sockets, they can send packets
* with sendto() to many different destinations.
* We probably could use a generic bit advertising
* non connected sockets, instead of sk_state == TCP_CLOSE,
* if we care enough.
*/
sk = (struct sock *)((hash << 1) | 1UL);
} }
root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)]; root = &q->fq_root[hash_ptr(sk, q->fq_trees_log)];
...@@ -277,7 +301,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) ...@@ -277,7 +301,7 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
* If not, we need to refill credit with * If not, we need to refill credit with
* initial quantum * initial quantum
*/ */
if (unlikely(skb->sk && if (unlikely(skb->sk == sk &&
f->socket_hash != sk->sk_hash)) { f->socket_hash != sk->sk_hash)) {
f->credit = q->initial_quantum; f->credit = q->initial_quantum;
f->socket_hash = sk->sk_hash; f->socket_hash = sk->sk_hash;
...@@ -298,9 +322,11 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) ...@@ -298,9 +322,11 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
q->stat_allocation_errors++; q->stat_allocation_errors++;
return &q->internal; return &q->internal;
} }
/* f->t_root is already zeroed after kmem_cache_zalloc() */
fq_flow_set_detached(f); fq_flow_set_detached(f);
f->sk = sk; f->sk = sk;
if (skb->sk) if (skb->sk == sk)
f->socket_hash = sk->sk_hash; f->socket_hash = sk->sk_hash;
f->credit = q->initial_quantum; f->credit = q->initial_quantum;
...@@ -312,14 +338,40 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q) ...@@ -312,14 +338,40 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
return f; return f;
} }
/*
 * Look at, without removing, the next skb to send for @flow.
 *
 * A flow keeps its packets in two places: a linear list (flow->head) and an
 * rbtree (flow->t_root). The candidate from each is compared by time_to_send
 * and the earlier one is returned; when both are equal the list head is
 * preferred. Returns NULL when the flow holds no packet at all.
 */
static struct sk_buff *fq_peek(struct fq_flow *flow)
{
	struct sk_buff *tree_skb = skb_rb_first(&flow->t_root);
	struct sk_buff *list_skb = flow->head;

	/* At most one container has a candidate: return it (possibly NULL). */
	if (!tree_skb || !list_skb)
		return tree_skb ? tree_skb : list_skb;

	/* Strict comparison: ties go to the linear list. */
	return fq_skb_cb(tree_skb)->time_to_send <
	       fq_skb_cb(list_skb)->time_to_send ? tree_skb : list_skb;
}
/*
 * Unlink @skb from whichever container of @flow currently holds it.
 *
 * @skb is either the head of the linear list or a node of flow->t_root;
 * anything that is not flow->head is treated as an rbtree node.
 */
static void fq_erase_head(struct Qdisc *sch, struct fq_flow *flow,
			  struct sk_buff *skb)
{
	if (skb != flow->head) {
		rb_erase(&skb->rbnode, &flow->t_root);
		/*
		 * Re-point skb->dev at this qdisc's device.
		 * NOTE(review): presumably needed because rbnode shares storage
		 * with skb->dev -- confirm against struct sk_buff.
		 */
		skb->dev = qdisc_dev(sch);
		return;
	}

	/* List case: advance the head to the following skb. */
	flow->head = skb->next;
}
/* remove one skb from head of flow queue */ /* remove one skb from head of flow queue */
static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow) static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
{ {
struct sk_buff *skb = flow->head; struct sk_buff *skb = fq_peek(flow);
if (skb) { if (skb) {
flow->head = skb->next; fq_erase_head(sch, flow, skb);
skb_mark_not_on_list(skb); skb_mark_not_on_list(skb);
flow->qlen--; flow->qlen--;
qdisc_qstats_backlog_dec(sch, skb); qdisc_qstats_backlog_dec(sch, skb);
...@@ -330,15 +382,36 @@ static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow) ...@@ -330,15 +382,36 @@ static struct sk_buff *fq_dequeue_head(struct Qdisc *sch, struct fq_flow *flow)
static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb) static void flow_queue_add(struct fq_flow *flow, struct sk_buff *skb)
{ {
struct sk_buff *head = flow->head; struct rb_node **p, *parent;
struct sk_buff *head, *aux;
skb->next = NULL; fq_skb_cb(skb)->time_to_send = skb->tstamp ?: ktime_get_ns();
if (!head)
flow->head = skb; head = flow->head;
else if (!head ||
flow->tail->next = skb; fq_skb_cb(skb)->time_to_send >= fq_skb_cb(flow->tail)->time_to_send) {
if (!head)
flow->head = skb;
else
flow->tail->next = skb;
flow->tail = skb;
skb->next = NULL;
return;
}
p = &flow->t_root.rb_node;
parent = NULL;
flow->tail = skb; while (*p) {
parent = *p;
aux = rb_to_skb(parent);
if (fq_skb_cb(skb)->time_to_send >= fq_skb_cb(aux)->time_to_send)
p = &parent->rb_right;
else
p = &parent->rb_left;
}
rb_link_node(&skb->rbnode, parent, p);
rb_insert_color(&skb->rbnode, &flow->t_root);
} }
static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch, static int fq_enqueue(struct sk_buff *skb, struct Qdisc *sch,
...@@ -450,9 +523,9 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch) ...@@ -450,9 +523,9 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
goto begin; goto begin;
} }
skb = f->head; skb = fq_peek(f);
if (skb) { if (skb) {
u64 time_next_packet = max_t(u64, ktime_to_ns(skb->tstamp), u64 time_next_packet = max_t(u64, fq_skb_cb(skb)->time_to_send,
f->time_next_packet); f->time_next_packet);
if (now < time_next_packet) { if (now < time_next_packet) {
...@@ -533,6 +606,15 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch) ...@@ -533,6 +606,15 @@ static struct sk_buff *fq_dequeue(struct Qdisc *sch)
static void fq_flow_purge(struct fq_flow *flow) static void fq_flow_purge(struct fq_flow *flow)
{ {
struct rb_node *p = rb_first(&flow->t_root);
while (p) {
struct sk_buff *skb = rb_to_skb(p);
p = rb_next(p);
rb_erase(&skb->rbnode, &flow->t_root);
rtnl_kfree_skbs(skb, skb);
}
rtnl_kfree_skbs(flow->head, flow->tail); rtnl_kfree_skbs(flow->head, flow->tail);
flow->head = NULL; flow->head = NULL;
flow->qlen = 0; flow->qlen = 0;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment