Commit 06eb395f authored by Eric Dumazet, committed by David S. Miller

pkt_sched: fq: better control of DDOS traffic

FQ has a fast path for skbs attached to a socket, as it does not
have to compute a flow hash. But for other packets, FQ being
non-stochastic means that hosts exposed to random Internet traffic
can allocate millions of flow structures (104 bytes each) quite
easily. Not only can the host OOM, but lookups in the RB trees can
also consume too much CPU and memory.
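
For scale: one million such flows already pin about 1,000,000 * 104
bytes, roughly 100 MB of kernel memory, in flow structures alone,
before counting the queued skbs or the cost of walking ever-deeper
RB trees on every enqueue.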

This patch adds a new attribute, orphan_mask, which makes it
possible to use a stochastic hash for orphaned skbs.

Its default value is 1024 slots, to mimic SFQ behavior.
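
To illustrate the effect, here is a standalone userspace sketch (not
part of the patch; the variable names are mine): random hashes masked
with the default orphan_mask collapse onto at most orphan_mask + 1
buckets, so hostile traffic can no longer allocate one flow structure
per packet.

#include <stdio.h>
#include <stdlib.h>

int main(void)
{
	const unsigned long orphan_mask = 1024 - 1;	/* default from this patch */
	char seen[1024] = { 0 };			/* one slot per possible bucket */
	unsigned long distinct = 0;

	for (long i = 0; i < 1000000; i++) {
		/* rand() stands in for skb_get_hash() on a random packet */
		unsigned long bucket = (unsigned long)rand() & orphan_mask;

		if (!seen[bucket]) {
			seen[bucket] = 1;
			distinct++;
		}
	}
	/* Prints at most 1024: the orphan flow count is bounded by the
	 * mask, not by how many distinct packets an attacker sends.
	 */
	printf("1000000 random packets -> %lu orphan buckets\n", distinct);
	return 0;
}

With a matching iproute2, the mask would presumably be tuned with
something like `tc qdisc replace dev eth0 root fq orphan_mask 511`;
the tc option name is an assumption here, only the netlink attribute
TCA_FQ_ORPHAN_MASK is defined by this patch.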

Note: This does not apply to locally generated TCP traffic,
and no locally generated traffic will share a flow structure
with another perfect or stochastic flow.

This patch also handles the specific case of SYNACK messages:

They are attached to the listener socket, and therefore all map
to a single hash bucket. If the listener has set SO_MAX_PACING_RATE,
hoping that newly accepted sockets inherit this rate, SYNACK packets
might be paced and even dropped.
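
For context, the scenario being guarded against looks like this from
userspace (illustrative sketch, not from the patch; the fallback
define is the asm-generic value of the option, which differs on some
architectures):

#include <stdint.h>
#include <sys/socket.h>

#ifndef SO_MAX_PACING_RATE
#define SO_MAX_PACING_RATE 47	/* asm-generic value; arch-specific elsewhere */
#endif

/* A listener capping the pacing rate so that every accepted socket
 * inherits it.  Before this patch, the SYNACKs FQ saw for such a
 * listener all shared one flow keyed on the listener socket, so they
 * were paced at this rate too, and could be delayed or even dropped.
 */
static int cap_accepted_rate(int listen_fd, uint32_t bytes_per_sec)
{
	return setsockopt(listen_fd, SOL_SOCKET, SO_MAX_PACING_RATE,
			  &bytes_per_sec, sizeof(bytes_per_sec));
}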

This is very similar to an internal patch Google has used for more
than a year.
Signed-off-by: Eric Dumazet <edumazet@google.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent f2683b74
--- a/include/uapi/linux/pkt_sched.h
+++ b/include/uapi/linux/pkt_sched.h
@@ -774,6 +774,8 @@ enum {
 
 	TCA_FQ_FLOW_REFILL_DELAY,	/* flow credit refill delay in usec */
 
+	TCA_FQ_ORPHAN_MASK,	/* mask applied to orphaned skb hashes */
+
 	__TCA_FQ_MAX
 };
--- a/net/sched/sch_fq.c
+++ b/net/sched/sch_fq.c
@@ -93,6 +93,7 @@ struct fq_sched_data {
 	u32		flow_refill_delay;
 	u32		flow_max_rate;	/* optional max rate per flow */
 	u32		flow_plimit;	/* max packets per flow */
+	u32		orphan_mask;	/* mask for orphaned skb */
 	struct rb_root	*fq_root;
 	u8		rate_enable;
 	u8		fq_trees_log;
@@ -223,11 +224,20 @@ static struct fq_flow *fq_classify(struct sk_buff *skb, struct fq_sched_data *q)
 	if (unlikely((skb->priority & TC_PRIO_MAX) == TC_PRIO_CONTROL))
 		return &q->internal;
 
-	if (unlikely(!sk)) {
+	/* SYNACK messages are attached to a listener socket.
+	 * 1) They are not part of a 'flow' yet
+	 * 2) We do not want to rate limit them (eg SYNFLOOD attack),
+	 *    especially if the listener set SO_MAX_PACING_RATE
+	 * 3) We pretend they are orphaned
+	 */
+	if (!sk || sk->sk_state == TCP_LISTEN) {
+		unsigned long hash = skb_get_hash(skb) & q->orphan_mask;
+
 		/* By forcing low order bit to 1, we make sure to not
 		 * collide with a local flow (socket pointers are word aligned)
 		 */
-		sk = (struct sock *)(skb_get_hash(skb) | 1L);
+		sk = (struct sock *)((hash << 1) | 1UL);
+		skb_orphan(skb);
 	}
 
 	root = &q->fq_root[hash_32((u32)(long)sk, q->fq_trees_log)];
@@ -704,6 +714,9 @@ static int fq_change(struct Qdisc *sch, struct nlattr *opt)
 			q->flow_refill_delay = usecs_to_jiffies(usecs_delay);
 	}
 
+	if (tb[TCA_FQ_ORPHAN_MASK])
+		q->orphan_mask = nla_get_u32(tb[TCA_FQ_ORPHAN_MASK]);
+
 	if (!err) {
 		sch_tree_unlock(sch);
 		err = fq_resize(sch, fq_log);
@@ -749,6 +762,7 @@ static int fq_init(struct Qdisc *sch, struct nlattr *opt)
 	q->delayed		= RB_ROOT;
 	q->fq_root		= NULL;
 	q->fq_trees_log		= ilog2(1024);
+	q->orphan_mask		= 1024 - 1;
 	qdisc_watchdog_init(&q->watchdog, sch);
 
 	if (opt)
@@ -778,6 +792,7 @@ static int fq_dump(struct Qdisc *sch, struct sk_buff *skb)
 	    nla_put_u32(skb, TCA_FQ_FLOW_MAX_RATE, q->flow_max_rate) ||
 	    nla_put_u32(skb, TCA_FQ_FLOW_REFILL_DELAY,
 			jiffies_to_usecs(q->flow_refill_delay)) ||
+	    nla_put_u32(skb, TCA_FQ_ORPHAN_MASK, q->orphan_mask) ||
 	    nla_put_u32(skb, TCA_FQ_BUCKETS_LOG, q->fq_trees_log))
 		goto nla_put_failure;