Commit c30f1fc0 authored by David S. Miller

Merge branch 'ip-Use-rb-trees-for-IP-frag-queue'

Peter Oskolkov says:

====================
ip: Use rb trees for IP frag queue.

This patchset
 * changes IPv4 defrag behavior to match that of IPv6: overlapping
   fragments now cause the whole IP datagram to be discarded (suggested
   by David Miller), since there are no legitimate use cases for
   overlapping fragments;
 * changes the IPv4 defrag queue from a list to an rb tree (suggested
   by Eric Dumazet): this change removes a potential attack vector.

Upcoming patches will contain similar changes for IPv6 frag queue,
as well as a comprehensive IP defrag self-test (temporarily delayed).
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents cfb4099f fa0f5273
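
The core of the series is the IPv4 defrag rewrite itself (its diff is collapsed further down), so the idea is easiest to see in a sketch. The queue keeps fragments in an rb tree keyed by their offset in the original datagram; because accepted fragments never overlap, a single descent finds the insertion point, and any overlap detected on the way down makes the caller drop the whole datagram. The helper below is only an illustration of that approach, not code from this series: the function name, error handling, and the use of skb->len as the fragment length are assumptions, while rbnode and ip_defrag_offset are the sk_buff fields touched by the first hunk.

#include <linux/errno.h>
#include <linux/rbtree.h>
#include <linux/skbuff.h>

/* Sketch only: insert one fragment covering [offset, end) into the
 * per-queue rb tree, keyed by offset.  Returns -EINVAL on any overlap,
 * in which case the caller is expected to discard the whole datagram.
 */
static int frag_tree_insert(struct rb_root *root, struct sk_buff *skb,
			    int offset, int end)
{
	struct rb_node **link = &root->rb_node, *parent = NULL;

	while (*link) {
		struct sk_buff *cur = rb_entry(*link, struct sk_buff, rbnode);

		parent = *link;
		if (end <= cur->ip_defrag_offset)
			link = &parent->rb_left;	/* entirely before cur */
		else if (offset >= cur->ip_defrag_offset + (int)cur->len)
			link = &parent->rb_right;	/* entirely after cur */
		else
			return -EINVAL;			/* overlap: drop the whole datagram */
	}
	skb->ip_defrag_offset = offset;
	rb_link_node(&skb->rbnode, parent, link);
	rb_insert_color(&skb->rbnode, root);
	return 0;
}
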
@@ -676,13 +676,16 @@ struct sk_buff {
 				 * UDP receive path is one user.
 				 */
 				unsigned long		dev_scratch;
-				int			ip_defrag_offset;
 			};
 		};
-		struct rb_node	rbnode; /* used in netem & tcp stack */
+		struct rb_node		rbnode; /* used in netem, ip4 defrag, and tcp stack */
 		struct list_head	list;
 	};
-	struct sock		*sk;
+
+	union {
+		struct sock		*sk;
+		int			ip_defrag_offset;
+	};
 
 	union {
 		ktime_t		tstamp;
@@ -2585,7 +2588,7 @@ static inline void __skb_queue_purge(struct sk_buff_head *list)
 		kfree_skb(skb);
 }
 
-void skb_rbtree_purge(struct rb_root *root);
+unsigned int skb_rbtree_purge(struct rb_root *root);
 
 void *netdev_alloc_frag(unsigned int fragsz);
@@ -75,7 +75,8 @@ struct inet_frag_queue {
 	struct timer_list	timer;
 	spinlock_t		lock;
 	refcount_t		refcnt;
-	struct sk_buff		*fragments;
+	struct sk_buff		*fragments;   /* Used in IPv6. */
+	struct rb_root		rb_fragments; /* Used in IPv4. */
 	struct sk_buff		*fragments_tail;
 	ktime_t			stamp;
 	int			len;
@@ -56,6 +56,7 @@ enum
 	IPSTATS_MIB_ECT1PKTS,			/* InECT1Pkts */
 	IPSTATS_MIB_ECT0PKTS,			/* InECT0Pkts */
 	IPSTATS_MIB_CEPKTS,			/* InCEPkts */
+	IPSTATS_MIB_REASM_OVERLAPS,		/* ReasmOverlaps */
 	__IPSTATS_MIB_MAX
 };
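
The new ReasmOverlaps counter gives the drop-on-overlap policy some visibility: the IPv4 defrag code (in the collapsed diff) bumps it whenever a datagram is discarded because of an overlapping fragment, and the proc hunk further down exports it in the IpExt group of /proc/net/netstat. A minimal, hypothetical call site could look like the following; the function name is made up, and only IPSTATS_MIB_REASM_OVERLAPS, __IP_INC_STATS() and inet_frag_kill() are existing kernel symbols.

#include <net/ip.h>
#include <net/inet_frag.h>

/* Sketch of how a defrag queue might be torn down when an overlapping
 * fragment arrives: account the event, then kill the whole queue so the
 * datagram is never reassembled.
 */
static void frag_drop_overlapping(struct net *net, struct inet_frag_queue *q)
{
	__IP_INC_STATS(net, IPSTATS_MIB_REASM_OVERLAPS);
	inet_frag_kill(q);
}
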
@@ -2858,23 +2858,27 @@ EXPORT_SYMBOL(skb_queue_purge);
 /**
  *	skb_rbtree_purge - empty a skb rbtree
  *	@root: root of the rbtree to empty
+ *	Return value: the sum of truesizes of all purged skbs.
  *
  *	Delete all buffers on an &sk_buff rbtree. Each buffer is removed from
  *	the list and one reference dropped. This function does not take
  *	any lock. Synchronization should be handled by the caller (e.g., TCP
  *	out-of-order queue is protected by the socket lock).
  */
-void skb_rbtree_purge(struct rb_root *root)
+unsigned int skb_rbtree_purge(struct rb_root *root)
 {
 	struct rb_node *p = rb_first(root);
+	unsigned int sum = 0;
 
 	while (p) {
 		struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
 
 		p = rb_next(p);
 		rb_erase(&skb->rbnode, root);
+		sum += skb->truesize;
 		kfree_skb(skb);
 	}
+	return sum;
 }
 
 /**
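
Returning the summed truesize lets a caller purge a whole tree and fix up its memory accounting with one call, which is exactly what the inet_frag_destroy() hunk below does. As a standalone illustration (the function name here is invented; skb_rbtree_purge(), sub_frag_mem_limit() and the rb_fragments field are real):

#include <linux/skbuff.h>
#include <net/inet_frag.h>

/* Sketch: flush a queue's rb tree of fragments and return the freed
 * skb memory to the per-netns fragment accounting.
 */
static void frag_queue_flush(struct netns_frags *nf, struct inet_frag_queue *q)
{
	unsigned int sum_truesize;

	sum_truesize = skb_rbtree_purge(&q->rb_fragments);
	sub_frag_mem_limit(nf, sum_truesize);
}
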
@@ -137,12 +137,16 @@ void inet_frag_destroy(struct inet_frag_queue *q)
 	fp = q->fragments;
 	nf = q->net;
 	f = nf->f;
-	while (fp) {
-		struct sk_buff *xp = fp->next;
-
-		sum_truesize += fp->truesize;
-		kfree_skb(fp);
-		fp = xp;
+	if (fp) {
+		do {
+			struct sk_buff *xp = fp->next;
+
+			sum_truesize += fp->truesize;
+			kfree_skb(fp);
+			fp = xp;
+		} while (fp);
+	} else {
+		sum_truesize = skb_rbtree_purge(&q->rb_fragments);
 	}
 	sum = sum_truesize + f->qsize;
This diff is collapsed.
@@ -119,6 +119,7 @@ static const struct snmp_mib snmp4_ipextstats_list[] = {
 	SNMP_MIB_ITEM("InECT1Pkts", IPSTATS_MIB_ECT1PKTS),
 	SNMP_MIB_ITEM("InECT0Pkts", IPSTATS_MIB_ECT0PKTS),
 	SNMP_MIB_ITEM("InCEPkts", IPSTATS_MIB_CEPKTS),
+	SNMP_MIB_ITEM("ReasmOverlaps", IPSTATS_MIB_REASM_OVERLAPS),
 	SNMP_MIB_SENTINEL
 };
@@ -463,6 +463,7 @@ nf_ct_frag6_reasm(struct frag_queue *fq, struct sk_buff *prev, struct net_devic
 					  head->csum);
 
 	fq->q.fragments = NULL;
+	fq->q.rb_fragments = RB_ROOT;
 	fq->q.fragments_tail = NULL;
 
 	return true;
@@ -405,6 +405,7 @@ static int ip6_frag_reasm(struct frag_queue *fq, struct sk_buff *prev,
 	__IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMOKS);
 	rcu_read_unlock();
 	fq->q.fragments = NULL;
+	fq->q.rb_fragments = RB_ROOT;
 	fq->q.fragments_tail = NULL;
 	return 1;