Commit dda6a7a3 authored by David S. Miller's avatar David S. Miller

Merge branch 'ipv6-defrag-rbtree'

Peter Oskolkov says:

====================
net: IP defrag: use rbtrees in IPv6 defragmentation

Currently, IPv6 defragmentation code drops non-last fragments that
are smaller than 1280 bytes: see
commit 0ed4229b ("ipv6: defrag: drop non-last frags smaller than min mtu")

This behavior is not specified in IPv6 RFCs and appears to break compatibility
with some IPv6 implementations, as reported here:
https://www.spinics.net/lists/netdev/msg543846.html

This patchset contains four patches:
- patch 1 moves rbtree-related code from IPv4 to files shared b/w
IPv4/IPv6
- patch 2 changes IPv6 defragmenation code to use rbtrees for defrag
queue
- patch 3 changes nf_conntrack IPv6 defragmentation code to use rbtrees
- patch 4 changes ip_defrag selftest to test changes made in the
previous three patches.

Along the way, the 1280-byte restrictions are removed.

I plan to introduce similar changes to 6lowpan defragmentation code
once I figure out how to test it.
====================
Reviewed-by: default avatarEric Dumazet <edumazet@google.com>
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
parents ccaceadc 4c351048
......@@ -77,8 +77,8 @@ struct inet_frag_queue {
struct timer_list timer;
spinlock_t lock;
refcount_t refcnt;
struct sk_buff *fragments; /* Used in IPv6. */
struct rb_root rb_fragments; /* Used in IPv4. */
struct sk_buff *fragments; /* used in 6lopwpan IPv6. */
struct rb_root rb_fragments; /* Used in IPv4/IPv6. */
struct sk_buff *fragments_tail;
struct sk_buff *last_run_head;
ktime_t stamp;
......@@ -153,4 +153,16 @@ static inline void add_frag_mem_limit(struct netns_frags *nf, long val)
extern const u8 ip_frag_ecn_table[16];
/* Return values of inet_frag_queue_insert() */
#define IPFRAG_OK 0
#define IPFRAG_DUP 1
#define IPFRAG_OVERLAP 2
int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,
int offset, int end);
void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
struct sk_buff *parent);
void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
void *reasm_data);
struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q);
#endif
......@@ -82,8 +82,15 @@ ip6frag_expire_frag_queue(struct net *net, struct frag_queue *fq)
__IP6_INC_STATS(net, __in6_dev_get(dev), IPSTATS_MIB_REASMTIMEOUT);
/* Don't send error if the first segment did not arrive. */
head = fq->q.fragments;
if (!(fq->q.flags & INET_FRAG_FIRST_IN) || !head)
if (!(fq->q.flags & INET_FRAG_FIRST_IN))
goto out;
/* sk_buff::dev and sk_buff::rbnode are unionized. So we
* pull the head out of the tree in order to be able to
* deal with head->dev.
*/
head = inet_frag_pull_head(&fq->q);
if (!head)
goto out;
head->dev = dev;
......
......@@ -25,6 +25,62 @@
#include <net/sock.h>
#include <net/inet_frag.h>
#include <net/inet_ecn.h>
#include <net/ip.h>
#include <net/ipv6.h>
/* Use skb->cb to track consecutive/adjacent fragments coming at
* the end of the queue. Nodes in the rb-tree queue will
* contain "runs" of one or more adjacent fragments.
*
* Invariants:
* - next_frag is NULL at the tail of a "run";
* - the head of a "run" has the sum of all fragment lengths in frag_run_len.
*/
struct ipfrag_skb_cb {
union {
struct inet_skb_parm h4;
struct inet6_skb_parm h6;
};
struct sk_buff *next_frag;
int frag_run_len;
};
#define FRAG_CB(skb) ((struct ipfrag_skb_cb *)((skb)->cb))
static void fragcb_clear(struct sk_buff *skb)
{
RB_CLEAR_NODE(&skb->rbnode);
FRAG_CB(skb)->next_frag = NULL;
FRAG_CB(skb)->frag_run_len = skb->len;
}
/* Append skb to the last "run". */
static void fragrun_append_to_last(struct inet_frag_queue *q,
struct sk_buff *skb)
{
fragcb_clear(skb);
FRAG_CB(q->last_run_head)->frag_run_len += skb->len;
FRAG_CB(q->fragments_tail)->next_frag = skb;
q->fragments_tail = skb;
}
/* Create a new "run" with the skb. */
static void fragrun_create(struct inet_frag_queue *q, struct sk_buff *skb)
{
BUILD_BUG_ON(sizeof(struct ipfrag_skb_cb) > sizeof(skb->cb));
fragcb_clear(skb);
if (q->last_run_head)
rb_link_node(&skb->rbnode, &q->last_run_head->rbnode,
&q->last_run_head->rbnode.rb_right);
else
rb_link_node(&skb->rbnode, NULL, &q->rb_fragments.rb_node);
rb_insert_color(&skb->rbnode, &q->rb_fragments);
q->fragments_tail = skb;
q->last_run_head = skb;
}
/* Given the OR values of all fragments, apply RFC 3168 5.3 requirements
* Value : 0xff if frame should be dropped.
......@@ -123,6 +179,28 @@ static void inet_frag_destroy_rcu(struct rcu_head *head)
kmem_cache_free(f->frags_cachep, q);
}
unsigned int inet_frag_rbtree_purge(struct rb_root *root)
{
struct rb_node *p = rb_first(root);
unsigned int sum = 0;
while (p) {
struct sk_buff *skb = rb_entry(p, struct sk_buff, rbnode);
p = rb_next(p);
rb_erase(&skb->rbnode, root);
while (skb) {
struct sk_buff *next = FRAG_CB(skb)->next_frag;
sum += skb->truesize;
kfree_skb(skb);
skb = next;
}
}
return sum;
}
EXPORT_SYMBOL(inet_frag_rbtree_purge);
void inet_frag_destroy(struct inet_frag_queue *q)
{
struct sk_buff *fp;
......@@ -224,3 +302,218 @@ struct inet_frag_queue *inet_frag_find(struct netns_frags *nf, void *key)
return fq;
}
EXPORT_SYMBOL(inet_frag_find);
int inet_frag_queue_insert(struct inet_frag_queue *q, struct sk_buff *skb,
int offset, int end)
{
struct sk_buff *last = q->fragments_tail;
/* RFC5722, Section 4, amended by Errata ID : 3089
* When reassembling an IPv6 datagram, if
* one or more its constituent fragments is determined to be an
* overlapping fragment, the entire datagram (and any constituent
* fragments) MUST be silently discarded.
*
* Duplicates, however, should be ignored (i.e. skb dropped, but the
* queue/fragments kept for later reassembly).
*/
if (!last)
fragrun_create(q, skb); /* First fragment. */
else if (last->ip_defrag_offset + last->len < end) {
/* This is the common case: skb goes to the end. */
/* Detect and discard overlaps. */
if (offset < last->ip_defrag_offset + last->len)
return IPFRAG_OVERLAP;
if (offset == last->ip_defrag_offset + last->len)
fragrun_append_to_last(q, skb);
else
fragrun_create(q, skb);
} else {
/* Binary search. Note that skb can become the first fragment,
* but not the last (covered above).
*/
struct rb_node **rbn, *parent;
rbn = &q->rb_fragments.rb_node;
do {
struct sk_buff *curr;
int curr_run_end;
parent = *rbn;
curr = rb_to_skb(parent);
curr_run_end = curr->ip_defrag_offset +
FRAG_CB(curr)->frag_run_len;
if (end <= curr->ip_defrag_offset)
rbn = &parent->rb_left;
else if (offset >= curr_run_end)
rbn = &parent->rb_right;
else if (offset >= curr->ip_defrag_offset &&
end <= curr_run_end)
return IPFRAG_DUP;
else
return IPFRAG_OVERLAP;
} while (*rbn);
/* Here we have parent properly set, and rbn pointing to
* one of its NULL left/right children. Insert skb.
*/
fragcb_clear(skb);
rb_link_node(&skb->rbnode, parent, rbn);
rb_insert_color(&skb->rbnode, &q->rb_fragments);
}
skb->ip_defrag_offset = offset;
return IPFRAG_OK;
}
EXPORT_SYMBOL(inet_frag_queue_insert);
void *inet_frag_reasm_prepare(struct inet_frag_queue *q, struct sk_buff *skb,
struct sk_buff *parent)
{
struct sk_buff *fp, *head = skb_rb_first(&q->rb_fragments);
struct sk_buff **nextp;
int delta;
if (head != skb) {
fp = skb_clone(skb, GFP_ATOMIC);
if (!fp)
return NULL;
FRAG_CB(fp)->next_frag = FRAG_CB(skb)->next_frag;
if (RB_EMPTY_NODE(&skb->rbnode))
FRAG_CB(parent)->next_frag = fp;
else
rb_replace_node(&skb->rbnode, &fp->rbnode,
&q->rb_fragments);
if (q->fragments_tail == skb)
q->fragments_tail = fp;
skb_morph(skb, head);
FRAG_CB(skb)->next_frag = FRAG_CB(head)->next_frag;
rb_replace_node(&head->rbnode, &skb->rbnode,
&q->rb_fragments);
consume_skb(head);
head = skb;
}
WARN_ON(head->ip_defrag_offset != 0);
delta = -head->truesize;
/* Head of list must not be cloned. */
if (skb_unclone(head, GFP_ATOMIC))
return NULL;
delta += head->truesize;
if (delta)
add_frag_mem_limit(q->net, delta);
/* If the first fragment is fragmented itself, we split
* it to two chunks: the first with data and paged part
* and the second, holding only fragments.
*/
if (skb_has_frag_list(head)) {
struct sk_buff *clone;
int i, plen = 0;
clone = alloc_skb(0, GFP_ATOMIC);
if (!clone)
return NULL;
skb_shinfo(clone)->frag_list = skb_shinfo(head)->frag_list;
skb_frag_list_init(head);
for (i = 0; i < skb_shinfo(head)->nr_frags; i++)
plen += skb_frag_size(&skb_shinfo(head)->frags[i]);
clone->data_len = head->data_len - plen;
clone->len = clone->data_len;
head->truesize += clone->truesize;
clone->csum = 0;
clone->ip_summed = head->ip_summed;
add_frag_mem_limit(q->net, clone->truesize);
skb_shinfo(head)->frag_list = clone;
nextp = &clone->next;
} else {
nextp = &skb_shinfo(head)->frag_list;
}
return nextp;
}
EXPORT_SYMBOL(inet_frag_reasm_prepare);
void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head,
void *reasm_data)
{
struct sk_buff **nextp = (struct sk_buff **)reasm_data;
struct rb_node *rbn;
struct sk_buff *fp;
skb_push(head, head->data - skb_network_header(head));
/* Traverse the tree in order, to build frag_list. */
fp = FRAG_CB(head)->next_frag;
rbn = rb_next(&head->rbnode);
rb_erase(&head->rbnode, &q->rb_fragments);
while (rbn || fp) {
/* fp points to the next sk_buff in the current run;
* rbn points to the next run.
*/
/* Go through the current run. */
while (fp) {
*nextp = fp;
nextp = &fp->next;
fp->prev = NULL;
memset(&fp->rbnode, 0, sizeof(fp->rbnode));
fp->sk = NULL;
head->data_len += fp->len;
head->len += fp->len;
if (head->ip_summed != fp->ip_summed)
head->ip_summed = CHECKSUM_NONE;
else if (head->ip_summed == CHECKSUM_COMPLETE)
head->csum = csum_add(head->csum, fp->csum);
head->truesize += fp->truesize;
fp = FRAG_CB(fp)->next_frag;
}
/* Move to the next run. */
if (rbn) {
struct rb_node *rbnext = rb_next(rbn);
fp = rb_to_skb(rbn);
rb_erase(rbn, &q->rb_fragments);
rbn = rbnext;
}
}
sub_frag_mem_limit(q->net, head->truesize);
*nextp = NULL;
skb_mark_not_on_list(head);
head->prev = NULL;
head->tstamp = q->stamp;
}
EXPORT_SYMBOL(inet_frag_reasm_finish);
struct sk_buff *inet_frag_pull_head(struct inet_frag_queue *q)
{
struct sk_buff *head;
if (q->fragments) {
head = q->fragments;
q->fragments = head->next;
} else {
struct sk_buff *skb;
head = skb_rb_first(&q->rb_fragments);
if (!head)
return NULL;
skb = FRAG_CB(head)->next_frag;
if (skb)
rb_replace_node(&head->rbnode, &skb->rbnode,
&q->rb_fragments);
else
rb_erase(&head->rbnode, &q->rb_fragments);
memset(&head->rbnode, 0, sizeof(head->rbnode));
barrier();
}
if (head == q->fragments_tail)
q->fragments_tail = NULL;
sub_frag_mem_limit(q->net, head->truesize);
return head;
}
EXPORT_SYMBOL(inet_frag_pull_head);
This diff is collapsed.
This diff is collapsed.
This diff is collapsed.
......@@ -20,6 +20,7 @@ static bool cfg_do_ipv4;
static bool cfg_do_ipv6;
static bool cfg_verbose;
static bool cfg_overlap;
static bool cfg_permissive;
static unsigned short cfg_port = 9000;
const struct in_addr addr4 = { .s_addr = __constant_htonl(INADDR_LOOPBACK + 2) };
......@@ -35,7 +36,7 @@ const struct in6_addr addr6 = IN6ADDR_LOOPBACK_INIT;
static int payload_len;
static int max_frag_len;
#define MSG_LEN_MAX 60000 /* Max UDP payload length. */
#define MSG_LEN_MAX 10000 /* Max UDP payload length. */
#define IP4_MF (1u << 13) /* IPv4 MF flag. */
#define IP6_MF (1) /* IPv6 MF flag. */
......@@ -59,13 +60,14 @@ static void recv_validate_udp(int fd_udp)
msg_counter++;
if (cfg_overlap) {
if (ret != -1)
error(1, 0, "recv: expected timeout; got %d",
(int)ret);
if (errno != ETIMEDOUT && errno != EAGAIN)
error(1, errno, "recv: expected timeout: %d",
errno);
return; /* OK */
if (ret == -1 && (errno == ETIMEDOUT || errno == EAGAIN))
return; /* OK */
if (!cfg_permissive) {
if (ret != -1)
error(1, 0, "recv: expected timeout; got %d",
(int)ret);
error(1, errno, "recv: expected timeout: %d", errno);
}
}
if (ret == -1)
......@@ -203,7 +205,6 @@ static void send_udp_frags(int fd_raw, struct sockaddr *addr,
{
struct ip *iphdr = (struct ip *)ip_frame;
struct ip6_hdr *ip6hdr = (struct ip6_hdr *)ip_frame;
const bool ipv4 = !ipv6;
int res;
int offset;
int frag_len;
......@@ -251,7 +252,7 @@ static void send_udp_frags(int fd_raw, struct sockaddr *addr,
}
/* Occasionally test IPv4 "runs" (see net/ipv4/ip_fragment.c) */
if (ipv4 && !cfg_overlap && (rand() % 100 < 20) &&
if (!cfg_overlap && (rand() % 100 < 20) &&
(payload_len > 9 * max_frag_len)) {
offset = 6 * max_frag_len;
while (offset < (UDP_HLEN + payload_len)) {
......@@ -276,41 +277,38 @@ static void send_udp_frags(int fd_raw, struct sockaddr *addr,
while (offset < (UDP_HLEN + payload_len)) {
send_fragment(fd_raw, addr, alen, offset, ipv6);
/* IPv4 ignores duplicates, so randomly send a duplicate. */
if (ipv4 && (1 == rand() % 100))
if (rand() % 100 == 1)
send_fragment(fd_raw, addr, alen, offset, ipv6);
offset += 2 * max_frag_len;
}
if (cfg_overlap) {
/* Send an extra random fragment. */
/* Send an extra random fragment.
*
* Duplicates and some fragments completely inside
* previously sent fragments are dropped/ignored. So
* random offset and frag_len can result in a dropped
* fragment instead of a dropped queue/packet. Thus we
* hard-code offset and frag_len.
*/
if (max_frag_len * 4 < payload_len || max_frag_len < 16) {
/* not enough payload for random offset and frag_len. */
offset = 8;
frag_len = UDP_HLEN + max_frag_len;
} else {
offset = rand() % (payload_len / 2);
frag_len = 2 * max_frag_len + 1 + rand() % 256;
}
if (ipv6) {
struct ip6_frag *fraghdr = (struct ip6_frag *)(ip_frame + IP6_HLEN);
/* sendto() returns EINVAL if offset + frag_len is too small. */
offset = rand() % (UDP_HLEN + payload_len - 1);
frag_len = max_frag_len + rand() % 256;
/* In IPv6 if !!(frag_len % 8), the fragment is dropped. */
frag_len &= ~0x7;
fraghdr->ip6f_offlg = htons(offset / 8 | IP6_MF);
ip6hdr->ip6_plen = htons(frag_len);
frag_len += IP6_HLEN;
} else {
/* In IPv4, duplicates and some fragments completely inside
* previously sent fragments are dropped/ignored. So
* random offset and frag_len can result in a dropped
* fragment instead of a dropped queue/packet. So we
* hard-code offset and frag_len.
*
* See ade446403bfb ("net: ipv4: do not handle duplicate
* fragments as overlapping").
*/
if (max_frag_len * 4 < payload_len || max_frag_len < 16) {
/* not enough payload to play with random offset and frag_len. */
offset = 8;
frag_len = IP4_HLEN + UDP_HLEN + max_frag_len;
} else {
offset = rand() % (payload_len / 2);
frag_len = 2 * max_frag_len + 1 + rand() % 256;
}
frag_len += IP4_HLEN;
iphdr->ip_off = htons(offset / 8 | IP4_MF);
iphdr->ip_len = htons(frag_len);
}
......@@ -327,7 +325,7 @@ static void send_udp_frags(int fd_raw, struct sockaddr *addr,
while (offset < (UDP_HLEN + payload_len)) {
send_fragment(fd_raw, addr, alen, offset, ipv6);
/* IPv4 ignores duplicates, so randomly send a duplicate. */
if (ipv4 && (1 == rand() % 100))
if (rand() % 100 == 1)
send_fragment(fd_raw, addr, alen, offset, ipv6);
offset += 2 * max_frag_len;
}
......@@ -342,7 +340,7 @@ static void run_test(struct sockaddr *addr, socklen_t alen, bool ipv6)
*/
struct timeval tv = { .tv_sec = 1, .tv_usec = 10 };
int idx;
int min_frag_len = ipv6 ? 1280 : 8;
int min_frag_len = 8;
/* Initialize the payload. */
for (idx = 0; idx < MSG_LEN_MAX; ++idx)
......@@ -434,7 +432,7 @@ static void parse_opts(int argc, char **argv)
{
int c;
while ((c = getopt(argc, argv, "46ov")) != -1) {
while ((c = getopt(argc, argv, "46opv")) != -1) {
switch (c) {
case '4':
cfg_do_ipv4 = true;
......@@ -445,6 +443,9 @@ static void parse_opts(int argc, char **argv)
case 'o':
cfg_overlap = true;
break;
case 'p':
cfg_permissive = true;
break;
case 'v':
cfg_verbose = true;
break;
......
......@@ -20,6 +20,10 @@ setup() {
ip netns exec "${NETNS}" sysctl -w net.ipv6.ip6frag_low_thresh=7000000 >/dev/null 2>&1
ip netns exec "${NETNS}" sysctl -w net.ipv6.ip6frag_time=1 >/dev/null 2>&1
ip netns exec "${NETNS}" sysctl -w net.netfilter.nf_conntrack_frag6_high_thresh=9000000 >/dev/null 2>&1
ip netns exec "${NETNS}" sysctl -w net.netfilter.nf_conntrack_frag6_low_thresh=7000000 >/dev/null 2>&1
ip netns exec "${NETNS}" sysctl -w net.netfilter.nf_conntrack_frag6_timeout=1 >/dev/null 2>&1
# DST cache can get full with a lot of frags, with GC not keeping up with the test.
ip netns exec "${NETNS}" sysctl -w net.ipv6.route.max_size=65536 >/dev/null 2>&1
}
......@@ -43,4 +47,16 @@ ip netns exec "${NETNS}" ./ip_defrag -6
echo "ipv6 defrag with overlaps"
ip netns exec "${NETNS}" ./ip_defrag -6o
# insert an nf_conntrack rule so that the codepath in nf_conntrack_reasm.c taken
ip netns exec "${NETNS}" ip6tables -A INPUT -m conntrack --ctstate INVALID -j ACCEPT
echo "ipv6 nf_conntrack defrag"
ip netns exec "${NETNS}" ./ip_defrag -6
echo "ipv6 nf_conntrack defrag with overlaps"
# netfilter will drop some invalid packets, so we run the test in
# permissive mode: i.e. pass the test if the packet is correctly assembled
# even if we sent an overlap
ip netns exec "${NETNS}" ./ip_defrag -6op
echo "all tests done"
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment