Commit 041d6e04 authored by Patrick McHardy, committed by David S. Miller

[IPV4/IPV6]: Fix suboptimal fragment sizing for last fragment

Yoshifuji's recent fragment patch prevents unnecessary fragmentation
when the data can be kept in a single packet, but only for the first
packet. When fragmenting, all fragments are still truncated to
multiples of 8 and we might end up creating an unnecessary fragment.

This dump shows the problem (MTU 1499):

172.16.1.123.32771 > 172.16.195.3.4135: udp 2937 (frag 7066:1472@0+)
172.16.1.123 > 172.16.195.3: udp (frag 7066:1472@1472+)
172.16.1.123 > 172.16.195.3: udp (frag 7066:1@2944)

This patch always builds mtu sized fragments and truncates the previous
fragment to a multiple of 8 bytes when allocating a new one. With the
patch the dump looks like this:


172.16.1.123.32772 > 172.16.195.3.4135: udp 2937 (frag 49641:1472@0+)
172.16.1.123 > 172.16.195.3: udp (frag 49641:1473@1472)
Signed-off-by: Patrick McHardy <kaber@trash.net>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent 0dbf2dbb
...@@ -735,10 +735,10 @@ int ip_append_data(struct sock *sk, ...@@ -735,10 +735,10 @@ int ip_append_data(struct sock *sk,
int hh_len; int hh_len;
int exthdrlen; int exthdrlen;
int mtu; int mtu;
int copy = 0; int copy;
int err; int err;
int offset = 0; int offset = 0;
unsigned int maxfraglen, fragheaderlen, fraggap = 0; unsigned int maxfraglen, fragheaderlen;
int csummode = CHECKSUM_NONE; int csummode = CHECKSUM_NONE;
if (flags&MSG_PROBE) if (flags&MSG_PROBE)
...@@ -781,33 +781,19 @@ int ip_append_data(struct sock *sk, ...@@ -781,33 +781,19 @@ int ip_append_data(struct sock *sk,
hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
if (inet->cork.length + length > 0xFFFF - fragheaderlen) { if (inet->cork.length + length > 0xFFFF - fragheaderlen) {
ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen); ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu-exthdrlen);
return -EMSGSIZE; return -EMSGSIZE;
} }
/*
* Let's try using as much space as possible to avoid generating
* additional unnecessary small fragment of length
* (mtu-fragheaderlen)%8 if mtu-fragheaderlen is not 0 modulo 8.
* -- yoshfuji
*/
if (fragheaderlen + inet->cork.length + length <= mtu)
maxfraglen = mtu;
else
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
if (fragheaderlen + inet->cork.length <= mtu &&
fragheaderlen + inet->cork.length + length > mtu)
fraggap = 1;
/* /*
* transhdrlen > 0 means that this is the first fragment and we wish * transhdrlen > 0 means that this is the first fragment and we wish
* it won't be fragmented in the future. * it won't be fragmented in the future.
*/ */
if (transhdrlen && if (transhdrlen &&
length + fragheaderlen <= maxfraglen && length + fragheaderlen <= mtu &&
rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) && rt->u.dst.dev->features&(NETIF_F_IP_CSUM|NETIF_F_NO_CSUM|NETIF_F_HW_CSUM) &&
!exthdrlen) !exthdrlen)
csummode = CHECKSUM_HW; csummode = CHECKSUM_HW;
...@@ -821,34 +807,33 @@ int ip_append_data(struct sock *sk, ...@@ -821,34 +807,33 @@ int ip_append_data(struct sock *sk,
* adding appropriate IP header. * adding appropriate IP header.
*/ */
if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) { if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
fraggap = 0;
goto alloc_new_skb; goto alloc_new_skb;
}
while (length > 0) { while (length > 0) {
if ((copy = maxfraglen - skb->len) <= 0) { if ((copy = mtu - skb->len) <= 0) {
char *data; char *data;
unsigned int datalen; unsigned int datalen;
unsigned int fraglen; unsigned int fraglen;
unsigned int fraggap;
unsigned int alloclen; unsigned int alloclen;
struct sk_buff *skb_prev; struct sk_buff *skb_prev;
BUG_TRAP(fraggap || copy == 0); BUG_TRAP(copy == 0);
alloc_new_skb: alloc_new_skb:
skb_prev = skb; skb_prev = skb;
fraggap = 0;
if (skb_prev)
fraggap = mtu - maxfraglen;
if (fraggap) datalen = mtu - fragheaderlen;
fraggap = -copy;
datalen = maxfraglen - fragheaderlen;
if (datalen > length + fraggap) if (datalen > length + fraggap)
datalen = length + fraggap; datalen = length + fraggap;
fraglen = datalen + fragheaderlen; fraglen = datalen + fragheaderlen;
if ((flags & MSG_MORE) && if ((flags & MSG_MORE) &&
!(rt->u.dst.dev->features&NETIF_F_SG)) !(rt->u.dst.dev->features&NETIF_F_SG))
alloclen = maxfraglen; alloclen = mtu;
else else
alloclen = datalen + fragheaderlen; alloclen = datalen + fragheaderlen;
...@@ -913,7 +898,6 @@ int ip_append_data(struct sock *sk, ...@@ -913,7 +898,6 @@ int ip_append_data(struct sock *sk,
length -= datalen - fraggap; length -= datalen - fraggap;
transhdrlen = 0; transhdrlen = 0;
exthdrlen = 0; exthdrlen = 0;
fraggap = 0;
csummode = CHECKSUM_NONE; csummode = CHECKSUM_NONE;
/* /*
...@@ -1006,7 +990,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, ...@@ -1006,7 +990,7 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
int mtu; int mtu;
int len; int len;
int err; int err;
unsigned int maxfraglen, fragheaderlen, fraggap = 0; unsigned int maxfraglen, fragheaderlen, fraggap;
if (inet->hdrincl) if (inet->hdrincl)
return -EPERM; return -EPERM;
...@@ -1028,27 +1012,13 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, ...@@ -1028,27 +1012,13 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
mtu = inet->cork.fragsize; mtu = inet->cork.fragsize;
fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0); fragheaderlen = sizeof(struct iphdr) + (opt ? opt->optlen : 0);
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
if (inet->cork.length + size > 0xFFFF - fragheaderlen) { if (inet->cork.length + size > 0xFFFF - fragheaderlen) {
ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu); ip_local_error(sk, EMSGSIZE, rt->rt_dst, inet->dport, mtu);
return -EMSGSIZE; return -EMSGSIZE;
} }
/*
* Let's try using as much space as possible to avoid generating
* additional unnecessary small fragment of length
* (mtu-fragheaderlen)%8 if mtu-fragheaderlen is not 0 modulo 8.
* -- yoshfuji
*/
if (fragheaderlen + inet->cork.length + size <= mtu)
maxfraglen = mtu;
else
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen;
if (fragheaderlen + inet->cork.length <= mtu &&
fragheaderlen + inet->cork.length + size > mtu)
fraggap = 1;
if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
return -EINVAL; return -EINVAL;
...@@ -1056,17 +1026,18 @@ ssize_t ip_append_page(struct sock *sk, struct page *page, ...@@ -1056,17 +1026,18 @@ ssize_t ip_append_page(struct sock *sk, struct page *page,
while (size > 0) { while (size > 0) {
int i; int i;
if ((len = maxfraglen - skb->len) <= 0) { if ((len = mtu - skb->len) <= 0) {
struct sk_buff *skb_prev; struct sk_buff *skb_prev;
char *data; char *data;
struct iphdr *iph; struct iphdr *iph;
int alloclen; int alloclen;
BUG_TRAP(fraggap || len == 0); BUG_TRAP(len == 0);
skb_prev = skb; skb_prev = skb;
if (fraggap) fraggap = 0;
fraggap = -len; if (skb_prev)
fraggap = mtu - maxfraglen;
alloclen = fragheaderlen + hh_len + fraggap + 15; alloclen = fragheaderlen + hh_len + fraggap + 15;
skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation); skb = sock_wmalloc(sk, alloclen, 1, sk->sk_allocation);
......
...@@ -814,11 +814,11 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offse ...@@ -814,11 +814,11 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offse
struct inet_opt *inet = inet_sk(sk); struct inet_opt *inet = inet_sk(sk);
struct ipv6_pinfo *np = inet6_sk(sk); struct ipv6_pinfo *np = inet6_sk(sk);
struct sk_buff *skb; struct sk_buff *skb;
unsigned int maxfraglen, fragheaderlen, fraggap = 0; unsigned int maxfraglen, fragheaderlen;
int exthdrlen; int exthdrlen;
int hh_len; int hh_len;
int mtu; int mtu;
int copy = 0; int copy;
int err; int err;
int offset = 0; int offset = 0;
int csummode = CHECKSUM_NONE; int csummode = CHECKSUM_NONE;
...@@ -867,6 +867,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offse ...@@ -867,6 +867,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offse
hh_len = LL_RESERVED_SPACE(rt->u.dst.dev); hh_len = LL_RESERVED_SPACE(rt->u.dst.dev);
fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0); fragheaderlen = sizeof(struct ipv6hdr) + (opt ? opt->opt_nflen : 0);
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen - sizeof(struct frag_hdr);
if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) { if (mtu <= sizeof(struct ipv6hdr) + IPV6_MAXPLEN) {
if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) { if (inet->cork.length + length > sizeof(struct ipv6hdr) + IPV6_MAXPLEN - fragheaderlen) {
...@@ -883,46 +884,37 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offse ...@@ -883,46 +884,37 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offse
* *
* Note that we may need to "move" the data from the tail of * Note that we may need to "move" the data from the tail of
* of the buffer to the new fragment when we split * of the buffer to the new fragment when we split
* the message at the first time. * the message.
* *
* FIXME: It may be fragmented into multiple chunks * FIXME: It may be fragmented into multiple chunks
* at once if non-fragmentable extension headers * at once if non-fragmentable extension headers
* are too large. * are too large.
* --yoshfuji * --yoshfuji
*/ */
if (fragheaderlen + inet->cork.length + length <= mtu)
maxfraglen = mtu;
else
maxfraglen = ((mtu - fragheaderlen) & ~7) + fragheaderlen
- sizeof(struct frag_hdr);
if (fragheaderlen + inet->cork.length <= mtu &&
fragheaderlen + inet->cork.length + length > mtu)
fraggap = 1;
inet->cork.length += length; inet->cork.length += length;
if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL) { if ((skb = skb_peek_tail(&sk->sk_write_queue)) == NULL)
fraggap = 0;
goto alloc_new_skb; goto alloc_new_skb;
}
while (length > 0) { while (length > 0) {
if ((copy = maxfraglen - skb->len) <= 0) { if ((copy = mtu - skb->len) <= 0) {
char *data; char *data;
unsigned int datalen; unsigned int datalen;
unsigned int fraglen; unsigned int fraglen;
unsigned int fraggap;
unsigned int alloclen; unsigned int alloclen;
struct sk_buff *skb_prev; struct sk_buff *skb_prev;
BUG_TRAP(fraggap || copy == 0); BUG_TRAP(copy == 0);
alloc_new_skb: alloc_new_skb:
skb_prev = skb; skb_prev = skb;
/* There's no room in the current skb */ /* There's no room in the current skb */
if (fraggap) fraggap = 0;
fraggap = -copy; if (skb_prev)
fraggap = mtu - maxfraglen;
datalen = maxfraglen - fragheaderlen; datalen = mtu - fragheaderlen;
if (datalen > length + fraggap) if (datalen > length + fraggap)
datalen = length + fraggap; datalen = length + fraggap;
...@@ -930,7 +922,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offse ...@@ -930,7 +922,7 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offse
fraglen = datalen + fragheaderlen; fraglen = datalen + fragheaderlen;
if ((flags & MSG_MORE) && if ((flags & MSG_MORE) &&
!(rt->u.dst.dev->features&NETIF_F_SG)) !(rt->u.dst.dev->features&NETIF_F_SG))
alloclen = maxfraglen; alloclen = mtu;
else else
alloclen = datalen + fragheaderlen; alloclen = datalen + fragheaderlen;
...@@ -1005,7 +997,6 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offse ...@@ -1005,7 +997,6 @@ int ip6_append_data(struct sock *sk, int getfrag(void *from, char *to, int offse
length -= datalen - fraggap; length -= datalen - fraggap;
transhdrlen = 0; transhdrlen = 0;
exthdrlen = 0; exthdrlen = 0;
fraggap = 0;
csummode = CHECKSUM_NONE; csummode = CHECKSUM_NONE;
/* /*
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment