Commit 89cee8b1, authored by Herbert Xu, committed by David S. Miller

[IPV4]: Safer reassembly

Another spin of Herbert Xu's "safer ip reassembly" patch
for 2.6.16.

(The original patch is here:
http://marc.theaimsgroup.com/?l=linux-netdev&m=112281936522415&w=2
and my only contribution is to have tested it.)

This patch (optionally) does additional checks before accepting IP
fragments, which can greatly reduce the possibility of reassembling
fragments which originated from different IP datagrams.
Signed-off-by: Herbert Xu <herbert@gondor.apana.org.au>
Signed-off-by: Arthur Kepner <akepner@sgi.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent d5228a4f
...@@ -46,6 +46,29 @@ ipfrag_secret_interval - INTEGER ...@@ -46,6 +46,29 @@ ipfrag_secret_interval - INTEGER
for the hash secret) for IP fragments. for the hash secret) for IP fragments.
Default: 600 Default: 600
ipfrag_max_dist - INTEGER
ipfrag_max_dist is a non-negative integer value which defines the
maximum "disorder" which is allowed among fragments which share a
common IP source address. Note that reordering of packets is
not unusual, but if a large number of fragments arrive from a source
IP address while a particular fragment queue remains incomplete, it
probably indicates that one or more fragments belonging to that queue
have been lost. When ipfrag_max_dist is positive, an additional check
is done on fragments before they are added to a reassembly queue - if
ipfrag_max_dist (or more) fragments have arrived from a particular IP
address between additions to any IP fragment queue using that source
address, it's presumed that one or more fragments in the queue are
lost. The existing fragment queue will be dropped, and a new one
started. An ipfrag_max_dist value of zero disables this check.
Using a very small value, e.g. 1 or 2, for ipfrag_max_dist can
result in unnecessarily dropping fragment queues when normal
reordering of packets occurs, which could lead to poor application
performance. Using a very large value, e.g. 50000, increases the
likelihood of incorrectly reassembling IP fragments that originate
from different IP datagrams, which could result in data corruption.
Default: 64
INET peer storage: INET peer storage:
inet_peer_threshold - INTEGER inet_peer_threshold - INTEGER
......
...@@ -390,6 +390,7 @@ enum ...@@ -390,6 +390,7 @@ enum
NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109, NET_IPV4_ICMP_ERRORS_USE_INBOUND_IFADDR=109,
NET_TCP_CONG_CONTROL=110, NET_TCP_CONG_CONTROL=110,
NET_TCP_ABC=111, NET_TCP_ABC=111,
NET_IPV4_IPFRAG_MAX_DIST=112,
}; };
enum { enum {
......
...@@ -25,6 +25,7 @@ struct inet_peer ...@@ -25,6 +25,7 @@ struct inet_peer
__u32 v4daddr; /* peer's address */ __u32 v4daddr; /* peer's address */
__u16 avl_height; __u16 avl_height;
__u16 ip_id_count; /* IP ID for the next packet */ __u16 ip_id_count; /* IP ID for the next packet */
atomic_t rid; /* Frag reception counter */
__u32 tcp_ts; __u32 tcp_ts;
unsigned long tcp_ts_stamp; unsigned long tcp_ts_stamp;
}; };
......
...@@ -45,6 +45,7 @@ struct inet_skb_parm ...@@ -45,6 +45,7 @@ struct inet_skb_parm
#define IPSKB_TRANSLATED 2 #define IPSKB_TRANSLATED 2
#define IPSKB_FORWARDED 4 #define IPSKB_FORWARDED 4
#define IPSKB_XFRM_TUNNEL_SIZE 8 #define IPSKB_XFRM_TUNNEL_SIZE 8
#define IPSKB_FRAG_COMPLETE 16
}; };
struct ipcm_cookie struct ipcm_cookie
...@@ -168,6 +169,7 @@ extern int sysctl_ipfrag_high_thresh; ...@@ -168,6 +169,7 @@ extern int sysctl_ipfrag_high_thresh;
extern int sysctl_ipfrag_low_thresh; extern int sysctl_ipfrag_low_thresh;
extern int sysctl_ipfrag_time; extern int sysctl_ipfrag_time;
extern int sysctl_ipfrag_secret_interval; extern int sysctl_ipfrag_secret_interval;
extern int sysctl_ipfrag_max_dist;
/* From inetpeer.c */ /* From inetpeer.c */
extern int inet_peer_threshold; extern int inet_peer_threshold;
......
...@@ -401,6 +401,7 @@ struct inet_peer *inet_getpeer(__u32 daddr, int create) ...@@ -401,6 +401,7 @@ struct inet_peer *inet_getpeer(__u32 daddr, int create)
return NULL; return NULL;
n->v4daddr = daddr; n->v4daddr = daddr;
atomic_set(&n->refcnt, 1); atomic_set(&n->refcnt, 1);
atomic_set(&n->rid, 0);
n->ip_id_count = secure_ip_id(daddr); n->ip_id_count = secure_ip_id(daddr);
n->tcp_ts_stamp = 0; n->tcp_ts_stamp = 0;
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
* Patrick McHardy : LRU queue of frag heads for evictor. * Patrick McHardy : LRU queue of frag heads for evictor.
*/ */
#include <linux/compiler.h>
#include <linux/config.h> #include <linux/config.h>
#include <linux/module.h> #include <linux/module.h>
#include <linux/types.h> #include <linux/types.h>
...@@ -38,6 +39,7 @@ ...@@ -38,6 +39,7 @@
#include <net/ip.h> #include <net/ip.h>
#include <net/icmp.h> #include <net/icmp.h>
#include <net/checksum.h> #include <net/checksum.h>
#include <net/inetpeer.h>
#include <linux/tcp.h> #include <linux/tcp.h>
#include <linux/udp.h> #include <linux/udp.h>
#include <linux/inet.h> #include <linux/inet.h>
...@@ -56,6 +58,8 @@ ...@@ -56,6 +58,8 @@
int sysctl_ipfrag_high_thresh = 256*1024; int sysctl_ipfrag_high_thresh = 256*1024;
int sysctl_ipfrag_low_thresh = 192*1024; int sysctl_ipfrag_low_thresh = 192*1024;
int sysctl_ipfrag_max_dist = 64;
/* Important NOTE! Fragment queue must be destroyed before MSL expires. /* Important NOTE! Fragment queue must be destroyed before MSL expires.
* RFC791 is wrong proposing to prolongate timer each fragment arrival by TTL. * RFC791 is wrong proposing to prolongate timer each fragment arrival by TTL.
*/ */
...@@ -89,8 +93,10 @@ struct ipq { ...@@ -89,8 +93,10 @@ struct ipq {
spinlock_t lock; spinlock_t lock;
atomic_t refcnt; atomic_t refcnt;
struct timer_list timer; /* when will this queue expire? */ struct timer_list timer; /* when will this queue expire? */
int iif;
struct timeval stamp; struct timeval stamp;
int iif;
unsigned int rid;
struct inet_peer *peer;
}; };
/* Hash table. */ /* Hash table. */
...@@ -195,6 +201,9 @@ static void ip_frag_destroy(struct ipq *qp, int *work) ...@@ -195,6 +201,9 @@ static void ip_frag_destroy(struct ipq *qp, int *work)
BUG_TRAP(qp->last_in&COMPLETE); BUG_TRAP(qp->last_in&COMPLETE);
BUG_TRAP(del_timer(&qp->timer) == 0); BUG_TRAP(del_timer(&qp->timer) == 0);
if (qp->peer)
inet_putpeer(qp->peer);
/* Release all fragment data. */ /* Release all fragment data. */
fp = qp->fragments; fp = qp->fragments;
while (fp) { while (fp) {
...@@ -353,6 +362,7 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user) ...@@ -353,6 +362,7 @@ static struct ipq *ip_frag_create(unsigned hash, struct iphdr *iph, u32 user)
qp->meat = 0; qp->meat = 0;
qp->fragments = NULL; qp->fragments = NULL;
qp->iif = 0; qp->iif = 0;
qp->peer = sysctl_ipfrag_max_dist ? inet_getpeer(iph->saddr, 1) : NULL;
/* Initialize a timer for this entry. */ /* Initialize a timer for this entry. */
init_timer(&qp->timer); init_timer(&qp->timer);
...@@ -398,6 +408,56 @@ static inline struct ipq *ip_find(struct iphdr *iph, u32 user) ...@@ -398,6 +408,56 @@ static inline struct ipq *ip_find(struct iphdr *iph, u32 user)
return ip_frag_create(hash, iph, user); return ip_frag_create(hash, iph, user);
} }
/*
 * Is the fragment too far ahead to be part of ipq?
 *
 * Every call that gets past the early exit bumps the per-peer fragment
 * reception counter (peer->rid) and stores the new value in qp->rid.
 *
 * Returns 1, and accounts a reassembly failure, when the queue already
 * holds fragments and the peer counter has advanced by more than
 * sysctl_ipfrag_max_dist since this queue last recorded it.
 * Returns 0 otherwise, including when the queue has no peer attached or
 * the sysctl is zero (the check is then skipped entirely and the counter
 * is left untouched).
 */
static inline int ip_frag_too_far(struct ipq *qp)
{
	struct inet_peer *peer = qp->peer;
	/* Snapshot the sysctl once so the test below uses a single value. */
	unsigned int limit = sysctl_ipfrag_max_dist;
	unsigned int seen_before, seen_now;

	if (!peer)
		return 0;
	if (!limit)
		return 0;

	seen_before = qp->rid;
	seen_now = atomic_inc_return(&peer->rid);
	qp->rid = seen_now;

	/* An empty queue has nothing to invalidate. */
	if (!qp->fragments)
		return 0;

	/* Unsigned subtraction keeps the distance valid across wrap-around. */
	if (seen_now - seen_before <= limit)
		return 0;

	IP_INC_STATS_BH(IPSTATS_MIB_REASMFAILS);
	return 1;
}
/*
 * Reset a fragment queue so that reassembly can start over from scratch.
 *
 * Re-arms the queue timer to fire sysctl_ipfrag_time from now, frees every
 * queued fragment and clears the reassembly state (last_in, len, meat,
 * fragments, iif).
 *
 * Returns 0 on success.  If mod_timer() returns 0, an extra reference is
 * taken on qp and -ETIMEDOUT is returned without touching the fragment
 * list; the caller is expected to kill the queue in that case.
 * NOTE(review): presumably a zero return means the timer was no longer
 * pending, so the freshly armed timer needs its own reference - confirm
 * against the timer API.
 *
 * The fragment list must be non-empty on entry: its first element is
 * dereferenced unconditionally.
 */
static int ip_frag_reinit(struct ipq *qp)
{
	struct sk_buff *skb;
	struct sk_buff *next;

	if (mod_timer(&qp->timer, jiffies + sysctl_ipfrag_time) == 0) {
		atomic_inc(&qp->refcnt);
		return -ETIMEDOUT;
	}

	/* Release every queued fragment; fetch ->next before freeing. */
	skb = qp->fragments;
	do {
		next = skb->next;
		frag_kfree_skb(skb, NULL);
		skb = next;
	} while (skb != NULL);

	/* Return the queue to its freshly created, empty state. */
	qp->fragments = NULL;
	qp->meat = 0;
	qp->len = 0;
	qp->last_in = 0;
	qp->iif = 0;

	return 0;
}
/* Add new segment to existing queue. */ /* Add new segment to existing queue. */
static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
{ {
...@@ -408,6 +468,12 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb) ...@@ -408,6 +468,12 @@ static void ip_frag_queue(struct ipq *qp, struct sk_buff *skb)
if (qp->last_in & COMPLETE) if (qp->last_in & COMPLETE)
goto err; goto err;
if (!(IPCB(skb)->flags & IPSKB_FRAG_COMPLETE) &&
unlikely(ip_frag_too_far(qp)) && unlikely(ip_frag_reinit(qp))) {
ipq_kill(qp);
goto err;
}
offset = ntohs(skb->nh.iph->frag_off); offset = ntohs(skb->nh.iph->frag_off);
flags = offset & ~IP_OFFSET; flags = offset & ~IP_OFFSET;
offset &= IP_OFFSET; offset &= IP_OFFSET;
......
...@@ -445,6 +445,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*)) ...@@ -445,6 +445,7 @@ int ip_fragment(struct sk_buff *skb, int (*output)(struct sk_buff*))
hlen = iph->ihl * 4; hlen = iph->ihl * 4;
mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */ mtu = dst_mtu(&rt->u.dst) - hlen; /* Size of data space */
IPCB(skb)->flags |= IPSKB_FRAG_COMPLETE;
/* When frag_list is given, use it. First, check its validity: /* When frag_list is given, use it. First, check its validity:
* some transformers could create wrong frag_list or break existing * some transformers could create wrong frag_list or break existing
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
extern int sysctl_ip_nonlocal_bind; extern int sysctl_ip_nonlocal_bind;
#ifdef CONFIG_SYSCTL #ifdef CONFIG_SYSCTL
static int zero;
static int tcp_retr1_max = 255; static int tcp_retr1_max = 255;
static int ip_local_port_range_min[] = { 1, 1 }; static int ip_local_port_range_min[] = { 1, 1 };
static int ip_local_port_range_max[] = { 65535, 65535 }; static int ip_local_port_range_max[] = { 65535, 65535 };
...@@ -613,6 +614,15 @@ ctl_table ipv4_table[] = { ...@@ -613,6 +614,15 @@ ctl_table ipv4_table[] = {
.proc_handler = &proc_dointvec_jiffies, .proc_handler = &proc_dointvec_jiffies,
.strategy = &sysctl_jiffies .strategy = &sysctl_jiffies
}, },
{
.ctl_name = NET_IPV4_IPFRAG_MAX_DIST,
.procname = "ipfrag_max_dist",
.data = &sysctl_ipfrag_max_dist,
.maxlen = sizeof(int),
.mode = 0644,
.proc_handler = &proc_dointvec_minmax,
.extra1 = &zero
},
{ {
.ctl_name = NET_TCP_NO_METRICS_SAVE, .ctl_name = NET_TCP_NO_METRICS_SAVE,
.procname = "tcp_no_metrics_save", .procname = "tcp_no_metrics_save",
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment