Merge nuts.davemloft.net:/disk1/BK/network-2.6

into nuts.davemloft.net:/disk1/BK/net-2.6

Merge nuts.davemloft.net:/disk1/BK/network-2.6
into nuts.davemloft.net:/disk1/BK/net-2.6
2407e34e · David S. Miller · f0fdf5f8 · faf1633b · 2407e34e · 2407e34e
Commit 2407e34e authored Mar 29, 2004 by David S. Miller
10 changed files
--- a/Documentation/networking/packet_mmap.txt
+++ b/Documentation/networking/packet_mmap.txt
--- a/net/Makefile
+++ b/net/Makefile
@@ -16,7 +16,9 @@ obj-$(CONFIG_LLC)		+= llc/
 obj-$(CONFIG_NET)		+= ethernet/ 802/ sched/ netlink/
 obj-$(CONFIG_INET)		+= ipv4/ xfrm/
 obj-$(CONFIG_UNIX)		+= unix/
-obj-$(CONFIG_IPV6)		+= ipv6/
+ifneq ($(CONFIG_IPV6),)
+obj-y				+= ipv6/
+endif
 obj-$(CONFIG_PACKET)		+= packet/
 obj-$(CONFIG_NET_KEY)		+= key/
 obj-$(CONFIG_NET_SCHED)		+= sched/

--- a/net/ipv4/netfilter/ip_nat_standalone.c
+++ b/net/ipv4/netfilter/ip_nat_standalone.c
@@ -124,7 +124,16 @@ ip_nat_fn(unsigned int hooknum,
 		WRITE_LOCK(&ip_nat_lock);
 		/* Seen it before?  This can happen for loopback, retrans,
 		   or local packets.. */
-		if (!(info->initialized & (1 << maniptype))) {
+		if (!(info->initialized & (1 << maniptype))
+#ifndef CONFIG_IP_NF_NAT_LOCAL
+		    /* If this session has already been confirmed we must not
+		     * touch it again even if there is no mapping set up.
+		     * Can only happen on local->local traffic with
+		     * CONFIG_IP_NF_NAT_LOCAL disabled.
+		     */
+		    && !(ct->status & IPS_CONFIRMED)
+#endif
+		    ) {
 			unsigned int ret;
 			if (ct->master

--- a/net/ipv4/netfilter/ipt_MASQUERADE.c
+++ b/net/ipv4/netfilter/ipt_MASQUERADE.c
@@ -45,7 +45,7 @@ masquerade_check(const char *tablename,
 	const struct ip_nat_multi_range *mr = targinfo;
 	if (strcmp(tablename, "nat") != 0) {
-		DEBUGP("masquerade_check: bad table `%s'.\n", table);
+		DEBUGP("masquerade_check: bad table `%s'.\n", tablename);
 		return 0;
 	}
 	if (targinfosize != IPT_ALIGN(sizeof(*mr))) {

--- a/net/ipv4/tcp_ipv4.c
+++ b/net/ipv4/tcp_ipv4.c
@@ -1825,12 +1825,15 @@ int tcp_v4_rcv(struct sk_buff *skb)
 	goto discard_it;
 do_time_wait:
-	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb))
+	if (!xfrm4_policy_check(NULL, XFRM_POLICY_IN, skb)) {
-		goto discard_and_relse;
+		tcp_tw_put((struct tcp_tw_bucket *) sk);
+		goto discard_it;
+	}
 	if (skb->len < (th->doff << 2) || tcp_checksum_complete(skb)) {
 		TCP_INC_STATS_BH(TcpInErrs);
-		goto discard_and_relse;
+		tcp_tw_put((struct tcp_tw_bucket *) sk);
+		goto discard_it;
 	}
 	switch (tcp_timewait_state_process((struct tcp_tw_bucket *)sk,
 					   skb, th, skb->len)) {

--- a/net/ipv6/Makefile
+++ b/net/ipv6/Makefile
@@ -19,3 +19,5 @@ obj-$(CONFIG_INET6_IPCOMP) += ipcomp6.o
 obj-$(CONFIG_NETFILTER)	+= netfilter/
 obj-$(CONFIG_IPV6_TUNNEL) += ip6_tunnel.o
+obj-y += exthdrs_core.o
--- a/net/ipv6/exthdrs.c
+++ b/net/ipv6/exthdrs.c
@@ -633,105 +633,3 @@ ipv6_dup_options(struct sock *sk, struct ipv6_txoptions *opt)
 	}
 	return opt2;
 }
-/* 
- * find out if nexthdr is a well-known extension header or a protocol
- */
-int ipv6_ext_hdr(u8 nexthdr)
-{
-	/* 
-	 * find out if nexthdr is an extension header or a protocol
-	 */
-	return ( (nexthdr == NEXTHDR_HOP)	||
-		 (nexthdr == NEXTHDR_ROUTING)	||
-		 (nexthdr == NEXTHDR_FRAGMENT)	||
-		 (nexthdr == NEXTHDR_AUTH)	||
-		 (nexthdr == NEXTHDR_NONE)	||
-		 (nexthdr == NEXTHDR_DEST) );
-}
-/*
- * Skip any extension headers. This is used by the ICMP module.
- *
- * Note that strictly speaking this conflicts with RFC 2460 4.0:
- * ...The contents and semantics of each extension header determine whether 
- * or not to proceed to the next header.  Therefore, extension headers must
- * be processed strictly in the order they appear in the packet; a
- * receiver must not, for example, scan through a packet looking for a
- * particular kind of extension header and process that header prior to
- * processing all preceding ones.
- * 
- * We do exactly this. This is a protocol bug. We can't decide after a
- * seeing an unknown discard-with-error flavour TLV option if it's a 
- * ICMP error message or not (errors should never be send in reply to
- * ICMP error messages).
- * 
- * But I see no other way to do this. This might need to be reexamined
- * when Linux implements ESP (and maybe AUTH) headers.
- * --AK
- *
- * This function parses (probably truncated) exthdr set "hdr"
- * of length "len". "nexthdrp" initially points to some place,
- * where type of the first header can be found.
- *
- * It skips all well-known exthdrs, and returns pointer to the start
- * of unparsable area i.e. the first header with unknown type.
- * If it is not NULL *nexthdr is updated by type/protocol of this header.
- *
- * NOTES: - if packet terminated with NEXTHDR_NONE it returns NULL.
- *        - it may return pointer pointing beyond end of packet,
- *	    if the last recognized header is truncated in the middle.
- *        - if packet is truncated, so that all parsed headers are skipped,
- *	    it returns NULL.
- *	  - First fragment header is skipped, not-first ones
- *	    are considered as unparsable.
- *	  - ESP is unparsable for now and considered like
- *	    normal payload protocol.
- *	  - Note also special handling of AUTH header. Thanks to IPsec wizards.
- *
- * --ANK (980726)
- */
-int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, int len)
-{
-	u8 nexthdr = *nexthdrp;
-	while (ipv6_ext_hdr(nexthdr)) {
-		struct ipv6_opt_hdr hdr;
-		int hdrlen;
-		if (len < (int)sizeof(struct ipv6_opt_hdr))
-			return -1;
-		if (nexthdr == NEXTHDR_NONE)
-			return -1;
-		if (skb_copy_bits(skb, start, &hdr, sizeof(hdr)))
-			BUG();
-		if (nexthdr == NEXTHDR_FRAGMENT) {
-			unsigned short frag_off;
-			if (skb_copy_bits(skb,
-					  start+offsetof(struct frag_hdr,
-							 frag_off),
-					  &frag_off,
-					  sizeof(frag_off))) {
-				return -1;
-			}
-			if (ntohs(frag_off) & ~0x7)
-				break;
-			hdrlen = 8;
-		} else if (nexthdr == NEXTHDR_AUTH)
-			hdrlen = (hdr.hdrlen+2)<<2; 
-		else
-			hdrlen = ipv6_optlen(&hdr); 
-		nexthdr = hdr.nexthdr;
-		len -= hdrlen;
-		start += hdrlen;
-	}
-	*nexthdrp = nexthdr;
-	return start;
-}
--- a/net/ipv6/exthdrs_core.c
+++ b/net/ipv6/exthdrs_core.c
+/*
+ * IPv6 library code, needed by static components when full IPv6 support is
+ * not configured or static.
+ */
+#include <net/ipv6.h>
+/* 
+ * find out if nexthdr is a well-known extension header or a protocol
+ */
+int ipv6_ext_hdr(u8 nexthdr)
+{
+	/* 
+	 * find out if nexthdr is an extension header or a protocol
+	 */
+	return ( (nexthdr == NEXTHDR_HOP)	||
+		 (nexthdr == NEXTHDR_ROUTING)	||
+		 (nexthdr == NEXTHDR_FRAGMENT)	||
+		 (nexthdr == NEXTHDR_AUTH)	||
+		 (nexthdr == NEXTHDR_NONE)	||
+		 (nexthdr == NEXTHDR_DEST) );
+}
+/*
+ * Skip any extension headers. This is used by the ICMP module.
+ *
+ * Note that strictly speaking this conflicts with RFC 2460 4.0:
+ * ...The contents and semantics of each extension header determine whether 
+ * or not to proceed to the next header.  Therefore, extension headers must
+ * be processed strictly in the order they appear in the packet; a
+ * receiver must not, for example, scan through a packet looking for a
+ * particular kind of extension header and process that header prior to
+ * processing all preceding ones.
+ * 
+ * We do exactly this. This is a protocol bug. We can't decide after a
+ * seeing an unknown discard-with-error flavour TLV option if it's a 
+ * ICMP error message or not (errors should never be send in reply to
+ * ICMP error messages).
+ * 
+ * But I see no other way to do this. This might need to be reexamined
+ * when Linux implements ESP (and maybe AUTH) headers.
+ * --AK
+ *
+ * This function parses (probably truncated) exthdr set "hdr"
+ * of length "len". "nexthdrp" initially points to some place,
+ * where type of the first header can be found.
+ *
+ * It skips all well-known exthdrs, and returns pointer to the start
+ * of unparsable area i.e. the first header with unknown type.
+ * If it is not NULL *nexthdr is updated by type/protocol of this header.
+ *
+ * NOTES: - if packet terminated with NEXTHDR_NONE it returns NULL.
+ *        - it may return pointer pointing beyond end of packet,
+ *	    if the last recognized header is truncated in the middle.
+ *        - if packet is truncated, so that all parsed headers are skipped,
+ *	    it returns NULL.
+ *	  - First fragment header is skipped, not-first ones
+ *	    are considered as unparsable.
+ *	  - ESP is unparsable for now and considered like
+ *	    normal payload protocol.
+ *	  - Note also special handling of AUTH header. Thanks to IPsec wizards.
+ *
+ * --ANK (980726)
+ */
+int ipv6_skip_exthdr(const struct sk_buff *skb, int start, u8 *nexthdrp, int len)
+{
+	u8 nexthdr = *nexthdrp;
+	while (ipv6_ext_hdr(nexthdr)) {
+		struct ipv6_opt_hdr hdr;
+		int hdrlen;
+		if (len < (int)sizeof(struct ipv6_opt_hdr))
+			return -1;
+		if (nexthdr == NEXTHDR_NONE)
+			return -1;
+		if (skb_copy_bits(skb, start, &hdr, sizeof(hdr)))
+			BUG();
+		if (nexthdr == NEXTHDR_FRAGMENT) {
+			unsigned short frag_off;
+			if (skb_copy_bits(skb,
+					  start+offsetof(struct frag_hdr,
+							 frag_off),
+					  &frag_off,
+					  sizeof(frag_off))) {
+				return -1;
+			}
+			if (ntohs(frag_off) & ~0x7)
+				break;
+			hdrlen = 8;
+		} else if (nexthdr == NEXTHDR_AUTH)
+			hdrlen = (hdr.hdrlen+2)<<2; 
+		else
+			hdrlen = ipv6_optlen(&hdr); 
+		nexthdr = hdr.nexthdr;
+		len -= hdrlen;
+		start += hdrlen;
+	}
+	*nexthdrp = nexthdr;
+	return start;
+}
+EXPORT_SYMBOL(ipv6_ext_hdr);
+EXPORT_SYMBOL(ipv6_skip_exthdr);
--- a/net/ipv6/ipv6_syms.c
+++ b/net/ipv6/ipv6_syms.c
@@ -41,9 +41,7 @@ EXPORT_SYMBOL(xfrm6_rcv);
 #endif
 EXPORT_SYMBOL(rt6_lookup);
 EXPORT_SYMBOL(fl6_sock_lookup);
-EXPORT_SYMBOL(ipv6_ext_hdr);
 EXPORT_SYMBOL(ip6_append_data);
 EXPORT_SYMBOL(ip6_flush_pending_frames);
 EXPORT_SYMBOL(ip6_push_pending_frames);
 EXPORT_SYMBOL(ipv6_push_nfrag_opts);
-EXPORT_SYMBOL(ipv6_skip_exthdr);
--- a/net/packet/af_packet.c
+++ b/net/packet/af_packet.c
@@ -34,6 +34,8 @@
 *	Alexey Kuznetsov	:	Untied from IPv4 stack.
 *	Cyrus Durgin		:	Fixed kerneld for kmod.
 *	Michal Ostrowski        :       Module initialization cleanup.
+ *         Ulises Alonso        :       Frame number limit removal and 
+ *                                      packet_set_ring memory leak.
 *
 *		This program is free software; you can redistribute it and/or
 *		modify it under the terms of the GNU General Public License
@@ -168,30 +170,47 @@ static void packet_flush_mclist(struct sock *sk);
 struct packet_opt
 {
+	struct tpacket_stats	stats;
+#ifdef CONFIG_PACKET_MMAP
+	unsigned long		*pg_vec;
+	unsigned int		head;
+	unsigned int            frames_per_block;
+	unsigned int		frame_size;
+	unsigned int		frame_max;
+	int			copy_thresh;
+#endif
 	struct packet_type	prot_hook;
 	spinlock_t		bind_lock;
 	char			running;	/* prot_hook is attached*/
 	int			ifindex;	/* bound device		*/
 	unsigned short		num;
-	struct tpacket_stats	stats;
 #ifdef CONFIG_PACKET_MULTICAST
 	struct packet_mclist	*mclist;
 #endif
 #ifdef CONFIG_PACKET_MMAP
 	atomic_t		mapped;
-	unsigned long		*pg_vec;
+	unsigned int            pg_vec_order;
-	unsigned int		pg_vec_order;
 	unsigned int		pg_vec_pages;
 	unsigned int		pg_vec_len;
-	struct tpacket_hdr	**iovec;
-	unsigned int		frame_size;
-	unsigned int		iovmax;
-	unsigned int		head;
-	int			copy_thresh;
 #endif
 };
+#ifdef CONFIG_PACKET_MMAP
+static inline unsigned long packet_lookup_frame(struct packet_opt *po, unsigned int position)
+{
+	unsigned int pg_vec_pos, frame_offset;
+	unsigned long frame;
+	pg_vec_pos = position / po->frames_per_block;
+	frame_offset = position % po->frames_per_block;
+	frame = (unsigned long) (po->pg_vec[pg_vec_pos] + (frame_offset * po->frame_size));
+	return frame;
+}
+#endif
 #define pkt_sk(__sk) ((struct packet_opt *)(__sk)->sk_protinfo)
 void packet_sock_destruct(struct sock *sk)
@@ -586,11 +605,11 @@ static int tpacket_rcv(struct sk_buff *skb, struct net_device *dev,  struct pack
 		snaplen = skb->len-skb->data_len;
 	spin_lock(&sk->sk_receive_queue.lock);
-	h = po->iovec[po->head];
+	h = (struct tpacket_hdr *)packet_lookup_frame(po, po->head);
 	if (h->tp_status)
 		goto ring_is_full;
-	po->head = po->head != po->iovmax ? po->head+1 : 0;
+	po->head = po->head != po->frame_max ? po->head+1 : 0;
 	po->stats.tp_packets++;
 	if (copy_skb) {
 		status |= TP_STATUS_COPY;
@@ -1485,10 +1504,13 @@ unsigned int packet_poll(struct file * file, struct socket *sock, poll_table *wa
 	unsigned int mask = datagram_poll(file, sock, wait);
 	spin_lock_bh(&sk->sk_receive_queue.lock);
-	if (po->iovec) {
+	if (po->pg_vec) {
-		unsigned last = po->head ? po->head-1 : po->iovmax;
+		unsigned last = po->head ? po->head-1 : po->frame_max;
+		struct tpacket_hdr *h;
-		if (po->iovec[last]->tp_status)
+		h = (struct tpacket_hdr *)packet_lookup_frame(po, last);
+		if (h->tp_status)
 			mask |= POLLIN | POLLRDNORM;
 	}
 	spin_unlock_bh(&sk->sk_receive_queue.lock);
@@ -1548,16 +1570,18 @@ static void free_pg_vec(unsigned long *pg_vec, unsigned order, unsigned len)
 static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing)
 {
 	unsigned long *pg_vec = NULL;
-	struct tpacket_hdr **io_vec = NULL;
 	struct packet_opt *po = pkt_sk(sk);
 	int was_running, num, order = 0;
 	int err = 0;
 	if (req->tp_block_nr) {
 		int i, l;
-		int frames_per_block;
 		/* Sanity tests and some calculations */
+		if (po->pg_vec)
+			return -EBUSY;
 		if ((int)req->tp_block_size <= 0)
 			return -EINVAL;
 		if (req->tp_block_size&(PAGE_SIZE-1))
@@ -1566,10 +1590,11 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing
 			return -EINVAL;
 		if (req->tp_frame_size&(TPACKET_ALIGNMENT-1))
 			return -EINVAL;
-		frames_per_block = req->tp_block_size/req->tp_frame_size;
-		if (frames_per_block <= 0)
+		po->frames_per_block = req->tp_block_size/req->tp_frame_size;
+		if (po->frames_per_block <= 0)
 			return -EINVAL;
-		if (frames_per_block*req->tp_block_nr != req->tp_frame_nr)
+		if (po->frames_per_block*req->tp_block_nr != req->tp_frame_nr)
 			return -EINVAL;
 		/* OK! */
@@ -1596,20 +1621,16 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing
 		}
 		/* Page vector is allocated */
-		/* Draw frames */
-		io_vec = kmalloc(req->tp_frame_nr*sizeof(struct tpacket_hdr*), GFP_KERNEL);
-		if (io_vec == NULL)
-			goto out_free_pgvec;
-		memset(io_vec, 0, req->tp_frame_nr*sizeof(struct tpacket_hdr*));
 		l = 0;
 		for (i=0; i<req->tp_block_nr; i++) {
 			unsigned long ptr = pg_vec[i];
+			struct tpacket_hdr *header;
 			int k;
-			for (k=0; k<frames_per_block; k++, l++) {
+			for (k=0; k<po->frames_per_block; k++) {
-				io_vec[l] = (struct tpacket_hdr*)ptr;
-				io_vec[l]->tp_status = TP_STATUS_KERNEL;
+				header = (struct tpacket_hdr*)ptr;
+				header->tp_status = TP_STATUS_KERNEL;
 				ptr += req->tp_frame_size;
 			}
 		}
@@ -1642,8 +1663,7 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing
 		spin_lock_bh(&sk->sk_receive_queue.lock);
 		pg_vec = XC(po->pg_vec, pg_vec);
-		io_vec = XC(po->iovec, io_vec);
+		po->frame_max = req->tp_frame_nr-1;
-		po->iovmax = req->tp_frame_nr-1;
 		po->head = 0;
 		po->frame_size = req->tp_frame_size;
 		spin_unlock_bh(&sk->sk_receive_queue.lock);
@@ -1652,7 +1672,7 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing
 		req->tp_block_nr = XC(po->pg_vec_len, req->tp_block_nr);
 		po->pg_vec_pages = req->tp_block_size/PAGE_SIZE;
-		po->prot_hook.func = po->iovec ? tpacket_rcv : packet_rcv;
+		po->prot_hook.func = po->pg_vec ? tpacket_rcv : packet_rcv;
 		skb_queue_purge(&sk->sk_receive_queue);
 #undef XC
 		if (atomic_read(&po->mapped))
@@ -1670,9 +1690,6 @@ static int packet_set_ring(struct sock *sk, struct tpacket_req *req, int closing
 	release_sock(sk);
-	if (io_vec)
-		kfree(io_vec);
 out_free_pgvec:
 	if (pg_vec)
 		free_pg_vec(pg_vec, order, req->tp_block_nr);