Commit 8d5f3377 authored by Rusty Russell, committed by David S. Miller

[NETFILTER]: Remove manip array from conntrack entry

Original patch and multo bugfixes by Krisztian Kovacs.

Now NAT has been simplified, there is only one place to NAT each
packet.  That means we can intuit what to do by looking at the
difference between this packet and the reply we expect, getting rid of
the manips[] array in the connection tracking structure, which is 72
bytes.  Rework NAT to be based on 'change this packet to make src/dst
look like this tuple'.

1) Each protocol's manip_pkt takes a 'struct ip_conntrack_manip',
   which is half (the source half) of a tuple.  Hand the whole desired
   tuple to the NAT code and have it use the 'maniptype' arg to decide
   what part to copy.

2) Krisztian points out that we don't need the NAT lock to read the
   NAT information (or the tuples) as they never change once set, and
   while being set we have exclusive access.  A lock is only needed to
   deal with the only remaining NAT list: the bysource hash.

3) We don't need to rehash for the bysource hash: it depends on the
   incoming packet, which we can't change.

4) Many NAT functions only need the maniptype they are to perform, not
   the actual hook, which makes the code clearer.

5) New status bits to indicate what NAT needs to be done.  We can
   always figure it out by inverting the tuple we expect in the other
   direction and comparing it, but this is faster.

6) Rename 'do_bindings' to 'nat_packet'.

7) ICMP handling is vastly simplified: we unconditionally change to
   look the way we want.
Signed-off-by: Rusty Russell <rusty@rustcorp.com.au>
Signed-off-by: David S. Miller <davem@davemloft.net>
parent cd795640
...@@ -40,6 +40,17 @@ enum ip_conntrack_status { ...@@ -40,6 +40,17 @@ enum ip_conntrack_status {
/* Connection is confirmed: originating packet has left box */ /* Connection is confirmed: originating packet has left box */
IPS_CONFIRMED_BIT = 3, IPS_CONFIRMED_BIT = 3,
IPS_CONFIRMED = (1 << IPS_CONFIRMED_BIT), IPS_CONFIRMED = (1 << IPS_CONFIRMED_BIT),
/* Connection needs src nat in orig dir. This bit never changes. */
IPS_SRC_NAT_BIT = 4,
IPS_SRC_NAT = (1 << IPS_SRC_NAT_BIT),
/* Connection needs dst nat in orig dir. This bit never changes. */
IPS_DST_NAT_BIT = 5,
IPS_DST_NAT = (1 << IPS_DST_NAT_BIT),
/* Both together. */
IPS_NAT_MASK = (IPS_DST_NAT | IPS_SRC_NAT),
}; };
#ifdef __KERNEL__ #ifdef __KERNEL__
......
...@@ -48,42 +48,16 @@ struct ip_nat_multi_range_compat ...@@ -48,42 +48,16 @@ struct ip_nat_multi_range_compat
struct ip_nat_range range[1]; struct ip_nat_range range[1];
}; };
/* Worst case: local-out manip + 1 post-routing, and reverse dirn. */
#define IP_NAT_MAX_MANIPS (2*2)
struct ip_nat_info_manip
{
/* The direction. */
u_int8_t direction;
/* Which hook the manipulation happens on. */
u_int8_t hooknum;
/* The manipulation type. */
u_int8_t maniptype;
/* Manipulations to occur at each conntrack in this dirn. */
struct ip_conntrack_manip manip;
};
#ifdef __KERNEL__ #ifdef __KERNEL__
#include <linux/list.h> #include <linux/list.h>
#include <linux/netfilter_ipv4/lockhelp.h> #include <linux/netfilter_ipv4/lockhelp.h>
/* Protects NAT hash tables, and NAT-private part of conntracks. */
DECLARE_RWLOCK_EXTERN(ip_nat_lock);
/* The structure embedded in the conntrack structure. */ /* The structure embedded in the conntrack structure. */
struct ip_nat_info struct ip_nat_info
{ {
/* Set to zero when conntrack created: bitmask of maniptypes */ /* Set to zero when conntrack created: bitmask of maniptypes */
u_int16_t initialized; u_int16_t initialized;
u_int16_t num_manips;
/* Manipulations to be done on this conntrack. */
struct ip_nat_info_manip manips[IP_NAT_MAX_MANIPS];
struct list_head bysource; struct list_head bysource;
/* Helper (NULL if none). */ /* Helper (NULL if none). */
......
...@@ -8,16 +8,13 @@ ...@@ -8,16 +8,13 @@
extern int ip_nat_init(void); extern int ip_nat_init(void);
extern void ip_nat_cleanup(void); extern void ip_nat_cleanup(void);
extern unsigned int do_bindings(struct ip_conntrack *ct, extern unsigned int nat_packet(struct ip_conntrack *ct,
enum ip_conntrack_info conntrackinfo, enum ip_conntrack_info conntrackinfo,
struct ip_nat_info *info, unsigned int hooknum,
unsigned int hooknum, struct sk_buff **pskb);
struct sk_buff **pskb);
extern int icmp_reply_translation(struct sk_buff **pskb, extern int icmp_reply_translation(struct sk_buff **pskb,
struct ip_conntrack *conntrack, struct ip_conntrack *ct,
unsigned int hooknum, enum ip_nat_manip_type manip,
int dir); enum ip_conntrack_dir dir);
#endif /* _IP_NAT_CORE_H */ #endif /* _IP_NAT_CORE_H */
...@@ -15,11 +15,11 @@ struct ip_nat_protocol ...@@ -15,11 +15,11 @@ struct ip_nat_protocol
/* Protocol number. */ /* Protocol number. */
unsigned int protonum; unsigned int protonum;
/* Do a packet translation according to the ip_nat_proto_manip /* Translate a packet to the target according to manip type.
* and manip type. Return true if succeeded. */ Return true if succeeded. */
int (*manip_pkt)(struct sk_buff **pskb, int (*manip_pkt)(struct sk_buff **pskb,
unsigned int iphdroff, unsigned int iphdroff,
const struct ip_conntrack_manip *manip, const struct ip_conntrack_tuple *tuple,
enum ip_nat_manip_type maniptype); enum ip_nat_manip_type maniptype);
/* Is the manipable part of the tuple between min and max incl? */ /* Is the manipable part of the tuple between min and max incl? */
......
This diff is collapsed.
...@@ -405,46 +405,28 @@ ip_nat_seq_adjust(struct sk_buff **pskb, ...@@ -405,46 +405,28 @@ ip_nat_seq_adjust(struct sk_buff **pskb,
return 1; return 1;
} }
/* We look at the master's nat fields without ip_nat_lock. This works
because the master's NAT must be fully initialized, because we
don't match expectations set up by unconfirmed connections. We
can't grab the lock because we hold the ip_conntrack_lock, and that
would be backwards from other locking orders. */
static void ip_nat_copy_manip(struct ip_nat_info *master,
struct ip_conntrack_expect *exp,
struct ip_conntrack *ct)
{
struct ip_nat_range range;
unsigned int i;
range.flags = IP_NAT_RANGE_MAP_IPS;
/* Find what master is mapped to (if any), so we can do the same. */
for (i = 0; i < master->num_manips; i++) {
if (master->manips[i].direction != exp->dir)
continue;
range.min_ip = range.max_ip = master->manips[i].manip.ip;
/* If this is a DST manip, map port here to where it's
* expected. */
if (master->manips[i].maniptype == IP_NAT_MANIP_DST) {
range.flags |= IP_NAT_RANGE_PROTO_SPECIFIED;
range.min = range.max = exp->saved_proto;
}
ip_nat_setup_info(ct, &range, master->manips[i].hooknum);
}
}
/* Setup NAT on this expected conntrack so it follows master. */ /* Setup NAT on this expected conntrack so it follows master. */
/* If we fail to get a free NAT slot, we'll get dropped on confirm */ /* If we fail to get a free NAT slot, we'll get dropped on confirm */
void ip_nat_follow_master(struct ip_conntrack *ct, void ip_nat_follow_master(struct ip_conntrack *ct,
struct ip_conntrack_expect *this) struct ip_conntrack_expect *exp)
{ {
struct ip_nat_info *master = &ct->master->nat.info; struct ip_nat_range range;
/* This must be a fresh one. */ /* This must be a fresh one. */
BUG_ON(ct->nat.info.initialized); BUG_ON(ct->nat.info.initialized);
ip_nat_copy_manip(master, this, ct); /* Change src to where master sends to */
range.flags = IP_NAT_RANGE_MAP_IPS;
range.min_ip = range.max_ip
= ct->master->tuplehash[!exp->dir].tuple.dst.ip;
/* hook doesn't matter, but it has to do source manip */
ip_nat_setup_info(ct, &range, NF_IP_POST_ROUTING);
/* For DST manip, map port here to where it's expected. */
range.flags = (IP_NAT_RANGE_MAP_IPS | IP_NAT_RANGE_PROTO_SPECIFIED);
range.min = range.max = exp->saved_proto;
range.min_ip = range.max_ip
= ct->master->tuplehash[!exp->dir].tuple.src.ip;
/* hook doesn't matter, but it has to do destination manip */
ip_nat_setup_info(ct, &range, NF_IP_PRE_ROUTING);
} }
...@@ -54,7 +54,7 @@ icmp_unique_tuple(struct ip_conntrack_tuple *tuple, ...@@ -54,7 +54,7 @@ icmp_unique_tuple(struct ip_conntrack_tuple *tuple,
static int static int
icmp_manip_pkt(struct sk_buff **pskb, icmp_manip_pkt(struct sk_buff **pskb,
unsigned int iphdroff, unsigned int iphdroff,
const struct ip_conntrack_manip *manip, const struct ip_conntrack_tuple *tuple,
enum ip_nat_manip_type maniptype) enum ip_nat_manip_type maniptype)
{ {
struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff); struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff);
...@@ -64,12 +64,12 @@ icmp_manip_pkt(struct sk_buff **pskb, ...@@ -64,12 +64,12 @@ icmp_manip_pkt(struct sk_buff **pskb,
if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr))) if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr)))
return 0; return 0;
hdr = (void *)(*pskb)->data + hdroff; hdr = (struct icmphdr *)((*pskb)->data + hdroff);
hdr->checksum = ip_nat_cheat_check(hdr->un.echo.id ^ 0xFFFF, hdr->checksum = ip_nat_cheat_check(hdr->un.echo.id ^ 0xFFFF,
manip->u.icmp.id, tuple->src.u.icmp.id,
hdr->checksum); hdr->checksum);
hdr->un.echo.id = manip->u.icmp.id; hdr->un.echo.id = tuple->src.u.icmp.id;
return 1; return 1;
} }
......
...@@ -85,14 +85,14 @@ tcp_unique_tuple(struct ip_conntrack_tuple *tuple, ...@@ -85,14 +85,14 @@ tcp_unique_tuple(struct ip_conntrack_tuple *tuple,
static int static int
tcp_manip_pkt(struct sk_buff **pskb, tcp_manip_pkt(struct sk_buff **pskb,
unsigned int iphdroff, unsigned int iphdroff,
const struct ip_conntrack_manip *manip, const struct ip_conntrack_tuple *tuple,
enum ip_nat_manip_type maniptype) enum ip_nat_manip_type maniptype)
{ {
struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff); struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff);
struct tcphdr *hdr; struct tcphdr *hdr;
unsigned int hdroff = iphdroff + iph->ihl*4; unsigned int hdroff = iphdroff + iph->ihl*4;
u32 oldip, oldsrc = iph->saddr, olddst = iph->daddr; u32 oldip, newip;
u16 *portptr, oldport; u16 *portptr, newport, oldport;
int hdrsize = 8; /* TCP connection tracking guarantees this much */ int hdrsize = 8; /* TCP connection tracking guarantees this much */
/* this could be a inner header returned in icmp packet; in such /* this could be a inner header returned in icmp packet; in such
...@@ -104,27 +104,32 @@ tcp_manip_pkt(struct sk_buff **pskb, ...@@ -104,27 +104,32 @@ tcp_manip_pkt(struct sk_buff **pskb,
if (!skb_ip_make_writable(pskb, hdroff + hdrsize)) if (!skb_ip_make_writable(pskb, hdroff + hdrsize))
return 0; return 0;
hdr = (void *)(*pskb)->data + hdroff; iph = (struct iphdr *)((*pskb)->data + iphdroff);
hdr = (struct tcphdr *)((*pskb)->data + iph->ihl*4);
if (maniptype == IP_NAT_MANIP_SRC) { if (maniptype == IP_NAT_MANIP_SRC) {
/* Get rid of src ip and src pt */ /* Get rid of src ip and src pt */
oldip = oldsrc; oldip = iph->saddr;
newip = tuple->src.ip;
newport = tuple->src.u.tcp.port;
portptr = &hdr->source; portptr = &hdr->source;
} else { } else {
/* Get rid of dst ip and dst pt */ /* Get rid of dst ip and dst pt */
oldip = olddst; oldip = iph->daddr;
newip = tuple->dst.ip;
newport = tuple->dst.u.tcp.port;
portptr = &hdr->dest; portptr = &hdr->dest;
} }
oldport = *portptr; oldport = *portptr;
*portptr = manip->u.tcp.port; *portptr = newport;
if (hdrsize < sizeof(*hdr)) if (hdrsize < sizeof(*hdr))
return 1; return 1;
hdr->check = ip_nat_cheat_check(~oldip, manip->ip, hdr->check = ip_nat_cheat_check(~oldip, newip,
ip_nat_cheat_check(oldport ^ 0xFFFF, ip_nat_cheat_check(oldport ^ 0xFFFF,
manip->u.tcp.port, newport,
hdr->check)); hdr->check));
return 1; return 1;
} }
......
...@@ -84,34 +84,40 @@ udp_unique_tuple(struct ip_conntrack_tuple *tuple, ...@@ -84,34 +84,40 @@ udp_unique_tuple(struct ip_conntrack_tuple *tuple,
static int static int
udp_manip_pkt(struct sk_buff **pskb, udp_manip_pkt(struct sk_buff **pskb,
unsigned int iphdroff, unsigned int iphdroff,
const struct ip_conntrack_manip *manip, const struct ip_conntrack_tuple *tuple,
enum ip_nat_manip_type maniptype) enum ip_nat_manip_type maniptype)
{ {
struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff); struct iphdr *iph = (struct iphdr *)((*pskb)->data + iphdroff);
struct udphdr *hdr; struct udphdr *hdr;
unsigned int hdroff = iphdroff + iph->ihl*4; unsigned int hdroff = iphdroff + iph->ihl*4;
u32 oldip, oldsrc = iph->saddr, olddst = iph->daddr; u32 oldip, newip;
u16 *portptr; u16 *portptr, newport;
if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr))) if (!skb_ip_make_writable(pskb, hdroff + sizeof(*hdr)))
return 0; return 0;
hdr = (void *)(*pskb)->data + hdroff; iph = (struct iphdr *)((*pskb)->data + iphdroff);
hdr = (struct udphdr *)((*pskb)->data + hdroff);
if (maniptype == IP_NAT_MANIP_SRC) { if (maniptype == IP_NAT_MANIP_SRC) {
/* Get rid of src ip and src pt */ /* Get rid of src ip and src pt */
oldip = oldsrc; oldip = iph->saddr;
newip = tuple->src.ip;
newport = tuple->src.u.udp.port;
portptr = &hdr->source; portptr = &hdr->source;
} else { } else {
/* Get rid of dst ip and dst pt */ /* Get rid of dst ip and dst pt */
oldip = olddst; oldip = iph->daddr;
newip = tuple->dst.ip;
newport = tuple->dst.u.udp.port;
portptr = &hdr->dest; portptr = &hdr->dest;
} }
if (hdr->check) /* 0 is a special case meaning no checksum */ if (hdr->check) /* 0 is a special case meaning no checksum */
hdr->check = ip_nat_cheat_check(~oldip, manip->ip, hdr->check = ip_nat_cheat_check(~oldip, newip,
ip_nat_cheat_check(*portptr ^ 0xFFFF, ip_nat_cheat_check(*portptr ^ 0xFFFF,
manip->u.udp.port, newport,
hdr->check)); hdr->check));
*portptr = manip->u.udp.port; *portptr = newport;
return 1; return 1;
} }
......
...@@ -40,7 +40,7 @@ static int unknown_unique_tuple(struct ip_conntrack_tuple *tuple, ...@@ -40,7 +40,7 @@ static int unknown_unique_tuple(struct ip_conntrack_tuple *tuple,
static int static int
unknown_manip_pkt(struct sk_buff **pskb, unknown_manip_pkt(struct sk_buff **pskb,
unsigned int iphdroff, unsigned int iphdroff,
const struct ip_conntrack_manip *manip, const struct ip_conntrack_tuple *tuple,
enum ip_nat_manip_type maniptype) enum ip_nat_manip_type maniptype)
{ {
return 1; return 1;
......
...@@ -16,6 +16,7 @@ ...@@ -16,6 +16,7 @@
#include <linux/skbuff.h> #include <linux/skbuff.h>
#include <linux/proc_fs.h> #include <linux/proc_fs.h>
#include <net/checksum.h> #include <net/checksum.h>
#include <net/route.h>
#include <linux/bitops.h> #include <linux/bitops.h>
#define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock) #define ASSERT_READ_LOCK(x) MUST_BE_READ_LOCKED(&ip_nat_lock)
...@@ -120,6 +121,25 @@ static unsigned int ipt_snat_target(struct sk_buff **pskb, ...@@ -120,6 +121,25 @@ static unsigned int ipt_snat_target(struct sk_buff **pskb,
return ip_nat_setup_info(ct, &mr->range[0], hooknum); return ip_nat_setup_info(ct, &mr->range[0], hooknum);
} }
/* Before 2.6.11 we did implicit source NAT if required. Warn about change. */
static void warn_if_extra_mangle(u32 dstip, u32 srcip)
{
static int warned = 0;
struct flowi fl = { .nl_u = { .ip4_u = { .daddr = dstip } } };
struct rtable *rt;
if (ip_route_output_key(&rt, &fl) != 0)
return;
if (rt->rt_src != srcip && !warned) {
printk("NAT: no longer support implicit source local NAT\n");
printk("NAT: packet src %u.%u.%u.%u -> dst %u.%u.%u.%u\n",
NIPQUAD(srcip), NIPQUAD(dstip));
warned = 1;
}
ip_rt_put(rt);
}
static unsigned int ipt_dnat_target(struct sk_buff **pskb, static unsigned int ipt_dnat_target(struct sk_buff **pskb,
const struct net_device *in, const struct net_device *in,
const struct net_device *out, const struct net_device *out,
...@@ -139,6 +159,11 @@ static unsigned int ipt_dnat_target(struct sk_buff **pskb, ...@@ -139,6 +159,11 @@ static unsigned int ipt_dnat_target(struct sk_buff **pskb,
/* Connection must be valid and new. */ /* Connection must be valid and new. */
IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED)); IP_NF_ASSERT(ct && (ctinfo == IP_CT_NEW || ctinfo == IP_CT_RELATED));
if (hooknum == NF_IP_LOCAL_OUT
&& mr->range[0].flags & IP_NAT_RANGE_MAP_IPS)
warn_if_extra_mangle((*pskb)->nh.iph->daddr,
mr->range[0].min_ip);
return ip_nat_setup_info(ct, &mr->range[0], hooknum); return ip_nat_setup_info(ct, &mr->range[0], hooknum);
} }
......
...@@ -106,7 +106,7 @@ ip_nat_fn(unsigned int hooknum, ...@@ -106,7 +106,7 @@ ip_nat_fn(unsigned int hooknum,
case IP_CT_RELATED: case IP_CT_RELATED:
case IP_CT_RELATED+IP_CT_IS_REPLY: case IP_CT_RELATED+IP_CT_IS_REPLY:
if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) { if ((*pskb)->nh.iph->protocol == IPPROTO_ICMP) {
if (!icmp_reply_translation(pskb, ct, hooknum, if (!icmp_reply_translation(pskb, ct, maniptype,
CTINFO2DIR(ctinfo))) CTINFO2DIR(ctinfo)))
return NF_DROP; return NF_DROP;
else else
...@@ -116,7 +116,6 @@ ip_nat_fn(unsigned int hooknum, ...@@ -116,7 +116,6 @@ ip_nat_fn(unsigned int hooknum,
case IP_CT_NEW: case IP_CT_NEW:
info = &ct->nat.info; info = &ct->nat.info;
WRITE_LOCK(&ip_nat_lock);
/* Seen it before? This can happen for loopback, retrans, /* Seen it before? This can happen for loopback, retrans,
or local packets.. */ or local packets.. */
if (!(info->initialized & (1 << maniptype))) { if (!(info->initialized & (1 << maniptype))) {
...@@ -131,14 +130,12 @@ ip_nat_fn(unsigned int hooknum, ...@@ -131,14 +130,12 @@ ip_nat_fn(unsigned int hooknum,
info); info);
if (ret != NF_ACCEPT) { if (ret != NF_ACCEPT) {
WRITE_UNLOCK(&ip_nat_lock);
return ret; return ret;
} }
} else } else
DEBUGP("Already setup manip %s for ct %p\n", DEBUGP("Already setup manip %s for ct %p\n",
maniptype == IP_NAT_MANIP_SRC ? "SRC" : "DST", maniptype == IP_NAT_MANIP_SRC ? "SRC" : "DST",
ct); ct);
WRITE_UNLOCK(&ip_nat_lock);
break; break;
default: default:
...@@ -149,7 +146,7 @@ ip_nat_fn(unsigned int hooknum, ...@@ -149,7 +146,7 @@ ip_nat_fn(unsigned int hooknum,
} }
IP_NF_ASSERT(info); IP_NF_ASSERT(info);
return do_bindings(ct, ctinfo, info, hooknum, pskb); return nat_packet(ct, ctinfo, hooknum, pskb);
} }
static unsigned int static unsigned int
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment