Commit acced9d2 authored by David S. Miller

Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf

Pablo Neira Ayuso says:

====================
Netfilter/IPVS fixes for net

The following patchset contains Netfilter/IPVS fixes for your net tree:

1) Add a selftest for icmp packet too big errors with conntrack, from
   Florian Westphal.

2) Validate inner header in ICMP error message does not lie to us
   in conntrack, also from Florian.

3) Initialize ct->timeout to calm down KASAN, from Alexander Potapenko.

4) Skip ICMP error messages from tunnels in IPVS, from Julian Anastasov.

5) Use a hash to expose conntrack and expectation ID, from Florian Westphal.

6) Prevent shift wrap in nft_chain_parse_hook(), from Dan Carpenter.

7) Fix broken ICMP ID randomization with NAT, also from Florian.

8) Remove WARN_ON in ebtables compat that is reached via syzkaller,
   from Florian Westphal.

9) Fix broken timestamps since fb420d5d ("tcp/fq: move back to
   CLOCK_MONOTONIC"), from Florian.

10) Fix logging of invalid packets in conntrack, from Andrei Vagin.
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents 26d1b858 d4866805
...@@ -316,6 +316,8 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net, ...@@ -316,6 +316,8 @@ struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
gfp_t flags); gfp_t flags);
void nf_ct_tmpl_free(struct nf_conn *tmpl); void nf_ct_tmpl_free(struct nf_conn *tmpl);
u32 nf_ct_get_id(const struct nf_conn *ct);
static inline void static inline void
nf_ct_set(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info info) nf_ct_set(struct sk_buff *skb, struct nf_conn *ct, enum ip_conntrack_info info)
{ {
......
...@@ -75,6 +75,12 @@ bool nf_conntrack_invert_icmp_tuple(struct nf_conntrack_tuple *tuple, ...@@ -75,6 +75,12 @@ bool nf_conntrack_invert_icmp_tuple(struct nf_conntrack_tuple *tuple,
bool nf_conntrack_invert_icmpv6_tuple(struct nf_conntrack_tuple *tuple, bool nf_conntrack_invert_icmpv6_tuple(struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_tuple *orig); const struct nf_conntrack_tuple *orig);
int nf_conntrack_inet_error(struct nf_conn *tmpl, struct sk_buff *skb,
unsigned int dataoff,
const struct nf_hook_state *state,
u8 l4proto,
union nf_inet_addr *outer_daddr);
int nf_conntrack_icmpv4_error(struct nf_conn *tmpl, int nf_conntrack_icmpv4_error(struct nf_conn *tmpl,
struct sk_buff *skb, struct sk_buff *skb,
unsigned int dataoff, unsigned int dataoff,
......
...@@ -2032,7 +2032,8 @@ static int ebt_size_mwt(struct compat_ebt_entry_mwt *match32, ...@@ -2032,7 +2032,8 @@ static int ebt_size_mwt(struct compat_ebt_entry_mwt *match32,
if (match_kern) if (match_kern)
match_kern->match_size = ret; match_kern->match_size = ret;
if (WARN_ON(type == EBT_COMPAT_TARGET && size_left)) /* rule should have no remaining data after target */
if (type == EBT_COMPAT_TARGET && size_left)
return -EINVAL; return -EINVAL;
match32 = (struct compat_ebt_entry_mwt *) buf; match32 = (struct compat_ebt_entry_mwt *) buf;
......
...@@ -1678,7 +1678,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related, ...@@ -1678,7 +1678,7 @@ ip_vs_in_icmp(struct netns_ipvs *ipvs, struct sk_buff *skb, int *related,
if (!cp) { if (!cp) {
int v; int v;
if (!sysctl_schedule_icmp(ipvs)) if (ipip || !sysctl_schedule_icmp(ipvs))
return NF_ACCEPT; return NF_ACCEPT;
if (!ip_vs_try_to_schedule(ipvs, AF_INET, skb, pd, &v, &cp, &ciph)) if (!ip_vs_try_to_schedule(ipvs, AF_INET, skb, pd, &v, &cp, &ciph))
......
...@@ -25,6 +25,7 @@ ...@@ -25,6 +25,7 @@
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/random.h> #include <linux/random.h>
#include <linux/jhash.h> #include <linux/jhash.h>
#include <linux/siphash.h>
#include <linux/err.h> #include <linux/err.h>
#include <linux/percpu.h> #include <linux/percpu.h>
#include <linux/moduleparam.h> #include <linux/moduleparam.h>
...@@ -449,6 +450,40 @@ nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse, ...@@ -449,6 +450,40 @@ nf_ct_invert_tuple(struct nf_conntrack_tuple *inverse,
} }
EXPORT_SYMBOL_GPL(nf_ct_invert_tuple); EXPORT_SYMBOL_GPL(nf_ct_invert_tuple);
/* Generate a almost-unique pseudo-id for a given conntrack.
*
* intentionally doesn't re-use any of the seeds used for hash
* table location, we assume id gets exposed to userspace.
*
* Following nf_conn items do not change throughout lifetime
* of the nf_conn after it has been committed to main hash table:
*
* 1. nf_conn address
* 2. nf_conn->ext address
* 3. nf_conn->master address (normally NULL)
* 4. tuple
* 5. the associated net namespace
*/
u32 nf_ct_get_id(const struct nf_conn *ct)
{
static __read_mostly siphash_key_t ct_id_seed;
unsigned long a, b, c, d;
net_get_random_once(&ct_id_seed, sizeof(ct_id_seed));
a = (unsigned long)ct;
b = (unsigned long)ct->master ^ net_hash_mix(nf_ct_net(ct));
c = (unsigned long)ct->ext;
d = (unsigned long)siphash(&ct->tuplehash, sizeof(ct->tuplehash),
&ct_id_seed);
#ifdef CONFIG_64BIT
return siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &ct_id_seed);
#else
return siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &ct_id_seed);
#endif
}
EXPORT_SYMBOL_GPL(nf_ct_get_id);
static void static void
clean_from_lists(struct nf_conn *ct) clean_from_lists(struct nf_conn *ct)
{ {
...@@ -982,12 +1017,9 @@ __nf_conntrack_confirm(struct sk_buff *skb) ...@@ -982,12 +1017,9 @@ __nf_conntrack_confirm(struct sk_buff *skb)
/* set conntrack timestamp, if enabled. */ /* set conntrack timestamp, if enabled. */
tstamp = nf_conn_tstamp_find(ct); tstamp = nf_conn_tstamp_find(ct);
if (tstamp) { if (tstamp)
if (skb->tstamp == 0) tstamp->start = ktime_get_real_ns();
__net_timestamp(skb);
tstamp->start = ktime_to_ns(skb->tstamp);
}
/* Since the lookup is lockless, hash insertion must be done after /* Since the lookup is lockless, hash insertion must be done after
* starting the timer and setting the CONFIRMED bit. The RCU barriers * starting the timer and setting the CONFIRMED bit. The RCU barriers
* guarantee that no other CPU can find the conntrack before the above * guarantee that no other CPU can find the conntrack before the above
...@@ -1350,6 +1382,7 @@ __nf_conntrack_alloc(struct net *net, ...@@ -1350,6 +1382,7 @@ __nf_conntrack_alloc(struct net *net,
/* save hash for reusing when confirming */ /* save hash for reusing when confirming */
*(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash; *(unsigned long *)(&ct->tuplehash[IP_CT_DIR_REPLY].hnnode.pprev) = hash;
ct->status = 0; ct->status = 0;
ct->timeout = 0;
write_pnet(&ct->ct_net, net); write_pnet(&ct->ct_net, net);
memset(&ct->__nfct_init_offset[0], 0, memset(&ct->__nfct_init_offset[0], 0,
offsetof(struct nf_conn, proto) - offsetof(struct nf_conn, proto) -
......
...@@ -29,6 +29,7 @@ ...@@ -29,6 +29,7 @@
#include <linux/spinlock.h> #include <linux/spinlock.h>
#include <linux/interrupt.h> #include <linux/interrupt.h>
#include <linux/slab.h> #include <linux/slab.h>
#include <linux/siphash.h>
#include <linux/netfilter.h> #include <linux/netfilter.h>
#include <net/netlink.h> #include <net/netlink.h>
...@@ -485,7 +486,9 @@ static int ctnetlink_dump_ct_synproxy(struct sk_buff *skb, struct nf_conn *ct) ...@@ -485,7 +486,9 @@ static int ctnetlink_dump_ct_synproxy(struct sk_buff *skb, struct nf_conn *ct)
static int ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct) static int ctnetlink_dump_id(struct sk_buff *skb, const struct nf_conn *ct)
{ {
if (nla_put_be32(skb, CTA_ID, htonl((unsigned long)ct))) __be32 id = (__force __be32)nf_ct_get_id(ct);
if (nla_put_be32(skb, CTA_ID, id))
goto nla_put_failure; goto nla_put_failure;
return 0; return 0;
...@@ -1286,8 +1289,9 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl, ...@@ -1286,8 +1289,9 @@ static int ctnetlink_del_conntrack(struct net *net, struct sock *ctnl,
} }
if (cda[CTA_ID]) { if (cda[CTA_ID]) {
u_int32_t id = ntohl(nla_get_be32(cda[CTA_ID])); __be32 id = nla_get_be32(cda[CTA_ID]);
if (id != (u32)(unsigned long)ct) {
if (id != (__force __be32)nf_ct_get_id(ct)) {
nf_ct_put(ct); nf_ct_put(ct);
return -ENOENT; return -ENOENT;
} }
...@@ -2692,6 +2696,25 @@ static int ctnetlink_exp_dump_mask(struct sk_buff *skb, ...@@ -2692,6 +2696,25 @@ static int ctnetlink_exp_dump_mask(struct sk_buff *skb,
static const union nf_inet_addr any_addr; static const union nf_inet_addr any_addr;
static __be32 nf_expect_get_id(const struct nf_conntrack_expect *exp)
{
static __read_mostly siphash_key_t exp_id_seed;
unsigned long a, b, c, d;
net_get_random_once(&exp_id_seed, sizeof(exp_id_seed));
a = (unsigned long)exp;
b = (unsigned long)exp->helper;
c = (unsigned long)exp->master;
d = (unsigned long)siphash(&exp->tuple, sizeof(exp->tuple), &exp_id_seed);
#ifdef CONFIG_64BIT
return (__force __be32)siphash_4u64((u64)a, (u64)b, (u64)c, (u64)d, &exp_id_seed);
#else
return (__force __be32)siphash_4u32((u32)a, (u32)b, (u32)c, (u32)d, &exp_id_seed);
#endif
}
static int static int
ctnetlink_exp_dump_expect(struct sk_buff *skb, ctnetlink_exp_dump_expect(struct sk_buff *skb,
const struct nf_conntrack_expect *exp) const struct nf_conntrack_expect *exp)
...@@ -2739,7 +2762,7 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb, ...@@ -2739,7 +2762,7 @@ ctnetlink_exp_dump_expect(struct sk_buff *skb,
} }
#endif #endif
if (nla_put_be32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout)) || if (nla_put_be32(skb, CTA_EXPECT_TIMEOUT, htonl(timeout)) ||
nla_put_be32(skb, CTA_EXPECT_ID, htonl((unsigned long)exp)) || nla_put_be32(skb, CTA_EXPECT_ID, nf_expect_get_id(exp)) ||
nla_put_be32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags)) || nla_put_be32(skb, CTA_EXPECT_FLAGS, htonl(exp->flags)) ||
nla_put_be32(skb, CTA_EXPECT_CLASS, htonl(exp->class))) nla_put_be32(skb, CTA_EXPECT_CLASS, htonl(exp->class)))
goto nla_put_failure; goto nla_put_failure;
...@@ -3044,7 +3067,8 @@ static int ctnetlink_get_expect(struct net *net, struct sock *ctnl, ...@@ -3044,7 +3067,8 @@ static int ctnetlink_get_expect(struct net *net, struct sock *ctnl,
if (cda[CTA_EXPECT_ID]) { if (cda[CTA_EXPECT_ID]) {
__be32 id = nla_get_be32(cda[CTA_EXPECT_ID]); __be32 id = nla_get_be32(cda[CTA_EXPECT_ID]);
if (ntohl(id) != (u32)(unsigned long)exp) {
if (id != nf_expect_get_id(exp)) {
nf_ct_expect_put(exp); nf_ct_expect_put(exp);
return -ENOENT; return -ENOENT;
} }
......
...@@ -55,7 +55,7 @@ void nf_l4proto_log_invalid(const struct sk_buff *skb, ...@@ -55,7 +55,7 @@ void nf_l4proto_log_invalid(const struct sk_buff *skb,
struct va_format vaf; struct va_format vaf;
va_list args; va_list args;
if (net->ct.sysctl_log_invalid != protonum || if (net->ct.sysctl_log_invalid != protonum &&
net->ct.sysctl_log_invalid != IPPROTO_RAW) net->ct.sysctl_log_invalid != IPPROTO_RAW)
return; return;
......
...@@ -103,49 +103,94 @@ int nf_conntrack_icmp_packet(struct nf_conn *ct, ...@@ -103,49 +103,94 @@ int nf_conntrack_icmp_packet(struct nf_conn *ct,
return NF_ACCEPT; return NF_ACCEPT;
} }
/* Returns conntrack if it dealt with ICMP, and filled in skb fields */ /* Check inner header is related to any of the existing connections */
static int int nf_conntrack_inet_error(struct nf_conn *tmpl, struct sk_buff *skb,
icmp_error_message(struct nf_conn *tmpl, struct sk_buff *skb, unsigned int dataoff,
const struct nf_hook_state *state) const struct nf_hook_state *state,
u8 l4proto, union nf_inet_addr *outer_daddr)
{ {
struct nf_conntrack_tuple innertuple, origtuple; struct nf_conntrack_tuple innertuple, origtuple;
const struct nf_conntrack_tuple_hash *h; const struct nf_conntrack_tuple_hash *h;
const struct nf_conntrack_zone *zone; const struct nf_conntrack_zone *zone;
enum ip_conntrack_info ctinfo; enum ip_conntrack_info ctinfo;
struct nf_conntrack_zone tmp; struct nf_conntrack_zone tmp;
union nf_inet_addr *ct_daddr;
enum ip_conntrack_dir dir;
struct nf_conn *ct;
WARN_ON(skb_nfct(skb)); WARN_ON(skb_nfct(skb));
zone = nf_ct_zone_tmpl(tmpl, skb, &tmp); zone = nf_ct_zone_tmpl(tmpl, skb, &tmp);
/* Are they talking about one of our connections? */ /* Are they talking about one of our connections? */
if (!nf_ct_get_tuplepr(skb, if (!nf_ct_get_tuplepr(skb, dataoff,
skb_network_offset(skb) + ip_hdrlen(skb) state->pf, state->net, &origtuple))
+ sizeof(struct icmphdr),
PF_INET, state->net, &origtuple)) {
pr_debug("icmp_error_message: failed to get tuple\n");
return -NF_ACCEPT; return -NF_ACCEPT;
}
/* Ordinarily, we'd expect the inverted tupleproto, but it's /* Ordinarily, we'd expect the inverted tupleproto, but it's
been preserved inside the ICMP. */ been preserved inside the ICMP. */
if (!nf_ct_invert_tuple(&innertuple, &origtuple)) { if (!nf_ct_invert_tuple(&innertuple, &origtuple))
pr_debug("icmp_error_message: no match\n");
return -NF_ACCEPT; return -NF_ACCEPT;
}
ctinfo = IP_CT_RELATED;
h = nf_conntrack_find_get(state->net, zone, &innertuple); h = nf_conntrack_find_get(state->net, zone, &innertuple);
if (!h) { if (!h)
pr_debug("icmp_error_message: no match\n"); return -NF_ACCEPT;
/* Consider: A -> T (=This machine) -> B
* Conntrack entry will look like this:
* Original: A->B
* Reply: B->T (SNAT case) OR A
*
* When this function runs, we got packet that looks like this:
* iphdr|icmphdr|inner_iphdr|l4header (tcp, udp, ..).
*
* Above nf_conntrack_find_get() makes lookup based on inner_hdr,
* so we should expect that destination of the found connection
* matches outer header destination address.
*
* In above example, we can consider these two cases:
* 1. Error coming in reply direction from B or M (middle box) to
* T (SNAT case) or A.
* Inner saddr will be B, dst will be T or A.
* The found conntrack will be reply tuple (B->T/A).
* 2. Error coming in original direction from A or M to B.
* Inner saddr will be A, inner daddr will be B.
* The found conntrack will be original tuple (A->B).
*
* In both cases, conntrack[dir].dst == inner.dst.
*
* A bogus packet could look like this:
* Inner: B->T
* Outer: B->X (other machine reachable by T).
*
* In this case, lookup yields connection A->B and will
* set packet from B->X as *RELATED*, even though no connection
* from X was ever seen.
*/
ct = nf_ct_tuplehash_to_ctrack(h);
dir = NF_CT_DIRECTION(h);
ct_daddr = &ct->tuplehash[dir].tuple.dst.u3;
if (!nf_inet_addr_cmp(outer_daddr, ct_daddr)) {
if (state->pf == AF_INET) {
nf_l4proto_log_invalid(skb, state->net, state->pf,
l4proto,
"outer daddr %pI4 != inner %pI4",
&outer_daddr->ip, &ct_daddr->ip);
} else if (state->pf == AF_INET6) {
nf_l4proto_log_invalid(skb, state->net, state->pf,
l4proto,
"outer daddr %pI6 != inner %pI6",
&outer_daddr->ip6, &ct_daddr->ip6);
}
nf_ct_put(ct);
return -NF_ACCEPT; return -NF_ACCEPT;
} }
if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY) ctinfo = IP_CT_RELATED;
if (dir == IP_CT_DIR_REPLY)
ctinfo += IP_CT_IS_REPLY; ctinfo += IP_CT_IS_REPLY;
/* Update skb to refer to this connection */ /* Update skb to refer to this connection */
nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo); nf_ct_set(skb, ct, ctinfo);
return NF_ACCEPT; return NF_ACCEPT;
} }
...@@ -162,11 +207,12 @@ int nf_conntrack_icmpv4_error(struct nf_conn *tmpl, ...@@ -162,11 +207,12 @@ int nf_conntrack_icmpv4_error(struct nf_conn *tmpl,
struct sk_buff *skb, unsigned int dataoff, struct sk_buff *skb, unsigned int dataoff,
const struct nf_hook_state *state) const struct nf_hook_state *state)
{ {
union nf_inet_addr outer_daddr;
const struct icmphdr *icmph; const struct icmphdr *icmph;
struct icmphdr _ih; struct icmphdr _ih;
/* Not enough header? */ /* Not enough header? */
icmph = skb_header_pointer(skb, ip_hdrlen(skb), sizeof(_ih), &_ih); icmph = skb_header_pointer(skb, dataoff, sizeof(_ih), &_ih);
if (icmph == NULL) { if (icmph == NULL) {
icmp_error_log(skb, state, "short packet"); icmp_error_log(skb, state, "short packet");
return -NF_ACCEPT; return -NF_ACCEPT;
...@@ -199,7 +245,12 @@ int nf_conntrack_icmpv4_error(struct nf_conn *tmpl, ...@@ -199,7 +245,12 @@ int nf_conntrack_icmpv4_error(struct nf_conn *tmpl,
icmph->type != ICMP_REDIRECT) icmph->type != ICMP_REDIRECT)
return NF_ACCEPT; return NF_ACCEPT;
return icmp_error_message(tmpl, skb, state); memset(&outer_daddr, 0, sizeof(outer_daddr));
outer_daddr.ip = ip_hdr(skb)->daddr;
dataoff += sizeof(*icmph);
return nf_conntrack_inet_error(tmpl, skb, dataoff, state,
IPPROTO_ICMP, &outer_daddr);
} }
#if IS_ENABLED(CONFIG_NF_CT_NETLINK) #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
......
...@@ -123,51 +123,6 @@ int nf_conntrack_icmpv6_packet(struct nf_conn *ct, ...@@ -123,51 +123,6 @@ int nf_conntrack_icmpv6_packet(struct nf_conn *ct,
return NF_ACCEPT; return NF_ACCEPT;
} }
static int
icmpv6_error_message(struct net *net, struct nf_conn *tmpl,
struct sk_buff *skb,
unsigned int icmp6off)
{
struct nf_conntrack_tuple intuple, origtuple;
const struct nf_conntrack_tuple_hash *h;
enum ip_conntrack_info ctinfo;
struct nf_conntrack_zone tmp;
WARN_ON(skb_nfct(skb));
/* Are they talking about one of our connections? */
if (!nf_ct_get_tuplepr(skb,
skb_network_offset(skb)
+ sizeof(struct ipv6hdr)
+ sizeof(struct icmp6hdr),
PF_INET6, net, &origtuple)) {
pr_debug("icmpv6_error: Can't get tuple\n");
return -NF_ACCEPT;
}
/* Ordinarily, we'd expect the inverted tupleproto, but it's
been preserved inside the ICMP. */
if (!nf_ct_invert_tuple(&intuple, &origtuple)) {
pr_debug("icmpv6_error: Can't invert tuple\n");
return -NF_ACCEPT;
}
ctinfo = IP_CT_RELATED;
h = nf_conntrack_find_get(net, nf_ct_zone_tmpl(tmpl, skb, &tmp),
&intuple);
if (!h) {
pr_debug("icmpv6_error: no match\n");
return -NF_ACCEPT;
} else {
if (NF_CT_DIRECTION(h) == IP_CT_DIR_REPLY)
ctinfo += IP_CT_IS_REPLY;
}
/* Update skb to refer to this connection */
nf_ct_set(skb, nf_ct_tuplehash_to_ctrack(h), ctinfo);
return NF_ACCEPT;
}
static void icmpv6_error_log(const struct sk_buff *skb, static void icmpv6_error_log(const struct sk_buff *skb,
const struct nf_hook_state *state, const struct nf_hook_state *state,
...@@ -182,6 +137,7 @@ int nf_conntrack_icmpv6_error(struct nf_conn *tmpl, ...@@ -182,6 +137,7 @@ int nf_conntrack_icmpv6_error(struct nf_conn *tmpl,
unsigned int dataoff, unsigned int dataoff,
const struct nf_hook_state *state) const struct nf_hook_state *state)
{ {
union nf_inet_addr outer_daddr;
const struct icmp6hdr *icmp6h; const struct icmp6hdr *icmp6h;
struct icmp6hdr _ih; struct icmp6hdr _ih;
int type; int type;
...@@ -210,7 +166,11 @@ int nf_conntrack_icmpv6_error(struct nf_conn *tmpl, ...@@ -210,7 +166,11 @@ int nf_conntrack_icmpv6_error(struct nf_conn *tmpl,
if (icmp6h->icmp6_type >= 128) if (icmp6h->icmp6_type >= 128)
return NF_ACCEPT; return NF_ACCEPT;
return icmpv6_error_message(state->net, tmpl, skb, dataoff); memcpy(&outer_daddr.ip6, &ipv6_hdr(skb)->daddr,
sizeof(outer_daddr.ip6));
dataoff += sizeof(*icmp6h);
return nf_conntrack_inet_error(tmpl, skb, dataoff, state,
IPPROTO_ICMPV6, &outer_daddr);
} }
#if IS_ENABLED(CONFIG_NF_CT_NETLINK) #if IS_ENABLED(CONFIG_NF_CT_NETLINK)
......
...@@ -415,9 +415,14 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple, ...@@ -415,9 +415,14 @@ static void nf_nat_l4proto_unique_tuple(struct nf_conntrack_tuple *tuple,
case IPPROTO_ICMPV6: case IPPROTO_ICMPV6:
/* id is same for either direction... */ /* id is same for either direction... */
keyptr = &tuple->src.u.icmp.id; keyptr = &tuple->src.u.icmp.id;
min = range->min_proto.icmp.id; if (!(range->flags & NF_NAT_RANGE_PROTO_SPECIFIED)) {
range_size = ntohs(range->max_proto.icmp.id) - min = 0;
ntohs(range->min_proto.icmp.id) + 1; range_size = 65536;
} else {
min = ntohs(range->min_proto.icmp.id);
range_size = ntohs(range->max_proto.icmp.id) -
ntohs(range->min_proto.icmp.id) + 1;
}
goto find_free_id; goto find_free_id;
#if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE) #if IS_ENABLED(CONFIG_NF_CT_PROTO_GRE)
case IPPROTO_GRE: case IPPROTO_GRE:
......
...@@ -1545,7 +1545,7 @@ static int nft_chain_parse_hook(struct net *net, ...@@ -1545,7 +1545,7 @@ static int nft_chain_parse_hook(struct net *net,
if (IS_ERR(type)) if (IS_ERR(type))
return PTR_ERR(type); return PTR_ERR(type);
} }
if (!(type->hook_mask & (1 << hook->num))) if (hook->num > NF_MAX_HOOKS || !(type->hook_mask & (1 << hook->num)))
return -EOPNOTSUPP; return -EOPNOTSUPP;
if (type->type == NFT_CHAIN_T_NAT && if (type->type == NFT_CHAIN_T_NAT &&
......
...@@ -540,7 +540,7 @@ __build_packet_message(struct nfnl_log_net *log, ...@@ -540,7 +540,7 @@ __build_packet_message(struct nfnl_log_net *log,
goto nla_put_failure; goto nla_put_failure;
} }
if (skb->tstamp) { if (hooknum <= NF_INET_FORWARD && skb->tstamp) {
struct nfulnl_msg_packet_timestamp ts; struct nfulnl_msg_packet_timestamp ts;
struct timespec64 kts = ktime_to_timespec64(skb->tstamp); struct timespec64 kts = ktime_to_timespec64(skb->tstamp);
ts.sec = cpu_to_be64(kts.tv_sec); ts.sec = cpu_to_be64(kts.tv_sec);
......
...@@ -582,7 +582,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue, ...@@ -582,7 +582,7 @@ nfqnl_build_packet_message(struct net *net, struct nfqnl_instance *queue,
if (nfqnl_put_bridge(entry, skb) < 0) if (nfqnl_put_bridge(entry, skb) < 0)
goto nla_put_failure; goto nla_put_failure;
if (entskb->tstamp) { if (entry->state.hook <= NF_INET_FORWARD && entskb->tstamp) {
struct nfqnl_msg_packet_timestamp ts; struct nfqnl_msg_packet_timestamp ts;
struct timespec64 kts = ktime_to_timespec64(entskb->tstamp); struct timespec64 kts = ktime_to_timespec64(entskb->tstamp);
......
...@@ -163,19 +163,24 @@ time_mt(const struct sk_buff *skb, struct xt_action_param *par) ...@@ -163,19 +163,24 @@ time_mt(const struct sk_buff *skb, struct xt_action_param *par)
s64 stamp; s64 stamp;
/* /*
* We cannot use get_seconds() instead of __net_timestamp() here. * We need real time here, but we can neither use skb->tstamp
* nor __net_timestamp().
*
* skb->tstamp and skb->skb_mstamp_ns overlap, however, they
* use different clock types (real vs monotonic).
*
* Suppose you have two rules: * Suppose you have two rules:
* 1. match before 13:00 * 1. match before 13:00
* 2. match after 13:00 * 2. match after 13:00
*
* If you match against processing time (get_seconds) it * If you match against processing time (get_seconds) it
* may happen that the same packet matches both rules if * may happen that the same packet matches both rules if
* it arrived at the right moment before 13:00. * it arrived at the right moment before 13:00, so it would be
* better to check skb->tstamp and set it via __net_timestamp()
* if needed. This however breaks outgoing packets tx timestamp,
* and causes them to get delayed forever by fq packet scheduler.
*/ */
if (skb->tstamp == 0) stamp = get_seconds();
__net_timestamp((struct sk_buff *)skb);
stamp = ktime_to_ns(skb->tstamp);
stamp = div_s64(stamp, NSEC_PER_SEC);
if (info->flags & XT_TIME_LOCAL_TZ) if (info->flags & XT_TIME_LOCAL_TZ)
/* Adjust for local timezone */ /* Adjust for local timezone */
......
# SPDX-License-Identifier: GPL-2.0 # SPDX-License-Identifier: GPL-2.0
# Makefile for netfilter selftests # Makefile for netfilter selftests
TEST_PROGS := nft_trans_stress.sh nft_nat.sh TEST_PROGS := nft_trans_stress.sh nft_nat.sh conntrack_icmp_related.sh
include ../lib.mk include ../lib.mk
#!/bin/bash
#
# check that ICMP df-needed/pkttoobig errors are set as related
# state
#
# Setup is:
#
# nsclient1 -> nsrouter1 -> nsrouter2 -> nsclient2
# MTU 1500, except for nsrouter2 <-> nsclient2 link (1280).
# ping nsclient2 from nsclient1, checking that conntrack did set RELATED
# 'fragmentation needed' icmp packet.
#
# In addition, nsrouter1 will perform IP masquerading, i.e. also
# check the icmp errors are propagated to the correct host as per
# nat of "established" icmp-echo "connection".
# Kselftest framework requirement - SKIP code is 4.
ksft_skip=4
ret=0
nft --version > /dev/null 2>&1
if [ $? -ne 0 ];then
echo "SKIP: Could not run test without nft tool"
exit $ksft_skip
fi
ip -Version > /dev/null 2>&1
if [ $? -ne 0 ];then
echo "SKIP: Could not run test without ip tool"
exit $ksft_skip
fi
# Delete the four network namespaces used by this test
# (nsclient1, nsclient2, nsrouter1, nsrouter2 -- in that order).
cleanup() {
	local kind

	for kind in client router; do
		for i in 1 2; do
			ip netns del ns$kind$i
		done
	done
}
# Print 192.168.<$1>.2 without a trailing newline.
# $1: third octet of the address.
ipv4() {
	printf '%s' "192.168.$1.2"
}
# Print dead:<$1>::2 without a trailing newline.
# $1: second group of the address.
ipv6 () {
	printf '%s' "dead:$1::2"
}
# Check that a named nft counter in table "inet filter" of a namespace
# lists the expected value.
#
# $1: network namespace to run nft in
# $2: counter name
# $3: pattern the counter listing must contain, e.g. "packets 0 bytes 0"
#
# Returns 0 if the pattern is found.  Otherwise prints an error plus the
# actual counter listing to stderr and returns 1.
check_counter()
{
	# Keep everything local: callers use a global named "expect" too,
	# and it must not be overwritten from in here.
	local ns=$1
	local name=$2
	local expect=$3
	local lret=0

	if ! ip netns exec "$ns" nft list counter inet filter "$name" | grep -q "$expect"; then
		echo "ERROR: counter $name in $ns has unexpected value (expected $expect)" 1>&2
		ip netns exec "$ns" nft list counter inet filter "$name" 1>&2
		lret=1
	fi

	return $lret
}
# Check that the "unknown" counter still reads "packets 0 bytes 0" in
# all four namespaces.  Stops at the first namespace that fails.
#
# Returns 0 if every counter matches, 1 otherwise.
check_unknown()
{
	expect="packets 0 bytes 0"

	for n in nsclient1 nsclient2 nsrouter1 nsrouter2; do
		check_counter $n "unknown" "$expect" || return 1
	done

	return 0
}
for n in nsclient1 nsclient2 nsrouter1 nsrouter2; do
ip netns add $n
ip -net $n link set lo up
done
DEV=veth0
ip link add $DEV netns nsclient1 type veth peer name eth1 netns nsrouter1
DEV=veth0
ip link add $DEV netns nsclient2 type veth peer name eth1 netns nsrouter2
DEV=veth0
ip link add $DEV netns nsrouter1 type veth peer name eth2 netns nsrouter2
DEV=veth0
for i in 1 2; do
ip -net nsclient$i link set $DEV up
ip -net nsclient$i addr add $(ipv4 $i)/24 dev $DEV
ip -net nsclient$i addr add $(ipv6 $i)/64 dev $DEV
done
ip -net nsrouter1 link set eth1 up
ip -net nsrouter1 link set veth0 up
ip -net nsrouter2 link set eth1 up
ip -net nsrouter2 link set eth2 up
ip -net nsclient1 route add default via 192.168.1.1
ip -net nsclient1 -6 route add default via dead:1::1
ip -net nsclient2 route add default via 192.168.2.1
ip -net nsclient2 route add default via dead:2::1
i=3
ip -net nsrouter1 addr add 192.168.1.1/24 dev eth1
ip -net nsrouter1 addr add 192.168.3.1/24 dev veth0
ip -net nsrouter1 addr add dead:1::1/64 dev eth1
ip -net nsrouter1 addr add dead:3::1/64 dev veth0
ip -net nsrouter1 route add default via 192.168.3.10
ip -net nsrouter1 -6 route add default via dead:3::10
ip -net nsrouter2 addr add 192.168.2.1/24 dev eth1
ip -net nsrouter2 addr add 192.168.3.10/24 dev eth2
ip -net nsrouter2 addr add dead:2::1/64 dev eth1
ip -net nsrouter2 addr add dead:3::10/64 dev eth2
ip -net nsrouter2 route add default via 192.168.3.1
ip -net nsrouter2 route add default via dead:3::1
sleep 2
for i in 4 6; do
ip netns exec nsrouter1 sysctl -q net.ipv$i.conf.all.forwarding=1
ip netns exec nsrouter2 sysctl -q net.ipv$i.conf.all.forwarding=1
done
for netns in nsrouter1 nsrouter2; do
ip netns exec $netns nft -f - <<EOF
table inet filter {
counter unknown { }
counter related { }
chain forward {
type filter hook forward priority 0; policy accept;
meta l4proto icmpv6 icmpv6 type "packet-too-big" ct state "related" counter name "related" accept
meta l4proto icmp icmp type "destination-unreachable" ct state "related" counter name "related" accept
meta l4proto { icmp, icmpv6 } ct state new,established accept
counter name "unknown" drop
}
}
EOF
done
ip netns exec nsclient1 nft -f - <<EOF
table inet filter {
counter unknown { }
counter related { }
chain input {
type filter hook input priority 0; policy accept;
meta l4proto { icmp, icmpv6 } ct state established,untracked accept
meta l4proto { icmp, icmpv6 } ct state "related" counter name "related" accept
counter name "unknown" drop
}
}
EOF
ip netns exec nsclient2 nft -f - <<EOF
table inet filter {
counter unknown { }
counter new { }
counter established { }
chain input {
type filter hook input priority 0; policy accept;
meta l4proto { icmp, icmpv6 } ct state established,untracked accept
meta l4proto { icmp, icmpv6 } ct state "new" counter name "new" accept
meta l4proto { icmp, icmpv6 } ct state "established" counter name "established" accept
counter name "unknown" drop
}
chain output {
type filter hook output priority 0; policy accept;
meta l4proto { icmp, icmpv6 } ct state established,untracked accept
meta l4proto { icmp, icmpv6 } ct state "new" counter name "new"
meta l4proto { icmp, icmpv6 } ct state "established" counter name "established"
counter name "unknown" drop
}
}
EOF
# make sure NAT core rewrites address of icmp error if nat is used according to
# conntrack nat information (icmp error will be directed at nsrouter1 address,
# but it needs to be routed to nsclient1 address).
ip netns exec nsrouter1 nft -f - <<EOF
table ip nat {
chain postrouting {
type nat hook postrouting priority 0; policy accept;
ip protocol icmp oifname "veth0" counter masquerade
}
}
table ip6 nat {
chain postrouting {
type nat hook postrouting priority 0; policy accept;
ip6 nexthdr icmpv6 oifname "veth0" counter masquerade
}
}
EOF
ip netns exec nsrouter2 ip link set eth1 mtu 1280
ip netns exec nsclient2 ip link set veth0 mtu 1280
sleep 1
ip netns exec nsclient1 ping -c 1 -s 1000 -q -M do 192.168.2.2 >/dev/null
if [ $? -ne 0 ]; then
echo "ERROR: netns ip routing/connectivity broken" 1>&2
cleanup
exit 1
fi
ip netns exec nsclient1 ping6 -q -c 1 -s 1000 dead:2::2 >/dev/null
if [ $? -ne 0 ]; then
echo "ERROR: netns ipv6 routing/connectivity broken" 1>&2
cleanup
exit 1
fi
check_unknown
if [ $? -ne 0 ]; then
ret=1
fi
expect="packets 0 bytes 0"
for netns in nsrouter1 nsrouter2 nsclient1;do
check_counter "$netns" "related" "$expect"
if [ $? -ne 0 ]; then
ret=1
fi
done
expect="packets 2 bytes 2076"
check_counter nsclient2 "new" "$expect"
if [ $? -ne 0 ]; then
ret=1
fi
ip netns exec nsclient1 ping -q -c 1 -s 1300 -M do 192.168.2.2 > /dev/null
if [ $? -eq 0 ]; then
echo "ERROR: ping should have failed with PMTU too big error" 1>&2
ret=1
fi
# nsrouter2 should have generated the icmp error, so
# related counter should be 0 (it's in forward).
expect="packets 0 bytes 0"
check_counter "nsrouter2" "related" "$expect"
if [ $? -ne 0 ]; then
ret=1
fi
# but nsrouter1 should have seen it, same for nsclient1.
expect="packets 1 bytes 576"
for netns in nsrouter1 nsclient1;do
check_counter "$netns" "related" "$expect"
if [ $? -ne 0 ]; then
ret=1
fi
done
ip netns exec nsclient1 ping6 -c 1 -s 1300 dead:2::2 > /dev/null
if [ $? -eq 0 ]; then
echo "ERROR: ping6 should have failed with PMTU too big error" 1>&2
ret=1
fi
expect="packets 2 bytes 1856"
for netns in nsrouter1 nsclient1;do
check_counter "$netns" "related" "$expect"
if [ $? -ne 0 ]; then
ret=1
fi
done
if [ $ret -eq 0 ];then
echo "PASS: icmp mtu error had RELATED state"
else
echo "ERROR: icmp error RELATED state test has failed"
fi
cleanup
exit $ret
...@@ -321,6 +321,7 @@ EOF ...@@ -321,6 +321,7 @@ EOF
test_masquerade6() test_masquerade6()
{ {
local natflags=$1
local lret=0 local lret=0
ip netns exec ns0 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null ip netns exec ns0 sysctl net.ipv6.conf.all.forwarding=1 > /dev/null
...@@ -354,13 +355,13 @@ ip netns exec ns0 nft -f - <<EOF ...@@ -354,13 +355,13 @@ ip netns exec ns0 nft -f - <<EOF
table ip6 nat { table ip6 nat {
chain postrouting { chain postrouting {
type nat hook postrouting priority 0; policy accept; type nat hook postrouting priority 0; policy accept;
meta oif veth0 masquerade meta oif veth0 masquerade $natflags
} }
} }
EOF EOF
ip netns exec ns2 ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1 ip netns exec ns2 ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1
if [ $? -ne 0 ] ; then if [ $? -ne 0 ] ; then
echo "ERROR: cannot ping ns1 from ns2 with active ipv6 masquerading" echo "ERROR: cannot ping ns1 from ns2 with active ipv6 masquerade $natflags"
lret=1 lret=1
fi fi
...@@ -397,19 +398,26 @@ EOF ...@@ -397,19 +398,26 @@ EOF
fi fi
done done
ip netns exec ns2 ping -q -c 1 dead:1::99 > /dev/null # ping ns2->ns1
if [ $? -ne 0 ] ; then
echo "ERROR: cannot ping ns1 from ns2 with active ipv6 masquerade $natflags (attempt 2)"
lret=1
fi
ip netns exec ns0 nft flush chain ip6 nat postrouting ip netns exec ns0 nft flush chain ip6 nat postrouting
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "ERROR: Could not flush ip6 nat postrouting" 1>&2 echo "ERROR: Could not flush ip6 nat postrouting" 1>&2
lret=1 lret=1
fi fi
test $lret -eq 0 && echo "PASS: IPv6 masquerade for ns2" test $lret -eq 0 && echo "PASS: IPv6 masquerade $natflags for ns2"
return $lret return $lret
} }
test_masquerade() test_masquerade()
{ {
local natflags=$1
local lret=0 local lret=0
ip netns exec ns0 sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null ip netns exec ns0 sysctl net.ipv4.conf.veth0.forwarding=1 > /dev/null
...@@ -417,7 +425,7 @@ test_masquerade() ...@@ -417,7 +425,7 @@ test_masquerade()
ip netns exec ns2 ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 ip netns exec ns2 ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1
if [ $? -ne 0 ] ; then if [ $? -ne 0 ] ; then
echo "ERROR: canot ping ns1 from ns2" echo "ERROR: cannot ping ns1 from ns2 $natflags"
lret=1 lret=1
fi fi
...@@ -443,13 +451,13 @@ ip netns exec ns0 nft -f - <<EOF ...@@ -443,13 +451,13 @@ ip netns exec ns0 nft -f - <<EOF
table ip nat { table ip nat {
chain postrouting { chain postrouting {
type nat hook postrouting priority 0; policy accept; type nat hook postrouting priority 0; policy accept;
meta oif veth0 masquerade meta oif veth0 masquerade $natflags
} }
} }
EOF EOF
ip netns exec ns2 ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1 ip netns exec ns2 ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1
if [ $? -ne 0 ] ; then if [ $? -ne 0 ] ; then
echo "ERROR: cannot ping ns1 from ns2 with active ip masquerading" echo "ERROR: cannot ping ns1 from ns2 with active ip masquere $natflags"
lret=1 lret=1
fi fi
...@@ -485,13 +493,19 @@ EOF ...@@ -485,13 +493,19 @@ EOF
fi fi
done done
ip netns exec ns2 ping -q -c 1 10.0.1.99 > /dev/null # ping ns2->ns1
if [ $? -ne 0 ] ; then
echo "ERROR: cannot ping ns1 from ns2 with active ip masquerade $natflags (attempt 2)"
lret=1
fi
ip netns exec ns0 nft flush chain ip nat postrouting ip netns exec ns0 nft flush chain ip nat postrouting
if [ $? -ne 0 ]; then if [ $? -ne 0 ]; then
echo "ERROR: Could not flush nat postrouting" 1>&2 echo "ERROR: Could not flush nat postrouting" 1>&2
lret=1 lret=1
fi fi
test $lret -eq 0 && echo "PASS: IP masquerade for ns2" test $lret -eq 0 && echo "PASS: IP masquerade $natflags for ns2"
return $lret return $lret
} }
...@@ -750,8 +764,12 @@ test_local_dnat ...@@ -750,8 +764,12 @@ test_local_dnat
test_local_dnat6 test_local_dnat6
reset_counters reset_counters
test_masquerade test_masquerade ""
test_masquerade6 test_masquerade6 ""
reset_counters
test_masquerade "fully-random"
test_masquerade6 "fully-random"
reset_counters reset_counters
test_redirect test_redirect
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment