Commit e8ed77df authored by David S. Miller

Merge git://git.kernel.org/pub/scm/linux/kernel/git/pablo/nf-next

Pablo Neira Ayuso says:

====================
Netfilter updates for net-next

The following large patchset contains Netfilter updates for your
net-next tree. My initial intention was to send this in two batches,
but by the time I looked at it again the backlog had already piled up.

Several updates for IPVS from Marco Angaroni:

1) Allow SIP connections originating from real-servers to be load
   balanced by the SIP persistence engine as is already implemented
   in the other direction.

2) Release connections immediately for One-packet scheduling (OPS)
   in IPVS, instead of deferring this to the timer and RCU callback
   (see the sketch below).

3) Skip deleting the conntrack for every packet in OPS mode, and don't
   call nf_conntrack_alter_reply() since no reply is expected.

4) Enable drop on exhaustion for OPS + SIP persistence.
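
For reference, the heart of change 2) is a new fast path in
ip_vs_conn_put(); this is a condensed sketch of the hunk further down
in the diff, trimmed to the decision that matters:

    /* OPS fast release: a one-packet-scheduling connection whose last
     * reference is being dropped and whose timer is not pending gets
     * expired right away instead of going through the timer plus RCU
     * callback round-trip.
     */
    void ip_vs_conn_put(struct ip_vs_conn *cp)
    {
            if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) &&
                (atomic_read(&cp->refcnt) == 1) &&
                !timer_pending(&cp->timer))
                    __ip_vs_conn_put_notimer(cp);  /* expire immediately */
            else
                    __ip_vs_conn_put_timer(cp);    /* re-arm timer as before */
    }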

Miscellaneous conntrack updates from Florian Westphal, including a fix
for hash resizing:

5) Move conntrack generation counter out of conntrack pernet structure
   since this is only used by the init_ns to allow hash resizing.

6) Use get_random_once() from the packet path to collect the hash random
   seed, instead of open-coding the initialization ourselves (see the
   sketch below).

7) Don't disable BH from ____nf_conntrack_find() for statistics,
   use NF_CT_STAT_INC_ATOMIC() instead.

8) Fix lookup race during conntrack hash resizing.

9) Introduce clash resolution on conntrack insertion for connectionless
   protocols.
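
A minimal sketch of the seeding pattern used in 6), modelled on the
nf_ct_expect_dst_hash() and hash_by_src() changes further down; the
function and variable names here are illustrative only:

    #include <linux/kernel.h>
    #include <linux/jhash.h>
    #include <linux/once.h>
    #include <net/netns/hash.h>

    static unsigned int example_hashrnd __read_mostly;  /* illustrative */

    static unsigned int example_hash(const struct net *net, u32 key,
                                     unsigned int hsize)
    {
            unsigned int seed;

            /* Seed is picked up lazily from the packet path;
             * get_random_once() makes sure it is set up exactly once.
             */
            get_random_once(&example_hashrnd, sizeof(example_hashrnd));

            /* Mix in the netns so that namespaces sharing one table
             * (see 11) below) still hash differently.
             */
            seed = example_hashrnd ^ net_hash_mix(net);

            return reciprocal_scale(jhash_1word(key, seed), hsize);
    }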

Then, Florian's netns rework to get rid of the per-netns conntrack
tables, so that one single table is used for all of them. There was
consensus on this change during the NFWS 2015 and, on top of that, the
per-netns tables have recently been pointed out as a source of multiple
problems with unprivileged netns:

11) Use a single conntrack hashtable for all namespaces. Include netns
    in object comparisons and make it part of the hash calculation
    (see the sketch below). Adapt early_drop() to consider netns.

12) Use single expectation and NAT hashtable for all namespaces.

13) Use a single slab cache for all namespaces for conntrack objects.

14) Skip full table scanning from nf_ct_iterate_cleanup() if the pernet
    conntrack counter tells us the table is empty (ie. equals zero).
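
Because all namespaces now share the tables, every lookup compares the
netns explicitly. This is the shape of the comparison helper added for
expectations (taken, slightly condensed, from the nf_ct_exp_equal()
hunk further down):

    static bool
    nf_ct_exp_equal(const struct nf_conntrack_tuple *tuple,
                    const struct nf_conntrack_expect *i,
                    const struct nf_conntrack_zone *zone,
                    const struct net *net)
    {
            return nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
                   net_eq(net, nf_ct_net(i->master)) &&   /* same netns */
                   nf_ct_zone_equal_any(i->master, zone); /* same zone  */
    }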

Fixes for nf_tables interval set element handling, support for setting
conntrack connlabels, and set names of up to 32 bytes:

15) Parse element flags from the element deletion path and pass them up
    to the backend set implementation (see the sketch below).

16) Allow adjacent intervals in the rbtree set type for dynamic interval
    updates.

17) Add support to set connlabel from nf_tables, from Florian Westphal.

18) Allow set names up to 32 bytes in nf_tables.
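
The flag handling from 15) is factored into a small helper so that the
add and delete paths validate NFTA_SET_ELEM_FLAGS the same way;
condensed from the nft_setelem_parse_flags() hunk further down:

    static int nft_setelem_parse_flags(const struct nft_set *set,
                                       const struct nlattr *attr, u32 *flags)
    {
            if (attr == NULL)
                    return 0;

            *flags = ntohl(nla_get_be32(attr));
            /* Only the interval-end flag is accepted here ... */
            if (*flags & ~NFT_SET_ELEM_INTERVAL_END)
                    return -EINVAL;
            /* ... and only on interval sets. */
            if (!(set->flags & NFT_SET_INTERVAL) &&
                *flags & NFT_SET_ELEM_INTERVAL_END)
                    return -EINVAL;

            return 0;
    }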

Several x_tables fixes and updates:

19) Fix incorrect use of IS_ERR_VALUE() in x_tables, original patch
    from Andrzej Hajda.

And finally, miscellaneous netfilter updates such as:

20) Disable automatic helper assignment by default. Note that this proc
    knob was introduced by commit a9006892 ("netfilter: nf_ct_helper:
    allow to disable automatic helper assignment") 4 years ago to start
    moving towards explicit conntrack helper configuration via the
    iptables CT target (see the example below).

21) Get rid of obsolete and inconsistent debugging instrumentation
    in x_tables.

22) Remove unnecessary check for null after ip6_route_output().
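
With 20) in place helpers are no longer attached automatically; they
are expected to be assigned explicitly from the ruleset, typically with
something along the lines of
"iptables -t raw -A PREROUTING -p tcp --dport 21 -j CT --helper ftp"
(an illustrative rule using the FTP helper). The old behaviour is still
available through the existing knob, only its default flips, as the
relevant hunk further down shows:

    /* Condensed from the nf_conntrack_helper hunk: the module parameter
     * (and its matching proc/sysctl knob) is kept, but automatic helper
     * assignment is now off by default.
     */
    static bool nf_ct_auto_assign_helper __read_mostly = false;
    module_param_named(nf_conntrack_helper, nf_ct_auto_assign_helper, bool, 0644);
    MODULE_PARM_DESC(nf_conntrack_helper,
                     "Enable automatic conntrack helper assignment (default 0)");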
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents e26522cd 0c5366b3
......@@ -380,16 +380,16 @@ static inline unsigned long ifname_compare_aligned(const char *_a,
* allows us to return 0 for single core systems without forcing
* callers to deal with SMP vs. NONSMP issues.
*/
static inline u64 xt_percpu_counter_alloc(void)
static inline unsigned long xt_percpu_counter_alloc(void)
{
if (nr_cpu_ids > 1) {
void __percpu *res = __alloc_percpu(sizeof(struct xt_counters),
sizeof(struct xt_counters));
if (res == NULL)
return (u64) -ENOMEM;
return -ENOMEM;
return (u64) (__force unsigned long) res;
return (__force unsigned long) res;
}
return 0;
......
......@@ -731,6 +731,12 @@ struct ip_vs_pe {
u32 (*hashkey_raw)(const struct ip_vs_conn_param *p, u32 initval,
bool inverse);
int (*show_pe_data)(const struct ip_vs_conn *cp, char *buf);
/* create connections for real-server outgoing packets */
struct ip_vs_conn* (*conn_out)(struct ip_vs_service *svc,
struct ip_vs_dest *dest,
struct sk_buff *skb,
const struct ip_vs_iphdr *iph,
__be16 dport, __be16 cport);
};
/* The application module object (a.k.a. app incarnation) */
......@@ -874,6 +880,7 @@ struct netns_ipvs {
/* Service counters */
atomic_t ftpsvc_counter;
atomic_t nullsvc_counter;
atomic_t conn_out_counter;
#ifdef CONFIG_SYSCTL
/* 1/rate drop and drop-entry variables */
......@@ -1147,6 +1154,12 @@ static inline int sysctl_cache_bypass(struct netns_ipvs *ipvs)
*/
const char *ip_vs_proto_name(unsigned int proto);
void ip_vs_init_hash_table(struct list_head *table, int rows);
struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc,
struct ip_vs_dest *dest,
struct sk_buff *skb,
const struct ip_vs_iphdr *iph,
__be16 dport,
__be16 cport);
#define IP_VS_INIT_HASH_TABLE(t) ip_vs_init_hash_table((t), ARRAY_SIZE((t)))
#define IP_VS_APP_TYPE_FTP 1
......@@ -1378,6 +1391,10 @@ ip_vs_service_find(struct netns_ipvs *ipvs, int af, __u32 fwmark, __u16 protocol
bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol,
const union nf_inet_addr *daddr, __be16 dport);
struct ip_vs_dest *
ip_vs_find_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol,
const union nf_inet_addr *daddr, __be16 dport);
int ip_vs_use_count_inc(void);
void ip_vs_use_count_dec(void);
int ip_vs_register_nl_ioctl(void);
......
......@@ -289,8 +289,6 @@ struct kernel_param;
int nf_conntrack_set_hashsize(const char *val, struct kernel_param *kp);
extern unsigned int nf_conntrack_htable_size;
extern unsigned int nf_conntrack_max;
extern unsigned int nf_conntrack_hash_rnd;
void init_nf_conntrack_hash_rnd(void);
struct nf_conn *nf_ct_tmpl_alloc(struct net *net,
const struct nf_conntrack_zone *zone,
......
......@@ -81,6 +81,7 @@ print_tuple(struct seq_file *s, const struct nf_conntrack_tuple *tuple,
#define CONNTRACK_LOCKS 1024
extern struct hlist_nulls_head *nf_conntrack_hash;
extern spinlock_t nf_conntrack_locks[CONNTRACK_LOCKS];
void nf_conntrack_lock(spinlock_t *lock);
......
......@@ -10,6 +10,7 @@
extern unsigned int nf_ct_expect_hsize;
extern unsigned int nf_ct_expect_max;
extern struct hlist_head *nf_ct_expect_hash;
struct nf_conntrack_expect {
/* Conntrack expectation list member */
......
......@@ -23,6 +23,9 @@ struct nf_conntrack_l4proto {
/* L4 Protocol number. */
u_int8_t l4proto;
/* Resolve clashes on insertion races. */
bool allow_clash;
/* Try to fill in the third arg: dataoff is offset past network protocol
hdr. Return true if possible. */
bool (*pkt_to_tuple)(const struct sk_buff *skb, unsigned int dataoff,
......
......@@ -303,7 +303,7 @@ void nft_unregister_set(struct nft_set_ops *ops);
struct nft_set {
struct list_head list;
struct list_head bindings;
char name[IFNAMSIZ];
char name[NFT_SET_MAXNAMELEN];
u32 ktype;
u32 dtype;
u32 size;
......
......@@ -84,7 +84,6 @@ struct netns_ct {
struct ctl_table_header *event_sysctl_header;
struct ctl_table_header *helper_sysctl_header;
#endif
char *slabname;
unsigned int sysctl_log_invalid; /* Log invalid packets */
int sysctl_events;
int sysctl_acct;
......@@ -93,11 +92,6 @@ struct netns_ct {
int sysctl_tstamp;
int sysctl_checksum;
unsigned int htable_size;
seqcount_t generation;
struct kmem_cache *nf_conntrack_cachep;
struct hlist_nulls_head *hash;
struct hlist_head *expect_hash;
struct ct_pcpu __percpu *pcpu_lists;
struct ip_conntrack_stat __percpu *stat;
struct nf_ct_event_notifier __rcu *nf_conntrack_event_cb;
......@@ -107,9 +101,5 @@ struct netns_ct {
unsigned int labels_used;
u8 label_words;
#endif
#ifdef CONFIG_NF_NAT_NEEDED
struct hlist_head *nat_bysource;
unsigned int nat_htable_size;
#endif
};
#endif
......@@ -3,6 +3,7 @@
#define NFT_TABLE_MAXNAMELEN 32
#define NFT_CHAIN_MAXNAMELEN 32
#define NFT_SET_MAXNAMELEN 32
#define NFT_USERDATA_MAXLEN 256
/**
......
......@@ -360,7 +360,7 @@ static int ipv4_init_net(struct net *net)
in->ctl_table[0].data = &nf_conntrack_max;
in->ctl_table[1].data = &net->ct.count;
in->ctl_table[2].data = &net->ct.htable_size;
in->ctl_table[2].data = &nf_conntrack_htable_size;
in->ctl_table[3].data = &net->ct.sysctl_checksum;
in->ctl_table[4].data = &net->ct.sysctl_log_invalid;
#endif
......
......@@ -31,15 +31,14 @@ struct ct_iter_state {
static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
{
struct net *net = seq_file_net(seq);
struct ct_iter_state *st = seq->private;
struct hlist_nulls_node *n;
for (st->bucket = 0;
st->bucket < net->ct.htable_size;
st->bucket < nf_conntrack_htable_size;
st->bucket++) {
n = rcu_dereference(
hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket]));
if (!is_a_nulls(n))
return n;
}
......@@ -49,17 +48,16 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
struct hlist_nulls_node *head)
{
struct net *net = seq_file_net(seq);
struct ct_iter_state *st = seq->private;
head = rcu_dereference(hlist_nulls_next_rcu(head));
while (is_a_nulls(head)) {
if (likely(get_nulls_value(head) == st->bucket)) {
if (++st->bucket >= net->ct.htable_size)
if (++st->bucket >= nf_conntrack_htable_size)
return NULL;
}
head = rcu_dereference(
hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket]));
}
return head;
}
......@@ -114,6 +112,23 @@ static inline void ct_show_secctx(struct seq_file *s, const struct nf_conn *ct)
}
#endif
static bool ct_seq_should_skip(const struct nf_conn *ct,
const struct net *net,
const struct nf_conntrack_tuple_hash *hash)
{
/* we only want to print DIR_ORIGINAL */
if (NF_CT_DIRECTION(hash))
return true;
if (nf_ct_l3num(ct) != AF_INET)
return true;
if (!net_eq(nf_ct_net(ct), net))
return true;
return false;
}
static int ct_seq_show(struct seq_file *s, void *v)
{
struct nf_conntrack_tuple_hash *hash = v;
......@@ -123,14 +138,15 @@ static int ct_seq_show(struct seq_file *s, void *v)
int ret = 0;
NF_CT_ASSERT(ct);
if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
if (ct_seq_should_skip(ct, seq_file_net(s), hash))
return 0;
if (unlikely(!atomic_inc_not_zero(&ct->ct_general.use)))
return 0;
/* we only want to print DIR_ORIGINAL */
if (NF_CT_DIRECTION(hash))
goto release;
if (nf_ct_l3num(ct) != AF_INET)
/* check if we raced w. object reuse */
if (!nf_ct_is_confirmed(ct) ||
ct_seq_should_skip(ct, seq_file_net(s), hash))
goto release;
l3proto = __nf_ct_l3proto_find(nf_ct_l3num(ct));
......@@ -220,13 +236,12 @@ struct ct_expect_iter_state {
static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
{
struct net *net = seq_file_net(seq);
struct ct_expect_iter_state *st = seq->private;
struct hlist_node *n;
for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
n = rcu_dereference(
hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
hlist_first_rcu(&nf_ct_expect_hash[st->bucket]));
if (n)
return n;
}
......@@ -236,7 +251,6 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
struct hlist_node *head)
{
struct net *net = seq_file_net(seq);
struct ct_expect_iter_state *st = seq->private;
head = rcu_dereference(hlist_next_rcu(head));
......@@ -244,7 +258,7 @@ static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
if (++st->bucket >= nf_ct_expect_hsize)
return NULL;
head = rcu_dereference(
hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
hlist_first_rcu(&nf_ct_expect_hash[st->bucket]));
}
return head;
}
......@@ -285,6 +299,9 @@ static int exp_seq_show(struct seq_file *s, void *v)
exp = hlist_entry(n, struct nf_conntrack_expect, hnode);
if (!net_eq(nf_ct_net(exp->master), seq_file_net(s)))
return 0;
if (exp->tuple.src.l3num != AF_INET)
return 0;
......
......@@ -60,7 +60,7 @@ synproxy_send_tcp(struct net *net,
fl6.fl6_dport = nth->dest;
security_skb_classify_flow((struct sk_buff *)skb, flowi6_to_flowi(&fl6));
dst = ip6_route_output(net, NULL, &fl6);
if (dst == NULL || dst->error) {
if (dst->error) {
dst_release(dst);
goto free_nskb;
}
......
......@@ -104,6 +104,7 @@ static inline void ct_write_unlock_bh(unsigned int key)
spin_unlock_bh(&__ip_vs_conntbl_lock_array[key&CT_LOCKARRAY_MASK].l);
}
static void ip_vs_conn_expire(unsigned long data);
/*
* Returns hash value for IPVS connection entry
......@@ -453,10 +454,16 @@ ip_vs_conn_out_get_proto(struct netns_ipvs *ipvs, int af,
}
EXPORT_SYMBOL_GPL(ip_vs_conn_out_get_proto);
static void __ip_vs_conn_put_notimer(struct ip_vs_conn *cp)
{
__ip_vs_conn_put(cp);
ip_vs_conn_expire((unsigned long)cp);
}
/*
* Put back the conn and restart its timer with its timeout
*/
void ip_vs_conn_put(struct ip_vs_conn *cp)
static void __ip_vs_conn_put_timer(struct ip_vs_conn *cp)
{
unsigned long t = (cp->flags & IP_VS_CONN_F_ONE_PACKET) ?
0 : cp->timeout;
......@@ -465,6 +472,16 @@ void ip_vs_conn_put(struct ip_vs_conn *cp)
__ip_vs_conn_put(cp);
}
void ip_vs_conn_put(struct ip_vs_conn *cp)
{
if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) &&
(atomic_read(&cp->refcnt) == 1) &&
!timer_pending(&cp->timer))
/* expire connection immediately */
__ip_vs_conn_put_notimer(cp);
else
__ip_vs_conn_put_timer(cp);
}
/*
* Fill a no_client_port connection with a client port number
......@@ -819,7 +836,8 @@ static void ip_vs_conn_expire(unsigned long data)
if (cp->control)
ip_vs_control_del(cp);
if (cp->flags & IP_VS_CONN_F_NFCT) {
if ((cp->flags & IP_VS_CONN_F_NFCT) &&
!(cp->flags & IP_VS_CONN_F_ONE_PACKET)) {
/* Do not access conntracks during subsys cleanup
* because nf_conntrack_find_get can not be used after
* conntrack cleanup for the net.
......@@ -834,7 +852,10 @@ static void ip_vs_conn_expire(unsigned long data)
ip_vs_unbind_dest(cp);
if (cp->flags & IP_VS_CONN_F_NO_CPORT)
atomic_dec(&ip_vs_conn_no_cport_cnt);
call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free);
if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
ip_vs_conn_rcu_free(&cp->rcu_head);
else
call_rcu(&cp->rcu_head, ip_vs_conn_rcu_free);
atomic_dec(&ipvs->conn_count);
return;
}
......@@ -850,7 +871,7 @@ static void ip_vs_conn_expire(unsigned long data)
if (ipvs->sync_state & IP_VS_STATE_MASTER)
ip_vs_sync_conn(ipvs, cp, sysctl_sync_threshold(ipvs));
ip_vs_conn_put(cp);
__ip_vs_conn_put_timer(cp);
}
/* Modify timer, so that it expires as soon as possible.
......@@ -1240,6 +1261,16 @@ static inline int todrop_entry(struct ip_vs_conn *cp)
return 1;
}
static inline bool ip_vs_conn_ops_mode(struct ip_vs_conn *cp)
{
struct ip_vs_service *svc;
if (!cp->dest)
return false;
svc = rcu_dereference(cp->dest->svc);
return svc && (svc->flags & IP_VS_SVC_F_ONEPACKET);
}
/* Called from keventd and must protect itself from softirqs */
void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
{
......@@ -1254,11 +1285,16 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
unsigned int hash = prandom_u32() & ip_vs_conn_tab_mask;
hlist_for_each_entry_rcu(cp, &ip_vs_conn_tab[hash], c_list) {
if (cp->flags & IP_VS_CONN_F_TEMPLATE)
/* connection template */
continue;
if (cp->ipvs != ipvs)
continue;
if (cp->flags & IP_VS_CONN_F_TEMPLATE) {
if (atomic_read(&cp->n_control) ||
!ip_vs_conn_ops_mode(cp))
continue;
else
/* connection template of OPS */
goto try_drop;
}
if (cp->protocol == IPPROTO_TCP) {
switch(cp->state) {
case IP_VS_TCP_S_SYN_RECV:
......@@ -1286,6 +1322,7 @@ void ip_vs_random_dropentry(struct netns_ipvs *ipvs)
continue;
}
} else {
try_drop:
if (!todrop_entry(cp))
continue;
}
......
......@@ -68,6 +68,7 @@ EXPORT_SYMBOL(ip_vs_conn_put);
#ifdef CONFIG_IP_VS_DEBUG
EXPORT_SYMBOL(ip_vs_get_debug_level);
#endif
EXPORT_SYMBOL(ip_vs_new_conn_out);
static int ip_vs_net_id __read_mostly;
/* netns cnt used for uniqueness */
......@@ -611,7 +612,10 @@ int ip_vs_leave(struct ip_vs_service *svc, struct sk_buff *skb,
ret = cp->packet_xmit(skb, cp, pd->pp, iph);
/* do not touch skb anymore */
atomic_inc(&cp->in_pkts);
if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control)
atomic_inc(&cp->control->in_pkts);
else
atomic_inc(&cp->in_pkts);
ip_vs_conn_put(cp);
return ret;
}
......@@ -1100,6 +1104,143 @@ static inline bool is_new_conn_expected(const struct ip_vs_conn *cp,
}
}
/* Generic function to create new connections for outgoing RS packets
*
* Pre-requisites for successful connection creation:
* 1) Virtual Service is NOT fwmark based:
* In fwmark-VS actual vaddr and vport are unknown to IPVS
* 2) Real Server and Virtual Service were NOT configured without port:
* This is to allow match of different VS to the same RS ip-addr
*/
struct ip_vs_conn *ip_vs_new_conn_out(struct ip_vs_service *svc,
struct ip_vs_dest *dest,
struct sk_buff *skb,
const struct ip_vs_iphdr *iph,
__be16 dport,
__be16 cport)
{
struct ip_vs_conn_param param;
struct ip_vs_conn *ct = NULL, *cp = NULL;
const union nf_inet_addr *vaddr, *daddr, *caddr;
union nf_inet_addr snet;
__be16 vport;
unsigned int flags;
EnterFunction(12);
vaddr = &svc->addr;
vport = svc->port;
daddr = &iph->saddr;
caddr = &iph->daddr;
/* check pre-requisites are satisfied */
if (svc->fwmark)
return NULL;
if (!vport || !dport)
return NULL;
/* for persistent service first create connection template */
if (svc->flags & IP_VS_SVC_F_PERSISTENT) {
/* apply netmask the same way ingress-side does */
#ifdef CONFIG_IP_VS_IPV6
if (svc->af == AF_INET6)
ipv6_addr_prefix(&snet.in6, &caddr->in6,
(__force __u32)svc->netmask);
else
#endif
snet.ip = caddr->ip & svc->netmask;
/* fill params and create template if not existent */
if (ip_vs_conn_fill_param_persist(svc, skb, iph->protocol,
&snet, 0, vaddr,
vport, &param) < 0)
return NULL;
ct = ip_vs_ct_in_get(&param);
if (!ct) {
ct = ip_vs_conn_new(&param, dest->af, daddr, dport,
IP_VS_CONN_F_TEMPLATE, dest, 0);
if (!ct) {
kfree(param.pe_data);
return NULL;
}
ct->timeout = svc->timeout;
} else {
kfree(param.pe_data);
}
}
/* connection flags */
flags = ((svc->flags & IP_VS_SVC_F_ONEPACKET) &&
iph->protocol == IPPROTO_UDP) ? IP_VS_CONN_F_ONE_PACKET : 0;
/* create connection */
ip_vs_conn_fill_param(svc->ipvs, svc->af, iph->protocol,
caddr, cport, vaddr, vport, &param);
cp = ip_vs_conn_new(&param, dest->af, daddr, dport, flags, dest, 0);
if (!cp) {
if (ct)
ip_vs_conn_put(ct);
return NULL;
}
if (ct) {
ip_vs_control_add(cp, ct);
ip_vs_conn_put(ct);
}
ip_vs_conn_stats(cp, svc);
/* return connection (will be used to handle outgoing packet) */
IP_VS_DBG_BUF(6, "New connection RS-initiated:%c c:%s:%u v:%s:%u "
"d:%s:%u conn->flags:%X conn->refcnt:%d\n",
ip_vs_fwd_tag(cp),
IP_VS_DBG_ADDR(cp->af, &cp->caddr), ntohs(cp->cport),
IP_VS_DBG_ADDR(cp->af, &cp->vaddr), ntohs(cp->vport),
IP_VS_DBG_ADDR(cp->af, &cp->daddr), ntohs(cp->dport),
cp->flags, atomic_read(&cp->refcnt));
LeaveFunction(12);
return cp;
}
/* Handle outgoing packets which are considered requests initiated by
* real servers, so that subsequent responses from external client can be
* routed to the right real server.
* Used also for outgoing responses in OPS mode.
*
* Connection management is handled by persistent-engine specific callback.
*/
static struct ip_vs_conn *__ip_vs_rs_conn_out(unsigned int hooknum,
struct netns_ipvs *ipvs,
int af, struct sk_buff *skb,
const struct ip_vs_iphdr *iph)
{
struct ip_vs_dest *dest;
struct ip_vs_conn *cp = NULL;
__be16 _ports[2], *pptr;
if (hooknum == NF_INET_LOCAL_IN)
return NULL;
pptr = frag_safe_skb_hp(skb, iph->len,
sizeof(_ports), _ports, iph);
if (!pptr)
return NULL;
rcu_read_lock();
dest = ip_vs_find_real_service(ipvs, af, iph->protocol,
&iph->saddr, pptr[0]);
if (dest) {
struct ip_vs_service *svc;
struct ip_vs_pe *pe;
svc = rcu_dereference(dest->svc);
if (svc) {
pe = rcu_dereference(svc->pe);
if (pe && pe->conn_out)
cp = pe->conn_out(svc, dest, skb, iph,
pptr[0], pptr[1]);
}
}
rcu_read_unlock();
return cp;
}
/* Handle response packets: rewrite addresses and send away...
*/
static unsigned int
......@@ -1245,6 +1386,22 @@ ip_vs_out(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, in
if (likely(cp))
return handle_response(af, skb, pd, cp, &iph, hooknum);
/* Check for real-server-started requests */
if (atomic_read(&ipvs->conn_out_counter)) {
/* Currently only for UDP:
* connection oriented protocols typically use
* ephemeral ports for outgoing connections, so
* related incoming responses would not match any VS
*/
if (pp->protocol == IPPROTO_UDP) {
cp = __ip_vs_rs_conn_out(hooknum, ipvs, af, skb, &iph);
if (likely(cp))
return handle_response(af, skb, pd, cp, &iph,
hooknum);
}
}
if (sysctl_nat_icmp_send(ipvs) &&
(pp->protocol == IPPROTO_TCP ||
pp->protocol == IPPROTO_UDP ||
......@@ -1837,6 +1994,9 @@ ip_vs_in(struct netns_ipvs *ipvs, unsigned int hooknum, struct sk_buff *skb, int
if (ipvs->sync_state & IP_VS_STATE_MASTER)
ip_vs_sync_conn(ipvs, cp, pkts);
else if ((cp->flags & IP_VS_CONN_F_ONE_PACKET) && cp->control)
/* increment is done inside ip_vs_sync_conn too */
atomic_inc(&cp->control->in_pkts);
ip_vs_conn_put(cp);
return ret;
......
......@@ -567,6 +567,36 @@ bool ip_vs_has_real_service(struct netns_ipvs *ipvs, int af, __u16 protocol,
return false;
}
/* Find real service record by <proto,addr,port>.
* In case of multiple records with the same <proto,addr,port>, only
* the first found record is returned.
*
* To be called under RCU lock.
*/
struct ip_vs_dest *ip_vs_find_real_service(struct netns_ipvs *ipvs, int af,
__u16 protocol,
const union nf_inet_addr *daddr,
__be16 dport)
{
unsigned int hash;
struct ip_vs_dest *dest;
/* Check for "full" addressed entries */
hash = ip_vs_rs_hashkey(af, daddr, dport);
hlist_for_each_entry_rcu(dest, &ipvs->rs_table[hash], d_list) {
if (dest->port == dport &&
dest->af == af &&
ip_vs_addr_equal(af, &dest->addr, daddr) &&
(dest->protocol == protocol || dest->vfwmark)) {
/* HIT */
return dest;
}
}
return NULL;
}
/* Lookup destination by {addr,port} in the given service
* Called under RCU lock.
*/
......@@ -1253,6 +1283,8 @@ ip_vs_add_service(struct netns_ipvs *ipvs, struct ip_vs_service_user_kern *u,
atomic_inc(&ipvs->ftpsvc_counter);
else if (svc->port == 0)
atomic_inc(&ipvs->nullsvc_counter);
if (svc->pe && svc->pe->conn_out)
atomic_inc(&ipvs->conn_out_counter);
ip_vs_start_estimator(ipvs, &svc->stats);
......@@ -1293,6 +1325,7 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
struct ip_vs_scheduler *sched = NULL, *old_sched;
struct ip_vs_pe *pe = NULL, *old_pe = NULL;
int ret = 0;
bool new_pe_conn_out, old_pe_conn_out;
/*
* Lookup the scheduler, by 'u->sched_name'
......@@ -1355,8 +1388,16 @@ ip_vs_edit_service(struct ip_vs_service *svc, struct ip_vs_service_user_kern *u)
svc->netmask = u->netmask;
old_pe = rcu_dereference_protected(svc->pe, 1);
if (pe != old_pe)
if (pe != old_pe) {
rcu_assign_pointer(svc->pe, pe);
/* check for optional methods in new pe */
new_pe_conn_out = (pe && pe->conn_out) ? true : false;
old_pe_conn_out = (old_pe && old_pe->conn_out) ? true : false;
if (new_pe_conn_out && !old_pe_conn_out)
atomic_inc(&svc->ipvs->conn_out_counter);
if (old_pe_conn_out && !new_pe_conn_out)
atomic_dec(&svc->ipvs->conn_out_counter);
}
out:
ip_vs_scheduler_put(old_sched);
......@@ -1389,6 +1430,8 @@ static void __ip_vs_del_service(struct ip_vs_service *svc, bool cleanup)
/* Unbind persistence engine, keep svc->pe */
old_pe = rcu_dereference_protected(svc->pe, 1);
if (old_pe && old_pe->conn_out)
atomic_dec(&ipvs->conn_out_counter);
ip_vs_pe_put(old_pe);
/*
......@@ -3969,6 +4012,7 @@ int __net_init ip_vs_control_net_init(struct netns_ipvs *ipvs)
(unsigned long) ipvs);
atomic_set(&ipvs->ftpsvc_counter, 0);
atomic_set(&ipvs->nullsvc_counter, 0);
atomic_set(&ipvs->conn_out_counter, 0);
/* procfs stats */
ipvs->tot_stats.cpustats = alloc_percpu(struct ip_vs_cpu_stats);
......
......@@ -93,6 +93,10 @@ ip_vs_update_conntrack(struct sk_buff *skb, struct ip_vs_conn *cp, int outin)
if (IP_VS_FWD_METHOD(cp) != IP_VS_CONN_F_MASQ)
return;
/* Never alter conntrack for OPS conns (no reply is expected) */
if (cp->flags & IP_VS_CONN_F_ONE_PACKET)
return;
/* Alter reply only in original direction */
if (CTINFO2DIR(ctinfo) != IP_CT_DIR_ORIGINAL)
return;
......
......@@ -143,6 +143,20 @@ static int ip_vs_sip_show_pe_data(const struct ip_vs_conn *cp, char *buf)
return cp->pe_data_len;
}
static struct ip_vs_conn *
ip_vs_sip_conn_out(struct ip_vs_service *svc,
struct ip_vs_dest *dest,
struct sk_buff *skb,
const struct ip_vs_iphdr *iph,
__be16 dport,
__be16 cport)
{
if (likely(iph->protocol == IPPROTO_UDP))
return ip_vs_new_conn_out(svc, dest, skb, iph, dport, cport);
/* currently no need to handle other than UDP */
return NULL;
}
static struct ip_vs_pe ip_vs_sip_pe =
{
.name = "sip",
......@@ -153,6 +167,7 @@ static struct ip_vs_pe ip_vs_sip_pe =
.ct_match = ip_vs_sip_ct_match,
.hashkey_raw = ip_vs_sip_hashkey_raw,
.show_pe_data = ip_vs_sip_show_pe_data,
.conn_out = ip_vs_sip_conn_out,
};
static int __init ip_vs_sip_init(void)
......
......@@ -24,6 +24,7 @@
#include <linux/moduleparam.h>
#include <linux/export.h>
#include <net/net_namespace.h>
#include <net/netns/hash.h>
#include <net/netfilter/nf_conntrack.h>
#include <net/netfilter/nf_conntrack_core.h>
......@@ -35,9 +36,13 @@
unsigned int nf_ct_expect_hsize __read_mostly;
EXPORT_SYMBOL_GPL(nf_ct_expect_hsize);
struct hlist_head *nf_ct_expect_hash __read_mostly;
EXPORT_SYMBOL_GPL(nf_ct_expect_hash);
unsigned int nf_ct_expect_max __read_mostly;
static struct kmem_cache *nf_ct_expect_cachep __read_mostly;
static unsigned int nf_ct_expect_hashrnd __read_mostly;
/* nf_conntrack_expect helper functions */
void nf_ct_unlink_expect_report(struct nf_conntrack_expect *exp,
......@@ -72,21 +77,32 @@ static void nf_ct_expectation_timed_out(unsigned long ul_expect)
nf_ct_expect_put(exp);
}
static unsigned int nf_ct_expect_dst_hash(const struct nf_conntrack_tuple *tuple)
static unsigned int nf_ct_expect_dst_hash(const struct net *n, const struct nf_conntrack_tuple *tuple)
{
unsigned int hash;
unsigned int hash, seed;
if (unlikely(!nf_conntrack_hash_rnd)) {
init_nf_conntrack_hash_rnd();
}
get_random_once(&nf_ct_expect_hashrnd, sizeof(nf_ct_expect_hashrnd));
seed = nf_ct_expect_hashrnd ^ net_hash_mix(n);
hash = jhash2(tuple->dst.u3.all, ARRAY_SIZE(tuple->dst.u3.all),
(((tuple->dst.protonum ^ tuple->src.l3num) << 16) |
(__force __u16)tuple->dst.u.all) ^ nf_conntrack_hash_rnd);
(__force __u16)tuple->dst.u.all) ^ seed);
return reciprocal_scale(hash, nf_ct_expect_hsize);
}
static bool
nf_ct_exp_equal(const struct nf_conntrack_tuple *tuple,
const struct nf_conntrack_expect *i,
const struct nf_conntrack_zone *zone,
const struct net *net)
{
return nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
net_eq(net, nf_ct_net(i->master)) &&
nf_ct_zone_equal_any(i->master, zone);
}
struct nf_conntrack_expect *
__nf_ct_expect_find(struct net *net,
const struct nf_conntrack_zone *zone,
......@@ -98,10 +114,9 @@ __nf_ct_expect_find(struct net *net,
if (!net->ct.expect_count)
return NULL;
h = nf_ct_expect_dst_hash(tuple);
hlist_for_each_entry_rcu(i, &net->ct.expect_hash[h], hnode) {
if (nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
nf_ct_zone_equal_any(i->master, zone))
h = nf_ct_expect_dst_hash(net, tuple);
hlist_for_each_entry_rcu(i, &nf_ct_expect_hash[h], hnode) {
if (nf_ct_exp_equal(tuple, i, zone, net))
return i;
}
return NULL;
......@@ -139,11 +154,10 @@ nf_ct_find_expectation(struct net *net,
if (!net->ct.expect_count)
return NULL;
h = nf_ct_expect_dst_hash(tuple);
hlist_for_each_entry(i, &net->ct.expect_hash[h], hnode) {
h = nf_ct_expect_dst_hash(net, tuple);
hlist_for_each_entry(i, &nf_ct_expect_hash[h], hnode) {
if (!(i->flags & NF_CT_EXPECT_INACTIVE) &&
nf_ct_tuple_mask_cmp(tuple, &i->tuple, &i->mask) &&
nf_ct_zone_equal_any(i->master, zone)) {
nf_ct_exp_equal(tuple, i, zone, net)) {
exp = i;
break;
}
......@@ -223,6 +237,7 @@ static inline int expect_clash(const struct nf_conntrack_expect *a,
}
return nf_ct_tuple_mask_cmp(&a->tuple, &b->tuple, &intersect_mask) &&
net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) &&
nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master));
}
......@@ -232,6 +247,7 @@ static inline int expect_matches(const struct nf_conntrack_expect *a,
return a->master == b->master && a->class == b->class &&
nf_ct_tuple_equal(&a->tuple, &b->tuple) &&
nf_ct_tuple_mask_equal(&a->mask, &b->mask) &&
net_eq(nf_ct_net(a->master), nf_ct_net(b->master)) &&
nf_ct_zone_equal_any(a->master, nf_ct_zone(b->master));
}
......@@ -342,7 +358,7 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp)
struct nf_conn_help *master_help = nfct_help(exp->master);
struct nf_conntrack_helper *helper;
struct net *net = nf_ct_exp_net(exp);
unsigned int h = nf_ct_expect_dst_hash(&exp->tuple);
unsigned int h = nf_ct_expect_dst_hash(net, &exp->tuple);
/* two references : one for hash insert, one for the timer */
atomic_add(2, &exp->use);
......@@ -350,7 +366,7 @@ static int nf_ct_expect_insert(struct nf_conntrack_expect *exp)
hlist_add_head(&exp->lnode, &master_help->expectations);
master_help->expecting[exp->class]++;
hlist_add_head_rcu(&exp->hnode, &net->ct.expect_hash[h]);
hlist_add_head_rcu(&exp->hnode, &nf_ct_expect_hash[h]);
net->ct.expect_count++;
setup_timer(&exp->timeout, nf_ct_expectation_timed_out,
......@@ -401,8 +417,8 @@ static inline int __nf_ct_expect_check(struct nf_conntrack_expect *expect)
ret = -ESHUTDOWN;
goto out;
}
h = nf_ct_expect_dst_hash(&expect->tuple);
hlist_for_each_entry_safe(i, next, &net->ct.expect_hash[h], hnode) {
h = nf_ct_expect_dst_hash(net, &expect->tuple);
hlist_for_each_entry_safe(i, next, &nf_ct_expect_hash[h], hnode) {
if (expect_matches(i, expect)) {
if (del_timer(&i->timeout)) {
nf_ct_unlink_expect(i);
......@@ -468,12 +484,11 @@ struct ct_expect_iter_state {
static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
{
struct net *net = seq_file_net(seq);
struct ct_expect_iter_state *st = seq->private;
struct hlist_node *n;
for (st->bucket = 0; st->bucket < nf_ct_expect_hsize; st->bucket++) {
n = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
n = rcu_dereference(hlist_first_rcu(&nf_ct_expect_hash[st->bucket]));
if (n)
return n;
}
......@@ -483,14 +498,13 @@ static struct hlist_node *ct_expect_get_first(struct seq_file *seq)
static struct hlist_node *ct_expect_get_next(struct seq_file *seq,
struct hlist_node *head)
{
struct net *net = seq_file_net(seq);
struct ct_expect_iter_state *st = seq->private;
head = rcu_dereference(hlist_next_rcu(head));
while (head == NULL) {
if (++st->bucket >= nf_ct_expect_hsize)
return NULL;
head = rcu_dereference(hlist_first_rcu(&net->ct.expect_hash[st->bucket]));
head = rcu_dereference(hlist_first_rcu(&nf_ct_expect_hash[st->bucket]));
}
return head;
}
......@@ -623,28 +637,13 @@ module_param_named(expect_hashsize, nf_ct_expect_hsize, uint, 0400);
int nf_conntrack_expect_pernet_init(struct net *net)
{
int err = -ENOMEM;
net->ct.expect_count = 0;
net->ct.expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0);
if (net->ct.expect_hash == NULL)
goto err1;
err = exp_proc_init(net);
if (err < 0)
goto err2;
return 0;
err2:
nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize);
err1:
return err;
return exp_proc_init(net);
}
void nf_conntrack_expect_pernet_fini(struct net *net)
{
exp_proc_remove(net);
nf_ct_free_hashtable(net->ct.expect_hash, nf_ct_expect_hsize);
}
int nf_conntrack_expect_init(void)
......@@ -660,6 +659,13 @@ int nf_conntrack_expect_init(void)
0, 0, NULL);
if (!nf_ct_expect_cachep)
return -ENOMEM;
nf_ct_expect_hash = nf_ct_alloc_hashtable(&nf_ct_expect_hsize, 0);
if (!nf_ct_expect_hash) {
kmem_cache_destroy(nf_ct_expect_cachep);
return -ENOMEM;
}
return 0;
}
......@@ -667,4 +673,5 @@ void nf_conntrack_expect_fini(void)
{
rcu_barrier(); /* Wait for call_rcu() before destroy */
kmem_cache_destroy(nf_ct_expect_cachep);
nf_ct_free_hashtable(nf_ct_expect_hash, nf_ct_expect_hsize);
}
......@@ -38,10 +38,10 @@ unsigned int nf_ct_helper_hsize __read_mostly;
EXPORT_SYMBOL_GPL(nf_ct_helper_hsize);
static unsigned int nf_ct_helper_count __read_mostly;
static bool nf_ct_auto_assign_helper __read_mostly = true;
static bool nf_ct_auto_assign_helper __read_mostly = false;
module_param_named(nf_conntrack_helper, nf_ct_auto_assign_helper, bool, 0644);
MODULE_PARM_DESC(nf_conntrack_helper,
"Enable automatic conntrack helper assignment (default 1)");
"Enable automatic conntrack helper assignment (default 0)");
#ifdef CONFIG_SYSCTL
static struct ctl_table helper_sysctl_table[] = {
......@@ -400,7 +400,7 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
spin_lock_bh(&nf_conntrack_expect_lock);
for (i = 0; i < nf_ct_expect_hsize; i++) {
hlist_for_each_entry_safe(exp, next,
&net->ct.expect_hash[i], hnode) {
&nf_ct_expect_hash[i], hnode) {
struct nf_conn_help *help = nfct_help(exp->master);
if ((rcu_dereference_protected(
help->helper,
......@@ -424,10 +424,10 @@ static void __nf_conntrack_helper_unregister(struct nf_conntrack_helper *me,
spin_unlock_bh(&pcpu->lock);
}
local_bh_disable();
for (i = 0; i < net->ct.htable_size; i++) {
for (i = 0; i < nf_conntrack_htable_size; i++) {
nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
if (i < net->ct.htable_size) {
hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
if (i < nf_conntrack_htable_size) {
hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode)
unhelp(h, me);
}
spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
......
......@@ -824,19 +824,22 @@ ctnetlink_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
last = (struct nf_conn *)cb->args[1];
local_bh_disable();
for (; cb->args[0] < net->ct.htable_size; cb->args[0]++) {
for (; cb->args[0] < nf_conntrack_htable_size; cb->args[0]++) {
restart:
lockp = &nf_conntrack_locks[cb->args[0] % CONNTRACK_LOCKS];
nf_conntrack_lock(lockp);
if (cb->args[0] >= net->ct.htable_size) {
if (cb->args[0] >= nf_conntrack_htable_size) {
spin_unlock(lockp);
goto out;
}
hlist_nulls_for_each_entry(h, n, &net->ct.hash[cb->args[0]],
hnnode) {
hlist_nulls_for_each_entry(h, n, &nf_conntrack_hash[cb->args[0]],
hnnode) {
if (NF_CT_DIRECTION(h) != IP_CT_DIR_ORIGINAL)
continue;
ct = nf_ct_tuplehash_to_ctrack(h);
if (!net_eq(net, nf_ct_net(ct)))
continue;
/* Dump entries of a given L3 protocol number.
* If it is not specified, ie. l3proto == 0,
* then dump everything. */
......@@ -2629,10 +2632,14 @@ ctnetlink_exp_dump_table(struct sk_buff *skb, struct netlink_callback *cb)
last = (struct nf_conntrack_expect *)cb->args[1];
for (; cb->args[0] < nf_ct_expect_hsize; cb->args[0]++) {
restart:
hlist_for_each_entry(exp, &net->ct.expect_hash[cb->args[0]],
hlist_for_each_entry(exp, &nf_ct_expect_hash[cb->args[0]],
hnode) {
if (l3proto && exp->tuple.src.l3num != l3proto)
continue;
if (!net_eq(nf_ct_net(exp->master), net))
continue;
if (cb->args[1]) {
if (exp != last)
continue;
......@@ -2883,8 +2890,12 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl,
spin_lock_bh(&nf_conntrack_expect_lock);
for (i = 0; i < nf_ct_expect_hsize; i++) {
hlist_for_each_entry_safe(exp, next,
&net->ct.expect_hash[i],
&nf_ct_expect_hash[i],
hnode) {
if (!net_eq(nf_ct_exp_net(exp), net))
continue;
m_help = nfct_help(exp->master);
if (!strcmp(m_help->helper->name, name) &&
del_timer(&exp->timeout)) {
......@@ -2901,8 +2912,12 @@ static int ctnetlink_del_expect(struct net *net, struct sock *ctnl,
spin_lock_bh(&nf_conntrack_expect_lock);
for (i = 0; i < nf_ct_expect_hsize; i++) {
hlist_for_each_entry_safe(exp, next,
&net->ct.expect_hash[i],
&nf_ct_expect_hash[i],
hnode) {
if (!net_eq(nf_ct_exp_net(exp), net))
continue;
if (del_timer(&exp->timeout)) {
nf_ct_unlink_expect_report(exp,
NETLINK_CB(skb).portid,
......
......@@ -309,6 +309,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp4 __read_mostly =
.l3proto = PF_INET,
.l4proto = IPPROTO_UDP,
.name = "udp",
.allow_clash = true,
.pkt_to_tuple = udp_pkt_to_tuple,
.invert_tuple = udp_invert_tuple,
.print_tuple = udp_print_tuple,
......@@ -341,6 +342,7 @@ struct nf_conntrack_l4proto nf_conntrack_l4proto_udp6 __read_mostly =
.l3proto = PF_INET6,
.l4proto = IPPROTO_UDP,
.name = "udp",
.allow_clash = true,
.pkt_to_tuple = udp_pkt_to_tuple,
.invert_tuple = udp_invert_tuple,
.print_tuple = udp_print_tuple,
......
......@@ -274,6 +274,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite4 __read_mostly =
.l3proto = PF_INET,
.l4proto = IPPROTO_UDPLITE,
.name = "udplite",
.allow_clash = true,
.pkt_to_tuple = udplite_pkt_to_tuple,
.invert_tuple = udplite_invert_tuple,
.print_tuple = udplite_print_tuple,
......@@ -306,6 +307,7 @@ static struct nf_conntrack_l4proto nf_conntrack_l4proto_udplite6 __read_mostly =
.l3proto = PF_INET6,
.l4proto = IPPROTO_UDPLITE,
.name = "udplite",
.allow_clash = true,
.pkt_to_tuple = udplite_pkt_to_tuple,
.invert_tuple = udplite_invert_tuple,
.print_tuple = udplite_print_tuple,
......
......@@ -54,14 +54,13 @@ struct ct_iter_state {
static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
{
struct net *net = seq_file_net(seq);
struct ct_iter_state *st = seq->private;
struct hlist_nulls_node *n;
for (st->bucket = 0;
st->bucket < net->ct.htable_size;
st->bucket < nf_conntrack_htable_size;
st->bucket++) {
n = rcu_dereference(hlist_nulls_first_rcu(&net->ct.hash[st->bucket]));
n = rcu_dereference(hlist_nulls_first_rcu(&nf_conntrack_hash[st->bucket]));
if (!is_a_nulls(n))
return n;
}
......@@ -71,18 +70,17 @@ static struct hlist_nulls_node *ct_get_first(struct seq_file *seq)
static struct hlist_nulls_node *ct_get_next(struct seq_file *seq,
struct hlist_nulls_node *head)
{
struct net *net = seq_file_net(seq);
struct ct_iter_state *st = seq->private;
head = rcu_dereference(hlist_nulls_next_rcu(head));
while (is_a_nulls(head)) {
if (likely(get_nulls_value(head) == st->bucket)) {
if (++st->bucket >= net->ct.htable_size)
if (++st->bucket >= nf_conntrack_htable_size)
return NULL;
}
head = rcu_dereference(
hlist_nulls_first_rcu(
&net->ct.hash[st->bucket]));
&nf_conntrack_hash[st->bucket]));
}
return head;
}
......@@ -458,7 +456,7 @@ static struct ctl_table nf_ct_sysctl_table[] = {
},
{
.procname = "nf_conntrack_buckets",
.data = &init_net.ct.htable_size,
.data = &nf_conntrack_htable_size,
.maxlen = sizeof(unsigned int),
.mode = 0444,
.proc_handler = proc_dointvec,
......@@ -512,7 +510,6 @@ static int nf_conntrack_standalone_init_sysctl(struct net *net)
goto out_kmemdup;
table[1].data = &net->ct.count;
table[2].data = &net->ct.htable_size;
table[3].data = &net->ct.sysctl_checksum;
table[4].data = &net->ct.sysctl_log_invalid;
......
......@@ -38,6 +38,9 @@ static const struct nf_nat_l3proto __rcu *nf_nat_l3protos[NFPROTO_NUMPROTO]
static const struct nf_nat_l4proto __rcu **nf_nat_l4protos[NFPROTO_NUMPROTO]
__read_mostly;
static struct hlist_head *nf_nat_bysource __read_mostly;
static unsigned int nf_nat_htable_size __read_mostly;
static unsigned int nf_nat_hash_rnd __read_mostly;
inline const struct nf_nat_l3proto *
__nf_nat_l3proto_find(u8 family)
......@@ -118,15 +121,17 @@ EXPORT_SYMBOL(nf_xfrm_me_harder);
/* We keep an extra hash for each conntrack, for fast searching. */
static inline unsigned int
hash_by_src(const struct net *net, const struct nf_conntrack_tuple *tuple)
hash_by_src(const struct net *n, const struct nf_conntrack_tuple *tuple)
{
unsigned int hash;
get_random_once(&nf_nat_hash_rnd, sizeof(nf_nat_hash_rnd));
/* Original src, to ensure we map it consistently if poss. */
hash = jhash2((u32 *)&tuple->src, sizeof(tuple->src) / sizeof(u32),
tuple->dst.protonum ^ nf_conntrack_hash_rnd);
tuple->dst.protonum ^ nf_nat_hash_rnd ^ net_hash_mix(n));
return reciprocal_scale(hash, net->ct.nat_htable_size);
return reciprocal_scale(hash, nf_nat_htable_size);
}
/* Is this tuple already taken? (not by us) */
......@@ -196,9 +201,10 @@ find_appropriate_src(struct net *net,
const struct nf_conn_nat *nat;
const struct nf_conn *ct;
hlist_for_each_entry_rcu(nat, &net->ct.nat_bysource[h], bysource) {
hlist_for_each_entry_rcu(nat, &nf_nat_bysource[h], bysource) {
ct = nat->ct;
if (same_src(ct, tuple) &&
net_eq(net, nf_ct_net(ct)) &&
nf_ct_zone_equal(ct, zone, IP_CT_DIR_ORIGINAL)) {
/* Copy source part from reply tuple. */
nf_ct_invert_tuplepr(result,
......@@ -431,7 +437,7 @@ nf_nat_setup_info(struct nf_conn *ct,
nat = nfct_nat(ct);
nat->ct = ct;
hlist_add_head_rcu(&nat->bysource,
&net->ct.nat_bysource[srchash]);
&nf_nat_bysource[srchash]);
spin_unlock_bh(&nf_nat_lock);
}
......@@ -819,27 +825,14 @@ nfnetlink_parse_nat_setup(struct nf_conn *ct,
}
#endif
static int __net_init nf_nat_net_init(struct net *net)
{
/* Leave them the same for the moment. */
net->ct.nat_htable_size = net->ct.htable_size;
net->ct.nat_bysource = nf_ct_alloc_hashtable(&net->ct.nat_htable_size, 0);
if (!net->ct.nat_bysource)
return -ENOMEM;
return 0;
}
static void __net_exit nf_nat_net_exit(struct net *net)
{
struct nf_nat_proto_clean clean = {};
nf_ct_iterate_cleanup(net, nf_nat_proto_clean, &clean, 0, 0);
synchronize_rcu();
nf_ct_free_hashtable(net->ct.nat_bysource, net->ct.nat_htable_size);
}
static struct pernet_operations nf_nat_net_ops = {
.init = nf_nat_net_init,
.exit = nf_nat_net_exit,
};
......@@ -852,8 +845,16 @@ static int __init nf_nat_init(void)
{
int ret;
/* Leave them the same for the moment. */
nf_nat_htable_size = nf_conntrack_htable_size;
nf_nat_bysource = nf_ct_alloc_hashtable(&nf_nat_htable_size, 0);
if (!nf_nat_bysource)
return -ENOMEM;
ret = nf_ct_extend_register(&nat_extend);
if (ret < 0) {
nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
printk(KERN_ERR "nf_nat_core: Unable to register extension\n");
return ret;
}
......@@ -877,6 +878,7 @@ static int __init nf_nat_init(void)
return 0;
cleanup_extend:
nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
nf_ct_extend_unregister(&nat_extend);
return ret;
}
......@@ -895,6 +897,7 @@ static void __exit nf_nat_cleanup(void)
for (i = 0; i < NFPROTO_NUMPROTO; i++)
kfree(nf_nat_l4protos[i]);
synchronize_net();
nf_ct_free_hashtable(nf_nat_bysource, nf_nat_htable_size);
}
MODULE_LICENSE("GPL");
......
......@@ -2317,7 +2317,7 @@ nft_select_set_ops(const struct nlattr * const nla[],
static const struct nla_policy nft_set_policy[NFTA_SET_MAX + 1] = {
[NFTA_SET_TABLE] = { .type = NLA_STRING },
[NFTA_SET_NAME] = { .type = NLA_STRING,
.len = IFNAMSIZ - 1 },
.len = NFT_SET_MAXNAMELEN - 1 },
[NFTA_SET_FLAGS] = { .type = NLA_U32 },
[NFTA_SET_KEY_TYPE] = { .type = NLA_U32 },
[NFTA_SET_KEY_LEN] = { .type = NLA_U32 },
......@@ -2401,7 +2401,7 @@ static int nf_tables_set_alloc_name(struct nft_ctx *ctx, struct nft_set *set,
unsigned long *inuse;
unsigned int n = 0, min = 0;
p = strnchr(name, IFNAMSIZ, '%');
p = strnchr(name, NFT_SET_MAXNAMELEN, '%');
if (p != NULL) {
if (p[1] != 'd' || strchr(p + 2, '%'))
return -EINVAL;
......@@ -2696,7 +2696,7 @@ static int nf_tables_newset(struct net *net, struct sock *nlsk,
struct nft_table *table;
struct nft_set *set;
struct nft_ctx ctx;
char name[IFNAMSIZ];
char name[NFT_SET_MAXNAMELEN];
unsigned int size;
bool create;
u64 timeout;
......@@ -3375,6 +3375,22 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem)
}
EXPORT_SYMBOL_GPL(nft_set_elem_destroy);
static int nft_setelem_parse_flags(const struct nft_set *set,
const struct nlattr *attr, u32 *flags)
{
if (attr == NULL)
return 0;
*flags = ntohl(nla_get_be32(attr));
if (*flags & ~NFT_SET_ELEM_INTERVAL_END)
return -EINVAL;
if (!(set->flags & NFT_SET_INTERVAL) &&
*flags & NFT_SET_ELEM_INTERVAL_END)
return -EINVAL;
return 0;
}
static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
const struct nlattr *attr)
{
......@@ -3388,8 +3404,8 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
struct nft_data data;
enum nft_registers dreg;
struct nft_trans *trans;
u32 flags = 0;
u64 timeout;
u32 flags;
u8 ulen;
int err;
......@@ -3403,17 +3419,11 @@ static int nft_add_set_elem(struct nft_ctx *ctx, struct nft_set *set,
nft_set_ext_prepare(&tmpl);
flags = 0;
if (nla[NFTA_SET_ELEM_FLAGS] != NULL) {
flags = ntohl(nla_get_be32(nla[NFTA_SET_ELEM_FLAGS]));
if (flags & ~NFT_SET_ELEM_INTERVAL_END)
return -EINVAL;
if (!(set->flags & NFT_SET_INTERVAL) &&
flags & NFT_SET_ELEM_INTERVAL_END)
return -EINVAL;
if (flags != 0)
nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
}
err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags);
if (err < 0)
return err;
if (flags != 0)
nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
if (set->flags & NFT_SET_MAP) {
if (nla[NFTA_SET_ELEM_DATA] == NULL &&
......@@ -3582,9 +3592,13 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
const struct nlattr *attr)
{
struct nlattr *nla[NFTA_SET_ELEM_MAX + 1];
struct nft_set_ext_tmpl tmpl;
struct nft_data_desc desc;
struct nft_set_elem elem;
struct nft_set_ext *ext;
struct nft_trans *trans;
u32 flags = 0;
void *priv;
int err;
err = nla_parse_nested(nla, NFTA_SET_ELEM_MAX, attr,
......@@ -3596,6 +3610,14 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
if (nla[NFTA_SET_ELEM_KEY] == NULL)
goto err1;
nft_set_ext_prepare(&tmpl);
err = nft_setelem_parse_flags(set, nla[NFTA_SET_ELEM_FLAGS], &flags);
if (err < 0)
return err;
if (flags != 0)
nft_set_ext_add(&tmpl, NFT_SET_EXT_FLAGS);
err = nft_data_init(ctx, &elem.key.val, sizeof(elem.key), &desc,
nla[NFTA_SET_ELEM_KEY]);
if (err < 0)
......@@ -3605,24 +3627,40 @@ static int nft_del_setelem(struct nft_ctx *ctx, struct nft_set *set,
if (desc.type != NFT_DATA_VALUE || desc.len != set->klen)
goto err2;
nft_set_ext_add_length(&tmpl, NFT_SET_EXT_KEY, desc.len);
err = -ENOMEM;
elem.priv = nft_set_elem_init(set, &tmpl, elem.key.val.data, NULL, 0,
GFP_KERNEL);
if (elem.priv == NULL)
goto err2;
ext = nft_set_elem_ext(set, elem.priv);
if (flags)
*nft_set_ext_flags(ext) = flags;
trans = nft_trans_elem_alloc(ctx, NFT_MSG_DELSETELEM, set);
if (trans == NULL) {
err = -ENOMEM;
goto err2;
goto err3;
}
elem.priv = set->ops->deactivate(set, &elem);
if (elem.priv == NULL) {
priv = set->ops->deactivate(set, &elem);
if (priv == NULL) {
err = -ENOENT;
goto err3;
goto err4;
}
kfree(elem.priv);
elem.priv = priv;
nft_trans_elem(trans) = elem;
list_add_tail(&trans->list, &ctx->net->nft.commit_list);
return 0;
err3:
err4:
kfree(trans);
err3:
kfree(elem.priv);
err2:
nft_data_uninit(&elem.key.val, desc.type);
err1:
......
......@@ -306,10 +306,10 @@ static void ctnl_untimeout(struct net *net, struct ctnl_timeout *timeout)
int i;
local_bh_disable();
for (i = 0; i < net->ct.htable_size; i++) {
for (i = 0; i < nf_conntrack_htable_size; i++) {
nf_conntrack_lock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
if (i < net->ct.htable_size) {
hlist_nulls_for_each_entry(h, nn, &net->ct.hash[i], hnnode)
if (i < nf_conntrack_htable_size) {
hlist_nulls_for_each_entry(h, nn, &nf_conntrack_hash[i], hnnode)
untimeout(h, timeout);
}
spin_unlock(&nf_conntrack_locks[i % CONNTRACK_LOCKS]);
......
......@@ -197,6 +197,14 @@ static void nft_ct_set_eval(const struct nft_expr *expr,
nf_conntrack_event_cache(IPCT_MARK, ct);
}
break;
#endif
#ifdef CONFIG_NF_CONNTRACK_LABELS
case NFT_CT_LABELS:
nf_connlabels_replace(ct,
&regs->data[priv->sreg],
&regs->data[priv->sreg],
NF_CT_LABELS_MAX_SIZE / sizeof(u32));
break;
#endif
default:
break;
......@@ -364,6 +372,16 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
case NFT_CT_MARK:
len = FIELD_SIZEOF(struct nf_conn, mark);
break;
#endif
#ifdef CONFIG_NF_CONNTRACK_LABELS
case NFT_CT_LABELS:
if (tb[NFTA_CT_DIRECTION])
return -EINVAL;
len = NF_CT_LABELS_MAX_SIZE;
err = nf_connlabels_get(ctx->net, (len * BITS_PER_BYTE) - 1);
if (err)
return err;
break;
#endif
default:
return -EOPNOTSUPP;
......@@ -384,6 +402,18 @@ static int nft_ct_set_init(const struct nft_ctx *ctx,
static void nft_ct_destroy(const struct nft_ctx *ctx,
const struct nft_expr *expr)
{
struct nft_ct *priv = nft_expr_priv(expr);
switch (priv->key) {
#ifdef CONFIG_NF_CONNTRACK_LABELS
case NFT_CT_LABELS:
nf_connlabels_put(ctx->net);
break;
#endif
default:
break;
}
nft_ct_l3proto_module_put(ctx->afi->family);
}
......
......@@ -29,6 +29,17 @@ struct nft_rbtree_elem {
struct nft_set_ext ext;
};
static bool nft_rbtree_interval_end(const struct nft_rbtree_elem *rbe)
{
return nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) &&
(*nft_set_ext_flags(&rbe->ext) & NFT_SET_ELEM_INTERVAL_END);
}
static bool nft_rbtree_equal(const struct nft_set *set, const void *this,
const struct nft_rbtree_elem *interval)
{
return memcmp(this, nft_set_ext_key(&interval->ext), set->klen) == 0;
}
static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key,
const struct nft_set_ext **ext)
......@@ -37,6 +48,7 @@ static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key,
const struct nft_rbtree_elem *rbe, *interval = NULL;
const struct rb_node *parent;
u8 genmask = nft_genmask_cur(read_pnet(&set->pnet));
const void *this;
int d;
spin_lock_bh(&nft_rbtree_lock);
......@@ -44,9 +56,16 @@ static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key,
while (parent != NULL) {
rbe = rb_entry(parent, struct nft_rbtree_elem, node);
d = memcmp(nft_set_ext_key(&rbe->ext), key, set->klen);
this = nft_set_ext_key(&rbe->ext);
d = memcmp(this, key, set->klen);
if (d < 0) {
parent = parent->rb_left;
/* In case of adjacent ranges, we always see the high
* part of the range in first place, before the low one.
* So don't update interval if the keys are equal.
*/
if (interval && nft_rbtree_equal(set, this, interval))
continue;
interval = rbe;
} else if (d > 0)
parent = parent->rb_right;
......@@ -56,9 +75,7 @@ static bool nft_rbtree_lookup(const struct nft_set *set, const u32 *key,
parent = parent->rb_left;
continue;
}
if (nft_set_ext_exists(&rbe->ext, NFT_SET_EXT_FLAGS) &&
*nft_set_ext_flags(&rbe->ext) &
NFT_SET_ELEM_INTERVAL_END)
if (nft_rbtree_interval_end(rbe))
goto out;
spin_unlock_bh(&nft_rbtree_lock);
......@@ -98,9 +115,16 @@ static int __nft_rbtree_insert(const struct nft_set *set,
else if (d > 0)
p = &parent->rb_right;
else {
if (nft_set_elem_active(&rbe->ext, genmask))
return -EEXIST;
p = &parent->rb_left;
if (nft_set_elem_active(&rbe->ext, genmask)) {
if (nft_rbtree_interval_end(rbe) &&
!nft_rbtree_interval_end(new))
p = &parent->rb_left;
else if (!nft_rbtree_interval_end(rbe) &&
nft_rbtree_interval_end(new))
p = &parent->rb_right;
else
return -EEXIST;
}
}
}
rb_link_node(&new->node, parent, p);
......@@ -145,7 +169,7 @@ static void *nft_rbtree_deactivate(const struct nft_set *set,
{
const struct nft_rbtree *priv = nft_set_priv(set);
const struct rb_node *parent = priv->root.rb_node;
struct nft_rbtree_elem *rbe;
struct nft_rbtree_elem *rbe, *this = elem->priv;
u8 genmask = nft_genmask_cur(read_pnet(&set->pnet));
int d;
......@@ -163,6 +187,15 @@ static void *nft_rbtree_deactivate(const struct nft_set *set,
parent = parent->rb_left;
continue;
}
if (nft_rbtree_interval_end(rbe) &&
!nft_rbtree_interval_end(this)) {
parent = parent->rb_left;
continue;
} else if (!nft_rbtree_interval_end(rbe) &&
nft_rbtree_interval_end(this)) {
parent = parent->rb_right;
continue;
}
nft_set_elem_change_active(set, &rbe->ext);
return rbe;
}
......
......@@ -439,20 +439,12 @@ ovs_ct_find_existing(struct net *net, const struct nf_conntrack_zone *zone,
u8 protonum;
l3proto = __nf_ct_l3proto_find(l3num);
if (!l3proto) {
pr_debug("ovs_ct_find_existing: Can't get l3proto\n");
return NULL;
}
if (l3proto->get_l4proto(skb, skb_network_offset(skb), &dataoff,
&protonum) <= 0) {
pr_debug("ovs_ct_find_existing: Can't get protonum\n");
return NULL;
}
l4proto = __nf_ct_l4proto_find(l3num, protonum);
if (!l4proto) {
pr_debug("ovs_ct_find_existing: Can't get l4proto\n");
return NULL;
}
if (!nf_ct_get_tuple(skb, skb_network_offset(skb), dataoff, l3num,
protonum, net, &tuple, l3proto, l4proto)) {
pr_debug("ovs_ct_find_existing: Can't get tuple\n");
......