Commit 97f7cf1c authored by Jozsef Kadlecsik's avatar Jozsef Kadlecsik Committed by Pablo Neira Ayuso

netfilter: ipset: fix performance regression in swap operation

The patch "netfilter: ipset: fix race condition between swap/destroy
and kernel side add/del/test", commit 28628fa9 fixes a race condition.
But the synchronize_rcu() added to the swap function unnecessarily slows
it down: it can safely be moved to destroy and use call_rcu() instead.

Eric Dumazet pointed out that simply calling the destroy functions as
rcu callback does not work: sets with timeout use garbage collectors
which need cancelling at destroy which can wait. Therefore the destroy
functions are split into two: cancelling garbage collectors safely at
executing the command received by netlink and moving the remaining
part only into the rcu callback.

Link: https://lore.kernel.org/lkml/C0829B10-EAA6-4809-874E-E1E9C05A8D84@automattic.com/
Fixes: 28628fa9 ("netfilter: ipset: fix race condition between swap/destroy and kernel side add/del/test")
Reported-by: default avatarAle Crismani <ale.crismani@automattic.com>
Reported-by: default avatarDavid Wang <00107082@163.com>
Tested-by: default avatarDavid Wang <00107082@163.com>
Signed-off-by: default avatarJozsef Kadlecsik <kadlec@netfilter.org>
Signed-off-by: default avatarPablo Neira Ayuso <pablo@netfilter.org>
parent 6e348067
...@@ -186,6 +186,8 @@ struct ip_set_type_variant { ...@@ -186,6 +186,8 @@ struct ip_set_type_variant {
/* Return true if "b" set is the same as "a" /* Return true if "b" set is the same as "a"
* according to the create set parameters */ * according to the create set parameters */
bool (*same_set)(const struct ip_set *a, const struct ip_set *b); bool (*same_set)(const struct ip_set *a, const struct ip_set *b);
/* Cancel ongoing garbage collectors before destroying the set*/
void (*cancel_gc)(struct ip_set *set);
/* Region-locking is used */ /* Region-locking is used */
bool region_lock; bool region_lock;
}; };
...@@ -242,6 +244,8 @@ extern void ip_set_type_unregister(struct ip_set_type *set_type); ...@@ -242,6 +244,8 @@ extern void ip_set_type_unregister(struct ip_set_type *set_type);
/* A generic IP set */ /* A generic IP set */
struct ip_set { struct ip_set {
/* For call_cru in destroy */
struct rcu_head rcu;
/* The name of the set */ /* The name of the set */
char name[IPSET_MAXNAMELEN]; char name[IPSET_MAXNAMELEN];
/* Lock protecting the set data */ /* Lock protecting the set data */
......
...@@ -30,6 +30,7 @@ ...@@ -30,6 +30,7 @@
#define mtype_del IPSET_TOKEN(MTYPE, _del) #define mtype_del IPSET_TOKEN(MTYPE, _del)
#define mtype_list IPSET_TOKEN(MTYPE, _list) #define mtype_list IPSET_TOKEN(MTYPE, _list)
#define mtype_gc IPSET_TOKEN(MTYPE, _gc) #define mtype_gc IPSET_TOKEN(MTYPE, _gc)
#define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc)
#define mtype MTYPE #define mtype MTYPE
#define get_ext(set, map, id) ((map)->extensions + ((set)->dsize * (id))) #define get_ext(set, map, id) ((map)->extensions + ((set)->dsize * (id)))
...@@ -59,9 +60,6 @@ mtype_destroy(struct ip_set *set) ...@@ -59,9 +60,6 @@ mtype_destroy(struct ip_set *set)
{ {
struct mtype *map = set->data; struct mtype *map = set->data;
if (SET_WITH_TIMEOUT(set))
del_timer_sync(&map->gc);
if (set->dsize && set->extensions & IPSET_EXT_DESTROY) if (set->dsize && set->extensions & IPSET_EXT_DESTROY)
mtype_ext_cleanup(set); mtype_ext_cleanup(set);
ip_set_free(map->members); ip_set_free(map->members);
...@@ -290,6 +288,15 @@ mtype_gc(struct timer_list *t) ...@@ -290,6 +288,15 @@ mtype_gc(struct timer_list *t)
add_timer(&map->gc); add_timer(&map->gc);
} }
static void
mtype_cancel_gc(struct ip_set *set)
{
struct mtype *map = set->data;
if (SET_WITH_TIMEOUT(set))
del_timer_sync(&map->gc);
}
static const struct ip_set_type_variant mtype = { static const struct ip_set_type_variant mtype = {
.kadt = mtype_kadt, .kadt = mtype_kadt,
.uadt = mtype_uadt, .uadt = mtype_uadt,
...@@ -303,6 +310,7 @@ static const struct ip_set_type_variant mtype = { ...@@ -303,6 +310,7 @@ static const struct ip_set_type_variant mtype = {
.head = mtype_head, .head = mtype_head,
.list = mtype_list, .list = mtype_list,
.same_set = mtype_same_set, .same_set = mtype_same_set,
.cancel_gc = mtype_cancel_gc,
}; };
#endif /* __IP_SET_BITMAP_IP_GEN_H */ #endif /* __IP_SET_BITMAP_IP_GEN_H */
...@@ -1182,6 +1182,14 @@ ip_set_destroy_set(struct ip_set *set) ...@@ -1182,6 +1182,14 @@ ip_set_destroy_set(struct ip_set *set)
kfree(set); kfree(set);
} }
static void
ip_set_destroy_set_rcu(struct rcu_head *head)
{
struct ip_set *set = container_of(head, struct ip_set, rcu);
ip_set_destroy_set(set);
}
static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
const struct nlattr * const attr[]) const struct nlattr * const attr[])
{ {
...@@ -1193,8 +1201,6 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, ...@@ -1193,8 +1201,6 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
if (unlikely(protocol_min_failed(attr))) if (unlikely(protocol_min_failed(attr)))
return -IPSET_ERR_PROTOCOL; return -IPSET_ERR_PROTOCOL;
/* Must wait for flush to be really finished in list:set */
rcu_barrier();
/* Commands are serialized and references are /* Commands are serialized and references are
* protected by the ip_set_ref_lock. * protected by the ip_set_ref_lock.
...@@ -1206,8 +1212,10 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, ...@@ -1206,8 +1212,10 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
* counter, so if it's already zero, we can proceed * counter, so if it's already zero, we can proceed
* without holding the lock. * without holding the lock.
*/ */
read_lock_bh(&ip_set_ref_lock);
if (!attr[IPSET_ATTR_SETNAME]) { if (!attr[IPSET_ATTR_SETNAME]) {
/* Must wait for flush to be really finished in list:set */
rcu_barrier();
read_lock_bh(&ip_set_ref_lock);
for (i = 0; i < inst->ip_set_max; i++) { for (i = 0; i < inst->ip_set_max; i++) {
s = ip_set(inst, i); s = ip_set(inst, i);
if (s && (s->ref || s->ref_netlink)) { if (s && (s->ref || s->ref_netlink)) {
...@@ -1221,6 +1229,8 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, ...@@ -1221,6 +1229,8 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
s = ip_set(inst, i); s = ip_set(inst, i);
if (s) { if (s) {
ip_set(inst, i) = NULL; ip_set(inst, i) = NULL;
/* Must cancel garbage collectors */
s->variant->cancel_gc(s);
ip_set_destroy_set(s); ip_set_destroy_set(s);
} }
} }
...@@ -1228,6 +1238,9 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, ...@@ -1228,6 +1238,9 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
inst->is_destroyed = false; inst->is_destroyed = false;
} else { } else {
u32 flags = flag_exist(info->nlh); u32 flags = flag_exist(info->nlh);
u16 features = 0;
read_lock_bh(&ip_set_ref_lock);
s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]), s = find_set_and_id(inst, nla_data(attr[IPSET_ATTR_SETNAME]),
&i); &i);
if (!s) { if (!s) {
...@@ -1238,10 +1251,16 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info, ...@@ -1238,10 +1251,16 @@ static int ip_set_destroy(struct sk_buff *skb, const struct nfnl_info *info,
ret = -IPSET_ERR_BUSY; ret = -IPSET_ERR_BUSY;
goto out; goto out;
} }
features = s->type->features;
ip_set(inst, i) = NULL; ip_set(inst, i) = NULL;
read_unlock_bh(&ip_set_ref_lock); read_unlock_bh(&ip_set_ref_lock);
if (features & IPSET_TYPE_NAME) {
ip_set_destroy_set(s); /* Must wait for flush to be really finished */
rcu_barrier();
}
/* Must cancel garbage collectors */
s->variant->cancel_gc(s);
call_rcu(&s->rcu, ip_set_destroy_set_rcu);
} }
return 0; return 0;
out: out:
...@@ -1394,9 +1413,6 @@ static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info, ...@@ -1394,9 +1413,6 @@ static int ip_set_swap(struct sk_buff *skb, const struct nfnl_info *info,
ip_set(inst, to_id) = from; ip_set(inst, to_id) = from;
write_unlock_bh(&ip_set_ref_lock); write_unlock_bh(&ip_set_ref_lock);
/* Make sure all readers of the old set pointers are completed. */
synchronize_rcu();
return 0; return 0;
} }
...@@ -2409,8 +2425,11 @@ ip_set_fini(void) ...@@ -2409,8 +2425,11 @@ ip_set_fini(void)
{ {
nf_unregister_sockopt(&so_set); nf_unregister_sockopt(&so_set);
nfnetlink_subsys_unregister(&ip_set_netlink_subsys); nfnetlink_subsys_unregister(&ip_set_netlink_subsys);
unregister_pernet_subsys(&ip_set_net_ops); unregister_pernet_subsys(&ip_set_net_ops);
/* Wait for call_rcu() in destroy */
rcu_barrier();
pr_debug("these are the famous last words\n"); pr_debug("these are the famous last words\n");
} }
......
...@@ -222,6 +222,7 @@ static const union nf_inet_addr zeromask = {}; ...@@ -222,6 +222,7 @@ static const union nf_inet_addr zeromask = {};
#undef mtype_gc_do #undef mtype_gc_do
#undef mtype_gc #undef mtype_gc
#undef mtype_gc_init #undef mtype_gc_init
#undef mtype_cancel_gc
#undef mtype_variant #undef mtype_variant
#undef mtype_data_match #undef mtype_data_match
...@@ -266,6 +267,7 @@ static const union nf_inet_addr zeromask = {}; ...@@ -266,6 +267,7 @@ static const union nf_inet_addr zeromask = {};
#define mtype_gc_do IPSET_TOKEN(MTYPE, _gc_do) #define mtype_gc_do IPSET_TOKEN(MTYPE, _gc_do)
#define mtype_gc IPSET_TOKEN(MTYPE, _gc) #define mtype_gc IPSET_TOKEN(MTYPE, _gc)
#define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init) #define mtype_gc_init IPSET_TOKEN(MTYPE, _gc_init)
#define mtype_cancel_gc IPSET_TOKEN(MTYPE, _cancel_gc)
#define mtype_variant IPSET_TOKEN(MTYPE, _variant) #define mtype_variant IPSET_TOKEN(MTYPE, _variant)
#define mtype_data_match IPSET_TOKEN(MTYPE, _data_match) #define mtype_data_match IPSET_TOKEN(MTYPE, _data_match)
...@@ -450,9 +452,6 @@ mtype_destroy(struct ip_set *set) ...@@ -450,9 +452,6 @@ mtype_destroy(struct ip_set *set)
struct htype *h = set->data; struct htype *h = set->data;
struct list_head *l, *lt; struct list_head *l, *lt;
if (SET_WITH_TIMEOUT(set))
cancel_delayed_work_sync(&h->gc.dwork);
mtype_ahash_destroy(set, ipset_dereference_nfnl(h->table), true); mtype_ahash_destroy(set, ipset_dereference_nfnl(h->table), true);
list_for_each_safe(l, lt, &h->ad) { list_for_each_safe(l, lt, &h->ad) {
list_del(l); list_del(l);
...@@ -599,6 +598,15 @@ mtype_gc_init(struct htable_gc *gc) ...@@ -599,6 +598,15 @@ mtype_gc_init(struct htable_gc *gc)
queue_delayed_work(system_power_efficient_wq, &gc->dwork, HZ); queue_delayed_work(system_power_efficient_wq, &gc->dwork, HZ);
} }
static void
mtype_cancel_gc(struct ip_set *set)
{
struct htype *h = set->data;
if (SET_WITH_TIMEOUT(set))
cancel_delayed_work_sync(&h->gc.dwork);
}
static int static int
mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext, mtype_add(struct ip_set *set, void *value, const struct ip_set_ext *ext,
struct ip_set_ext *mext, u32 flags); struct ip_set_ext *mext, u32 flags);
...@@ -1441,6 +1449,7 @@ static const struct ip_set_type_variant mtype_variant = { ...@@ -1441,6 +1449,7 @@ static const struct ip_set_type_variant mtype_variant = {
.uref = mtype_uref, .uref = mtype_uref,
.resize = mtype_resize, .resize = mtype_resize,
.same_set = mtype_same_set, .same_set = mtype_same_set,
.cancel_gc = mtype_cancel_gc,
.region_lock = true, .region_lock = true,
}; };
......
...@@ -426,9 +426,6 @@ list_set_destroy(struct ip_set *set) ...@@ -426,9 +426,6 @@ list_set_destroy(struct ip_set *set)
struct list_set *map = set->data; struct list_set *map = set->data;
struct set_elem *e, *n; struct set_elem *e, *n;
if (SET_WITH_TIMEOUT(set))
timer_shutdown_sync(&map->gc);
list_for_each_entry_safe(e, n, &map->members, list) { list_for_each_entry_safe(e, n, &map->members, list) {
list_del(&e->list); list_del(&e->list);
ip_set_put_byindex(map->net, e->id); ip_set_put_byindex(map->net, e->id);
...@@ -545,6 +542,15 @@ list_set_same_set(const struct ip_set *a, const struct ip_set *b) ...@@ -545,6 +542,15 @@ list_set_same_set(const struct ip_set *a, const struct ip_set *b)
a->extensions == b->extensions; a->extensions == b->extensions;
} }
static void
list_set_cancel_gc(struct ip_set *set)
{
struct list_set *map = set->data;
if (SET_WITH_TIMEOUT(set))
timer_shutdown_sync(&map->gc);
}
static const struct ip_set_type_variant set_variant = { static const struct ip_set_type_variant set_variant = {
.kadt = list_set_kadt, .kadt = list_set_kadt,
.uadt = list_set_uadt, .uadt = list_set_uadt,
...@@ -558,6 +564,7 @@ static const struct ip_set_type_variant set_variant = { ...@@ -558,6 +564,7 @@ static const struct ip_set_type_variant set_variant = {
.head = list_set_head, .head = list_set_head,
.list = list_set_list, .list = list_set_list,
.same_set = list_set_same_set, .same_set = list_set_same_set,
.cancel_gc = list_set_cancel_gc,
}; };
static void static void
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment