Commit 3e91b0eb authored by Jakub Kicinski

Merge tag 'nf-23-08-10' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf

Pablo Neira Ayuso says:

====================
Netfilter fixes for net

The existing attempt to resolve races between the control plane and GC work
is error prone: as reported by Bien Pham <phamnnb@sea.com>, some places
forgot to call nft_set_elem_mark_busy(), leading to double-deactivation
of elements.

This series contains the following patches:

1) Do not skip expired elements during walk; otherwise elements might
   never decrement the reference counter on data, leading to a memory leak.

2) Add a GC transaction API to replace the former attempt to deal with
   races between control plane and GC. The GC worker sets
   NFT_SET_ELEM_DEAD_BIT on elements and creates a GC transaction to
   remove the expired elements. A GC transaction can abort in case of
   interference with the control plane, in which case it is retried later
   (GC async). Set backends such as rbtree and pipapo also perform GC from
   the control plane (GC sync); in that case, element deactivation and
   removal is safe because the mutex is held, and the collected elements
   are then released via call_rcu().

3) Adapt existing set backends to use the GC transaction API.

4) Update rhash set backend to set the _DEAD bit on elements deleted
   from the datapath, so that GC collects them.

5) Remove old GC batch API and the NFT_SET_ELEM_BUSY_BIT.

* tag 'nf-23-08-10' of git://git.kernel.org/pub/scm/linux/kernel/git/netfilter/nf:
  netfilter: nf_tables: remove busy mark and gc batch API
  netfilter: nft_set_hash: mark set element as dead when deleting from packet path
  netfilter: nf_tables: adapt set backend to use GC transaction API
  netfilter: nf_tables: GC transaction API to avoid race with control plane
  netfilter: nf_tables: don't skip expired elements during walk
====================

Link: https://lore.kernel.org/r/20230810070830.24064-1-pablo@netfilter.org
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 62d02fca a2dd0233
...@@ -512,6 +512,7 @@ struct nft_set_elem_expr { ...@@ -512,6 +512,7 @@ struct nft_set_elem_expr {
* *
* @list: table set list node * @list: table set list node
* @bindings: list of set bindings * @bindings: list of set bindings
* @refs: internal refcounting for async set destruction
* @table: table this set belongs to * @table: table this set belongs to
* @net: netnamespace this set belongs to * @net: netnamespace this set belongs to
* @name: name of the set * @name: name of the set
...@@ -541,6 +542,7 @@ struct nft_set_elem_expr { ...@@ -541,6 +542,7 @@ struct nft_set_elem_expr {
struct nft_set { struct nft_set {
struct list_head list; struct list_head list;
struct list_head bindings; struct list_head bindings;
refcount_t refs;
struct nft_table *table; struct nft_table *table;
possible_net_t net; possible_net_t net;
char *name; char *name;
...@@ -562,7 +564,8 @@ struct nft_set { ...@@ -562,7 +564,8 @@ struct nft_set {
struct list_head pending_update; struct list_head pending_update;
/* runtime data below here */ /* runtime data below here */
const struct nft_set_ops *ops ____cacheline_aligned; const struct nft_set_ops *ops ____cacheline_aligned;
u16 flags:14, u16 flags:13,
dead:1,
genmask:2; genmask:2;
u8 klen; u8 klen;
u8 dlen; u8 dlen;
...@@ -596,7 +599,6 @@ struct nft_set *nft_set_lookup_global(const struct net *net, ...@@ -596,7 +599,6 @@ struct nft_set *nft_set_lookup_global(const struct net *net,
struct nft_set_ext *nft_set_catchall_lookup(const struct net *net, struct nft_set_ext *nft_set_catchall_lookup(const struct net *net,
const struct nft_set *set); const struct nft_set *set);
void *nft_set_catchall_gc(const struct nft_set *set);
static inline unsigned long nft_set_gc_interval(const struct nft_set *set) static inline unsigned long nft_set_gc_interval(const struct nft_set *set)
{ {
...@@ -813,62 +815,6 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem, ...@@ -813,62 +815,6 @@ void nft_set_elem_destroy(const struct nft_set *set, void *elem,
void nf_tables_set_elem_destroy(const struct nft_ctx *ctx, void nf_tables_set_elem_destroy(const struct nft_ctx *ctx,
const struct nft_set *set, void *elem); const struct nft_set *set, void *elem);
/**
* struct nft_set_gc_batch_head - nf_tables set garbage collection batch
*
* @rcu: rcu head
* @set: set the elements belong to
* @cnt: count of elements
*/
struct nft_set_gc_batch_head {
struct rcu_head rcu;
const struct nft_set *set;
unsigned int cnt;
};
#define NFT_SET_GC_BATCH_SIZE ((PAGE_SIZE - \
sizeof(struct nft_set_gc_batch_head)) / \
sizeof(void *))
/**
* struct nft_set_gc_batch - nf_tables set garbage collection batch
*
* @head: GC batch head
* @elems: garbage collection elements
*/
struct nft_set_gc_batch {
struct nft_set_gc_batch_head head;
void *elems[NFT_SET_GC_BATCH_SIZE];
};
struct nft_set_gc_batch *nft_set_gc_batch_alloc(const struct nft_set *set,
gfp_t gfp);
void nft_set_gc_batch_release(struct rcu_head *rcu);
static inline void nft_set_gc_batch_complete(struct nft_set_gc_batch *gcb)
{
if (gcb != NULL)
call_rcu(&gcb->head.rcu, nft_set_gc_batch_release);
}
static inline struct nft_set_gc_batch *
nft_set_gc_batch_check(const struct nft_set *set, struct nft_set_gc_batch *gcb,
gfp_t gfp)
{
if (gcb != NULL) {
if (gcb->head.cnt + 1 < ARRAY_SIZE(gcb->elems))
return gcb;
nft_set_gc_batch_complete(gcb);
}
return nft_set_gc_batch_alloc(set, gfp);
}
static inline void nft_set_gc_batch_add(struct nft_set_gc_batch *gcb,
void *elem)
{
gcb->elems[gcb->head.cnt++] = elem;
}
struct nft_expr_ops; struct nft_expr_ops;
/** /**
* struct nft_expr_type - nf_tables expression type * struct nft_expr_type - nf_tables expression type
...@@ -1557,39 +1503,30 @@ static inline void nft_set_elem_change_active(const struct net *net, ...@@ -1557,39 +1503,30 @@ static inline void nft_set_elem_change_active(const struct net *net,
#endif /* IS_ENABLED(CONFIG_NF_TABLES) */ #endif /* IS_ENABLED(CONFIG_NF_TABLES) */
/* #define NFT_SET_ELEM_DEAD_MASK (1 << 2)
* We use a free bit in the genmask field to indicate the element
* is busy, meaning it is currently being processed either by
* the netlink API or GC.
*
* Even though the genmask is only a single byte wide, this works
* because the extension structure if fully constant once initialized,
* so there are no non-atomic write accesses unless it is already
* marked busy.
*/
#define NFT_SET_ELEM_BUSY_MASK (1 << 2)
#if defined(__LITTLE_ENDIAN_BITFIELD) #if defined(__LITTLE_ENDIAN_BITFIELD)
#define NFT_SET_ELEM_BUSY_BIT 2 #define NFT_SET_ELEM_DEAD_BIT 2
#elif defined(__BIG_ENDIAN_BITFIELD) #elif defined(__BIG_ENDIAN_BITFIELD)
#define NFT_SET_ELEM_BUSY_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2) #define NFT_SET_ELEM_DEAD_BIT (BITS_PER_LONG - BITS_PER_BYTE + 2)
#else #else
#error #error
#endif #endif
static inline int nft_set_elem_mark_busy(struct nft_set_ext *ext) static inline void nft_set_elem_dead(struct nft_set_ext *ext)
{ {
unsigned long *word = (unsigned long *)ext; unsigned long *word = (unsigned long *)ext;
BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0); BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0);
return test_and_set_bit(NFT_SET_ELEM_BUSY_BIT, word); set_bit(NFT_SET_ELEM_DEAD_BIT, word);
} }
static inline void nft_set_elem_clear_busy(struct nft_set_ext *ext) static inline int nft_set_elem_is_dead(const struct nft_set_ext *ext)
{ {
unsigned long *word = (unsigned long *)ext; unsigned long *word = (unsigned long *)ext;
clear_bit(NFT_SET_ELEM_BUSY_BIT, word); BUILD_BUG_ON(offsetof(struct nft_set_ext, genmask) != 0);
return test_bit(NFT_SET_ELEM_DEAD_BIT, word);
} }
/** /**
...@@ -1732,6 +1669,38 @@ struct nft_trans_flowtable { ...@@ -1732,6 +1669,38 @@ struct nft_trans_flowtable {
#define nft_trans_flowtable_flags(trans) \ #define nft_trans_flowtable_flags(trans) \
(((struct nft_trans_flowtable *)trans->data)->flags) (((struct nft_trans_flowtable *)trans->data)->flags)
#define NFT_TRANS_GC_BATCHCOUNT 256
struct nft_trans_gc {
struct list_head list;
struct net *net;
struct nft_set *set;
u32 seq;
u8 count;
void *priv[NFT_TRANS_GC_BATCHCOUNT];
struct rcu_head rcu;
};
struct nft_trans_gc *nft_trans_gc_alloc(struct nft_set *set,
unsigned int gc_seq, gfp_t gfp);
void nft_trans_gc_destroy(struct nft_trans_gc *trans);
struct nft_trans_gc *nft_trans_gc_queue_async(struct nft_trans_gc *gc,
unsigned int gc_seq, gfp_t gfp);
void nft_trans_gc_queue_async_done(struct nft_trans_gc *gc);
struct nft_trans_gc *nft_trans_gc_queue_sync(struct nft_trans_gc *gc, gfp_t gfp);
void nft_trans_gc_queue_sync_done(struct nft_trans_gc *trans);
void nft_trans_gc_elem_add(struct nft_trans_gc *gc, void *priv);
struct nft_trans_gc *nft_trans_gc_catchall(struct nft_trans_gc *gc,
unsigned int gc_seq);
void nft_setelem_data_deactivate(const struct net *net,
const struct nft_set *set,
struct nft_set_elem *elem);
int __init nft_chain_filter_init(void); int __init nft_chain_filter_init(void);
void nft_chain_filter_fini(void); void nft_chain_filter_fini(void);
...@@ -1758,6 +1727,7 @@ struct nftables_pernet { ...@@ -1758,6 +1727,7 @@ struct nftables_pernet {
struct mutex commit_mutex; struct mutex commit_mutex;
u64 table_handle; u64 table_handle;
unsigned int base_seq; unsigned int base_seq;
unsigned int gc_seq;
}; };
extern unsigned int nf_tables_net_id; extern unsigned int nf_tables_net_id;
......
This diff is collapsed.
...@@ -59,6 +59,8 @@ static inline int nft_rhash_cmp(struct rhashtable_compare_arg *arg, ...@@ -59,6 +59,8 @@ static inline int nft_rhash_cmp(struct rhashtable_compare_arg *arg,
if (memcmp(nft_set_ext_key(&he->ext), x->key, x->set->klen)) if (memcmp(nft_set_ext_key(&he->ext), x->key, x->set->klen))
return 1; return 1;
if (nft_set_elem_is_dead(&he->ext))
return 1;
if (nft_set_elem_expired(&he->ext)) if (nft_set_elem_expired(&he->ext))
return 1; return 1;
if (!nft_set_elem_active(&he->ext, x->genmask)) if (!nft_set_elem_active(&he->ext, x->genmask))
...@@ -188,7 +190,6 @@ static void nft_rhash_activate(const struct net *net, const struct nft_set *set, ...@@ -188,7 +190,6 @@ static void nft_rhash_activate(const struct net *net, const struct nft_set *set,
struct nft_rhash_elem *he = elem->priv; struct nft_rhash_elem *he = elem->priv;
nft_set_elem_change_active(net, set, &he->ext); nft_set_elem_change_active(net, set, &he->ext);
nft_set_elem_clear_busy(&he->ext);
} }
static bool nft_rhash_flush(const struct net *net, static bool nft_rhash_flush(const struct net *net,
...@@ -196,12 +197,9 @@ static bool nft_rhash_flush(const struct net *net, ...@@ -196,12 +197,9 @@ static bool nft_rhash_flush(const struct net *net,
{ {
struct nft_rhash_elem *he = priv; struct nft_rhash_elem *he = priv;
if (!nft_set_elem_mark_busy(&he->ext) || nft_set_elem_change_active(net, set, &he->ext);
!nft_is_active(net, &he->ext)) {
nft_set_elem_change_active(net, set, &he->ext); return true;
return true;
}
return false;
} }
static void *nft_rhash_deactivate(const struct net *net, static void *nft_rhash_deactivate(const struct net *net,
...@@ -218,9 +216,8 @@ static void *nft_rhash_deactivate(const struct net *net, ...@@ -218,9 +216,8 @@ static void *nft_rhash_deactivate(const struct net *net,
rcu_read_lock(); rcu_read_lock();
he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params); he = rhashtable_lookup(&priv->ht, &arg, nft_rhash_params);
if (he != NULL && if (he)
!nft_rhash_flush(net, set, he)) nft_set_elem_change_active(net, set, &he->ext);
he = NULL;
rcu_read_unlock(); rcu_read_unlock();
...@@ -252,7 +249,9 @@ static bool nft_rhash_delete(const struct nft_set *set, ...@@ -252,7 +249,9 @@ static bool nft_rhash_delete(const struct nft_set *set,
if (he == NULL) if (he == NULL)
return false; return false;
return rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params) == 0; nft_set_elem_dead(&he->ext);
return true;
} }
static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
...@@ -278,8 +277,6 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set, ...@@ -278,8 +277,6 @@ static void nft_rhash_walk(const struct nft_ctx *ctx, struct nft_set *set,
if (iter->count < iter->skip) if (iter->count < iter->skip)
goto cont; goto cont;
if (nft_set_elem_expired(&he->ext))
goto cont;
if (!nft_set_elem_active(&he->ext, iter->genmask)) if (!nft_set_elem_active(&he->ext, iter->genmask))
goto cont; goto cont;
...@@ -314,25 +311,48 @@ static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set, ...@@ -314,25 +311,48 @@ static bool nft_rhash_expr_needs_gc_run(const struct nft_set *set,
static void nft_rhash_gc(struct work_struct *work) static void nft_rhash_gc(struct work_struct *work)
{ {
struct nftables_pernet *nft_net;
struct nft_set *set; struct nft_set *set;
struct nft_rhash_elem *he; struct nft_rhash_elem *he;
struct nft_rhash *priv; struct nft_rhash *priv;
struct nft_set_gc_batch *gcb = NULL;
struct rhashtable_iter hti; struct rhashtable_iter hti;
struct nft_trans_gc *gc;
struct net *net;
u32 gc_seq;
priv = container_of(work, struct nft_rhash, gc_work.work); priv = container_of(work, struct nft_rhash, gc_work.work);
set = nft_set_container_of(priv); set = nft_set_container_of(priv);
net = read_pnet(&set->net);
nft_net = nft_pernet(net);
gc_seq = READ_ONCE(nft_net->gc_seq);
gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL);
if (!gc)
goto done;
rhashtable_walk_enter(&priv->ht, &hti); rhashtable_walk_enter(&priv->ht, &hti);
rhashtable_walk_start(&hti); rhashtable_walk_start(&hti);
while ((he = rhashtable_walk_next(&hti))) { while ((he = rhashtable_walk_next(&hti))) {
if (IS_ERR(he)) { if (IS_ERR(he)) {
if (PTR_ERR(he) != -EAGAIN) if (PTR_ERR(he) != -EAGAIN) {
break; nft_trans_gc_destroy(gc);
gc = NULL;
goto try_later;
}
continue; continue;
} }
/* Ruleset has been updated, try later. */
if (READ_ONCE(nft_net->gc_seq) != gc_seq) {
nft_trans_gc_destroy(gc);
gc = NULL;
goto try_later;
}
if (nft_set_elem_is_dead(&he->ext))
goto dead_elem;
if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPRESSIONS) && if (nft_set_ext_exists(&he->ext, NFT_SET_EXT_EXPRESSIONS) &&
nft_rhash_expr_needs_gc_run(set, &he->ext)) nft_rhash_expr_needs_gc_run(set, &he->ext))
goto needs_gc_run; goto needs_gc_run;
...@@ -340,26 +360,26 @@ static void nft_rhash_gc(struct work_struct *work) ...@@ -340,26 +360,26 @@ static void nft_rhash_gc(struct work_struct *work)
if (!nft_set_elem_expired(&he->ext)) if (!nft_set_elem_expired(&he->ext))
continue; continue;
needs_gc_run: needs_gc_run:
if (nft_set_elem_mark_busy(&he->ext)) nft_set_elem_dead(&he->ext);
continue; dead_elem:
gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
if (!gc)
goto try_later;
gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); nft_trans_gc_elem_add(gc, he);
if (gcb == NULL)
break;
rhashtable_remove_fast(&priv->ht, &he->node, nft_rhash_params);
atomic_dec(&set->nelems);
nft_set_gc_batch_add(gcb, he);
} }
gc = nft_trans_gc_catchall(gc, gc_seq);
try_later:
/* catchall list iteration requires rcu read side lock. */
rhashtable_walk_stop(&hti); rhashtable_walk_stop(&hti);
rhashtable_walk_exit(&hti); rhashtable_walk_exit(&hti);
he = nft_set_catchall_gc(set); if (gc)
if (he) { nft_trans_gc_queue_async_done(gc);
gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
if (gcb) done:
nft_set_gc_batch_add(gcb, he);
}
nft_set_gc_batch_complete(gcb);
queue_delayed_work(system_power_efficient_wq, &priv->gc_work, queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
nft_set_gc_interval(set)); nft_set_gc_interval(set));
} }
...@@ -394,7 +414,7 @@ static int nft_rhash_init(const struct nft_set *set, ...@@ -394,7 +414,7 @@ static int nft_rhash_init(const struct nft_set *set,
return err; return err;
INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rhash_gc); INIT_DEFERRABLE_WORK(&priv->gc_work, nft_rhash_gc);
if (set->flags & NFT_SET_TIMEOUT) if (set->flags & (NFT_SET_TIMEOUT | NFT_SET_EVAL))
nft_rhash_gc_init(set); nft_rhash_gc_init(set);
return 0; return 0;
...@@ -422,7 +442,6 @@ static void nft_rhash_destroy(const struct nft_ctx *ctx, ...@@ -422,7 +442,6 @@ static void nft_rhash_destroy(const struct nft_ctx *ctx,
}; };
cancel_delayed_work_sync(&priv->gc_work); cancel_delayed_work_sync(&priv->gc_work);
rcu_barrier();
rhashtable_free_and_destroy(&priv->ht, nft_rhash_elem_destroy, rhashtable_free_and_destroy(&priv->ht, nft_rhash_elem_destroy,
(void *)&rhash_ctx); (void *)&rhash_ctx);
} }
......
...@@ -566,8 +566,7 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net, ...@@ -566,8 +566,7 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net,
goto out; goto out;
if (last) { if (last) {
if (nft_set_elem_expired(&f->mt[b].e->ext) || if ((genmask &&
(genmask &&
!nft_set_elem_active(&f->mt[b].e->ext, genmask))) !nft_set_elem_active(&f->mt[b].e->ext, genmask)))
goto next_match; goto next_match;
...@@ -601,8 +600,17 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net, ...@@ -601,8 +600,17 @@ static struct nft_pipapo_elem *pipapo_get(const struct net *net,
static void *nft_pipapo_get(const struct net *net, const struct nft_set *set, static void *nft_pipapo_get(const struct net *net, const struct nft_set *set,
const struct nft_set_elem *elem, unsigned int flags) const struct nft_set_elem *elem, unsigned int flags)
{ {
return pipapo_get(net, set, (const u8 *)elem->key.val.data, struct nft_pipapo_elem *ret;
nft_genmask_cur(net));
ret = pipapo_get(net, set, (const u8 *)elem->key.val.data,
nft_genmask_cur(net));
if (IS_ERR(ret))
return ret;
if (nft_set_elem_expired(&ret->ext))
return ERR_PTR(-ENOENT);
return ret;
} }
/** /**
...@@ -1528,16 +1536,34 @@ static void pipapo_drop(struct nft_pipapo_match *m, ...@@ -1528,16 +1536,34 @@ static void pipapo_drop(struct nft_pipapo_match *m,
} }
} }
static void nft_pipapo_gc_deactivate(struct net *net, struct nft_set *set,
struct nft_pipapo_elem *e)
{
struct nft_set_elem elem = {
.priv = e,
};
nft_setelem_data_deactivate(net, set, &elem);
}
/** /**
* pipapo_gc() - Drop expired entries from set, destroy start and end elements * pipapo_gc() - Drop expired entries from set, destroy start and end elements
* @set: nftables API set representation * @set: nftables API set representation
* @m: Matching data * @m: Matching data
*/ */
static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) static void pipapo_gc(const struct nft_set *_set, struct nft_pipapo_match *m)
{ {
struct nft_set *set = (struct nft_set *) _set;
struct nft_pipapo *priv = nft_set_priv(set); struct nft_pipapo *priv = nft_set_priv(set);
struct net *net = read_pnet(&set->net);
int rules_f0, first_rule = 0; int rules_f0, first_rule = 0;
struct nft_pipapo_elem *e; struct nft_pipapo_elem *e;
struct nft_trans_gc *gc;
gc = nft_trans_gc_alloc(set, 0, GFP_KERNEL);
if (!gc)
return;
while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) { while ((rules_f0 = pipapo_rules_same_key(m->f, first_rule))) {
union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS]; union nft_pipapo_map_bucket rulemap[NFT_PIPAPO_MAX_FIELDS];
...@@ -1561,13 +1587,20 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) ...@@ -1561,13 +1587,20 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m)
f--; f--;
i--; i--;
e = f->mt[rulemap[i].to].e; e = f->mt[rulemap[i].to].e;
if (nft_set_elem_expired(&e->ext) &&
!nft_set_elem_mark_busy(&e->ext)) { /* synchronous gc never fails, there is no need to set on
* NFT_SET_ELEM_DEAD_BIT.
*/
if (nft_set_elem_expired(&e->ext)) {
priv->dirty = true; priv->dirty = true;
pipapo_drop(m, rulemap);
rcu_barrier(); gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
nft_set_elem_destroy(set, e, true); if (!gc)
break;
nft_pipapo_gc_deactivate(net, set, e);
pipapo_drop(m, rulemap);
nft_trans_gc_elem_add(gc, e);
/* And check again current first rule, which is now the /* And check again current first rule, which is now the
* first we haven't checked. * first we haven't checked.
...@@ -1577,11 +1610,11 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m) ...@@ -1577,11 +1610,11 @@ static void pipapo_gc(const struct nft_set *set, struct nft_pipapo_match *m)
} }
} }
e = nft_set_catchall_gc(set); gc = nft_trans_gc_catchall(gc, 0);
if (e) if (gc) {
nft_set_elem_destroy(set, e, true); nft_trans_gc_queue_sync_done(gc);
priv->last_gc = jiffies;
priv->last_gc = jiffies; }
} }
/** /**
...@@ -1706,7 +1739,6 @@ static void nft_pipapo_activate(const struct net *net, ...@@ -1706,7 +1739,6 @@ static void nft_pipapo_activate(const struct net *net,
return; return;
nft_set_elem_change_active(net, set, &e->ext); nft_set_elem_change_active(net, set, &e->ext);
nft_set_elem_clear_busy(&e->ext);
} }
/** /**
...@@ -2005,8 +2037,6 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set, ...@@ -2005,8 +2037,6 @@ static void nft_pipapo_walk(const struct nft_ctx *ctx, struct nft_set *set,
goto cont; goto cont;
e = f->mt[r].e; e = f->mt[r].e;
if (nft_set_elem_expired(&e->ext))
goto cont;
elem.priv = e; elem.priv = e;
......
...@@ -46,6 +46,12 @@ static int nft_rbtree_cmp(const struct nft_set *set, ...@@ -46,6 +46,12 @@ static int nft_rbtree_cmp(const struct nft_set *set,
set->klen); set->klen);
} }
static bool nft_rbtree_elem_expired(const struct nft_rbtree_elem *rbe)
{
return nft_set_elem_expired(&rbe->ext) ||
nft_set_elem_is_dead(&rbe->ext);
}
static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set, static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set,
const u32 *key, const struct nft_set_ext **ext, const u32 *key, const struct nft_set_ext **ext,
unsigned int seq) unsigned int seq)
...@@ -80,7 +86,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set ...@@ -80,7 +86,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
continue; continue;
} }
if (nft_set_elem_expired(&rbe->ext)) if (nft_rbtree_elem_expired(rbe))
return false; return false;
if (nft_rbtree_interval_end(rbe)) { if (nft_rbtree_interval_end(rbe)) {
...@@ -98,7 +104,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set ...@@ -98,7 +104,7 @@ static bool __nft_rbtree_lookup(const struct net *net, const struct nft_set *set
if (set->flags & NFT_SET_INTERVAL && interval != NULL && if (set->flags & NFT_SET_INTERVAL && interval != NULL &&
nft_set_elem_active(&interval->ext, genmask) && nft_set_elem_active(&interval->ext, genmask) &&
!nft_set_elem_expired(&interval->ext) && !nft_rbtree_elem_expired(interval) &&
nft_rbtree_interval_start(interval)) { nft_rbtree_interval_start(interval)) {
*ext = &interval->ext; *ext = &interval->ext;
return true; return true;
...@@ -215,6 +221,18 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set, ...@@ -215,6 +221,18 @@ static void *nft_rbtree_get(const struct net *net, const struct nft_set *set,
return rbe; return rbe;
} }
static void nft_rbtree_gc_remove(struct net *net, struct nft_set *set,
struct nft_rbtree *priv,
struct nft_rbtree_elem *rbe)
{
struct nft_set_elem elem = {
.priv = rbe,
};
nft_setelem_data_deactivate(net, set, &elem);
rb_erase(&rbe->node, &priv->root);
}
static int nft_rbtree_gc_elem(const struct nft_set *__set, static int nft_rbtree_gc_elem(const struct nft_set *__set,
struct nft_rbtree *priv, struct nft_rbtree *priv,
struct nft_rbtree_elem *rbe, struct nft_rbtree_elem *rbe,
...@@ -222,11 +240,12 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, ...@@ -222,11 +240,12 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set,
{ {
struct nft_set *set = (struct nft_set *)__set; struct nft_set *set = (struct nft_set *)__set;
struct rb_node *prev = rb_prev(&rbe->node); struct rb_node *prev = rb_prev(&rbe->node);
struct net *net = read_pnet(&set->net);
struct nft_rbtree_elem *rbe_prev; struct nft_rbtree_elem *rbe_prev;
struct nft_set_gc_batch *gcb; struct nft_trans_gc *gc;
gcb = nft_set_gc_batch_check(set, NULL, GFP_ATOMIC); gc = nft_trans_gc_alloc(set, 0, GFP_ATOMIC);
if (!gcb) if (!gc)
return -ENOMEM; return -ENOMEM;
/* search for end interval coming before this element. /* search for end interval coming before this element.
...@@ -244,17 +263,28 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set, ...@@ -244,17 +263,28 @@ static int nft_rbtree_gc_elem(const struct nft_set *__set,
if (prev) { if (prev) {
rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node); rbe_prev = rb_entry(prev, struct nft_rbtree_elem, node);
nft_rbtree_gc_remove(net, set, priv, rbe_prev);
rb_erase(&rbe_prev->node, &priv->root); /* There is always room in this trans gc for this element,
atomic_dec(&set->nelems); * memory allocation never actually happens, hence, the warning
nft_set_gc_batch_add(gcb, rbe_prev); * splat in such case. No need to set NFT_SET_ELEM_DEAD_BIT,
* this is synchronous gc which never fails.
*/
gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
if (WARN_ON_ONCE(!gc))
return -ENOMEM;
nft_trans_gc_elem_add(gc, rbe_prev);
} }
rb_erase(&rbe->node, &priv->root); nft_rbtree_gc_remove(net, set, priv, rbe);
atomic_dec(&set->nelems); gc = nft_trans_gc_queue_sync(gc, GFP_ATOMIC);
if (WARN_ON_ONCE(!gc))
return -ENOMEM;
nft_trans_gc_elem_add(gc, rbe);
nft_set_gc_batch_add(gcb, rbe); nft_trans_gc_queue_sync_done(gc);
nft_set_gc_batch_complete(gcb);
return 0; return 0;
} }
...@@ -482,7 +512,6 @@ static void nft_rbtree_activate(const struct net *net, ...@@ -482,7 +512,6 @@ static void nft_rbtree_activate(const struct net *net,
struct nft_rbtree_elem *rbe = elem->priv; struct nft_rbtree_elem *rbe = elem->priv;
nft_set_elem_change_active(net, set, &rbe->ext); nft_set_elem_change_active(net, set, &rbe->ext);
nft_set_elem_clear_busy(&rbe->ext);
} }
static bool nft_rbtree_flush(const struct net *net, static bool nft_rbtree_flush(const struct net *net,
...@@ -490,12 +519,9 @@ static bool nft_rbtree_flush(const struct net *net, ...@@ -490,12 +519,9 @@ static bool nft_rbtree_flush(const struct net *net,
{ {
struct nft_rbtree_elem *rbe = priv; struct nft_rbtree_elem *rbe = priv;
if (!nft_set_elem_mark_busy(&rbe->ext) || nft_set_elem_change_active(net, set, &rbe->ext);
!nft_is_active(net, &rbe->ext)) {
nft_set_elem_change_active(net, set, &rbe->ext); return true;
return true;
}
return false;
} }
static void *nft_rbtree_deactivate(const struct net *net, static void *nft_rbtree_deactivate(const struct net *net,
...@@ -552,8 +578,6 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, ...@@ -552,8 +578,6 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
if (iter->count < iter->skip) if (iter->count < iter->skip)
goto cont; goto cont;
if (nft_set_elem_expired(&rbe->ext))
goto cont;
if (!nft_set_elem_active(&rbe->ext, iter->genmask)) if (!nft_set_elem_active(&rbe->ext, iter->genmask))
goto cont; goto cont;
...@@ -572,26 +596,40 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx, ...@@ -572,26 +596,40 @@ static void nft_rbtree_walk(const struct nft_ctx *ctx,
static void nft_rbtree_gc(struct work_struct *work) static void nft_rbtree_gc(struct work_struct *work)
{ {
struct nft_rbtree_elem *rbe, *rbe_end = NULL, *rbe_prev = NULL; struct nft_rbtree_elem *rbe, *rbe_end = NULL;
struct nft_set_gc_batch *gcb = NULL; struct nftables_pernet *nft_net;
struct nft_rbtree *priv; struct nft_rbtree *priv;
struct nft_trans_gc *gc;
struct rb_node *node; struct rb_node *node;
struct nft_set *set; struct nft_set *set;
unsigned int gc_seq;
struct net *net; struct net *net;
u8 genmask;
priv = container_of(work, struct nft_rbtree, gc_work.work); priv = container_of(work, struct nft_rbtree, gc_work.work);
set = nft_set_container_of(priv); set = nft_set_container_of(priv);
net = read_pnet(&set->net); net = read_pnet(&set->net);
genmask = nft_genmask_cur(net); nft_net = nft_pernet(net);
gc_seq = READ_ONCE(nft_net->gc_seq);
gc = nft_trans_gc_alloc(set, gc_seq, GFP_KERNEL);
if (!gc)
goto done;
write_lock_bh(&priv->lock); write_lock_bh(&priv->lock);
write_seqcount_begin(&priv->count); write_seqcount_begin(&priv->count);
for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) { for (node = rb_first(&priv->root); node != NULL; node = rb_next(node)) {
/* Ruleset has been updated, try later. */
if (READ_ONCE(nft_net->gc_seq) != gc_seq) {
nft_trans_gc_destroy(gc);
gc = NULL;
goto try_later;
}
rbe = rb_entry(node, struct nft_rbtree_elem, node); rbe = rb_entry(node, struct nft_rbtree_elem, node);
if (!nft_set_elem_active(&rbe->ext, genmask)) if (nft_set_elem_is_dead(&rbe->ext))
continue; goto dead_elem;
/* elements are reversed in the rbtree for historical reasons, /* elements are reversed in the rbtree for historical reasons,
* from highest to lowest value, that is why end element is * from highest to lowest value, that is why end element is
...@@ -604,46 +642,36 @@ static void nft_rbtree_gc(struct work_struct *work) ...@@ -604,46 +642,36 @@ static void nft_rbtree_gc(struct work_struct *work)
if (!nft_set_elem_expired(&rbe->ext)) if (!nft_set_elem_expired(&rbe->ext))
continue; continue;
if (nft_set_elem_mark_busy(&rbe->ext)) { nft_set_elem_dead(&rbe->ext);
rbe_end = NULL;
if (!rbe_end)
continue; continue;
}
if (rbe_prev) { nft_set_elem_dead(&rbe_end->ext);
rb_erase(&rbe_prev->node, &priv->root);
rbe_prev = NULL;
}
gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC);
if (!gcb)
break;
atomic_dec(&set->nelems); gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
nft_set_gc_batch_add(gcb, rbe); if (!gc)
rbe_prev = rbe; goto try_later;
if (rbe_end) { nft_trans_gc_elem_add(gc, rbe_end);
atomic_dec(&set->nelems); rbe_end = NULL;
nft_set_gc_batch_add(gcb, rbe_end); dead_elem:
rb_erase(&rbe_end->node, &priv->root); gc = nft_trans_gc_queue_async(gc, gc_seq, GFP_ATOMIC);
rbe_end = NULL; if (!gc)
} goto try_later;
node = rb_next(node);
if (!node) nft_trans_gc_elem_add(gc, rbe);
break;
} }
if (rbe_prev)
rb_erase(&rbe_prev->node, &priv->root); gc = nft_trans_gc_catchall(gc, gc_seq);
try_later:
write_seqcount_end(&priv->count); write_seqcount_end(&priv->count);
write_unlock_bh(&priv->lock); write_unlock_bh(&priv->lock);
rbe = nft_set_catchall_gc(set); if (gc)
if (rbe) { nft_trans_gc_queue_async_done(gc);
gcb = nft_set_gc_batch_check(set, gcb, GFP_ATOMIC); done:
if (gcb)
nft_set_gc_batch_add(gcb, rbe);
}
nft_set_gc_batch_complete(gcb);
queue_delayed_work(system_power_efficient_wq, &priv->gc_work, queue_delayed_work(system_power_efficient_wq, &priv->gc_work,
nft_set_gc_interval(set)); nft_set_gc_interval(set));
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment