Commit fead2b86 authored by Michal Hocko's avatar Michal Hocko Committed by Linus Torvalds

mm/memcg: revert ("mm/memcg: optimize user context object stock access")

Patch series "mm/memcg: Address PREEMPT_RT problems instead of disabling it", v5.

This series aims to address the memcg related problem on PREEMPT_RT.

I tested them on CONFIG_PREEMPT and CONFIG_PREEMPT_RT with the
tools/testing/selftests/cgroup/* tests and I haven't observed any
regressions (other than the lockdep report that is already there).

This patch (of 6):

The optimisation is based on a micro benchmark where local_irq_save() is
more expensive than a preempt_disable().  There is no evidence that it
is visible in a real-world workload and there are CPUs where the
opposite is true (local_irq_save() is cheaper than preempt_disable()).

Based on micro benchmarks, the optimisation makes sense on PREEMPT_NONE
where preempt_disable() is optimized away.  There is no improvement with
PREEMPT_DYNAMIC since the preemption counter is always available.

The optimization makes also the PREEMPT_RT integration more complicated
since most of the assumption are not true on PREEMPT_RT.

Revert the optimisation since it complicates the PREEMPT_RT integration
and the improvement is hardly visible.

[bigeasy@linutronix.de: patch body around Michal's diff]

Link: https://lkml.kernel.org/r/20220226204144.1008339-1-bigeasy@linutronix.de
Link: https://lore.kernel.org/all/YgOGkXXCrD%2F1k+p4@dhcp22.suse.cz
Link: https://lkml.kernel.org/r/YdX+INO9gQje6d0S@linutronix.de
Link: https://lkml.kernel.org/r/20220226204144.1008339-2-bigeasy@linutronix.deSigned-off-by: default avatarMichal Hocko <mhocko@suse.com>
Signed-off-by: default avatarSebastian Andrzej Siewior <bigeasy@linutronix.de>
Acked-by: default avatarRoman Gushchin <guro@fb.com>
Acked-by: default avatarJohannes Weiner <hannes@cmpxchg.org>
Reviewed-by: default avatarShakeel Butt <shakeelb@google.com>
Acked-by: default avatarMichal Hocko <mhocko@suse.com>
Cc: Johannes Weiner <hannes@cmpxchg.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Vladimir Davydov <vdavydov.dev@gmail.com>
Cc: Waiman Long <longman@redhat.com>
Cc: kernel test robot <oliver.sang@intel.com>
Cc: Michal Hocko <mhocko@kernel.org>
Cc: Michal Koutný <mkoutny@suse.com>
Signed-off-by: default avatarAndrew Morton <akpm@linux-foundation.org>
Signed-off-by: default avatarLinus Torvalds <torvalds@linux-foundation.org>
parent 460a79e1
...@@ -2078,23 +2078,17 @@ void unlock_page_memcg(struct page *page) ...@@ -2078,23 +2078,17 @@ void unlock_page_memcg(struct page *page)
folio_memcg_unlock(page_folio(page)); folio_memcg_unlock(page_folio(page));
} }
struct obj_stock { struct memcg_stock_pcp {
struct mem_cgroup *cached; /* this never be root cgroup */
unsigned int nr_pages;
#ifdef CONFIG_MEMCG_KMEM #ifdef CONFIG_MEMCG_KMEM
struct obj_cgroup *cached_objcg; struct obj_cgroup *cached_objcg;
struct pglist_data *cached_pgdat; struct pglist_data *cached_pgdat;
unsigned int nr_bytes; unsigned int nr_bytes;
int nr_slab_reclaimable_b; int nr_slab_reclaimable_b;
int nr_slab_unreclaimable_b; int nr_slab_unreclaimable_b;
#else
int dummy[0];
#endif #endif
};
struct memcg_stock_pcp {
struct mem_cgroup *cached; /* this never be root cgroup */
unsigned int nr_pages;
struct obj_stock task_obj;
struct obj_stock irq_obj;
struct work_struct work; struct work_struct work;
unsigned long flags; unsigned long flags;
...@@ -2104,13 +2098,13 @@ static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock); ...@@ -2104,13 +2098,13 @@ static DEFINE_PER_CPU(struct memcg_stock_pcp, memcg_stock);
static DEFINE_MUTEX(percpu_charge_mutex); static DEFINE_MUTEX(percpu_charge_mutex);
#ifdef CONFIG_MEMCG_KMEM #ifdef CONFIG_MEMCG_KMEM
static void drain_obj_stock(struct obj_stock *stock); static void drain_obj_stock(struct memcg_stock_pcp *stock);
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
struct mem_cgroup *root_memcg); struct mem_cgroup *root_memcg);
static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages); static void memcg_account_kmem(struct mem_cgroup *memcg, int nr_pages);
#else #else
static inline void drain_obj_stock(struct obj_stock *stock) static inline void drain_obj_stock(struct memcg_stock_pcp *stock)
{ {
} }
static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
...@@ -2190,9 +2184,7 @@ static void drain_local_stock(struct work_struct *dummy) ...@@ -2190,9 +2184,7 @@ static void drain_local_stock(struct work_struct *dummy)
local_irq_save(flags); local_irq_save(flags);
stock = this_cpu_ptr(&memcg_stock); stock = this_cpu_ptr(&memcg_stock);
drain_obj_stock(&stock->irq_obj); drain_obj_stock(stock);
if (in_task())
drain_obj_stock(&stock->task_obj);
drain_stock(stock); drain_stock(stock);
clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags); clear_bit(FLUSHING_CACHED_CHARGE, &stock->flags);
...@@ -2767,41 +2759,6 @@ static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg) ...@@ -2767,41 +2759,6 @@ static struct mem_cgroup *get_mem_cgroup_from_objcg(struct obj_cgroup *objcg)
*/ */
#define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT) #define OBJCGS_CLEAR_MASK (__GFP_DMA | __GFP_RECLAIMABLE | __GFP_ACCOUNT)
/*
* Most kmem_cache_alloc() calls are from user context. The irq disable/enable
* sequence used in this case to access content from object stock is slow.
* To optimize for user context access, there are now two object stocks for
* task context and interrupt context access respectively.
*
* The task context object stock can be accessed by disabling preemption only
* which is cheap in non-preempt kernel. The interrupt context object stock
* can only be accessed after disabling interrupt. User context code can
* access interrupt object stock, but not vice versa.
*/
static inline struct obj_stock *get_obj_stock(unsigned long *pflags)
{
struct memcg_stock_pcp *stock;
if (likely(in_task())) {
*pflags = 0UL;
preempt_disable();
stock = this_cpu_ptr(&memcg_stock);
return &stock->task_obj;
}
local_irq_save(*pflags);
stock = this_cpu_ptr(&memcg_stock);
return &stock->irq_obj;
}
static inline void put_obj_stock(unsigned long flags)
{
if (likely(in_task()))
preempt_enable();
else
local_irq_restore(flags);
}
/* /*
* mod_objcg_mlstate() may be called with irq enabled, so * mod_objcg_mlstate() may be called with irq enabled, so
* mod_memcg_lruvec_state() should be used. * mod_memcg_lruvec_state() should be used.
...@@ -3082,10 +3039,13 @@ void __memcg_kmem_uncharge_page(struct page *page, int order) ...@@ -3082,10 +3039,13 @@ void __memcg_kmem_uncharge_page(struct page *page, int order)
void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
enum node_stat_item idx, int nr) enum node_stat_item idx, int nr)
{ {
struct memcg_stock_pcp *stock;
unsigned long flags; unsigned long flags;
struct obj_stock *stock = get_obj_stock(&flags);
int *bytes; int *bytes;
local_irq_save(flags);
stock = this_cpu_ptr(&memcg_stock);
/* /*
* Save vmstat data in stock and skip vmstat array update unless * Save vmstat data in stock and skip vmstat array update unless
* accumulating over a page of vmstat data or when pgdat or idx * accumulating over a page of vmstat data or when pgdat or idx
...@@ -3136,26 +3096,29 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat, ...@@ -3136,26 +3096,29 @@ void mod_objcg_state(struct obj_cgroup *objcg, struct pglist_data *pgdat,
if (nr) if (nr)
mod_objcg_mlstate(objcg, pgdat, idx, nr); mod_objcg_mlstate(objcg, pgdat, idx, nr);
put_obj_stock(flags); local_irq_restore(flags);
} }
static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes) static bool consume_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes)
{ {
struct memcg_stock_pcp *stock;
unsigned long flags; unsigned long flags;
struct obj_stock *stock = get_obj_stock(&flags);
bool ret = false; bool ret = false;
local_irq_save(flags);
stock = this_cpu_ptr(&memcg_stock);
if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) { if (objcg == stock->cached_objcg && stock->nr_bytes >= nr_bytes) {
stock->nr_bytes -= nr_bytes; stock->nr_bytes -= nr_bytes;
ret = true; ret = true;
} }
put_obj_stock(flags); local_irq_restore(flags);
return ret; return ret;
} }
static void drain_obj_stock(struct obj_stock *stock) static void drain_obj_stock(struct memcg_stock_pcp *stock)
{ {
struct obj_cgroup *old = stock->cached_objcg; struct obj_cgroup *old = stock->cached_objcg;
...@@ -3211,13 +3174,8 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, ...@@ -3211,13 +3174,8 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
{ {
struct mem_cgroup *memcg; struct mem_cgroup *memcg;
if (in_task() && stock->task_obj.cached_objcg) { if (stock->cached_objcg) {
memcg = obj_cgroup_memcg(stock->task_obj.cached_objcg); memcg = obj_cgroup_memcg(stock->cached_objcg);
if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
return true;
}
if (stock->irq_obj.cached_objcg) {
memcg = obj_cgroup_memcg(stock->irq_obj.cached_objcg);
if (memcg && mem_cgroup_is_descendant(memcg, root_memcg)) if (memcg && mem_cgroup_is_descendant(memcg, root_memcg))
return true; return true;
} }
...@@ -3228,10 +3186,13 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock, ...@@ -3228,10 +3186,13 @@ static bool obj_stock_flush_required(struct memcg_stock_pcp *stock,
static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
bool allow_uncharge) bool allow_uncharge)
{ {
struct memcg_stock_pcp *stock;
unsigned long flags; unsigned long flags;
struct obj_stock *stock = get_obj_stock(&flags);
unsigned int nr_pages = 0; unsigned int nr_pages = 0;
local_irq_save(flags);
stock = this_cpu_ptr(&memcg_stock);
if (stock->cached_objcg != objcg) { /* reset if necessary */ if (stock->cached_objcg != objcg) { /* reset if necessary */
drain_obj_stock(stock); drain_obj_stock(stock);
obj_cgroup_get(objcg); obj_cgroup_get(objcg);
...@@ -3247,7 +3208,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes, ...@@ -3247,7 +3208,7 @@ static void refill_obj_stock(struct obj_cgroup *objcg, unsigned int nr_bytes,
stock->nr_bytes &= (PAGE_SIZE - 1); stock->nr_bytes &= (PAGE_SIZE - 1);
} }
put_obj_stock(flags); local_irq_restore(flags);
if (nr_pages) if (nr_pages)
obj_cgroup_uncharge_pages(objcg, nr_pages); obj_cgroup_uncharge_pages(objcg, nr_pages);
...@@ -6826,7 +6787,6 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) ...@@ -6826,7 +6787,6 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
long nr_pages; long nr_pages;
struct mem_cgroup *memcg; struct mem_cgroup *memcg;
struct obj_cgroup *objcg; struct obj_cgroup *objcg;
bool use_objcg = folio_memcg_kmem(folio);
VM_BUG_ON_FOLIO(folio_test_lru(folio), folio); VM_BUG_ON_FOLIO(folio_test_lru(folio), folio);
...@@ -6835,7 +6795,7 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) ...@@ -6835,7 +6795,7 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
* folio memcg or objcg at this point, we have fully * folio memcg or objcg at this point, we have fully
* exclusive access to the folio. * exclusive access to the folio.
*/ */
if (use_objcg) { if (folio_memcg_kmem(folio)) {
objcg = __folio_objcg(folio); objcg = __folio_objcg(folio);
/* /*
* This get matches the put at the end of the function and * This get matches the put at the end of the function and
...@@ -6863,7 +6823,7 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug) ...@@ -6863,7 +6823,7 @@ static void uncharge_folio(struct folio *folio, struct uncharge_gather *ug)
nr_pages = folio_nr_pages(folio); nr_pages = folio_nr_pages(folio);
if (use_objcg) { if (folio_memcg_kmem(folio)) {
ug->nr_memory += nr_pages; ug->nr_memory += nr_pages;
ug->nr_kmem += nr_pages; ug->nr_kmem += nr_pages;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment