Commit 5ab0fc15 authored by Andrew Morton

Sync mm-stable with mm-hotfixes-stable to pick up dependent patches

Merge branch 'mm-hotfixes-stable' into mm-stable
parents 9a3f21fe ac86f547
...@@ -130,6 +130,7 @@ Domen Puncer <domen@coderock.org> ...@@ -130,6 +130,7 @@ Domen Puncer <domen@coderock.org>
Douglas Gilbert <dougg@torque.net> Douglas Gilbert <dougg@torque.net>
Ed L. Cashin <ecashin@coraid.com> Ed L. Cashin <ecashin@coraid.com>
Erik Kaneda <erik.kaneda@intel.com> <erik.schmauss@intel.com> Erik Kaneda <erik.kaneda@intel.com> <erik.schmauss@intel.com>
Eugen Hristev <eugen.hristev@collabora.com> <eugen.hristev@microchip.com>
Evgeniy Polyakov <johnpol@2ka.mipt.ru> Evgeniy Polyakov <johnpol@2ka.mipt.ru>
Ezequiel Garcia <ezequiel@vanguardiasur.com.ar> <ezequiel@collabora.com> Ezequiel Garcia <ezequiel@vanguardiasur.com.ar> <ezequiel@collabora.com>
Felipe W Damasio <felipewd@terra.com.br> Felipe W Damasio <felipewd@terra.com.br>
......
...@@ -1245,13 +1245,17 @@ PAGE_SIZE multiple when read back. ...@@ -1245,13 +1245,17 @@ PAGE_SIZE multiple when read back.
This is a simple interface to trigger memory reclaim in the This is a simple interface to trigger memory reclaim in the
target cgroup. target cgroup.
This file accepts a string which contains the number of bytes to This file accepts a single key, the number of bytes to reclaim.
reclaim. No nested keys are currently supported.
Example:: Example::
echo "1G" > memory.reclaim echo "1G" > memory.reclaim
The interface can be later extended with nested keys to
configure the reclaim behavior. For example, specify the
type of memory to reclaim from (anon, file, ..).
Please note that the kernel can over or under reclaim from Please note that the kernel can over or under reclaim from
the target cgroup. If less bytes are reclaimed than the the target cgroup. If less bytes are reclaimed than the
specified amount, -EAGAIN is returned. specified amount, -EAGAIN is returned.
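A minimal user-space sketch of driving this interface from C is shown below; it is not part of the patch, the cgroup path is a made-up example, and it simply retries when the write fails with EAGAIN as described above (cgroup v2 assumed mounted at /sys/fs/cgroup):

    /* Hypothetical example, not from the kernel tree: request reclaim of 1G
     * from a cgroup and retry a few times if less than that was reclaimed. */
    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>

    int main(void)
    {
        const char *path = "/sys/fs/cgroup/example/memory.reclaim"; /* assumed group */
        const char *req = "1G";
        int retries = 3;

        while (retries--) {
            int fd = open(path, O_WRONLY);
            ssize_t n;

            if (fd < 0) {
                perror("open");
                return 1;
            }
            n = write(fd, req, strlen(req));
            close(fd);
            if (n >= 0)
                return 0;               /* requested amount reclaimed */
            if (errno != EAGAIN) {      /* EAGAIN: less than 1G was reclaimed */
                perror("write");
                return 1;
            }
        }
        return 1;
    }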
...@@ -1263,13 +1267,6 @@ PAGE_SIZE multiple when read back. ...@@ -1263,13 +1267,6 @@ PAGE_SIZE multiple when read back.
This means that the networking layer will not adapt based on This means that the networking layer will not adapt based on
reclaim induced by memory.reclaim. reclaim induced by memory.reclaim.
This file also allows the user to specify the nodes to reclaim from,
via the 'nodes=' key, for example::
echo "1G nodes=0,1" > memory.reclaim
The above instructs the kernel to reclaim memory from nodes 0,1.
memory.peak memory.peak
A read-only single value file which exists on non-root A read-only single value file which exists on non-root
cgroups. cgroups.
......
...@@ -170,6 +170,9 @@ ia64_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, u ...@@ -170,6 +170,9 @@ ia64_mremap (unsigned long addr, unsigned long old_len, unsigned long new_len, u
asmlinkage long asmlinkage long
ia64_clock_getres(const clockid_t which_clock, struct __kernel_timespec __user *tp) ia64_clock_getres(const clockid_t which_clock, struct __kernel_timespec __user *tp)
{ {
struct timespec64 rtn_tp;
s64 tick_ns;
/* /*
* ia64's clock_gettime() syscall is implemented as a vdso call * ia64's clock_gettime() syscall is implemented as a vdso call
* fsys_clock_gettime(). Currently it handles only * fsys_clock_gettime(). Currently it handles only
...@@ -185,8 +188,8 @@ ia64_clock_getres(const clockid_t which_clock, struct __kernel_timespec __user * ...@@ -185,8 +188,8 @@ ia64_clock_getres(const clockid_t which_clock, struct __kernel_timespec __user *
switch (which_clock) { switch (which_clock) {
case CLOCK_REALTIME: case CLOCK_REALTIME:
case CLOCK_MONOTONIC: case CLOCK_MONOTONIC:
s64 tick_ns = DIV_ROUND_UP(NSEC_PER_SEC, local_cpu_data->itc_freq); tick_ns = DIV_ROUND_UP(NSEC_PER_SEC, local_cpu_data->itc_freq);
struct timespec64 rtn_tp = ns_to_timespec64(tick_ns); rtn_tp = ns_to_timespec64(tick_ns);
return put_timespec64(&rtn_tp, tp); return put_timespec64(&rtn_tp, tp);
} }
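The hunk above hoists tick_ns and rtn_tp to the top of the function because, before C23, a case label must be followed by a statement and a declaration is not one, so declaring them directly under case CLOCK_MONOTONIC: breaks the build with newer compilers. A standalone sketch of the same pattern (values are illustrative only, not the kernel's):

    #include <stdio.h>

    static long clock_res_ns(int clk)
    {
        long tick_ns;   /* hoisted: "case 1: long tick_ns = ...;" is rejected pre-C23 */

        switch (clk) {
        case 0:
        case 1:
            tick_ns = 1000000000L / 250;   /* e.g. a 250 Hz tick, for illustration */
            return tick_ns;
        }
        return -1;
    }

    int main(void)
    {
        printf("%ld ns\n", clock_res_ns(1));
        return 0;
    }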
......
...@@ -4,6 +4,7 @@ ...@@ -4,6 +4,7 @@
* Written by Niibe Yutaka and Paul Mundt * Written by Niibe Yutaka and Paul Mundt
*/ */
OUTPUT_ARCH(sh) OUTPUT_ARCH(sh)
#define RUNTIME_DISCARD_EXIT
#include <asm/thread_info.h> #include <asm/thread_info.h>
#include <asm/cache.h> #include <asm/cache.h>
#include <asm/vmlinux.lds.h> #include <asm/vmlinux.lds.h>
......
...@@ -26,7 +26,6 @@ ...@@ -26,7 +26,6 @@
#include <linux/serial_core.h> #include <linux/serial_core.h>
#include <linux/sysfs.h> #include <linux/sysfs.h>
#include <linux/random.h> #include <linux/random.h>
#include <linux/kmemleak.h>
#include <asm/setup.h> /* for COMMAND_LINE_SIZE */ #include <asm/setup.h> /* for COMMAND_LINE_SIZE */
#include <asm/page.h> #include <asm/page.h>
...@@ -525,12 +524,9 @@ static int __init __reserved_mem_reserve_reg(unsigned long node, ...@@ -525,12 +524,9 @@ static int __init __reserved_mem_reserve_reg(unsigned long node,
size = dt_mem_next_cell(dt_root_size_cells, &prop); size = dt_mem_next_cell(dt_root_size_cells, &prop);
if (size && if (size &&
early_init_dt_reserve_memory(base, size, nomap) == 0) { early_init_dt_reserve_memory(base, size, nomap) == 0)
pr_debug("Reserved memory: reserved region for node '%s': base %pa, size %lu MiB\n", pr_debug("Reserved memory: reserved region for node '%s': base %pa, size %lu MiB\n",
uname, &base, (unsigned long)(size / SZ_1M)); uname, &base, (unsigned long)(size / SZ_1M));
if (!nomap)
kmemleak_alloc_phys(base, size, 0);
}
else else
pr_err("Reserved memory: failed to reserve memory for node '%s': base %pa, size %lu MiB\n", pr_err("Reserved memory: failed to reserve memory for node '%s': base %pa, size %lu MiB\n",
uname, &base, (unsigned long)(size / SZ_1M)); uname, &base, (unsigned long)(size / SZ_1M));
......
...@@ -8,7 +8,7 @@ config VXFS_FS ...@@ -8,7 +8,7 @@ config VXFS_FS
of SCO UnixWare (and possibly others) and optionally available of SCO UnixWare (and possibly others) and optionally available
for Sunsoft Solaris, HP-UX and many other operating systems. However for Sunsoft Solaris, HP-UX and many other operating systems. However
these particular OS implementations of vxfs may differ in on-disk these particular OS implementations of vxfs may differ in on-disk
data endianess and/or superblock offset. The vxfs module has been data endianness and/or superblock offset. The vxfs module has been
tested with SCO UnixWare and HP-UX B.10.20 (pa-risc 1.1 arch.) tested with SCO UnixWare and HP-UX B.10.20 (pa-risc 1.1 arch.)
Currently only readonly access is supported and VxFX versions Currently only readonly access is supported and VxFX versions
2, 3 and 4. Tests were performed with HP-UX VxFS version 3. 2, 3 and 4. Tests were performed with HP-UX VxFS version 3.
......
...@@ -745,9 +745,7 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask, ...@@ -745,9 +745,7 @@ static int smaps_hugetlb_range(pte_t *pte, unsigned long hmask,
page = pfn_swap_entry_to_page(swpent); page = pfn_swap_entry_to_page(swpent);
} }
if (page) { if (page) {
int mapcount = page_mapcount(page); if (page_mapcount(page) >= 2 || hugetlb_pmd_shared(pte))
if (mapcount >= 2)
mss->shared_hugetlb += huge_page_size(hstate_vma(vma)); mss->shared_hugetlb += huge_page_size(hstate_vma(vma));
else else
mss->private_hugetlb += huge_page_size(hstate_vma(vma)); mss->private_hugetlb += huge_page_size(hstate_vma(vma));
......
...@@ -183,7 +183,7 @@ static inline int squashfs_block_size(__le32 raw) ...@@ -183,7 +183,7 @@ static inline int squashfs_block_size(__le32 raw)
#define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\ #define SQUASHFS_ID_BLOCK_BYTES(A) (SQUASHFS_ID_BLOCKS(A) *\
sizeof(u64)) sizeof(u64))
/* xattr id lookup table defines */ /* xattr id lookup table defines */
#define SQUASHFS_XATTR_BYTES(A) ((A) * sizeof(struct squashfs_xattr_id)) #define SQUASHFS_XATTR_BYTES(A) (((u64) (A)) * sizeof(struct squashfs_xattr_id))
#define SQUASHFS_XATTR_BLOCK(A) (SQUASHFS_XATTR_BYTES(A) / \ #define SQUASHFS_XATTR_BLOCK(A) (SQUASHFS_XATTR_BYTES(A) / \
SQUASHFS_METADATA_SIZE) SQUASHFS_METADATA_SIZE)
......
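The (u64) cast added to SQUASHFS_XATTR_BYTES above widens the multiplication before it happens; with a 32-bit multiply, an implausibly large xattr id count read from a corrupted image can wrap the computed table size. A user-space sketch of the difference, with uint32_t standing in for the narrow case and 16 assumed as sizeof(struct squashfs_xattr_id):

    #include <stdint.h>
    #include <stdio.h>

    #define ENTRY_SIZE 16u   /* assumed sizeof(struct squashfs_xattr_id) */

    int main(void)
    {
        uint32_t ids = 0x20000000;                      /* bogus count from a corrupt image */
        uint32_t wrapped = ids * ENTRY_SIZE;            /* 32-bit multiply wraps to 0 */
        uint64_t widened = (uint64_t)ids * ENTRY_SIZE;  /* cast first, as the macro now does */

        printf("wrapped: %lu\n", (unsigned long)wrapped);
        printf("widened: %llu\n", (unsigned long long)widened);
        return 0;
    }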
...@@ -63,7 +63,7 @@ struct squashfs_sb_info { ...@@ -63,7 +63,7 @@ struct squashfs_sb_info {
long long bytes_used; long long bytes_used;
unsigned int inodes; unsigned int inodes;
unsigned int fragments; unsigned int fragments;
int xattr_ids; unsigned int xattr_ids;
unsigned int ids; unsigned int ids;
bool panic_on_errors; bool panic_on_errors;
const struct squashfs_decompressor_thread_ops *thread_ops; const struct squashfs_decompressor_thread_ops *thread_ops;
......
...@@ -10,12 +10,12 @@ ...@@ -10,12 +10,12 @@
#ifdef CONFIG_SQUASHFS_XATTR #ifdef CONFIG_SQUASHFS_XATTR
extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64, extern __le64 *squashfs_read_xattr_id_table(struct super_block *, u64,
u64 *, int *); u64 *, unsigned int *);
extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *, extern int squashfs_xattr_lookup(struct super_block *, unsigned int, int *,
unsigned int *, unsigned long long *); unsigned int *, unsigned long long *);
#else #else
static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb, static inline __le64 *squashfs_read_xattr_id_table(struct super_block *sb,
u64 start, u64 *xattr_table_start, int *xattr_ids) u64 start, u64 *xattr_table_start, unsigned int *xattr_ids)
{ {
struct squashfs_xattr_id_table *id_table; struct squashfs_xattr_id_table *id_table;
......
...@@ -56,7 +56,7 @@ int squashfs_xattr_lookup(struct super_block *sb, unsigned int index, ...@@ -56,7 +56,7 @@ int squashfs_xattr_lookup(struct super_block *sb, unsigned int index,
* Read uncompressed xattr id lookup table indexes from disk into memory * Read uncompressed xattr id lookup table indexes from disk into memory
*/ */
__le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 table_start, __le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 table_start,
u64 *xattr_table_start, int *xattr_ids) u64 *xattr_table_start, unsigned int *xattr_ids)
{ {
struct squashfs_sb_info *msblk = sb->s_fs_info; struct squashfs_sb_info *msblk = sb->s_fs_info;
unsigned int len, indexes; unsigned int len, indexes;
...@@ -76,7 +76,7 @@ __le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 table_start, ...@@ -76,7 +76,7 @@ __le64 *squashfs_read_xattr_id_table(struct super_block *sb, u64 table_start,
/* Sanity check values */ /* Sanity check values */
/* there is always at least one xattr id */ /* there is always at least one xattr id */
if (*xattr_ids == 0) if (*xattr_ids <= 0)
return ERR_PTR(-EINVAL); return ERR_PTR(-EINVAL);
len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids); len = SQUASHFS_XATTR_BLOCK_BYTES(*xattr_ids);
......
...@@ -200,7 +200,7 @@ static inline void *kmap_local_pfn(unsigned long pfn) ...@@ -200,7 +200,7 @@ static inline void *kmap_local_pfn(unsigned long pfn)
static inline void __kunmap_local(const void *addr) static inline void __kunmap_local(const void *addr)
{ {
#ifdef ARCH_HAS_FLUSH_ON_KUNMAP #ifdef ARCH_HAS_FLUSH_ON_KUNMAP
kunmap_flush_on_unmap(addr); kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE));
#endif #endif
} }
...@@ -227,7 +227,7 @@ static inline void *kmap_atomic_pfn(unsigned long pfn) ...@@ -227,7 +227,7 @@ static inline void *kmap_atomic_pfn(unsigned long pfn)
static inline void __kunmap_atomic(const void *addr) static inline void __kunmap_atomic(const void *addr)
{ {
#ifdef ARCH_HAS_FLUSH_ON_KUNMAP #ifdef ARCH_HAS_FLUSH_ON_KUNMAP
kunmap_flush_on_unmap(addr); kunmap_flush_on_unmap(PTR_ALIGN_DOWN(addr, PAGE_SIZE));
#endif #endif
pagefault_enable(); pagefault_enable();
if (IS_ENABLED(CONFIG_PREEMPT_RT)) if (IS_ENABLED(CONFIG_PREEMPT_RT))
......
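The two kunmap hunks above round the address down before calling kunmap_flush_on_unmap(), since the caller may pass a pointer that has been advanced somewhere into the mapped page while the flush wants the page's base address. A user-space sketch of what PTR_ALIGN_DOWN does here (4 KiB page size and a simplified macro assumed):

    #include <stdint.h>
    #include <stdio.h>

    #define PAGE_SIZE 4096UL
    /* simplified stand-in for the kernel's PTR_ALIGN_DOWN(p, a) */
    #define PTR_ALIGN_DOWN(p, a) ((void *)((uintptr_t)(p) & ~((uintptr_t)(a) - 1)))

    int main(void)
    {
        char buf[2 * PAGE_SIZE];
        char *addr = buf + PAGE_SIZE + 123;   /* pointer somewhere inside a "page" */

        printf("addr    %p\n", (void *)addr);
        printf("aligned %p\n", PTR_ALIGN_DOWN(addr, PAGE_SIZE));
        return 0;
    }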
...@@ -8,6 +8,7 @@ ...@@ -8,6 +8,7 @@
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/hugetlb_inline.h> #include <linux/hugetlb_inline.h>
#include <linux/cgroup.h> #include <linux/cgroup.h>
#include <linux/page_ref.h>
#include <linux/list.h> #include <linux/list.h>
#include <linux/kref.h> #include <linux/kref.h>
#include <linux/pgtable.h> #include <linux/pgtable.h>
...@@ -1225,6 +1226,18 @@ static inline __init void hugetlb_cma_reserve(int order) ...@@ -1225,6 +1226,18 @@ static inline __init void hugetlb_cma_reserve(int order)
} }
#endif #endif
#ifdef CONFIG_ARCH_WANT_HUGE_PMD_SHARE
static inline bool hugetlb_pmd_shared(pte_t *pte)
{
return page_count(virt_to_page(pte)) > 1;
}
#else
static inline bool hugetlb_pmd_shared(pte_t *pte)
{
return false;
}
#endif
bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr); bool want_pmd_share(struct vm_area_struct *vma, unsigned long addr);
#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE #ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
......
...@@ -1688,10 +1688,13 @@ void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio, ...@@ -1688,10 +1688,13 @@ void mem_cgroup_track_foreign_dirty_slowpath(struct folio *folio,
static inline void mem_cgroup_track_foreign_dirty(struct folio *folio, static inline void mem_cgroup_track_foreign_dirty(struct folio *folio,
struct bdi_writeback *wb) struct bdi_writeback *wb)
{ {
struct mem_cgroup *memcg;
if (mem_cgroup_disabled()) if (mem_cgroup_disabled())
return; return;
if (unlikely(&folio_memcg(folio)->css != wb->memcg_css)) memcg = folio_memcg(folio);
if (unlikely(memcg && &memcg->css != wb->memcg_css))
mem_cgroup_track_foreign_dirty_slowpath(folio, wb); mem_cgroup_track_foreign_dirty_slowpath(folio, wb);
} }
......
...@@ -418,8 +418,7 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order, ...@@ -418,8 +418,7 @@ extern unsigned long try_to_free_pages(struct zonelist *zonelist, int order,
extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, extern unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
unsigned long nr_pages, unsigned long nr_pages,
gfp_t gfp_mask, gfp_t gfp_mask,
unsigned int reclaim_options, unsigned int reclaim_options);
nodemask_t *nodemask);
extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem, extern unsigned long mem_cgroup_shrink_node(struct mem_cgroup *mem,
gfp_t gfp_mask, bool noswap, gfp_t gfp_mask, bool noswap,
pg_data_t *pgdat, pg_data_t *pgdat,
......
...@@ -754,6 +754,7 @@ config DEBUG_KMEMLEAK ...@@ -754,6 +754,7 @@ config DEBUG_KMEMLEAK
select KALLSYMS select KALLSYMS
select CRC32 select CRC32
select STACKDEPOT select STACKDEPOT
select STACKDEPOT_ALWAYS_INIT if !DEBUG_KMEMLEAK_DEFAULT_OFF
help help
Say Y here if you want to enable the memory leak Say Y here if you want to enable the memory leak
detector. The memory allocation/freeing is traced in a way detector. The memory allocation/freeing is traced in a way
...@@ -1207,7 +1208,7 @@ config SCHED_DEBUG ...@@ -1207,7 +1208,7 @@ config SCHED_DEBUG
depends on DEBUG_KERNEL && PROC_FS depends on DEBUG_KERNEL && PROC_FS
default y default y
help help
If you say Y here, the /proc/sched_debug file will be provided If you say Y here, the /sys/kernel/debug/sched file will be provided
that can help debug the scheduler. The runtime overhead of this that can help debug the scheduler. The runtime overhead of this
option is minimal. option is minimal.
......
...@@ -667,12 +667,13 @@ static inline unsigned long mte_pivot(const struct maple_enode *mn, ...@@ -667,12 +667,13 @@ static inline unsigned long mte_pivot(const struct maple_enode *mn,
unsigned char piv) unsigned char piv)
{ {
struct maple_node *node = mte_to_node(mn); struct maple_node *node = mte_to_node(mn);
enum maple_type type = mte_node_type(mn);
if (piv >= mt_pivots[piv]) { if (piv >= mt_pivots[type]) {
WARN_ON(1); WARN_ON(1);
return 0; return 0;
} }
switch (mte_node_type(mn)) { switch (type) {
case maple_arange_64: case maple_arange_64:
return node->ma64.pivot[piv]; return node->ma64.pivot[piv];
case maple_range_64: case maple_range_64:
...@@ -4876,7 +4877,7 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size) ...@@ -4876,7 +4877,7 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size)
unsigned long *pivots, *gaps; unsigned long *pivots, *gaps;
void __rcu **slots; void __rcu **slots;
unsigned long gap = 0; unsigned long gap = 0;
unsigned long max, min, index; unsigned long max, min;
unsigned char offset; unsigned char offset;
if (unlikely(mas_is_err(mas))) if (unlikely(mas_is_err(mas)))
...@@ -4898,8 +4899,7 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size) ...@@ -4898,8 +4899,7 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size)
min = mas_safe_min(mas, pivots, --offset); min = mas_safe_min(mas, pivots, --offset);
max = mas_safe_pivot(mas, pivots, offset, type); max = mas_safe_pivot(mas, pivots, offset, type);
index = mas->index; while (mas->index <= max) {
while (index <= max) {
gap = 0; gap = 0;
if (gaps) if (gaps)
gap = gaps[offset]; gap = gaps[offset];
...@@ -4930,10 +4930,8 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size) ...@@ -4930,10 +4930,8 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size)
min = mas_safe_min(mas, pivots, offset); min = mas_safe_min(mas, pivots, offset);
} }
if (unlikely(index > max)) { if (unlikely((mas->index > max) || (size - 1 > max - mas->index)))
mas_set_err(mas, -EBUSY); goto no_space;
return false;
}
if (unlikely(ma_is_leaf(type))) { if (unlikely(ma_is_leaf(type))) {
mas->offset = offset; mas->offset = offset;
...@@ -4950,9 +4948,11 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size) ...@@ -4950,9 +4948,11 @@ static bool mas_rev_awalk(struct ma_state *mas, unsigned long size)
return false; return false;
ascend: ascend:
if (mte_is_root(mas->node)) if (!mte_is_root(mas->node))
mas_set_err(mas, -EBUSY); return false;
no_space:
mas_set_err(mas, -EBUSY);
return false; return false;
} }
......
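The rewritten check in mas_rev_awalk() above now also fails the walk when a gap of the requested size cannot fit between mas->index and max; writing the test as size - 1 > max - mas->index keeps the arithmetic from wrapping, since mas->index <= max is already known while mas->index + size could overflow near ULONG_MAX. A standalone comparison with made-up values:

    #include <limits.h>
    #include <stdbool.h>
    #include <stdio.h>

    /* Does [index, max] hold at least 'size' consecutive slots? (size >= 1 assumed) */
    static bool fits(unsigned long index, unsigned long max, unsigned long size)
    {
        return !(index > max || size - 1 > max - index);  /* form used in the patch */
    }

    static bool fits_naive(unsigned long index, unsigned long max, unsigned long size)
    {
        return index + size - 1 <= max;                   /* can wrap near ULONG_MAX */
    }

    int main(void)
    {
        unsigned long index = ULONG_MAX - 5, max = ULONG_MAX, size = 10;

        printf("patched form: %d\n", fits(index, max, size));        /* 0: only 6 slots left */
        printf("naive form:   %d\n", fits_naive(index, max, size));  /* 1: sum wrapped around */
        return 0;
    }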
...@@ -2517,6 +2517,91 @@ static noinline void check_bnode_min_spanning(struct maple_tree *mt) ...@@ -2517,6 +2517,91 @@ static noinline void check_bnode_min_spanning(struct maple_tree *mt)
mt_set_non_kernel(0); mt_set_non_kernel(0);
} }
static noinline void check_empty_area_window(struct maple_tree *mt)
{
unsigned long i, nr_entries = 20;
MA_STATE(mas, mt, 0, 0);
for (i = 1; i <= nr_entries; i++)
mtree_store_range(mt, i*10, i*10 + 9,
xa_mk_value(i), GFP_KERNEL);
/* Create another hole besides the one at 0 */
mtree_store_range(mt, 160, 169, NULL, GFP_KERNEL);
/* Check lower bounds that don't fit */
rcu_read_lock();
MT_BUG_ON(mt, mas_empty_area_rev(&mas, 5, 90, 10) != -EBUSY);
mas_reset(&mas);
MT_BUG_ON(mt, mas_empty_area_rev(&mas, 6, 90, 5) != -EBUSY);
/* Check lower bound that does fit */
mas_reset(&mas);
MT_BUG_ON(mt, mas_empty_area_rev(&mas, 5, 90, 5) != 0);
MT_BUG_ON(mt, mas.index != 5);
MT_BUG_ON(mt, mas.last != 9);
rcu_read_unlock();
/* Check one gap that doesn't fit and one that does */
rcu_read_lock();
mas_reset(&mas);
MT_BUG_ON(mt, mas_empty_area_rev(&mas, 5, 217, 9) != 0);
MT_BUG_ON(mt, mas.index != 161);
MT_BUG_ON(mt, mas.last != 169);
/* Check one gap that does fit above the min */
mas_reset(&mas);
MT_BUG_ON(mt, mas_empty_area_rev(&mas, 100, 218, 3) != 0);
MT_BUG_ON(mt, mas.index != 216);
MT_BUG_ON(mt, mas.last != 218);
/* Check size that doesn't fit any gap */
mas_reset(&mas);
MT_BUG_ON(mt, mas_empty_area_rev(&mas, 100, 218, 16) != -EBUSY);
/*
* Check size that doesn't fit the lower end of the window but
* does fit the gap
*/
mas_reset(&mas);
MT_BUG_ON(mt, mas_empty_area_rev(&mas, 167, 200, 4) != -EBUSY);
/*
* Check size that doesn't fit the upper end of the window but
* does fit the gap
*/
mas_reset(&mas);
MT_BUG_ON(mt, mas_empty_area_rev(&mas, 100, 162, 4) != -EBUSY);
/* Check mas_empty_area forward */
mas_reset(&mas);
MT_BUG_ON(mt, mas_empty_area(&mas, 0, 100, 9) != 0);
MT_BUG_ON(mt, mas.index != 0);
MT_BUG_ON(mt, mas.last != 8);
mas_reset(&mas);
MT_BUG_ON(mt, mas_empty_area(&mas, 0, 100, 4) != 0);
MT_BUG_ON(mt, mas.index != 0);
MT_BUG_ON(mt, mas.last != 3);
mas_reset(&mas);
MT_BUG_ON(mt, mas_empty_area(&mas, 0, 100, 11) != -EBUSY);
mas_reset(&mas);
MT_BUG_ON(mt, mas_empty_area(&mas, 5, 100, 6) != -EBUSY);
mas_reset(&mas);
MT_BUG_ON(mt, mas_empty_area(&mas, 0, 8, 10) != -EBUSY);
mas_reset(&mas);
mas_empty_area(&mas, 100, 165, 3);
mas_reset(&mas);
MT_BUG_ON(mt, mas_empty_area(&mas, 100, 163, 6) != -EBUSY);
rcu_read_unlock();
}
static DEFINE_MTREE(tree); static DEFINE_MTREE(tree);
static int maple_tree_seed(void) static int maple_tree_seed(void)
{ {
...@@ -2765,6 +2850,10 @@ static int maple_tree_seed(void) ...@@ -2765,6 +2850,10 @@ static int maple_tree_seed(void)
check_bnode_min_spanning(&tree); check_bnode_min_spanning(&tree);
mtree_destroy(&tree); mtree_destroy(&tree);
mt_init_flags(&tree, MT_FLAGS_ALLOC_RANGE);
check_empty_area_window(&tree);
mtree_destroy(&tree);
#if defined(BENCH) #if defined(BENCH)
skip: skip:
#endif #endif
......
...@@ -847,6 +847,10 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address, ...@@ -847,6 +847,10 @@ static int hugepage_vma_revalidate(struct mm_struct *mm, unsigned long address,
return SCAN_SUCCEED; return SCAN_SUCCEED;
} }
/*
* See pmd_trans_unstable() for how the result may change out from
* underneath us, even if we hold mmap_lock in read.
*/
static int find_pmd_or_thp_or_none(struct mm_struct *mm, static int find_pmd_or_thp_or_none(struct mm_struct *mm,
unsigned long address, unsigned long address,
pmd_t **pmd) pmd_t **pmd)
...@@ -865,8 +869,12 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm, ...@@ -865,8 +869,12 @@ static int find_pmd_or_thp_or_none(struct mm_struct *mm,
#endif #endif
if (pmd_none(pmde)) if (pmd_none(pmde))
return SCAN_PMD_NONE; return SCAN_PMD_NONE;
if (!pmd_present(pmde))
return SCAN_PMD_NULL;
if (pmd_trans_huge(pmde)) if (pmd_trans_huge(pmde))
return SCAN_PMD_MAPPED; return SCAN_PMD_MAPPED;
if (pmd_devmap(pmde))
return SCAN_PMD_NULL;
if (pmd_bad(pmde)) if (pmd_bad(pmde))
return SCAN_PMD_NULL; return SCAN_PMD_NULL;
return SCAN_SUCCEED; return SCAN_SUCCEED;
...@@ -1642,7 +1650,7 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff, ...@@ -1642,7 +1650,7 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
* has higher cost too. It would also probably require locking * has higher cost too. It would also probably require locking
* the anon_vma. * the anon_vma.
*/ */
if (vma->anon_vma) { if (READ_ONCE(vma->anon_vma)) {
result = SCAN_PAGE_ANON; result = SCAN_PAGE_ANON;
goto next; goto next;
} }
...@@ -1670,6 +1678,18 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff, ...@@ -1670,6 +1678,18 @@ static int retract_page_tables(struct address_space *mapping, pgoff_t pgoff,
result = SCAN_PTE_MAPPED_HUGEPAGE; result = SCAN_PTE_MAPPED_HUGEPAGE;
if ((cc->is_khugepaged || is_target) && if ((cc->is_khugepaged || is_target) &&
mmap_write_trylock(mm)) { mmap_write_trylock(mm)) {
/*
* Re-check whether we have an ->anon_vma, because
* collapse_and_free_pmd() requires that either no
* ->anon_vma exists or the anon_vma is locked.
* We already checked ->anon_vma above, but that check
* is racy because ->anon_vma can be populated under the
* mmap lock in read mode.
*/
if (vma->anon_vma) {
result = SCAN_PAGE_ANON;
goto unlock_next;
}
/* /*
* When a vma is registered with uffd-wp, we can't * When a vma is registered with uffd-wp, we can't
* recycle the pmd pgtable because there can be pte * recycle the pmd pgtable because there can be pte
......
...@@ -2070,8 +2070,10 @@ static int __init kmemleak_boot_config(char *str) ...@@ -2070,8 +2070,10 @@ static int __init kmemleak_boot_config(char *str)
return -EINVAL; return -EINVAL;
if (strcmp(str, "off") == 0) if (strcmp(str, "off") == 0)
kmemleak_disable(); kmemleak_disable();
else if (strcmp(str, "on") == 0) else if (strcmp(str, "on") == 0) {
kmemleak_skip_disable = 1; kmemleak_skip_disable = 1;
stack_depot_want_early_init();
}
else else
return -EINVAL; return -EINVAL;
return 0; return 0;
...@@ -2093,7 +2095,6 @@ void __init kmemleak_init(void) ...@@ -2093,7 +2095,6 @@ void __init kmemleak_init(void)
if (kmemleak_error) if (kmemleak_error)
return; return;
stack_depot_init();
jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE); jiffies_min_age = msecs_to_jiffies(MSECS_MIN_AGE);
jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000); jiffies_scan_wait = msecs_to_jiffies(SECS_SCAN_WAIT * 1000);
......
...@@ -63,7 +63,6 @@ ...@@ -63,7 +63,6 @@
#include <linux/resume_user_mode.h> #include <linux/resume_user_mode.h>
#include <linux/psi.h> #include <linux/psi.h>
#include <linux/seq_buf.h> #include <linux/seq_buf.h>
#include <linux/parser.h>
#include "internal.h" #include "internal.h"
#include <net/sock.h> #include <net/sock.h>
#include <net/ip.h> #include <net/ip.h>
...@@ -2403,8 +2402,7 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg, ...@@ -2403,8 +2402,7 @@ static unsigned long reclaim_high(struct mem_cgroup *memcg,
psi_memstall_enter(&pflags); psi_memstall_enter(&pflags);
nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages, nr_reclaimed += try_to_free_mem_cgroup_pages(memcg, nr_pages,
gfp_mask, gfp_mask,
MEMCG_RECLAIM_MAY_SWAP, MEMCG_RECLAIM_MAY_SWAP);
NULL);
psi_memstall_leave(&pflags); psi_memstall_leave(&pflags);
} while ((memcg = parent_mem_cgroup(memcg)) && } while ((memcg = parent_mem_cgroup(memcg)) &&
!mem_cgroup_is_root(memcg)); !mem_cgroup_is_root(memcg));
...@@ -2695,8 +2693,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask, ...@@ -2695,8 +2693,7 @@ static int try_charge_memcg(struct mem_cgroup *memcg, gfp_t gfp_mask,
psi_memstall_enter(&pflags); psi_memstall_enter(&pflags);
nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages, nr_reclaimed = try_to_free_mem_cgroup_pages(mem_over_limit, nr_pages,
gfp_mask, reclaim_options, gfp_mask, reclaim_options);
NULL);
psi_memstall_leave(&pflags); psi_memstall_leave(&pflags);
if (mem_cgroup_margin(mem_over_limit) >= nr_pages) if (mem_cgroup_margin(mem_over_limit) >= nr_pages)
...@@ -3516,8 +3513,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg, ...@@ -3516,8 +3513,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg,
} }
if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP, memsw ? 0 : MEMCG_RECLAIM_MAY_SWAP)) {
NULL)) {
ret = -EBUSY; ret = -EBUSY;
break; break;
} }
...@@ -3631,8 +3627,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) ...@@ -3631,8 +3627,7 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg)
return -EINTR; return -EINTR;
if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL,
MEMCG_RECLAIM_MAY_SWAP, MEMCG_RECLAIM_MAY_SWAP))
NULL))
nr_retries--; nr_retries--;
} }
...@@ -6473,8 +6468,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, ...@@ -6473,8 +6468,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of,
} }
reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high, reclaimed = try_to_free_mem_cgroup_pages(memcg, nr_pages - high,
GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP);
NULL);
if (!reclaimed && !nr_retries--) if (!reclaimed && !nr_retries--)
break; break;
...@@ -6523,8 +6517,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, ...@@ -6523,8 +6517,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of,
if (nr_reclaims) { if (nr_reclaims) {
if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max, if (!try_to_free_mem_cgroup_pages(memcg, nr_pages - max,
GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP, GFP_KERNEL, MEMCG_RECLAIM_MAY_SWAP))
NULL))
nr_reclaims--; nr_reclaims--;
continue; continue;
} }
...@@ -6647,54 +6640,21 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of, ...@@ -6647,54 +6640,21 @@ static ssize_t memory_oom_group_write(struct kernfs_open_file *of,
return nbytes; return nbytes;
} }
enum {
MEMORY_RECLAIM_NODES = 0,
MEMORY_RECLAIM_NULL,
};
static const match_table_t if_tokens = {
{ MEMORY_RECLAIM_NODES, "nodes=%s" },
{ MEMORY_RECLAIM_NULL, NULL },
};
static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
size_t nbytes, loff_t off) size_t nbytes, loff_t off)
{ {
struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of));
unsigned int nr_retries = MAX_RECLAIM_RETRIES; unsigned int nr_retries = MAX_RECLAIM_RETRIES;
unsigned long nr_to_reclaim, nr_reclaimed = 0; unsigned long nr_to_reclaim, nr_reclaimed = 0;
unsigned int reclaim_options = MEMCG_RECLAIM_MAY_SWAP | unsigned int reclaim_options;
MEMCG_RECLAIM_PROACTIVE; int err;
char *old_buf, *start;
substring_t args[MAX_OPT_ARGS];
int token;
char value[256];
nodemask_t nodemask = NODE_MASK_ALL;
buf = strstrip(buf);
old_buf = buf;
nr_to_reclaim = memparse(buf, &buf) / PAGE_SIZE;
if (buf == old_buf)
return -EINVAL;
buf = strstrip(buf); buf = strstrip(buf);
err = page_counter_memparse(buf, "", &nr_to_reclaim);
if (err)
return err;
while ((start = strsep(&buf, " ")) != NULL) { reclaim_options = MEMCG_RECLAIM_MAY_SWAP | MEMCG_RECLAIM_PROACTIVE;
if (!strlen(start))
continue;
token = match_token(start, if_tokens, args);
match_strlcpy(value, args, sizeof(value));
switch (token) {
case MEMORY_RECLAIM_NODES:
if (nodelist_parse(value, nodemask) < 0)
return -EINVAL;
break;
default:
return -EINVAL;
}
}
while (nr_reclaimed < nr_to_reclaim) { while (nr_reclaimed < nr_to_reclaim) {
unsigned long reclaimed; unsigned long reclaimed;
...@@ -6711,8 +6671,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf, ...@@ -6711,8 +6671,7 @@ static ssize_t memory_reclaim(struct kernfs_open_file *of, char *buf,
reclaimed = try_to_free_mem_cgroup_pages(memcg, reclaimed = try_to_free_mem_cgroup_pages(memcg,
nr_to_reclaim - nr_reclaimed, nr_to_reclaim - nr_reclaimed,
GFP_KERNEL, reclaim_options, GFP_KERNEL, reclaim_options);
&nodemask);
if (!reclaimed && !nr_retries--) if (!reclaimed && !nr_retries--)
return -EAGAIN; return -EAGAIN;
......
...@@ -600,7 +600,8 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask, ...@@ -600,7 +600,8 @@ static int queue_pages_hugetlb(pte_t *pte, unsigned long hmask,
/* With MPOL_MF_MOVE, we migrate only unshared hugepage. */ /* With MPOL_MF_MOVE, we migrate only unshared hugepage. */
if (flags & (MPOL_MF_MOVE_ALL) || if (flags & (MPOL_MF_MOVE_ALL) ||
(flags & MPOL_MF_MOVE && page_mapcount(page) == 1)) { (flags & MPOL_MF_MOVE && page_mapcount(page) == 1 &&
!hugetlb_pmd_shared(pte))) {
if (isolate_hugetlb(page, qp->pagelist) && if (isolate_hugetlb(page, qp->pagelist) &&
(flags & MPOL_MF_STRICT)) (flags & MPOL_MF_STRICT))
/* /*
......
...@@ -1027,16 +1027,29 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len, ...@@ -1027,16 +1027,29 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
} }
/* /*
* Function vma_merge() is called on the extension we are adding to * Function vma_merge() is called on the extension we
* the already existing vma, vma_merge() will merge this extension with * are adding to the already existing vma, vma_merge()
* the already existing vma (expand operation itself) and possibly also * will merge this extension with the already existing
* with the next vma if it becomes adjacent to the expanded vma and * vma (expand operation itself) and possibly also with
* otherwise compatible. * the next vma if it becomes adjacent to the expanded
* vma and otherwise compatible.
*
* However, vma_merge() can currently fail due to
* is_mergeable_vma() check for vm_ops->close (see the
* comment there). Yet this should not prevent vma
* expanding, so perform a simple expand for such vma.
* Ideally the check for close op should be only done
* when a vma would be actually removed due to a merge.
*/ */
vma = vma_merge(mm, vma, extension_start, extension_end, if (!vma->vm_ops || !vma->vm_ops->close) {
vma = vma_merge(mm, vma, extension_start, extension_end,
vma->vm_flags, vma->anon_vma, vma->vm_file, vma->vm_flags, vma->anon_vma, vma->vm_file,
extension_pgoff, vma_policy(vma), extension_pgoff, vma_policy(vma),
vma->vm_userfaultfd_ctx, anon_vma_name(vma)); vma->vm_userfaultfd_ctx, anon_vma_name(vma));
} else if (vma_adjust(vma, vma->vm_start, addr + new_len,
vma->vm_pgoff, NULL)) {
vma = NULL;
}
if (!vma) { if (!vma) {
vm_unacct_memory(pages); vm_unacct_memory(pages);
ret = -ENOMEM; ret = -ENOMEM;
......
...@@ -1100,6 +1100,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size) ...@@ -1100,6 +1100,7 @@ int get_swap_pages(int n_goal, swp_entry_t swp_entries[], int entry_size)
goto check_out; goto check_out;
pr_debug("scan_swap_map of si %d failed to find offset\n", pr_debug("scan_swap_map of si %d failed to find offset\n",
si->type); si->type);
cond_resched();
spin_lock(&swap_avail_lock); spin_lock(&swap_avail_lock);
nextsi: nextsi:
......
...@@ -3335,13 +3335,16 @@ void lru_gen_migrate_mm(struct mm_struct *mm) ...@@ -3335,13 +3335,16 @@ void lru_gen_migrate_mm(struct mm_struct *mm)
if (mem_cgroup_disabled()) if (mem_cgroup_disabled())
return; return;
/* migration can happen before addition */
if (!mm->lru_gen.memcg)
return;
rcu_read_lock(); rcu_read_lock();
memcg = mem_cgroup_from_task(task); memcg = mem_cgroup_from_task(task);
rcu_read_unlock(); rcu_read_unlock();
if (memcg == mm->lru_gen.memcg) if (memcg == mm->lru_gen.memcg)
return; return;
VM_WARN_ON_ONCE(!mm->lru_gen.memcg);
VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list)); VM_WARN_ON_ONCE(list_empty(&mm->lru_gen.list));
lru_gen_del_mm(mm); lru_gen_del_mm(mm);
...@@ -7022,8 +7025,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg, ...@@ -7022,8 +7025,7 @@ unsigned long mem_cgroup_shrink_node(struct mem_cgroup *memcg,
unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
unsigned long nr_pages, unsigned long nr_pages,
gfp_t gfp_mask, gfp_t gfp_mask,
unsigned int reclaim_options, unsigned int reclaim_options)
nodemask_t *nodemask)
{ {
unsigned long nr_reclaimed; unsigned long nr_reclaimed;
unsigned int noreclaim_flag; unsigned int noreclaim_flag;
...@@ -7038,7 +7040,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg, ...@@ -7038,7 +7040,6 @@ unsigned long try_to_free_mem_cgroup_pages(struct mem_cgroup *memcg,
.may_unmap = 1, .may_unmap = 1,
.may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP), .may_swap = !!(reclaim_options & MEMCG_RECLAIM_MAY_SWAP),
.proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE), .proactive = !!(reclaim_options & MEMCG_RECLAIM_PROACTIVE),
.nodemask = nodemask,
}; };
/* /*
* Traverse the ZONELIST_FALLBACK zonelist of the current node to put * Traverse the ZONELIST_FALLBACK zonelist of the current node to put
......
...@@ -113,7 +113,23 @@ ...@@ -113,7 +113,23 @@
* have room for two bit at least. * have room for two bit at least.
*/ */
#define OBJ_ALLOCATED_TAG 1 #define OBJ_ALLOCATED_TAG 1
#define OBJ_TAG_BITS 1
#ifdef CONFIG_ZPOOL
/*
* The second least-significant bit in the object's header identifies if the
* value stored at the header is a deferred handle from the last reclaim
* attempt.
*
* As noted above, this is valid because we have room for two bits.
*/
#define OBJ_DEFERRED_HANDLE_TAG 2
#define OBJ_TAG_BITS 2
#define OBJ_TAG_MASK (OBJ_ALLOCATED_TAG | OBJ_DEFERRED_HANDLE_TAG)
#else
#define OBJ_TAG_BITS 1
#define OBJ_TAG_MASK OBJ_ALLOCATED_TAG
#endif /* CONFIG_ZPOOL */
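A small user-space sketch of the tagging scheme defined above: the two low bits of the object's header word carry the ALLOCATED and DEFERRED_HANDLE tags, and the stored handle is recovered by masking both off, mirroring what obj_tagged() does later in this patch (the handle value is made up):

    #include <stdio.h>

    #define OBJ_ALLOCATED_TAG        1UL
    #define OBJ_DEFERRED_HANDLE_TAG  2UL
    #define OBJ_TAG_MASK (OBJ_ALLOCATED_TAG | OBJ_DEFERRED_HANDLE_TAG)

    int main(void)
    {
        unsigned long handle = 0x1000;                         /* made-up handle, low bits clear */
        unsigned long hdr = handle | OBJ_DEFERRED_HANDLE_TAG;  /* header stores a deferred handle */

        if (hdr & OBJ_DEFERRED_HANDLE_TAG)                     /* the tag obj_tagged() tests for */
            printf("deferred handle: %#lx\n", hdr & ~OBJ_TAG_MASK);  /* strip all tag bits */
        return 0;
    }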
#define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS) #define OBJ_INDEX_BITS (BITS_PER_LONG - _PFN_BITS - OBJ_TAG_BITS)
#define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1) #define OBJ_INDEX_MASK ((_AC(1, UL) << OBJ_INDEX_BITS) - 1)
...@@ -222,6 +238,12 @@ struct link_free { ...@@ -222,6 +238,12 @@ struct link_free {
* Handle of allocated object. * Handle of allocated object.
*/ */
unsigned long handle; unsigned long handle;
#ifdef CONFIG_ZPOOL
/*
* Deferred handle of a reclaimed object.
*/
unsigned long deferred_handle;
#endif
}; };
}; };
...@@ -272,8 +294,6 @@ struct zspage { ...@@ -272,8 +294,6 @@ struct zspage {
/* links the zspage to the lru list in the pool */ /* links the zspage to the lru list in the pool */
struct list_head lru; struct list_head lru;
bool under_reclaim; bool under_reclaim;
/* list of unfreed handles whose objects have been reclaimed */
unsigned long *deferred_handles;
#endif #endif
struct zs_pool *pool; struct zs_pool *pool;
...@@ -897,7 +917,8 @@ static unsigned long handle_to_obj(unsigned long handle) ...@@ -897,7 +917,8 @@ static unsigned long handle_to_obj(unsigned long handle)
return *(unsigned long *)handle; return *(unsigned long *)handle;
} }
static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle) static bool obj_tagged(struct page *page, void *obj, unsigned long *phandle,
int tag)
{ {
unsigned long handle; unsigned long handle;
struct zspage *zspage = get_zspage(page); struct zspage *zspage = get_zspage(page);
...@@ -908,13 +929,27 @@ static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle) ...@@ -908,13 +929,27 @@ static bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
} else } else
handle = *(unsigned long *)obj; handle = *(unsigned long *)obj;
if (!(handle & OBJ_ALLOCATED_TAG)) if (!(handle & tag))
return false; return false;
*phandle = handle & ~OBJ_ALLOCATED_TAG; /* Clear all tags before returning the handle */
*phandle = handle & ~OBJ_TAG_MASK;
return true; return true;
} }
static inline bool obj_allocated(struct page *page, void *obj, unsigned long *phandle)
{
return obj_tagged(page, obj, phandle, OBJ_ALLOCATED_TAG);
}
#ifdef CONFIG_ZPOOL
static bool obj_stores_deferred_handle(struct page *page, void *obj,
unsigned long *phandle)
{
return obj_tagged(page, obj, phandle, OBJ_DEFERRED_HANDLE_TAG);
}
#endif
static void reset_page(struct page *page) static void reset_page(struct page *page)
{ {
__ClearPageMovable(page); __ClearPageMovable(page);
...@@ -946,22 +981,36 @@ static int trylock_zspage(struct zspage *zspage) ...@@ -946,22 +981,36 @@ static int trylock_zspage(struct zspage *zspage)
} }
#ifdef CONFIG_ZPOOL #ifdef CONFIG_ZPOOL
static unsigned long find_deferred_handle_obj(struct size_class *class,
struct page *page, int *obj_idx);
/* /*
* Free all the deferred handles whose objects are freed in zs_free. * Free all the deferred handles whose objects are freed in zs_free.
*/ */
static void free_handles(struct zs_pool *pool, struct zspage *zspage) static void free_handles(struct zs_pool *pool, struct size_class *class,
struct zspage *zspage)
{ {
unsigned long handle = (unsigned long)zspage->deferred_handles; int obj_idx = 0;
struct page *page = get_first_page(zspage);
unsigned long handle;
while (handle) { while (1) {
unsigned long nxt_handle = handle_to_obj(handle); handle = find_deferred_handle_obj(class, page, &obj_idx);
if (!handle) {
page = get_next_page(page);
if (!page)
break;
obj_idx = 0;
continue;
}
cache_free_handle(pool, handle); cache_free_handle(pool, handle);
handle = nxt_handle; obj_idx++;
} }
} }
#else #else
static inline void free_handles(struct zs_pool *pool, struct zspage *zspage) {} static inline void free_handles(struct zs_pool *pool, struct size_class *class,
struct zspage *zspage) {}
#endif #endif
static void __free_zspage(struct zs_pool *pool, struct size_class *class, static void __free_zspage(struct zs_pool *pool, struct size_class *class,
...@@ -979,7 +1028,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class, ...@@ -979,7 +1028,7 @@ static void __free_zspage(struct zs_pool *pool, struct size_class *class,
VM_BUG_ON(fg != ZS_EMPTY); VM_BUG_ON(fg != ZS_EMPTY);
/* Free all deferred handles from zs_free */ /* Free all deferred handles from zs_free */
free_handles(pool, zspage); free_handles(pool, class, zspage);
next = page = get_first_page(zspage); next = page = get_first_page(zspage);
do { do {
...@@ -1067,7 +1116,6 @@ static void init_zspage(struct size_class *class, struct zspage *zspage) ...@@ -1067,7 +1116,6 @@ static void init_zspage(struct size_class *class, struct zspage *zspage)
#ifdef CONFIG_ZPOOL #ifdef CONFIG_ZPOOL
INIT_LIST_HEAD(&zspage->lru); INIT_LIST_HEAD(&zspage->lru);
zspage->under_reclaim = false; zspage->under_reclaim = false;
zspage->deferred_handles = NULL;
#endif #endif
set_freeobj(zspage, 0); set_freeobj(zspage, 0);
...@@ -1568,7 +1616,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp) ...@@ -1568,7 +1616,7 @@ unsigned long zs_malloc(struct zs_pool *pool, size_t size, gfp_t gfp)
} }
EXPORT_SYMBOL_GPL(zs_malloc); EXPORT_SYMBOL_GPL(zs_malloc);
static void obj_free(int class_size, unsigned long obj) static void obj_free(int class_size, unsigned long obj, unsigned long *handle)
{ {
struct link_free *link; struct link_free *link;
struct zspage *zspage; struct zspage *zspage;
...@@ -1582,15 +1630,29 @@ static void obj_free(int class_size, unsigned long obj) ...@@ -1582,15 +1630,29 @@ static void obj_free(int class_size, unsigned long obj)
zspage = get_zspage(f_page); zspage = get_zspage(f_page);
vaddr = kmap_atomic(f_page); vaddr = kmap_atomic(f_page);
/* Insert this object in containing zspage's freelist */
link = (struct link_free *)(vaddr + f_offset); link = (struct link_free *)(vaddr + f_offset);
if (likely(!ZsHugePage(zspage)))
link->next = get_freeobj(zspage) << OBJ_TAG_BITS; if (handle) {
else #ifdef CONFIG_ZPOOL
f_page->index = 0; /* Stores the (deferred) handle in the object's header */
*handle |= OBJ_DEFERRED_HANDLE_TAG;
*handle &= ~OBJ_ALLOCATED_TAG;
if (likely(!ZsHugePage(zspage)))
link->deferred_handle = *handle;
else
f_page->index = *handle;
#endif
} else {
/* Insert this object in containing zspage's freelist */
if (likely(!ZsHugePage(zspage)))
link->next = get_freeobj(zspage) << OBJ_TAG_BITS;
else
f_page->index = 0;
set_freeobj(zspage, f_objidx);
}
kunmap_atomic(vaddr); kunmap_atomic(vaddr);
set_freeobj(zspage, f_objidx);
mod_zspage_inuse(zspage, -1); mod_zspage_inuse(zspage, -1);
} }
...@@ -1615,7 +1677,6 @@ void zs_free(struct zs_pool *pool, unsigned long handle) ...@@ -1615,7 +1677,6 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
zspage = get_zspage(f_page); zspage = get_zspage(f_page);
class = zspage_class(pool, zspage); class = zspage_class(pool, zspage);
obj_free(class->size, obj);
class_stat_dec(class, OBJ_USED, 1); class_stat_dec(class, OBJ_USED, 1);
#ifdef CONFIG_ZPOOL #ifdef CONFIG_ZPOOL
...@@ -1624,15 +1685,15 @@ void zs_free(struct zs_pool *pool, unsigned long handle) ...@@ -1624,15 +1685,15 @@ void zs_free(struct zs_pool *pool, unsigned long handle)
* Reclaim needs the handles during writeback. It'll free * Reclaim needs the handles during writeback. It'll free
* them along with the zspage when it's done with them. * them along with the zspage when it's done with them.
* *
* Record current deferred handle at the memory location * Record current deferred handle in the object's header.
* whose address is given by handle.
*/ */
record_obj(handle, (unsigned long)zspage->deferred_handles); obj_free(class->size, obj, &handle);
zspage->deferred_handles = (unsigned long *)handle;
spin_unlock(&pool->lock); spin_unlock(&pool->lock);
return; return;
} }
#endif #endif
obj_free(class->size, obj, NULL);
fullness = fix_fullness_group(class, zspage); fullness = fix_fullness_group(class, zspage);
if (fullness == ZS_EMPTY) if (fullness == ZS_EMPTY)
free_zspage(pool, class, zspage); free_zspage(pool, class, zspage);
...@@ -1713,11 +1774,11 @@ static void zs_object_copy(struct size_class *class, unsigned long dst, ...@@ -1713,11 +1774,11 @@ static void zs_object_copy(struct size_class *class, unsigned long dst,
} }
/* /*
* Find alloced object in zspage from index object and * Find object with a certain tag in zspage from index object and
* return handle. * return handle.
*/ */
static unsigned long find_alloced_obj(struct size_class *class, static unsigned long find_tagged_obj(struct size_class *class,
struct page *page, int *obj_idx) struct page *page, int *obj_idx, int tag)
{ {
unsigned int offset; unsigned int offset;
int index = *obj_idx; int index = *obj_idx;
...@@ -1728,7 +1789,7 @@ static unsigned long find_alloced_obj(struct size_class *class, ...@@ -1728,7 +1789,7 @@ static unsigned long find_alloced_obj(struct size_class *class,
offset += class->size * index; offset += class->size * index;
while (offset < PAGE_SIZE) { while (offset < PAGE_SIZE) {
if (obj_allocated(page, addr + offset, &handle)) if (obj_tagged(page, addr + offset, &handle, tag))
break; break;
offset += class->size; offset += class->size;
...@@ -1742,6 +1803,28 @@ static unsigned long find_alloced_obj(struct size_class *class, ...@@ -1742,6 +1803,28 @@ static unsigned long find_alloced_obj(struct size_class *class,
return handle; return handle;
} }
/*
* Find alloced object in zspage from index object and
* return handle.
*/
static unsigned long find_alloced_obj(struct size_class *class,
struct page *page, int *obj_idx)
{
return find_tagged_obj(class, page, obj_idx, OBJ_ALLOCATED_TAG);
}
#ifdef CONFIG_ZPOOL
/*
* Find object storing a deferred handle in header in zspage from index object
* and return handle.
*/
static unsigned long find_deferred_handle_obj(struct size_class *class,
struct page *page, int *obj_idx)
{
return find_tagged_obj(class, page, obj_idx, OBJ_DEFERRED_HANDLE_TAG);
}
#endif
struct zs_compact_control { struct zs_compact_control {
/* Source spage for migration which could be a subpage of zspage */ /* Source spage for migration which could be a subpage of zspage */
struct page *s_page; struct page *s_page;
...@@ -1784,7 +1867,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class, ...@@ -1784,7 +1867,7 @@ static int migrate_zspage(struct zs_pool *pool, struct size_class *class,
zs_object_copy(class, free_obj, used_obj); zs_object_copy(class, free_obj, used_obj);
obj_idx++; obj_idx++;
record_obj(handle, free_obj); record_obj(handle, free_obj);
obj_free(class->size, used_obj); obj_free(class->size, used_obj, NULL);
} }
/* Remember last position in this iteration */ /* Remember last position in this iteration */
...@@ -2475,6 +2558,90 @@ void zs_destroy_pool(struct zs_pool *pool) ...@@ -2475,6 +2558,90 @@ void zs_destroy_pool(struct zs_pool *pool)
EXPORT_SYMBOL_GPL(zs_destroy_pool); EXPORT_SYMBOL_GPL(zs_destroy_pool);
#ifdef CONFIG_ZPOOL #ifdef CONFIG_ZPOOL
static void restore_freelist(struct zs_pool *pool, struct size_class *class,
struct zspage *zspage)
{
unsigned int obj_idx = 0;
unsigned long handle, off = 0; /* off is within-page offset */
struct page *page = get_first_page(zspage);
struct link_free *prev_free = NULL;
void *prev_page_vaddr = NULL;
/* in case no free object found */
set_freeobj(zspage, (unsigned int)(-1UL));
while (page) {
void *vaddr = kmap_atomic(page);
struct page *next_page;
while (off < PAGE_SIZE) {
void *obj_addr = vaddr + off;
/* skip allocated object */
if (obj_allocated(page, obj_addr, &handle)) {
obj_idx++;
off += class->size;
continue;
}
/* free deferred handle from reclaim attempt */
if (obj_stores_deferred_handle(page, obj_addr, &handle))
cache_free_handle(pool, handle);
if (prev_free)
prev_free->next = obj_idx << OBJ_TAG_BITS;
else /* first free object found */
set_freeobj(zspage, obj_idx);
prev_free = (struct link_free *)vaddr + off / sizeof(*prev_free);
/* if last free object in a previous page, need to unmap */
if (prev_page_vaddr) {
kunmap_atomic(prev_page_vaddr);
prev_page_vaddr = NULL;
}
obj_idx++;
off += class->size;
}
/*
* Handle the last (full or partial) object on this page.
*/
next_page = get_next_page(page);
if (next_page) {
if (!prev_free || prev_page_vaddr) {
/*
* There is no free object in this page, so we can safely
* unmap it.
*/
kunmap_atomic(vaddr);
} else {
/* update prev_page_vaddr since prev_free is on this page */
prev_page_vaddr = vaddr;
}
} else { /* this is the last page */
if (prev_free) {
/*
* Reset OBJ_TAG_BITS bit to last link to tell
* whether it's allocated object or not.
*/
prev_free->next = -1UL << OBJ_TAG_BITS;
}
/* unmap previous page (if not done yet) */
if (prev_page_vaddr) {
kunmap_atomic(prev_page_vaddr);
prev_page_vaddr = NULL;
}
kunmap_atomic(vaddr);
}
page = next_page;
off %= PAGE_SIZE;
}
}
static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries) static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries)
{ {
int i, obj_idx, ret = 0; int i, obj_idx, ret = 0;
...@@ -2558,6 +2725,12 @@ static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries) ...@@ -2558,6 +2725,12 @@ static int zs_reclaim_page(struct zs_pool *pool, unsigned int retries)
return 0; return 0;
} }
/*
* Eviction fails on one of the handles, so we need to restore zspage.
* We need to rebuild its freelist (and free stored deferred handles),
* put it back to the correct size class, and add it to the LRU list.
*/
restore_freelist(pool, class, zspage);
putback_zspage(class, zspage); putback_zspage(class, zspage);
list_add(&zspage->lru, &pool->lru); list_add(&zspage->lru, &pool->lru);
unlock_zspage(zspage); unlock_zspage(zspage);
......
File mode changed from 100644 to 100755
...@@ -17,7 +17,6 @@ ...@@ -17,7 +17,6 @@
#include <stdio.h> #include <stdio.h>
#include <unistd.h> #include <unistd.h>
#include <sys/mman.h> #include <sys/mman.h>
#define __USE_GNU
#include <fcntl.h> #include <fcntl.h>
#define MIN_FREE_PAGES 20 #define MIN_FREE_PAGES 20
......