Merge branch 'akpm' (patches from Andrew)

Merge fixes from Andrew Morton: "10 fixes" * emailed patches from Andrew Morton <akpm@linux-foundation.org>: mm: slab: free kmem_cache_node after destroy sysfs file ipc/shm: handle removed segments gracefully in shm_mmap() MAINTAINERS: update Kselftest Framework mailing list devm_memremap_release(): fix memremap'd addr handling mm/hugetlb.c: fix incorrect proc nr_hugepages value mm, x86: fix pte_page() crash in gup_pte_range() fsnotify: turn fsnotify reaper thread into a workqueue job Revert "fsnotify: destroy marks with call_srcu instead of dedicated thread" mm: fix regression in remap_file_pages() emulation thp, dax: do not try to withdraw pgtable from non-anon VMA

Merge branch 'akpm' (patches from Andrew)
Merge fixes from Andrew Morton: "10 fixes" * emailed patches from Andrew Morton <akpm@linux-foundation.org>: mm: slab: free kmem_cache_node after destroy sysfs file ipc/shm: handle removed segments gracefully in shm_mmap() MAINTAINERS: update Kselftest Framework mailing list devm_memremap_release(): fix memremap'd addr handling mm/hugetlb.c: fix incorrect proc nr_hugepages value mm, x86: fix pte_page() crash in gup_pte_range() fsnotify: turn fsnotify reaper thread into a workqueue job Revert "fsnotify: destroy marks with call_srcu instead of dedicated thread" mm: fix regression in remap_file_pages() emulation thp, dax: do not try to withdraw pgtable from non-anon VMA
87d9ac71 · Linus Torvalds · 23300f65 · 52b4b950 · 87d9ac71 · 87d9ac71
Commit 87d9ac71 authored Feb 19, 2016 by Linus Torvalds
14 changed files
--- a/MAINTAINERS
+++ b/MAINTAINERS
@@ -6128,7 +6128,7 @@ F:	include/uapi/linux/sunrpc/

 KERNEL SELFTEST FRAMEWORK
 M:	Shuah Khan <shuahkh@osg.samsung.com>
-L:	linux-api@vger.kernel.org
+L:	linux-kselftest@vger.kernel.org
 T:	git git://git.kernel.org/pub/scm/shuah/linux-kselftest
 S:	Maintained
 F:	tools/testing/selftests

--- a/arch/x86/mm/gup.c
+++ b/arch/x86/mm/gup.c
@@ -102,7 +102,6 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
 			return 0;
 		}

-		page = pte_page(pte);
 		if (pte_devmap(pte)) {
 			pgmap = get_dev_pagemap(pte_pfn(pte), pgmap);
 			if (unlikely(!pgmap)) {
@@ -115,6 +114,7 @@ static noinline int gup_pte_range(pmd_t pmd, unsigned long addr,
 			return 0;
 		}
 		VM_BUG_ON(!pfn_valid(pte_pfn(pte)));
+		page = pte_page(pte);
 		get_page(page);
 		put_dev_pagemap(pgmap);
 		SetPageReferenced(page);

--- a/fs/notify/mark.c
+++ b/fs/notify/mark.c
@@ -91,7 +91,14 @@
 #include <linux/fsnotify_backend.h>
 #include "fsnotify.h"

+#define FSNOTIFY_REAPER_DELAY	(1)	/* 1 jiffy */
+
 struct srcu_struct fsnotify_mark_srcu;
+static DEFINE_SPINLOCK(destroy_lock);
+static LIST_HEAD(destroy_list);
+
+static void fsnotify_mark_destroy(struct work_struct *work);
+static DECLARE_DELAYED_WORK(reaper_work, fsnotify_mark_destroy);

 void fsnotify_get_mark(struct fsnotify_mark *mark)
 {
@@ -165,19 +172,10 @@ void fsnotify_detach_mark(struct fsnotify_mark *mark)
 	atomic_dec(&group->num_marks);
 }

-static void
-fsnotify_mark_free_rcu(struct rcu_head *rcu)
-{
-	struct fsnotify_mark	*mark;
-
-	mark = container_of(rcu, struct fsnotify_mark, g_rcu);
-	fsnotify_put_mark(mark);
-}
-
 /*
- * Free fsnotify mark. The freeing is actually happening from a call_srcu
- * callback. Caller must have a reference to the mark or be protected by
- * fsnotify_mark_srcu.
+ * Free fsnotify mark. The freeing is actually happening from a kthread which
+ * first waits for srcu period end. Caller must have a reference to the mark
+ * or be protected by fsnotify_mark_srcu.
 */
 void fsnotify_free_mark(struct fsnotify_mark *mark)
 {
@@ -192,7 +190,11 @@ void fsnotify_free_mark(struct fsnotify_mark *mark)
 	mark->flags &= ~FSNOTIFY_MARK_FLAG_ALIVE;
 	spin_unlock(&mark->lock);

-	call_srcu(&fsnotify_mark_srcu, &mark->g_rcu, fsnotify_mark_free_rcu);
+	spin_lock(&destroy_lock);
+	list_add(&mark->g_list, &destroy_list);
+	spin_unlock(&destroy_lock);
+	queue_delayed_work(system_unbound_wq, &reaper_work,
+				FSNOTIFY_REAPER_DELAY);

 	/*
 	 * Some groups like to know that marks are being freed.  This is a
@@ -388,7 +390,12 @@ int fsnotify_add_mark_locked(struct fsnotify_mark *mark,

 	spin_unlock(&mark->lock);

-	call_srcu(&fsnotify_mark_srcu, &mark->g_rcu, fsnotify_mark_free_rcu);
+	spin_lock(&destroy_lock);
+	list_add(&mark->g_list, &destroy_list);
+	spin_unlock(&destroy_lock);
+	queue_delayed_work(system_unbound_wq, &reaper_work,
+				FSNOTIFY_REAPER_DELAY);
+
 	return ret;
 }

@@ -491,3 +498,21 @@ void fsnotify_init_mark(struct fsnotify_mark *mark,
 	atomic_set(&mark->refcnt, 1);
 	mark->free_mark = free_mark;
 }
+
+static void fsnotify_mark_destroy(struct work_struct *work)
+{
+	struct fsnotify_mark *mark, *next;
+	struct list_head private_destroy_list;
+
+	spin_lock(&destroy_lock);
+	/* exchange the list head */
+	list_replace_init(&destroy_list, &private_destroy_list);
+	spin_unlock(&destroy_lock);
+
+	synchronize_srcu(&fsnotify_mark_srcu);
+
+	list_for_each_entry_safe(mark, next, &private_destroy_list, g_list) {
+		list_del_init(&mark->g_list);
+		fsnotify_put_mark(mark);
+	}
+}
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -220,10 +220,7 @@ struct fsnotify_mark {
 	/* List of marks by group->i_fsnotify_marks. Also reused for queueing
 	 * mark into destroy_list when it's waiting for the end of SRCU period
 	 * before it can be freed. [group->mark_mutex] */
-	union {
 	struct list_head g_list;
-		struct rcu_head g_rcu;
-	};
 	/* Protects inode / mnt pointers, flags, masks */
 	spinlock_t lock;
 	/* List of marks for inode / vfsmount [obj_lock] */

--- a/ipc/shm.c
+++ b/ipc/shm.c
@@ -156,11 +156,12 @@ static inline struct shmid_kernel *shm_lock(struct ipc_namespace *ns, int id)
 	struct kern_ipc_perm *ipcp = ipc_lock(&shm_ids(ns), id);

 	/*
-	 * We raced in the idr lookup or with shm_destroy().  Either way, the
-	 * ID is busted.
+	 * Callers of shm_lock() must validate the status of the returned ipc
+	 * object pointer (as returned by ipc_lock()), and error out as
+	 * appropriate.
 	 */
-	WARN_ON(IS_ERR(ipcp));
-
+	if (IS_ERR(ipcp))
+		return (void *)ipcp;
 	return container_of(ipcp, struct shmid_kernel, shm_perm);
 }

@@ -186,18 +187,33 @@ static inline void shm_rmid(struct ipc_namespace *ns, struct shmid_kernel *s)
 }


-/* This is called by fork, once for every shm attach. */
-static void shm_open(struct vm_area_struct *vma)
+static int __shm_open(struct vm_area_struct *vma)
 {
 	struct file *file = vma->vm_file;
 	struct shm_file_data *sfd = shm_file_data(file);
 	struct shmid_kernel *shp;

 	shp = shm_lock(sfd->ns, sfd->id);
+
+	if (IS_ERR(shp))
+		return PTR_ERR(shp);
+
 	shp->shm_atim = get_seconds();
 	shp->shm_lprid = task_tgid_vnr(current);
 	shp->shm_nattch++;
 	shm_unlock(shp);
+	return 0;
+}
+
+/* This is called by fork, once for every shm attach. */
+static void shm_open(struct vm_area_struct *vma)
+{
+	int err = __shm_open(vma);
+	/*
+	 * We raced in the idr lookup or with shm_destroy().
+	 * Either way, the ID is busted.
+	 */
+	WARN_ON_ONCE(err);
 }

 /*
@@ -260,6 +276,14 @@ static void shm_close(struct vm_area_struct *vma)
 	down_write(&shm_ids(ns).rwsem);
 	/* remove from the list of attaches of the shm segment */
 	shp = shm_lock(ns, sfd->id);
+
+	/*
+	 * We raced in the idr lookup or with shm_destroy().
+	 * Either way, the ID is busted.
+	 */
+	if (WARN_ON_ONCE(IS_ERR(shp)))
+		goto done; /* no-op */
+
 	shp->shm_lprid = task_tgid_vnr(current);
 	shp->shm_dtim = get_seconds();
 	shp->shm_nattch--;
@@ -267,6 +291,7 @@ static void shm_close(struct vm_area_struct *vma)
 		shm_destroy(ns, shp);
 	else
 		shm_unlock(shp);
+done:
 	up_write(&shm_ids(ns).rwsem);
 }

@@ -388,17 +413,25 @@ static int shm_mmap(struct file *file, struct vm_area_struct *vma)
 	struct shm_file_data *sfd = shm_file_data(file);
 	int ret;

+	/*
+	 * In case of remap_file_pages() emulation, the file can represent
+	 * removed IPC ID: propogate shm_lock() error to caller.
+	 */
+	ret =__shm_open(vma);
+	if (ret)
+		return ret;
+
 	ret = sfd->file->f_op->mmap(sfd->file, vma);
-	if (ret != 0)
+	if (ret) {
+		shm_close(vma);
 		return ret;
+	}
 	sfd->vm_ops = vma->vm_ops;
 #ifdef CONFIG_MMU
 	WARN_ON(!sfd->vm_ops->fault);
 #endif
 	vma->vm_ops = &shm_vm_ops;
-	shm_open(vma);
-
-	return ret;
+	return 0;
 }

 static int shm_release(struct inode *ino, struct file *file)

--- a/kernel/memremap.c
+++ b/kernel/memremap.c
@@ -114,7 +114,7 @@ EXPORT_SYMBOL(memunmap);

 static void devm_memremap_release(struct device *dev, void *res)
 {
-	memunmap(res);
+	memunmap(*(void **)res);
 }

 static int devm_memremap_match(struct device *dev, void *res, void *match_data)

--- a/mm/huge_memory.c
+++ b/mm/huge_memory.c
@@ -1700,7 +1700,8 @@ bool move_huge_pmd(struct vm_area_struct *vma, struct vm_area_struct *new_vma,
 		pmd = pmdp_huge_get_and_clear(mm, old_addr, old_pmd);
 		VM_BUG_ON(!pmd_none(*new_pmd));

-		if (pmd_move_must_withdraw(new_ptl, old_ptl)) {
+		if (pmd_move_must_withdraw(new_ptl, old_ptl) &&
+				vma_is_anonymous(vma)) {
 			pgtable_t pgtable;
 			pgtable = pgtable_trans_huge_withdraw(mm, old_pmd);
 			pgtable_trans_huge_deposit(mm, new_pmd, pgtable);

--- a/mm/hugetlb.c
+++ b/mm/hugetlb.c
@@ -2630,8 +2630,10 @@ static int __init hugetlb_init(void)
 			hugetlb_add_hstate(HUGETLB_PAGE_ORDER);
 	}
 	default_hstate_idx = hstate_index(size_to_hstate(default_hstate_size));
-	if (default_hstate_max_huge_pages)
+	if (default_hstate_max_huge_pages) {
+		if (!default_hstate.max_huge_pages)
 			default_hstate.max_huge_pages = default_hstate_max_huge_pages;
+	}

 	hugetlb_init_hstates();
 	gather_bootmem_prealloc();

--- a/mm/mmap.c
+++ b/mm/mmap.c
@@ -2664,11 +2664,28 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 	if (!vma || !(vma->vm_flags & VM_SHARED))
 		goto out;

-	if (start < vma->vm_start || start + size > vma->vm_end)
+	if (start < vma->vm_start)
 		goto out;

-	if (pgoff == linear_page_index(vma, start)) {
-		ret = 0;
+	if (start + size > vma->vm_end) {
+		struct vm_area_struct *next;
+
+		for (next = vma->vm_next; next; next = next->vm_next) {
+			/* hole between vmas ? */
+			if (next->vm_start != next->vm_prev->vm_end)
+				goto out;
+
+			if (next->vm_file != vma->vm_file)
+				goto out;
+
+			if (next->vm_flags != vma->vm_flags)
+				goto out;
+
+			if (start + size <= next->vm_end)
+				break;
+		}
+
+		if (!next)
 			goto out;
 	}

@@ -2679,9 +2696,16 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
 	flags &= MAP_NONBLOCK;
 	flags |= MAP_SHARED | MAP_FIXED | MAP_POPULATE;
 	if (vma->vm_flags & VM_LOCKED) {
+		struct vm_area_struct *tmp;
 		flags |= MAP_LOCKED;
+
 		/* drop PG_Mlocked flag for over-mapped range */
-		munlock_vma_pages_range(vma, start, start + size);
+		for (tmp = vma; tmp->vm_start >= start + size;
+				tmp = tmp->vm_next) {
+			munlock_vma_pages_range(tmp,
+					max(tmp->vm_start, start),
+					min(tmp->vm_end, start + size));
+		}
 	}

 	file = get_file(vma->vm_file);

--- a/mm/slab.c
+++ b/mm/slab.c
@@ -2275,7 +2275,7 @@ __kmem_cache_create (struct kmem_cache *cachep, unsigned long flags)

 	err = setup_cpu_cache(cachep, gfp);
 	if (err) {
-		__kmem_cache_shutdown(cachep);
+		__kmem_cache_release(cachep);
 		return err;
 	}

@@ -2413,13 +2413,14 @@ int __kmem_cache_shrink(struct kmem_cache *cachep, bool deactivate)
 }

 int __kmem_cache_shutdown(struct kmem_cache *cachep)
+{
+	return __kmem_cache_shrink(cachep, false);
+}
+
+void __kmem_cache_release(struct kmem_cache *cachep)
 {
 	int i;
 	struct kmem_cache_node *n;
-	int rc = __kmem_cache_shrink(cachep, false);
-
-	if (rc)
-		return rc;

 	free_percpu(cachep->cpu_cache);

@@ -2430,7 +2431,6 @@ int __kmem_cache_shutdown(struct kmem_cache *cachep)
 		kfree(n);
 		cachep->node[i] = NULL;
 	}
-	return 0;
 }

 /*

--- a/mm/slab.h
+++ b/mm/slab.h
@@ -140,6 +140,7 @@ static inline unsigned long kmem_cache_flags(unsigned long object_size,
 #define CACHE_CREATE_MASK (SLAB_CORE_FLAGS | SLAB_DEBUG_FLAGS | SLAB_CACHE_FLAGS)

 int __kmem_cache_shutdown(struct kmem_cache *);
+void __kmem_cache_release(struct kmem_cache *);
 int __kmem_cache_shrink(struct kmem_cache *, bool);
 void slab_kmem_cache_release(struct kmem_cache *);


--- a/mm/slab_common.c
+++ b/mm/slab_common.c
@@ -693,6 +693,7 @@ static inline int shutdown_memcg_caches(struct kmem_cache *s,

 void slab_kmem_cache_release(struct kmem_cache *s)
 {
+	__kmem_cache_release(s);
 	destroy_memcg_params(s);
 	kfree_const(s->name);
 	kmem_cache_free(kmem_cache, s);

--- a/mm/slob.c
+++ b/mm/slob.c
@@ -630,6 +630,10 @@ int __kmem_cache_shutdown(struct kmem_cache *c)
 	return 0;
 }

+void __kmem_cache_release(struct kmem_cache *c)
+{
+}
+
 int __kmem_cache_shrink(struct kmem_cache *d, bool deactivate)
 {
 	return 0;

--- a/mm/slub.c
+++ b/mm/slub.c
@@ -1592,18 +1592,12 @@ static inline void add_partial(struct kmem_cache_node *n,
 	__add_partial(n, page, tail);
 }

-static inline void
-__remove_partial(struct kmem_cache_node *n, struct page *page)
-{
-	list_del(&page->lru);
-	n->nr_partial--;
-}
-
 static inline void remove_partial(struct kmem_cache_node *n,
 					struct page *page)
 {
 	lockdep_assert_held(&n->list_lock);
-	__remove_partial(n, page);
+	list_del(&page->lru);
+	n->nr_partial--;
 }

 /*
@@ -3184,6 +3178,12 @@ static void free_kmem_cache_nodes(struct kmem_cache *s)
 	}
 }

+void __kmem_cache_release(struct kmem_cache *s)
+{
+	free_percpu(s->cpu_slab);
+	free_kmem_cache_nodes(s);
+}
+
 static int init_kmem_cache_nodes(struct kmem_cache *s)
 {
 	int node;
@@ -3443,28 +3443,31 @@ static void list_slab_objects(struct kmem_cache *s, struct page *page,

 /*
 * Attempt to free all partial slabs on a node.
- * This is called from kmem_cache_close(). We must be the last thread
- * using the cache and therefore we do not need to lock anymore.
+ * This is called from __kmem_cache_shutdown(). We must take list_lock
+ * because sysfs file might still access partial list after the shutdowning.
 */
 static void free_partial(struct kmem_cache *s, struct kmem_cache_node *n)
 {
 	struct page *page, *h;

+	BUG_ON(irqs_disabled());
+	spin_lock_irq(&n->list_lock);
 	list_for_each_entry_safe(page, h, &n->partial, lru) {
 		if (!page->inuse) {
-			__remove_partial(n, page);
+			remove_partial(n, page);
 			discard_slab(s, page);
 		} else {
 			list_slab_objects(s, page,
-			"Objects remaining in %s on kmem_cache_close()");
+			"Objects remaining in %s on __kmem_cache_shutdown()");
 		}
 	}
+	spin_unlock_irq(&n->list_lock);
 }

 /*
 * Release all resources used by a slab cache.
 */
-static inline int kmem_cache_close(struct kmem_cache *s)
+int __kmem_cache_shutdown(struct kmem_cache *s)
 {
 	int node;
 	struct kmem_cache_node *n;
@@ -3476,16 +3479,9 @@ static inline int kmem_cache_close(struct kmem_cache *s)
 		if (n->nr_partial || slabs_node(s, node))
 			return 1;
 	}
-	free_percpu(s->cpu_slab);
-	free_kmem_cache_nodes(s);
 	return 0;
 }

-int __kmem_cache_shutdown(struct kmem_cache *s)
-{
-	return kmem_cache_close(s);
-}
-
 /********************************************************************
 *		Kmalloc subsystem
 *******************************************************************/
@@ -3980,7 +3976,7 @@ int __kmem_cache_create(struct kmem_cache *s, unsigned long flags)
 	memcg_propagate_slab_attrs(s);
 	err = sysfs_slab_add(s);
 	if (err)
-		kmem_cache_close(s);
+		__kmem_cache_release(s);

 	return err;
 }