Commit d08d4e7c authored by Alexander Gordeev, committed by Vasily Gorbik

s390/mm: use full 4KB page for 2KB PTE

Cease using 4KB pages to host two 2KB PTE tables. That greatly
simplifies the memory management code at the expense of a larger
page table memory footprint.

Instead of two PTE tables per 4KB page, use only half of the
parent page for a single PTE table. With that, the list of
half-used pages, pgtable_list, becomes unneeded.

Further, the upper byte of the parent page's _refcount counter no
longer needs to be used for fragment tracking and can be left
alone.
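
For orientation, the allocation path condensed from the new
page_table_alloc() in the diff below (an editorial sketch with error
handling and initialisation details trimmed, not the verbatim patch):

  ptdesc = pagetable_alloc(GFP_KERNEL, 0);    /* one full 4KB page per PTE table */
  table = ptdesc_to_virt(ptdesc);
  memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);    /* first 2KB: the PTE table */
  memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); /* second 2KB: cleared, not handed out */
  return table;

No pgtable_list lookup and no _refcount bit manipulation are left on
this path.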

Commit 8211dad6 ("s390: add pte_free_defer() for
pgtables sharing page") introduced the use of the PageActive
flag to coordinate a deferred free with 2KB page table
fragment tracking. Since there is no such tracking anymore,
there is no need for the PageActive flag either.
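
With that coordination gone, the deferred free reduces to a plain RCU
callback. Condensed from the pte_free_defer()/pte_free_now() hunk at
the end of the diff (editorial excerpt, comments omitted):

  static void pte_free_now(struct rcu_head *head)
  {
          struct ptdesc *ptdesc = container_of(head, struct ptdesc, pt_rcu_head);

          pagetable_pte_dtor_free(ptdesc);
  }

  void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
  {
          struct ptdesc *ptdesc = virt_to_ptdesc(pgtable);

          call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
          WARN_ON_ONCE(mm_has_pgste(mm));
  }
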
Reviewed-by: Heiko Carstens <hca@linux.ibm.com>
Reviewed-by: Gerald Schaefer <gerald.schaefer@linux.ibm.com>
Signed-off-by: Alexander Gordeev <agordeev@linux.ibm.com>
Signed-off-by: Vasily Gorbik <gor@linux.ibm.com>
parent a51324c4
@@ -11,7 +11,6 @@ typedef struct {
cpumask_t cpu_attach_mask;
atomic_t flush_count;
unsigned int flush_mm;
struct list_head pgtable_list;
struct list_head gmap_list;
unsigned long gmap_asce;
unsigned long asce;
@@ -39,7 +38,6 @@ typedef struct {
#define INIT_MM_CONTEXT(name) \
.context.lock = __SPIN_LOCK_UNLOCKED(name.context.lock), \
.context.pgtable_list = LIST_HEAD_INIT(name.context.pgtable_list), \
.context.gmap_list = LIST_HEAD_INIT(name.context.gmap_list),
#endif
@@ -22,7 +22,6 @@ static inline int init_new_context(struct task_struct *tsk,
unsigned long asce_type, init_entry;
spin_lock_init(&mm->context.lock);
INIT_LIST_HEAD(&mm->context.pgtable_list);
INIT_LIST_HEAD(&mm->context.gmap_list);
cpumask_clear(&mm->context.cpu_attach_mask);
atomic_set(&mm->context.flush_count, 0);
......
@@ -69,11 +69,6 @@ static inline void pte_free_tlb(struct mmu_gather *tlb, pgtable_t pte,
tlb->mm->context.flush_mm = 1;
tlb->freed_tables = 1;
tlb->cleared_pmds = 1;
/*
* page_table_free_rcu takes care of the allocation bit masks
* of the 2K table fragments in the 4K page table page,
* then calls tlb_remove_table.
*/
page_table_free_rcu(tlb, (unsigned long *) pte, address);
}
......
@@ -133,11 +133,6 @@ int crst_table_upgrade(struct mm_struct *mm, unsigned long end)
return -ENOMEM;
}
static inline unsigned int atomic_xor_bits(atomic_t *v, unsigned int bits)
{
return atomic_fetch_xor(bits, v) ^ bits;
}
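/*
 * Editorial note, not part of the kernel source: atomic_fetch_xor()
 * returns the old value of v, so XOR-ing that result with bits once more
 * yields the value after the update. Callers of atomic_xor_bits() below
 * therefore observe the new state of the allocation/pending bits.
 */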
#ifdef CONFIG_PGSTE
struct page *page_table_alloc_pgste(struct mm_struct *mm)
@@ -162,125 +157,11 @@ void page_table_free_pgste(struct page *page)
#endif /* CONFIG_PGSTE */
/*
* A 2KB-pgtable is either upper or lower half of a normal page.
* The second half of the page may be unused or used as another
* 2KB-pgtable.
*
* Whenever possible the parent page for a new 2KB-pgtable is picked
* from the list of partially allocated pages mm_context_t::pgtable_list.
* In case the list is empty a new parent page is allocated and added to
* the list.
*
* When a parent page gets fully allocated it contains 2KB-pgtables in both
* upper and lower halves and is removed from mm_context_t::pgtable_list.
*
* When 2KB-pgtable is freed from a fully allocated parent page, that
* page turns partially allocated and is added to mm_context_t::pgtable_list.
*
* If 2KB-pgtable is freed from the partially allocated parent page, that
* page turns unused and gets removed from mm_context_t::pgtable_list.
* Furthermore, the unused parent page is released.
*
* As follows from the above, no unallocated or fully allocated parent
* pages are contained in mm_context_t::pgtable_list.
*
* The upper byte (bits 24-31) of the parent page _refcount is used
* for tracking contained 2KB-pgtables and has the following format:
*
*   PP  AA
* 01234567    upper byte (bits 24-31) of struct page::_refcount
*   ||  ||
*   ||  |+--- upper 2KB-pgtable is allocated
*   ||  +---- lower 2KB-pgtable is allocated
*   |+------- upper 2KB-pgtable is pending for removal
*   +-------- lower 2KB-pgtable is pending for removal
*
* (See commit 620b4e903179 ("s390: use _refcount for pgtables") on why
* using _refcount is possible).
*
* When 2KB-pgtable is allocated the corresponding AA bit is set to 1.
* The parent page is either:
* - added to mm_context_t::pgtable_list in case the second half of the
* parent page is still unallocated;
* - removed from mm_context_t::pgtable_list in case both halves of the
* parent page are allocated;
* These operations are protected with mm_context_t::lock.
*
* When 2KB-pgtable is deallocated the corresponding AA bit is set to 0
* and the corresponding PP bit is set to 1 in a single atomic operation.
* Thus, PP and AA bits corresponding to the same 2KB-pgtable are mutually
* exclusive and may never be both set to 1!
* The parent page is either:
* - added to mm_context_t::pgtable_list in case the second half of the
* parent page is still allocated;
* - removed from mm_context_t::pgtable_list in case the second half of
* the parent page is unallocated;
* These operations are protected with mm_context_t::lock.
*
* It is important to understand that mm_context_t::lock only protects
* mm_context_t::pgtable_list and AA bits, but not the parent page itself
* and PP bits.
*
* Releasing the parent page happens whenever the PP bit turns from 1 to 0,
* while both AA bits and the second PP bit are already unset. Then the
* parent page does not contain any 2KB-pgtable fragment anymore, and it has
* also been removed from mm_context_t::pgtable_list. It is safe to release
* the page therefore.
*
* PGSTE memory spaces use full 4KB-pgtables and do not need most of the
* logic described above. Both AA bits are set to 1 to denote a 4KB-pgtable
* while the PP bits are never used, nor is such a page added to or removed
* from mm_context_t::pgtable_list.
*
* pte_free_defer() overrides those rules: it takes the page off pgtable_list,
* and prevents both 2K fragments from being reused. pte_free_defer() has to
* guarantee that its pgtable cannot be reused before the RCU grace period
* has elapsed (which page_table_free_rcu() does not actually guarantee).
* But for simplicity, because page->rcu_head overlays page->lru, and because
* the RCU callback might not be called before the mm_context_t has been freed,
* pte_free_defer() in this implementation prevents both fragments from being
* reused, and delays making the call to RCU until both fragments are freed.
*/
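/*
 * Editorial worked example of the states described above, not part of
 * the kernel source. With the allocation bits 0x01/0x02 and the pending
 * bits 0x10/0x20 (all within the upper byte of _refcount):
 *
 *   0x00 - no fragment allocated or pending: the page is not (or no
 *          longer) on mm_context_t::pgtable_list and may be released
 *   0x01 - one 2KB fragment allocated, the other free: the page sits on
 *          pgtable_list so the free half can be handed out next
 *   0x03 - both fragments allocated: the page is off pgtable_list
 *   0x12 - one fragment allocated, the other pending removal: the page
 *          must not be released yet
 */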
unsigned long *page_table_alloc(struct mm_struct *mm)
{
unsigned long *table;
struct ptdesc *ptdesc;
unsigned int mask, bit;
/* Try to get a fragment of a 4K page as a 2K page table */
if (!mm_alloc_pgste(mm)) {
table = NULL;
spin_lock_bh(&mm->context.lock);
if (!list_empty(&mm->context.pgtable_list)) {
ptdesc = list_first_entry(&mm->context.pgtable_list,
struct ptdesc, pt_list);
mask = atomic_read(&ptdesc->_refcount) >> 24;
/*
* The pending removal bits must also be checked.
* Failure to do so might lead to an impossible
* value (e.g. 0x13 or 0x23) being written to _refcount.
* Such values violate the assumption that pending and
* allocation bits are mutually exclusive, and the rest
* of the code derails as a result. That could lead to
* a whole bunch of races and corruptions.
*/
mask = (mask | (mask >> 4)) & 0x03U;
if (mask != 0x03U) {
table = (unsigned long *) ptdesc_to_virt(ptdesc);
bit = mask & 1; /* =1 -> second 2K */
if (bit)
table += PTRS_PER_PTE;
atomic_xor_bits(&ptdesc->_refcount,
0x01U << (bit + 24));
list_del_init(&ptdesc->pt_list);
}
}
spin_unlock_bh(&mm->context.lock);
if (table)
return table;
}
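/*
 * Editorial note, not part of the kernel source: the removed check above
 * folds the pending bits onto the allocation bits, so a half counts as
 * busy if it is either allocated or pending removal. For example:
 *
 *   mask 0x01: (0x01 | 0x00) & 0x03 = 0x01 -> the second half is free,
 *              bit = 1, hand out table + PTRS_PER_PTE, set the 0x02 bit
 *   mask 0x21: (0x21 | 0x02) & 0x03 = 0x03 -> neither half is reusable
 *              (0x21 >> 4 contributes the 0x02), so fall through and
 *              allocate a fresh page
 */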
/* Allocate a fresh page */
unsigned long *table;
ptdesc = pagetable_alloc(GFP_KERNEL, 0);
if (!ptdesc)
return NULL;
@@ -288,177 +169,73 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
pagetable_free(ptdesc);
return NULL;
}
/* Initialize page table */
table = ptdesc_to_virt(ptdesc);
__arch_set_page_dat(table, 1);
if (mm_alloc_pgste(mm)) {
/* Return 4K page table with PGSTEs */
INIT_LIST_HEAD(&ptdesc->pt_list);
atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24);
memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
} else {
/* Return the first 2K fragment of the page */
atomic_xor_bits(&ptdesc->_refcount, 0x01U << 24);
memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE);
spin_lock_bh(&mm->context.lock);
list_add(&ptdesc->pt_list, &mm->context.pgtable_list);
spin_unlock_bh(&mm->context.lock);
}
/* pt_list is used by gmap only */
INIT_LIST_HEAD(&ptdesc->pt_list);
memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE);
memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE);
return table;
}
static void page_table_release_check(struct page *page, void *table,
unsigned int half, unsigned int mask)
{
char msg[128];
if (!IS_ENABLED(CONFIG_DEBUG_VM))
return;
if (!mask && list_empty(&page->lru))
return;
snprintf(msg, sizeof(msg),
"Invalid pgtable %p release half 0x%02x mask 0x%02x",
table, half, mask);
dump_page(page, msg);
}
static void pte_free_now(struct rcu_head *head)
static void pagetable_pte_dtor_free(struct ptdesc *ptdesc)
{
struct ptdesc *ptdesc;
ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
pagetable_pte_dtor(ptdesc);
pagetable_free(ptdesc);
}
void page_table_free(struct mm_struct *mm, unsigned long *table)
{
unsigned int mask, bit, half;
struct ptdesc *ptdesc = virt_to_ptdesc(table);
if (!mm_alloc_pgste(mm)) {
/* Free 2K page table fragment of a 4K page */
bit = ((unsigned long) table & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t));
spin_lock_bh(&mm->context.lock);
/*
* Mark the page for delayed release. The actual release
* will happen outside of the critical section from this
* function or from __tlb_remove_table()
*/
mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 24));
mask >>= 24;
if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) {
/*
* Other half is allocated, and neither half has had
* its free deferred: add page to head of list, to make
* this freed half available for immediate reuse.
*/
list_add(&ptdesc->pt_list, &mm->context.pgtable_list);
} else {
/* If page is on list, now remove it. */
list_del_init(&ptdesc->pt_list);
}
spin_unlock_bh(&mm->context.lock);
mask = atomic_xor_bits(&ptdesc->_refcount, 0x10U << (bit + 24));
mask >>= 24;
if (mask != 0x00U)
return;
half = 0x01U << bit;
} else {
half = 0x03U;
mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24);
mask >>= 24;
}
page_table_release_check(ptdesc_page(ptdesc), table, half, mask);
if (folio_test_clear_active(ptdesc_folio(ptdesc)))
call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
else
pte_free_now(&ptdesc->pt_rcu_head);
pagetable_pte_dtor_free(ptdesc);
}
void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table,
unsigned long vmaddr)
{
struct mm_struct *mm;
unsigned int bit, mask;
struct ptdesc *ptdesc = virt_to_ptdesc(table);
mm = tlb->mm;
if (mm_alloc_pgste(mm)) {
if (mm_alloc_pgste(mm))
gmap_unlink(mm, table, vmaddr);
table = (unsigned long *) ((unsigned long)table | 0x03U);
tlb_remove_ptdesc(tlb, table);
return;
}
bit = ((unsigned long) table & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t));
spin_lock_bh(&mm->context.lock);
/*
* Mark the page for delayed release. The actual release will happen
* outside of the critical section from __tlb_remove_table() or from
* page_table_free()
*/
mask = atomic_xor_bits(&ptdesc->_refcount, 0x11U << (bit + 24));
mask >>= 24;
if ((mask & 0x03U) && !folio_test_active(ptdesc_folio(ptdesc))) {
/*
* Other half is allocated, and neither half has had
* its free deferred: add page to end of list, to make
* this freed half available for reuse once its pending
* bit has been cleared by __tlb_remove_table().
*/
list_add_tail(&ptdesc->pt_list, &mm->context.pgtable_list);
} else {
/* If page is on list, now remove it. */
list_del_init(&ptdesc->pt_list);
}
spin_unlock_bh(&mm->context.lock);
table = (unsigned long *) ((unsigned long) table | (0x01U << bit));
table = (unsigned long *)((unsigned long)table | 0x01U);
tlb_remove_ptdesc(tlb, table);
}
void __tlb_remove_table(void *_table)
{
unsigned int mask = (unsigned long) _table & 0x03U, half = mask;
void *table = (void *)((unsigned long) _table ^ mask);
struct ptdesc *ptdesc = virt_to_ptdesc(table);
switch (half) {
case 0x00U: /* pmd, pud, or p4d */
struct ptdesc *ptdesc;
unsigned int mask;
void *table;
mask = (unsigned long)_table & 0x01U;
table = (void *)((unsigned long)_table ^ mask);
ptdesc = virt_to_ptdesc(table);
if (!mask) {
/* pmd, pud, or p4d */
pagetable_free(ptdesc);
return;
case 0x01U: /* lower 2K of a 4K page table */
case 0x02U: /* higher 2K of a 4K page table */
mask = atomic_xor_bits(&ptdesc->_refcount, mask << (4 + 24));
mask >>= 24;
if (mask != 0x00U)
return;
break;
case 0x03U: /* 4K page table with pgstes */
mask = atomic_xor_bits(&ptdesc->_refcount, 0x03U << 24);
mask >>= 24;
break;
}
page_table_release_check(ptdesc_page(ptdesc), table, half, mask);
if (folio_test_clear_active(ptdesc_folio(ptdesc)))
call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
else
pte_free_now(&ptdesc->pt_rcu_head);
pagetable_pte_dtor_free(ptdesc);
}
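/*
 * Editorial note, not part of the kernel source: with the 2KB fragments
 * gone, only the lowest bit of the pointer handed to tlb_remove_ptdesc()
 * serves as a tag. page_table_free_rcu() sets 0x01 to mark a PTE table,
 * which still needs pagetable_pte_dtor(); pmd, pud and p4d tables arrive
 * untagged and are released with pagetable_free() directly.
 * __tlb_remove_table() above strips the tag via "_table ^ mask" before
 * converting the address back to a ptdesc.
 */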
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static void pte_free_now(struct rcu_head *head)
{
struct ptdesc *ptdesc = container_of(head, struct ptdesc, pt_rcu_head);
pagetable_pte_dtor_free(ptdesc);
}
void pte_free_defer(struct mm_struct *mm, pgtable_t pgtable)
{
struct page *page;
struct ptdesc *ptdesc = virt_to_ptdesc(pgtable);
page = virt_to_page(pgtable);
SetPageActive(page);
page_table_free(mm, (unsigned long *)pgtable);
call_rcu(&ptdesc->pt_rcu_head, pte_free_now);
/*
* page_table_free() does not do the pgste gmap_unlink() which
* page_table_free_rcu() does: warn us if pgste ever reaches here.
* THPs are not allowed for KVM guests. Warn if pgste ever reaches here.
* Turn to the generic pte_free_defer() version once gmap is removed.
*/
WARN_ON_ONCE(mm_has_pgste(mm));
}
......