Commit 645d5ce2 authored by Aneesh Kumar K.V, committed by Michael Ellerman

powerpc/mm/radix: Fix PTE/PMD fragment count for early page table mappings

We can hit the following BUG_ON during memory unplug:

kernel BUG at arch/powerpc/mm/book3s64/pgtable.c:342!
Oops: Exception in kernel mode, sig: 5 [#1]
LE PAGE_SIZE=64K MMU=Radix SMP NR_CPUS=2048 NUMA pSeries
NIP [c000000000093308] pmd_fragment_free+0x48/0xc0
LR [c00000000147bfec] remove_pagetable+0x578/0x60c
Call Trace:
0xc000008050000000 (unreliable)
remove_pagetable+0x384/0x60c
radix__remove_section_mapping+0x18/0x2c
remove_section_mapping+0x1c/0x3c
arch_remove_memory+0x11c/0x180
try_remove_memory+0x120/0x1b0
__remove_memory+0x20/0x40
dlpar_remove_lmb+0xc0/0x114
dlpar_memory+0x8b0/0xb20
handle_dlpar_errorlog+0xc0/0x190
pseries_hp_work_fn+0x2c/0x60
process_one_work+0x30c/0x810
worker_thread+0x98/0x540
kthread+0x1c4/0x1d0
ret_from_kernel_thread+0x5c/0x74

This occurs when unplug is attempted for memory that was mapped
using memblock pages as part of early kernel page table setup.
We never initialized the PMD or PTE fragment count for those
PMD or PTE pages.
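
For reference, the fragment free path that trips here looks roughly
like this before the patch (pmd_fragment_free() in
arch/powerpc/mm/book3s64/pgtable.c, per the BUG_ON location above);
for a PMD page handed out by the memblock allocator,
pt_frag_refcount was never initialized, so it reads 0 and the
BUG_ON() fires:

  void pmd_fragment_free(unsigned long *pmd)
  {
          struct page *page = virt_to_page(pmd);

          /*
           * For an early page table page allocated from memblock,
           * pt_frag_refcount was never set up, so it reads 0 here
           * and this BUG_ON() is what fires in the oops above.
           */
          BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
          if (atomic_dec_and_test(&page->pt_frag_refcount)) {
                  pgtable_pmd_page_dtor(page);
                  __free_page(page);
          }
  }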

Fix this by allocating memory in PAGE_SIZE granularity during early
page table allocation. This makes sure a given page is not shared
with another memblock allocation, so we can free it correctly when
removing page table pages.
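
In essence, the early mapping path now asks memblock for a full page
per table (a sketch of one case, mirroring the radix hunk further
below, not the complete function):

  if (p4d_none(*p4dp)) {
          /* allocate a full page for the PUD table, not just PUD_TABLE_SIZE */
          pudp = early_alloc_pgtable(PAGE_SIZE, nid,
                                     region_start, region_end);
          p4d_populate(&init_mm, p4dp, pudp);
  }

The fragment free paths can then detect such memblock-backed pages
via PageReserved() and return them with free_reserved_page(), as the
hunks below show.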

Since we now do PAGE_SIZE allocations for both PUD and PMD tables
(PTE table allocation is already PAGE_SIZE), we end up allocating
more memory for the same amount of system RAM. Here is a comparison
of how much more we need for 64T and 2G systems after this patch:

1. 64T system
-------------
64T RAM would need 64G for vmemmap with struct page size being 64B.

128 PUD tables for 64T memory (1G mappings; each PUD table maps 512G)
1 PUD table and 64 PMD tables for 64G vmemmap (2M mappings; each PMD table maps 1G)

With default PUD[PMD]_TABLE_SIZE(4K), (128+1+64)*4K=772K
With PAGE_SIZE(64K) table allocations, (128+1+64)*64K=12352K

2. 2G system
------------
2G RAM would need 2M for vmemmap with struct page size being 64B.

1 PUD table for 2G memory (1G mapping)
1 PUD table and 1 PMD table for 2M vmemmap (2M mappings)

With default PUD[PMD]_TABLE_SIZE(4K), (1+1+1)*4K=12K
With new PAGE_SIZE(64K) table allocations, (1+1+1)*64K=192K

Signed-off-by: Bharata B Rao <bharata@linux.ibm.com>
Signed-off-by: Aneesh Kumar K.V <aneesh.kumar@linux.ibm.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
Link: https://lore.kernel.org/r/20200709131925.922266-2-aneesh.kumar@linux.ibm.com
parent 9a77c4a0
@@ -107,11 +107,25 @@ static inline pud_t *pud_alloc_one(struct mm_struct *mm, unsigned long addr)
         return pud;
 }
 
-static inline void pud_free(struct mm_struct *mm, pud_t *pud)
+static inline void __pud_free(pud_t *pud)
 {
-        kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), pud);
+        struct page *page = virt_to_page(pud);
+
+        /*
+         * Early pud pages allocated via memblock allocator
+         * can't be directly freed to slab
+         */
+        if (PageReserved(page))
+                free_reserved_page(page);
+        else
+                kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), pud);
+}
+
+static inline void pud_free(struct mm_struct *mm, pud_t *pud)
+{
+        return __pud_free(pud);
 }
 
 static inline void pud_populate(struct mm_struct *mm, pud_t *pud, pmd_t *pmd)
 {
         *pud = __pud(__pgtable_ptr_val(pmd) | PUD_VAL_BITS);
@@ -339,6 +339,9 @@ void pmd_fragment_free(unsigned long *pmd)
 {
         struct page *page = virt_to_page(pmd);
 
+        if (PageReserved(page))
+                return free_reserved_page(page);
+
         BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
         if (atomic_dec_and_test(&page->pt_frag_refcount)) {
                 pgtable_pmd_page_dtor(page);
@@ -356,7 +359,7 @@ static inline void pgtable_free(void *table, int index)
                 pmd_fragment_free(table);
                 break;
         case PUD_INDEX:
-                kmem_cache_free(PGT_CACHE(PUD_CACHE_INDEX), table);
+                __pud_free(table);
                 break;
 #if defined(CONFIG_PPC_4K_PAGES) && defined(CONFIG_HUGETLB_PAGE)
         /* 16M hugepd directory at pud level */
@@ -56,6 +56,13 @@ static __ref void *early_alloc_pgtable(unsigned long size, int nid,
         return ptr;
 }
 
+/*
+ * When allocating pud or pmd pointers, we allocate a complete page
+ * of PAGE_SIZE rather than PUD_TABLE_SIZE or PMD_TABLE_SIZE. This
+ * is to ensure that the page obtained from the memblock allocator
+ * can be completely used as page table page and can be freed
+ * correctly when the page table entries are removed.
+ */
 static int early_map_kernel_page(unsigned long ea, unsigned long pa,
                           pgprot_t flags,
                           unsigned int map_page_size,
@@ -72,7 +79,7 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa,
         pgdp = pgd_offset_k(ea);
         p4dp = p4d_offset(pgdp, ea);
         if (p4d_none(*p4dp)) {
-                pudp = early_alloc_pgtable(PUD_TABLE_SIZE, nid,
+                pudp = early_alloc_pgtable(PAGE_SIZE, nid,
                                            region_start, region_end);
                 p4d_populate(&init_mm, p4dp, pudp);
         }
@@ -82,8 +89,8 @@ static int early_map_kernel_page(unsigned long ea, unsigned long pa,
                 goto set_the_pte;
         }
         if (pud_none(*pudp)) {
-                pmdp = early_alloc_pgtable(PMD_TABLE_SIZE, nid, region_start,
-                                           region_end);
+                pmdp = early_alloc_pgtable(PAGE_SIZE, nid, region_start,
+                                           region_end);
                 pud_populate(&init_mm, pudp, pmdp);
         }
         pmdp = pmd_offset(pudp, ea);
@@ -110,6 +110,9 @@ void pte_fragment_free(unsigned long *table, int kernel)
 {
         struct page *page = virt_to_page(table);
 
+        if (PageReserved(page))
+                return free_reserved_page(page);
+
         BUG_ON(atomic_read(&page->pt_frag_refcount) <= 0);
         if (atomic_dec_and_test(&page->pt_frag_refcount)) {
                 if (!kernel)