Commit 0dd5b7b0 authored by David S. Miller's avatar David S. Miller

sparc64: Fix physical memory management regressions with large max_phys_bits.

If max_phys_bits needs to be > 43 (f.e. for T4 chips), things like
DEBUG_PAGEALLOC stop working because the 3-level page tables only
can cover up to 43 bits.

Another problem is that when we increased MAX_PHYS_ADDRESS_BITS up to
47, several statically allocated tables became enormous.

Compounding this is that we will need to support up to 49 bits of
physical addressing for M7 chips.

The two tables in question are sparc64_valid_addr_bitmap and
kpte_linear_bitmap.

The first holds a bitmap, with 1 bit for each 4MB chunk of physical
memory, indicating whether that chunk actually exists in the machine
and is valid.

The second table is a set of 2-bit values which tell how large of a
mapping (4MB, 256MB, 2GB, 16GB, respectively) we can use at each 256MB
chunk of ram in the system.

These tables are huge and take up an enormous amount of the BSS
section of the sparc64 kernel image.  Specifically, the
sparc64_valid_addr_bitmap is 4MB, and the kpte_linear_bitmap is 128K.

So let's solve the space wastage and the DEBUG_PAGEALLOC problem
at the same time, by using the kernel page tables (as designed) to
manage this information.

We have to keep using large mappings when DEBUG_PAGEALLOC is disabled,
and we do this by encoding huge PMDs and PUDs.

On a T4-2 with 256GB of ram the kernel page table takes up 16K with
DEBUG_PAGEALLOC disabled and 256MB with it enabled.  Furthermore, this
memory is dynamically allocated at run time rather than coded
statically into the kernel image.
Signed-off-by: default avatarDavid S. Miller <davem@davemloft.net>
Acked-by: default avatarBob Picco <bob.picco@oracle.com>
parent 8c82dc0e
......@@ -128,9 +128,6 @@ extern unsigned long PAGE_OFFSET;
*/
#define MAX_PHYS_ADDRESS_BITS 47
/* These two shift counts are used when indexing sparc64_valid_addr_bitmap
* and kpte_linear_bitmap.
*/
#define ILOG2_4MB 22
#define ILOG2_256MB 28
......
......@@ -79,22 +79,7 @@
#include <linux/sched.h>
extern unsigned long sparc64_valid_addr_bitmap[];
/* Needs to be defined here and not in linux/mm.h, as it is arch dependent */
static inline bool __kern_addr_valid(unsigned long paddr)
{
if ((paddr >> MAX_PHYS_ADDRESS_BITS) != 0UL)
return false;
return test_bit(paddr >> ILOG2_4MB, sparc64_valid_addr_bitmap);
}
static inline bool kern_addr_valid(unsigned long addr)
{
unsigned long paddr = __pa(addr);
return __kern_addr_valid(paddr);
}
bool kern_addr_valid(unsigned long addr);
/* Entries per page directory level. */
#define PTRS_PER_PTE (1UL << (PAGE_SHIFT-3))
......@@ -122,6 +107,7 @@ static inline bool kern_addr_valid(unsigned long addr)
#define _PAGE_R _AC(0x8000000000000000,UL) /* Keep ref bit uptodate*/
#define _PAGE_SPECIAL _AC(0x0200000000000000,UL) /* Special page */
#define _PAGE_PMD_HUGE _AC(0x0100000000000000,UL) /* Huge page */
#define _PAGE_PUD_HUGE _PAGE_PMD_HUGE
/* Advertise support for _PAGE_SPECIAL */
#define __HAVE_ARCH_PTE_SPECIAL
......@@ -668,26 +654,26 @@ static inline unsigned long pmd_large(pmd_t pmd)
return pte_val(pte) & _PAGE_PMD_HUGE;
}
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline unsigned long pmd_young(pmd_t pmd)
static inline unsigned long pmd_pfn(pmd_t pmd)
{
pte_t pte = __pte(pmd_val(pmd));
return pte_young(pte);
return pte_pfn(pte);
}
static inline unsigned long pmd_write(pmd_t pmd)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
static inline unsigned long pmd_young(pmd_t pmd)
{
pte_t pte = __pte(pmd_val(pmd));
return pte_write(pte);
return pte_young(pte);
}
static inline unsigned long pmd_pfn(pmd_t pmd)
static inline unsigned long pmd_write(pmd_t pmd)
{
pte_t pte = __pte(pmd_val(pmd));
return pte_pfn(pte);
return pte_write(pte);
}
static inline unsigned long pmd_trans_huge(pmd_t pmd)
......@@ -781,18 +767,15 @@ static inline int pmd_present(pmd_t pmd)
* the top bits outside of the range of any physical address size we
* support are clear as well. We also validate the physical itself.
*/
#define pmd_bad(pmd) ((pmd_val(pmd) & ~PAGE_MASK) || \
!__kern_addr_valid(pmd_val(pmd)))
#define pmd_bad(pmd) (pmd_val(pmd) & ~PAGE_MASK)
#define pud_none(pud) (!pud_val(pud))
#define pud_bad(pud) ((pud_val(pud) & ~PAGE_MASK) || \
!__kern_addr_valid(pud_val(pud)))
#define pud_bad(pud) (pud_val(pud) & ~PAGE_MASK)
#define pgd_none(pgd) (!pgd_val(pgd))
#define pgd_bad(pgd) ((pgd_val(pgd) & ~PAGE_MASK) || \
!__kern_addr_valid(pgd_val(pgd)))
#define pgd_bad(pgd) (pgd_val(pgd) & ~PAGE_MASK)
#ifdef CONFIG_TRANSPARENT_HUGEPAGE
void set_pmd_at(struct mm_struct *mm, unsigned long addr,
......@@ -835,6 +818,20 @@ static inline unsigned long __pmd_page(pmd_t pmd)
#define pgd_present(pgd) (pgd_val(pgd) != 0U)
#define pgd_clear(pgdp) (pgd_val(*(pgd)) = 0UL)
static inline unsigned long pud_large(pud_t pud)
{
pte_t pte = __pte(pud_val(pud));
return pte_val(pte) & _PAGE_PMD_HUGE;
}
static inline unsigned long pud_pfn(pud_t pud)
{
pte_t pte = __pte(pud_val(pud));
return pte_pfn(pte);
}
/* Same in both SUN4V and SUN4U. */
#define pte_none(pte) (!pte_val(pte))
......
......@@ -133,9 +133,24 @@ extern struct tsb_phys_patch_entry __tsb_phys_patch, __tsb_phys_patch_end;
sub TSB, 0x8, TSB; \
TSB_STORE(TSB, TAG);
/* Do a kernel page table walk. Leaves physical PTE pointer in
* REG1. Jumps to FAIL_LABEL on early page table walk termination.
* VADDR will not be clobbered, but REG2 will.
/* Do a kernel page table walk. Leaves valid PTE value in
* REG1. Jumps to FAIL_LABEL on early page table walk
* termination. VADDR will not be clobbered, but REG2 will.
*
* There are two masks we must apply to propagate bits from
* the virtual address into the PTE physical address field
* when dealing with huge pages. This is because the page
* table boundaries do not match the huge page size(s) the
* hardware supports.
*
* In these cases we propagate the bits that are below the
* page table level where we saw the huge page mapping, but
* are still within the relevant physical bits for the huge
* page size in question. So for PMD mappings (which fall on
* bit 23, for 8MB per PMD) we must propagate bit 22 for a
* 4MB huge page. For huge PUDs (which fall on bit 33, for
* 8GB per PUD), we have to accomodate 256MB and 2GB huge
* pages. So for those we propagate bits 32 to 28.
*/
#define KERN_PGTABLE_WALK(VADDR, REG1, REG2, FAIL_LABEL) \
sethi %hi(swapper_pg_dir), REG1; \
......@@ -150,15 +165,35 @@ extern struct tsb_phys_patch_entry __tsb_phys_patch, __tsb_phys_patch_end;
andn REG2, 0x7, REG2; \
ldxa [REG1 + REG2] ASI_PHYS_USE_EC, REG1; \
brz,pn REG1, FAIL_LABEL; \
sllx VADDR, 64 - (PMD_SHIFT + PMD_BITS), REG2; \
sethi %uhi(_PAGE_PUD_HUGE), REG2; \
brz,pn REG1, FAIL_LABEL; \
sllx REG2, 32, REG2; \
andcc REG1, REG2, %g0; \
sethi %hi(0xf8000000), REG2; \
bne,pt %xcc, 697f; \
sllx REG2, 1, REG2; \
sllx VADDR, 64 - (PMD_SHIFT + PMD_BITS), REG2; \
srlx REG2, 64 - PAGE_SHIFT, REG2; \
andn REG2, 0x7, REG2; \
ldxa [REG1 + REG2] ASI_PHYS_USE_EC, REG1; \
sethi %uhi(_PAGE_PMD_HUGE), REG2; \
brz,pn REG1, FAIL_LABEL; \
sllx VADDR, 64 - PMD_SHIFT, REG2; \
sllx REG2, 32, REG2; \
andcc REG1, REG2, %g0; \
be,pn %xcc, 698f; \
sethi %hi(0x400000), REG2; \
697: brgez,pn REG1, FAIL_LABEL; \
andn REG1, REG2, REG1; \
and VADDR, REG2, REG2; \
ba,pt %xcc, 699f; \
or REG1, REG2, REG1; \
698: sllx VADDR, 64 - PMD_SHIFT, REG2; \
srlx REG2, 64 - PAGE_SHIFT, REG2; \
andn REG2, 0x7, REG2; \
add REG1, REG2, REG1;
ldxa [REG1 + REG2] ASI_PHYS_USE_EC, REG1; \
brgez,pn REG1, FAIL_LABEL; \
nop; \
699:
/* PMD has been loaded into REG1, interpret the value, seeing
* if it is a HUGE PMD or a normal one. If it is not valid
......
......@@ -47,14 +47,6 @@ kvmap_itlb_vmalloc_addr:
KERN_PGTABLE_WALK(%g4, %g5, %g2, kvmap_itlb_longpath)
TSB_LOCK_TAG(%g1, %g2, %g7)
/* Load and check PTE. */
ldxa [%g5] ASI_PHYS_USE_EC, %g5
mov 1, %g7
sllx %g7, TSB_TAG_INVALID_BIT, %g7
brgez,a,pn %g5, kvmap_itlb_longpath
TSB_STORE(%g1, %g7)
TSB_WRITE(%g1, %g5, %g6)
/* fallthrough to TLB load */
......@@ -118,6 +110,12 @@ kvmap_dtlb_obp:
ba,pt %xcc, kvmap_dtlb_load
nop
kvmap_linear_early:
sethi %hi(kern_linear_pte_xor), %g7
ldx [%g7 + %lo(kern_linear_pte_xor)], %g2
ba,pt %xcc, kvmap_dtlb_tsb4m_load
xor %g2, %g4, %g5
.align 32
kvmap_dtlb_tsb4m_load:
TSB_LOCK_TAG(%g1, %g2, %g7)
......@@ -146,105 +144,17 @@ kvmap_dtlb_4v:
/* Correct TAG_TARGET is already in %g6, check 4mb TSB. */
KERN_TSB4M_LOOKUP_TL1(%g6, %g5, %g1, %g2, %g3, kvmap_dtlb_load)
#endif
/* TSB entry address left in %g1, lookup linear PTE.
* Must preserve %g1 and %g6 (TAG).
*/
kvmap_dtlb_tsb4m_miss:
/* Clear the PAGE_OFFSET top virtual bits, shift
* down to get PFN, and make sure PFN is in range.
*/
661: sllx %g4, 0, %g5
.section .page_offset_shift_patch, "ax"
.word 661b
.previous
/* Check to see if we know about valid memory at the 4MB
* chunk this physical address will reside within.
/* Linear mapping TSB lookup failed. Fallthrough to kernel
* page table based lookup.
*/
661: srlx %g5, MAX_PHYS_ADDRESS_BITS, %g2
.section .page_offset_shift_patch, "ax"
.word 661b
.previous
brnz,pn %g2, kvmap_dtlb_longpath
nop
/* This unconditional branch and delay-slot nop gets patched
* by the sethi sequence once the bitmap is properly setup.
*/
.globl valid_addr_bitmap_insn
valid_addr_bitmap_insn:
ba,pt %xcc, 2f
nop
.subsection 2
.globl valid_addr_bitmap_patch
valid_addr_bitmap_patch:
sethi %hi(sparc64_valid_addr_bitmap), %g7
or %g7, %lo(sparc64_valid_addr_bitmap), %g7
.previous
661: srlx %g5, ILOG2_4MB, %g2
.section .page_offset_shift_patch, "ax"
.word 661b
.previous
srlx %g2, 6, %g5
and %g2, 63, %g2
sllx %g5, 3, %g5
ldx [%g7 + %g5], %g5
mov 1, %g7
sllx %g7, %g2, %g7
andcc %g5, %g7, %g0
be,pn %xcc, kvmap_dtlb_longpath
2: sethi %hi(kpte_linear_bitmap), %g2
/* Get the 256MB physical address index. */
661: sllx %g4, 0, %g5
.section .page_offset_shift_patch, "ax"
.word 661b
.previous
or %g2, %lo(kpte_linear_bitmap), %g2
661: srlx %g5, ILOG2_256MB, %g5
.section .page_offset_shift_patch, "ax"
.word 661b
.previous
and %g5, (32 - 1), %g7
/* Divide by 32 to get the offset into the bitmask. */
srlx %g5, 5, %g5
add %g7, %g7, %g7
sllx %g5, 3, %g5
/* kern_linear_pte_xor[(mask >> shift) & 3)] */
ldx [%g2 + %g5], %g2
srlx %g2, %g7, %g7
sethi %hi(kern_linear_pte_xor), %g5
and %g7, 3, %g7
or %g5, %lo(kern_linear_pte_xor), %g5
sllx %g7, 3, %g7
ldx [%g5 + %g7], %g2
.globl kvmap_linear_patch
kvmap_linear_patch:
ba,pt %xcc, kvmap_dtlb_tsb4m_load
xor %g2, %g4, %g5
ba,a,pt %xcc, kvmap_linear_early
kvmap_dtlb_vmalloc_addr:
KERN_PGTABLE_WALK(%g4, %g5, %g2, kvmap_dtlb_longpath)
TSB_LOCK_TAG(%g1, %g2, %g7)
/* Load and check PTE. */
ldxa [%g5] ASI_PHYS_USE_EC, %g5
mov 1, %g7
sllx %g7, TSB_TAG_INVALID_BIT, %g7
brgez,a,pn %g5, kvmap_dtlb_longpath
TSB_STORE(%g1, %g7)
TSB_WRITE(%g1, %g5, %g6)
/* fallthrough to TLB load */
......
......@@ -122,11 +122,6 @@ SECTIONS
*(.swapper_4m_tsb_phys_patch)
__swapper_4m_tsb_phys_patch_end = .;
}
.page_offset_shift_patch : {
__page_offset_shift_patch = .;
*(.page_offset_shift_patch)
__page_offset_shift_patch_end = .;
}
.popc_3insn_patch : {
__popc_3insn_patch = .;
*(.popc_3insn_patch)
......
This diff is collapsed.
......@@ -8,15 +8,8 @@
*/
#define MAX_PHYS_ADDRESS (1UL << MAX_PHYS_ADDRESS_BITS)
#define KPTE_BITMAP_CHUNK_SZ (256UL * 1024UL * 1024UL)
#define KPTE_BITMAP_BYTES \
((MAX_PHYS_ADDRESS / KPTE_BITMAP_CHUNK_SZ) / 4)
#define VALID_ADDR_BITMAP_CHUNK_SZ (4UL * 1024UL * 1024UL)
#define VALID_ADDR_BITMAP_BYTES \
((MAX_PHYS_ADDRESS / VALID_ADDR_BITMAP_CHUNK_SZ) / 8)
extern unsigned long kern_linear_pte_xor[4];
extern unsigned long kpte_linear_bitmap[KPTE_BITMAP_BYTES / sizeof(unsigned long)];
extern unsigned int sparc64_highest_unlocked_tlb_ent;
extern unsigned long sparc64_kern_pri_context;
extern unsigned long sparc64_kern_pri_nuc_bits;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment