Commit 250cf776 authored by Christian Borntraeger, committed by Martin Schwidefsky

[S390] pgtables: Fix race in enable_sie vs. page table ops

The current enable_sie code sets the mm->context.pgstes bit to tell
dup_mm that the new mm should have extended page tables. This bit is also
used by the s390-specific page table primitives to decide on the page
table layout, which means context.pgstes has two meanings. This can cause
all kinds of bugs: for example, shrink_zone can call
ptep_clear_flush_young while enable_sie is running. ptep_clear_flush_young
will test context.pgstes, and since enable_sie changed that value on the old
struct mm without changing its page table layout, ptep_clear_flush_young will
do the wrong thing.
The solution is to split pgstes into two bits:
- one for the allocation
- one for the current state
Signed-off-by: Christian Borntraeger <borntraeger@de.ibm.com>
Signed-off-by: Martin Schwidefsky <schwidefsky@de.ibm.com>
parent 2c780914
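
Why a single flag races, as a minimal sketch (simplified types and hypothetical names, not the kernel code): enable_sie flips the flag on the old mm only so that dup_mm allocates extended page tables, but the flip is visible to every other CPU that consults the same flag to interpret the old mm's page tables.

struct mm_context {
	int pgstes;	/* old scheme: one flag, two meanings */
};

/* CPU 0: simplified old enable_sie - temporarily flips the flag on
 * the OLD mm just so that dup_mm() allocates extended page tables. */
static void enable_sie_old(struct mm_context *ctx)
{
	ctx->pgstes = 1;		/* globally visible... */
	/* ... dup_mm() runs here ... */
	ctx->pgstes = 0;
}

/* CPU 1: a page table primitive (e.g. shrink_zone ->
 * ptep_clear_flush_young) on the same old mm. It may observe the
 * transient pgstes == 1 and treat a normal page table as if it had
 * the extended pgste area. */
static int layout_has_pgste(const struct mm_context *ctx)
{
	return ctx->pgstes;	/* can be 1 although the layout is not */
}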
@@ -7,7 +7,8 @@ typedef struct {
 	unsigned long asce_bits;
 	unsigned long asce_limit;
 	int noexec;
-	int pgstes;
+	int has_pgste;	 /* The mmu context has extended page tables */
+	int alloc_pgste; /* cloned contexts will have extended page tables */
 } mm_context_t;
 
 #endif
@@ -20,12 +20,25 @@ static inline int init_new_context(struct task_struct *tsk,
 #ifdef CONFIG_64BIT
 	mm->context.asce_bits |= _ASCE_TYPE_REGION3;
 #endif
-	if (current->mm->context.pgstes) {
+	if (current->mm->context.alloc_pgste) {
+		/*
+		 * alloc_pgste indicates, that any NEW context will be created
+		 * with extended page tables. The old context is unchanged. The
+		 * page table allocation and the page table operations will
+		 * look at has_pgste to distinguish normal and extended page
+		 * tables. The only way to create extended page tables is to
+		 * set alloc_pgste and then create a new context (e.g. dup_mm).
+		 * The page table allocation is called after init_new_context
+		 * and if has_pgste is set, it will create extended page
+		 * tables.
+		 */
 		mm->context.noexec = 0;
-		mm->context.pgstes = 1;
+		mm->context.has_pgste = 1;
+		mm->context.alloc_pgste = 1;
 	} else {
 		mm->context.noexec = s390_noexec;
-		mm->context.pgstes = 0;
+		mm->context.has_pgste = 0;
+		mm->context.alloc_pgste = 0;
 	}
 	mm->context.asce_limit = STACK_TOP_MAX;
 	crst_table_init((unsigned long *) mm->pgd, pgd_entry_type(mm));
...
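
The inheritance rule described in the comment above, as a hedged sketch (simplified types, hypothetical names): a context picks up extended page tables only when it is created while its parent has alloc_pgste set; the parent's own has_pgste is never written.

struct mm_context {
	int has_pgste;		/* layout of THIS context's page tables */
	int alloc_pgste;	/* layout for contexts cloned from this one */
};

/* Simplified version of the init_new_context() logic above. */
static void init_new_context_sketch(struct mm_context *child,
				    const struct mm_context *parent)
{
	child->has_pgste = parent->alloc_pgste;
	child->alloc_pgste = parent->alloc_pgste;
	/* parent->has_pgste is untouched, so page table primitives on
	 * the parent always see a flag that matches its real layout */
}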
@@ -679,7 +679,7 @@ static inline void pmd_clear(pmd_t *pmd)
 
 static inline void pte_clear(struct mm_struct *mm, unsigned long addr, pte_t *ptep)
 {
-	if (mm->context.pgstes)
+	if (mm->context.has_pgste)
 		ptep_rcp_copy(ptep);
 	pte_val(*ptep) = _PAGE_TYPE_EMPTY;
 	if (mm->context.noexec)

@@ -763,7 +763,7 @@ static inline int kvm_s390_test_and_clear_page_dirty(struct mm_struct *mm,
 	struct page *page;
 	unsigned int skey;
 
-	if (!mm->context.pgstes)
+	if (!mm->context.has_pgste)
 		return -EINVAL;
 	rcp_lock(ptep);
 	pgste = (unsigned long *) (ptep + PTRS_PER_PTE);

@@ -794,7 +794,7 @@ static inline int ptep_test_and_clear_young(struct vm_area_struct *vma,
 	int young;
 	unsigned long *pgste;
 
-	if (!vma->vm_mm->context.pgstes)
+	if (!vma->vm_mm->context.has_pgste)
 		return 0;
 	physpage = pte_val(*ptep) & PAGE_MASK;
 	pgste = (unsigned long *) (ptep + PTRS_PER_PTE);

@@ -844,7 +844,7 @@ static inline void __ptep_ipte(unsigned long address, pte_t *ptep)
 static inline void ptep_invalidate(struct mm_struct *mm,
 				   unsigned long address, pte_t *ptep)
 {
-	if (mm->context.pgstes) {
+	if (mm->context.has_pgste) {
 		rcp_lock(ptep);
 		__ptep_ipte(address, ptep);
 		ptep_rcp_copy(ptep);
...
@@ -169,7 +169,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 	unsigned long *table;
 	unsigned long bits;
 
-	bits = (mm->context.noexec || mm->context.pgstes) ? 3UL : 1UL;
+	bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
 	spin_lock(&mm->page_table_lock);
 	page = NULL;
 	if (!list_empty(&mm->context.pgtable_list)) {

@@ -186,7 +186,7 @@ unsigned long *page_table_alloc(struct mm_struct *mm)
 		pgtable_page_ctor(page);
 		page->flags &= ~FRAG_MASK;
 		table = (unsigned long *) page_to_phys(page);
-		if (mm->context.pgstes)
+		if (mm->context.has_pgste)
 			clear_table_pgstes(table);
 		else
 			clear_table(table, _PAGE_TYPE_EMPTY, PAGE_SIZE);

@@ -210,7 +210,7 @@ void page_table_free(struct mm_struct *mm, unsigned long *table)
 	struct page *page;
 	unsigned long bits;
 
-	bits = (mm->context.noexec || mm->context.pgstes) ? 3UL : 1UL;
+	bits = (mm->context.noexec || mm->context.has_pgste) ? 3UL : 1UL;
 	bits <<= (__pa(table) & (PAGE_SIZE - 1)) / 256 / sizeof(unsigned long);
 	page = pfn_to_page(__pa(table) >> PAGE_SHIFT);
 	spin_lock(&mm->page_table_lock);

@@ -257,7 +257,7 @@ int s390_enable_sie(void)
 	struct mm_struct *mm, *old_mm;
 
 	/* Do we have pgstes? if yes, we are done */
-	if (tsk->mm->context.pgstes)
+	if (tsk->mm->context.has_pgste)
 		return 0;
 
 	/* lets check if we are allowed to replace the mm */

@@ -269,14 +269,14 @@ int s390_enable_sie(void)
 	}
 	task_unlock(tsk);
 
-	/* we copy the mm with pgstes enabled */
-	tsk->mm->context.pgstes = 1;
+	/* we copy the mm and let dup_mm create the page tables with_pgstes */
+	tsk->mm->context.alloc_pgste = 1;
 	mm = dup_mm(tsk);
-	tsk->mm->context.pgstes = 0;
+	tsk->mm->context.alloc_pgste = 0;
 	if (!mm)
 		return -ENOMEM;
 
-	/* Now lets check again if somebody attached ptrace etc */
+	/* Now lets check again if something happened */
 	task_lock(tsk);
 	if (!tsk->mm || atomic_read(&tsk->mm->mm_users) > 1 ||
 	    tsk->mm != tsk->active_mm || tsk->mm->ioctx_list) {
...
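
Putting the pieces together, the fixed s390_enable_sie sequence is roughly the following (hedged sketch with stub types; locking, the mm switch and most error handling are elided):

struct mm_context { int has_pgste, alloc_pgste; };
struct mm { struct mm_context context; };

struct mm *dup_mm_stub(struct mm *old);	/* stands in for dup_mm() */

static int enable_sie_sketch(struct mm *old)
{
	struct mm *new_mm;

	if (old->context.has_pgste)	/* already using extended tables */
		return 0;

	old->context.alloc_pgste = 1;	/* read only by init_new_context */
	new_mm = dup_mm_stub(old);	/* new context: has_pgste = 1 */
	old->context.alloc_pgste = 0;	/* old mm's has_pgste never changes */

	return new_mm ? 0 : -1;		/* -ENOMEM in the real code */
}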