Commit 323fb82b authored by Jeff Dike, committed by Linus Torvalds

[PATCH] uml: Consolidate tlb flushing code

This patch merges the now-identical page table walking and flushing code that
had been duplicated between skas and tt modes.  The only differences were the
low-level address space updating operations, which are now abstracted behind
each mode's do_ops function.
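To make the shape of the consolidation concrete, here is a minimal, self-contained
sketch (illustrative only, not the kernel code): the shared walker batches
host_vm_op entries and flushes them through whichever do_ops function the caller
supplies, with skas passing its address-space file descriptor as the data argument
and tt passing 0, mirroring the fix_range_common declaration in the diff below.
The do_ops_skas/do_ops_tt and walk_and_flush names, the op fields, and the walker
body are placeholders.

#include <stdio.h>

struct host_vm_op {
	unsigned long addr;
	unsigned long len;
};

/* skas-style callback: operations are applied through the mm's
 * file descriptor, which arrives as the first argument. */
static void do_ops_skas(int fd, struct host_vm_op *ops, int last)
{
	int i;

	for(i = 0; i <= last; i++)
		printf("skas: op at 0x%lx via fd %d\n", ops[i].addr, fd);
}

/* tt-style callback: the first argument is unused because the ops
 * act on the current address space directly. */
static void do_ops_tt(int unused, struct host_vm_op *ops, int last)
{
	int i;

	for(i = 0; i <= last; i++)
		printf("tt: op at 0x%lx in place\n", ops[i].addr);
}

/* Stand-in for the shared walker: build a batch of ops, then hand
 * the batch to the mode-specific callback along with its data argument. */
static void walk_and_flush(int data,
			   void (*do_ops)(int, struct host_vm_op *, int))
{
	struct host_vm_op ops[16];
	int op_index = 0;

	ops[op_index].addr = 0x1000;
	ops[op_index].len = 0x1000;
	do_ops(data, ops, op_index);
}

int main(void)
{
	walk_and_flush(42, do_ops_skas);	/* skas passes its mm fd (42 is made up) */
	walk_and_flush(0, do_ops_tt);		/* tt passes 0 */
	return 0;
}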
Signed-off-by: Jeff Dike <jdike@addtoit.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
parent 2eb2be7d
@@ -36,6 +36,11 @@ struct host_vm_op {
extern void mprotect_kernel_vm(int w);
extern void force_flush_all(void);
extern void fix_range_common(struct mm_struct *mm, unsigned long start_addr,
unsigned long end_addr, int force, int data,
void (*do_ops)(int, struct host_vm_op *, int));
extern int flush_tlb_kernel_range_common(unsigned long start,
unsigned long end);
extern int add_mmap(unsigned long virt, unsigned long phys, unsigned long len,
int r, int w, int x, struct host_vm_op *ops, int index,
@@ -6,22 +6,22 @@
#ifndef __ARCH_UM_MMU_H
#define __ARCH_UM_MMU_H
#include "linux/config.h"
#include "uml-config.h"
#include "choose-mode.h"
#ifdef CONFIG_MODE_TT
#ifdef UML_CONFIG_MODE_TT
#include "mmu-tt.h"
#endif
#ifdef CONFIG_MODE_SKAS
#ifdef UML_CONFIG_MODE_SKAS
#include "mmu-skas.h"
#endif
typedef union {
#ifdef CONFIG_MODE_TT
typedef union mm_context {
#ifdef UML_CONFIG_MODE_TT
struct mmu_context_tt tt;
#endif
#ifdef CONFIG_MODE_SKAS
#ifdef UML_CONFIG_MODE_SKAS
struct mmu_context_skas skas;
#endif
} mm_context_t;
@@ -49,220 +49,37 @@ static void do_ops(int fd, struct host_vm_op *ops, int last)
static void fix_range(struct mm_struct *mm, unsigned long start_addr,
unsigned long end_addr, int force)
{
pgd_t *npgd;
pud_t *npud;
pmd_t *npmd;
pte_t *npte;
unsigned long addr, end;
int r, w, x, fd;
struct host_vm_op ops[16];
int op_index = -1, last_op = sizeof(ops) / sizeof(ops[0]) - 1;
int fd = mm->context.skas.mm_fd;
if(mm == NULL) return;
fd = mm->context.skas.mm_fd;
for(addr = start_addr; addr < end_addr;){
npgd = pgd_offset(mm, addr);
if(!pgd_present(*npgd)){
if(force || pgd_newpage(*npgd)){
end = addr + PGDIR_SIZE;
if(end > end_addr)
end = end_addr;
op_index = add_munmap(addr, end - addr, ops,
op_index, last_op, fd,
do_ops);
pgd_mkuptodate(*npgd);
}
addr += PGDIR_SIZE;
continue;
}
npud = pud_offset(npgd, addr);
if(!pud_present(*npud)){
if(force || pud_newpage(*npud)){
end = addr + PUD_SIZE;
if(end > end_addr)
end = end_addr;
op_index = add_munmap(addr, end - addr, ops,
op_index, last_op, fd,
do_ops);
pud_mkuptodate(*npud);
}
addr += PUD_SIZE;
continue;
}
npmd = pmd_offset(npud, addr);
if(!pmd_present(*npmd)){
if(force || pmd_newpage(*npmd)){
end = addr + PMD_SIZE;
if(end > end_addr)
end = end_addr;
op_index = add_munmap(addr, end - addr, ops,
op_index, last_op, fd,
do_ops);
pmd_mkuptodate(*npmd);
}
addr += PMD_SIZE;
continue;
}
npte = pte_offset_kernel(npmd, addr);
r = pte_read(*npte);
w = pte_write(*npte);
x = pte_exec(*npte);
if(!pte_dirty(*npte))
w = 0;
if(!pte_young(*npte)){
r = 0;
w = 0;
}
if(force || pte_newpage(*npte)){
if(pte_present(*npte))
op_index = add_mmap(addr,
pte_val(*npte) & PAGE_MASK,
PAGE_SIZE, r, w, x, ops,
op_index, last_op, fd,
do_ops);
else op_index = add_munmap(addr, PAGE_SIZE, ops,
op_index, last_op, fd,
do_ops);
}
else if(pte_newprot(*npte))
op_index = add_mprotect(addr, PAGE_SIZE, r, w, x, ops,
op_index, last_op, fd,
do_ops);
*npte = pte_mkuptodate(*npte);
addr += PAGE_SIZE;
}
do_ops(fd, ops, op_index);
}
void flush_tlb_kernel_range_skas(unsigned long start, unsigned long end)
{
struct mm_struct *mm;
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
unsigned long addr, last;
int updated = 0, err;
mm = &init_mm;
for(addr = start; addr < end;){
pgd = pgd_offset(mm, addr);
pud = pud_offset(pgd, addr);
pmd = pmd_offset(pud, addr);
if(!pgd_present(*pgd)){
if(pgd_newpage(*pgd)){
updated = 1;
last = addr + PGDIR_SIZE;
if(last > end)
last = end;
err = os_unmap_memory((void *) addr,
last - addr);
if(err < 0)
panic("munmap failed, errno = %d\n",
-err);
}
addr += PGDIR_SIZE;
continue;
}
pud = pud_offset(pgd, addr);
if(!pud_present(*pud)){
if(pud_newpage(*pud)){
updated = 1;
last = addr + PUD_SIZE;
if(last > end)
last = end;
err = os_unmap_memory((void *) addr,
last - addr);
if(err < 0)
panic("munmap failed, errno = %d\n",
-err);
}
addr += PUD_SIZE;
continue;
}
pmd = pmd_offset(pud, addr);
if(!pmd_present(*pmd)){
if(pmd_newpage(*pmd)){
updated = 1;
last = addr + PMD_SIZE;
if(last > end)
last = end;
err = os_unmap_memory((void *) addr,
last - addr);
if(err < 0)
panic("munmap failed, errno = %d\n",
-err);
}
addr += PMD_SIZE;
continue;
}
pte = pte_offset_kernel(pmd, addr);
if(!pte_present(*pte) || pte_newpage(*pte)){
updated = 1;
err = os_unmap_memory((void *) addr, PAGE_SIZE);
if(err < 0)
panic("munmap failed, errno = %d\n", -err);
if(pte_present(*pte))
map_memory(addr, pte_val(*pte) & PAGE_MASK,
PAGE_SIZE, 1, 1, 1);
}
else if(pte_newprot(*pte)){
updated = 1;
protect_memory(addr, PAGE_SIZE, 1, 1, 1, 1);
}
addr += PAGE_SIZE;
}
}
void flush_tlb_kernel_vm_skas(void)
{
flush_tlb_kernel_range_skas(start_vm, end_vm);
fix_range_common(mm, start_addr, end_addr, force, fd, do_ops);
}
void __flush_tlb_one_skas(unsigned long addr)
{
flush_tlb_kernel_range_skas(addr, addr + PAGE_SIZE);
flush_tlb_kernel_range_common(addr, addr + PAGE_SIZE);
}
void flush_tlb_range_skas(struct vm_area_struct *vma, unsigned long start,
unsigned long end)
{
if(vma->vm_mm == NULL)
flush_tlb_kernel_range_skas(start, end);
else fix_range(vma->vm_mm, start, end, 0);
if(vma->vm_mm == NULL)
flush_tlb_kernel_range_common(start, end);
else fix_range(vma->vm_mm, start, end, 0);
}
void flush_tlb_mm_skas(struct mm_struct *mm)
{
/* Don't bother flushing if this address space is about to be
* destroyed.
*/
if(atomic_read(&mm->mm_users) == 0)
return;
* destroyed.
*/
if(atomic_read(&mm->mm_users) == 0)
return;
flush_tlb_kernel_vm_skas();
fix_range(mm, 0, host_task_size, 0);
fix_range(mm, 0, host_task_size, 0);
flush_tlb_kernel_range_common(start_vm, end_vm);
}
void force_flush_all_skas(void)
{
fix_range(current->mm, 0, host_task_size, 1);
fix_range(current->mm, 0, host_task_size, 1);
}
/*
* Overrides for Emacs so that we follow Linus's tabbing style.
* Emacs will notice this stuff at the end of the file and automatically
* adjust the settings for this buffer only. This must remain at the end
* of the file.
* ---------------------------------------------------------------------------
* Local variables:
* c-file-style: "linux"
* End:
*/
This diff is collapsed.
@@ -15,7 +15,7 @@
void before_mem_tt(unsigned long brk_start)
{
if(!jail || debug)
if(debug)
remap_data(UML_ROUND_DOWN(&_stext), UML_ROUND_UP(&_etext), 1);
remap_data(UML_ROUND_DOWN(&_sdata), UML_ROUND_UP(&_edata), 1);
remap_data(UML_ROUND_DOWN(&__bss_start), UML_ROUND_UP(&_end), 1);
@@ -85,24 +85,6 @@ void *switch_to_tt(void *prev, void *next, void *last)
(prev_sched->exit_state == EXIT_DEAD))
os_kill_process(prev_sched->thread.mode.tt.extern_pid, 1);
/* This works around a nasty race with 'jail'. If we are switching
* between two threads of a threaded app and the incoming process
* runs before the outgoing process reaches the read, and it makes
* it all the way out to userspace, then it will have write-protected
* the outgoing process stack. Then, when the outgoing process
* returns from the write, it will segfault because it can no longer
* write its own stack. So, in order to avoid that, the incoming
* thread sits in a loop yielding until 'reading' is set. This
* isn't entirely safe, since there may be a reschedule from a timer
* happening between setting 'reading' and sleeping in read. But,
* it should get a whole quantum in which to reach the read and sleep,
* which should be enough.
*/
if(jail){
while(!reading) sched_yield();
}
change_sig(SIGVTALRM, vtalrm);
change_sig(SIGALRM, alrm);
change_sig(SIGPROF, prof);
@@ -394,84 +376,6 @@ void init_idle_tt(void)
default_idle();
}
/* Changed by jail_setup, which is a setup */
int jail = 0;
int __init jail_setup(char *line, int *add)
{
int ok = 1;
if(jail) return(0);
#ifdef CONFIG_SMP
printf("'jail' may not used used in a kernel with CONFIG_SMP "
"enabled\n");
ok = 0;
#endif
#ifdef CONFIG_HOSTFS
printf("'jail' may not used used in a kernel with CONFIG_HOSTFS "
"enabled\n");
ok = 0;
#endif
#ifdef CONFIG_MODULES
printf("'jail' may not used used in a kernel with CONFIG_MODULES "
"enabled\n");
ok = 0;
#endif
if(!ok) exit(1);
/* CAP_SYS_RAWIO controls the ability to open /dev/mem and /dev/kmem.
* Removing it from the bounding set eliminates the ability of anything
* to acquire it, and thus read or write kernel memory.
*/
cap_lower(cap_bset, CAP_SYS_RAWIO);
jail = 1;
return(0);
}
__uml_setup("jail", jail_setup,
"jail\n"
" Enables the protection of kernel memory from processes.\n\n"
);
static void mprotect_kernel_mem(int w)
{
unsigned long start, end;
int pages;
if(!jail || (current == &init_task)) return;
pages = (1 << CONFIG_KERNEL_STACK_ORDER);
start = (unsigned long) current_thread + PAGE_SIZE;
end = (unsigned long) current_thread + PAGE_SIZE * pages;
protect_memory(uml_reserved, start - uml_reserved, 1, w, 1, 1);
protect_memory(end, high_physmem - end, 1, w, 1, 1);
start = (unsigned long) UML_ROUND_DOWN(&_stext);
end = (unsigned long) UML_ROUND_UP(&_etext);
protect_memory(start, end - start, 1, w, 1, 1);
start = (unsigned long) UML_ROUND_DOWN(&_unprotected_end);
end = (unsigned long) UML_ROUND_UP(&_edata);
protect_memory(start, end - start, 1, w, 1, 1);
start = (unsigned long) UML_ROUND_DOWN(&__bss_start);
end = (unsigned long) UML_ROUND_UP(brk_start);
protect_memory(start, end - start, 1, w, 1, 1);
mprotect_kernel_vm(w);
}
void unprotect_kernel_mem(void)
{
mprotect_kernel_mem(1);
}
void protect_kernel_mem(void)
{
mprotect_kernel_mem(0);
}
extern void start_kernel(void);
static int start_kernel_proc(void *unused)
@@ -50,202 +50,20 @@ static void do_ops(int unused, struct host_vm_op *ops, int last)
static void fix_range(struct mm_struct *mm, unsigned long start_addr,
unsigned long end_addr, int force)
{
pgd_t *npgd;
pud_t *npud;
pmd_t *npmd;
pte_t *npte;
unsigned long addr, end;
int r, w, x;
struct host_vm_op ops[16];
int op_index = -1, last_op = sizeof(ops) / sizeof(ops[0]) - 1;
if((current->thread.mode.tt.extern_pid != -1) &&
(current->thread.mode.tt.extern_pid != os_getpid()))
panic("fix_range fixing wrong address space, current = 0x%p",
current);
if((current->thread.mode.tt.extern_pid != -1) &&
(current->thread.mode.tt.extern_pid != os_getpid()))
panic("fix_range fixing wrong address space, current = 0x%p",
current);
if(mm == NULL) return;
for(addr=start_addr;addr<end_addr;){
if(addr == TASK_SIZE){
/* Skip over kernel text, kernel data, and physical
* memory, which don't have ptes, plus kernel virtual
* memory, which is flushed separately, and remap
* the process stack. The only way to get here is
* if (end_addr == STACK_TOP) > TASK_SIZE, which is
* only true in the honeypot case.
*/
addr = STACK_TOP - ABOVE_KMEM;
continue;
}
npgd = pgd_offset(mm, addr);
if(!pgd_present(*npgd)){
if(force || pgd_newpage(*npgd)){
end = addr + PGDIR_SIZE;
if(end > end_addr)
end = end_addr;
op_index = add_munmap(addr, end - addr, ops,
op_index, last_op, 0,
do_ops);
pgd_mkuptodate(*npgd);
}
addr += PGDIR_SIZE;
continue;
}
npud = pud_offset(npgd, addr);
if(!pud_present(*npud)){
if(force || pud_newpage(*npud)){
end = addr + PUD_SIZE;
if(end > end_addr)
end = end_addr;
op_index = add_munmap(addr, end - addr, ops,
op_index, last_op, 0,
do_ops);
pud_mkuptodate(*npud);
}
addr += PUD_SIZE;
continue;
}
npmd = pmd_offset(npud, addr);
if(!pmd_present(*npmd)){
if(force || pmd_newpage(*npmd)){
end = addr + PMD_SIZE;
if(end > end_addr)
end = end_addr;
op_index = add_munmap(addr, end - addr, ops,
op_index, last_op, 0,
do_ops);
pmd_mkuptodate(*npmd);
}
addr += PMD_SIZE;
continue;
}
npte = pte_offset_kernel(npmd, addr);
r = pte_read(*npte);
w = pte_write(*npte);
x = pte_exec(*npte);
if(!pte_dirty(*npte))
w = 0;
if(!pte_young(*npte)){
r = 0;
w = 0;
}
if(force || pte_newpage(*npte)){
if(pte_present(*npte))
op_index = add_mmap(addr,
pte_val(*npte) & PAGE_MASK,
PAGE_SIZE, r, w, x, ops,
op_index, last_op, 0,
do_ops);
else op_index = add_munmap(addr, PAGE_SIZE, ops,
op_index, last_op, 0,
do_ops);
}
else if(pte_newprot(*npte))
op_index = add_mprotect(addr, PAGE_SIZE, r, w, x, ops,
op_index, last_op, 0,
do_ops);
*npte = pte_mkuptodate(*npte);
addr += PAGE_SIZE;
}
do_ops(0, ops, op_index);
fix_range_common(mm, start_addr, end_addr, force, 0, do_ops);
}
atomic_t vmchange_seq = ATOMIC_INIT(1);
static void flush_kernel_vm_range(unsigned long start, unsigned long end,
int update_seq)
{
struct mm_struct *mm;
pgd_t *pgd;
pud_t *pud;
pmd_t *pmd;
pte_t *pte;
unsigned long addr, last;
int updated = 0, err;
mm = &init_mm;
for(addr = start; addr < end;){
pgd = pgd_offset(mm, addr);
if(!pgd_present(*pgd)){
if(pgd_newpage(*pgd)){
updated = 1;
last = addr + PGDIR_SIZE;
if(last > end)
last = end;
err = os_unmap_memory((void *) addr,
last - addr);
if(err < 0)
panic("munmap failed, errno = %d\n",
-err);
}
addr += PGDIR_SIZE;
continue;
}
pud = pud_offset(pgd, addr);
if(!pud_present(*pud)){
if(pud_newpage(*pud)){
updated = 1;
last = addr + PUD_SIZE;
if(last > end)
last = end;
err = os_unmap_memory((void *) addr,
last - addr);
if(err < 0)
panic("munmap failed, errno = %d\n",
-err);
}
addr += PUD_SIZE;
continue;
}
pmd = pmd_offset(pud, addr);
if(!pmd_present(*pmd)){
if(pmd_newpage(*pmd)){
updated = 1;
last = addr + PMD_SIZE;
if(last > end)
last = end;
err = os_unmap_memory((void *) addr,
last - addr);
if(err < 0)
panic("munmap failed, errno = %d\n",
-err);
}
addr += PMD_SIZE;
continue;
}
pte = pte_offset_kernel(pmd, addr);
if(!pte_present(*pte) || pte_newpage(*pte)){
updated = 1;
err = os_unmap_memory((void *) addr,
PAGE_SIZE);
if(err < 0)
panic("munmap failed, errno = %d\n",
-err);
if(pte_present(*pte))
map_memory(addr,
pte_val(*pte) & PAGE_MASK,
PAGE_SIZE, 1, 1, 1);
}
else if(pte_newprot(*pte)){
updated = 1;
protect_memory(addr, PAGE_SIZE, 1, 1, 1, 1);
}
addr += PAGE_SIZE;
}
if(updated && update_seq) atomic_inc(&vmchange_seq);
}
void flush_tlb_kernel_range_tt(unsigned long start, unsigned long end)
{
flush_kernel_vm_range(start, end, 1);
if(flush_tlb_kernel_range_common(start, end))
atomic_inc(&vmchange_seq);
}
static void protect_vm_page(unsigned long addr, int w, int must_succeed)
@@ -302,8 +120,10 @@ void flush_tlb_range_tt(struct vm_area_struct *vma, unsigned long start,
/* Assumes that the range start ... end is entirely within
* either process memory or kernel vm
*/
if((start >= start_vm) && (start < end_vm))
flush_kernel_vm_range(start, end, 1);
if((start >= start_vm) && (start < end_vm)){
if(flush_tlb_kernel_range_common(start, end))
atomic_inc(&vmchange_seq);
}
else fix_range(vma->vm_mm, start, end, 0);
}
@@ -316,24 +136,14 @@ void flush_tlb_mm_tt(struct mm_struct *mm)
fix_range(mm, 0, STACK_TOP, 0);
seq = atomic_read(&vmchange_seq);
if(current->thread.mode.tt.vm_seq == seq) return;
if(current->thread.mode.tt.vm_seq == seq)
return;
current->thread.mode.tt.vm_seq = seq;
flush_kernel_vm_range(start_vm, end_vm, 0);
flush_tlb_kernel_range_common(start_vm, end_vm);
}
void force_flush_all_tt(void)
{
fix_range(current->mm, 0, STACK_TOP, 1);
flush_kernel_vm_range(start_vm, end_vm, 0);
flush_tlb_kernel_range_common(start_vm, end_vm);
}
/*
* Overrides for Emacs so that we follow Linus's tabbing style.
* Emacs will notice this stuff at the end of the file and automatically
* adjust the settings for this buffer only. This must remain at the end
* of the file.
* ---------------------------------------------------------------------------
* Local variables:
* c-file-style: "linux"
* End:
*/
@@ -468,19 +468,6 @@ __uml_setup("debugtrace", uml_debugtrace_setup,
" the debug switch.\n\n"
);
static int __init uml_honeypot_setup(char *line, int *add)
{
jail_setup("", add);
honeypot = 1;
return 0;
}
__uml_setup("honeypot", uml_honeypot_setup,
"honeypot\n"
" This makes UML put process stacks in the same location as they are\n"
" on the host, allowing expoits such as stack smashes to work against\n"
" UML. This implies 'jail'.\n\n"
);
/*
* Overrides for Emacs so that we follow Linus's tabbing style.
* Emacs will notice this stuff at the end of the file and automatically
@@ -20,8 +20,6 @@ void sig_handler_common_tt(int sig, void *sc_ptr)
struct signal_info *info;
int save_errno = errno, is_user;
unprotect_kernel_mem();
/* This is done because to allow SIGSEGV to be delivered inside a SEGV
* handler. This can happen in copy_user, and if SEGV is disabled,
* the process will die.
@@ -48,7 +46,6 @@ void sig_handler_common_tt(int sig, void *sc_ptr)
}
*r = save_regs;
errno = save_errno;
if(is_user) protect_kernel_mem();
}
/*