Commit 972d8c37 authored by Arun Sharma, committed by David Mosberger

[PATCH] ia64: fix ia32 virtual memory leaks due to partial-page mappings

Certain IA-32 applications that perform mmap/munmap calls which are not
PAGE_SIZE aligned could see temporary memory leaks (recovered at process
exit time), because the kernel did not have enough information to decide
whether a complete page could be unmapped. This patch adds a new data
structure, the "partial page list", which helps the kernel keep track of
precisely which 4k pages are in use by the IA-32 application.

Armed with this data, the kernel can make better decisions at munmap and
mprotect time. No significant performance degradation was observed in the
workloads we tested, and in some cases performance actually improved,
possibly due to the reduced length of the vma list.
Signed-off-by: Arun Sharma <arun.sharma@intel.com>
Signed-off-by: Gordon Jin <gordon.jin@intel.com>
Signed-off-by: David Mosberger <davidm@hpl.hp.com>
parent d69e4b12
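As an illustration of the idea (not part of the patch itself), here is a minimal user-space sketch of the per-4KB-sub-page bitmap bookkeeping the commit message describes; the 16KB host page size and the names pp_demo, demo_set and demo_unset are hypothetical:

#include <stdio.h>
#include <stdint.h>

#define IA64_PAGE_SIZE	16384u		/* example host page size (16KB) */
#define IA32_PAGE_SIZE	4096u
#define SUBPAGES_PER_PAGE (IA64_PAGE_SIZE / IA32_PAGE_SIZE)
#define FULL_MASK ((1u << SUBPAGES_PER_PAGE) - 1)

struct pp_demo {
	uint32_t base;		/* IA-64 page base address */
	uint32_t bitmap;	/* bit i set => i-th 4KB sub-page is mapped */
};

/* mark [start, end) within the page at pp->base as mapped */
static void demo_set(struct pp_demo *pp, uint32_t start, uint32_t end)
{
	for (uint32_t a = start; a < end; a += IA32_PAGE_SIZE)
		pp->bitmap |= 1u << ((a - pp->base) / IA32_PAGE_SIZE);
}

/* mark [start, end) as unmapped; return 1 if the whole IA-64 page is now free */
static int demo_unset(struct pp_demo *pp, uint32_t start, uint32_t end)
{
	for (uint32_t a = start; a < end; a += IA32_PAGE_SIZE)
		pp->bitmap &= ~(1u << ((a - pp->base) / IA32_PAGE_SIZE));
	return pp->bitmap == 0;
}

int main(void)
{
	struct pp_demo pp = { .base = 0x10000, .bitmap = 0 };

	demo_set(&pp, 0x10000, 0x13000);	/* 12KB mmap: three 4KB sub-pages */
	printf("bitmap after mmap: %#x (full page? %d)\n",
	       pp.bitmap, pp.bitmap == FULL_MASK);
	printf("host page free after munmap: %d\n",
	       demo_unset(&pp, 0x10000, 0x13000));
	return 0;
}

In the patch itself, __ia32_set_pp() and __ia32_unset_pp() below perform the analogous bitmap updates on struct partial_page, which is keyed by the IA-64 page base and kept both on a sorted singly linked list and in an rbtree for fast lookup.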
@@ -197,6 +197,10 @@ ia32_setup_arg_pages (struct linux_binprm *bprm, int executable_stack)
}
up_write(&current->mm->mmap_sem);
/* Can't do it in ia64_elf32_init(). Needs to be done before calls to
elf32_map() */
current->thread.ppl = ia32_init_pp_list();
return 0;
}
@@ -371,7 +371,7 @@ ia32_syscall_table:
data8 sys_sched_get_priority_min /* 160 */
data8 sys32_sched_rr_get_interval
data8 compat_sys_nanosleep
data8 sys_mremap
data8 sys32_mremap
data8 sys_setresuid /* 16-bit version */
data8 sys32_getresuid16 /* 16-bit version */ /* 165 */
data8 sys_ni_syscall /* vm86 */
@@ -211,6 +211,8 @@ ia32_cpu_init (void)
static int __init
ia32_init (void)
{
extern kmem_cache_t *partial_page_cachep;
ia32_exec_domain.name = "Linux/x86";
ia32_exec_domain.handler = NULL;
ia32_exec_domain.pers_low = PER_LINUX32;
@@ -218,6 +220,12 @@ ia32_init (void)
ia32_exec_domain.signal_map = default_exec_domain.signal_map;
ia32_exec_domain.signal_invmap = default_exec_domain.signal_invmap;
register_exec_domain(&ia32_exec_domain);
partial_page_cachep = kmem_cache_create("partial_page_cache",
sizeof(struct partial_page), 0, 0, NULL, NULL);
if (!partial_page_cachep)
panic("Cannot create partial page SLAB cache");
return 0;
}
@@ -9,6 +9,7 @@
#include <linux/binfmts.h>
#include <linux/compat.h>
#include <linux/rbtree.h>
#include <asm/processor.h>
@@ -22,6 +23,30 @@
#define IA32_PAGE_ALIGN(addr) (((addr) + IA32_PAGE_SIZE - 1) & IA32_PAGE_MASK)
#define IA32_CLOCKS_PER_SEC 100 /* Cast in stone for IA32 Linux */
/*
* partially mapped pages provide precise accounting of which 4k sub pages
* are mapped and which ones are not, thereby improving IA-32 compatibility.
*/
struct partial_page {
struct partial_page *next; /* linked list, sorted by address */
struct rb_node pp_rb;
/* 64K is the largest "normal" page supported by ia64 ABI. So 4K*32
* should suffice.*/
unsigned int bitmap;
unsigned int base;
};
struct partial_page_list {
struct partial_page *pp_head; /* list head, points to the lowest
* addressed partial page */
struct rb_root ppl_rb;
struct partial_page *pp_hint; /* pp_hint->next is the last
* accessed partial page */
atomic_t pp_count; /* reference count */
};
struct partial_page_list* ia32_init_pp_list (void);
/* sigcontext.h */
/*
* As documented in the iBCS2 standard..
@@ -8,6 +8,7 @@
* Copyright (C) 1997 David S. Miller (davem@caip.rutgers.edu)
* Copyright (C) 2000-2003 Hewlett-Packard Co
* David Mosberger-Tang <davidm@hpl.hp.com>
* Copyright (C) 2004 Gordon Jin <gordon.jin@intel.com>
*
* These routines maintain argument size conversion between 32bit and 64bit
* environment.
@@ -48,6 +49,7 @@
#include <linux/ipc.h>
#include <linux/compat.h>
#include <linux/vfs.h>
#include <linux/mman.h>
#include <asm/intrinsics.h>
#include <asm/semaphore.h>
@@ -250,6 +252,374 @@ mmap_subpage (struct file *file, unsigned long start, unsigned long end, int pro
return ret;
}
/* SLAB cache for partial_page structures */
kmem_cache_t *partial_page_cachep;
/*
* Initialize a partial_page_list.
* Returns NULL if the kmalloc fails.
*/
struct partial_page_list*
ia32_init_pp_list(void)
{
struct partial_page_list *p;
if ((p = kmalloc(sizeof(*p), GFP_KERNEL)) == NULL)
return p;
p->pp_head = 0;
p->ppl_rb = RB_ROOT;
p->pp_hint = 0;
atomic_set(&p->pp_count, 1);
return p;
}
/*
* Search partial page list @ppl for the partial page starting at @start.
* If it is found, return the partial page.
* Otherwise, return NULL and set @pprev, @rb_link and @rb_parent for use
* by a later ia32_insert_pp().
*/
static struct partial_page *
ia32_find_pp(struct partial_page_list *ppl, unsigned int start,
struct partial_page **pprev, struct rb_node ***rb_link,
struct rb_node **rb_parent)
{
struct partial_page *pp;
struct rb_node **__rb_link, *__rb_parent, *rb_prev;
pp = ppl->pp_hint;
if (pp && pp->base == start)
return pp;
__rb_link = &ppl->ppl_rb.rb_node;
rb_prev = __rb_parent = NULL;
while (*__rb_link) {
__rb_parent = *__rb_link;
pp = rb_entry(__rb_parent, struct partial_page, pp_rb);
if (pp->base == start) {
ppl->pp_hint = pp;
return pp;
} else if (pp->base < start) {
rb_prev = __rb_parent;
__rb_link = &__rb_parent->rb_right;
} else {
__rb_link = &__rb_parent->rb_left;
}
}
*rb_link = __rb_link;
*rb_parent = __rb_parent;
*pprev = NULL;
if (rb_prev)
*pprev = rb_entry(rb_prev, struct partial_page, pp_rb);
return NULL;
}
/*
* insert @pp into @ppl.
*/
static void
ia32_insert_pp(struct partial_page_list *ppl, struct partial_page *pp,
struct partial_page *prev, struct rb_node **rb_link,
struct rb_node *rb_parent)
{
/* link list */
if (prev) {
pp->next = prev->next;
prev->next = pp;
} else {
ppl->pp_head = pp;
if (rb_parent)
pp->next = rb_entry(rb_parent,
struct partial_page, pp_rb);
else
pp->next = NULL;
}
/* link rb */
rb_link_node(&pp->pp_rb, rb_parent, rb_link);
rb_insert_color(&pp->pp_rb, &ppl->ppl_rb);
ppl->pp_hint = pp;
}
/*
* delete @pp from partial page list @ppl.
*/
static void
ia32_delete_pp(struct partial_page_list *ppl, struct partial_page *pp,
struct partial_page *prev)
{
if (prev) {
prev->next = pp->next;
if (ppl->pp_hint == pp)
ppl->pp_hint = prev;
} else {
ppl->pp_head = pp->next;
if (ppl->pp_hint == pp)
ppl->pp_hint = pp->next;
}
rb_erase(&pp->pp_rb, &ppl->ppl_rb);
kmem_cache_free(partial_page_cachep, pp);
}
static struct partial_page *
pp_prev(struct partial_page *pp)
{
struct rb_node *prev = rb_prev(&pp->pp_rb);
if (prev)
return rb_entry(prev, struct partial_page, pp_rb);
else
return NULL;
}
/*
* Set the range between @start and @end in bitmap.
* @start and @end should be IA32 page aligned and in the same IA64 page.
*/
static int
__ia32_set_pp(unsigned int start, unsigned int end)
{
struct partial_page *pp, *prev;
struct rb_node ** rb_link, *rb_parent;
unsigned int pstart, start_bit, end_bit, i;
pstart = PAGE_START(start);
start_bit = (start % PAGE_SIZE) / IA32_PAGE_SIZE;
end_bit = (end % PAGE_SIZE) / IA32_PAGE_SIZE;
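/* an @end exactly on an IA-64 page boundary computes to 0; treat it as the end of the page */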
if (end_bit == 0)
end_bit = PAGE_SIZE / IA32_PAGE_SIZE;
pp = ia32_find_pp(current->thread.ppl, pstart, &prev,
&rb_link, &rb_parent);
if (pp) {
for (i = start_bit; i < end_bit; i++)
set_bit(i, &pp->bitmap);
/*
* Check: if this partial page has been set to a full page,
* then delete it.
*/
if (find_first_zero_bit(&pp->bitmap, sizeof(pp->bitmap)*8) >=
PAGE_SIZE/IA32_PAGE_SIZE) {
ia32_delete_pp(current->thread.ppl, pp, pp_prev(pp));
}
return 0;
}
/* allocate a new partial_page */
pp = kmem_cache_alloc(partial_page_cachep, GFP_KERNEL);
if (!pp)
return -ENOMEM;
pp->base = pstart;
pp->bitmap = 0;
for (i=start_bit; i<end_bit; i++)
set_bit(i, &(pp->bitmap));
pp->next = NULL;
ia32_insert_pp(current->thread.ppl, pp, prev, rb_link, rb_parent);
return 0;
}
/*
* Locking version of __ia32_set_pp():
* takes mm->mmap_sem to protect the partial_page_list.
*/
static void
ia32_set_pp(unsigned int start, unsigned int end)
{
down_write(&current->mm->mmap_sem);
{
__ia32_set_pp(start, end);
}
up_write(&current->mm->mmap_sem);
}
/*
* Unset the range between @start and @end in the bitmap.
* @start and @end should be IA32 page aligned and in the same IA64 page.
* After clearing the bits, if the bitmap is 0, delete the partial page and
* return 1; otherwise return 0.
* If the partial page is not found in the list:
*   if a vma covers the page, turn the full page into a partial page;
*   otherwise return -ENOMEM.
*/
static int
__ia32_unset_pp(unsigned int start, unsigned int end)
{
struct partial_page *pp, *prev;
struct rb_node ** rb_link, *rb_parent;
unsigned int pstart, start_bit, end_bit, i;
struct vm_area_struct *vma;
pstart = PAGE_START(start);
start_bit = (start % PAGE_SIZE) / IA32_PAGE_SIZE;
end_bit = (end % PAGE_SIZE) / IA32_PAGE_SIZE;
if (end_bit == 0)
end_bit = PAGE_SIZE / IA32_PAGE_SIZE;
pp = ia32_find_pp(current->thread.ppl, pstart, &prev,
&rb_link, &rb_parent);
if (pp) {
for (i = start_bit; i < end_bit; i++)
clear_bit(i, &pp->bitmap);
if (pp->bitmap == 0) {
ia32_delete_pp(current->thread.ppl, pp, pp_prev(pp));
return 1;
}
return 0;
}
vma = find_vma(current->mm, pstart);
if (!vma || vma->vm_start > pstart) {
return -ENOMEM;
}
/* allocate a new partial_page */
pp = kmem_cache_alloc(partial_page_cachep, GFP_KERNEL);
if (!pp)
return -ENOMEM;
pp->base = pstart;
pp->bitmap = 0;
for (i = 0; i < start_bit; i++)
set_bit(i, &(pp->bitmap));
for (i = end_bit; i < PAGE_SIZE / IA32_PAGE_SIZE; i++)
set_bit(i, &(pp->bitmap));
pp->next = NULL;
ia32_insert_pp(current->thread.ppl, pp, prev, rb_link, rb_parent);
return 0;
}
/*
* Locking version of __ia32_unset_pp():
* takes mm->mmap_sem to protect the partial_page_list.
*/
static int
ia32_unset_pp(unsigned int start, unsigned int end)
{
int ret;
down_write(&current->mm->mmap_sem);
{
ret = __ia32_unset_pp(start, end);
}
up_write(&current->mm->mmap_sem);
return ret;
}
/*
* Compare the range between @start and @end with the bitmap in the partial page.
* For example, suppose the range covers the 1st and 2nd 4K pages:
* return 0 if the range matches the bitmap exactly, i.e. bitmap = 00000011;
* return 1 if the range doesn't cover the whole bitmap, e.g. bitmap = 00001111;
* return -ENOMEM if the range exceeds the bitmap, e.g. bitmap = 00000001 or
* bitmap = 00000101.
*/
static int
ia32_compare_pp(unsigned int start, unsigned int end)
{
struct partial_page *pp, *prev;
struct rb_node ** rb_link, *rb_parent;
unsigned int pstart, start_bit, end_bit, size;
unsigned int first_bit, next_zero_bit; /* the first range in bitmap */
pstart = PAGE_START(start);
pp = ia32_find_pp(current->thread.ppl, pstart, &prev,
&rb_link, &rb_parent);
if (!pp)
return 1;
start_bit = (start % PAGE_SIZE) / IA32_PAGE_SIZE;
end_bit = (end % PAGE_SIZE) / IA32_PAGE_SIZE;
size = sizeof(pp->bitmap) * 8;
first_bit = find_first_bit(&pp->bitmap, size);
next_zero_bit = find_next_zero_bit(&pp->bitmap, size, first_bit);
if ((start_bit < first_bit) || (end_bit > next_zero_bit)) {
/* exceeds the first range in bitmap */
return -ENOMEM;
} else if ((start_bit == first_bit) && (end_bit == next_zero_bit)) {
first_bit = find_next_bit(&pp->bitmap, size, next_zero_bit);
if ((next_zero_bit < first_bit) && (first_bit < size))
return 1; /* has next range */
else
return 0; /* no next range */
} else
return 1;
}
static void
ia32_do_drop_pp_list(struct partial_page_list *ppl)
{
struct partial_page *pp = ppl->pp_head;
while (pp) {
struct partial_page *next = pp->next;
kmem_cache_free(partial_page_cachep, pp);
pp = next;
}
kfree(ppl);
}
void
ia32_drop_partial_page_list(struct partial_page_list* ppl)
{
if (ppl && atomic_dec_and_test(&ppl->pp_count))
ia32_do_drop_pp_list(ppl);
}
/*
* Copy current->thread.ppl to ppl (already initialized).
*/
static int
ia32_do_copy_pp_list(struct partial_page_list *ppl)
{
struct partial_page *pp, *tmp, *prev;
struct rb_node **rb_link, *rb_parent;
ppl->pp_head = NULL;
ppl->pp_hint = NULL;
ppl->ppl_rb = RB_ROOT;
rb_link = &ppl->ppl_rb.rb_node;
rb_parent = NULL;
prev = NULL;
for (pp = current->thread.ppl->pp_head; pp; pp = pp->next) {
tmp = kmem_cache_alloc(partial_page_cachep, GFP_KERNEL);
if (!tmp)
return -ENOMEM;
*tmp = *pp;
ia32_insert_pp(ppl, tmp, prev, rb_link, rb_parent);
prev = tmp;
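/* the source list is sorted by address, so each copy becomes the right child of the previous node */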
rb_link = &tmp->pp_rb.rb_right;
rb_parent = &tmp->pp_rb;
}
return 0;
}
int
ia32_copy_partial_page_list(struct task_struct *p, unsigned long clone_flags)
{
int retval = 0;
if (clone_flags & CLONE_VM) {
atomic_inc(&current->thread.ppl->pp_count);
p->thread.ppl = current->thread.ppl;
} else {
p->thread.ppl = ia32_init_pp_list();
if (!p->thread.ppl)
return -ENOMEM;
down_write(&current->mm->mmap_sem);
{
retval = ia32_do_copy_pp_list(p->thread.ppl);
}
up_write(&current->mm->mmap_sem);
}
return retval;
}
static unsigned long
emulate_mmap (struct file *file, unsigned long start, unsigned long len, int prot, int flags,
loff_t off)
@@ -274,7 +644,7 @@ emulate_mmap (struct file *file, unsigned long start, unsigned long len, int pro
return ret;
pstart += PAGE_SIZE;
if (pstart >= pend)
return start; /* done */
goto out; /* done */
}
if (end < pend) {
if (flags & MAP_SHARED)
@@ -287,7 +657,7 @@ emulate_mmap (struct file *file, unsigned long start, unsigned long len, int pro
return ret;
pend -= PAGE_SIZE;
if (pstart >= pend)
return start; /* done */
goto out; /* done */
}
} else {
/*
@@ -341,6 +711,19 @@ emulate_mmap (struct file *file, unsigned long start, unsigned long len, int pro
if (!(prot & PROT_WRITE) && sys_mprotect(pstart, pend - pstart, prot) < 0)
return -EINVAL;
}
out:
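/* record in the partial-page list which 4KB sub-pages of any partially covered IA-64 page are now mapped */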
if (end < PAGE_ALIGN(start)) {
ia32_set_pp((unsigned int)start, (unsigned int)end);
} else {
if (start > PAGE_START(start)) {
ia32_set_pp((unsigned int)start, (unsigned int)PAGE_ALIGN(start));
}
if (end < PAGE_ALIGN(end)) {
ia32_set_pp((unsigned int)PAGE_START(end), (unsigned int)end);
}
}
return start;
}
@@ -472,17 +855,51 @@ sys32_mmap2 (unsigned int addr, unsigned int len, unsigned int prot, unsigned in
asmlinkage long
sys32_munmap (unsigned int start, unsigned int len)
{
unsigned int end = start + len;
unsigned int end, pstart, pend;
long ret;
end = start + len;
#if PAGE_SHIFT <= IA32_PAGE_SHIFT
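/* the host page size is no larger than 4KB, so sys_munmap can handle the request directly */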
ret = sys_munmap(start, end - start);
#else
if (OFFSET4K(start))
return -EINVAL;
end = IA32_PAGE_ALIGN(end);
if (start >= end)
return -EINVAL;
pstart = PAGE_ALIGN(start);
pend = PAGE_START(end);
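/* handle any partially covered IA-64 page at either end via the partial-page list; only fully covered host pages are passed to sys_munmap below */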
if (end < PAGE_ALIGN(start)) {
ret = ia32_unset_pp((unsigned int)start, (unsigned int)end);
if (ret == 1) {
start = PAGE_START(start);
end = PAGE_ALIGN(end);
} else
return ret;
} else {
if (offset_in_page(start)) {
ret = ia32_unset_pp((unsigned int)start, (unsigned int)PAGE_ALIGN(start));
if (ret == 1)
start = PAGE_START(start);
else if (ret == 0)
start = PAGE_ALIGN(start);
else
return ret;
}
if (offset_in_page(end)) {
ret = ia32_unset_pp((unsigned int)PAGE_START(end), (unsigned int)end);
if (ret == 1)
end = PAGE_ALIGN(end);
else if (ret == 0)
end = PAGE_START(end);
else
return ret;
}
}
if (start >= end)
return 0;
@@ -540,6 +957,51 @@ sys32_mprotect (unsigned int start, unsigned int len, int prot)
down(&ia32_mmap_sem);
{
if (end < PAGE_ALIGN(start)) {
/* start and end are in the same IA64 (partial) page */
retval = ia32_compare_pp((unsigned int)start, (unsigned int)end);
if (retval < 0) {
/* start~end beyond the mapped area */
goto out;
}
else if (retval == 0) {
/* start~end fits the mapped area well */
start = PAGE_START(start);
end = PAGE_ALIGN(end);
}
else {
/* start~end doesn't cover all mapped area in this IA64 page */
/* So call mprotect_subpage to deal with new prot issue */
goto subpage;
}
} else {
if (offset_in_page(start)) {
retval = ia32_compare_pp((unsigned int)start, (unsigned int)PAGE_ALIGN(start));
if (retval < 0) {
goto out;
}
else if (retval == 0) {
start = PAGE_START(start);
}
else {
goto subpage;
}
}
if (offset_in_page(end)) {
retval = ia32_compare_pp((unsigned int)PAGE_START(end), (unsigned int)end);
if (retval < 0) {
goto out;
}
else if (retval == 0) {
end = PAGE_ALIGN(end);
}
else {
goto subpage;
}
}
}
subpage:
if (offset_in_page(start)) {
/* start address is 4KB aligned but not page aligned. */
retval = mprotect_subpage(PAGE_START(start), prot);
@@ -567,6 +1029,63 @@ sys32_mprotect (unsigned int start, unsigned int len, int prot)
#endif
}
asmlinkage long
sys32_mremap (unsigned int addr, unsigned int old_len, unsigned int new_len,
unsigned int flags, unsigned int new_addr)
{
long ret;
unsigned int old_end, new_end;
#if PAGE_SHIFT <= IA32_PAGE_SHIFT
ret = sys_mremap(addr, old_len, new_len, flags, new_addr);
#else
if (OFFSET4K(addr))
return -EINVAL;
old_len = IA32_PAGE_ALIGN(old_len);
new_len = IA32_PAGE_ALIGN(new_len);
old_end = addr + old_len;
new_end = addr + new_len;
if (!new_len)
return -EINVAL;
if ((flags & MREMAP_FIXED) && (OFFSET4K(new_addr)))
return -EINVAL;
if (old_len >= new_len) {
ret = sys32_munmap(addr + new_len, old_len - new_len);
if (ret && old_len != new_len)
return ret;
ret = addr;
if (!(flags & MREMAP_FIXED) || (new_addr == addr))
return ret;
old_len = new_len;
}
addr = PAGE_START(addr);
old_len = PAGE_ALIGN(old_end) - addr;
new_len = PAGE_ALIGN(new_end) - addr;
down(&ia32_mmap_sem);
{
ret = sys_mremap(addr, old_len, new_len, flags, new_addr);
}
up(&ia32_mmap_sem);
if ((ret >= 0) && (old_len < new_len)) { /* mremap expand successfully */
if (new_end <= PAGE_ALIGN(old_end)) {
/* old_end and new_end are in the same IA64 page */
ia32_set_pp(old_end, new_end);
} else {
ia32_set_pp((unsigned int)PAGE_START(new_end), new_end);
ia32_set_pp(old_end, (unsigned int)PAGE_ALIGN(old_end));
}
}
#endif
return ret;
}
asmlinkage long
sys32_pipe (int *fd)
{
@@ -439,6 +439,10 @@ copy_thread (int nr, unsigned long clone_flags,
ia32_save_state(p);
if (clone_flags & CLONE_SETTLS)
retval = ia32_clone_tls(p, child_ptregs);
/* Copy partially mapped page list */
if (!retval)
retval = ia32_copy_partial_page_list(p, clone_flags);
}
#endif
@@ -672,6 +676,10 @@ flush_thread (void)
/* drop floating-point and debug-register state if it exists: */
current->thread.flags &= ~(IA64_THREAD_FPH_VALID | IA64_THREAD_DBG_VALID);
ia64_drop_fpu(current);
#ifdef CONFIG_IA32_SUPPORT
if (IS_IA32_PROCESS(ia64_task_regs(current)))
ia32_drop_partial_page_list(current->thread.ppl);
#endif
}
/*
@@ -691,6 +699,10 @@ exit_thread (void)
if (current->thread.flags & IA64_THREAD_DBG_VALID)
pfm_release_debug_registers(current);
#endif
#ifdef CONFIG_IA32_SUPPORT
if (IS_IA32_PROCESS(ia64_task_regs(current)))
ia32_drop_partial_page_list(current->thread.ppl);
#endif
}
unsigned long
@@ -18,6 +18,8 @@ extern void ia32_gdt_init (void);
extern int ia32_exception (struct pt_regs *regs, unsigned long isr);
extern int ia32_intercept (struct pt_regs *regs, unsigned long isr);
extern int ia32_clone_tls (struct task_struct *child, struct pt_regs *childregs);
extern int ia32_copy_partial_page_list (struct task_struct *, unsigned long);
extern void ia32_drop_partial_page_list (struct partial_page_list *);
# endif /* !CONFIG_IA32_SUPPORT */
@@ -230,6 +230,7 @@ struct desc_struct {
#define TLS_SIZE (GDT_ENTRY_TLS_ENTRIES * 8)
struct partial_page_list;
#endif
struct thread_struct {
@@ -251,6 +252,7 @@ struct thread_struct {
__u64 fdr; /* IA32 fp except. data reg */
__u64 old_k1; /* old value of ar.k1 */
__u64 old_iob; /* old IOBase value */
struct partial_page_list *ppl; /* partial page list for 4K page size issue */
/* cached TLS descriptors. */
struct desc_struct tls_array[GDT_ENTRY_TLS_ENTRIES];
@@ -260,7 +262,8 @@ struct thread_struct {
.fir = 0, \
.fdr = 0, \
.old_k1 = 0, \
.old_iob = 0,
.old_iob = 0, \
.ppl = 0,
#else
# define INIT_THREAD_IA32
#endif /* CONFIG_IA32_SUPPORT */