Commit c9d3808f authored by Andrew Morton, committed by Christoph Hellwig

[PATCH] hugetlb pages

Rohit Seth's ia32 huge tlb pages patch.

Anton Blanchard took a look at this today; he seemed happy
with it and said he could borrow bits.
parent fca174cc
@@ -25,6 +25,15 @@ CONFIG_SMP
   If you don't know what to do here, say N.
 
+CONFIG_HUGETLB_PAGE
+  This enables support for huge pages.  User space applications
+  can make use of this support with the sys_alloc_hugepages and
+  sys_free_hugepages system calls.  If your applications are
+  huge page aware and your processor (Pentium or later for x86)
+  supports this, then say Y here.
+
+  Otherwise, say N.
+
 CONFIG_PREEMPT
   This option reduces the latency of the kernel when reacting to
   real-time or interactive events by allowing a low priority process to
......
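For illustration, a user-space caller might reach the two new system calls through syscall(2). This is a minimal sketch, not part of the patch: the slot numbers 250 and 251 are taken from the sys_call_table change below, and it assumes an ia32 kernel built with CONFIG_HUGETLB_PAGE.

	/* Minimal user-space sketch (illustrative, not part of the patch).
	 * Assumes the ia32 syscall slots 250/251 assigned by this patch. */
	#include <stdio.h>
	#include <sys/mman.h>
	#include <unistd.h>

	#define __NR_alloc_hugepages 250	/* slot added by this patch */
	#define __NR_free_hugepages  251

	int main(void)
	{
		unsigned long len = 4UL * 1024 * 1024;	/* one non-PAE huge page */
		/* key = 0, addr = 0: let the kernel pick an HPAGE_SIZE-aligned range */
		long addr = syscall(__NR_alloc_hugepages, 0, 0UL, len,
				    PROT_READ | PROT_WRITE, 0);

		if (addr == -1) {
			perror("sys_alloc_hugepages");
			return 1;
		}
		((char *)addr)[0] = 1;	/* touch the mapping */
		return syscall(__NR_free_hugepages, (unsigned long)addr) ? 1 : 0;
	}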
@@ -154,6 +154,8 @@ if [ "$CONFIG_MWINCHIP3D" = "y" ]; then
    define_bool CONFIG_X86_OOSTORE y
 fi
 
+bool 'IA-32 Huge TLB Page Support (if available on processor)' CONFIG_HUGETLB_PAGE
+
 bool 'Symmetric multi-processing support' CONFIG_SMP
 bool 'Preemptible Kernel' CONFIG_PREEMPT
 if [ "$CONFIG_SMP" != "y" ]; then
......
@@ -759,8 +759,8 @@ ENTRY(sys_call_table)
 	.long sys_io_getevents
 	.long sys_io_submit
 	.long sys_io_cancel
-	.long sys_ni_syscall	/* 250 */	/* sys_alloc_hugepages */
-	.long sys_ni_syscall	/* sys_free_hugepages */
+	.long sys_alloc_hugepages	/* 250 */
+	.long sys_free_hugepages
 	.long sys_exit_group
 
 	.rept NR_syscalls-(.-sys_call_table)/4
......
@@ -246,3 +246,94 @@ asmlinkage int sys_olduname(struct oldold_utsname * name)
 
 	return error;
 }
+
+#ifdef CONFIG_HUGETLB_PAGE
+#define HPAGE_ALIGN(x)	(((unsigned long)x + (HPAGE_SIZE - 1)) & HPAGE_MASK)
+extern long sys_munmap(unsigned long, size_t);
+
+/* get_addr returns a currently unused virtual range in the current
+ * process's address space.  On success the returned address is
+ * HPAGE_SIZE aligned; the generic kernel routines only guarantee
+ * that an allocated address is PAGE_SIZE aligned.
+ */
+static unsigned long
+get_addr(unsigned long addr, unsigned long len)
+{
+	struct vm_area_struct *vma;
+
+	if (addr) {
+		addr = HPAGE_ALIGN(addr);
+		vma = find_vma(current->mm, addr);
+		if (((TASK_SIZE - len) >= addr) &&
+		    (!vma || addr + len <= vma->vm_start))
+			goto found_addr;
+	}
+	addr = HPAGE_ALIGN(TASK_UNMAPPED_BASE);
+	for (vma = find_vma(current->mm, addr); ; vma = vma->vm_next) {
+		if (TASK_SIZE - len < addr)
+			return -ENOMEM;
+		if (!vma || ((addr + len) < vma->vm_start))
+			goto found_addr;
+		addr = vma->vm_end;
+	}
+found_addr:
+	addr = HPAGE_ALIGN(addr);
+	return addr;
+}
+
+asmlinkage unsigned long
+sys_alloc_hugepages(int key, unsigned long addr, unsigned long len, int prot, int flag)
+{
+	struct mm_struct *mm = current->mm;
+	unsigned long raddr;
+	int retval = 0;
+	extern int alloc_hugetlb_pages(int, unsigned long, unsigned long, int, int);
+
+	if (!(cpu_has_pse))
+		return -EINVAL;
+	if (key < 0)
+		return -EINVAL;
+	if (len & (HPAGE_SIZE - 1))
+		return -EINVAL;
+	down_write(&mm->mmap_sem);
+	raddr = get_addr(addr, len);
+	if (raddr == -ENOMEM)
+		goto raddr_out;
+	retval = alloc_hugetlb_pages(key, raddr, len, prot, flag);
+raddr_out:
+	up_write(&mm->mmap_sem);
+	if (retval < 0)
+		return (unsigned long) retval;
+	return raddr;
+}
+
+asmlinkage int
+sys_free_hugepages(unsigned long addr)
+{
+	struct mm_struct *mm = current->mm;
+	struct vm_area_struct *vma;
+	int retval;
+	extern int free_hugepages(struct vm_area_struct *);
+
+	vma = find_vma(current->mm, addr);
+	if ((!vma) || (!is_vm_hugetlb_page(vma)) || (vma->vm_start != addr))
+		return -EINVAL;
+	down_write(&mm->mmap_sem);
+	spin_lock(&mm->page_table_lock);
+	retval = free_hugepages(vma);
+	spin_unlock(&mm->page_table_lock);
+	up_write(&mm->mmap_sem);
+	return retval;
+}
+#else
+asmlinkage unsigned long
+sys_alloc_hugepages(int key, unsigned long addr, size_t len, int prot, int flag)
+{
+	return -ENOSYS;
+}
+asmlinkage int
+sys_free_hugepages(unsigned long addr)
+{
+	return -ENOSYS;
+}
+#endif
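To make the HPAGE_ALIGN arithmetic in get_addr() concrete, here is a worked example, assuming the non-PAE HPAGE_SIZE of 4 MB (so HPAGE_MASK = ~0x3FFFFF):

	/* HPAGE_ALIGN rounds up to the next huge page boundary:
	 *   HPAGE_ALIGN(0x08049000) = (0x08049000 + 0x3FFFFF) & 0xFFC00000
	 *                           =  0x08448FFF             & 0xFFC00000
	 *                           =  0x08400000
	 * Already-aligned addresses pass through unchanged:
	 *   HPAGE_ALIGN(0x08400000) =  0x08400000
	 */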
@@ -12,5 +12,6 @@ O_TARGET := mm.o
 obj-y	 := init.o pgtable.o fault.o ioremap.o extable.o pageattr.o
 obj-$(CONFIG_DISCONTIGMEM)	+= discontig.o
 export-objs := pageattr.o
+obj-$(CONFIG_HUGETLB_PAGE)	+= hugetlbpage.o
 
 include $(TOPDIR)/Rules.make
This diff is collapsed.
@@ -431,6 +431,13 @@ static void __init set_max_mapnr_init(void)
 extern void set_max_mapnr_init(void);
 #endif /* !CONFIG_DISCONTIGMEM */
 
+#ifdef CONFIG_HUGETLB_PAGE
+long htlbpagemem = 0;
+int htlbpage_max;
+long htlbzone_pages;
+extern struct list_head htlbpage_freelist;
+#endif
+
 void __init mem_init(void)
 {
 	extern int ppro_with_ram_bug(void);
@@ -493,6 +500,30 @@ void __init mem_init(void)
 #ifndef CONFIG_SMP
 	zap_low_mappings();
 #endif
+#ifdef CONFIG_HUGETLB_PAGE
+	{
+		long i, j;
+		struct page *page, *map;
+		/* For now, reserve a quarter of memory for hugetlb pages. */
+		htlbzone_pages = (max_low_pfn >> ((HPAGE_SHIFT - PAGE_SHIFT) + 2));
+		/* Will make this a kernel command line option later. */
+		INIT_LIST_HEAD(&htlbpage_freelist);
+		for (i = 0; i < htlbzone_pages; i++) {
+			page = alloc_pages(GFP_ATOMIC, HUGETLB_PAGE_ORDER);
+			if (page == NULL)
+				break;
+			map = page;
+			for (j = 0; j < (HPAGE_SIZE / PAGE_SIZE); j++) {
+				SetPageReserved(map);
+				map++;
+			}
+			list_add(&page->list, &htlbpage_freelist);
+		}
+		printk("Total Huge_TLB_Page memory pages allocated %ld\n", i);
+		htlbzone_pages = htlbpagemem = i;
+		htlbpage_max = i;
+	}
+#endif
 }
 
 #if CONFIG_X86_PAE
......
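The sizing above reserves a quarter of low memory: shifting by (HPAGE_SHIFT - PAGE_SHIFT) converts the page count into huge pages, and the extra "+ 2" divides that by four. Illustrative arithmetic for a non-PAE machine with 1 GB of low memory:

	/* Illustrative numbers (1 GB low memory, 4 KB base pages, 4 MB huge pages):
	 *   max_low_pfn    = 1 GB / 4 KB        = 262144
	 *   htlbzone_pages = 262144 >> (10 + 2) = 64 huge pages
	 *   reserved       = 64 * 4 MB          = 256 MB  (a quarter of 1 GB)
	 */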
@@ -487,7 +487,18 @@ int proc_pid_statm(struct task_struct *task, char * buffer)
 	while (vma) {
 		pgd_t *pgd = pgd_offset(mm, vma->vm_start);
 		int pages = 0, shared = 0, dirty = 0, total = 0;
+		if (is_vm_hugetlb_page(vma)) {
+			int num_pages = ((vma->vm_end - vma->vm_start) / PAGE_SIZE);
+			resident += num_pages;
+			if (!(vma->vm_flags & VM_DONTCOPY))
+				share += num_pages;
+			if (vma->vm_flags & VM_WRITE)
+				dt += num_pages;
+			drs += num_pages;
+			vma = vma->vm_next;
+			continue;
+		}
 		statm_pgd_range(pgd, vma->vm_start, vma->vm_end, &pages, &shared, &dirty, &total);
 		resident += pages;
 		share += shared;
......
@@ -206,6 +206,19 @@ static int meminfo_read_proc(char *page, char **start, off_t off,
 		non_flushes
 		);
+#ifdef CONFIG_HUGETLB_PAGE
+	{
+		extern unsigned long htlbpagemem, htlbzone_pages;
+
+		len += sprintf(page + len,
+			"HugePages: %8lu\n"
+			"Available: %8lu\n"
+			"Size:      %8lu kB\n",
+			htlbzone_pages,
+			htlbpagemem,
+			HPAGE_SIZE / 1024);
+	}
+#endif
 	return proc_calc_metrics(page, start, off, count, eof, len);
 #undef K
 }
......
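Given the format strings above, the new /proc/meminfo block would render like this (the numbers are illustrative: a pool of 64 huge pages with 60 still free, non-PAE 4 MB pages):

	HugePages:       64
	Available:       60
	Size:          4096 kB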
@@ -44,14 +44,22 @@ typedef struct { unsigned long pte_low, pte_high; } pte_t;
 typedef struct { unsigned long long pmd; } pmd_t;
 typedef struct { unsigned long long pgd; } pgd_t;
 #define pte_val(x)	((x).pte_low | ((unsigned long long)(x).pte_high << 32))
+#define HPAGE_SHIFT	21
 #else
 typedef struct { unsigned long pte_low; } pte_t;
 typedef struct { unsigned long pmd; } pmd_t;
 typedef struct { unsigned long pgd; } pgd_t;
 #define pte_val(x)	((x).pte_low)
+#define HPAGE_SHIFT	22
 #endif
 #define PTE_MASK	PAGE_MASK
 
+#ifdef CONFIG_HUGETLB_PAGE
+#define HPAGE_SIZE	((1UL) << HPAGE_SHIFT)
+#define HPAGE_MASK	(~(HPAGE_SIZE - 1))
+#define HUGETLB_PAGE_ORDER	(HPAGE_SHIFT - PAGE_SHIFT)
+#endif
+
 typedef struct { unsigned long pgprot; } pgprot_t;
 
 #define pmd_val(x)	((x).pmd)
......
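Spelled out, the constants above give the following values (PAGE_SHIFT is 12 on ia32). This is also why the mmzone.h change further down raises MAX_ORDER from 10 to 11: a non-PAE huge page is an order-10 buddy allocation, and the allocator requires order < MAX_ORDER.

	/* Derived directly from the definitions above:
	 *                       non-PAE (HPAGE_SHIFT 22)   PAE (HPAGE_SHIFT 21)
	 * HPAGE_SIZE            1UL << 22 = 4 MB           1UL << 21 = 2 MB
	 * HPAGE_MASK            ~0x3FFFFF                  ~0x1FFFFF
	 * HUGETLB_PAGE_ORDER    22 - 12 = 10               21 - 12 = 9
	 */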
@@ -104,6 +104,7 @@ struct vm_area_struct {
 #define VM_DONTEXPAND	0x00040000	/* Cannot expand with mremap() */
 #define VM_RESERVED	0x00080000	/* Don't unmap it from swap_out */
 #define VM_ACCOUNT	0x00100000	/* Is a VM accounted object */
+#define VM_HUGETLB	0x00400000	/* Huge TLB Page VM */
 
 #define VM_STACK_FLAGS	(0x00000100 | VM_DATA_DEFAULT_FLAGS | VM_ACCOUNT)
 
@@ -377,6 +378,20 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm, unsigned long
 int __set_page_dirty_buffers(struct page *page);
 int __set_page_dirty_nobuffers(struct page *page);
 
+#ifdef CONFIG_HUGETLB_PAGE
+#define is_vm_hugetlb_page(vma)	(vma->vm_flags & VM_HUGETLB)
+extern int copy_hugetlb_page_range(struct mm_struct *, struct mm_struct *, struct vm_area_struct *);
+extern int follow_hugetlb_page(struct mm_struct *, struct vm_area_struct *, struct page **, struct vm_area_struct **, unsigned long *, int *, int);
+extern int free_hugepages(struct vm_area_struct *);
+#else
+#define is_vm_hugetlb_page(vma)	(0)
+#define follow_hugetlb_page(mm, vma, pages, vmas, start, len, i)	(0)
+#define copy_hugetlb_page_range(dst, src, vma)	(0)
+#define free_hugepages(mpnt)	do { } while (0)
+#endif
+
 /*
  * If the mapping doesn't provide a set_page_dirty a_op, then
  * just fall through and assume that it wants buffer_heads.
......
@@ -16,7 +16,7 @@
  */
 #ifndef CONFIG_FORCE_MAX_ZONEORDER
-#define MAX_ORDER 10
+#define MAX_ORDER 11
 #else
 #define MAX_ORDER CONFIG_FORCE_MAX_ZONEORDER
 #endif
......
@@ -128,6 +128,7 @@ enum
 	KERN_TAINTED=53,	/* int: various kernel tainted flags */
 	KERN_CADPID=54,		/* int: PID of the process to notify on CAD */
 	KERN_PIDMAX=55,		/* int: PID # limit */
+	KERN_HUGETLB_PAGE_NUM=56,	/* int: Number of available Huge Pages */
 };
......
@@ -98,6 +98,11 @@ int proc_dol2crvec(ctl_table *table, int write, struct file *filp,
 extern int acct_parm[];
 #endif
 
+#ifdef CONFIG_HUGETLB_PAGE
+extern int htlbpage_max;
+extern int set_hugetlb_mem_size(int);
+#endif
+
 static int parse_table(int *, int, void *, size_t *, void *, size_t,
 		       ctl_table *, void **);
 static int proc_doutsstring(ctl_table *table, int write, struct file *filp,
@@ -258,6 +263,10 @@ static ctl_table kern_table[] = {
 #endif
 	{KERN_PIDMAX, "pid_max", &pid_max, sizeof (int),
 	 0600, NULL, &proc_dointvec},
+#ifdef CONFIG_HUGETLB_PAGE
+	{KERN_HUGETLB_PAGE_NUM, "numhugepages", &htlbpage_max, sizeof(int), 0644, NULL,
+	 &proc_dointvec},
+#endif
 	{0}
 };
@@ -897,6 +906,10 @@ static int do_proc_dointvec(ctl_table *table, int write, struct file *filp,
 				val = -val;
 			buffer += len;
 			left -= len;
+#ifdef CONFIG_HUGETLB_PAGE
+			if (i == &htlbpage_max)
+				val = set_hugetlb_mem_size(val);
+#endif
 			switch(op) {
 			case OP_SET:	*i = val; break;
 			case OP_AND:	*i &= val; break;
......
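With the table entry in place, the huge page pool can be resized at run time through /proc/sys/kernel/numhugepages; do_proc_dointvec intercepts the written value and routes it through set_hugetlb_mem_size(). A minimal sketch of a caller (illustrative only; requires root):

	/* Illustrative sketch: resize the huge page pool to 32 pages by
	 * writing the new size to the proc file registered above. */
	#include <stdio.h>

	int main(void)
	{
		FILE *f = fopen("/proc/sys/kernel/numhugepages", "w");

		if (!f) {
			perror("fopen");
			return 1;
		}
		fprintf(f, "%d\n", 32);	/* handled by set_hugetlb_mem_size(32) */
		return fclose(f) != 0;
	}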
@@ -208,6 +208,9 @@ int copy_page_range(struct mm_struct *dst, struct mm_struct *src,
 	unsigned long end = vma->vm_end;
 	unsigned long cow = (vma->vm_flags & (VM_SHARED | VM_MAYWRITE)) == VM_MAYWRITE;
 
+	if (is_vm_hugetlb_page(vma))
+		return copy_hugetlb_page_range(dst, src, vma);
+
 	src_pgd = pgd_offset(src, address)-1;
 	dst_pgd = pgd_offset(dst, address)-1;
@@ -530,6 +533,11 @@ int get_user_pages(struct task_struct *tsk, struct mm_struct *mm,
 		    || !(flags & vma->vm_flags))
 			return i ? : -EFAULT;
 
+		if (is_vm_hugetlb_page(vma)) {
+			i = follow_hugetlb_page(mm, vma, pages, vmas,
+						&start, &len, i);
+			continue;
+		}
 		spin_lock(&mm->page_table_lock);
 		do {
 			struct page *map;
......
@@ -1031,10 +1031,14 @@ static struct vm_area_struct *touched_by_munmap(struct mm_struct *mm,
 	touched = NULL;
 	do {
 		struct vm_area_struct *next = mpnt->vm_next;
-		mpnt->vm_next = touched;
-		touched = mpnt;
-		mm->map_count--;
-		rb_erase(&mpnt->vm_rb, &mm->mm_rb);
+		if (!(is_vm_hugetlb_page(mpnt))) {
+			mpnt->vm_next = touched;
+			touched = mpnt;
+			rb_erase(&mpnt->vm_rb, &mm->mm_rb);
+			mm->map_count--;
+		}
+		else
+			free_hugepages(mpnt);
 		mpnt = next;
 	} while (mpnt && mpnt->vm_start < end);
 	*npp = mpnt;
@@ -1273,7 +1277,10 @@ void exit_mmap(struct mm_struct * mm)
 			vm_unacct_memory((end - start) >> PAGE_SHIFT);
 		mm->map_count--;
-		unmap_page_range(tlb, mpnt, start, end);
+		if (!(is_vm_hugetlb_page(mpnt)))
+			unmap_page_range(tlb, mpnt, start, end);
+		else
+			mpnt->vm_ops->close(mpnt);
 		mpnt = mpnt->vm_next;
 	}
......
@@ -321,6 +321,11 @@ asmlinkage long sys_mprotect(unsigned long start, size_t len, unsigned long prot
 
 		/* Here we know that vma->vm_start <= nstart < vma->vm_end. */
 
+		if (is_vm_hugetlb_page(vma)) {
+			error = -EACCES;
+			goto out;
+		}
+
 		newflags = prot | (vma->vm_flags & ~(PROT_READ | PROT_WRITE | PROT_EXEC));
 		if ((newflags & ~(newflags >> 4)) & 0xf) {
 			error = -EACCES;
......
@@ -311,6 +311,10 @@ unsigned long do_mremap(unsigned long addr,
 	vma = find_vma(current->mm, addr);
 	if (!vma || vma->vm_start > addr)
 		goto out;
+	if (is_vm_hugetlb_page(vma)) {
+		ret = -EINVAL;
+		goto out;
+	}
 	/* We can't remap across vm area boundaries */
 	if (old_len > vma->vm_end - addr)
 		goto out;
......