Commit 5a6fe125, authored by Mel Gorman, committed by Linus Torvalds

Do not account for the address space used by hugetlbfs using VM_ACCOUNT

When overcommit is disabled, the core VM accounts for pages used by anonymous
shared, private mappings and special mappings. It keeps track of VMAs that
should be accounted for with VM_ACCOUNT and VMAs that never had a reserve
with VM_NORESERVE.

Overcommit for hugetlbfs is much riskier than overcommit for base pages
due to contiguity requirements. It avoids overcommitting on both shared and
private mappings using reservation counters that are checked and updated
during mmap(). This ensures (within limits) that hugepages exist in the
future when faults occur; otherwise it is too easy for applications to be
SIGKILLed.

As hugetlbfs makes its own reservations of a different unit to the base page
size, VM_ACCOUNT should never be set. Even if the units were correct, we would
double account for the usage in the core VM and hugetlbfs. VM_NORESERVE may
be set because an application can request no reserves be made for hugetlbfs
at the risk of getting killed later.

With commit fc8744ad, VM_NORESERVE and
VM_ACCOUNT are getting unconditionally set for hugetlbfs-backed mappings. This
breaks the accounting for both the core VM and hugetlbfs: it can trigger an
OOM storm when hugepage pools are too small, and lockups and corrupted
counters otherwise. This patch brings hugetlbfs more in line with how the
core VM treats VM_NORESERVE but prevents VM_ACCOUNT being set.

Signed-off-by: Mel Gorman <mel@csn.ul.ie>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
parent 4c098bcd
...@@ -108,7 +108,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) ...@@ -108,7 +108,8 @@ static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
if (hugetlb_reserve_pages(inode, if (hugetlb_reserve_pages(inode,
vma->vm_pgoff >> huge_page_order(h), vma->vm_pgoff >> huge_page_order(h),
len >> huge_page_shift(h), vma)) len >> huge_page_shift(h), vma,
vma->vm_flags))
goto out; goto out;
ret = 0; ret = 0;
...@@ -947,7 +948,7 @@ static int can_do_hugetlb_shm(void) ...@@ -947,7 +948,7 @@ static int can_do_hugetlb_shm(void)
can_do_mlock()); can_do_mlock());
} }
struct file *hugetlb_file_setup(const char *name, size_t size) struct file *hugetlb_file_setup(const char *name, size_t size, int acctflag)
{ {
int error = -ENOMEM; int error = -ENOMEM;
struct file *file; struct file *file;
...@@ -981,7 +982,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size) ...@@ -981,7 +982,8 @@ struct file *hugetlb_file_setup(const char *name, size_t size)
error = -ENOMEM; error = -ENOMEM;
if (hugetlb_reserve_pages(inode, 0, if (hugetlb_reserve_pages(inode, 0,
size >> huge_page_shift(hstate_inode(inode)), NULL)) size >> huge_page_shift(hstate_inode(inode)), NULL,
acctflag))
goto out_inode; goto out_inode;
d_instantiate(dentry, inode); d_instantiate(dentry, inode);
......
...@@ -33,7 +33,8 @@ unsigned long hugetlb_total_pages(void); ...@@ -33,7 +33,8 @@ unsigned long hugetlb_total_pages(void);
int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma,
unsigned long address, int write_access); unsigned long address, int write_access);
int hugetlb_reserve_pages(struct inode *inode, long from, long to, int hugetlb_reserve_pages(struct inode *inode, long from, long to,
struct vm_area_struct *vma); struct vm_area_struct *vma,
int acctflags);
void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed); void hugetlb_unreserve_pages(struct inode *inode, long offset, long freed);
extern unsigned long hugepages_treat_as_movable; extern unsigned long hugepages_treat_as_movable;
...@@ -138,7 +139,7 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb) ...@@ -138,7 +139,7 @@ static inline struct hugetlbfs_sb_info *HUGETLBFS_SB(struct super_block *sb)
extern const struct file_operations hugetlbfs_file_operations; extern const struct file_operations hugetlbfs_file_operations;
extern struct vm_operations_struct hugetlb_vm_ops; extern struct vm_operations_struct hugetlb_vm_ops;
struct file *hugetlb_file_setup(const char *name, size_t); struct file *hugetlb_file_setup(const char *name, size_t, int);
int hugetlb_get_quota(struct address_space *mapping, long delta); int hugetlb_get_quota(struct address_space *mapping, long delta);
void hugetlb_put_quota(struct address_space *mapping, long delta); void hugetlb_put_quota(struct address_space *mapping, long delta);
......
...@@ -1129,8 +1129,7 @@ extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, ...@@ -1129,8 +1129,7 @@ extern unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
unsigned long flag, unsigned long pgoff); unsigned long flag, unsigned long pgoff);
extern unsigned long mmap_region(struct file *file, unsigned long addr, extern unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, unsigned long flags, unsigned long len, unsigned long flags,
unsigned int vm_flags, unsigned long pgoff, unsigned int vm_flags, unsigned long pgoff);
int accountable);
static inline unsigned long do_mmap(struct file *file, unsigned long addr, static inline unsigned long do_mmap(struct file *file, unsigned long addr,
unsigned long len, unsigned long prot, unsigned long len, unsigned long prot,
......
...@@ -340,6 +340,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) ...@@ -340,6 +340,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
struct file * file; struct file * file;
char name[13]; char name[13];
int id; int id;
int acctflag = 0;
if (size < SHMMIN || size > ns->shm_ctlmax) if (size < SHMMIN || size > ns->shm_ctlmax)
return -EINVAL; return -EINVAL;
...@@ -364,11 +365,12 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) ...@@ -364,11 +365,12 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
sprintf (name, "SYSV%08x", key); sprintf (name, "SYSV%08x", key);
if (shmflg & SHM_HUGETLB) { if (shmflg & SHM_HUGETLB) {
/* hugetlb_file_setup takes care of mlock user accounting */ /* hugetlb_file_setup applies strict accounting */
file = hugetlb_file_setup(name, size); if (shmflg & SHM_NORESERVE)
acctflag = VM_NORESERVE;
file = hugetlb_file_setup(name, size, acctflag);
shp->mlock_user = current_user(); shp->mlock_user = current_user();
} else { } else {
int acctflag = 0;
/* /*
* Do not allow no accounting for OVERCOMMIT_NEVER, even * Do not allow no accounting for OVERCOMMIT_NEVER, even
* if it's asked for. * if it's asked for.
......
...@@ -198,7 +198,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size, ...@@ -198,7 +198,7 @@ SYSCALL_DEFINE5(remap_file_pages, unsigned long, start, unsigned long, size,
flags &= MAP_NONBLOCK; flags &= MAP_NONBLOCK;
get_file(file); get_file(file);
addr = mmap_region(file, start, size, addr = mmap_region(file, start, size,
flags, vma->vm_flags, pgoff, 1); flags, vma->vm_flags, pgoff);
fput(file); fput(file);
if (IS_ERR_VALUE(addr)) { if (IS_ERR_VALUE(addr)) {
err = addr; err = addr;
......
...@@ -2269,14 +2269,12 @@ void hugetlb_change_protection(struct vm_area_struct *vma, ...@@ -2269,14 +2269,12 @@ void hugetlb_change_protection(struct vm_area_struct *vma,
int hugetlb_reserve_pages(struct inode *inode, int hugetlb_reserve_pages(struct inode *inode,
long from, long to, long from, long to,
struct vm_area_struct *vma) struct vm_area_struct *vma,
int acctflag)
{ {
long ret, chg; long ret = 0, chg;
struct hstate *h = hstate_inode(inode); struct hstate *h = hstate_inode(inode);
if (vma && vma->vm_flags & VM_NORESERVE)
return 0;
/* /*
* Shared mappings base their reservation on the number of pages that * Shared mappings base their reservation on the number of pages that
* are already allocated on behalf of the file. Private mappings need * are already allocated on behalf of the file. Private mappings need
...@@ -2285,22 +2283,25 @@ int hugetlb_reserve_pages(struct inode *inode, ...@@ -2285,22 +2283,25 @@ int hugetlb_reserve_pages(struct inode *inode,
*/ */
if (!vma || vma->vm_flags & VM_SHARED) if (!vma || vma->vm_flags & VM_SHARED)
chg = region_chg(&inode->i_mapping->private_list, from, to); chg = region_chg(&inode->i_mapping->private_list, from, to);
else { else
struct resv_map *resv_map = resv_map_alloc();
if (!resv_map)
return -ENOMEM;
chg = to - from; chg = to - from;
set_vma_resv_map(vma, resv_map);
set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
}
if (chg < 0) if (chg < 0)
return chg; return chg;
if (hugetlb_get_quota(inode->i_mapping, chg)) if (hugetlb_get_quota(inode->i_mapping, chg))
return -ENOSPC; return -ENOSPC;
/*
* Only apply hugepage reservation if asked. We still have to
* take the filesystem quota because it is an upper limit
* defined for the mount and not necessarily memory as a whole
*/
if (acctflag & VM_NORESERVE) {
reset_vma_resv_huge_pages(vma);
return 0;
}
ret = hugetlb_acct_memory(h, chg); ret = hugetlb_acct_memory(h, chg);
if (ret < 0) { if (ret < 0) {
hugetlb_put_quota(inode->i_mapping, chg); hugetlb_put_quota(inode->i_mapping, chg);
...@@ -2308,6 +2309,16 @@ int hugetlb_reserve_pages(struct inode *inode, ...@@ -2308,6 +2309,16 @@ int hugetlb_reserve_pages(struct inode *inode,
} }
if (!vma || vma->vm_flags & VM_SHARED) if (!vma || vma->vm_flags & VM_SHARED)
region_add(&inode->i_mapping->private_list, from, to); region_add(&inode->i_mapping->private_list, from, to);
else {
struct resv_map *resv_map = resv_map_alloc();
if (!resv_map)
return -ENOMEM;
set_vma_resv_map(vma, resv_map);
set_vma_resv_flags(vma, HPAGE_RESV_OWNER);
}
return 0; return 0;
} }
......
...@@ -918,7 +918,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, ...@@ -918,7 +918,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
struct inode *inode; struct inode *inode;
unsigned int vm_flags; unsigned int vm_flags;
int error; int error;
int accountable = 1;
unsigned long reqprot = prot; unsigned long reqprot = prot;
/* /*
...@@ -1019,8 +1018,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, ...@@ -1019,8 +1018,6 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
return -EPERM; return -EPERM;
vm_flags &= ~VM_MAYEXEC; vm_flags &= ~VM_MAYEXEC;
} }
if (is_file_hugepages(file))
accountable = 0;
if (!file->f_op || !file->f_op->mmap) if (!file->f_op || !file->f_op->mmap)
return -ENODEV; return -ENODEV;
...@@ -1053,8 +1050,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr, ...@@ -1053,8 +1050,7 @@ unsigned long do_mmap_pgoff(struct file *file, unsigned long addr,
if (error) if (error)
return error; return error;
return mmap_region(file, addr, len, flags, vm_flags, pgoff, return mmap_region(file, addr, len, flags, vm_flags, pgoff);
accountable);
} }
EXPORT_SYMBOL(do_mmap_pgoff); EXPORT_SYMBOL(do_mmap_pgoff);
...@@ -1092,17 +1088,23 @@ int vma_wants_writenotify(struct vm_area_struct *vma) ...@@ -1092,17 +1088,23 @@ int vma_wants_writenotify(struct vm_area_struct *vma)
/* /*
* We account for memory if it's a private writeable mapping, * We account for memory if it's a private writeable mapping,
* and VM_NORESERVE wasn't set. * not hugepages and VM_NORESERVE wasn't set.
*/ */
static inline int accountable_mapping(unsigned int vm_flags) static inline int accountable_mapping(struct file *file, unsigned int vm_flags)
{ {
/*
* hugetlb has its own accounting separate from the core VM
* VM_HUGETLB may not be set yet so we cannot check for that flag.
*/
if (file && is_file_hugepages(file))
return 0;
return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE; return (vm_flags & (VM_NORESERVE | VM_SHARED | VM_WRITE)) == VM_WRITE;
} }
unsigned long mmap_region(struct file *file, unsigned long addr, unsigned long mmap_region(struct file *file, unsigned long addr,
unsigned long len, unsigned long flags, unsigned long len, unsigned long flags,
unsigned int vm_flags, unsigned long pgoff, unsigned int vm_flags, unsigned long pgoff)
int accountable)
{ {
struct mm_struct *mm = current->mm; struct mm_struct *mm = current->mm;
struct vm_area_struct *vma, *prev; struct vm_area_struct *vma, *prev;
...@@ -1128,18 +1130,22 @@ unsigned long mmap_region(struct file *file, unsigned long addr, ...@@ -1128,18 +1130,22 @@ unsigned long mmap_region(struct file *file, unsigned long addr,
/* /*
* Set 'VM_NORESERVE' if we should not account for the * Set 'VM_NORESERVE' if we should not account for the
* memory use of this mapping. We only honor MAP_NORESERVE * memory use of this mapping.
* if we're allowed to overcommit memory.
*/ */
if ((flags & MAP_NORESERVE) && sysctl_overcommit_memory != OVERCOMMIT_NEVER) if ((flags & MAP_NORESERVE)) {
/* We honor MAP_NORESERVE if allowed to overcommit */
if (sysctl_overcommit_memory != OVERCOMMIT_NEVER)
vm_flags |= VM_NORESERVE; vm_flags |= VM_NORESERVE;
if (!accountable)
/* hugetlb applies strict overcommit unless MAP_NORESERVE */
if (file && is_file_hugepages(file))
vm_flags |= VM_NORESERVE; vm_flags |= VM_NORESERVE;
}
/* /*
* Private writable mapping: check memory availability * Private writable mapping: check memory availability
*/ */
if (accountable_mapping(vm_flags)) { if (accountable_mapping(file, vm_flags)) {
charged = len >> PAGE_SHIFT; charged = len >> PAGE_SHIFT;
if (security_vm_enough_memory(charged)) if (security_vm_enough_memory(charged))
return -ENOMEM; return -ENOMEM;
......
...@@ -151,10 +151,11 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev, ...@@ -151,10 +151,11 @@ mprotect_fixup(struct vm_area_struct *vma, struct vm_area_struct **pprev,
/* /*
* If we make a private mapping writable we increase our commit; * If we make a private mapping writable we increase our commit;
* but (without finer accounting) cannot reduce our commit if we * but (without finer accounting) cannot reduce our commit if we
* make it unwritable again. * make it unwritable again. hugetlb mapping were accounted for
* even if read-only so there is no need to account for them here
*/ */
if (newflags & VM_WRITE) { if (newflags & VM_WRITE) {
if (!(oldflags & (VM_ACCOUNT|VM_WRITE| if (!(oldflags & (VM_ACCOUNT|VM_WRITE|VM_HUGETLB|
VM_SHARED|VM_NORESERVE))) { VM_SHARED|VM_NORESERVE))) {
charged = nrpages; charged = nrpages;
if (security_vm_enough_memory(charged)) if (security_vm_enough_memory(charged))
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment