Commit cd053a94 authored by Andrew Morton's avatar Andrew Morton Committed by Linus Torvalds

[PATCH] Add sysctl to define a hugetlb-capable group

From: "Chen, Kenneth W" <kenneth.w.chen@intel.com>,
      "Seth, Rohit" <rohit.seth@intel.com>

This patch addresses the longstanding problem wherein Oracle needs
CAP_IPC_LOCK to allocate SHM_HUGETLB shm memory, but people don't want to run
Oracle as root, and capabilities are busted.

Various ideas with rlimits didn't work out, mainly because these objects live
beyond the lifetime of the user processes which establish them.

What we do is to create root-writeable /proc/sys/vm/hugetlb_shm_group which
specifies a single group ID.  Users who belong to that group may allocate
hugepages for SHM_HUGETLB shm segments.

So the sysadmin will create a new group, say `hugepageusers', will add the
oracle user to that group and will write that group's ID into
/proc/sys/vm/hugetlb_shm_group.
parent 9008d35b
...@@ -1208,6 +1208,14 @@ On the other hand, enabling this feature can cause you to run out of memory ...@@ -1208,6 +1208,14 @@ On the other hand, enabling this feature can cause you to run out of memory
and thrash the system to death, so large and/or important servers will want to and thrash the system to death, so large and/or important servers will want to
set this value to 0. set this value to 0.
nr_hugepages and hugetlb_shm_group
----------------------------------
nr_hugepages configures the number of hugetlb pages reserved for the system.
hugetlb_shm_group contains the group id that is allowed to create SysV shared
memory segments using hugetlb pages.
2.5 /proc/sys/dev - Device specific parameters 2.5 /proc/sys/dev - Device specific parameters
---------------------------------------------- ----------------------------------------------
...@@ -1848,10 +1856,3 @@ need to recompile the kernel, or even to reboot the system. The files in the ...@@ -1848,10 +1856,3 @@ need to recompile the kernel, or even to reboot the system. The files in the
command to write value into these files, thereby changing the default settings command to write value into these files, thereby changing the default settings
of the kernel. of the kernel.
------------------------------------------------------------------------------ ------------------------------------------------------------------------------
...@@ -91,9 +91,12 @@ A regular chown, chgrp and chmod commands (with right permissions) could be ...@@ -91,9 +91,12 @@ A regular chown, chgrp and chmod commands (with right permissions) could be
used to change the file attributes on hugetlbfs. used to change the file attributes on hugetlbfs.
Also, it is important to note that no such mount command is required if the Also, it is important to note that no such mount command is required if the
applications are going to use only shmat/shmget system calls. It is possible applications are going to use only shmat/shmget system calls. Users who
for same or different applications to use any combination of mmaps and shm* wish to use hugetlb page via shared memory segment should be a member of
calls. Though the mount of filesystem will be required for using mmaps. a supplementary group and system admin needs to configure that gid into
/proc/sys/vm/hugetlb_shm_group. It is possible for same or different
applications to use any combination of mmaps and shm* calls. Though the
mount of filesystem will be required for using mmaps.
/* Example of using hugepage in user application using Sys V shared memory /* Example of using hugepage in user application using Sys V shared memory
* system calls. In this example, app is requesting memory of size 256MB that * system calls. In this example, app is requesting memory of size 256MB that
......
...@@ -43,6 +43,8 @@ static struct backing_dev_info hugetlbfs_backing_dev_info = { ...@@ -43,6 +43,8 @@ static struct backing_dev_info hugetlbfs_backing_dev_info = {
.memory_backed = 1, /* Does not contribute to dirty memory */ .memory_backed = 1, /* Does not contribute to dirty memory */
}; };
/*
 * Group ID whose members pass the can_do_hugetlb_shm() check even without
 * CAP_IPC_LOCK.  Exposed through the "hugetlb_shm_group" entry in vm_table.
 */
int sysctl_hugetlb_shm_group;
static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma) static int hugetlbfs_file_mmap(struct file *file, struct vm_area_struct *vma)
{ {
struct inode *inode = file->f_dentry->d_inode; struct inode *inode = file->f_dentry->d_inode;
...@@ -698,6 +700,12 @@ static unsigned long hugetlbfs_counter(void) ...@@ -698,6 +700,12 @@ static unsigned long hugetlbfs_counter(void)
return ret; return ret;
} }
/*
 * Decide whether the current task may set up a hugetlb-backed shm file.
 *
 * Returns 1 when the task either holds CAP_IPC_LOCK or is a member of the
 * group stored in sysctl_hugetlb_shm_group, and 0 otherwise.
 */
static int can_do_hugetlb_shm(void)
{
	/* Privileged tasks are always allowed; no group lookup needed. */
	if (capable(CAP_IPC_LOCK))
		return 1;

	/* Otherwise fall back to membership in the configured group. */
	return in_group_p(sysctl_hugetlb_shm_group) != 0;
}
struct file *hugetlb_zero_setup(size_t size) struct file *hugetlb_zero_setup(size_t size)
{ {
int error; int error;
...@@ -707,7 +715,7 @@ struct file *hugetlb_zero_setup(size_t size) ...@@ -707,7 +715,7 @@ struct file *hugetlb_zero_setup(size_t size)
struct qstr quick_string; struct qstr quick_string;
char buf[16]; char buf[16];
if (!capable(CAP_IPC_LOCK)) if (!can_do_hugetlb_shm())
return ERR_PTR(-EPERM); return ERR_PTR(-EPERM);
if (!is_hugepage_mem_enough(size)) if (!is_hugepage_mem_enough(size))
......
...@@ -30,6 +30,7 @@ void free_huge_page(struct page *); ...@@ -30,6 +30,7 @@ void free_huge_page(struct page *);
extern unsigned long max_huge_pages; extern unsigned long max_huge_pages;
extern const unsigned long hugetlb_zero, hugetlb_infinity; extern const unsigned long hugetlb_zero, hugetlb_infinity;
extern int sysctl_hugetlb_shm_group;
static inline void static inline void
mark_mm_hugetlb(struct mm_struct *mm, struct vm_area_struct *vma) mark_mm_hugetlb(struct mm_struct *mm, struct vm_area_struct *vma)
......
...@@ -163,6 +163,7 @@ enum ...@@ -163,6 +163,7 @@ enum
VM_MAX_MAP_COUNT=22, /* int: Maximum number of mmaps/address-space */ VM_MAX_MAP_COUNT=22, /* int: Maximum number of mmaps/address-space */
VM_LAPTOP_MODE=23, /* vm laptop mode */ VM_LAPTOP_MODE=23, /* vm laptop mode */
VM_BLOCK_DUMP=24, /* block dump mode */ VM_BLOCK_DUMP=24, /* block dump mode */
VM_HUGETLB_GROUP=25, /* permitted hugetlb group */
}; };
......
...@@ -738,6 +738,14 @@ static ctl_table vm_table[] = { ...@@ -738,6 +738,14 @@ static ctl_table vm_table[] = {
.extra1 = (void *)&hugetlb_zero, .extra1 = (void *)&hugetlb_zero,
.extra2 = (void *)&hugetlb_infinity, .extra2 = (void *)&hugetlb_infinity,
}, },
{
.ctl_name = VM_HUGETLB_GROUP,
.procname = "hugetlb_shm_group",
.data = &sysctl_hugetlb_shm_group,
.maxlen = sizeof(gid_t),
.mode = 0644,
.proc_handler = &proc_dointvec,
},
#endif #endif
{ {
.ctl_name = VM_LOWER_ZONE_PROTECTION, .ctl_name = VM_LOWER_ZONE_PROTECTION,
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment