Commit c54b245d authored by Linus Torvalds

Merge branch 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace

Pull user namespace rlimit handling update from Eric Biederman:
 "This is the work mainly by Alexey Gladkov to limit rlimits to the
  rlimits of the user that created a user namespace, and to allow users
  to have stricter limits on the resources created within a user
  namespace."

* 'for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/ebiederm/user-namespace:
  cred: add missing return error code when set_cred_ucounts() failed
  ucounts: Silence warning in dec_rlimit_ucounts
  ucounts: Set ucount_max to the largest positive value the type can hold
  kselftests: Add test to check for rlimit changes in different user namespaces
  Reimplement RLIMIT_MEMLOCK on top of ucounts
  Reimplement RLIMIT_SIGPENDING on top of ucounts
  Reimplement RLIMIT_MSGQUEUE on top of ucounts
  Reimplement RLIMIT_NPROC on top of ucounts
  Use atomic_t for ucounts reference counting
  Add a reference to ucounts for each cred
  Increase size of ucounts to atomic_long_t
parents e17c120f 5e6b8a50
...@@ -1360,6 +1360,10 @@ int begin_new_exec(struct linux_binprm * bprm) ...@@ -1360,6 +1360,10 @@ int begin_new_exec(struct linux_binprm * bprm)
WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1); WRITE_ONCE(me->self_exec_id, me->self_exec_id + 1);
flush_signal_handlers(me, 0); flush_signal_handlers(me, 0);
retval = set_cred_ucounts(bprm->cred);
if (retval < 0)
goto out_unlock;
/* /*
* install the new credentials for this executable * install the new credentials for this executable
*/ */
...@@ -1874,7 +1878,7 @@ static int do_execveat_common(int fd, struct filename *filename, ...@@ -1874,7 +1878,7 @@ static int do_execveat_common(int fd, struct filename *filename,
* whether NPROC limit is still exceeded. * whether NPROC limit is still exceeded.
*/ */
if ((current->flags & PF_NPROC_EXCEEDED) && if ((current->flags & PF_NPROC_EXCEEDED) &&
atomic_read(&current_user()->processes) > rlimit(RLIMIT_NPROC)) { is_ucounts_overlimit(current_ucounts(), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
retval = -EAGAIN; retval = -EAGAIN;
goto out_ret; goto out_ret;
} }
......
...@@ -1446,7 +1446,7 @@ static int get_hstate_idx(int page_size_log) ...@@ -1446,7 +1446,7 @@ static int get_hstate_idx(int page_size_log)
* otherwise hugetlb_reserve_pages reserves one less hugepages than intended. * otherwise hugetlb_reserve_pages reserves one less hugepages than intended.
*/ */
struct file *hugetlb_file_setup(const char *name, size_t size, struct file *hugetlb_file_setup(const char *name, size_t size,
vm_flags_t acctflag, struct user_struct **user, vm_flags_t acctflag, struct ucounts **ucounts,
int creat_flags, int page_size_log) int creat_flags, int page_size_log)
{ {
struct inode *inode; struct inode *inode;
...@@ -1458,20 +1458,20 @@ struct file *hugetlb_file_setup(const char *name, size_t size, ...@@ -1458,20 +1458,20 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
if (hstate_idx < 0) if (hstate_idx < 0)
return ERR_PTR(-ENODEV); return ERR_PTR(-ENODEV);
*user = NULL; *ucounts = NULL;
mnt = hugetlbfs_vfsmount[hstate_idx]; mnt = hugetlbfs_vfsmount[hstate_idx];
if (!mnt) if (!mnt)
return ERR_PTR(-ENOENT); return ERR_PTR(-ENOENT);
if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) { if (creat_flags == HUGETLB_SHMFS_INODE && !can_do_hugetlb_shm()) {
*user = current_user(); *ucounts = current_ucounts();
if (user_shm_lock(size, *user)) { if (user_shm_lock(size, *ucounts)) {
task_lock(current); task_lock(current);
pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n", pr_warn_once("%s (%d): Using mlock ulimits for SHM_HUGETLB is deprecated\n",
current->comm, current->pid); current->comm, current->pid);
task_unlock(current); task_unlock(current);
} else { } else {
*user = NULL; *ucounts = NULL;
return ERR_PTR(-EPERM); return ERR_PTR(-EPERM);
} }
} }
...@@ -1498,9 +1498,9 @@ struct file *hugetlb_file_setup(const char *name, size_t size, ...@@ -1498,9 +1498,9 @@ struct file *hugetlb_file_setup(const char *name, size_t size,
iput(inode); iput(inode);
out: out:
if (*user) { if (*ucounts) {
user_shm_unlock(size, *user); user_shm_unlock(size, *ucounts);
*user = NULL; *ucounts = NULL;
} }
return file; return file;
} }
......
...@@ -284,7 +284,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) ...@@ -284,7 +284,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p)
collect_sigign_sigcatch(p, &ignored, &caught); collect_sigign_sigcatch(p, &ignored, &caught);
num_threads = get_nr_threads(p); num_threads = get_nr_threads(p);
rcu_read_lock(); /* FIXME: is this correct? */ rcu_read_lock(); /* FIXME: is this correct? */
qsize = atomic_read(&__task_cred(p)->user->sigpending); qsize = get_ucounts_value(task_ucounts(p), UCOUNT_RLIMIT_SIGPENDING);
rcu_read_unlock(); rcu_read_unlock();
qlim = task_rlimit(p, RLIMIT_SIGPENDING); qlim = task_rlimit(p, RLIMIT_SIGPENDING);
unlock_task_sighand(p, &flags); unlock_task_sighand(p, &flags);
......
...@@ -143,6 +143,7 @@ struct cred { ...@@ -143,6 +143,7 @@ struct cred {
#endif #endif
struct user_struct *user; /* real user ID subscription */ struct user_struct *user; /* real user ID subscription */
struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */ struct user_namespace *user_ns; /* user_ns the caps and keyrings are relative to. */
struct ucounts *ucounts;
struct group_info *group_info; /* supplementary groups for euid/fsgid */ struct group_info *group_info; /* supplementary groups for euid/fsgid */
/* RCU deletion */ /* RCU deletion */
union { union {
...@@ -169,6 +170,7 @@ extern int set_security_override_from_ctx(struct cred *, const char *); ...@@ -169,6 +170,7 @@ extern int set_security_override_from_ctx(struct cred *, const char *);
extern int set_create_files_as(struct cred *, struct inode *); extern int set_create_files_as(struct cred *, struct inode *);
extern int cred_fscmp(const struct cred *, const struct cred *); extern int cred_fscmp(const struct cred *, const struct cred *);
extern void __init cred_init(void); extern void __init cred_init(void);
extern int set_cred_ucounts(struct cred *);
/* /*
* check for validity of credentials * check for validity of credentials
...@@ -369,6 +371,7 @@ static inline void put_cred(const struct cred *_cred) ...@@ -369,6 +371,7 @@ static inline void put_cred(const struct cred *_cred)
#define task_uid(task) (task_cred_xxx((task), uid)) #define task_uid(task) (task_cred_xxx((task), uid))
#define task_euid(task) (task_cred_xxx((task), euid)) #define task_euid(task) (task_cred_xxx((task), euid))
#define task_ucounts(task) (task_cred_xxx((task), ucounts))
#define current_cred_xxx(xxx) \ #define current_cred_xxx(xxx) \
({ \ ({ \
...@@ -385,6 +388,7 @@ static inline void put_cred(const struct cred *_cred) ...@@ -385,6 +388,7 @@ static inline void put_cred(const struct cred *_cred)
#define current_fsgid() (current_cred_xxx(fsgid)) #define current_fsgid() (current_cred_xxx(fsgid))
#define current_cap() (current_cred_xxx(cap_effective)) #define current_cap() (current_cred_xxx(cap_effective))
#define current_user() (current_cred_xxx(user)) #define current_user() (current_cred_xxx(user))
#define current_ucounts() (current_cred_xxx(ucounts))
extern struct user_namespace init_user_ns; extern struct user_namespace init_user_ns;
#ifdef CONFIG_USER_NS #ifdef CONFIG_USER_NS
......
...@@ -451,7 +451,7 @@ static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode) ...@@ -451,7 +451,7 @@ static inline struct hugetlbfs_inode_info *HUGETLBFS_I(struct inode *inode)
extern const struct file_operations hugetlbfs_file_operations; extern const struct file_operations hugetlbfs_file_operations;
extern const struct vm_operations_struct hugetlb_vm_ops; extern const struct vm_operations_struct hugetlb_vm_ops;
struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct, struct file *hugetlb_file_setup(const char *name, size_t size, vm_flags_t acct,
struct user_struct **user, int creat_flags, struct ucounts **ucounts, int creat_flags,
int page_size_log); int page_size_log);
static inline bool is_file_hugepages(struct file *file) static inline bool is_file_hugepages(struct file *file)
...@@ -471,7 +471,7 @@ static inline struct hstate *hstate_inode(struct inode *i) ...@@ -471,7 +471,7 @@ static inline struct hstate *hstate_inode(struct inode *i)
#define is_file_hugepages(file) false #define is_file_hugepages(file) false
static inline struct file * static inline struct file *
hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag, hugetlb_file_setup(const char *name, size_t size, vm_flags_t acctflag,
struct user_struct **user, int creat_flags, struct ucounts **ucounts, int creat_flags,
int page_size_log) int page_size_log)
{ {
return ERR_PTR(-ENOSYS); return ERR_PTR(-ENOSYS);
......
...@@ -1709,8 +1709,8 @@ extern bool can_do_mlock(void); ...@@ -1709,8 +1709,8 @@ extern bool can_do_mlock(void);
#else #else
static inline bool can_do_mlock(void) { return false; } static inline bool can_do_mlock(void) { return false; }
#endif #endif
extern int user_shm_lock(size_t, struct user_struct *); extern int user_shm_lock(size_t, struct ucounts *);
extern void user_shm_unlock(size_t, struct user_struct *); extern void user_shm_unlock(size_t, struct ucounts *);
/* /*
* Parameter block passed down to zap_pte_range in exceptional cases. * Parameter block passed down to zap_pte_range in exceptional cases.
......
...@@ -12,16 +12,9 @@ ...@@ -12,16 +12,9 @@
*/ */
struct user_struct { struct user_struct {
refcount_t __count; /* reference count */ refcount_t __count; /* reference count */
atomic_t processes; /* How many processes does this user have? */
atomic_t sigpending; /* How many pending signals does this user have? */
#ifdef CONFIG_EPOLL #ifdef CONFIG_EPOLL
atomic_long_t epoll_watches; /* The number of file descriptors currently watched */ atomic_long_t epoll_watches; /* The number of file descriptors currently watched */
#endif #endif
#ifdef CONFIG_POSIX_MQUEUE
/* protected by mq_lock */
unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */
#endif
unsigned long locked_shm; /* How many pages of mlocked shm ? */
unsigned long unix_inflight; /* How many files in flight in unix sockets */ unsigned long unix_inflight; /* How many files in flight in unix sockets */
atomic_long_t pipe_bufs; /* how many pages are allocated in pipe buffers */ atomic_long_t pipe_bufs; /* how many pages are allocated in pipe buffers */
......
...@@ -65,7 +65,7 @@ extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt, ...@@ -65,7 +65,7 @@ extern struct file *shmem_file_setup_with_mnt(struct vfsmount *mnt,
extern int shmem_zero_setup(struct vm_area_struct *); extern int shmem_zero_setup(struct vm_area_struct *);
extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr, extern unsigned long shmem_get_unmapped_area(struct file *, unsigned long addr,
unsigned long len, unsigned long pgoff, unsigned long flags); unsigned long len, unsigned long pgoff, unsigned long flags);
extern int shmem_lock(struct file *file, int lock, struct user_struct *user); extern int shmem_lock(struct file *file, int lock, struct ucounts *ucounts);
#ifdef CONFIG_SHMEM #ifdef CONFIG_SHMEM
extern const struct address_space_operations shmem_aops; extern const struct address_space_operations shmem_aops;
static inline bool shmem_mapping(struct address_space *mapping) static inline bool shmem_mapping(struct address_space *mapping)
......
...@@ -13,6 +13,8 @@ typedef struct kernel_siginfo { ...@@ -13,6 +13,8 @@ typedef struct kernel_siginfo {
__SIGINFO; __SIGINFO;
} kernel_siginfo_t; } kernel_siginfo_t;
struct ucounts;
/* /*
* Real Time signals may be queued. * Real Time signals may be queued.
*/ */
...@@ -21,7 +23,7 @@ struct sigqueue { ...@@ -21,7 +23,7 @@ struct sigqueue {
struct list_head list; struct list_head list;
int flags; int flags;
kernel_siginfo_t info; kernel_siginfo_t info;
struct user_struct *user; struct ucounts *ucounts;
}; };
/* flags values. */ /* flags values. */
......
...@@ -54,9 +54,15 @@ enum ucount_type { ...@@ -54,9 +54,15 @@ enum ucount_type {
UCOUNT_FANOTIFY_GROUPS, UCOUNT_FANOTIFY_GROUPS,
UCOUNT_FANOTIFY_MARKS, UCOUNT_FANOTIFY_MARKS,
#endif #endif
UCOUNT_RLIMIT_NPROC,
UCOUNT_RLIMIT_MSGQUEUE,
UCOUNT_RLIMIT_SIGPENDING,
UCOUNT_RLIMIT_MEMLOCK,
UCOUNT_COUNTS, UCOUNT_COUNTS,
}; };
#define MAX_PER_NAMESPACE_UCOUNTS UCOUNT_RLIMIT_NPROC
struct user_namespace { struct user_namespace {
struct uid_gid_map uid_map; struct uid_gid_map uid_map;
struct uid_gid_map gid_map; struct uid_gid_map gid_map;
...@@ -92,23 +98,42 @@ struct user_namespace { ...@@ -92,23 +98,42 @@ struct user_namespace {
struct ctl_table_header *sysctls; struct ctl_table_header *sysctls;
#endif #endif
struct ucounts *ucounts; struct ucounts *ucounts;
int ucount_max[UCOUNT_COUNTS]; long ucount_max[UCOUNT_COUNTS];
} __randomize_layout; } __randomize_layout;
struct ucounts { struct ucounts {
struct hlist_node node; struct hlist_node node;
struct user_namespace *ns; struct user_namespace *ns;
kuid_t uid; kuid_t uid;
int count; atomic_t count;
atomic_t ucount[UCOUNT_COUNTS]; atomic_long_t ucount[UCOUNT_COUNTS];
}; };
extern struct user_namespace init_user_ns; extern struct user_namespace init_user_ns;
extern struct ucounts init_ucounts;
bool setup_userns_sysctls(struct user_namespace *ns); bool setup_userns_sysctls(struct user_namespace *ns);
void retire_userns_sysctls(struct user_namespace *ns); void retire_userns_sysctls(struct user_namespace *ns);
struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type); struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, enum ucount_type type);
void dec_ucount(struct ucounts *ucounts, enum ucount_type type); void dec_ucount(struct ucounts *ucounts, enum ucount_type type);
struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid);
struct ucounts * __must_check get_ucounts(struct ucounts *ucounts);
void put_ucounts(struct ucounts *ucounts);
static inline long get_ucounts_value(struct ucounts *ucounts, enum ucount_type type)
{
return atomic_long_read(&ucounts->ucount[type]);
}
long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v);
bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v);
bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max);
/*
 * Store @max as the ceiling for the @type counter of @ns.
 *
 * ns->ucount_max[] holds signed longs, so any @max above LONG_MAX is
 * clamped to LONG_MAX before being stored.
 */
static inline void set_rlimit_ucount_max(struct user_namespace *ns,
		enum ucount_type type, unsigned long max)
{
	long limit = LONG_MAX;

	if (max <= LONG_MAX)
		limit = max;

	ns->ucount_max[type] = limit;
}
#ifdef CONFIG_USER_NS #ifdef CONFIG_USER_NS
......
...@@ -144,7 +144,7 @@ struct mqueue_inode_info { ...@@ -144,7 +144,7 @@ struct mqueue_inode_info {
struct pid *notify_owner; struct pid *notify_owner;
u32 notify_self_exec_id; u32 notify_self_exec_id;
struct user_namespace *notify_user_ns; struct user_namespace *notify_user_ns;
struct user_struct *user; /* user who created, for accounting */ struct ucounts *ucounts; /* user who created, for accounting */
struct sock *notify_sock; struct sock *notify_sock;
struct sk_buff *notify_cookie; struct sk_buff *notify_cookie;
...@@ -292,7 +292,6 @@ static struct inode *mqueue_get_inode(struct super_block *sb, ...@@ -292,7 +292,6 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
struct ipc_namespace *ipc_ns, umode_t mode, struct ipc_namespace *ipc_ns, umode_t mode,
struct mq_attr *attr) struct mq_attr *attr)
{ {
struct user_struct *u = current_user();
struct inode *inode; struct inode *inode;
int ret = -ENOMEM; int ret = -ENOMEM;
...@@ -321,7 +320,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb, ...@@ -321,7 +320,7 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
info->notify_owner = NULL; info->notify_owner = NULL;
info->notify_user_ns = NULL; info->notify_user_ns = NULL;
info->qsize = 0; info->qsize = 0;
info->user = NULL; /* set when all is ok */ info->ucounts = NULL; /* set when all is ok */
info->msg_tree = RB_ROOT; info->msg_tree = RB_ROOT;
info->msg_tree_rightmost = NULL; info->msg_tree_rightmost = NULL;
info->node_cache = NULL; info->node_cache = NULL;
...@@ -371,19 +370,23 @@ static struct inode *mqueue_get_inode(struct super_block *sb, ...@@ -371,19 +370,23 @@ static struct inode *mqueue_get_inode(struct super_block *sb,
if (mq_bytes + mq_treesize < mq_bytes) if (mq_bytes + mq_treesize < mq_bytes)
goto out_inode; goto out_inode;
mq_bytes += mq_treesize; mq_bytes += mq_treesize;
spin_lock(&mq_lock); info->ucounts = get_ucounts(current_ucounts());
if (u->mq_bytes + mq_bytes < u->mq_bytes || if (info->ucounts) {
u->mq_bytes + mq_bytes > rlimit(RLIMIT_MSGQUEUE)) { long msgqueue;
spin_lock(&mq_lock);
msgqueue = inc_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes);
if (msgqueue == LONG_MAX || msgqueue > rlimit(RLIMIT_MSGQUEUE)) {
dec_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes);
spin_unlock(&mq_lock);
put_ucounts(info->ucounts);
info->ucounts = NULL;
/* mqueue_evict_inode() releases info->messages */
ret = -EMFILE;
goto out_inode;
}
spin_unlock(&mq_lock); spin_unlock(&mq_lock);
/* mqueue_evict_inode() releases info->messages */
ret = -EMFILE;
goto out_inode;
} }
u->mq_bytes += mq_bytes;
spin_unlock(&mq_lock);
/* all is ok */
info->user = get_uid(u);
} else if (S_ISDIR(mode)) { } else if (S_ISDIR(mode)) {
inc_nlink(inode); inc_nlink(inode);
/* Some things misbehave if size == 0 on a directory */ /* Some things misbehave if size == 0 on a directory */
...@@ -497,7 +500,6 @@ static void mqueue_free_inode(struct inode *inode) ...@@ -497,7 +500,6 @@ static void mqueue_free_inode(struct inode *inode)
static void mqueue_evict_inode(struct inode *inode) static void mqueue_evict_inode(struct inode *inode)
{ {
struct mqueue_inode_info *info; struct mqueue_inode_info *info;
struct user_struct *user;
struct ipc_namespace *ipc_ns; struct ipc_namespace *ipc_ns;
struct msg_msg *msg, *nmsg; struct msg_msg *msg, *nmsg;
LIST_HEAD(tmp_msg); LIST_HEAD(tmp_msg);
...@@ -520,8 +522,7 @@ static void mqueue_evict_inode(struct inode *inode) ...@@ -520,8 +522,7 @@ static void mqueue_evict_inode(struct inode *inode)
free_msg(msg); free_msg(msg);
} }
user = info->user; if (info->ucounts) {
if (user) {
unsigned long mq_bytes, mq_treesize; unsigned long mq_bytes, mq_treesize;
/* Total amount of bytes accounted for the mqueue */ /* Total amount of bytes accounted for the mqueue */
...@@ -533,7 +534,7 @@ static void mqueue_evict_inode(struct inode *inode) ...@@ -533,7 +534,7 @@ static void mqueue_evict_inode(struct inode *inode)
info->attr.mq_msgsize); info->attr.mq_msgsize);
spin_lock(&mq_lock); spin_lock(&mq_lock);
user->mq_bytes -= mq_bytes; dec_rlimit_ucounts(info->ucounts, UCOUNT_RLIMIT_MSGQUEUE, mq_bytes);
/* /*
* get_ns_from_inode() ensures that the * get_ns_from_inode() ensures that the
* (ipc_ns = sb->s_fs_info) is either a valid ipc_ns * (ipc_ns = sb->s_fs_info) is either a valid ipc_ns
...@@ -543,7 +544,8 @@ static void mqueue_evict_inode(struct inode *inode) ...@@ -543,7 +544,8 @@ static void mqueue_evict_inode(struct inode *inode)
if (ipc_ns) if (ipc_ns)
ipc_ns->mq_queues_count--; ipc_ns->mq_queues_count--;
spin_unlock(&mq_lock); spin_unlock(&mq_lock);
free_uid(user); put_ucounts(info->ucounts);
info->ucounts = NULL;
} }
if (ipc_ns) if (ipc_ns)
put_ipc_ns(ipc_ns); put_ipc_ns(ipc_ns);
......
...@@ -60,7 +60,7 @@ struct shmid_kernel /* private to the kernel */ ...@@ -60,7 +60,7 @@ struct shmid_kernel /* private to the kernel */
time64_t shm_ctim; time64_t shm_ctim;
struct pid *shm_cprid; struct pid *shm_cprid;
struct pid *shm_lprid; struct pid *shm_lprid;
struct user_struct *mlock_user; struct ucounts *mlock_ucounts;
/* The task created the shm object. NULL if the task is dead. */ /* The task created the shm object. NULL if the task is dead. */
struct task_struct *shm_creator; struct task_struct *shm_creator;
...@@ -286,10 +286,10 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp) ...@@ -286,10 +286,10 @@ static void shm_destroy(struct ipc_namespace *ns, struct shmid_kernel *shp)
shm_rmid(ns, shp); shm_rmid(ns, shp);
shm_unlock(shp); shm_unlock(shp);
if (!is_file_hugepages(shm_file)) if (!is_file_hugepages(shm_file))
shmem_lock(shm_file, 0, shp->mlock_user); shmem_lock(shm_file, 0, shp->mlock_ucounts);
else if (shp->mlock_user) else if (shp->mlock_ucounts)
user_shm_unlock(i_size_read(file_inode(shm_file)), user_shm_unlock(i_size_read(file_inode(shm_file)),
shp->mlock_user); shp->mlock_ucounts);
fput(shm_file); fput(shm_file);
ipc_update_pid(&shp->shm_cprid, NULL); ipc_update_pid(&shp->shm_cprid, NULL);
ipc_update_pid(&shp->shm_lprid, NULL); ipc_update_pid(&shp->shm_lprid, NULL);
...@@ -625,7 +625,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) ...@@ -625,7 +625,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
shp->shm_perm.key = key; shp->shm_perm.key = key;
shp->shm_perm.mode = (shmflg & S_IRWXUGO); shp->shm_perm.mode = (shmflg & S_IRWXUGO);
shp->mlock_user = NULL; shp->mlock_ucounts = NULL;
shp->shm_perm.security = NULL; shp->shm_perm.security = NULL;
error = security_shm_alloc(&shp->shm_perm); error = security_shm_alloc(&shp->shm_perm);
...@@ -650,7 +650,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) ...@@ -650,7 +650,7 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
if (shmflg & SHM_NORESERVE) if (shmflg & SHM_NORESERVE)
acctflag = VM_NORESERVE; acctflag = VM_NORESERVE;
file = hugetlb_file_setup(name, hugesize, acctflag, file = hugetlb_file_setup(name, hugesize, acctflag,
&shp->mlock_user, HUGETLB_SHMFS_INODE, &shp->mlock_ucounts, HUGETLB_SHMFS_INODE,
(shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK); (shmflg >> SHM_HUGE_SHIFT) & SHM_HUGE_MASK);
} else { } else {
/* /*
...@@ -698,8 +698,8 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params) ...@@ -698,8 +698,8 @@ static int newseg(struct ipc_namespace *ns, struct ipc_params *params)
no_id: no_id:
ipc_update_pid(&shp->shm_cprid, NULL); ipc_update_pid(&shp->shm_cprid, NULL);
ipc_update_pid(&shp->shm_lprid, NULL); ipc_update_pid(&shp->shm_lprid, NULL);
if (is_file_hugepages(file) && shp->mlock_user) if (is_file_hugepages(file) && shp->mlock_ucounts)
user_shm_unlock(size, shp->mlock_user); user_shm_unlock(size, shp->mlock_ucounts);
fput(file); fput(file);
ipc_rcu_putref(&shp->shm_perm, shm_rcu_free); ipc_rcu_putref(&shp->shm_perm, shm_rcu_free);
return error; return error;
...@@ -1105,12 +1105,12 @@ static int shmctl_do_lock(struct ipc_namespace *ns, int shmid, int cmd) ...@@ -1105,12 +1105,12 @@ static int shmctl_do_lock(struct ipc_namespace *ns, int shmid, int cmd)
goto out_unlock0; goto out_unlock0;
if (cmd == SHM_LOCK) { if (cmd == SHM_LOCK) {
struct user_struct *user = current_user(); struct ucounts *ucounts = current_ucounts();
err = shmem_lock(shm_file, 1, user); err = shmem_lock(shm_file, 1, ucounts);
if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) { if (!err && !(shp->shm_perm.mode & SHM_LOCKED)) {
shp->shm_perm.mode |= SHM_LOCKED; shp->shm_perm.mode |= SHM_LOCKED;
shp->mlock_user = user; shp->mlock_ucounts = ucounts;
} }
goto out_unlock0; goto out_unlock0;
} }
...@@ -1118,9 +1118,9 @@ static int shmctl_do_lock(struct ipc_namespace *ns, int shmid, int cmd) ...@@ -1118,9 +1118,9 @@ static int shmctl_do_lock(struct ipc_namespace *ns, int shmid, int cmd)
/* SHM_UNLOCK */ /* SHM_UNLOCK */
if (!(shp->shm_perm.mode & SHM_LOCKED)) if (!(shp->shm_perm.mode & SHM_LOCKED))
goto out_unlock0; goto out_unlock0;
shmem_lock(shm_file, 0, shp->mlock_user); shmem_lock(shm_file, 0, shp->mlock_ucounts);
shp->shm_perm.mode &= ~SHM_LOCKED; shp->shm_perm.mode &= ~SHM_LOCKED;
shp->mlock_user = NULL; shp->mlock_ucounts = NULL;
get_file(shm_file); get_file(shm_file);
ipc_unlock_object(&shp->shm_perm); ipc_unlock_object(&shp->shm_perm);
rcu_read_unlock(); rcu_read_unlock();
......
...@@ -60,6 +60,7 @@ struct cred init_cred = { ...@@ -60,6 +60,7 @@ struct cred init_cred = {
.user = INIT_USER, .user = INIT_USER,
.user_ns = &init_user_ns, .user_ns = &init_user_ns,
.group_info = &init_groups, .group_info = &init_groups,
.ucounts = &init_ucounts,
}; };
static inline void set_cred_subscribers(struct cred *cred, int n) static inline void set_cred_subscribers(struct cred *cred, int n)
...@@ -119,6 +120,8 @@ static void put_cred_rcu(struct rcu_head *rcu) ...@@ -119,6 +120,8 @@ static void put_cred_rcu(struct rcu_head *rcu)
if (cred->group_info) if (cred->group_info)
put_group_info(cred->group_info); put_group_info(cred->group_info);
free_uid(cred->user); free_uid(cred->user);
if (cred->ucounts)
put_ucounts(cred->ucounts);
put_user_ns(cred->user_ns); put_user_ns(cred->user_ns);
kmem_cache_free(cred_jar, cred); kmem_cache_free(cred_jar, cred);
} }
...@@ -222,6 +225,7 @@ struct cred *cred_alloc_blank(void) ...@@ -222,6 +225,7 @@ struct cred *cred_alloc_blank(void)
#ifdef CONFIG_DEBUG_CREDENTIALS #ifdef CONFIG_DEBUG_CREDENTIALS
new->magic = CRED_MAGIC; new->magic = CRED_MAGIC;
#endif #endif
new->ucounts = get_ucounts(&init_ucounts);
if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0) if (security_cred_alloc_blank(new, GFP_KERNEL_ACCOUNT) < 0)
goto error; goto error;
...@@ -284,6 +288,11 @@ struct cred *prepare_creds(void) ...@@ -284,6 +288,11 @@ struct cred *prepare_creds(void)
if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
goto error; goto error;
new->ucounts = get_ucounts(new->ucounts);
if (!new->ucounts)
goto error;
validate_creds(new); validate_creds(new);
return new; return new;
...@@ -351,7 +360,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) ...@@ -351,7 +360,7 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
kdebug("share_creds(%p{%d,%d})", kdebug("share_creds(%p{%d,%d})",
p->cred, atomic_read(&p->cred->usage), p->cred, atomic_read(&p->cred->usage),
read_cred_subscribers(p->cred)); read_cred_subscribers(p->cred));
atomic_inc(&p->cred->user->processes); inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
return 0; return 0;
} }
...@@ -363,6 +372,9 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) ...@@ -363,6 +372,9 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
ret = create_user_ns(new); ret = create_user_ns(new);
if (ret < 0) if (ret < 0)
goto error_put; goto error_put;
ret = set_cred_ucounts(new);
if (ret < 0)
goto error_put;
} }
#ifdef CONFIG_KEYS #ifdef CONFIG_KEYS
...@@ -384,8 +396,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags) ...@@ -384,8 +396,8 @@ int copy_creds(struct task_struct *p, unsigned long clone_flags)
} }
#endif #endif
atomic_inc(&new->user->processes);
p->cred = p->real_cred = get_cred(new); p->cred = p->real_cred = get_cred(new);
inc_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
alter_cred_subscribers(new, 2); alter_cred_subscribers(new, 2);
validate_creds(new); validate_creds(new);
return 0; return 0;
...@@ -485,12 +497,12 @@ int commit_creds(struct cred *new) ...@@ -485,12 +497,12 @@ int commit_creds(struct cred *new)
* in set_user(). * in set_user().
*/ */
alter_cred_subscribers(new, 2); alter_cred_subscribers(new, 2);
if (new->user != old->user) if (new->user != old->user || new->user_ns != old->user_ns)
atomic_inc(&new->user->processes); inc_rlimit_ucounts(new->ucounts, UCOUNT_RLIMIT_NPROC, 1);
rcu_assign_pointer(task->real_cred, new); rcu_assign_pointer(task->real_cred, new);
rcu_assign_pointer(task->cred, new); rcu_assign_pointer(task->cred, new);
if (new->user != old->user) if (new->user != old->user)
atomic_dec(&old->user->processes); dec_rlimit_ucounts(old->ucounts, UCOUNT_RLIMIT_NPROC, 1);
alter_cred_subscribers(old, -2); alter_cred_subscribers(old, -2);
/* send notifications */ /* send notifications */
...@@ -653,6 +665,31 @@ int cred_fscmp(const struct cred *a, const struct cred *b) ...@@ -653,6 +665,31 @@ int cred_fscmp(const struct cred *a, const struct cred *b)
} }
EXPORT_SYMBOL(cred_fscmp); EXPORT_SYMBOL(cred_fscmp);
/*
 * set_cred_ucounts - make @new reference the ucounts entry matching its
 * user namespace and effective uid.
 *
 * Nothing is done when @new has the same user and user namespace as the
 * current task's real credentials, or when the ucounts entry already
 * attached to @new matches (new->user_ns, new->euid).  Otherwise an entry
 * is obtained from alloc_ucounts() and the previously attached one, if
 * any, is released with put_ucounts().
 *
 * Returns 0 on success and -EAGAIN when alloc_ucounts() fails.  On failure
 * @new->ucounts is left unchanged, so the reference it holds is still
 * visible to whoever later releases @new.
 */
int set_cred_ucounts(struct cred *new)
{
	struct task_struct *task = current;
	const struct cred *old = task->real_cred;
	struct ucounts *new_ucounts, *old_ucounts = new->ucounts;

	if (new->user == old->user && new->user_ns == old->user_ns)
		return 0;

	/*
	 * This optimization is needed because alloc_ucounts() uses locks
	 * for table lookups.
	 */
	if (old_ucounts && old_ucounts->ns == new->user_ns && uid_eq(old_ucounts->uid, new->euid))
		return 0;

	/*
	 * Stage the result in a local: writing a failed (NULL) result
	 * directly into new->ucounts would drop the only pointer to
	 * old_ucounts and leak the reference held on it.
	 */
	new_ucounts = alloc_ucounts(new->user_ns, new->euid);
	if (!new_ucounts)
		return -EAGAIN;

	new->ucounts = new_ucounts;
	if (old_ucounts)
		put_ucounts(old_ucounts);

	return 0;
}
/* /*
* initialise the credentials stuff * initialise the credentials stuff
*/ */
...@@ -719,6 +756,10 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon) ...@@ -719,6 +756,10 @@ struct cred *prepare_kernel_cred(struct task_struct *daemon)
if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0) if (security_prepare_creds(new, old, GFP_KERNEL_ACCOUNT) < 0)
goto error; goto error;
new->ucounts = get_ucounts(new->ucounts);
if (!new->ucounts)
goto error;
put_cred(old); put_cred(old);
validate_creds(new); validate_creds(new);
return new; return new;
......
...@@ -188,7 +188,7 @@ void release_task(struct task_struct *p) ...@@ -188,7 +188,7 @@ void release_task(struct task_struct *p)
/* don't need to get the RCU readlock here - the process is dead and /* don't need to get the RCU readlock here - the process is dead and
* can't be modifying its own credentials. But shut RCU-lockdep up */ * can't be modifying its own credentials. But shut RCU-lockdep up */
rcu_read_lock(); rcu_read_lock();
atomic_dec(&__task_cred(p)->user->processes); dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
rcu_read_unlock(); rcu_read_unlock();
cgroup_release(p); cgroup_release(p);
......
...@@ -825,9 +825,14 @@ void __init fork_init(void) ...@@ -825,9 +825,14 @@ void __init fork_init(void)
init_task.signal->rlim[RLIMIT_SIGPENDING] = init_task.signal->rlim[RLIMIT_SIGPENDING] =
init_task.signal->rlim[RLIMIT_NPROC]; init_task.signal->rlim[RLIMIT_NPROC];
for (i = 0; i < UCOUNT_COUNTS; i++) for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++)
init_user_ns.ucount_max[i] = max_threads/2; init_user_ns.ucount_max[i] = max_threads/2;
set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_NPROC, task_rlimit(&init_task, RLIMIT_NPROC));
set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MSGQUEUE, task_rlimit(&init_task, RLIMIT_MSGQUEUE));
set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_SIGPENDING, task_rlimit(&init_task, RLIMIT_SIGPENDING));
set_rlimit_ucount_max(&init_user_ns, UCOUNT_RLIMIT_MEMLOCK, task_rlimit(&init_task, RLIMIT_MEMLOCK));
#ifdef CONFIG_VMAP_STACK #ifdef CONFIG_VMAP_STACK
cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache", cpuhp_setup_state(CPUHP_BP_PREPARE_DYN, "fork:vm_stack_cache",
NULL, free_vm_stack_cache); NULL, free_vm_stack_cache);
...@@ -1978,8 +1983,7 @@ static __latent_entropy struct task_struct *copy_process( ...@@ -1978,8 +1983,7 @@ static __latent_entropy struct task_struct *copy_process(
DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled); DEBUG_LOCKS_WARN_ON(!p->softirqs_enabled);
#endif #endif
retval = -EAGAIN; retval = -EAGAIN;
if (atomic_read(&p->real_cred->user->processes) >= if (is_ucounts_overlimit(task_ucounts(p), UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC))) {
task_rlimit(p, RLIMIT_NPROC)) {
if (p->real_cred->user != INIT_USER && if (p->real_cred->user != INIT_USER &&
!capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN)) !capable(CAP_SYS_RESOURCE) && !capable(CAP_SYS_ADMIN))
goto bad_fork_free; goto bad_fork_free;
...@@ -2388,7 +2392,7 @@ static __latent_entropy struct task_struct *copy_process( ...@@ -2388,7 +2392,7 @@ static __latent_entropy struct task_struct *copy_process(
#endif #endif
delayacct_tsk_free(p); delayacct_tsk_free(p);
bad_fork_cleanup_count: bad_fork_cleanup_count:
atomic_dec(&p->cred->user->processes); dec_rlimit_ucounts(task_ucounts(p), UCOUNT_RLIMIT_NPROC, 1);
exit_creds(p); exit_creds(p);
bad_fork_free: bad_fork_free:
WRITE_ONCE(p->__state, TASK_DEAD); WRITE_ONCE(p->__state, TASK_DEAD);
...@@ -3001,6 +3005,12 @@ int ksys_unshare(unsigned long unshare_flags) ...@@ -3001,6 +3005,12 @@ int ksys_unshare(unsigned long unshare_flags)
if (err) if (err)
goto bad_unshare_cleanup_cred; goto bad_unshare_cleanup_cred;
if (new_cred) {
err = set_cred_ucounts(new_cred);
if (err)
goto bad_unshare_cleanup_cred;
}
if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) { if (new_fs || new_fd || do_sysvsem || new_cred || new_nsproxy) {
if (do_sysvsem) { if (do_sysvsem) {
/* /*
......
...@@ -412,8 +412,8 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags, ...@@ -412,8 +412,8 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
int override_rlimit, const unsigned int sigqueue_flags) int override_rlimit, const unsigned int sigqueue_flags)
{ {
struct sigqueue *q = NULL; struct sigqueue *q = NULL;
struct user_struct *user; struct ucounts *ucounts = NULL;
int sigpending; long sigpending;
/* /*
* Protect access to @t credentials. This can go away when all * Protect access to @t credentials. This can go away when all
...@@ -424,27 +424,26 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags, ...@@ -424,27 +424,26 @@ __sigqueue_alloc(int sig, struct task_struct *t, gfp_t gfp_flags,
* changes from/to zero. * changes from/to zero.
*/ */
rcu_read_lock(); rcu_read_lock();
user = __task_cred(t)->user; ucounts = task_ucounts(t);
sigpending = atomic_inc_return(&user->sigpending); sigpending = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1);
if (sigpending == 1) if (sigpending == 1)
get_uid(user); ucounts = get_ucounts(ucounts);
rcu_read_unlock(); rcu_read_unlock();
if (override_rlimit || likely(sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) { if (override_rlimit || (sigpending < LONG_MAX && sigpending <= task_rlimit(t, RLIMIT_SIGPENDING))) {
q = kmem_cache_alloc(sigqueue_cachep, gfp_flags); q = kmem_cache_alloc(sigqueue_cachep, gfp_flags);
} else { } else {
print_dropped_signal(sig); print_dropped_signal(sig);
} }
if (unlikely(q == NULL)) { if (unlikely(q == NULL)) {
if (atomic_dec_and_test(&user->sigpending)) if (ucounts && dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_SIGPENDING, 1))
free_uid(user); put_ucounts(ucounts);
} else { } else {
INIT_LIST_HEAD(&q->list); INIT_LIST_HEAD(&q->list);
q->flags = sigqueue_flags; q->flags = sigqueue_flags;
q->user = user; q->ucounts = ucounts;
} }
return q; return q;
} }
...@@ -452,8 +451,10 @@ static void __sigqueue_free(struct sigqueue *q) ...@@ -452,8 +451,10 @@ static void __sigqueue_free(struct sigqueue *q)
{ {
if (q->flags & SIGQUEUE_PREALLOC) if (q->flags & SIGQUEUE_PREALLOC)
return; return;
if (atomic_dec_and_test(&q->user->sigpending)) if (q->ucounts && dec_rlimit_ucounts(q->ucounts, UCOUNT_RLIMIT_SIGPENDING, 1)) {
free_uid(q->user); put_ucounts(q->ucounts);
q->ucounts = NULL;
}
kmem_cache_free(sigqueue_cachep, q); kmem_cache_free(sigqueue_cachep, q);
} }
......
...@@ -479,7 +479,7 @@ static int set_user(struct cred *new) ...@@ -479,7 +479,7 @@ static int set_user(struct cred *new)
* for programs doing set*uid()+execve() by harmlessly deferring the * for programs doing set*uid()+execve() by harmlessly deferring the
* failure to the execve() stage. * failure to the execve() stage.
*/ */
if (atomic_read(&new_user->processes) >= rlimit(RLIMIT_NPROC) && if (is_ucounts_overlimit(new->ucounts, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC)) &&
new_user != INIT_USER) new_user != INIT_USER)
current->flags |= PF_NPROC_EXCEEDED; current->flags |= PF_NPROC_EXCEEDED;
else else
...@@ -558,6 +558,10 @@ long __sys_setreuid(uid_t ruid, uid_t euid) ...@@ -558,6 +558,10 @@ long __sys_setreuid(uid_t ruid, uid_t euid)
if (retval < 0) if (retval < 0)
goto error; goto error;
retval = set_cred_ucounts(new);
if (retval < 0)
goto error;
return commit_creds(new); return commit_creds(new);
error: error:
...@@ -616,6 +620,10 @@ long __sys_setuid(uid_t uid) ...@@ -616,6 +620,10 @@ long __sys_setuid(uid_t uid)
if (retval < 0) if (retval < 0)
goto error; goto error;
retval = set_cred_ucounts(new);
if (retval < 0)
goto error;
return commit_creds(new); return commit_creds(new);
error: error:
...@@ -691,6 +699,10 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid) ...@@ -691,6 +699,10 @@ long __sys_setresuid(uid_t ruid, uid_t euid, uid_t suid)
if (retval < 0) if (retval < 0)
goto error; goto error;
retval = set_cred_ucounts(new);
if (retval < 0)
goto error;
return commit_creds(new); return commit_creds(new);
error: error:
......
...@@ -8,6 +8,12 @@ ...@@ -8,6 +8,12 @@
#include <linux/kmemleak.h> #include <linux/kmemleak.h>
#include <linux/user_namespace.h> #include <linux/user_namespace.h>
/*
 * Statically defined ucounts entry for GLOBAL_ROOT_UID in init_user_ns.
 * Its reference count starts at one.  It is not on the hash table at
 * definition time; user_namespace_sysctl_init() adds it later through
 * hlist_add_ucounts().
 */
struct ucounts init_ucounts = {
.ns = &init_user_ns,
.uid = GLOBAL_ROOT_UID,
.count = ATOMIC_INIT(1),
};
#define UCOUNTS_HASHTABLE_BITS 10 #define UCOUNTS_HASHTABLE_BITS 10
static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)]; static struct hlist_head ucounts_hashtable[(1 << UCOUNTS_HASHTABLE_BITS)];
static DEFINE_SPINLOCK(ucounts_lock); static DEFINE_SPINLOCK(ucounts_lock);
...@@ -78,6 +84,10 @@ static struct ctl_table user_table[] = { ...@@ -78,6 +84,10 @@ static struct ctl_table user_table[] = {
UCOUNT_ENTRY("max_fanotify_groups"), UCOUNT_ENTRY("max_fanotify_groups"),
UCOUNT_ENTRY("max_fanotify_marks"), UCOUNT_ENTRY("max_fanotify_marks"),
#endif #endif
{ },
{ },
{ },
{ },
{ } { }
}; };
#endif /* CONFIG_SYSCTL */ #endif /* CONFIG_SYSCTL */
...@@ -129,7 +139,24 @@ static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struc ...@@ -129,7 +139,24 @@ static struct ucounts *find_ucounts(struct user_namespace *ns, kuid_t uid, struc
return NULL; return NULL;
} }
static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid) static void hlist_add_ucounts(struct ucounts *ucounts)
{
struct hlist_head *hashent = ucounts_hashentry(ucounts->ns, ucounts->uid);
spin_lock_irq(&ucounts_lock);
hlist_add_head(&ucounts->node, hashent);
spin_unlock_irq(&ucounts_lock);
}
/*
 * get_ucounts - take an additional reference on @ucounts
 *
 * Returns @ucounts with its reference count incremented.  Returns NULL when
 * @ucounts is NULL, or when the increment made the count negative; in the
 * latter case the increment is handed back through put_ucounts().
 */
struct ucounts *get_ucounts(struct ucounts *ucounts)
{
	if (!ucounts)
		return NULL;

	/* Common case: the increment left the count non-negative. */
	if (!atomic_add_negative(1, &ucounts->count))
		return ucounts;

	/* The count went negative: give the reference back and report failure. */
	put_ucounts(ucounts);
	return NULL;
}
struct ucounts *alloc_ucounts(struct user_namespace *ns, kuid_t uid)
{ {
struct hlist_head *hashent = ucounts_hashentry(ns, uid); struct hlist_head *hashent = ucounts_hashentry(ns, uid);
struct ucounts *ucounts, *new; struct ucounts *ucounts, *new;
...@@ -145,7 +172,7 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid) ...@@ -145,7 +172,7 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
new->ns = ns; new->ns = ns;
new->uid = uid; new->uid = uid;
new->count = 0; atomic_set(&new->count, 1);
spin_lock_irq(&ucounts_lock); spin_lock_irq(&ucounts_lock);
ucounts = find_ucounts(ns, uid, hashent); ucounts = find_ucounts(ns, uid, hashent);
...@@ -153,40 +180,35 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid) ...@@ -153,40 +180,35 @@ static struct ucounts *get_ucounts(struct user_namespace *ns, kuid_t uid)
kfree(new); kfree(new);
} else { } else {
hlist_add_head(&new->node, hashent); hlist_add_head(&new->node, hashent);
ucounts = new; spin_unlock_irq(&ucounts_lock);
return new;
} }
} }
if (ucounts->count == INT_MAX)
ucounts = NULL;
else
ucounts->count += 1;
spin_unlock_irq(&ucounts_lock); spin_unlock_irq(&ucounts_lock);
ucounts = get_ucounts(ucounts);
return ucounts; return ucounts;
} }
static void put_ucounts(struct ucounts *ucounts) void put_ucounts(struct ucounts *ucounts)
{ {
unsigned long flags; unsigned long flags;
spin_lock_irqsave(&ucounts_lock, flags); if (atomic_dec_and_test(&ucounts->count)) {
ucounts->count -= 1; spin_lock_irqsave(&ucounts_lock, flags);
if (!ucounts->count)
hlist_del_init(&ucounts->node); hlist_del_init(&ucounts->node);
else spin_unlock_irqrestore(&ucounts_lock, flags);
ucounts = NULL; kfree(ucounts);
spin_unlock_irqrestore(&ucounts_lock, flags); }
kfree(ucounts);
} }
static inline bool atomic_inc_below(atomic_t *v, int u) static inline bool atomic_long_inc_below(atomic_long_t *v, int u)
{ {
int c, old; long c, old;
c = atomic_read(v); c = atomic_long_read(v);
for (;;) { for (;;) {
if (unlikely(c >= u)) if (unlikely(c >= u))
return false; return false;
old = atomic_cmpxchg(v, c, c+1); old = atomic_long_cmpxchg(v, c, c+1);
if (likely(old == c)) if (likely(old == c))
return true; return true;
c = old; c = old;
...@@ -198,19 +220,19 @@ struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid, ...@@ -198,19 +220,19 @@ struct ucounts *inc_ucount(struct user_namespace *ns, kuid_t uid,
{ {
struct ucounts *ucounts, *iter, *bad; struct ucounts *ucounts, *iter, *bad;
struct user_namespace *tns; struct user_namespace *tns;
ucounts = get_ucounts(ns, uid); ucounts = alloc_ucounts(ns, uid);
for (iter = ucounts; iter; iter = tns->ucounts) { for (iter = ucounts; iter; iter = tns->ucounts) {
int max; long max;
tns = iter->ns; tns = iter->ns;
max = READ_ONCE(tns->ucount_max[type]); max = READ_ONCE(tns->ucount_max[type]);
if (!atomic_inc_below(&iter->ucount[type], max)) if (!atomic_long_inc_below(&iter->ucount[type], max))
goto fail; goto fail;
} }
return ucounts; return ucounts;
fail: fail:
bad = iter; bad = iter;
for (iter = ucounts; iter != bad; iter = iter->ns->ucounts) for (iter = ucounts; iter != bad; iter = iter->ns->ucounts)
atomic_dec(&iter->ucount[type]); atomic_long_dec(&iter->ucount[type]);
put_ucounts(ucounts); put_ucounts(ucounts);
return NULL; return NULL;
...@@ -220,12 +242,54 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type) ...@@ -220,12 +242,54 @@ void dec_ucount(struct ucounts *ucounts, enum ucount_type type)
{ {
struct ucounts *iter; struct ucounts *iter;
for (iter = ucounts; iter; iter = iter->ns->ucounts) { for (iter = ucounts; iter; iter = iter->ns->ucounts) {
int dec = atomic_dec_if_positive(&iter->ucount[type]); long dec = atomic_long_dec_if_positive(&iter->ucount[type]);
WARN_ON_ONCE(dec < 0); WARN_ON_ONCE(dec < 0);
} }
put_ucounts(ucounts); put_ucounts(ucounts);
} }
/*
 * inc_rlimit_ucounts - charge @v units of @type to @ucounts and its ancestors
 *
 * Walks from @ucounts upwards through iter->ns->ucounts and atomically adds
 * @v to the @type counter of every level.  Nothing is rolled back here, even
 * when a limit is exceeded; callers visible in this change (for example
 * user_shm_lock()) undo a rejected charge with dec_rlimit_ucounts().
 *
 * Returns the new counter value of @ucounts itself when no level overflowed
 * or exceeded its limit, LONG_MAX when at least one level did, and 0 when
 * @ucounts is NULL.
 *
 * NOTE(review): each level is compared against the ucount_max of its own
 * namespace (iter->ns), while create_user_ns() seeds a new namespace's
 * rlimit maxima from the creating task's rlimits -- confirm this is the
 * intended level for the comparison.
 */
long inc_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v)
{
struct ucounts *iter;
long ret = 0;
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
/* Limit is sampled before the add; the add itself is unconditional. */
long max = READ_ONCE(iter->ns->ucount_max[type]);
long new = atomic_long_add_return(v, &iter->ucount[type]);
/* new < 0 means the signed counter wrapped. */
if (new < 0 || new > max)
ret = LONG_MAX;
/* Only the first level can set a real value; later levels can only fail. */
else if (iter == ucounts)
ret = new;
}
return ret;
}
/*
 * dec_rlimit_ucounts - uncharge @v units of @type from @ucounts and its ancestors
 *
 * Walks from @ucounts upwards through iter->ns->ucounts and atomically
 * subtracts @v from the @type counter of every level, warning once if any
 * counter becomes negative.
 *
 * Returns true when the counter of @ucounts itself dropped to exactly zero,
 * false otherwise (including when @ucounts is NULL).
 */
bool dec_rlimit_ucounts(struct ucounts *ucounts, enum ucount_type type, long v)
{
struct ucounts *iter;
long new = -1; /* Silence compiler warning */
for (iter = ucounts; iter; iter = iter->ns->ucounts) {
long dec = atomic_long_add_return(-v, &iter->ucount[type]);
/* A negative result means more was uncharged than had been charged. */
WARN_ON_ONCE(dec < 0);
/* Remember only the value of the level the caller passed in. */
if (iter == ucounts)
new = dec;
}
return (new == 0);
}
/*
 * is_ucounts_overlimit - test whether a @type counter is above its limit
 *
 * Returns true when the @type counter of @ucounts exceeds the caller-supplied
 * @max, or when the counter of @ucounts or of any ancestor (reached through
 * ->ns->ucounts) exceeds the ucount_max[@type] of that level's namespace.
 * Returns false otherwise.  No counter is modified.
 */
bool is_ucounts_overlimit(struct ucounts *ucounts, enum ucount_type type, unsigned long max)
{
	struct ucounts *level = ucounts;

	/* The caller's limit is applied to the starting entry only. */
	if (get_ucounts_value(ucounts, type) > max)
		return true;

	while (level) {
		unsigned long ns_max = READ_ONCE(level->ns->ucount_max[type]);

		if (get_ucounts_value(level, type) > ns_max)
			return true;

		level = level->ns->ucounts;
	}

	return false;
}
static __init int user_namespace_sysctl_init(void) static __init int user_namespace_sysctl_init(void)
{ {
#ifdef CONFIG_SYSCTL #ifdef CONFIG_SYSCTL
...@@ -241,6 +305,8 @@ static __init int user_namespace_sysctl_init(void) ...@@ -241,6 +305,8 @@ static __init int user_namespace_sysctl_init(void)
BUG_ON(!user_header); BUG_ON(!user_header);
BUG_ON(!setup_userns_sysctls(&init_user_ns)); BUG_ON(!setup_userns_sysctls(&init_user_ns));
#endif #endif
hlist_add_ucounts(&init_ucounts);
inc_rlimit_ucounts(&init_ucounts, UCOUNT_RLIMIT_NPROC, 1);
return 0; return 0;
} }
subsys_initcall(user_namespace_sysctl_init); subsys_initcall(user_namespace_sysctl_init);
...@@ -98,9 +98,6 @@ static DEFINE_SPINLOCK(uidhash_lock); ...@@ -98,9 +98,6 @@ static DEFINE_SPINLOCK(uidhash_lock);
/* root_user.__count is 1, for init task cred */ /* root_user.__count is 1, for init task cred */
struct user_struct root_user = { struct user_struct root_user = {
.__count = REFCOUNT_INIT(1), .__count = REFCOUNT_INIT(1),
.processes = ATOMIC_INIT(1),
.sigpending = ATOMIC_INIT(0),
.locked_shm = 0,
.uid = GLOBAL_ROOT_UID, .uid = GLOBAL_ROOT_UID,
.ratelimit = RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0), .ratelimit = RATELIMIT_STATE_INIT(root_user.ratelimit, 0, 0),
}; };
......
...@@ -119,9 +119,13 @@ int create_user_ns(struct cred *new) ...@@ -119,9 +119,13 @@ int create_user_ns(struct cred *new)
ns->owner = owner; ns->owner = owner;
ns->group = group; ns->group = group;
INIT_WORK(&ns->work, free_user_ns); INIT_WORK(&ns->work, free_user_ns);
for (i = 0; i < UCOUNT_COUNTS; i++) { for (i = 0; i < MAX_PER_NAMESPACE_UCOUNTS; i++) {
ns->ucount_max[i] = INT_MAX; ns->ucount_max[i] = INT_MAX;
} }
set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_NPROC, rlimit(RLIMIT_NPROC));
set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MSGQUEUE, rlimit(RLIMIT_MSGQUEUE));
set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_SIGPENDING, rlimit(RLIMIT_SIGPENDING));
set_rlimit_ucount_max(ns, UCOUNT_RLIMIT_MEMLOCK, rlimit(RLIMIT_MEMLOCK));
ns->ucounts = ucounts; ns->ucounts = ucounts;
/* Inherit USERNS_SETGROUPS_ALLOWED from our parent */ /* Inherit USERNS_SETGROUPS_ALLOWED from our parent */
...@@ -1340,6 +1344,9 @@ static int userns_install(struct nsset *nsset, struct ns_common *ns) ...@@ -1340,6 +1344,9 @@ static int userns_install(struct nsset *nsset, struct ns_common *ns)
put_user_ns(cred->user_ns); put_user_ns(cred->user_ns);
set_cred_user_ns(cred, get_user_ns(user_ns)); set_cred_user_ns(cred, get_user_ns(user_ns));
if (set_cred_ucounts(cred) < 0)
return -EINVAL;
return 0; return 0;
} }
......
...@@ -297,9 +297,9 @@ SYSCALL_DEFINE2(memfd_create, ...@@ -297,9 +297,9 @@ SYSCALL_DEFINE2(memfd_create,
} }
if (flags & MFD_HUGETLB) { if (flags & MFD_HUGETLB) {
struct user_struct *user = NULL; struct ucounts *ucounts = NULL;
file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user, file = hugetlb_file_setup(name, 0, VM_NORESERVE, &ucounts,
HUGETLB_ANONHUGE_INODE, HUGETLB_ANONHUGE_INODE,
(flags >> MFD_HUGE_SHIFT) & (flags >> MFD_HUGE_SHIFT) &
MFD_HUGE_MASK); MFD_HUGE_MASK);
......
...@@ -817,9 +817,10 @@ SYSCALL_DEFINE0(munlockall) ...@@ -817,9 +817,10 @@ SYSCALL_DEFINE0(munlockall)
*/ */
static DEFINE_SPINLOCK(shmlock_user_lock); static DEFINE_SPINLOCK(shmlock_user_lock);
int user_shm_lock(size_t size, struct user_struct *user) int user_shm_lock(size_t size, struct ucounts *ucounts)
{ {
unsigned long lock_limit, locked; unsigned long lock_limit, locked;
long memlock;
int allowed = 0; int allowed = 0;
locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT; locked = (size + PAGE_SIZE - 1) >> PAGE_SHIFT;
...@@ -828,21 +829,26 @@ int user_shm_lock(size_t size, struct user_struct *user) ...@@ -828,21 +829,26 @@ int user_shm_lock(size_t size, struct user_struct *user)
allowed = 1; allowed = 1;
lock_limit >>= PAGE_SHIFT; lock_limit >>= PAGE_SHIFT;
spin_lock(&shmlock_user_lock); spin_lock(&shmlock_user_lock);
if (!allowed && memlock = inc_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
locked + user->locked_shm > lock_limit && !capable(CAP_IPC_LOCK))
if (!allowed && (memlock == LONG_MAX || memlock > lock_limit) && !capable(CAP_IPC_LOCK)) {
dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
goto out;
}
if (!get_ucounts(ucounts)) {
dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, locked);
goto out; goto out;
get_uid(user); }
user->locked_shm += locked;
allowed = 1; allowed = 1;
out: out:
spin_unlock(&shmlock_user_lock); spin_unlock(&shmlock_user_lock);
return allowed; return allowed;
} }
void user_shm_unlock(size_t size, struct user_struct *user) void user_shm_unlock(size_t size, struct ucounts *ucounts)
{ {
spin_lock(&shmlock_user_lock); spin_lock(&shmlock_user_lock);
user->locked_shm -= (size + PAGE_SIZE - 1) >> PAGE_SHIFT; dec_rlimit_ucounts(ucounts, UCOUNT_RLIMIT_MEMLOCK, (size + PAGE_SIZE - 1) >> PAGE_SHIFT);
spin_unlock(&shmlock_user_lock); spin_unlock(&shmlock_user_lock);
free_uid(user); put_ucounts(ucounts);
} }
...@@ -1611,7 +1611,7 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, ...@@ -1611,7 +1611,7 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
goto out_fput; goto out_fput;
} }
} else if (flags & MAP_HUGETLB) { } else if (flags & MAP_HUGETLB) {
struct user_struct *user = NULL; struct ucounts *ucounts = NULL;
struct hstate *hs; struct hstate *hs;
hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); hs = hstate_sizelog((flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
...@@ -1627,7 +1627,7 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len, ...@@ -1627,7 +1627,7 @@ unsigned long ksys_mmap_pgoff(unsigned long addr, unsigned long len,
*/ */
file = hugetlb_file_setup(HUGETLB_ANON_FILE, len, file = hugetlb_file_setup(HUGETLB_ANON_FILE, len,
VM_NORESERVE, VM_NORESERVE,
&user, HUGETLB_ANONHUGE_INODE, &ucounts, HUGETLB_ANONHUGE_INODE,
(flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK); (flags >> MAP_HUGE_SHIFT) & MAP_HUGE_MASK);
if (IS_ERR(file)) if (IS_ERR(file))
return PTR_ERR(file); return PTR_ERR(file);
......
...@@ -2227,7 +2227,7 @@ static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma, ...@@ -2227,7 +2227,7 @@ static struct mempolicy *shmem_get_policy(struct vm_area_struct *vma,
} }
#endif #endif
int shmem_lock(struct file *file, int lock, struct user_struct *user) int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
{ {
struct inode *inode = file_inode(file); struct inode *inode = file_inode(file);
struct shmem_inode_info *info = SHMEM_I(inode); struct shmem_inode_info *info = SHMEM_I(inode);
...@@ -2239,13 +2239,13 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user) ...@@ -2239,13 +2239,13 @@ int shmem_lock(struct file *file, int lock, struct user_struct *user)
* no serialization needed when called from shm_destroy(). * no serialization needed when called from shm_destroy().
*/ */
if (lock && !(info->flags & VM_LOCKED)) { if (lock && !(info->flags & VM_LOCKED)) {
if (!user_shm_lock(inode->i_size, user)) if (!user_shm_lock(inode->i_size, ucounts))
goto out_nomem; goto out_nomem;
info->flags |= VM_LOCKED; info->flags |= VM_LOCKED;
mapping_set_unevictable(file->f_mapping); mapping_set_unevictable(file->f_mapping);
} }
if (!lock && (info->flags & VM_LOCKED) && user) { if (!lock && (info->flags & VM_LOCKED) && ucounts) {
user_shm_unlock(inode->i_size, user); user_shm_unlock(inode->i_size, ucounts);
info->flags &= ~VM_LOCKED; info->flags &= ~VM_LOCKED;
mapping_clear_unevictable(file->f_mapping); mapping_clear_unevictable(file->f_mapping);
} }
...@@ -4092,7 +4092,7 @@ int shmem_unuse(unsigned int type, bool frontswap, ...@@ -4092,7 +4092,7 @@ int shmem_unuse(unsigned int type, bool frontswap,
return 0; return 0;
} }
int shmem_lock(struct file *file, int lock, struct user_struct *user) int shmem_lock(struct file *file, int lock, struct ucounts *ucounts)
{ {
return 0; return 0;
} }
......
...@@ -49,6 +49,7 @@ TARGETS += proc ...@@ -49,6 +49,7 @@ TARGETS += proc
TARGETS += pstore TARGETS += pstore
TARGETS += ptrace TARGETS += ptrace
TARGETS += openat2 TARGETS += openat2
TARGETS += rlimits
TARGETS += rseq TARGETS += rseq
TARGETS += rtc TARGETS += rtc
TARGETS += seccomp TARGETS += seccomp
......
# SPDX-License-Identifier: GPL-2.0-only
rlimits-per-userns
# SPDX-License-Identifier: GPL-2.0-or-later
CFLAGS += -Wall -O2 -g
TEST_GEN_PROGS := rlimits-per-userns
include ../lib.mk
// SPDX-License-Identifier: GPL-2.0-or-later
/*
* Author: Alexey Gladkov <gladkov.alexey@gmail.com>
*/
#define _GNU_SOURCE
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/prctl.h>
#include <sys/stat.h>
#include <unistd.h>
#include <stdlib.h>
#include <stdio.h>
#include <string.h>
#include <sched.h>
#include <signal.h>
#include <limits.h>
#include <fcntl.h>
#include <errno.h>
#include <err.h>
#define NR_CHILDS 2
static char *service_prog;
static uid_t user = 60000;
static uid_t group = 60000;
/*
 * setrlimit_nproc() - set both the soft and the hard RLIMIT_NPROC to @n
 * for the calling process.
 *
 * Logs the new value and terminates the test with EXIT_FAILURE if
 * setrlimit() fails.
 */
static void setrlimit_nproc(rlim_t n)
{
	pid_t pid = getpid();
	struct rlimit limit = {
		.rlim_cur = n,
		.rlim_max = n
	};

	/*
	 * rlim_t is an unsigned type whose width depends on the ABI, so it is
	 * widened explicitly instead of being passed to a signed long format.
	 */
	warnx("(pid=%d): Setting RLIMIT_NPROC=%llu", pid, (unsigned long long)n);

	if (setrlimit(RLIMIT_NPROC, &limit) < 0)
		err(EXIT_FAILURE, "(pid=%d): setrlimit(RLIMIT_NPROC)", pid);
}
/*
 * fork_child() - start one unprivileged "service" process.
 *
 * Parent: returns the pid of the new child; a failed fork() terminates the
 * test with EXIT_FAILURE.
 *
 * Child: never returns.  It resets SIGUSR1 to its default action, switches
 * to the test gid/uid, unshares a new user namespace and re-executes
 * service_prog with I_AM_SERVICE=1 in its environment.  Any failing step
 * terminates the child with EXIT_FAILURE.
 */
static pid_t fork_child(void)
{
	pid_t pid = fork();

	if (pid < 0)
		err(EXIT_FAILURE, "fork");

	if (pid > 0)
		return pid;

	pid = getpid();

	warnx("(pid=%d): New process starting ...", pid);

	signal(SIGUSR1, SIG_DFL);

	/* uid_t is unsigned, so print it with an unsigned conversion. */
	warnx("(pid=%d): Changing to uid=%u, gid=%u", pid,
	      (unsigned int)user, (unsigned int)group);

	if (setgid(group) < 0)
		err(EXIT_FAILURE, "(pid=%d): setgid(%u)", pid, (unsigned int)group);

	if (setuid(user) < 0)
		err(EXIT_FAILURE, "(pid=%d): setuid(%u)", pid, (unsigned int)user);

	warnx("(pid=%d): Service running ...", pid);

	warnx("(pid=%d): Unshare user namespace", pid);
	if (unshare(CLONE_NEWUSER) < 0)
		err(EXIT_FAILURE, "unshare(CLONE_NEWUSER)");

	/*
	 * The kernel clears the parent-death signal whenever the effective or
	 * filesystem uid/gid changes, so it has to be requested after the
	 * credential changes above; requesting it earlier is silently undone
	 * by setgid()/setuid().
	 */
	if (prctl(PR_SET_PDEATHSIG, SIGKILL) < 0)
		err(EXIT_FAILURE, "(pid=%d): prctl(PR_SET_PDEATHSIG)", pid);

	char *const argv[] = { "service", NULL };
	char *const envp[] = { "I_AM_SERVICE=1", NULL };

	warnx("(pid=%d): Executing real service ...", pid);

	execve(service_prog, argv, envp);
	err(EXIT_FAILURE, "(pid=%d): execve", pid);
}
/*
 * Test driver.
 *
 * When I_AM_SERVICE is present in the environment this is the re-executed
 * service copy: it just waits for a signal.  Otherwise it lowers
 * RLIMIT_NPROC to 1, starts NR_CHILDS services through fork_child(), then
 * repeatedly sends SIGUSR1 to the ones still alive and reaps them.  The test
 * passes only if every child was terminated by SIGUSR1.
 */
int main(int argc, char **argv)
{
	pid_t kids[NR_CHILDS];
	int status[NR_CHILDS];
	int remaining = NR_CHILDS;
	pid_t self;
	size_t k;

	if (getenv("I_AM_SERVICE") != NULL) {
		pause();
		exit(EXIT_SUCCESS);
	}

	service_prog = argv[0];
	self = getpid();

	warnx("(pid=%d) Starting testcase", self);

	/*
	 * This rlimit is not a problem for root because it can be exceeded.
	 */
	setrlimit_nproc(1);

	k = 0;
	while (k < NR_CHILDS) {
		kids[k] = fork_child();
		status[k] = 0;
		usleep(250000);
		k++;
	}

	for (;;) {
		/* Reap every child that has terminated since the last pass. */
		for (k = 0; k < NR_CHILDS; k++) {
			pid_t reaped;

			/* A non-positive slot marks a child already accounted for. */
			if (kids[k] <= 0)
				continue;

			errno = 0;
			reaped = waitpid(kids[k], &status[k], WNOHANG);

			if (reaped == 0)
				continue;
			if (!(WIFEXITED(status[k]) || WIFSIGNALED(status[k])))
				continue;

			if (reaped < 0 && errno != ECHILD)
				warn("(pid=%d): waitpid(%d)", self, kids[k]);

			/* Keep the pid for the report, negated to mark it done. */
			kids[k] = -kids[k];
			remaining--;
		}

		if (remaining == 0)
			break;

		usleep(250000);

		/* Ask the survivors to terminate. */
		for (k = 0; k < NR_CHILDS; k++) {
			if (kids[k] > 0)
				kill(kids[k], SIGUSR1);
		}
	}

	for (k = 0; k < NR_CHILDS; k++) {
		if (WIFEXITED(status[k]))
			warnx("(pid=%d): pid %d exited, status=%d",
			      self, -kids[k], WEXITSTATUS(status[k]));
		else if (WIFSIGNALED(status[k]))
			warnx("(pid=%d): pid %d killed by signal %d",
			      self, -kids[k], WTERMSIG(status[k]));

		/* Anything other than death by SIGUSR1 fails the whole test. */
		if (!WIFSIGNALED(status[k]) || WTERMSIG(status[k]) != SIGUSR1) {
			warnx("(pid=%d): Test failed", self);
			exit(EXIT_FAILURE);
		}
	}

	warnx("(pid=%d): Test passed", self);
	exit(EXIT_SUCCESS);
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment