Commit 0acefef5 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'threads-v5.5' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux

Pull thread management updates from Christian Brauner:

 - A pidfd's fdinfo file currently contains the field "Pid:\t<pid>"
   where <pid> is the pid of the process in the pid namespace of the
   procfs instance the fdinfo file for the pidfd was opened in.

   The fdinfo file has now gained a new "NSpid:\t<ns-pid1>[\t<ns-pid2>[...]]"
   field which lists the pids of the process in all child pid namespaces
   provided the pid namespace of the procfs instance it is looked up
   under has an ancestoral relationship with the pid namespace of the
   process. If it does not 0 will be shown and no further pid namespaces
   will be listed. Tests included. (Christian Kellner)

 - If the process the pidfd references has already exited, print -1 for
   the Pid and NSpid fields in the pidfd's fdinfo file. Tests included.
   (me)

 - Add CLONE_CLEAR_SIGHAND. This lets callers clear all signal handler
   that are not SIG_DFL or SIG_IGN at process creation time. This
   originated as a feature request from glibc to improve performance and
   elimate races in their posix_spawn() implementation. Tests included.
   (me)

 - Add support for choosing a specific pid for a process with clone3().
   This is the feature which was part of the thread update for v5.4 but
   after a discussion at LPC in Lisbon we decided to delay it for one
   more cycle in order to make the interface more generic. This has now
   done. It is now possible to choose a specific pid in a whole pid
   namespaces (sub)hierarchy instead of just one pid namespace. In order
   to choose a specific pid the caller must have CAP_SYS_ADMIN in all
   owning user namespaces of the target pid namespaces. Tests included.
   (Adrian Reber)

 - Test improvements and extensions. (Andrei Vagin, me)

* tag 'threads-v5.5' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux:
  selftests/clone3: skip if clone3() is ENOSYS
  selftests/clone3: check that all pids are released on error paths
  selftests/clone3: report a correct number of fails
  selftests/clone3: flush stdout and stderr before clone3() and _exit()
  selftests: add tests for clone3() with *set_tid
  fork: extend clone3() to support setting a PID
  selftests: add tests for clone3()
  tests: test CLONE_CLEAR_SIGHAND
  clone3: add CLONE_CLEAR_SIGHAND
  pid: use pid_has_task() in pidfd_open()
  exit: use pid_has_task() in do_wait()
  pid: use pid_has_task() in __change_pid()
  test: verify fdinfo for pidfd of reaped process
  pidfd: check pid has attached task in fdinfo
  pidfd: add tests for NSpid info in fdinfo
  pidfd: add NSpid entries to fdinfo
parents 9c91e6a5 11fde161
......@@ -12861,6 +12861,7 @@ S: Maintained
T: git git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux.git
F: samples/pidfd/
F: tools/testing/selftests/pidfd/
F: tools/testing/selftests/clone3/
K: (?i)pidfd
K: (?i)clone3
K: \b(clone_args|kernel_clone_args)\b
......
......@@ -85,6 +85,10 @@ static inline struct pid *get_pid(struct pid *pid)
extern void put_pid(struct pid *pid);
extern struct task_struct *pid_task(struct pid *pid, enum pid_type);
static inline bool pid_has_task(struct pid *pid, enum pid_type type)
{
return !hlist_empty(&pid->tasks[type]);
}
extern struct task_struct *get_pid_task(struct pid *pid, enum pid_type);
extern struct pid *get_task_pid(struct task_struct *task, enum pid_type type);
......@@ -120,7 +124,8 @@ extern struct pid *find_vpid(int nr);
extern struct pid *find_get_pid(int nr);
extern struct pid *find_ge_pid(int nr, struct pid_namespace *);
extern struct pid *alloc_pid(struct pid_namespace *ns);
extern struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
size_t set_tid_size);
extern void free_pid(struct pid *pid);
extern void disable_pid_allocation(struct pid_namespace *ns);
......
......@@ -12,6 +12,8 @@
#include <linux/ns_common.h>
#include <linux/idr.h>
/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
#define MAX_PID_NS_LEVEL 32
struct fs_pin;
......
......@@ -26,6 +26,9 @@ struct kernel_clone_args {
unsigned long stack;
unsigned long stack_size;
unsigned long tls;
pid_t *set_tid;
/* Number of elements in *set_tid */
size_t set_tid_size;
};
/*
......
......@@ -33,31 +33,48 @@
#define CLONE_NEWNET 0x40000000 /* New network namespace */
#define CLONE_IO 0x80000000 /* Clone io context */
/* Flags for the clone3() syscall. */
#define CLONE_CLEAR_SIGHAND 0x100000000ULL /* Clear any signal handler and reset to SIG_DFL. */
#ifndef __ASSEMBLY__
/**
* struct clone_args - arguments for the clone3 syscall
* @flags: Flags for the new process as listed above.
* All flags are valid except for CSIGNAL and
* CLONE_DETACHED.
* @pidfd: If CLONE_PIDFD is set, a pidfd will be
* returned in this argument.
* @child_tid: If CLONE_CHILD_SETTID is set, the TID of the
* child process will be returned in the child's
* memory.
* @parent_tid: If CLONE_PARENT_SETTID is set, the TID of
* the child process will be returned in the
* parent's memory.
* @exit_signal: The exit_signal the parent process will be
* sent when the child exits.
* @stack: Specify the location of the stack for the
* child process.
* Note, @stack is expected to point to the
* lowest address. The stack direction will be
* determined by the kernel and set up
* appropriately based on @stack_size.
* @stack_size: The size of the stack for the child process.
* @tls: If CLONE_SETTLS is set, the tls descriptor
* is set to tls.
* @flags: Flags for the new process as listed above.
* All flags are valid except for CSIGNAL and
* CLONE_DETACHED.
* @pidfd: If CLONE_PIDFD is set, a pidfd will be
* returned in this argument.
* @child_tid: If CLONE_CHILD_SETTID is set, the TID of the
* child process will be returned in the child's
* memory.
* @parent_tid: If CLONE_PARENT_SETTID is set, the TID of
* the child process will be returned in the
* parent's memory.
* @exit_signal: The exit_signal the parent process will be
* sent when the child exits.
* @stack: Specify the location of the stack for the
* child process.
* Note, @stack is expected to point to the
* lowest address. The stack direction will be
* determined by the kernel and set up
* appropriately based on @stack_size.
* @stack_size: The size of the stack for the child process.
* @tls: If CLONE_SETTLS is set, the tls descriptor
* is set to tls.
* @set_tid: Pointer to an array of type *pid_t. The size
* of the array is defined using @set_tid_size.
* This array is used to select PIDs/TIDs for
* newly created processes. The first element in
* this defines the PID in the most nested PID
* namespace. Each additional element in the array
* defines the PID in the parent PID namespace of
* the original PID namespace. If the array has
* less entries than the number of currently
* nested PID namespaces only the PIDs in the
* corresponding namespaces are set.
* @set_tid_size: This defines the size of the array referenced
* in @set_tid. This cannot be larger than the
* kernel's limit of nested PID namespaces.
*
* The structure is versioned by size and thus extensible.
* New struct members must go at the end of the struct and
......@@ -72,10 +89,13 @@ struct clone_args {
__aligned_u64 stack;
__aligned_u64 stack_size;
__aligned_u64 tls;
__aligned_u64 set_tid;
__aligned_u64 set_tid_size;
};
#endif
#define CLONE_ARGS_SIZE_VER0 64 /* sizeof first published struct */
#define CLONE_ARGS_SIZE_VER1 80 /* sizeof second published struct */
/*
* Scheduling policies
......
......@@ -1457,7 +1457,7 @@ static long do_wait(struct wait_opts *wo)
*/
wo->notask_error = -ECHILD;
if ((wo->wo_type < PIDTYPE_MAX) &&
(!wo->wo_pid || hlist_empty(&wo->wo_pid->tasks[wo->wo_type])))
(!wo->wo_pid || !pid_has_task(wo->wo_pid, wo->wo_type)))
goto notask;
set_current_state(TASK_INTERRUPTIBLE);
......
......@@ -1517,6 +1517,11 @@ static int copy_sighand(unsigned long clone_flags, struct task_struct *tsk)
spin_lock_irq(&current->sighand->siglock);
memcpy(sig->action, current->sighand->action, sizeof(sig->action));
spin_unlock_irq(&current->sighand->siglock);
/* Reset all signal handler not set to SIG_IGN to SIG_DFL. */
if (clone_flags & CLONE_CLEAR_SIGHAND)
flush_signal_handlers(tsk, 0);
return 0;
}
......@@ -1695,12 +1700,68 @@ static int pidfd_release(struct inode *inode, struct file *file)
}
#ifdef CONFIG_PROC_FS
/**
* pidfd_show_fdinfo - print information about a pidfd
* @m: proc fdinfo file
* @f: file referencing a pidfd
*
* Pid:
* This function will print the pid that a given pidfd refers to in the
* pid namespace of the procfs instance.
* If the pid namespace of the process is not a descendant of the pid
* namespace of the procfs instance 0 will be shown as its pid. This is
* similar to calling getppid() on a process whose parent is outside of
* its pid namespace.
*
* NSpid:
* If pid namespaces are supported then this function will also print
* the pid of a given pidfd refers to for all descendant pid namespaces
* starting from the current pid namespace of the instance, i.e. the
* Pid field and the first entry in the NSpid field will be identical.
* If the pid namespace of the process is not a descendant of the pid
* namespace of the procfs instance 0 will be shown as its first NSpid
* entry and no others will be shown.
* Note that this differs from the Pid and NSpid fields in
* /proc/<pid>/status where Pid and NSpid are always shown relative to
* the pid namespace of the procfs instance. The difference becomes
* obvious when sending around a pidfd between pid namespaces from a
* different branch of the tree, i.e. where no ancestoral relation is
* present between the pid namespaces:
* - create two new pid namespaces ns1 and ns2 in the initial pid
* namespace (also take care to create new mount namespaces in the
* new pid namespace and mount procfs)
* - create a process with a pidfd in ns1
* - send pidfd from ns1 to ns2
* - read /proc/self/fdinfo/<pidfd> and observe that both Pid and NSpid
* have exactly one entry, which is 0
*/
static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
{
struct pid_namespace *ns = proc_pid_ns(file_inode(m->file));
struct pid *pid = f->private_data;
struct pid_namespace *ns;
pid_t nr = -1;
if (likely(pid_has_task(pid, PIDTYPE_PID))) {
ns = proc_pid_ns(file_inode(m->file));
nr = pid_nr_ns(pid, ns);
}
seq_put_decimal_ll(m, "Pid:\t", nr);
#ifdef CONFIG_PID_NS
seq_put_decimal_ll(m, "\nNSpid:\t", nr);
if (nr > 0) {
int i;
seq_put_decimal_ull(m, "Pid:\t", pid_nr_ns(pid, ns));
/* If nr is non-zero it means that 'pid' is valid and that
* ns, i.e. the pid namespace associated with the procfs
* instance, is in the pid namespace hierarchy of pid.
* Start at one below the already printed level.
*/
for (i = ns->level + 1; i <= pid->level; i++)
seq_put_decimal_ll(m, "\t", pid->numbers[i].nr);
}
#endif
seq_putc(m, '\n');
}
#endif
......@@ -2026,7 +2087,8 @@ static __latent_entropy struct task_struct *copy_process(
stackleak_task_init(p);
if (pid != &init_struct_pid) {
pid = alloc_pid(p->nsproxy->pid_ns_for_children);
pid = alloc_pid(p->nsproxy->pid_ns_for_children, args->set_tid,
args->set_tid_size);
if (IS_ERR(pid)) {
retval = PTR_ERR(pid);
goto bad_fork_cleanup_thread;
......@@ -2529,6 +2591,7 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
{
int err;
struct clone_args args;
pid_t *kset_tid = kargs->set_tid;
if (unlikely(usize > PAGE_SIZE))
return -E2BIG;
......@@ -2539,6 +2602,15 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
if (err)
return err;
if (unlikely(args.set_tid_size > MAX_PID_NS_LEVEL))
return -EINVAL;
if (unlikely(!args.set_tid && args.set_tid_size > 0))
return -EINVAL;
if (unlikely(args.set_tid && args.set_tid_size == 0))
return -EINVAL;
/*
* Verify that higher 32bits of exit_signal are unset and that
* it is a valid signal
......@@ -2556,8 +2628,16 @@ noinline static int copy_clone_args_from_user(struct kernel_clone_args *kargs,
.stack = args.stack,
.stack_size = args.stack_size,
.tls = args.tls,
.set_tid_size = args.set_tid_size,
};
if (args.set_tid &&
copy_from_user(kset_tid, u64_to_user_ptr(args.set_tid),
(kargs->set_tid_size * sizeof(pid_t))))
return -EFAULT;
kargs->set_tid = kset_tid;
return 0;
}
......@@ -2591,11 +2671,8 @@ static inline bool clone3_stack_valid(struct kernel_clone_args *kargs)
static bool clone3_args_valid(struct kernel_clone_args *kargs)
{
/*
* All lower bits of the flag word are taken.
* Verify that no other unknown flags are passed along.
*/
if (kargs->flags & ~CLONE_LEGACY_FLAGS)
/* Verify that no unknown flags are passed along. */
if (kargs->flags & ~(CLONE_LEGACY_FLAGS | CLONE_CLEAR_SIGHAND))
return false;
/*
......@@ -2605,6 +2682,10 @@ static bool clone3_args_valid(struct kernel_clone_args *kargs)
if (kargs->flags & (CLONE_DETACHED | CSIGNAL))
return false;
if ((kargs->flags & (CLONE_SIGHAND | CLONE_CLEAR_SIGHAND)) ==
(CLONE_SIGHAND | CLONE_CLEAR_SIGHAND))
return false;
if ((kargs->flags & (CLONE_THREAD | CLONE_PARENT)) &&
kargs->exit_signal)
return false;
......@@ -2631,6 +2712,9 @@ SYSCALL_DEFINE2(clone3, struct clone_args __user *, uargs, size_t, size)
int err;
struct kernel_clone_args kargs;
pid_t set_tid[MAX_PID_NS_LEVEL];
kargs.set_tid = set_tid;
err = copy_clone_args_from_user(&kargs, uargs, size);
if (err)
......
......@@ -157,7 +157,8 @@ void free_pid(struct pid *pid)
call_rcu(&pid->rcu, delayed_put_pid);
}
struct pid *alloc_pid(struct pid_namespace *ns)
struct pid *alloc_pid(struct pid_namespace *ns, pid_t *set_tid,
size_t set_tid_size)
{
struct pid *pid;
enum pid_type type;
......@@ -166,6 +167,17 @@ struct pid *alloc_pid(struct pid_namespace *ns)
struct upid *upid;
int retval = -ENOMEM;
/*
* set_tid_size contains the size of the set_tid array. Starting at
* the most nested currently active PID namespace it tells alloc_pid()
* which PID to set for a process in that most nested PID namespace
* up to set_tid_size PID namespaces. It does not have to set the PID
* for a process in all nested PID namespaces but set_tid_size must
* never be greater than the current ns->level + 1.
*/
if (set_tid_size > ns->level + 1)
return ERR_PTR(-EINVAL);
pid = kmem_cache_alloc(ns->pid_cachep, GFP_KERNEL);
if (!pid)
return ERR_PTR(retval);
......@@ -174,24 +186,54 @@ struct pid *alloc_pid(struct pid_namespace *ns)
pid->level = ns->level;
for (i = ns->level; i >= 0; i--) {
int pid_min = 1;
int tid = 0;
if (set_tid_size) {
tid = set_tid[ns->level - i];
retval = -EINVAL;
if (tid < 1 || tid >= pid_max)
goto out_free;
/*
* Also fail if a PID != 1 is requested and
* no PID 1 exists.
*/
if (tid != 1 && !tmp->child_reaper)
goto out_free;
retval = -EPERM;
if (!ns_capable(tmp->user_ns, CAP_SYS_ADMIN))
goto out_free;
set_tid_size--;
}
idr_preload(GFP_KERNEL);
spin_lock_irq(&pidmap_lock);
/*
* init really needs pid 1, but after reaching the maximum
* wrap back to RESERVED_PIDS
*/
if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
pid_min = RESERVED_PIDS;
/*
* Store a null pointer so find_pid_ns does not find
* a partially initialized PID (see below).
*/
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
pid_max, GFP_ATOMIC);
if (tid) {
nr = idr_alloc(&tmp->idr, NULL, tid,
tid + 1, GFP_ATOMIC);
/*
* If ENOSPC is returned it means that the PID is
* alreay in use. Return EEXIST in that case.
*/
if (nr == -ENOSPC)
nr = -EEXIST;
} else {
int pid_min = 1;
/*
* init really needs pid 1, but after reaching the
* maximum wrap back to RESERVED_PIDS
*/
if (idr_get_cursor(&tmp->idr) > RESERVED_PIDS)
pid_min = RESERVED_PIDS;
/*
* Store a null pointer so find_pid_ns does not find
* a partially initialized PID (see below).
*/
nr = idr_alloc_cyclic(&tmp->idr, NULL, pid_min,
pid_max, GFP_ATOMIC);
}
spin_unlock_irq(&pidmap_lock);
idr_preload_end();
......@@ -299,7 +341,7 @@ static void __change_pid(struct task_struct *task, enum pid_type type,
*pid_ptr = new;
for (tmp = PIDTYPE_MAX; --tmp >= 0; )
if (!hlist_empty(&pid->tasks[tmp]))
if (pid_has_task(pid, tmp))
return;
free_pid(pid);
......@@ -497,7 +539,7 @@ static int pidfd_create(struct pid *pid)
*/
SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
{
int fd, ret;
int fd;
struct pid *p;
if (flags)
......@@ -510,13 +552,11 @@ SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
if (!p)
return -ESRCH;
ret = 0;
rcu_read_lock();
if (!pid_task(p, PIDTYPE_TGID))
ret = -EINVAL;
rcu_read_unlock();
if (pid_has_task(p, PIDTYPE_TGID))
fd = pidfd_create(p);
else
fd = -EINVAL;
fd = ret ?: pidfd_create(p);
put_pid(p);
return fd;
}
......
......@@ -26,8 +26,6 @@
static DEFINE_MUTEX(pid_caches_mutex);
static struct kmem_cache *pid_ns_cachep;
/* MAX_PID_NS_LEVEL is needed for limiting size of 'struct pid' */
#define MAX_PID_NS_LEVEL 32
/* Write once array, filled from the beginning. */
static struct kmem_cache *pid_cache[MAX_PID_NS_LEVEL];
......
......@@ -5,6 +5,7 @@ TARGETS += bpf
TARGETS += breakpoints
TARGETS += capabilities
TARGETS += cgroup
TARGETS += clone3
TARGETS += cpufreq
TARGETS += cpu-hotplug
TARGETS += drivers/dma-buf
......
clone3
clone3_clear_sighand
clone3_set_tid
# SPDX-License-Identifier: GPL-2.0
CFLAGS += -g -I../../../../usr/include/
TEST_GEN_PROGS := clone3 clone3_clear_sighand clone3_set_tid
include ../lib.mk
// SPDX-License-Identifier: GPL-2.0
/* Based on Christian Brauner's clone3() example */
#define _GNU_SOURCE
#include <errno.h>
#include <inttypes.h>
#include <linux/types.h>
#include <linux/sched.h>
#include <stdint.h>
#include <stdio.h>
#include <stdlib.h>
#include <sys/syscall.h>
#include <sys/types.h>
#include <sys/un.h>
#include <sys/wait.h>
#include <unistd.h>
#include <sched.h>
#include "../kselftest.h"
#include "clone3_selftests.h"
/*
* Different sizes of struct clone_args
*/
#ifndef CLONE3_ARGS_SIZE_V0
#define CLONE3_ARGS_SIZE_V0 64
#endif
enum test_mode {
CLONE3_ARGS_NO_TEST,
CLONE3_ARGS_ALL_0,
CLONE3_ARGS_INVAL_EXIT_SIGNAL_BIG,
CLONE3_ARGS_INVAL_EXIT_SIGNAL_NEG,
CLONE3_ARGS_INVAL_EXIT_SIGNAL_CSIG,
CLONE3_ARGS_INVAL_EXIT_SIGNAL_NSIG,
};
static int call_clone3(uint64_t flags, size_t size, enum test_mode test_mode)
{
struct clone_args args = {
.flags = flags,
.exit_signal = SIGCHLD,
};
struct clone_args_extended {
struct clone_args args;
__aligned_u64 excess_space[2];
} args_ext;
pid_t pid = -1;
int status;
memset(&args_ext, 0, sizeof(args_ext));
if (size > sizeof(struct clone_args))
args_ext.excess_space[1] = 1;
if (size == 0)
size = sizeof(struct clone_args);
switch (test_mode) {
case CLONE3_ARGS_ALL_0:
args.flags = 0;
args.exit_signal = 0;
break;
case CLONE3_ARGS_INVAL_EXIT_SIGNAL_BIG:
args.exit_signal = 0xbadc0ded00000000ULL;
break;
case CLONE3_ARGS_INVAL_EXIT_SIGNAL_NEG:
args.exit_signal = 0x0000000080000000ULL;
break;
case CLONE3_ARGS_INVAL_EXIT_SIGNAL_CSIG:
args.exit_signal = 0x0000000000000100ULL;
break;
case CLONE3_ARGS_INVAL_EXIT_SIGNAL_NSIG:
args.exit_signal = 0x00000000000000f0ULL;
break;
}
memcpy(&args_ext.args, &args, sizeof(struct clone_args));
pid = sys_clone3((struct clone_args *)&args_ext, size);
if (pid < 0) {
ksft_print_msg("%s - Failed to create new process\n",
strerror(errno));
return -errno;
}
if (pid == 0) {
ksft_print_msg("I am the child, my PID is %d\n", getpid());
_exit(EXIT_SUCCESS);
}
ksft_print_msg("I am the parent (%d). My child's pid is %d\n",
getpid(), pid);
if (waitpid(-1, &status, __WALL) < 0) {
ksft_print_msg("Child returned %s\n", strerror(errno));
return -errno;
}
if (WEXITSTATUS(status))
return WEXITSTATUS(status);
return 0;
}
static void test_clone3(uint64_t flags, size_t size, int expected,
enum test_mode test_mode)
{
int ret;
ksft_print_msg(
"[%d] Trying clone3() with flags %#" PRIx64 " (size %zu)\n",
getpid(), flags, size);
ret = call_clone3(flags, size, test_mode);
ksft_print_msg("[%d] clone3() with flags says: %d expected %d\n",
getpid(), ret, expected);
if (ret != expected)
ksft_test_result_fail(
"[%d] Result (%d) is different than expected (%d)\n",
getpid(), ret, expected);
else
ksft_test_result_pass(
"[%d] Result (%d) matches expectation (%d)\n",
getpid(), ret, expected);
}
int main(int argc, char *argv[])
{
pid_t pid;
uid_t uid = getuid();
test_clone3_supported();
ksft_print_header();
ksft_set_plan(17);
/* Just a simple clone3() should return 0.*/
test_clone3(0, 0, 0, CLONE3_ARGS_NO_TEST);
/* Do a clone3() in a new PID NS.*/
if (uid == 0)
test_clone3(CLONE_NEWPID, 0, 0, CLONE3_ARGS_NO_TEST);
else
ksft_test_result_skip("Skipping clone3() with CLONE_NEWPID\n");
/* Do a clone3() with CLONE3_ARGS_SIZE_V0. */
test_clone3(0, CLONE3_ARGS_SIZE_V0, 0, CLONE3_ARGS_NO_TEST);
/* Do a clone3() with CLONE3_ARGS_SIZE_V0 - 8 */
test_clone3(0, CLONE3_ARGS_SIZE_V0 - 8, -EINVAL, CLONE3_ARGS_NO_TEST);
/* Do a clone3() with sizeof(struct clone_args) + 8 */
test_clone3(0, sizeof(struct clone_args) + 8, 0, CLONE3_ARGS_NO_TEST);
/* Do a clone3() with exit_signal having highest 32 bits non-zero */
test_clone3(0, 0, -EINVAL, CLONE3_ARGS_INVAL_EXIT_SIGNAL_BIG);
/* Do a clone3() with negative 32-bit exit_signal */
test_clone3(0, 0, -EINVAL, CLONE3_ARGS_INVAL_EXIT_SIGNAL_NEG);
/* Do a clone3() with exit_signal not fitting into CSIGNAL mask */
test_clone3(0, 0, -EINVAL, CLONE3_ARGS_INVAL_EXIT_SIGNAL_CSIG);
/* Do a clone3() with NSIG < exit_signal < CSIG */
test_clone3(0, 0, -EINVAL, CLONE3_ARGS_INVAL_EXIT_SIGNAL_NSIG);
test_clone3(0, sizeof(struct clone_args) + 8, 0, CLONE3_ARGS_ALL_0);
test_clone3(0, sizeof(struct clone_args) + 16, -E2BIG,
CLONE3_ARGS_ALL_0);
test_clone3(0, sizeof(struct clone_args) * 2, -E2BIG,
CLONE3_ARGS_ALL_0);
/* Do a clone3() with > page size */
test_clone3(0, getpagesize() + 8, -E2BIG, CLONE3_ARGS_NO_TEST);
/* Do a clone3() with CLONE3_ARGS_SIZE_V0 in a new PID NS. */
if (uid == 0)
test_clone3(CLONE_NEWPID, CLONE3_ARGS_SIZE_V0, 0,
CLONE3_ARGS_NO_TEST);
else
ksft_test_result_skip("Skipping clone3() with CLONE_NEWPID\n");
/* Do a clone3() with CLONE3_ARGS_SIZE_V0 - 8 in a new PID NS */
test_clone3(CLONE_NEWPID, CLONE3_ARGS_SIZE_V0 - 8, -EINVAL,
CLONE3_ARGS_NO_TEST);
/* Do a clone3() with sizeof(struct clone_args) + 8 in a new PID NS */
if (uid == 0)
test_clone3(CLONE_NEWPID, sizeof(struct clone_args) + 8, 0,
CLONE3_ARGS_NO_TEST);
else
ksft_test_result_skip("Skipping clone3() with CLONE_NEWPID\n");
/* Do a clone3() with > page size in a new PID NS */
test_clone3(CLONE_NEWPID, getpagesize() + 8, -E2BIG,
CLONE3_ARGS_NO_TEST);
return !ksft_get_fail_cnt() ? ksft_exit_pass() : ksft_exit_fail();
}
/* SPDX-License-Identifier: GPL-2.0 */
#define _GNU_SOURCE
#include <errno.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <unistd.h>
#include <linux/sched.h>
#include <linux/types.h>
#include <sys/syscall.h>
#include <sys/wait.h>
#include "../kselftest.h"
#include "clone3_selftests.h"
#ifndef CLONE_CLEAR_SIGHAND
#define CLONE_CLEAR_SIGHAND 0x100000000ULL
#endif
static void nop_handler(int signo)
{
}
static int wait_for_pid(pid_t pid)
{
int status, ret;
again:
ret = waitpid(pid, &status, 0);
if (ret == -1) {
if (errno == EINTR)
goto again;
return -1;
}
if (!WIFEXITED(status))
return -1;
return WEXITSTATUS(status);
}
static void test_clone3_clear_sighand(void)
{
int ret;
pid_t pid;
struct clone_args args = {};
struct sigaction act;
/*
* Check that CLONE_CLEAR_SIGHAND and CLONE_SIGHAND are mutually
* exclusive.
*/
args.flags |= CLONE_CLEAR_SIGHAND | CLONE_SIGHAND;
args.exit_signal = SIGCHLD;
pid = sys_clone3(&args, sizeof(args));
if (pid > 0)
ksft_exit_fail_msg(
"clone3(CLONE_CLEAR_SIGHAND | CLONE_SIGHAND) succeeded\n");
act.sa_handler = nop_handler;
ret = sigemptyset(&act.sa_mask);
if (ret < 0)
ksft_exit_fail_msg("%s - sigemptyset() failed\n",
strerror(errno));
act.sa_flags = 0;
/* Register signal handler for SIGUSR1 */
ret = sigaction(SIGUSR1, &act, NULL);
if (ret < 0)
ksft_exit_fail_msg(
"%s - sigaction(SIGUSR1, &act, NULL) failed\n",
strerror(errno));
/* Register signal handler for SIGUSR2 */
ret = sigaction(SIGUSR2, &act, NULL);
if (ret < 0)
ksft_exit_fail_msg(
"%s - sigaction(SIGUSR2, &act, NULL) failed\n",
strerror(errno));
/* Check that CLONE_CLEAR_SIGHAND works. */
args.flags = CLONE_CLEAR_SIGHAND;
pid = sys_clone3(&args, sizeof(args));
if (pid < 0)
ksft_exit_fail_msg("%s - clone3(CLONE_CLEAR_SIGHAND) failed\n",
strerror(errno));
if (pid == 0) {
ret = sigaction(SIGUSR1, NULL, &act);
if (ret < 0)
exit(EXIT_FAILURE);
if (act.sa_handler != SIG_DFL)
exit(EXIT_FAILURE);
ret = sigaction(SIGUSR2, NULL, &act);
if (ret < 0)
exit(EXIT_FAILURE);
if (act.sa_handler != SIG_DFL)
exit(EXIT_FAILURE);
exit(EXIT_SUCCESS);
}
ret = wait_for_pid(pid);
if (ret)
ksft_exit_fail_msg(
"Failed to clear signal handler for child process\n");
ksft_test_result_pass("Cleared signal handlers for child process\n");
}
int main(int argc, char **argv)
{
ksft_print_header();
test_clone3_supported();
ksft_set_plan(1);
test_clone3_clear_sighand();
return ksft_exit_pass();
}
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _CLONE3_SELFTESTS_H
#define _CLONE3_SELFTESTS_H
#define _GNU_SOURCE
#include <sched.h>
#include <stdint.h>
#include <syscall.h>
#include <linux/types.h>
#define ptr_to_u64(ptr) ((__u64)((uintptr_t)(ptr)))
#ifndef __NR_clone3
#define __NR_clone3 -1
struct clone_args {
__aligned_u64 flags;
__aligned_u64 pidfd;
__aligned_u64 child_tid;
__aligned_u64 parent_tid;
__aligned_u64 exit_signal;
__aligned_u64 stack;
__aligned_u64 stack_size;
__aligned_u64 tls;
__aligned_u64 set_tid;
__aligned_u64 set_tid_size;
};
#endif
static pid_t sys_clone3(struct clone_args *args, size_t size)
{
fflush(stdout);
fflush(stderr);
return syscall(__NR_clone3, args, size);
}
static inline void test_clone3_supported(void)
{
pid_t pid;
struct clone_args args = {};
if (__NR_clone3 < 0)
ksft_exit_skip("clone3() syscall is not supported\n");
/* Set to something that will always cause EINVAL. */
args.exit_signal = -1;
pid = sys_clone3(&args, sizeof(args));
if (!pid)
exit(EXIT_SUCCESS);
if (pid > 0) {
wait(NULL);
ksft_exit_fail_msg(
"Managed to create child process with invalid exit_signal\n");
}
if (errno == ENOSYS)
ksft_exit_skip("clone3() syscall is not supported\n");
ksft_print_msg("clone3() syscall supported\n");
}
#endif /* _CLONE3_SELFTESTS_H */
This diff is collapsed.
# SPDX-License-Identifier: GPL-2.0-only
CFLAGS += -g -I../../../../usr/include/ -pthread
TEST_GEN_PROGS := pidfd_test pidfd_open_test pidfd_poll_test pidfd_wait
TEST_GEN_PROGS := pidfd_test pidfd_fdinfo_test pidfd_open_test pidfd_poll_test pidfd_wait
include ../lib.mk
// SPDX-License-Identifier: GPL-2.0
#define _GNU_SOURCE
#include <assert.h>
#include <errno.h>
#include <fcntl.h>
#include <linux/types.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <syscall.h>
#include <sys/wait.h>
#include "pidfd.h"
#include "../kselftest.h"
struct error {
int code;
char msg[512];
};
static int error_set(struct error *err, int code, const char *fmt, ...)
{
va_list args;
int r;
if (code == PIDFD_PASS || !err || err->code != PIDFD_PASS)
return code;
err->code = code;
va_start(args, fmt);
r = vsnprintf(err->msg, sizeof(err->msg), fmt, args);
assert((size_t)r < sizeof(err->msg));
va_end(args);
return code;
}
static void error_report(struct error *err, const char *test_name)
{
switch (err->code) {
case PIDFD_ERROR:
ksft_exit_fail_msg("%s test: Fatal: %s\n", test_name, err->msg);
break;
case PIDFD_FAIL:
/* will be: not ok %d # error %s test: %s */
ksft_test_result_error("%s test: %s\n", test_name, err->msg);
break;
case PIDFD_SKIP:
/* will be: not ok %d # SKIP %s test: %s */
ksft_test_result_skip("%s test: %s\n", test_name, err->msg);
break;
case PIDFD_XFAIL:
ksft_test_result_pass("%s test: Expected failure: %s\n",
test_name, err->msg);
break;
case PIDFD_PASS:
ksft_test_result_pass("%s test: Passed\n");
break;
default:
ksft_exit_fail_msg("%s test: Unknown code: %d %s\n",
test_name, err->code, err->msg);
break;
}
}
static inline int error_check(struct error *err, const char *test_name)
{
/* In case of error we bail out and terminate the test program */
if (err->code == PIDFD_ERROR)
error_report(err, test_name);
return err->code;
}
struct child {
pid_t pid;
int fd;
};
static struct child clone_newns(int (*fn)(void *), void *args,
struct error *err)
{
static int flags = CLONE_PIDFD | CLONE_NEWPID | CLONE_NEWNS | SIGCHLD;
size_t stack_size = 1024;
char *stack[1024] = { 0 };
struct child ret;
if (!(flags & CLONE_NEWUSER) && geteuid() != 0)
flags |= CLONE_NEWUSER;
#ifdef __ia64__
ret.pid = __clone2(fn, stack, stack_size, flags, args, &ret.fd);
#else
ret.pid = clone(fn, stack + stack_size, flags, args, &ret.fd);
#endif
if (ret.pid < 0) {
error_set(err, PIDFD_ERROR, "clone failed (ret %d, errno %d)",
ret.fd, errno);
return ret;
}
ksft_print_msg("New child: %d, fd: %d\n", ret.pid, ret.fd);
return ret;
}
static inline void child_close(struct child *child)
{
close(child->fd);
}
static inline int child_join(struct child *child, struct error *err)
{
int r;
r = wait_for_pid(child->pid);
if (r < 0)
error_set(err, PIDFD_ERROR, "waitpid failed (ret %d, errno %d)",
r, errno);
else if (r > 0)
error_set(err, r, "child %d reported: %d", child->pid, r);
return r;
}
static inline int child_join_close(struct child *child, struct error *err)
{
child_close(child);
return child_join(child, err);
}
static inline void trim_newline(char *str)
{
char *pos = strrchr(str, '\n');
if (pos)
*pos = '\0';
}
static int verify_fdinfo(int pidfd, struct error *err, const char *prefix,
size_t prefix_len, const char *expect, ...)
{
char buffer[512] = {0, };
char path[512] = {0, };
va_list args;
FILE *f;
char *line = NULL;
size_t n = 0;
int found = 0;
int r;
va_start(args, expect);
r = vsnprintf(buffer, sizeof(buffer), expect, args);
assert((size_t)r < sizeof(buffer));
va_end(args);
snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", pidfd);
f = fopen(path, "re");
if (!f)
return error_set(err, PIDFD_ERROR, "fdinfo open failed for %d",
pidfd);
while (getline(&line, &n, f) != -1) {
char *val;
if (strncmp(line, prefix, prefix_len))
continue;
found = 1;
val = line + prefix_len;
r = strcmp(val, buffer);
if (r != 0) {
trim_newline(line);
trim_newline(buffer);
error_set(err, PIDFD_FAIL, "%s '%s' != '%s'",
prefix, val, buffer);
}
break;
}
free(line);
fclose(f);
if (found == 0)
return error_set(err, PIDFD_FAIL, "%s not found for fd %d",
prefix, pidfd);
return PIDFD_PASS;
}
static int child_fdinfo_nspid_test(void *args)
{
struct error err;
int pidfd;
int r;
/* if we got no fd for the sibling, we are done */
if (!args)
return PIDFD_PASS;
/* verify that we can not resolve the pidfd for a process
* in a sibling pid namespace, i.e. a pid namespace it is
* not in our or a descended namespace
*/
r = mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0);
if (r < 0) {
ksft_print_msg("Failed to remount / private\n");
return PIDFD_ERROR;
}
(void)umount2("/proc", MNT_DETACH);
r = mount("proc", "/proc", "proc", 0, NULL);
if (r < 0) {
ksft_print_msg("Failed to remount /proc\n");
return PIDFD_ERROR;
}
pidfd = *(int *)args;
r = verify_fdinfo(pidfd, &err, "NSpid:", 6, "\t0\n");
if (r != PIDFD_PASS)
ksft_print_msg("NSpid fdinfo check failed: %s\n", err.msg);
return r;
}
static void test_pidfd_fdinfo_nspid(void)
{
struct child a, b;
struct error err = {0, };
const char *test_name = "pidfd check for NSpid in fdinfo";
/* Create a new child in a new pid and mount namespace */
a = clone_newns(child_fdinfo_nspid_test, NULL, &err);
error_check(&err, test_name);
/* Pass the pidfd representing the first child to the
* second child, which will be in a sibling pid namespace,
* which means that the fdinfo NSpid entry for the pidfd
* should only contain '0'.
*/
b = clone_newns(child_fdinfo_nspid_test, &a.fd, &err);
error_check(&err, test_name);
/* The children will have pid 1 in the new pid namespace,
* so the line must be 'NSPid:\t<pid>\t1'.
*/
verify_fdinfo(a.fd, &err, "NSpid:", 6, "\t%d\t%d\n", a.pid, 1);
verify_fdinfo(b.fd, &err, "NSpid:", 6, "\t%d\t%d\n", b.pid, 1);
/* wait for the process, check the exit status and set
* 'err' accordingly, if it is not already set.
*/
child_join_close(&a, &err);
child_join_close(&b, &err);
error_report(&err, test_name);
}
static void test_pidfd_dead_fdinfo(void)
{
struct child a;
struct error err = {0, };
const char *test_name = "pidfd check fdinfo for dead process";
/* Create a new child in a new pid and mount namespace */
a = clone_newns(child_fdinfo_nspid_test, NULL, &err);
error_check(&err, test_name);
child_join(&a, &err);
verify_fdinfo(a.fd, &err, "Pid:", 4, "\t-1\n");
verify_fdinfo(a.fd, &err, "NSpid:", 6, "\t-1\n");
child_close(&a);
error_report(&err, test_name);
}
int main(int argc, char **argv)
{
ksft_print_header();
ksft_set_plan(2);
test_pidfd_fdinfo_nspid();
test_pidfd_dead_fdinfo();
return ksft_exit_pass();
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment