Commit 5450e8a3 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'pidfd-updates-v5.3' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux

Pull pidfd updates from Christian Brauner:
 "This adds two main features.

   - First, it adds polling support for pidfds. This allows process
     managers to know when a (non-parent) process dies in a race-free
     way.

     The notification mechanism used follows the same logic that is
     currently used when the parent of a task is notified of a child's
     death. With this patchset it is possible to put pidfds in an
     {e}poll loop and get reliable notifications for process (i.e.
     thread-group) exit.

   - The second feature complements the first one by making it possible
     to retrieve pollable pidfds for processes that were not created
     using CLONE_PIDFD.

     A lot of processes get created with traditional PID-based calls
     such as fork() or clone() (without CLONE_PIDFD). For these
     processes a caller can currently not create a pollable pidfd. This
     is a problem for Android's low memory killer (LMK) and service
     managers such as systemd.

  Both patchsets are accompanied by selftests.

  It's perhaps worth noting that the work done so far and the work done
  in this branch for pidfd_open() and polling support do already see
  some adoption:

   - Android is in the process of backporting this work to all their LTS
     kernels [1]

   - Service managers make use of pidfd_send_signal but will need to
     wait until we enable waiting on pidfds for full adoption.

   - And projects I maintain make use of both pidfd_send_signal and
     CLONE_PIDFD [2] and will use polling support and pidfd_open() too"

[1] https://android-review.googlesource.com/q/topic:%22pidfd+polling+support+4.9+backport%22
    https://android-review.googlesource.com/q/topic:%22pidfd+polling+support+4.14+backport%22
    https://android-review.googlesource.com/q/topic:%22pidfd+polling+support+4.19+backport%22

[2] https://github.com/lxc/lxc/blob/aab6e3eb73c343231cdde775db938994fc6f2803/src/lxc/start.c#L1753

* tag 'pidfd-updates-v5.3' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux:
  tests: add pidfd_open() tests
  arch: wire-up pidfd_open()
  pid: add pidfd_open()
  pidfd: add polling selftests
  pidfd: add polling support
parents 29cd581b 172bb24a
...@@ -473,3 +473,4 @@ ...@@ -473,3 +473,4 @@
541 common fsconfig sys_fsconfig 541 common fsconfig sys_fsconfig
542 common fsmount sys_fsmount 542 common fsmount sys_fsmount
543 common fspick sys_fspick 543 common fspick sys_fspick
544 common pidfd_open sys_pidfd_open
...@@ -447,3 +447,4 @@ ...@@ -447,3 +447,4 @@
431 common fsconfig sys_fsconfig 431 common fsconfig sys_fsconfig
432 common fsmount sys_fsmount 432 common fsmount sys_fsmount
433 common fspick sys_fspick 433 common fspick sys_fspick
434 common pidfd_open sys_pidfd_open
...@@ -38,7 +38,7 @@ ...@@ -38,7 +38,7 @@
#define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5) #define __ARM_NR_compat_set_tls (__ARM_NR_COMPAT_BASE + 5)
#define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800) #define __ARM_NR_COMPAT_END (__ARM_NR_COMPAT_BASE + 0x800)
#define __NR_compat_syscalls 434 #define __NR_compat_syscalls 435
#endif #endif
#define __ARCH_WANT_SYS_CLONE #define __ARCH_WANT_SYS_CLONE
......
...@@ -875,6 +875,8 @@ __SYSCALL(__NR_fsconfig, sys_fsconfig) ...@@ -875,6 +875,8 @@ __SYSCALL(__NR_fsconfig, sys_fsconfig)
__SYSCALL(__NR_fsmount, sys_fsmount) __SYSCALL(__NR_fsmount, sys_fsmount)
#define __NR_fspick 433 #define __NR_fspick 433
__SYSCALL(__NR_fspick, sys_fspick) __SYSCALL(__NR_fspick, sys_fspick)
#define __NR_pidfd_open 434
__SYSCALL(__NR_pidfd_open, sys_pidfd_open)
/* /*
* Please add new compat syscalls above this comment and update * Please add new compat syscalls above this comment and update
......
...@@ -354,3 +354,4 @@ ...@@ -354,3 +354,4 @@
431 common fsconfig sys_fsconfig 431 common fsconfig sys_fsconfig
432 common fsmount sys_fsmount 432 common fsmount sys_fsmount
433 common fspick sys_fspick 433 common fspick sys_fspick
434 common pidfd_open sys_pidfd_open
...@@ -433,3 +433,4 @@ ...@@ -433,3 +433,4 @@
431 common fsconfig sys_fsconfig 431 common fsconfig sys_fsconfig
432 common fsmount sys_fsmount 432 common fsmount sys_fsmount
433 common fspick sys_fspick 433 common fspick sys_fspick
434 common pidfd_open sys_pidfd_open
...@@ -439,3 +439,4 @@ ...@@ -439,3 +439,4 @@
431 common fsconfig sys_fsconfig 431 common fsconfig sys_fsconfig
432 common fsmount sys_fsmount 432 common fsmount sys_fsmount
433 common fspick sys_fspick 433 common fspick sys_fspick
434 common pidfd_open sys_pidfd_open
...@@ -372,3 +372,4 @@ ...@@ -372,3 +372,4 @@
431 n32 fsconfig sys_fsconfig 431 n32 fsconfig sys_fsconfig
432 n32 fsmount sys_fsmount 432 n32 fsmount sys_fsmount
433 n32 fspick sys_fspick 433 n32 fspick sys_fspick
434 n32 pidfd_open sys_pidfd_open
...@@ -348,3 +348,4 @@ ...@@ -348,3 +348,4 @@
431 n64 fsconfig sys_fsconfig 431 n64 fsconfig sys_fsconfig
432 n64 fsmount sys_fsmount 432 n64 fsmount sys_fsmount
433 n64 fspick sys_fspick 433 n64 fspick sys_fspick
434 n64 pidfd_open sys_pidfd_open
...@@ -421,3 +421,4 @@ ...@@ -421,3 +421,4 @@
431 o32 fsconfig sys_fsconfig 431 o32 fsconfig sys_fsconfig
432 o32 fsmount sys_fsmount 432 o32 fsmount sys_fsmount
433 o32 fspick sys_fspick 433 o32 fspick sys_fspick
434 o32 pidfd_open sys_pidfd_open
...@@ -430,3 +430,4 @@ ...@@ -430,3 +430,4 @@
431 common fsconfig sys_fsconfig 431 common fsconfig sys_fsconfig
432 common fsmount sys_fsmount 432 common fsmount sys_fsmount
433 common fspick sys_fspick 433 common fspick sys_fspick
434 common pidfd_open sys_pidfd_open
...@@ -515,3 +515,4 @@ ...@@ -515,3 +515,4 @@
431 common fsconfig sys_fsconfig 431 common fsconfig sys_fsconfig
432 common fsmount sys_fsmount 432 common fsmount sys_fsmount
433 common fspick sys_fspick 433 common fspick sys_fspick
434 common pidfd_open sys_pidfd_open
...@@ -436,3 +436,4 @@ ...@@ -436,3 +436,4 @@
431 common fsconfig sys_fsconfig sys_fsconfig 431 common fsconfig sys_fsconfig sys_fsconfig
432 common fsmount sys_fsmount sys_fsmount 432 common fsmount sys_fsmount sys_fsmount
433 common fspick sys_fspick sys_fspick 433 common fspick sys_fspick sys_fspick
434 common pidfd_open sys_pidfd_open sys_pidfd_open
...@@ -436,3 +436,4 @@ ...@@ -436,3 +436,4 @@
431 common fsconfig sys_fsconfig 431 common fsconfig sys_fsconfig
432 common fsmount sys_fsmount 432 common fsmount sys_fsmount
433 common fspick sys_fspick 433 common fspick sys_fspick
434 common pidfd_open sys_pidfd_open
...@@ -479,3 +479,4 @@ ...@@ -479,3 +479,4 @@
431 common fsconfig sys_fsconfig 431 common fsconfig sys_fsconfig
432 common fsmount sys_fsmount 432 common fsmount sys_fsmount
433 common fspick sys_fspick 433 common fspick sys_fspick
434 common pidfd_open sys_pidfd_open
...@@ -438,3 +438,4 @@ ...@@ -438,3 +438,4 @@
431 i386 fsconfig sys_fsconfig __ia32_sys_fsconfig 431 i386 fsconfig sys_fsconfig __ia32_sys_fsconfig
432 i386 fsmount sys_fsmount __ia32_sys_fsmount 432 i386 fsmount sys_fsmount __ia32_sys_fsmount
433 i386 fspick sys_fspick __ia32_sys_fspick 433 i386 fspick sys_fspick __ia32_sys_fspick
434 i386 pidfd_open sys_pidfd_open __ia32_sys_pidfd_open
...@@ -355,6 +355,7 @@ ...@@ -355,6 +355,7 @@
431 common fsconfig __x64_sys_fsconfig 431 common fsconfig __x64_sys_fsconfig
432 common fsmount __x64_sys_fsmount 432 common fsmount __x64_sys_fsmount
433 common fspick __x64_sys_fspick 433 common fspick __x64_sys_fspick
434 common pidfd_open __x64_sys_pidfd_open
# #
# x32-specific system call numbers start at 512 to avoid cache impact # x32-specific system call numbers start at 512 to avoid cache impact
......
...@@ -404,3 +404,4 @@ ...@@ -404,3 +404,4 @@
431 common fsconfig sys_fsconfig 431 common fsconfig sys_fsconfig
432 common fsmount sys_fsmount 432 common fsmount sys_fsmount
433 common fspick sys_fspick 433 common fspick sys_fspick
434 common pidfd_open sys_pidfd_open
...@@ -3,6 +3,7 @@ ...@@ -3,6 +3,7 @@
#define _LINUX_PID_H #define _LINUX_PID_H
#include <linux/rculist.h> #include <linux/rculist.h>
#include <linux/wait.h>
enum pid_type enum pid_type
{ {
...@@ -60,6 +61,8 @@ struct pid ...@@ -60,6 +61,8 @@ struct pid
unsigned int level; unsigned int level;
/* lists of tasks that use this pid */ /* lists of tasks that use this pid */
struct hlist_head tasks[PIDTYPE_MAX]; struct hlist_head tasks[PIDTYPE_MAX];
/* wait queue for pidfd notifications */
wait_queue_head_t wait_pidfd;
struct rcu_head rcu; struct rcu_head rcu;
struct upid numbers[1]; struct upid numbers[1];
}; };
......
...@@ -927,6 +927,7 @@ asmlinkage long sys_clock_adjtime32(clockid_t which_clock, ...@@ -927,6 +927,7 @@ asmlinkage long sys_clock_adjtime32(clockid_t which_clock,
struct old_timex32 __user *tx); struct old_timex32 __user *tx);
asmlinkage long sys_syncfs(int fd); asmlinkage long sys_syncfs(int fd);
asmlinkage long sys_setns(int fd, int nstype); asmlinkage long sys_setns(int fd, int nstype);
asmlinkage long sys_pidfd_open(pid_t pid, unsigned int flags);
asmlinkage long sys_sendmmsg(int fd, struct mmsghdr __user *msg, asmlinkage long sys_sendmmsg(int fd, struct mmsghdr __user *msg,
unsigned int vlen, unsigned flags); unsigned int vlen, unsigned flags);
asmlinkage long sys_process_vm_readv(pid_t pid, asmlinkage long sys_process_vm_readv(pid_t pid,
......
...@@ -844,9 +844,11 @@ __SYSCALL(__NR_fsconfig, sys_fsconfig) ...@@ -844,9 +844,11 @@ __SYSCALL(__NR_fsconfig, sys_fsconfig)
__SYSCALL(__NR_fsmount, sys_fsmount) __SYSCALL(__NR_fsmount, sys_fsmount)
#define __NR_fspick 433 #define __NR_fspick 433
__SYSCALL(__NR_fspick, sys_fspick) __SYSCALL(__NR_fspick, sys_fspick)
#define __NR_pidfd_open 434
__SYSCALL(__NR_pidfd_open, sys_pidfd_open)
#undef __NR_syscalls #undef __NR_syscalls
#define __NR_syscalls 434 #define __NR_syscalls 435
/* /*
* 32 bit systems traditionally used different * 32 bit systems traditionally used different
......
...@@ -1711,8 +1711,34 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f) ...@@ -1711,8 +1711,34 @@ static void pidfd_show_fdinfo(struct seq_file *m, struct file *f)
} }
#endif #endif
/*
* Poll support for process exit notification.
*/
static unsigned int pidfd_poll(struct file *file, struct poll_table_struct *pts)
{
struct task_struct *task;
struct pid *pid = file->private_data;
int poll_flags = 0;
poll_wait(file, &pid->wait_pidfd, pts);
rcu_read_lock();
task = pid_task(pid, PIDTYPE_PID);
/*
* Inform pollers only when the whole thread group exits.
* If the thread group leader exits before all other threads in the
* group, then poll(2) should block, similar to the wait(2) family.
*/
if (!task || (task->exit_state && thread_group_empty(task)))
poll_flags = POLLIN | POLLRDNORM;
rcu_read_unlock();
return poll_flags;
}
const struct file_operations pidfd_fops = { const struct file_operations pidfd_fops = {
.release = pidfd_release, .release = pidfd_release,
.poll = pidfd_poll,
#ifdef CONFIG_PROC_FS #ifdef CONFIG_PROC_FS
.show_fdinfo = pidfd_show_fdinfo, .show_fdinfo = pidfd_show_fdinfo,
#endif #endif
......
...@@ -38,6 +38,8 @@ ...@@ -38,6 +38,8 @@
#include <linux/syscalls.h> #include <linux/syscalls.h>
#include <linux/proc_ns.h> #include <linux/proc_ns.h>
#include <linux/proc_fs.h> #include <linux/proc_fs.h>
#include <linux/anon_inodes.h>
#include <linux/sched/signal.h>
#include <linux/sched/task.h> #include <linux/sched/task.h>
#include <linux/idr.h> #include <linux/idr.h>
...@@ -214,6 +216,8 @@ struct pid *alloc_pid(struct pid_namespace *ns) ...@@ -214,6 +216,8 @@ struct pid *alloc_pid(struct pid_namespace *ns)
for (type = 0; type < PIDTYPE_MAX; ++type) for (type = 0; type < PIDTYPE_MAX; ++type)
INIT_HLIST_HEAD(&pid->tasks[type]); INIT_HLIST_HEAD(&pid->tasks[type]);
init_waitqueue_head(&pid->wait_pidfd);
upid = pid->numbers + ns->level; upid = pid->numbers + ns->level;
spin_lock_irq(&pidmap_lock); spin_lock_irq(&pidmap_lock);
if (!(ns->pid_allocated & PIDNS_ADDING)) if (!(ns->pid_allocated & PIDNS_ADDING))
...@@ -451,6 +455,73 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns) ...@@ -451,6 +455,73 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
return idr_get_next(&ns->idr, &nr); return idr_get_next(&ns->idr, &nr);
} }
/**
 * pidfd_create() - Create a new pid file descriptor.
 *
 * @pid: struct pid that the pidfd will reference
 *
 * Creates an anonymous-inode "[pidfd]" file descriptor backed by pidfd_fops,
 * opened with O_RDWR | O_CLOEXEC. A reference on @pid is taken with get_pid()
 * before the descriptor is created and dropped again if creation fails.
 *
 * Note, that this function can only be called after the fd table has
 * been unshared to avoid leaking the pidfd to the new process.
 *
 * Return: On success, a cloexec pidfd is returned.
 *         On error, a negative errno number will be returned.
 */
static int pidfd_create(struct pid *pid)
{
	int fd = anon_inode_getfd("[pidfd]", &pidfd_fops, get_pid(pid),
				  O_RDWR | O_CLOEXEC);

	if (fd >= 0)
		return fd;

	/* No descriptor was created: give back the reference taken above. */
	put_pid(pid);
	return fd;
}
/**
 * pidfd_open() - Open new pid file descriptor.
 *
 * @pid:   pid for which to retrieve a pidfd
 * @flags: flags to pass; must currently be zero
 *
 * This creates a new pid file descriptor with the O_CLOEXEC flag set for
 * the process identified by @pid. Currently, the process identified by
 * @pid must be a thread-group leader. This restriction currently exists
 * for all aspects of pidfds including pidfd creation (CLONE_PIDFD cannot
 * be used with CLONE_THREAD) and pidfd polling (only supports thread group
 * leaders).
 *
 * Return: On success, a cloexec pidfd is returned.
 *         On error, a negative errno number will be returned:
 *         -EINVAL for non-zero @flags, @pid <= 0, or a pid that has no
 *         task attached as PIDTYPE_TGID; -ESRCH if @pid cannot be found.
 */
SYSCALL_DEFINE2(pidfd_open, pid_t, pid, unsigned int, flags)
{
	struct pid *p;
	int fd;

	if (flags)
		return -EINVAL;

	if (pid <= 0)
		return -EINVAL;

	p = find_get_pid(pid);
	if (!p)
		return -ESRCH;

	/* Refuse pids that are not attached to a task as a thread-group id. */
	rcu_read_lock();
	fd = pid_task(p, PIDTYPE_TGID) ? 0 : -EINVAL;
	rcu_read_unlock();

	if (!fd)
		fd = pidfd_create(p);

	/* Drop the lookup reference taken by find_get_pid(). */
	put_pid(p);
	return fd;
}
void __init pid_idr_init(void) void __init pid_idr_init(void)
{ {
/* Verify no one has done anything silly: */ /* Verify no one has done anything silly: */
......
...@@ -1881,6 +1881,14 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type) ...@@ -1881,6 +1881,14 @@ int send_sigqueue(struct sigqueue *q, struct pid *pid, enum pid_type type)
return ret; return ret;
} }
static void do_notify_pidfd(struct task_struct *task)
{
struct pid *pid;
pid = task_pid(task);
wake_up_all(&pid->wait_pidfd);
}
/* /*
* Let a parent know about the death of a child. * Let a parent know about the death of a child.
* For a stopped/continued status change, use do_notify_parent_cldstop instead. * For a stopped/continued status change, use do_notify_parent_cldstop instead.
...@@ -1904,6 +1912,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig) ...@@ -1904,6 +1912,9 @@ bool do_notify_parent(struct task_struct *tsk, int sig)
BUG_ON(!tsk->ptrace && BUG_ON(!tsk->ptrace &&
(tsk->group_leader != tsk || !thread_group_empty(tsk))); (tsk->group_leader != tsk || !thread_group_empty(tsk)));
/* Wake up all pidfd waiters */
do_notify_pidfd(tsk);
if (sig != SIGCHLD) { if (sig != SIGCHLD) {
/* /*
* This is only possible if parent == real_parent. * This is only possible if parent == real_parent.
......
# SPDX-License-Identifier: GPL-2.0-only # SPDX-License-Identifier: GPL-2.0-only
CFLAGS += -g -I../../../../usr/include/ CFLAGS += -g -I../../../../usr/include/ -lpthread
TEST_GEN_PROGS := pidfd_test TEST_GEN_PROGS := pidfd_test pidfd_open_test
include ../lib.mk include ../lib.mk
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef __PIDFD_H
#define __PIDFD_H
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <syscall.h>
#include <sys/mount.h>
#include "../kselftest.h"
/*
* The kernel reserves 300 pids via RESERVED_PIDS in kernel/pid.c
* That means, when it wraps around any pid < 300 will be skipped.
* So we need to use a pid > 300 in order to test recycling.
*/
#define PID_RECYCLE 1000
/*
* Define a few custom error codes for the child process to clearly indicate
* what is happening. This way we can tell the difference between a system
* error, a test error, etc.
*/
#define PIDFD_PASS 0
#define PIDFD_FAIL 1
#define PIDFD_ERROR 2
#define PIDFD_SKIP 3
#define PIDFD_XFAIL 4
/*
 * wait_for_pid() - reap @pid and report how it exited.
 *
 * Retries waitpid() when it is interrupted by a signal (EINTR).
 *
 * Defined `static inline` because this lives in a header: an external
 * definition here would produce duplicate symbols as soon as two
 * translation units of one program include it.
 *
 * Return: the child's exit status if it exited normally, -1 if waitpid()
 *         failed or the child did not terminate via exit.
 */
static inline int wait_for_pid(pid_t pid)
{
	int status, ret;

again:
	ret = waitpid(pid, &status, 0);
	if (ret == -1) {
		if (errno == EINTR)
			goto again;

		return -1;
	}

	if (!WIFEXITED(status))
		return -1;

	return WEXITSTATUS(status);
}
#endif /* __PIDFD_H */
// SPDX-License-Identifier: GPL-2.0
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <inttypes.h>
#include <limits.h>
#include <linux/types.h>
#include <linux/wait.h>
#include <sched.h>
#include <signal.h>
#include <stdbool.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <syscall.h>
#include <sys/mount.h>
#include <sys/prctl.h>
#include <sys/wait.h>
#include <unistd.h>
#include "pidfd.h"
#include "../kselftest.h"
/*
 * Thin wrapper that issues the raw pidfd_open system call through
 * syscall() and hands its result back as an int.
 */
static inline int sys_pidfd_open(pid_t pid, unsigned int flags)
{
	long rc = syscall(__NR_pidfd_open, pid, flags);

	return rc;
}
/*
 * safe_int() - strictly parse @numstr as an int.
 *
 * The base is auto-detected by strtol() (base 0). The whole string must be
 * consumed by the conversion; trailing characters are rejected.
 *
 * Return: 0 on success with the value stored in *@converted,
 *         -ERANGE if the value does not fit into a long or an int,
 *         -EINVAL if the string is empty or not entirely numeric.
 *         *@converted is left untouched on failure.
 */
static int safe_int(const char *numstr, int *converted)
{
	char *end = NULL;
	long value;

	errno = 0;
	value = strtol(numstr, &end, 0);

	if (errno == ERANGE) {
		if (value == LONG_MAX || value == LONG_MIN)
			return -ERANGE;
	}

	if (errno != 0 && value == 0)
		return -EINVAL;

	/* Nothing was parsed, or unparsed characters remain. */
	if (end == numstr || *end != '\0')
		return -EINVAL;

	if (value < INT_MIN || value > INT_MAX)
		return -ERANGE;

	*converted = (int)value;
	return 0;
}
/*
 * char_left_gc() - index of the first character in @buffer[0..@len) that is
 * neither a space nor a tab.
 *
 * Return: that index, or 0 when the range is empty or consists only of
 *         spaces and tabs.
 */
static int char_left_gc(const char *buffer, size_t len)
{
	size_t pos = 0;

	while (pos < len && (buffer[pos] == ' ' || buffer[pos] == '\t'))
		pos++;

	return pos < len ? pos : 0;
}
/*
 * char_right_gc() - length of @buffer[0..@len) once trailing spaces, tabs,
 * newlines and NUL bytes are ignored.
 *
 * Return: one past the index of the last character that is none of the
 *         above, or 0 when there is no such character.
 */
static int char_right_gc(const char *buffer, size_t len)
{
	int idx = len - 1;

	while (idx >= 0) {
		switch (buffer[idx]) {
		case ' ':
		case '\t':
		case '\n':
		case '\0':
			idx--;
			continue;
		default:
			return idx + 1;
		}
	}

	return 0;
}
/*
 * trim_whitespace_in_place() - strip surrounding whitespace from @buffer.
 *
 * The leading offset is computed with char_left_gc() and the new end with
 * char_right_gc(); a NUL terminator is written at that end, so @buffer is
 * modified.
 *
 * Return: pointer into @buffer at the first kept character.
 */
static char *trim_whitespace_in_place(char *buffer)
{
	char *start = buffer + char_left_gc(buffer, strlen(buffer));
	int end = char_right_gc(start, strlen(start));

	start[end] = '\0';
	return start;
}
/*
 * get_pid_from_fdinfo_file() - read a numeric field from a fd's fdinfo file.
 *
 * @pidfd:  file descriptor whose /proc/self/fdinfo/<fd> entry is read
 * @key:    field name to look for at the start of a line (e.g. "Pid:")
 * @keylen: number of characters of @key to compare and to skip
 *
 * The first line that starts with @key is used: everything after the key is
 * trimmed of whitespace and parsed as an int.
 *
 * Return: the parsed value, or -1 if the file cannot be opened, no line
 *         matches, or the value does not parse.
 */
static pid_t get_pid_from_fdinfo_file(int pidfd, const char *key, size_t keylen)
{
	char path[512];
	FILE *f;
	size_t n = 0;
	int value = -1;
	pid_t result = -1;
	char *line = NULL;

	snprintf(path, sizeof(path), "/proc/self/fdinfo/%d", pidfd);

	f = fopen(path, "re");
	if (!f)
		return -1;

	while (getline(&line, &n, f) != -1) {
		char *numstr;

		if (strncmp(line, key, keylen))
			continue;

		/*
		 * Skip exactly the key that was matched; a fixed offset would
		 * only be right for four-character keys.
		 */
		numstr = trim_whitespace_in_place(line + keylen);
		if (safe_int(numstr, &value) == 0)
			result = value;

		break;
	}

	free(line);
	fclose(f);

	return result;
}
/*
 * Entry point of the pidfd_open() selftest.
 *
 * Runs three checks in order: pidfd_open() must fail for pid -1, must fail
 * for a non-zero flags value, and must succeed for the calling process with
 * flags 0. ret stays 1 until all three have passed; any failed check jumps
 * to on_error, which closes a still-open pidfd and exits through
 * ksft_exit_fail(). On success the program exits through ksft_exit_pass().
 */
int main(int argc, char **argv)
{
int pidfd = -1, ret = 1;
pid_t pid;
/* Three test results are reported below. */
ksft_set_plan(3);
/* Check 1: an invalid pid must not yield a descriptor. */
pidfd = sys_pidfd_open(-1, 0);
if (pidfd >= 0) {
ksft_print_msg(
"%s - succeeded to open pidfd for invalid pid -1\n",
strerror(errno));
goto on_error;
}
ksft_test_result_pass("do not allow invalid pid test: passed\n");
/* Check 2: a non-zero flags argument must be rejected. */
pidfd = sys_pidfd_open(getpid(), 1);
if (pidfd >= 0) {
ksft_print_msg(
"%s - succeeded to open pidfd with invalid flag value specified\n",
strerror(errno));
goto on_error;
}
ksft_test_result_pass("do not allow invalid flag test: passed\n");
/* Check 3: opening a pidfd for ourselves with flags 0 must work. */
pidfd = sys_pidfd_open(getpid(), 0);
if (pidfd < 0) {
ksft_print_msg("%s - failed to open pidfd\n", strerror(errno));
goto on_error;
}
ksft_test_result_pass("open a new pidfd test: passed\n");
/* Informational only: the value read back is printed, not verified. */
pid = get_pid_from_fdinfo_file(pidfd, "Pid:", sizeof("Pid:") - 1);
ksft_print_msg("pidfd %d refers to process with pid %d\n", pidfd, pid);
ret = 0;
on_error:
/* Reached on both success and failure; ret selects the exit path. */
if (pidfd >= 0)
close(pidfd);
return !ret ? ksft_exit_pass() : ksft_exit_fail();
}
...@@ -4,22 +4,49 @@ ...@@ -4,22 +4,49 @@
#include <errno.h> #include <errno.h>
#include <fcntl.h> #include <fcntl.h>
#include <linux/types.h> #include <linux/types.h>
#include <pthread.h>
#include <sched.h> #include <sched.h>
#include <signal.h> #include <signal.h>
#include <stdio.h> #include <stdio.h>
#include <stdlib.h> #include <stdlib.h>
#include <string.h> #include <string.h>
#include <syscall.h> #include <syscall.h>
#include <sys/epoll.h>
#include <sys/mman.h>
#include <sys/mount.h> #include <sys/mount.h>
#include <sys/wait.h> #include <sys/wait.h>
#include <time.h>
#include <unistd.h> #include <unistd.h>
#include "pidfd.h"
#include "../kselftest.h" #include "../kselftest.h"
#ifndef __NR_pidfd_send_signal #ifndef __NR_pidfd_send_signal
#define __NR_pidfd_send_signal -1 #define __NR_pidfd_send_signal -1
#endif #endif
#define str(s) _str(s)
#define _str(s) #s
#define CHILD_THREAD_MIN_WAIT 3 /* seconds */
#define MAX_EVENTS 5
#ifndef CLONE_PIDFD
#define CLONE_PIDFD 0x00001000
#endif
/*
 * pidfd_clone() - start a child process running @fn via clone().
 *
 * @flags: clone flags; SIGCHLD is always OR-ed in
 * @pidfd: passed through to clone() as the argument following the NULL
 *         @fn argument (where CLONE_PIDFD stores the new pidfd, per
 *         clone(2))
 * @fn:    function the child starts executing, called with a NULL argument
 *
 * The child stack is a byte array of 1024 pointer-sized words. Declaring it
 * as bytes keeps the size handed to __clone2() and the top-of-stack address
 * handed to clone() derived from the same sizeof(), instead of mixing an
 * element count with pointer arithmetic on a char * array.
 *
 * Return: whatever clone()/__clone2() returns.
 */
static pid_t pidfd_clone(int flags, int *pidfd, int (*fn)(void *))
{
	enum { CHILD_STACK_BYTES = 1024 * sizeof(char *) };
	char stack[CHILD_STACK_BYTES] __attribute__((aligned(16))) = { 0 };

#ifdef __ia64__
	/* __clone2() takes the lowest stack address plus the stack size. */
	return __clone2(fn, stack, sizeof(stack), flags | SIGCHLD, NULL, pidfd);
#else
	/* clone() takes the topmost address of the (downward-growing) stack. */
	return clone(fn, stack + sizeof(stack), flags | SIGCHLD, NULL, pidfd);
#endif
}
static inline int sys_pidfd_send_signal(int pidfd, int sig, siginfo_t *info, static inline int sys_pidfd_send_signal(int pidfd, int sig, siginfo_t *info,
unsigned int flags) unsigned int flags)
{ {
...@@ -66,28 +93,6 @@ static int test_pidfd_send_signal_simple_success(void) ...@@ -66,28 +93,6 @@ static int test_pidfd_send_signal_simple_success(void)
return 0; return 0;
} }
static int wait_for_pid(pid_t pid)
{
int status, ret;
again:
ret = waitpid(pid, &status, 0);
if (ret == -1) {
if (errno == EINTR)
goto again;
return -1;
}
if (ret != pid)
goto again;
if (!WIFEXITED(status))
return -1;
return WEXITSTATUS(status);
}
static int test_pidfd_send_signal_exited_fail(void) static int test_pidfd_send_signal_exited_fail(void)
{ {
int pidfd, ret, saved_errno; int pidfd, ret, saved_errno;
...@@ -132,13 +137,6 @@ static int test_pidfd_send_signal_exited_fail(void) ...@@ -132,13 +137,6 @@ static int test_pidfd_send_signal_exited_fail(void)
return 0; return 0;
} }
/*
* The kernel reserves 300 pids via RESERVED_PIDS in kernel/pid.c
* That means, when it wraps around any pid < 300 will be skipped.
* So we need to use a pid > 300 in order to test recycling.
*/
#define PID_RECYCLE 1000
/* /*
* Maximum number of cycles we allow. This is equivalent to PID_MAX_DEFAULT. * Maximum number of cycles we allow. This is equivalent to PID_MAX_DEFAULT.
* If users set a higher limit or we have cycled PIDFD_MAX_DEFAULT number of * If users set a higher limit or we have cycled PIDFD_MAX_DEFAULT number of
...@@ -147,17 +145,6 @@ static int test_pidfd_send_signal_exited_fail(void) ...@@ -147,17 +145,6 @@ static int test_pidfd_send_signal_exited_fail(void)
*/ */
#define PIDFD_MAX_DEFAULT 0x8000 #define PIDFD_MAX_DEFAULT 0x8000
/*
* Define a few custom error codes for the child process to clearly indicate
* what is happening. This way we can tell the difference between a system
* error, a test error, etc.
*/
#define PIDFD_PASS 0
#define PIDFD_FAIL 1
#define PIDFD_ERROR 2
#define PIDFD_SKIP 3
#define PIDFD_XFAIL 4
static int test_pidfd_send_signal_recycled_pid_fail(void) static int test_pidfd_send_signal_recycled_pid_fail(void)
{ {
int i, ret; int i, ret;
...@@ -372,11 +359,192 @@ static int test_pidfd_send_signal_syscall_support(void) ...@@ -372,11 +359,192 @@ static int test_pidfd_send_signal_syscall_support(void)
return 0; return 0;
} }
/*
 * Thread body for the exec test: logs its ids, then replaces the process
 * image with "/bin/sleep CHILD_THREAD_MIN_WAIT".
 *
 * syscall() returns long, so the thread id is cast to int to match the %d
 * conversion.
 */
static void *test_pidfd_poll_exec_thread(void *priv)
{
	ksft_print_msg("Child Thread: starting. pid %d tid %d ; and sleeping\n",
		       getpid(), (int)syscall(SYS_gettid));
	ksft_print_msg("Child Thread: doing exec of sleep\n");

	execl("/bin/sleep", "sleep", str(CHILD_THREAD_MIN_WAIT), (char *)NULL);

	/* Only reached when execl() failed. */
	ksft_print_msg("Child Thread: DONE. pid %d tid %d\n",
		       getpid(), (int)syscall(SYS_gettid));
	return NULL;
}
/*
 * poll_pidfd() - wait up to 5 seconds for @pidfd to become readable.
 *
 * @test_name: name used as prefix in failure messages
 * @pidfd:     descriptor added to a fresh epoll instance with EPOLLIN
 *
 * Exits the test through ksft_exit_fail_msg() if the epoll instance cannot
 * be set up, or if epoll_wait() does not report exactly one EPOLLIN event.
 */
static void poll_pidfd(const char *test_name, int pidfd)
{
	int c;
	int epoll_fd = epoll_create1(EPOLL_CLOEXEC);
	/*
	 * Zero-initialised so that the failure message below never prints an
	 * indeterminate events[0] when epoll_wait() returned 0 or -1.
	 */
	struct epoll_event event = { 0 };
	struct epoll_event events[MAX_EVENTS] = { { 0 } };

	if (epoll_fd == -1)
		ksft_exit_fail_msg("%s test: Failed to create epoll file descriptor "
				   "(errno %d)\n",
				   test_name, errno);

	event.events = EPOLLIN;
	event.data.fd = pidfd;

	if (epoll_ctl(epoll_fd, EPOLL_CTL_ADD, pidfd, &event)) {
		ksft_exit_fail_msg("%s test: Failed to add epoll file descriptor "
				   "(errno %d)\n",
				   test_name, errno);
	}

	c = epoll_wait(epoll_fd, events, MAX_EVENTS, 5000);
	if (c != 1 || !(events[0].events & EPOLLIN))
		/*
		 * The two literals form ONE format string; a comma between
		 * them would turn the second into the first vararg and shift
		 * every following argument.
		 */
		ksft_exit_fail_msg("%s test: Unexpected epoll_wait result (c=%d, events=%x) "
				   "(errno %d)\n",
				   test_name, c, events[0].events, errno);

	close(epoll_fd);
	return;
}
static int child_poll_exec_test(void *args)
{
pthread_t t1;
ksft_print_msg("Child (pidfd): starting. pid %d tid %d\n", getpid(),
syscall(SYS_gettid));
pthread_create(&t1, NULL, test_pidfd_poll_exec_thread, NULL);
/*
* Exec in the non-leader thread will destroy the leader immediately.
* If the wait in the parent returns too soon, the test fails.
*/
while (1)
sleep(1);
}
/*
 * test_pidfd_poll_exec() - check that a non-leader thread calling exec does
 * not cause a premature exit notification.
 *
 * @use_waitpid: non-zero to wait with waitpid(), zero to wait by polling
 *               the pidfd
 *
 * The child is created with CLONE_PIDFD. The test passes when the wait took
 * between CHILD_THREAD_MIN_WAIT and CHILD_THREAD_MIN_WAIT + 2 seconds.
 */
static void test_pidfd_poll_exec(int use_waitpid)
{
	int pid, pidfd = 0;
	int status, ret;
	time_t prog_start = time(NULL);
	time_t prog_time;
	const char *test_name = "pidfd_poll check for premature notification on child thread exec";

	ksft_print_msg("Parent: pid: %d\n", getpid());
	pid = pidfd_clone(CLONE_PIDFD, &pidfd, child_poll_exec_test);
	if (pid < 0)
		ksft_exit_fail_msg("%s test: pidfd_clone failed (ret %d, errno %d)\n",
				   test_name, pid, errno);

	ksft_print_msg("Parent: Waiting for Child (%d) to complete.\n", pid);

	if (use_waitpid) {
		ret = waitpid(pid, &status, 0);
		if (ret == -1)
			ksft_print_msg("Parent: error\n");

		if (ret == pid)
			ksft_print_msg("Parent: Child process waited for.\n");
	} else {
		poll_pidfd(test_name, pidfd);
	}

	prog_time = time(NULL) - prog_start;

	/* time_t has no printf conversion of its own; print as long long. */
	ksft_print_msg("Time waited for child: %lld\n", (long long)prog_time);

	close(pidfd);

	if (prog_time < CHILD_THREAD_MIN_WAIT || prog_time > CHILD_THREAD_MIN_WAIT + 2)
		ksft_exit_fail_msg("%s test: Failed\n", test_name);
	else
		ksft_test_result_pass("%s test: Passed\n", test_name);
}
/*
 * Thread body for the leader-exit test: logs its ids, sleeps for
 * CHILD_THREAD_MIN_WAIT seconds and returns.
 *
 * syscall() returns long, so the thread id is cast to int to match the %d
 * conversion.
 */
static void *test_pidfd_poll_leader_exit_thread(void *priv)
{
	ksft_print_msg("Child Thread: starting. pid %d tid %d ; and sleeping\n",
		       getpid(), (int)syscall(SYS_gettid));
	sleep(CHILD_THREAD_MIN_WAIT);
	ksft_print_msg("Child Thread: DONE. pid %d tid %d\n",
		       getpid(), (int)syscall(SYS_gettid));
	return NULL;
}
static time_t *child_exit_secs;
static int child_poll_leader_exit_test(void *args)
{
pthread_t t1, t2;
ksft_print_msg("Child: starting. pid %d tid %d\n", getpid(), syscall(SYS_gettid));
pthread_create(&t1, NULL, test_pidfd_poll_leader_exit_thread, NULL);
pthread_create(&t2, NULL, test_pidfd_poll_leader_exit_thread, NULL);
/*
* glibc exit calls exit_group syscall, so explicity call exit only
* so that only the group leader exits, leaving the threads alone.
*/
*child_exit_secs = time(NULL);
syscall(SYS_exit, 0);
}
/*
 * test_pidfd_poll_leader_exit() - check that the exit of a thread-group
 * leader whose group still has live threads does not notify waiters early.
 *
 * @use_waitpid: non-zero to wait with waitpid(), zero to wait by polling
 *               the pidfd
 *
 * The child stores the time at which its leader exits in a shared anonymous
 * mapping (child_exit_secs). The test passes when the notification arrives
 * between CHILD_THREAD_MIN_WAIT and CHILD_THREAD_MIN_WAIT + 2 seconds after
 * that time.
 */
static void test_pidfd_poll_leader_exit(int use_waitpid)
{
	int pid, pidfd = 0;
	/* ret starts at -1 so the poll path never compares an unset value. */
	int status, ret = -1;
	time_t since_child_exit;
	const char *test_name = "pidfd_poll check for premature notification on non-empty "
				"group leader exit";

	child_exit_secs = mmap(NULL, sizeof *child_exit_secs, PROT_READ | PROT_WRITE,
			       MAP_SHARED | MAP_ANONYMOUS, -1, 0);

	if (child_exit_secs == MAP_FAILED)
		ksft_exit_fail_msg("%s test: mmap failed (errno %d)\n",
				   test_name, errno);

	ksft_print_msg("Parent: pid: %d\n", getpid());
	pid = pidfd_clone(CLONE_PIDFD, &pidfd, child_poll_leader_exit_test);
	if (pid < 0)
		ksft_exit_fail_msg("%s test: pidfd_clone failed (ret %d, errno %d)\n",
				   test_name, pid, errno);

	ksft_print_msg("Parent: Waiting for Child (%d) to complete.\n", pid);

	if (use_waitpid) {
		ret = waitpid(pid, &status, 0);
		if (ret == -1)
			ksft_print_msg("Parent: error\n");
	} else {
		/*
		 * This sleep tests for the case where if the child exits, and is in
		 * EXIT_ZOMBIE, but the thread group leader is non-empty, then the poll
		 * doesn't prematurely return even though there are active threads
		 */
		sleep(1);
		poll_pidfd(test_name, pidfd);
	}

	if (ret == pid)
		ksft_print_msg("Parent: Child process waited for.\n");

	since_child_exit = time(NULL) - *child_exit_secs;

	/* The timestamp has been consumed; release the shared mapping. */
	munmap(child_exit_secs, sizeof *child_exit_secs);
	child_exit_secs = NULL;

	/* time_t has no printf conversion of its own; print as long long. */
	ksft_print_msg("Time since child exit: %lld\n", (long long)since_child_exit);

	close(pidfd);

	if (since_child_exit < CHILD_THREAD_MIN_WAIT ||
	    since_child_exit > CHILD_THREAD_MIN_WAIT + 2)
		ksft_exit_fail_msg("%s test: Failed\n", test_name);
	else
		ksft_test_result_pass("%s test: Passed\n", test_name);
}
int main(int argc, char **argv) int main(int argc, char **argv)
{ {
ksft_print_header(); ksft_print_header();
ksft_set_plan(4); ksft_set_plan(4);
test_pidfd_poll_exec(0);
test_pidfd_poll_exec(1);
test_pidfd_poll_leader_exit(0);
test_pidfd_poll_leader_exit(1);
test_pidfd_send_signal_syscall_support(); test_pidfd_send_signal_syscall_support();
test_pidfd_send_signal_simple_success(); test_pidfd_send_signal_simple_success();
test_pidfd_send_signal_exited_fail(); test_pidfd_send_signal_exited_fail();
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment