Commit a9dce667 authored by Linus Torvalds's avatar Linus Torvalds

Merge tag 'pidfd-v5.1-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux

Pull pidfd system call from Christian Brauner:
 "This introduces the ability to use file descriptors from /proc/<pid>/
  as stable handles on struct pid. Even if a pid is recycled the handle
  will not change. For a start these fds can be used to send signals to
  the processes they refer to.

  With the ability to use /proc/<pid> fds as stable handles on struct
  pid we can fix a long-standing issue where after a process has exited
  its pid can be reused by another process. If a caller sends a signal
  to a reused pid it will end up signaling the wrong process.

  With this patchset we enable a variety of use cases. One obvious
  example is that we can now safely delegate an important part of
  process management - sending signals - to processes other than the
  parent of a given process by sending file descriptors around via scm
  rights and not fearing that the given process will have been recycled
  in the meantime. It also allows for easy testing whether a given
  process is still alive or not by sending signal 0 to a pidfd which is
  quite handy.

  There has been some interest in this feature e.g. from systems
  management (systemd, glibc) and container managers. I have requested
  and gotten comments from glibc to make sure that this syscall is
  suitable for their needs as well. In the future I expect it to take on
  most other pid-based signal syscalls. But such features are left for
  the future once they are needed.

  This has been sitting in linux-next for quite a while and has not
  caused any issues. It comes with selftests which verify basic
  functionality and also test that a recycled pid cannot be signaled via
  a pidfd.

  Jon has written about a prior version of this patchset. It should
  cover the basic functionality since not a lot has changed since then:

      https://lwn.net/Articles/773459/

  The commit message for the syscall itself extensively documents
  the syscall, including its functionality and extensibility"

* tag 'pidfd-v5.1-rc1' of git://git.kernel.org/pub/scm/linux/kernel/git/brauner/linux:
  selftests: add tests for pidfd_send_signal()
  signal: add pidfd_send_signal() syscall
parents f67e3fb4 575a0ae9
...@@ -429,6 +429,7 @@ ...@@ -429,6 +429,7 @@
421 i386 rt_sigtimedwait_time64 sys_rt_sigtimedwait __ia32_compat_sys_rt_sigtimedwait_time64 421 i386 rt_sigtimedwait_time64 sys_rt_sigtimedwait __ia32_compat_sys_rt_sigtimedwait_time64
422 i386 futex_time64 sys_futex __ia32_sys_futex 422 i386 futex_time64 sys_futex __ia32_sys_futex
423 i386 sched_rr_get_interval_time64 sys_sched_rr_get_interval __ia32_sys_sched_rr_get_interval 423 i386 sched_rr_get_interval_time64 sys_sched_rr_get_interval __ia32_sys_sched_rr_get_interval
424 i386 pidfd_send_signal sys_pidfd_send_signal __ia32_sys_pidfd_send_signal
425 i386 io_uring_setup sys_io_uring_setup __ia32_sys_io_uring_setup 425 i386 io_uring_setup sys_io_uring_setup __ia32_sys_io_uring_setup
426 i386 io_uring_enter sys_io_uring_enter __ia32_sys_io_uring_enter 426 i386 io_uring_enter sys_io_uring_enter __ia32_sys_io_uring_enter
427 i386 io_uring_register sys_io_uring_register __ia32_sys_io_uring_register 427 i386 io_uring_register sys_io_uring_register __ia32_sys_io_uring_register
...@@ -345,6 +345,7 @@ ...@@ -345,6 +345,7 @@
334 common rseq __x64_sys_rseq 334 common rseq __x64_sys_rseq
# don't use numbers 387 through 423, add new calls after the last # don't use numbers 387 through 423, add new calls after the last
# 'common' entry # 'common' entry
424 common pidfd_send_signal __x64_sys_pidfd_send_signal
425 common io_uring_setup __x64_sys_io_uring_setup 425 common io_uring_setup __x64_sys_io_uring_setup
426 common io_uring_enter __x64_sys_io_uring_enter 426 common io_uring_enter __x64_sys_io_uring_enter
427 common io_uring_register __x64_sys_io_uring_register 427 common io_uring_register __x64_sys_io_uring_register
......
...@@ -3074,6 +3074,15 @@ static const struct file_operations proc_tgid_base_operations = { ...@@ -3074,6 +3074,15 @@ static const struct file_operations proc_tgid_base_operations = {
.llseek = generic_file_llseek, .llseek = generic_file_llseek,
}; };
/*
 * tgid_pidfd_to_pid - translate an open file into the struct pid behind it
 * @file: the file to inspect
 *
 * A file is accepted only when its dentry is a directory and its file
 * operations are proc_tgid_base_operations.
 *
 * Return: the result of proc_pid() on the file's inode when the file is
 * accepted, ERR_PTR(-EBADF) otherwise.
 */
struct pid *tgid_pidfd_to_pid(const struct file *file)
{
	/* Must be a directory ... */
	if (!d_is_dir(file->f_path.dentry))
		return ERR_PTR(-EBADF);

	/* ... that was opened with the tgid base file operations. */
	if (file->f_op != &proc_tgid_base_operations)
		return ERR_PTR(-EBADF);

	return proc_pid(file_inode(file));
}
static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags) static struct dentry *proc_tgid_base_lookup(struct inode *dir, struct dentry *dentry, unsigned int flags)
{ {
return proc_pident_lookup(dir, dentry, return proc_pident_lookup(dir, dentry,
......
...@@ -73,6 +73,7 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo ...@@ -73,6 +73,7 @@ struct proc_dir_entry *proc_create_net_single_write(const char *name, umode_t mo
int (*show)(struct seq_file *, void *), int (*show)(struct seq_file *, void *),
proc_write_t write, proc_write_t write,
void *data); void *data);
extern struct pid *tgid_pidfd_to_pid(const struct file *file);
#else /* CONFIG_PROC_FS */ #else /* CONFIG_PROC_FS */
...@@ -114,6 +115,11 @@ static inline int remove_proc_subtree(const char *name, struct proc_dir_entry *p ...@@ -114,6 +115,11 @@ static inline int remove_proc_subtree(const char *name, struct proc_dir_entry *p
#define proc_create_net(name, mode, parent, state_size, ops) ({NULL;}) #define proc_create_net(name, mode, parent, state_size, ops) ({NULL;})
#define proc_create_net_single(name, mode, parent, show, data) ({NULL;}) #define proc_create_net_single(name, mode, parent, show, data) ({NULL;})
/*
 * Stub used when CONFIG_PROC_FS is not set: no file can be translated into
 * a struct pid, so every call reports -EBADF.
 */
static inline struct pid *
tgid_pidfd_to_pid(const struct file *file)
{
	return ERR_PTR(-EBADF);
}
#endif /* CONFIG_PROC_FS */ #endif /* CONFIG_PROC_FS */
struct net; struct net;
......
...@@ -985,6 +985,9 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags, ...@@ -985,6 +985,9 @@ asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
unsigned mask, struct statx __user *buffer); unsigned mask, struct statx __user *buffer);
asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len, asmlinkage long sys_rseq(struct rseq __user *rseq, uint32_t rseq_len,
int flags, uint32_t sig); int flags, uint32_t sig);
asmlinkage long sys_pidfd_send_signal(int pidfd, int sig,
siginfo_t __user *info,
unsigned int flags);
/* /*
* Architecture-specific system calls * Architecture-specific system calls
......
...@@ -824,6 +824,8 @@ __SYSCALL(__NR_futex_time64, sys_futex) ...@@ -824,6 +824,8 @@ __SYSCALL(__NR_futex_time64, sys_futex)
__SYSCALL(__NR_sched_rr_get_interval_time64, sys_sched_rr_get_interval) __SYSCALL(__NR_sched_rr_get_interval_time64, sys_sched_rr_get_interval)
#endif #endif
#define __NR_pidfd_send_signal 424
__SYSCALL(__NR_pidfd_send_signal, sys_pidfd_send_signal)
#define __NR_io_uring_setup 425 #define __NR_io_uring_setup 425
__SYSCALL(__NR_io_uring_setup, sys_io_uring_setup) __SYSCALL(__NR_io_uring_setup, sys_io_uring_setup)
#define __NR_io_uring_enter 426 #define __NR_io_uring_enter 426
......
...@@ -19,7 +19,9 @@ ...@@ -19,7 +19,9 @@
#include <linux/sched/task.h> #include <linux/sched/task.h>
#include <linux/sched/task_stack.h> #include <linux/sched/task_stack.h>
#include <linux/sched/cputime.h> #include <linux/sched/cputime.h>
#include <linux/file.h>
#include <linux/fs.h> #include <linux/fs.h>
#include <linux/proc_fs.h>
#include <linux/tty.h> #include <linux/tty.h>
#include <linux/binfmts.h> #include <linux/binfmts.h>
#include <linux/coredump.h> #include <linux/coredump.h>
...@@ -3487,6 +3489,16 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time32, compat_sigset_t __user *, uthese, ...@@ -3487,6 +3489,16 @@ COMPAT_SYSCALL_DEFINE4(rt_sigtimedwait_time32, compat_sigset_t __user *, uthese,
#endif #endif
#endif #endif
/*
 * prepare_kill_siginfo - build the siginfo of a plain kill()-style signal
 * @sig:  signal number to record
 * @info: siginfo to initialise; any previous content is cleared
 *
 * The result describes a SI_USER signal sent by the current task: si_pid is
 * task_tgid_vnr(current) and si_uid is the current uid mapped through
 * from_kuid_munged() into the current user namespace.
 */
static inline void prepare_kill_siginfo(int sig, struct kernel_siginfo *info)
{
	clear_siginfo(info);

	/* What is being sent. */
	info->si_signo = sig;
	info->si_code = SI_USER;
	info->si_errno = 0;

	/* Who is sending it. */
	info->si_uid = from_kuid_munged(current_user_ns(), current_uid());
	info->si_pid = task_tgid_vnr(current);
}
/** /**
* sys_kill - send a signal to a process * sys_kill - send a signal to a process
* @pid: the PID of the process * @pid: the PID of the process
...@@ -3496,16 +3508,125 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig) ...@@ -3496,16 +3508,125 @@ SYSCALL_DEFINE2(kill, pid_t, pid, int, sig)
{ {
struct kernel_siginfo info; struct kernel_siginfo info;
clear_siginfo(&info); prepare_kill_siginfo(sig, &info);
info.si_signo = sig;
info.si_errno = 0;
info.si_code = SI_USER;
info.si_pid = task_tgid_vnr(current);
info.si_uid = from_kuid_munged(current_user_ns(), current_uid());
return kill_something_info(sig, &info, pid); return kill_something_info(sig, &info, pid);
} }
#ifdef CONFIG_PROC_FS
/*
 * access_pidfd_pidns - may the current task signal @pid through a pidfd?
 * @pid: the struct pid the pidfd refers to
 *
 * Starting at the pid namespace of @pid, walk up the ->parent chain looking
 * for the caller's active pid namespace. Finding it means the signaler is in
 * the same pid namespace as the signalee or in one of its ancestors.
 *
 * Return: true if the caller's namespace is found on the chain, false if the
 * chain ends first.
 */
static bool access_pidfd_pidns(struct pid *pid)
{
	struct pid_namespace *active = task_active_pid_ns(current);
	struct pid_namespace *ns;

	for (ns = ns_of_pid(pid); ns; ns = ns->parent) {
		if (ns == active)
			return true;
	}

	return false;
}
/*
 * copy_siginfo_from_user_any - copy a siginfo from userspace, native or compat
 * @kinfo: kernel siginfo to fill
 * @info:  userspace pointer to the caller's siginfo
 *
 * @info points into userspace and is annotated __user accordingly so that
 * sparse can check the address space: it is handed to
 * copy_siginfo_from_user() or, for compat callers, reinterpreted as a
 * struct compat_siginfo __user pointer.
 *
 * Return: the return value of the copy helper that was used.
 */
static int copy_siginfo_from_user_any(kernel_siginfo_t *kinfo,
				      siginfo_t __user *info)
{
#ifdef CONFIG_COMPAT
	/*
	 * Avoid hooking up compat syscalls and instead handle necessary
	 * conversions here. Note, this is a stop-gap measure and should not be
	 * considered a generic solution.
	 */
	if (in_compat_syscall())
		return copy_siginfo_from_user32(
			kinfo, (struct compat_siginfo __user *)info);
#endif
	return copy_siginfo_from_user(kinfo, info);
}
/**
 * sys_pidfd_send_signal - send a signal to a process through a task file
 *                         descriptor
 * @pidfd: the file descriptor of the process
 * @sig:   signal to be sent
 * @info:  the signal info
 * @flags: future flags to be passed
 *
 * The syscall currently only signals via PIDTYPE_PID which covers
 * kill(<positive-pid>, <signal>). It does not signal threads or process
 * groups.
 * In order to extend the syscall to threads and process groups the @flags
 * argument should be used. In essence, the @flags argument will determine
 * what is signaled and not the file descriptor itself. Put in other words,
 * grouping is a property of the flags argument not a property of the file
 * descriptor.
 *
 * Return: 0 on success, negative errno on failure. Failures produced here
 * are -EINVAL (non-zero @flags, target pid namespace not reachable from the
 * caller's, or @sig differing from @info's si_signo), -EBADF (@pidfd is not
 * an open file or not a pidfd) and -EPERM (disallowed si_code for a foreign
 * target); other values come from the siginfo copy or kill_pid_info().
 */
SYSCALL_DEFINE4(pidfd_send_signal, int, pidfd, int, sig,
		siginfo_t __user *, info, unsigned int, flags)
{
	int ret;
	struct fd f;
	struct pid *pid;
	kernel_siginfo_t kinfo;

	/* Enforce flags be set to 0 until we add an extension. */
	if (flags)
		return -EINVAL;

	/*
	 * O_PATH file descriptors are not accepted as pidfds, so the regular
	 * fdget() lookup is the right one; the raw variant is not needed.
	 */
	f = fdget(pidfd);
	if (!f.file)
		return -EBADF;

	/* Is this a pidfd? */
	pid = tgid_pidfd_to_pid(f.file);
	if (IS_ERR(pid)) {
		ret = PTR_ERR(pid);
		goto err;
	}

	/* The target must live in the caller's pid namespace or below it. */
	ret = -EINVAL;
	if (!access_pidfd_pidns(pid))
		goto err;

	if (info) {
		ret = copy_siginfo_from_user_any(&kinfo, info);
		if (unlikely(ret))
			goto err;

		/* The signal number in @info has to agree with @sig. */
		ret = -EINVAL;
		if (unlikely(sig != kinfo.si_signo))
			goto err;

		if ((task_pid(current) != pid) &&
		    (kinfo.si_code >= 0 || kinfo.si_code == SI_TKILL)) {
			/* Only allow sending arbitrary signals to yourself. */
			ret = -EPERM;
			if (kinfo.si_code != SI_USER)
				goto err;

			/* Turn this into a regular kill signal. */
			prepare_kill_siginfo(sig, &kinfo);
		}
	} else {
		/* No siginfo supplied: behave like kill(). */
		prepare_kill_siginfo(sig, &kinfo);
	}

	ret = kill_pid_info(sig, &kinfo, pid);

err:
	fdput(f);
	return ret;
}
#endif /* CONFIG_PROC_FS */
static int static int
do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info) do_send_specific(pid_t tgid, pid_t pid, int sig, struct kernel_siginfo *info)
{ {
......
...@@ -168,6 +168,7 @@ COND_SYSCALL(syslog); ...@@ -168,6 +168,7 @@ COND_SYSCALL(syslog);
/* kernel/sched/core.c */ /* kernel/sched/core.c */
/* kernel/signal.c */ /* kernel/signal.c */
COND_SYSCALL(pidfd_send_signal);
/* kernel/sys.c */ /* kernel/sys.c */
COND_SYSCALL(setregid); COND_SYSCALL(setregid);
......
...@@ -32,6 +32,7 @@ TARGETS += net ...@@ -32,6 +32,7 @@ TARGETS += net
TARGETS += netfilter TARGETS += netfilter
TARGETS += networking/timestamping TARGETS += networking/timestamping
TARGETS += nsfs TARGETS += nsfs
TARGETS += pidfd
TARGETS += powerpc TARGETS += powerpc
TARGETS += proc TARGETS += proc
TARGETS += pstore TARGETS += pstore
......
CFLAGS += -g -I../../../../usr/include/
TEST_GEN_PROGS := pidfd_test
include ../lib.mk
/* SPDX-License-Identifier: GPL-2.0 */
#define _GNU_SOURCE
#include <errno.h>
#include <fcntl.h>
#include <linux/types.h>
#include <sched.h>
#include <signal.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <syscall.h>
#include <sys/mount.h>
#include <sys/wait.h>
#include <unistd.h>
#include "../kselftest.h"
/*
 * Older installed headers do not know the syscall yet. Fall back to an
 * invalid syscall number so the test still builds; the call then fails at
 * run time instead of the build breaking (NOTE(review): expected to surface
 * as ENOSYS, which the support check treats as "skip" - confirm on the
 * target architecture).
 */
#ifndef __NR_pidfd_send_signal
#define __NR_pidfd_send_signal -1
#endif

/*
 * Thin wrapper around the raw pidfd_send_signal() syscall.
 *
 * @pidfd: file descriptor referring to the target process
 * @sig:   signal to send
 * @info:  optional siginfo, may be NULL
 * @flags: syscall flags
 *
 * Returns whatever syscall() returns, converted to int.
 */
static inline int sys_pidfd_send_signal(int pidfd, int sig, siginfo_t *info,
					unsigned int flags)
{
	return syscall(__NR_pidfd_send_signal, pidfd, sig, info, flags);
}
/*
 * Set to 1 by the SIGUSR1 handler below. It is written from signal context
 * and read from normal context, hence volatile sig_atomic_t.
 */
static volatile sig_atomic_t signal_received;

/* Signal handler: record that SIGUSR1 arrived; ignore every other signal. */
static void set_signal_received_on_sigusr1(int sig)
{
	if (sig == SIGUSR1)
		signal_received = 1;
}
/*
 * The most direct check that pidfd_send_signal() works: open /proc/self,
 * install a SIGUSR1 handler and send SIGUSR1 to ourselves through that
 * file descriptor. The handler must have run by the time we look.
 */
static int test_pidfd_send_signal_simple_success(void)
{
	const char *test_name = "pidfd_send_signal send SIGUSR1";
	int self_fd;
	int err;

	self_fd = open("/proc/self", O_DIRECTORY | O_CLOEXEC);
	if (self_fd < 0)
		ksft_exit_fail_msg(
			"%s test: Failed to open process file descriptor\n",
			test_name);

	signal(SIGUSR1, set_signal_received_on_sigusr1);

	err = sys_pidfd_send_signal(self_fd, SIGUSR1, NULL, 0);
	/* The descriptor is no longer needed whatever the outcome was. */
	close(self_fd);

	if (err < 0)
		ksft_exit_fail_msg("%s test: Failed to send signal\n",
				   test_name);

	if (signal_received != 1)
		ksft_exit_fail_msg("%s test: Failed to receive signal\n",
				   test_name);

	/* Reset the flag. */
	signal_received = 0;

	ksft_test_result_pass("%s test: Sent signal\n", test_name);
	return 0;
}
/*
 * Reap child @pid, retrying when waitpid() is interrupted (EINTR) or
 * returns a pid other than the one asked for.
 *
 * Returns the child's exit status if it exited normally, -1 if waitpid()
 * failed for another reason or the child did not exit normally.
 */
static int wait_for_pid(pid_t pid)
{
	int wstatus;

	for (;;) {
		int reaped = waitpid(pid, &wstatus, 0);

		if (reaped == -1) {
			if (errno != EINTR)
				return -1;
			continue;
		}

		if (reaped == pid)
			break;
	}

	return WIFEXITED(wstatus) ? WEXITSTATUS(wstatus) : -1;
}
/*
 * Signalling a process that has already exited and been reaped must fail
 * with ESRCH: fork a child that exits at once, open /proc/<pid> for it,
 * reap it, then send signal 0 through the descriptor.
 */
static int test_pidfd_send_signal_exited_fail(void)
{
	const char *test_name = "pidfd_send_signal signal exited process";
	char path[256];
	pid_t child;
	int fd, err, send_errno;

	child = fork();
	if (child < 0)
		ksft_exit_fail_msg("%s test: Failed to create new process\n",
				   test_name);

	if (child == 0)
		_exit(EXIT_SUCCESS);

	snprintf(path, sizeof(path), "/proc/%d", child);

	/* Open before reaping; the open result is checked after the wait. */
	fd = open(path, O_DIRECTORY | O_CLOEXEC);

	(void)wait_for_pid(child);

	if (fd < 0)
		ksft_exit_fail_msg(
			"%s test: Failed to open process file descriptor\n",
			test_name);

	err = sys_pidfd_send_signal(fd, 0, NULL, 0);
	/* close() may overwrite errno, so keep the syscall's value. */
	send_errno = errno;
	close(fd);

	if (err == 0)
		ksft_exit_fail_msg(
			"%s test: Managed to send signal to process even though it should have failed\n",
			test_name);

	if (send_errno != ESRCH)
		ksft_exit_fail_msg(
			"%s test: Expected to receive ESRCH as errno value but received %d instead\n",
			test_name, send_errno);

	ksft_test_result_pass("%s test: Failed to send signal as expected\n",
			      test_name);
	return 0;
}
/*
* The kernel reserves 300 pids via RESERVED_PIDS in kernel/pid.c
* That means, when it wraps around any pid < 300 will be skipped.
* So we need to use a pid > 300 in order to test recycling.
*/
#define PID_RECYCLE 1000
/*
* Maximum number of cycles we allow. This is equivalent to PID_MAX_DEFAULT.
* If users set a higher limit or we have cycled PIDFD_MAX_DEFAULT number of
* times then we skip the test to not go into an infinite loop or block for a
* long time.
*/
#define PIDFD_MAX_DEFAULT 0x8000
/*
* Define a few custom error codes for the child process to clearly indicate
* what is happening. This way we can tell the difference between a system
* error, a test error, etc.
*/
#define PIDFD_PASS 0
#define PIDFD_FAIL 1
#define PIDFD_ERROR 2
#define PIDFD_SKIP 3
#define PIDFD_XFAIL 4
/*
 * Verify that a pidfd does not follow a recycled pid.
 *
 * The test unshares the pid and mount namespaces and forks a child that
 * mounts a fresh /proc. That child forks short-lived processes until one
 * gets pid PID_RECYCLE, opens /proc/PID_RECYCLE for it and reaps it. It then
 * keeps forking until a new process is handed PID_RECYCLE again and tries to
 * signal through the old descriptor, which must fail with ESRCH.
 *
 * The child reports the outcome through its exit status (PIDFD_*), which the
 * parent translates into a kselftest result.
 *
 * Returns 0 when the test passed or was skipped; every failure ends the
 * program through ksft_exit_fail_msg().
 */
static int test_pidfd_send_signal_recycled_pid_fail(void)
{
	int i, ret;
	pid_t pid1;
	const char *test_name = "pidfd_send_signal signal recycled pid";

	ret = unshare(CLONE_NEWPID);
	if (ret < 0) {
		/*
		 * Lacking the privilege to create namespaces is a property of
		 * the environment, not a failure of the syscall under test.
		 */
		if (errno == EPERM) {
			ksft_print_msg(
				"%s test: Unsharing pid namespace not permitted, skipping test\n",
				test_name);
			return 0;
		}
		ksft_exit_fail_msg("%s test: Failed to unshare pid namespace\n",
				   test_name);
	}

	ret = unshare(CLONE_NEWNS);
	if (ret < 0)
		ksft_exit_fail_msg(
			"%s test: Failed to unshare mount namespace\n",
			test_name);

	ret = mount(NULL, "/", NULL, MS_REC | MS_PRIVATE, 0);
	if (ret < 0)
		ksft_exit_fail_msg("%s test: Failed to remount / private\n",
				   test_name);

	/* pid 1 in new pid namespace */
	pid1 = fork();
	if (pid1 < 0)
		ksft_exit_fail_msg("%s test: Failed to create new process\n",
				   test_name);

	if (pid1 == 0) {
		char buf[256];
		pid_t pid2;
		int pidfd = -1;

		(void)umount2("/proc", MNT_DETACH);
		ret = mount("proc", "/proc", "proc", 0, NULL);
		if (ret < 0)
			_exit(PIDFD_ERROR);

		/* grab pid PID_RECYCLE */
		for (i = 0; i <= PIDFD_MAX_DEFAULT; i++) {
			pid2 = fork();
			if (pid2 < 0)
				_exit(PIDFD_ERROR);

			if (pid2 == 0)
				_exit(PIDFD_PASS);

			if (pid2 == PID_RECYCLE) {
				snprintf(buf, sizeof(buf), "/proc/%d", pid2);
				ksft_print_msg("pid to recycle is %d\n", pid2);
				pidfd = open(buf, O_DIRECTORY | O_CLOEXEC);
			}

			if (wait_for_pid(pid2))
				_exit(PIDFD_ERROR);

			if (pid2 >= PID_RECYCLE)
				break;
		}

		/*
		 * We want to be as predictable as we can so if we haven't been
		 * able to grab pid PID_RECYCLE skip the test.
		 */
		if (pid2 != PID_RECYCLE) {
			/* skip test; pidfd was only opened for PID_RECYCLE */
			if (pidfd >= 0)
				close(pidfd);
			_exit(PIDFD_SKIP);
		}

		if (pidfd < 0)
			_exit(PIDFD_ERROR);

		for (i = 0; i <= PIDFD_MAX_DEFAULT; i++) {
			char c;
			int pipe_fds[2];
			pid_t recycled_pid;
			int child_ret = PIDFD_PASS;

			ret = pipe2(pipe_fds, O_CLOEXEC);
			if (ret < 0)
				_exit(PIDFD_ERROR);

			recycled_pid = fork();
			if (recycled_pid < 0)
				_exit(PIDFD_ERROR);

			if (recycled_pid == 0) {
				/* Block until the parent closes its write end. */
				close(pipe_fds[1]);
				(void)read(pipe_fds[0], &c, 1);
				close(pipe_fds[0]);

				_exit(PIDFD_PASS);
			}

			/*
			 * Stop the child so we can inspect whether we have
			 * recycled pid PID_RECYCLE.
			 */
			close(pipe_fds[0]);
			ret = kill(recycled_pid, SIGSTOP);
			close(pipe_fds[1]);
			if (ret) {
				(void)wait_for_pid(recycled_pid);
				_exit(PIDFD_ERROR);
			}

			/*
			 * We have recycled the pid. Try to signal it. This
			 * needs to fail since this is a different process than
			 * the one the pidfd refers to.
			 */
			if (recycled_pid == PID_RECYCLE) {
				ret = sys_pidfd_send_signal(pidfd, SIGCONT,
							    NULL, 0);
				if (ret && errno == ESRCH)
					child_ret = PIDFD_XFAIL;
				else
					child_ret = PIDFD_FAIL;
			}

			/* let the process move on */
			ret = kill(recycled_pid, SIGCONT);
			if (ret)
				(void)kill(recycled_pid, SIGKILL);

			if (wait_for_pid(recycled_pid))
				_exit(PIDFD_ERROR);

			switch (child_ret) {
			case PIDFD_FAIL:
				/* fallthrough */
			case PIDFD_XFAIL:
				_exit(child_ret);
			case PIDFD_PASS:
				break;
			default:
				/* not reached */
				_exit(PIDFD_ERROR);
			}

			/*
			 * If the user set a custom pid_max limit we could be
			 * in the millions.
			 * Skip the test in this case.
			 */
			if (recycled_pid > PIDFD_MAX_DEFAULT)
				_exit(PIDFD_SKIP);
		}

		/* failed to recycle pid */
		_exit(PIDFD_SKIP);
	}

	/* Translate the child's exit status into a kselftest result. */
	ret = wait_for_pid(pid1);
	switch (ret) {
	case PIDFD_FAIL:
		ksft_exit_fail_msg(
			"%s test: Managed to signal recycled pid %d\n",
			test_name, PID_RECYCLE);
	case PIDFD_PASS:
		ksft_exit_fail_msg("%s test: Failed to recycle pid %d\n",
				   test_name, PID_RECYCLE);
	case PIDFD_SKIP:
		ksft_print_msg("%s test: Skipping test\n", test_name);
		ret = 0;
		break;
	case PIDFD_XFAIL:
		ksft_test_result_pass(
			"%s test: Failed to signal recycled pid as expected\n",
			test_name);
		ret = 0;
		break;
	default /* PIDFD_ERROR */:
		ksft_exit_fail_msg("%s test: Error while running tests\n",
				   test_name);
	}

	return ret;
}
/*
 * Probe for pidfd_send_signal() by sending signal 0 to ourselves through
 * /proc/self. ENOSYS leads to ksft_exit_skip(); any other failure leads to
 * ksft_exit_fail_msg().
 */
static int test_pidfd_send_signal_syscall_support(void)
{
	const char *test_name = "pidfd_send_signal check for support";
	int self_fd;

	self_fd = open("/proc/self", O_DIRECTORY | O_CLOEXEC);
	if (self_fd < 0)
		ksft_exit_fail_msg(
			"%s test: Failed to open process file descriptor\n",
			test_name);

	if (sys_pidfd_send_signal(self_fd, 0, NULL, 0) < 0) {
		/*
		 * pidfd_send_signal() will currently return ENOSYS when
		 * CONFIG_PROC_FS is not set.
		 */
		if (errno == ENOSYS)
			ksft_exit_skip(
				"%s test: pidfd_send_signal() syscall not supported (Ensure that CONFIG_PROC_FS=y is set)\n",
				test_name);

		ksft_exit_fail_msg("%s test: Failed to send signal\n",
				   test_name);
	}

	close(self_fd);

	ksft_test_result_pass(
		"%s test: pidfd_send_signal() syscall is supported. Tests can be executed\n",
		test_name);
	return 0;
}
/*
 * Test driver: print the kselftest header, run the tests in order (the
 * syscall support probe first) and finish with ksft_exit_pass().
 */
int main(int argc, char **argv)
{
	/* Command line arguments are not used. */
	(void)argc;
	(void)argv;

	ksft_print_header();

	test_pidfd_send_signal_syscall_support();
	test_pidfd_send_signal_simple_success();
	test_pidfd_send_signal_exited_fail();
	test_pidfd_send_signal_recycled_pid_fail();

	return ksft_exit_pass();
}
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment