Commit da493dbb authored by Jakub Kicinski

Merge branch 'af_unix-rework-gc'

Kuniyuki Iwashima says:

====================
af_unix: Rework GC.

When we pass a file descriptor to an AF_UNIX socket via SCM_RIGHTS,
the underlying struct file of the inflight fd gets its refcount bumped.
If the fd is of an AF_UNIX socket, we need to track it in case it forms
cyclic references.

Let's say we send a fd of AF_UNIX socket A to B and vice versa and
close() both sockets.

When created, each socket's struct file initially has one reference.
After the fd exchange, both refcounts are bumped up to 2.  Then, close()
decreases both to 1.  From this point on, no one can touch the file/socket.

However, the struct file has one refcount and thus never calls the
release() function of the AF_UNIX socket.

That's why we need to track all inflight AF_UNIX sockets and run garbage
collection.

This series replaces the current GC implementation that locks each inflight
socket's receive queue and requires trickiness in other places.

The new GC does not lock each socket's queue to minimise its effect and
tries to be lightweight if there is no cyclic reference or no update in
the shape of the inflight fd graph.

The new implementation is based on Tarjan's Strongly Connected Components
algorithm, and we will consider each inflight AF_UNIX socket as a vertex
and its file descriptor as an edge in a directed graph.

For the details, please see each patch.

  patch 1  -  3 : Add struct to express inflight socket graphs
  patch       4 : Optimise inflight fd counting
  patch 5  -  6 : Group SCC possibly forming a cycle
  patch 7  -  8 : Support embryo socket
  patch 9  - 11 : Make GC lightweight
  patch 12 - 13 : Detect dead cycle references
  patch      14 : Replace GC algorithm
  patch      15 : selftest

After this series is applied, we can remove the two ugly tricks for race,
scm_fp_dup() in unix_attach_fds() and spin_lock dance in unix_peek_fds()
as done in patch 14/15 of v1.

Also, we will add cond_resched_lock() in __unix_gc() and convert it to
use a dedicated kthread instead of global system workqueue as suggested
by Paolo in a v4 thread.

v4: https://lore.kernel.org/netdev/20240301022243.73908-1-kuniyu@amazon.com/
v3: https://lore.kernel.org/netdev/20240223214003.17369-1-kuniyu@amazon.com/
v2: https://lore.kernel.org/netdev/20240216210556.65913-1-kuniyu@amazon.com/
v1: https://lore.kernel.org/netdev/20240203030058.60750-1-kuniyu@amazon.com/
====================

Link: https://lore.kernel.org/r/20240325202425.60930-1-kuniyu@amazon.com
Signed-off-by: Jakub Kicinski <kuba@kernel.org>
parents 50e2907e 2aa0cff2
......@@ -19,12 +19,30 @@ static inline struct unix_sock *unix_get_socket(struct file *filp)
extern spinlock_t unix_gc_lock;
extern unsigned int unix_tot_inflight;
void unix_inflight(struct user_struct *user, struct file *fp);
void unix_notinflight(struct user_struct *user, struct file *fp);
void unix_add_edges(struct scm_fp_list *fpl, struct unix_sock *receiver);
void unix_del_edges(struct scm_fp_list *fpl);
void unix_update_edges(struct unix_sock *receiver);
int unix_prepare_fpl(struct scm_fp_list *fpl);
void unix_destroy_fpl(struct scm_fp_list *fpl);
void unix_gc(void);
void wait_for_unix_gc(struct scm_fp_list *fpl);
/*
 * Vertex of the inflight fd graph.  Per the series description, each
 * inflight AF_UNIX socket is treated as a vertex of a directed graph and
 * each inflight fd as an edge; struct unix_sock carries a pointer to its
 * vertex.
 *
 * NOTE(review): the per-field notes below are inferred from names and
 * from the Tarjan SCC design mentioned in the cover letter -- confirm
 * against the GC implementation.
 */
struct unix_vertex {
/* presumably the unix_edge entries attached to this vertex */
struct list_head edges;
/* presumably links this vertex into a GC-wide vertex list */
struct list_head entry;
/* presumably links this vertex into the list of its SCC */
struct list_head scc_entry;
/* presumably the number of edges leaving this vertex */
unsigned long out_degree;
/* presumably the Tarjan visit index */
unsigned long index;
/* presumably identifies the SCC this vertex was grouped into */
unsigned long scc_index;
};
/*
 * Directed edge of the inflight fd graph.  Per the series description,
 * an edge stands for one inflight AF_UNIX fd.  struct scm_fp_list holds a
 * pointer to its edges.
 *
 * NOTE(review): which end is the sent socket and which the receiving
 * socket is not visible here -- presumably predecessor is the inflight
 * socket and successor the receiver; confirm in unix_add_edges().
 */
struct unix_edge {
struct unix_sock *predecessor;
struct unix_sock *successor;
/* presumably links the edge into unix_vertex.edges */
struct list_head vertex_entry;
/* presumably links the edge into the traversal stack used by the GC */
struct list_head stack_entry;
};
struct sock *unix_peer_get(struct sock *sk);
#define UNIX_HASH_MOD (256 - 1)
......@@ -62,12 +80,9 @@ struct unix_sock {
struct path path;
struct mutex iolock, bindlock;
struct sock *peer;
struct list_head link;
unsigned long inflight;
struct sock *listener;
struct unix_vertex *vertex;
spinlock_t lock;
unsigned long gc_flags;
#define UNIX_GC_CANDIDATE 0
#define UNIX_GC_MAYBE_CYCLE 1
struct socket_wq peer_wq;
wait_queue_entry_t peer_wake;
struct scm_stat scm_stat;
......
......@@ -23,10 +23,19 @@ struct scm_creds {
kgid_t gid;
};
#ifdef CONFIG_UNIX
struct unix_edge;
#endif
/*
 * List of files being passed in one SCM_RIGHTS control message.
 *
 * The CONFIG_UNIX-only members carry the AF_UNIX garbage-collector
 * state for this list; scm_fp_copy() and scm_fp_dup() initialise them to
 * false / an empty list / NULL.
 */
struct scm_fp_list {
short count;
short count_unix;
short max;
#ifdef CONFIG_UNIX
bool inflight;
struct list_head vertices;
struct unix_edge *edges;
#endif
struct user_struct *user;
struct file *fp[SCM_MAX_FD];
};
......
......@@ -89,6 +89,11 @@ static int scm_fp_copy(struct cmsghdr *cmsg, struct scm_fp_list **fplp)
fpl->count_unix = 0;
fpl->max = SCM_MAX_FD;
fpl->user = NULL;
#if IS_ENABLED(CONFIG_UNIX)
fpl->inflight = false;
fpl->edges = NULL;
INIT_LIST_HEAD(&fpl->vertices);
#endif
}
fpp = &fpl->fp[fpl->count];
......@@ -376,8 +381,14 @@ struct scm_fp_list *scm_fp_dup(struct scm_fp_list *fpl)
if (new_fpl) {
for (i = 0; i < fpl->count; i++)
get_file(fpl->fp[i]);
new_fpl->max = new_fpl->count;
new_fpl->user = get_uid(fpl->user);
#if IS_ENABLED(CONFIG_UNIX)
new_fpl->inflight = false;
new_fpl->edges = NULL;
INIT_LIST_HEAD(&new_fpl->vertices);
#endif
}
return new_fpl;
}
......
......@@ -979,11 +979,11 @@ static struct sock *unix_create1(struct net *net, struct socket *sock, int kern,
sk->sk_max_ack_backlog = net->unx.sysctl_max_dgram_qlen;
sk->sk_destruct = unix_sock_destructor;
u = unix_sk(sk);
u->inflight = 0;
u->listener = NULL;
u->vertex = NULL;
u->path.dentry = NULL;
u->path.mnt = NULL;
spin_lock_init(&u->lock);
INIT_LIST_HEAD(&u->link);
mutex_init(&u->iolock); /* single task reading lock */
mutex_init(&u->bindlock); /* single task binding lock */
init_waitqueue_head(&u->peer_wait);
......@@ -1597,6 +1597,7 @@ static int unix_stream_connect(struct socket *sock, struct sockaddr *uaddr,
newsk->sk_type = sk->sk_type;
init_peercred(newsk);
newu = unix_sk(newsk);
newu->listener = other;
RCU_INIT_POINTER(newsk->sk_wq, &newu->peer_wq);
otheru = unix_sk(other);
......@@ -1692,8 +1693,8 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
bool kern)
{
struct sock *sk = sock->sk;
struct sock *tsk;
struct sk_buff *skb;
struct sock *tsk;
int err;
err = -EOPNOTSUPP;
......@@ -1718,6 +1719,7 @@ static int unix_accept(struct socket *sock, struct socket *newsock, int flags,
}
tsk = skb->sk;
unix_update_edges(unix_sk(tsk));
skb_free_datagram(sk, skb);
wake_up_interruptible(&unix_sk(sk)->peer_wait);
......@@ -1789,8 +1791,6 @@ static inline bool too_many_unix_fds(struct task_struct *p)
static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
int i;
if (too_many_unix_fds(current))
return -ETOOMANYREFS;
......@@ -1802,21 +1802,18 @@ static int unix_attach_fds(struct scm_cookie *scm, struct sk_buff *skb)
if (!UNIXCB(skb).fp)
return -ENOMEM;
for (i = scm->fp->count - 1; i >= 0; i--)
unix_inflight(scm->fp->user, scm->fp->fp[i]);
if (unix_prepare_fpl(UNIXCB(skb).fp))
return -ENOMEM;
return 0;
}
static void unix_detach_fds(struct scm_cookie *scm, struct sk_buff *skb)
{
int i;
scm->fp = UNIXCB(skb).fp;
UNIXCB(skb).fp = NULL;
for (i = scm->fp->count - 1; i >= 0; i--)
unix_notinflight(scm->fp->user, scm->fp->fp[i]);
unix_destroy_fpl(scm->fp);
}
static void unix_peek_fds(struct scm_cookie *scm, struct sk_buff *skb)
......@@ -1937,8 +1934,10 @@ static void scm_stat_add(struct sock *sk, struct sk_buff *skb)
struct scm_fp_list *fp = UNIXCB(skb).fp;
struct unix_sock *u = unix_sk(sk);
if (unlikely(fp && fp->count))
if (unlikely(fp && fp->count)) {
atomic_add(fp->count, &u->scm_stat.nr_fds);
unix_add_edges(fp, u);
}
}
static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
......@@ -1946,8 +1945,10 @@ static void scm_stat_del(struct sock *sk, struct sk_buff *skb)
struct scm_fp_list *fp = UNIXCB(skb).fp;
struct unix_sock *u = unix_sk(sk);
if (unlikely(fp && fp->count))
if (unlikely(fp && fp->count)) {
atomic_sub(fp->count, &u->scm_stat.nr_fds);
unix_del_edges(fp);
}
}
/*
......
This diff is collapsed.
......@@ -31,6 +31,7 @@ reuseport_dualstack
rxtimestamp
sctp_hello
scm_pidfd
scm_rights
sk_bind_sendto_listen
sk_connect_zero_addr
socket
......
CFLAGS += $(KHDR_INCLUDES)
TEST_GEN_PROGS := diag_uid test_unix_oob unix_connect scm_pidfd
TEST_GEN_PROGS := diag_uid test_unix_oob unix_connect scm_pidfd scm_rights
include ../../lib.mk
// SPDX-License-Identifier: GPL-2.0
/* Copyright Amazon.com Inc. or its affiliates. */
#define _GNU_SOURCE
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/socket.h>
#include <sys/un.h>
#include "../../kselftest_harness.h"
/*
 * Per-test state: the socket fds under test.
 *
 * The helpers in this file fill and consume the array two entries at a
 * time, so "pair k" occupies fd[2 * k] and fd[2 * k + 1]; 16 slots
 * therefore hold at most 8 pairs.
 */
FIXTURE(scm_rights)
{
int fd[16];
};
/* Parameters that distinguish the test variants. */
FIXTURE_VARIANT(scm_rights)
{
/* Prefix matched against the start of each /proc/net/protocols line. */
char name[16];
/* Socket type handed to socketpair() when test_listener is false. */
int type;
/* Flags handed to sendmsg() when passing the fds. */
int flags;
/* true: use listener + connected client pairs instead of socketpair(). */
bool test_listener;
};
/*
 * Variant table.
 *
 * The .name strings end with a space on purpose: count_sockets() compares
 * only strlen(name) leading bytes, so "UNIX " cannot match a line that
 * starts with "UNIX-STREAM".
 */

/* Datagram socketpairs, plain sendmsg(). */
FIXTURE_VARIANT_ADD(scm_rights, dgram)
{
.name = "UNIX ",
.type = SOCK_DGRAM,
.flags = 0,
.test_listener = false,
};
/* Stream socketpairs, plain sendmsg(). */
FIXTURE_VARIANT_ADD(scm_rights, stream)
{
.name = "UNIX-STREAM ",
.type = SOCK_STREAM,
.flags = 0,
.test_listener = false,
};
/* Stream socketpairs, fds sent with MSG_OOB. */
FIXTURE_VARIANT_ADD(scm_rights, stream_oob)
{
.name = "UNIX-STREAM ",
.type = SOCK_STREAM,
.flags = MSG_OOB,
.test_listener = false,
};
/* Listener + connected client pairs, plain sendmsg(). */
FIXTURE_VARIANT_ADD(scm_rights, stream_listener)
{
.name = "UNIX-STREAM ",
.type = SOCK_STREAM,
.flags = 0,
.test_listener = true,
};
/* Listener + connected client pairs, fds sent with MSG_OOB. */
FIXTURE_VARIANT_ADD(scm_rights, stream_listener_oob)
{
.name = "UNIX-STREAM ",
.type = SOCK_STREAM,
.flags = MSG_OOB,
.test_listener = true,
};
/*
 * Look up the current variant's protocol in /proc/net/protocols.
 *
 * The first line whose leading bytes equal variant->name is parsed: two
 * integers are read after the name and the second one is returned (the
 * first is discarded).
 *
 * Return: the parsed socket count, or -1 if no line matched.
 * Aborts the test via ASSERT_*() if the file cannot be opened, the
 * matched line cannot be parsed, or fclose() fails.
 */
static int count_sockets(struct __test_metadata *_metadata,
			 const FIXTURE_VARIANT(scm_rights) *variant)
{
	int sockets = -1, len, ret;
	char *line = NULL;
	/* getline() manages the buffer; start from the NULL/0 state. */
	size_t cap = 0;
	FILE *f;

	f = fopen("/proc/net/protocols", "r");
	ASSERT_NE(NULL, f);

	len = strlen(variant->name);

	while (getline(&line, &cap, f) != -1) {
		int unused2;

		/* Skip rows that belong to other protocols. */
		if (strncmp(line, variant->name, len))
			continue;

		ret = sscanf(line + len, "%d %d", &unused2, &sockets);
		ASSERT_EQ(2, ret);

		break;
	}

	free(line);

	ret = fclose(f);
	ASSERT_EQ(0, ret);

	return sockets;
}
/*
 * Move the test into a fresh network namespace and check that
 * count_sockets() reports 0 sockets for this variant's protocol before
 * the test body runs.
 */
FIXTURE_SETUP(scm_rights)
{
int ret;
ret = unshare(CLONE_NEWNET);
ASSERT_EQ(0, ret);
ret = count_sockets(_metadata, variant);
ASSERT_EQ(0, ret);
}
/*
 * After the test body has closed its fds, check that the socket count
 * is back to 0, i.e. that no socket is still alive.
 */
FIXTURE_TEARDOWN(scm_rights)
{
int ret;
/* NOTE(review): presumably gives the kernel GC time to run -- confirm. */
sleep(1);
ret = count_sockets(_metadata, variant);
ASSERT_EQ(0, ret);
}
/*
 * Create @n listener/client pairs in self->fd.
 *
 * For pair k, fd[2 * k] is a SOCK_STREAM socket that is bound, put into
 * the listening state and whose address is read back with getsockname();
 * fd[2 * k + 1] is a SOCK_STREAM socket connected to that address.
 * accept() is never called here, so each connection stays queued on its
 * listener.
 *
 * bind() is passed only sizeof(addr.sun_family) bytes
 * (NOTE(review): on Linux this requests an autobound address, see
 * unix(7) -- confirm).
 *
 * Aborts the test via ASSERT_*() on any failure, including when @n pairs
 * do not fit into self->fd.
 */
static void create_listeners(struct __test_metadata *_metadata,
			     FIXTURE_DATA(scm_rights) *self,
			     int n)
{
	struct sockaddr_un addr = {
		.sun_family = AF_UNIX,
	};
	socklen_t addrlen;
	int i, ret;

	/* Each pair consumes two slots of self->fd. */
	ASSERT_GE(sizeof(self->fd) / sizeof(int), n * 2);

	for (i = 0; i < n * 2; i += 2) {
		self->fd[i] = socket(AF_UNIX, SOCK_STREAM, 0);
		ASSERT_LE(0, self->fd[i]);

		/* Only the family is passed; the path bytes are ignored. */
		addrlen = sizeof(addr.sun_family);
		ret = bind(self->fd[i], (struct sockaddr *)&addr, addrlen);
		ASSERT_EQ(0, ret);

		ret = listen(self->fd[i], -1);
		ASSERT_EQ(0, ret);

		/* Learn the address the listener ended up with. */
		addrlen = sizeof(addr);
		ret = getsockname(self->fd[i], (struct sockaddr *)&addr, &addrlen);
		ASSERT_EQ(0, ret);

		self->fd[i + 1] = socket(AF_UNIX, SOCK_STREAM, 0);
		ASSERT_LE(0, self->fd[i + 1]);

		ret = connect(self->fd[i + 1], (struct sockaddr *)&addr, addrlen);
		ASSERT_EQ(0, ret);
	}
}
/*
 * Create @n connected AF_UNIX socket pairs of type variant->type.
 *
 * Pair k is stored in self->fd[2 * k] and self->fd[2 * k + 1].
 *
 * Aborts the test via ASSERT_*() if socketpair() fails or if @n pairs do
 * not fit into self->fd.
 */
static void create_socketpairs(struct __test_metadata *_metadata,
			       FIXTURE_DATA(scm_rights) *self,
			       const FIXTURE_VARIANT(scm_rights) *variant,
			       int n)
{
	int i, ret;

	/* Each pair consumes two slots, so 2 * n entries must fit. */
	ASSERT_GE(sizeof(self->fd) / sizeof(int), n * 2);

	for (i = 0; i < n * 2; i += 2) {
		ret = socketpair(AF_UNIX, variant->type, 0, self->fd + i);
		ASSERT_EQ(0, ret);
	}
}
/*
 * Populate self->fd with @n socket pairs, using the creation strategy the
 * variant selects: socketpair() by default, listener + connected client
 * when variant->test_listener is set.
 */
static void __create_sockets(struct __test_metadata *_metadata,
			     FIXTURE_DATA(scm_rights) *self,
			     const FIXTURE_VARIANT(scm_rights) *variant,
			     int n)
{
	if (!variant->test_listener) {
		create_socketpairs(_metadata, self, variant, n);
		return;
	}

	create_listeners(_metadata, self, n);
}
/*
 * Close both fds of the first @n pairs, i.e. self->fd[0 .. 2 * n - 1].
 *
 * Aborts the test via ASSERT_*() if close() fails or if @n pairs exceed
 * the capacity of self->fd.
 */
static void __close_sockets(struct __test_metadata *_metadata,
			    FIXTURE_DATA(scm_rights) *self,
			    int n)
{
	int i, ret;

	/* The loop touches 2 * n entries, so that is what must fit. */
	ASSERT_GE(sizeof(self->fd) / sizeof(int), n * 2);

	for (i = 0; i < n * 2; i++) {
		ret = close(self->fd[i]);
		ASSERT_EQ(0, ret);
	}
}
/*
 * Pass the fd of pair @inflight to pair @receiver via SCM_RIGHTS.
 *
 * The fd that is passed is self->fd[inflight * 2]; it is placed twice in
 * the same control message.  The message (3 payload bytes, "nop") is sent
 * on self->fd[receiver * 2 + 1] with variant->flags, so it is delivered
 * to whatever that socket is connected to.
 *
 * Aborts the test via ASSERT_EQ() unless sendmsg() reports that all
 * MSGLEN payload bytes were sent.
 */
static void __send_fd(struct __test_metadata *_metadata,
		      const FIXTURE_DATA(scm_rights) *self,
		      const FIXTURE_VARIANT(scm_rights) *variant,
		      int inflight, int receiver)
{
#define MSG "nop"
#define MSGLEN 3
	/* Control buffer: one cmsghdr followed by the two fds. */
	struct {
		struct cmsghdr cmsghdr;
		int fd[2];
	} cmsg = {
		.cmsghdr = {
			.cmsg_len = CMSG_LEN(sizeof(cmsg.fd)),
			.cmsg_level = SOL_SOCKET,
			.cmsg_type = SCM_RIGHTS,
		},
		.fd = {
			self->fd[inflight * 2],
			self->fd[inflight * 2],
		},
	};
	struct iovec iov = {
		.iov_base = MSG,
		.iov_len = MSGLEN,
	};
	struct msghdr msg = {
		.msg_name = NULL,
		.msg_namelen = 0,
		.msg_iov = &iov,
		.msg_iovlen = 1,
		.msg_control = &cmsg,
		.msg_controllen = CMSG_SPACE(sizeof(cmsg.fd)),
	};
	int ret;

	ret = sendmsg(self->fd[receiver * 2 + 1], &msg, variant->flags);
	ASSERT_EQ(MSGLEN, ret);
}
/*
 * Shorthands for use inside TEST_F() bodies.  They expand to the
 * double-underscore helpers and pass along the _metadata, self and
 * variant identifiers, which must therefore be in scope at the call site.
 */
#define create_sockets(n) \
__create_sockets(_metadata, self, variant, n)
#define close_sockets(n) \
__close_sockets(_metadata, self, n)
#define send_fd(inflight, receiver) \
__send_fd(_metadata, self, variant, inflight, receiver)
/*
 * Two independent self-references: the fd of pair 0 is sent to pair 0 and
 * the fd of pair 1 is sent to pair 1, then every fd is closed.
 *
 * The check that no socket is left behind happens in FIXTURE_TEARDOWN.
 */
TEST_F(scm_rights, self_ref)
{
create_sockets(2);
send_fd(0, 0);
send_fd(1, 1);
close_sockets(2);
}
/*
 * Two separate three-node cycles ("a -> b" means the fd of pair a is sent
 * to pair b):
 *
 *   0 -> 1 -> 2 -> 0
 *   3 -> 4 -> 5 -> 3
 *
 * All fds are then closed; FIXTURE_TEARDOWN checks that no socket is
 * left behind.
 */
TEST_F(scm_rights, triangle)
{
create_sockets(6);
send_fd(0, 1);
send_fd(1, 2);
send_fd(2, 0);
send_fd(3, 4);
send_fd(4, 5);
send_fd(5, 3);
close_sockets(6);
}
/*
 * Two separate groups, each a three-node cycle plus an extra path that
 * leaves the cycle and rejoins it ("a -> b" means the fd of pair a is
 * sent to pair b):
 *
 *   0 -> 1 -> 2 -> 0   and   1 -> 3 -> 2
 *   4 -> 5 -> 6 -> 4   and   5 -> 7 -> 6
 *
 * All fds are then closed; FIXTURE_TEARDOWN checks that no socket is
 * left behind.
 */
TEST_F(scm_rights, cross_edge)
{
create_sockets(8);
send_fd(0, 1);
send_fd(1, 2);
send_fd(2, 0);
send_fd(1, 3);
send_fd(3, 2);
send_fd(4, 5);
send_fd(5, 6);
send_fd(6, 4);
send_fd(5, 7);
send_fd(7, 6);
close_sockets(8);
}
TEST_HARNESS_MAIN
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment