Commit 6dd4142f authored by David S. Miller

Merge branch 'af_unix-per-netns-socket-hash'

Kuniyuki Iwashima says:

====================
af_unix: Introduce per-netns socket hash table.

This series replaces unix_socket_table with a per-netns hash table, which
reduces lock contention and the time spent iterating over the list.

Note the 3rd-6th patches can be a single patch, but for ease of review,
they are split into small changes without breakage.

Changes:
  v3:
    6th:
      * Remove unix_table_locks from comments.
      * Remove missed spin_unlock(&unix_table_locks) in
        unix_lookup_by_ino() (kernel test robot)

  v2: https://lore.kernel.org/netdev/20220620185151.65294-1-kuniyu@amazon.com/
    3rd:
      * Update changelog
      * Remove holes from per-netns hash table structure
      * Use kvmalloc_array() instead of kmalloc() (Eric Dumazet)
      * Remove unnecessary parts in af_unix_init() (Eric Dumazet)
      * Move `err_sysctl` label into ifdef block (kernel test robot)
      * Remove struct netns_unix from struct net if CONFIG_UNIX is disabled
    4th:
      * Use spin_lock_nested() (kernel test robot)

  v1: https://lore.kernel.org/netdev/20220616234714.4291-1-kuniyu@amazon.com/
====================
Signed-off-by: David S. Miller <davem@davemloft.net>
parents ffd3018b 2f7ca90a
...@@ -16,12 +16,11 @@ void wait_for_unix_gc(void); ...@@ -16,12 +16,11 @@ void wait_for_unix_gc(void);
struct sock *unix_get_socket(struct file *filp); struct sock *unix_get_socket(struct file *filp);
struct sock *unix_peer_get(struct sock *sk); struct sock *unix_peer_get(struct sock *sk);
#define UNIX_HASH_SIZE 256 #define UNIX_HASH_MOD (256 - 1)
#define UNIX_HASH_SIZE (256 * 2)
#define UNIX_HASH_BITS 8 #define UNIX_HASH_BITS 8
extern unsigned int unix_tot_inflight; extern unsigned int unix_tot_inflight;
extern spinlock_t unix_table_locks[2 * UNIX_HASH_SIZE];
extern struct hlist_head unix_socket_table[2 * UNIX_HASH_SIZE];
struct unix_address { struct unix_address {
refcount_t refcnt; refcount_t refcnt;
......
...@@ -120,7 +120,9 @@ struct net { ...@@ -120,7 +120,9 @@ struct net {
struct netns_core core; struct netns_core core;
struct netns_mib mib; struct netns_mib mib;
struct netns_packet packet; struct netns_packet packet;
#if IS_ENABLED(CONFIG_UNIX)
struct netns_unix unx; struct netns_unix unx;
#endif
struct netns_nexthop nexthop; struct netns_nexthop nexthop;
struct netns_ipv4 ipv4; struct netns_ipv4 ipv4;
#if IS_ENABLED(CONFIG_IPV6) #if IS_ENABLED(CONFIG_IPV6)
......
...@@ -5,8 +5,14 @@ ...@@ -5,8 +5,14 @@
#ifndef __NETNS_UNIX_H__ #ifndef __NETNS_UNIX_H__
#define __NETNS_UNIX_H__ #define __NETNS_UNIX_H__
struct unix_table {
spinlock_t *locks;
struct hlist_head *buckets;
};
struct ctl_table_header; struct ctl_table_header;
struct netns_unix { struct netns_unix {
struct unix_table table;
int sysctl_max_dgram_qlen; int sysctl_max_dgram_qlen;
struct ctl_table_header *ctl; struct ctl_table_header *ctl;
}; };
......
This diff is collapsed.
...@@ -13,7 +13,7 @@ ...@@ -13,7 +13,7 @@
static int sk_diag_dump_name(struct sock *sk, struct sk_buff *nlskb) static int sk_diag_dump_name(struct sock *sk, struct sk_buff *nlskb)
{ {
/* might or might not have unix_table_locks */ /* might or might not have a hash table lock */
struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr); struct unix_address *addr = smp_load_acquire(&unix_sk(sk)->addr);
if (!addr) if (!addr)
...@@ -195,25 +195,21 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct unix_diag_r ...@@ -195,25 +195,21 @@ static int sk_diag_dump(struct sock *sk, struct sk_buff *skb, struct unix_diag_r
static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
{ {
struct unix_diag_req *req;
int num, s_num, slot, s_slot;
struct net *net = sock_net(skb->sk); struct net *net = sock_net(skb->sk);
int num, s_num, slot, s_slot;
struct unix_diag_req *req;
req = nlmsg_data(cb->nlh); req = nlmsg_data(cb->nlh);
s_slot = cb->args[0]; s_slot = cb->args[0];
num = s_num = cb->args[1]; num = s_num = cb->args[1];
for (slot = s_slot; for (slot = s_slot; slot < UNIX_HASH_SIZE; s_num = 0, slot++) {
slot < ARRAY_SIZE(unix_socket_table);
s_num = 0, slot++) {
struct sock *sk; struct sock *sk;
num = 0; num = 0;
spin_lock(&unix_table_locks[slot]); spin_lock(&net->unx.table.locks[slot]);
sk_for_each(sk, &unix_socket_table[slot]) { sk_for_each(sk, &net->unx.table.buckets[slot]) {
if (!net_eq(sock_net(sk), net))
continue;
if (num < s_num) if (num < s_num)
goto next; goto next;
if (!(req->udiag_states & (1 << sk->sk_state))) if (!(req->udiag_states & (1 << sk->sk_state)))
...@@ -222,13 +218,13 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) ...@@ -222,13 +218,13 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
NETLINK_CB(cb->skb).portid, NETLINK_CB(cb->skb).portid,
cb->nlh->nlmsg_seq, cb->nlh->nlmsg_seq,
NLM_F_MULTI) < 0) { NLM_F_MULTI) < 0) {
spin_unlock(&unix_table_locks[slot]); spin_unlock(&net->unx.table.locks[slot]);
goto done; goto done;
} }
next: next:
num++; num++;
} }
spin_unlock(&unix_table_locks[slot]); spin_unlock(&net->unx.table.locks[slot]);
} }
done: done:
cb->args[0] = slot; cb->args[0] = slot;
...@@ -237,20 +233,21 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb) ...@@ -237,20 +233,21 @@ static int unix_diag_dump(struct sk_buff *skb, struct netlink_callback *cb)
return skb->len; return skb->len;
} }
static struct sock *unix_lookup_by_ino(unsigned int ino) static struct sock *unix_lookup_by_ino(struct net *net, unsigned int ino)
{ {
struct sock *sk; struct sock *sk;
int i; int i;
for (i = 0; i < ARRAY_SIZE(unix_socket_table); i++) { for (i = 0; i < UNIX_HASH_SIZE; i++) {
spin_lock(&unix_table_locks[i]); spin_lock(&net->unx.table.locks[i]);
sk_for_each(sk, &unix_socket_table[i]) sk_for_each(sk, &net->unx.table.buckets[i]) {
if (ino == sock_i_ino(sk)) { if (ino == sock_i_ino(sk)) {
sock_hold(sk); sock_hold(sk);
spin_unlock(&unix_table_locks[i]); spin_unlock(&net->unx.table.locks[i]);
return sk; return sk;
} }
spin_unlock(&unix_table_locks[i]); }
spin_unlock(&net->unx.table.locks[i]);
} }
return NULL; return NULL;
} }
...@@ -259,21 +256,20 @@ static int unix_diag_get_exact(struct sk_buff *in_skb, ...@@ -259,21 +256,20 @@ static int unix_diag_get_exact(struct sk_buff *in_skb,
const struct nlmsghdr *nlh, const struct nlmsghdr *nlh,
struct unix_diag_req *req) struct unix_diag_req *req)
{ {
int err = -EINVAL;
struct sock *sk;
struct sk_buff *rep;
unsigned int extra_len;
struct net *net = sock_net(in_skb->sk); struct net *net = sock_net(in_skb->sk);
unsigned int extra_len;
struct sk_buff *rep;
struct sock *sk;
int err;
err = -EINVAL;
if (req->udiag_ino == 0) if (req->udiag_ino == 0)
goto out_nosk; goto out_nosk;
sk = unix_lookup_by_ino(req->udiag_ino); sk = unix_lookup_by_ino(net, req->udiag_ino);
err = -ENOENT; err = -ENOENT;
if (sk == NULL) if (sk == NULL)
goto out_nosk; goto out_nosk;
if (!net_eq(sock_net(sk), net))
goto out;
err = sock_diag_check_cookie(sk, req->udiag_cookie); err = sock_diag_check_cookie(sk, req->udiag_cookie);
if (err) if (err)
...@@ -308,7 +304,6 @@ static int unix_diag_get_exact(struct sk_buff *in_skb, ...@@ -308,7 +304,6 @@ static int unix_diag_get_exact(struct sk_buff *in_skb,
static int unix_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) static int unix_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
{ {
int hdrlen = sizeof(struct unix_diag_req); int hdrlen = sizeof(struct unix_diag_req);
struct net *net = sock_net(skb->sk);
if (nlmsg_len(h) < hdrlen) if (nlmsg_len(h) < hdrlen)
return -EINVAL; return -EINVAL;
...@@ -317,7 +312,7 @@ static int unix_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h) ...@@ -317,7 +312,7 @@ static int unix_diag_handler_dump(struct sk_buff *skb, struct nlmsghdr *h)
struct netlink_dump_control c = { struct netlink_dump_control c = {
.dump = unix_diag_dump, .dump = unix_diag_dump,
}; };
return netlink_dump_start(net->diag_nlsk, skb, h, &c); return netlink_dump_start(sock_net(skb->sk)->diag_nlsk, skb, h, &c);
} else } else
return unix_diag_get_exact(skb, h, nlmsg_data(h)); return unix_diag_get_exact(skb, h, nlmsg_data(h));
} }
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment