Commit 841aee4d authored by Chris Leech's avatar Chris Leech Committed by Christoph Hellwig

nvme-tcp: lockdep: annotate in-kernel sockets

Put NVMe/TCP sockets in their own class to avoid some lockdep warnings.
Sockets created by nvme-tcp are not exposed to user-space, and will not
trigger certain code paths that the general socket API exposes.

Lockdep complains about a circular dependency between the socket and
filesystem locks, because setsockopt can trigger a page fault with a
socket lock held, but nvme-tcp sends requests on the socket while file
system locks are held.

  ======================================================
  WARNING: possible circular locking dependency detected
  5.15.0-rc3 #1 Not tainted
  ------------------------------------------------------
  fio/1496 is trying to acquire lock:
  (sk_lock-AF_INET){+.+.}-{0:0}, at: tcp_sendpage+0x23/0x80

  but task is already holding lock:
  (&xfs_dir_ilock_class/5){+.+.}-{3:3}, at: xfs_ilock+0xcf/0x290 [xfs]

  which lock already depends on the new lock.

  other info that might help us debug this:

  chain exists of:
   sk_lock-AF_INET --> sb_internal --> &xfs_dir_ilock_class/5

  Possible unsafe locking scenario:

        CPU0                    CPU1
        ----                    ----
   lock(&xfs_dir_ilock_class/5);
                                lock(sb_internal);
                                lock(&xfs_dir_ilock_class/5);
   lock(sk_lock-AF_INET);

  *** DEADLOCK ***

  6 locks held by fio/1496:
   #0: (sb_writers#13){.+.+}-{0:0}, at: path_openat+0x9fc/0xa20
   #1: (&inode->i_sb->s_type->i_mutex_dir_key){++++}-{3:3}, at: path_openat+0x296/0xa20
   #2: (sb_internal){.+.+}-{0:0}, at: xfs_trans_alloc_icreate+0x41/0xd0 [xfs]
   #3: (&xfs_dir_ilock_class/5){+.+.}-{3:3}, at: xfs_ilock+0xcf/0x290 [xfs]
   #4: (hctx->srcu){....}-{0:0}, at: hctx_lock+0x51/0xd0
   #5: (&queue->send_mutex){+.+.}-{3:3}, at: nvme_tcp_queue_rq+0x33e/0x380 [nvme_tcp]

This annotation lets lockdep analyze nvme-tcp controlled sockets
independently of what the user-space sockets API does.

Link: https://lore.kernel.org/linux-nvme/CAHj4cs9MDYLJ+q+2_GXUK9HxFizv2pxUryUR0toX974M040z7g@mail.gmail.com/Signed-off-by: default avatarChris Leech <cleech@redhat.com>
Signed-off-by: default avatarChristoph Hellwig <hch@lst.de>
parent a387935c
...@@ -30,6 +30,44 @@ static int so_priority; ...@@ -30,6 +30,44 @@ static int so_priority;
module_param(so_priority, int, 0644); module_param(so_priority, int, 0644);
MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority"); MODULE_PARM_DESC(so_priority, "nvme tcp socket optimize priority");
#ifdef CONFIG_DEBUG_LOCK_ALLOC
/* lockdep can detect a circular dependency of the form
* sk_lock -> mmap_lock (page fault) -> fs locks -> sk_lock
* because dependencies are tracked for both nvme-tcp and user contexts. Using
* a separate class prevents lockdep from conflating nvme-tcp socket use with
* user-space socket API use.
*/
static struct lock_class_key nvme_tcp_sk_key[2];
static struct lock_class_key nvme_tcp_slock_key[2];
static void nvme_tcp_reclassify_socket(struct socket *sock)
{
struct sock *sk = sock->sk;
if (WARN_ON_ONCE(!sock_allow_reclassification(sk)))
return;
switch (sk->sk_family) {
case AF_INET:
sock_lock_init_class_and_name(sk, "slock-AF_INET-NVME",
&nvme_tcp_slock_key[0],
"sk_lock-AF_INET-NVME",
&nvme_tcp_sk_key[0]);
break;
case AF_INET6:
sock_lock_init_class_and_name(sk, "slock-AF_INET6-NVME",
&nvme_tcp_slock_key[1],
"sk_lock-AF_INET6-NVME",
&nvme_tcp_sk_key[1]);
break;
default:
WARN_ON_ONCE(1);
}
}
#else
static void nvme_tcp_reclassify_socket(struct socket *sock) { }
#endif
enum nvme_tcp_send_state { enum nvme_tcp_send_state {
NVME_TCP_SEND_CMD_PDU = 0, NVME_TCP_SEND_CMD_PDU = 0,
NVME_TCP_SEND_H2C_PDU, NVME_TCP_SEND_H2C_PDU,
...@@ -1427,6 +1465,8 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, ...@@ -1427,6 +1465,8 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl,
goto err_destroy_mutex; goto err_destroy_mutex;
} }
nvme_tcp_reclassify_socket(queue->sock);
/* Single syn retry */ /* Single syn retry */
tcp_sock_set_syncnt(queue->sock->sk, 1); tcp_sock_set_syncnt(queue->sock->sk, 1);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment