Commit eca3a04f authored by Linus Torvalds

Merge tag 'dlm-6.3' of git://git.kernel.org/pub/scm/linux/kernel/git/teigland/linux-dlm

Pull dlm updates from David Teigland:
 "This fixes some races in the lowcomms startup and shutdown code that
  were found by targeted stress testing that quickly and repeatedly
  joins and leaves lockspaces"

* tag 'dlm-6.3' of git://git.kernel.org/pub/scm/linux/kernel/git/teigland/linux-dlm:
  fs: dlm: remove unnecessary waker_up() calls
  fs: dlm: move state change into else branch
  fs: dlm: remove newline in log_print
  fs: dlm: reduce the shutdown timeout to 5 secs
  fs: dlm: make dlm sequence id more robust
  fs: dlm: wait until all midcomms nodes detect version
  fs: dlm: ignore unexpected non dlm opts msgs
  fs: dlm: bring back previous shutdown handling
  fs: dlm: send FIN ack back in right cases
  fs: dlm: move sending fin message into state change handling
  fs: dlm: don't set stop rx flag after node reset
  fs: dlm: fix race setting stop tx flag
  fs: dlm: be sure to call dlm_send_queue_flush()
  fs: dlm: fix use after free in midcomms commit
  fs: dlm: start midcomms before scand
  fs/dlm: Remove "select SRCU"
  fs: dlm: fix return value check in dlm_memory_init()
parents 885ce487 723b197b
......@@ -4,7 +4,6 @@ menuconfig DLM
depends on INET
depends on SYSFS && CONFIGFS_FS && (IPV6 || IPV6=n)
select IP_SCTP
select SRCU
help
A general purpose distributed lock manager for kernel or userspace
applications.
......
......@@ -381,23 +381,23 @@ static int threads_start(void)
{
int error;
error = dlm_scand_start();
/* Thread for sending/receiving messages for all lockspace's */
error = dlm_midcomms_start();
if (error) {
log_print("cannot start dlm_scand thread %d", error);
log_print("cannot start dlm midcomms %d", error);
goto fail;
}
/* Thread for sending/receiving messages for all lockspace's */
error = dlm_midcomms_start();
error = dlm_scand_start();
if (error) {
log_print("cannot start dlm midcomms %d", error);
goto scand_fail;
log_print("cannot start dlm_scand thread %d", error);
goto midcomms_fail;
}
return 0;
scand_fail:
dlm_scand_stop();
midcomms_fail:
dlm_midcomms_stop();
fail:
return error;
}
......@@ -572,7 +572,7 @@ static int new_lockspace(const char *name, const char *cluster,
spin_lock_init(&ls->ls_rcom_spin);
get_random_bytes(&ls->ls_rcom_seq, sizeof(uint64_t));
ls->ls_recover_status = 0;
ls->ls_recover_seq = 0;
ls->ls_recover_seq = get_random_u64();
ls->ls_recover_args = NULL;
init_rwsem(&ls->ls_in_recovery);
init_rwsem(&ls->ls_recv_active);
......@@ -820,6 +820,9 @@ static int release_lockspace(struct dlm_ls *ls, int force)
return rv;
}
if (ls_count == 1)
dlm_midcomms_version_wait();
dlm_device_deregister(ls);
if (force < 3 && dlm_user_daemon_available())
......
......@@ -61,6 +61,7 @@
#include "memory.h"
#include "config.h"
#define DLM_SHUTDOWN_WAIT_TIMEOUT msecs_to_jiffies(5000)
#define NEEDED_RMEM (4*1024*1024)
struct connection {
......@@ -99,6 +100,7 @@ struct connection {
struct connection *othercon;
struct work_struct rwork; /* receive worker */
struct work_struct swork; /* send worker */
wait_queue_head_t shutdown_wait;
unsigned char rx_leftover_buf[DLM_MAX_SOCKET_BUFSIZE];
int rx_leftover;
int mark;
......@@ -282,6 +284,7 @@ static void dlm_con_init(struct connection *con, int nodeid)
INIT_WORK(&con->swork, process_send_sockets);
INIT_WORK(&con->rwork, process_recv_sockets);
spin_lock_init(&con->addrs_lock);
init_waitqueue_head(&con->shutdown_wait);
}
/*
......@@ -790,6 +793,43 @@ static void close_connection(struct connection *con, bool and_other)
up_write(&con->sock_lock);
}
/*
 * Try to shut a connection down gracefully, falling back to a forced close.
 *
 * @con:       connection to shut down
 * @and_other: when true and con->othercon is set, shut that connection
 *             down first (the nested call passes false, so it does not
 *             follow othercon any further)
 *
 * The write side of the socket is shut down with SHUT_WR, then the caller
 * sleeps on con->shutdown_wait until con->sock has been cleared or
 * DLM_SHUTDOWN_WAIT_TIMEOUT expires.  If the shutdown call fails or the
 * wait times out, the connection is closed via close_connection().
 */
static void shutdown_connection(struct connection *con, bool and_other)
{
	int rv;

	if (and_other && con->othercon)
		shutdown_connection(con->othercon, false);

	flush_workqueue(io_workqueue);

	/* con->sock is inspected and used under the read side of sock_lock */
	down_read(&con->sock_lock);
	if (!con->sock) {
		/* nothing to shutdown */
		up_read(&con->sock_lock);
		return;
	}

	rv = kernel_sock_shutdown(con->sock, SHUT_WR);
	up_read(&con->sock_lock);

	if (rv) {
		log_print("Connection %p failed to shutdown: %d will force close",
			  con, rv);
		close_connection(con, false);
		return;
	}

	/* a zero return means the timeout elapsed with con->sock still set */
	rv = wait_event_timeout(con->shutdown_wait, !con->sock,
				DLM_SHUTDOWN_WAIT_TIMEOUT);
	if (rv == 0) {
		log_print("Connection %p shutdown timed out, will force close",
			  con);
		close_connection(con, false);
	}
}
static struct processqueue_entry *new_processqueue_entry(int nodeid,
int buflen)
{
......@@ -1488,6 +1528,7 @@ static void process_recv_sockets(struct work_struct *work)
break;
case DLM_IO_EOF:
close_connection(con, false);
wake_up(&con->shutdown_wait);
/* CF_RECV_PENDING cleared */
break;
case DLM_IO_RESCHED:
......@@ -1695,6 +1736,9 @@ static int work_start(void)
void dlm_lowcomms_shutdown(void)
{
struct connection *con;
int i, idx;
/* stop lowcomms_listen_data_ready calls */
lock_sock(listen_con.sock->sk);
listen_con.sock->sk->sk_data_ready = listen_sock.sk_data_ready;
......@@ -1703,29 +1747,20 @@ void dlm_lowcomms_shutdown(void)
cancel_work_sync(&listen_con.rwork);
dlm_close_sock(&listen_con.sock);
flush_workqueue(process_workqueue);
}
void dlm_lowcomms_shutdown_node(int nodeid, bool force)
{
struct connection *con;
int idx;
idx = srcu_read_lock(&connections_srcu);
con = nodeid2con(nodeid, 0);
if (WARN_ON_ONCE(!con)) {
srcu_read_unlock(&connections_srcu, idx);
return;
}
for (i = 0; i < CONN_HASH_SIZE; i++) {
hlist_for_each_entry_rcu(con, &connection_hash[i], list) {
shutdown_connection(con, true);
stop_connection_io(con);
flush_workqueue(process_workqueue);
close_connection(con, true);
flush_work(&con->swork);
stop_connection_io(con);
WARN_ON_ONCE(!force && !list_empty(&con->writequeue));
close_connection(con, true);
clean_one_writequeue(con);
if (con->othercon)
clean_one_writequeue(con->othercon);
allow_connection_io(con);
clean_one_writequeue(con);
if (con->othercon)
clean_one_writequeue(con->othercon);
allow_connection_io(con);
}
}
srcu_read_unlock(&connections_srcu, idx);
}
......
......@@ -51,7 +51,7 @@ int __init dlm_memory_init(void)
cb_cache = kmem_cache_create("dlm_cb", sizeof(struct dlm_callback),
__alignof__(struct dlm_callback), 0,
NULL);
if (!rsb_cache)
if (!cb_cache)
goto cb;
return 0;
......
This diff is collapsed.
......@@ -20,6 +20,7 @@ struct dlm_mhandle *dlm_midcomms_get_mhandle(int nodeid, int len,
gfp_t allocation, char **ppc);
void dlm_midcomms_commit_mhandle(struct dlm_mhandle *mh, const void *name,
int namelen);
void dlm_midcomms_version_wait(void);
int dlm_midcomms_close(int nodeid);
int dlm_midcomms_start(void);
void dlm_midcomms_stop(void);
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment