fs: dlm: make new_lockspace() wait until recovery completes

Make dlm_new_lockspace() wait until a full recovery completes sucessfully or fails. Previously, dlm_new_lockspace() returned to the caller after dlm_recover_members() finished, which is only partially through recovery. The result of the previous behavior is that the new lockspace would not be usable for some time (especially with overlapping recoveries), and some errors in the later part of recovery could not be returned to the caller. Kernel callers gfs2 and cluster-md have their own wait handling to wait for recovery to complete after calling dlm_new_lockspace(). This continues to work, but will be unnecessary. Signed-off-by: Alexander Aring <aahringo@redhat.com> Signed-off-by: David Teigland <teigland@redhat.com>

fs: dlm: make new_lockspace() wait until recovery completes
Make dlm_new_lockspace() wait until a full recovery completes sucessfully or fails. Previously, dlm_new_lockspace() returned to the caller after dlm_recover_members() finished, which is only partially through recovery. The result of the previous behavior is that the new lockspace would not be usable for some time (especially with overlapping recoveries), and some errors in the later part of recovery could not be returned to the caller. Kernel callers gfs2 and cluster-md have their own wait handling to wait for recovery to complete after calling dlm_new_lockspace(). This continues to work, but will be unnecessary. Signed-off-by: Alexander Aring <aahringo@redhat.com> Signed-off-by: David Teigland <teigland@redhat.com>
682bb91b · Alexander Aring · David Teigland · 7e09b15c · 682bb91b · 682bb91b
Commit 682bb91b authored Jun 22, 2022 by Alexander Aring Committed by David Teigland Jun 24, 2022
Showing with 20 additions and 19 deletions

fs/dlm/dlm_internal.h fs/dlm/dlm_internal.h +2 -2

fs/dlm/lockspace.c fs/dlm/lockspace.c +5 -4

fs/dlm/member.c fs/dlm/member.c +0 -13

fs/dlm/recoverd.c fs/dlm/recoverd.c +13 -0

No files found.
--- a/fs/dlm/dlm_internal.h
+++ b/fs/dlm/dlm_internal.h
@@ -606,8 +606,8 @@ struct dlm_ls {

 	wait_queue_head_t	ls_uevent_wait;	/* user part of join/leave */
 	int			ls_uevent_result;
-	struct completion	ls_members_done;
-	int			ls_members_result;
+	struct completion	ls_recovery_done;
+	int			ls_recovery_result;

 	struct miscdevice       ls_device;


--- a/fs/dlm/lockspace.c
+++ b/fs/dlm/lockspace.c
@@ -548,8 +548,8 @@ static int new_lockspace(const char *name, const char *cluster,

 	init_waitqueue_head(&ls->ls_uevent_wait);
 	ls->ls_uevent_result = 0;
-	init_completion(&ls->ls_members_done);
-	ls->ls_members_result = -1;
+	init_completion(&ls->ls_recovery_done);
+	ls->ls_recovery_result = -1;

 	mutex_init(&ls->ls_cb_mutex);
 	INIT_LIST_HEAD(&ls->ls_cb_delay);
@@ -645,8 +645,9 @@ static int new_lockspace(const char *name, const char *cluster,
 	if (error)
 		goto out_recoverd;

-	wait_for_completion(&ls->ls_members_done);
-	error = ls->ls_members_result;
+	/* wait until recovery is successful or failed */
+	wait_for_completion(&ls->ls_recovery_done);
+	error = ls->ls_recovery_result;
 	if (error)
 		goto out_members;


--- a/fs/dlm/member.c
+++ b/fs/dlm/member.c
@@ -587,19 +587,6 @@ int dlm_recover_members(struct dlm_ls *ls, struct dlm_recover *rv, int *neg_out)
 	*neg_out = neg;

 	error = ping_members(ls);
-	/* error -EINTR means that a new recovery action is triggered.
-	 * We ignore this recovery action and let run the new one which might
-	 * have new member configuration.
-	 */
-	if (error == -EINTR)
-		error = 0;
-
-	/* new_lockspace() may be waiting to know if the config
-	 * is good or bad
-	 */
-	ls->ls_members_result = error;
-	complete(&ls->ls_members_done);
-
 	log_rinfo(ls, "dlm_recover_members %d nodes", ls->ls_num_nodes);
 	return error;
 }

--- a/fs/dlm/recoverd.c
+++ b/fs/dlm/recoverd.c
@@ -243,6 +243,9 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
 		  jiffies_to_msecs(jiffies - start));
 	mutex_unlock(&ls->ls_recoverd_active);

+	ls->ls_recovery_result = 0;
+	complete(&ls->ls_recovery_done);
+
 	dlm_lsop_recover_done(ls);
 	return 0;

@@ -251,6 +254,16 @@ static int ls_recover(struct dlm_ls *ls, struct dlm_recover *rv)
 	log_rinfo(ls, "dlm_recover %llu error %d",
 		  (unsigned long long)rv->seq, error);
 	mutex_unlock(&ls->ls_recoverd_active);
+
+	/* let new_lockspace() get aware of critical error if recovery
+	 * was interrupted -EINTR we wait for the next ls_recover()
+	 * iteration until it succeeds.
+	 */
+	if (error != -EINTR) {
+		ls->ls_recovery_result = error;
+		complete(&ls->ls_recovery_done);
+	}
+
 	return error;
 }