Commit e0c2a9aa authored by David Teigland, committed by Steven Whitehouse

GFS2: dlm based recovery coordination

This new method of managing recovery is an alternative to
the previous approach of using the userland gfs_controld.

- use dlm slot numbers to assign journal id's
- use dlm recovery callbacks to initiate journal recovery
- use a dlm lock to determine the first node to mount fs
- use a dlm lock to track journals that need recovery
Signed-off-by: David Teigland <teigland@redhat.com>
Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
parent e343a895
......@@ -1353,7 +1353,7 @@ void gfs2_glock_complete(struct gfs2_glock *gl, int ret)
spin_lock(&gl->gl_spin);
gl->gl_reply = ret;
if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))) {
if (unlikely(test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))) {
if (gfs2_should_freeze(gl)) {
set_bit(GLF_FROZEN, &gl->gl_flags);
spin_unlock(&gl->gl_spin);
......
......@@ -121,7 +121,10 @@ enum {
struct lm_lockops {
const char *lm_proto_name;
int (*lm_mount) (struct gfs2_sbd *sdp, const char *fsname);
int (*lm_mount) (struct gfs2_sbd *sdp, const char *table);
void (*lm_first_done) (struct gfs2_sbd *sdp);
void (*lm_recovery_result) (struct gfs2_sbd *sdp, unsigned int jid,
unsigned int result);
void (*lm_unmount) (struct gfs2_sbd *sdp);
void (*lm_withdraw) (struct gfs2_sbd *sdp);
void (*lm_put_lock) (struct gfs2_glock *gl);
......
......@@ -139,8 +139,45 @@ struct gfs2_bufdata {
#define GDLM_STRNAME_BYTES 25
#define GDLM_LVB_SIZE 32
/*
* ls_recover_flags:
*
* DFL_BLOCK_LOCKS: dlm is in recovery and will grant locks that had been
* held by failed nodes whose journals need recovery. Those locks should
* only be used for journal recovery until the journal recovery is done.
* This is set by the dlm recover_prep callback and cleared by the
* gfs2_control thread when journal recovery is complete. To avoid
* races between recover_prep setting and gfs2_control clearing, recover_spin
* is held while changing this bit and reading/writing recover_block
* and recover_start.
*
* DFL_NO_DLM_OPS: dlm lockspace ops/callbacks are not being used.
*
* DFL_FIRST_MOUNT: this node is the first to mount this fs and is doing
* recovery of all journals before allowing other nodes to mount the fs.
* This is cleared when FIRST_MOUNT_DONE is set.
*
* DFL_FIRST_MOUNT_DONE: this node was the first mounter, and has finished
* recovery of all journals, and now allows other nodes to mount the fs.
*
* DFL_MOUNT_DONE: gdlm_mount has completed successfully and cleared
* BLOCK_LOCKS for the first time. The gfs2_control thread should now
* control clearing BLOCK_LOCKS for further recoveries.
*
* DFL_UNMOUNT: set by gdlm_unmount to keep sdp off gfs2_control_wq.
*
* DFL_DLM_RECOVERY: set while dlm is in recovery, between recover_prep()
* and recover_done(), i.e. set while recover_block == recover_start.
*/
/*
 * Bit numbers within ls_recover_flags; they are used with
 * test_bit()/set_bit()/clear_bit(), so the explicit values must stay stable.
 */
enum {
DFL_BLOCK_LOCKS = 0, /* dlm in recovery; granted locks are for journal recovery only */
DFL_NO_DLM_OPS = 1, /* dlm lockspace ops/callbacks are not being used */
DFL_FIRST_MOUNT = 2, /* this node is the first mounter and is recovering all journals */
DFL_FIRST_MOUNT_DONE = 3, /* first mounter finished recovery; other nodes may mount */
DFL_MOUNT_DONE = 4, /* gdlm_mount succeeded and cleared BLOCK_LOCKS for the first time */
DFL_UNMOUNT = 5, /* set by gdlm_unmount to keep sdp off gfs2_control_wq */
DFL_DLM_RECOVERY = 6, /* dlm is between recover_prep() and recover_done() */
};
struct lm_lockname {
......@@ -499,14 +536,26 @@ struct gfs2_sb_host {
struct lm_lockstruct {
int ls_jid;
unsigned int ls_first;
unsigned int ls_first_done;
unsigned int ls_nodir;
const struct lm_lockops *ls_ops;
unsigned long ls_flags;
dlm_lockspace_t *ls_dlm;
int ls_recover_jid_done;
int ls_recover_jid_status;
int ls_recover_jid_done; /* These two are deprecated, */
int ls_recover_jid_status; /* used previously by gfs_controld */
struct dlm_lksb ls_mounted_lksb; /* mounted_lock */
struct dlm_lksb ls_control_lksb; /* control_lock */
char ls_control_lvb[GDLM_LVB_SIZE]; /* control_lock lvb */
struct completion ls_sync_wait; /* {control,mounted}_{lock,unlock} */
spinlock_t ls_recover_spin; /* protects following fields */
unsigned long ls_recover_flags; /* DFL_ */
uint32_t ls_recover_mount; /* gen in first recover_done cb */
uint32_t ls_recover_start; /* gen in last recover_done cb */
uint32_t ls_recover_block; /* copy recover_start in last recover_prep */
uint32_t ls_recover_size; /* size of recover_submit, recover_result */
uint32_t *ls_recover_submit; /* gen in last recover_slot cb per jid */
uint32_t *ls_recover_result; /* result of last jid recovery */
};
struct gfs2_sbd {
......@@ -544,6 +593,7 @@ struct gfs2_sbd {
wait_queue_head_t sd_glock_wait;
atomic_t sd_glock_disposal;
struct completion sd_locking_init;
struct delayed_work sd_control_work;
/* Inode Stuff */
......
This diff is collapsed.
......@@ -28,6 +28,8 @@
#include "recovery.h"
#include "dir.h"
struct workqueue_struct *gfs2_control_wq;
static struct shrinker qd_shrinker = {
.shrink = gfs2_shrink_qd_memory,
.seeks = DEFAULT_SEEKS,
......@@ -146,12 +148,19 @@ static int __init init_gfs2_fs(void)
if (!gfs_recovery_wq)
goto fail_wq;
gfs2_control_wq = alloc_workqueue("gfs2_control",
WQ_NON_REENTRANT | WQ_UNBOUND | WQ_FREEZABLE, 0);
if (!gfs2_control_wq)
goto fail_control;
gfs2_register_debugfs();
printk("GFS2 installed\n");
return 0;
fail_control:
destroy_workqueue(gfs_recovery_wq);
fail_wq:
unregister_filesystem(&gfs2meta_fs_type);
fail_unregister:
......@@ -195,6 +204,7 @@ static void __exit exit_gfs2_fs(void)
unregister_filesystem(&gfs2_fs_type);
unregister_filesystem(&gfs2meta_fs_type);
destroy_workqueue(gfs_recovery_wq);
destroy_workqueue(gfs2_control_wq);
rcu_barrier();
......
......@@ -562,8 +562,12 @@ static void gfs2_others_may_mount(struct gfs2_sbd *sdp)
{
char *message = "FIRSTMOUNT=Done";
char *envp[] = { message, NULL };
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
ls->ls_first_done = 1;
fs_info(sdp, "first mount done, others may mount\n");
if (sdp->sd_lockstruct.ls_ops->lm_first_done)
sdp->sd_lockstruct.ls_ops->lm_first_done(sdp);
kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
}
......@@ -944,7 +948,6 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
struct gfs2_args *args = &sdp->sd_args;
const char *proto = sdp->sd_proto_name;
const char *table = sdp->sd_table_name;
const char *fsname;
char *o, *options;
int ret;
......@@ -1004,21 +1007,12 @@ static int gfs2_lm_mount(struct gfs2_sbd *sdp, int silent)
}
}
if (sdp->sd_args.ar_spectator)
snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s", table);
else
snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u", table,
sdp->sd_lockstruct.ls_jid);
fsname = strchr(table, ':');
if (fsname)
fsname++;
if (lm->lm_mount == NULL) {
fs_info(sdp, "Now mounting FS...\n");
complete_all(&sdp->sd_locking_init);
return 0;
}
ret = lm->lm_mount(sdp, fsname);
ret = lm->lm_mount(sdp, table);
if (ret == 0)
fs_info(sdp, "Joined cluster. Now mounting FS...\n");
complete_all(&sdp->sd_locking_init);
......@@ -1124,6 +1118,8 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
if (error)
goto fail;
snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s", sdp->sd_table_name);
gfs2_create_debugfs_file(sdp);
error = gfs2_sys_fs_add(sdp);
......@@ -1160,6 +1156,13 @@ static int fill_super(struct super_block *sb, struct gfs2_args *args, int silent
goto fail_sb;
}
if (sdp->sd_args.ar_spectator)
snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.s",
sdp->sd_table_name);
else
snprintf(sdp->sd_fsname, GFS2_FSNAME_LEN, "%s.%u",
sdp->sd_table_name, sdp->sd_lockstruct.ls_jid);
error = init_inodes(sdp, DO);
if (error)
goto fail_sb;
......
......@@ -436,12 +436,16 @@ static void gfs2_recovery_done(struct gfs2_sbd *sdp, unsigned int jid,
char env_status[20];
char *envp[] = { env_jid, env_status, NULL };
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
ls->ls_recover_jid_done = jid;
ls->ls_recover_jid_status = message;
sprintf(env_jid, "JID=%d", jid);
sprintf(env_status, "RECOVERY=%s",
message == LM_RD_SUCCESS ? "Done" : "Failed");
kobject_uevent_env(&sdp->sd_kobj, KOBJ_CHANGE, envp);
if (sdp->sd_lockstruct.ls_ops->lm_recovery_result)
sdp->sd_lockstruct.ls_ops->lm_recovery_result(sdp, jid, message);
}
void gfs2_recover_func(struct work_struct *work)
......
......@@ -298,7 +298,7 @@ static ssize_t block_show(struct gfs2_sbd *sdp, char *buf)
ssize_t ret;
int val = 0;
if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_flags))
if (test_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags))
val = 1;
ret = sprintf(buf, "%d\n", val);
return ret;
......@@ -313,9 +313,9 @@ static ssize_t block_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
val = simple_strtol(buf, NULL, 0);
if (val == 1)
set_bit(DFL_BLOCK_LOCKS, &ls->ls_flags);
set_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
else if (val == 0) {
clear_bit(DFL_BLOCK_LOCKS, &ls->ls_flags);
clear_bit(DFL_BLOCK_LOCKS, &ls->ls_recover_flags);
smp_mb__after_clear_bit();
gfs2_glock_thaw(sdp);
} else {
......@@ -360,19 +360,14 @@ static ssize_t lkfirst_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
static ssize_t first_done_show(struct gfs2_sbd *sdp, char *buf)
{
struct lm_lockstruct *ls = &sdp->sd_lockstruct;
return sprintf(buf, "%d\n", ls->ls_first_done);
return sprintf(buf, "%d\n", !!test_bit(DFL_FIRST_MOUNT_DONE, &ls->ls_recover_flags));
}
static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid)
{
unsigned jid;
struct gfs2_jdesc *jd;
int rv;
rv = sscanf(buf, "%u", &jid);
if (rv != 1)
return -EINVAL;
rv = -ESHUTDOWN;
spin_lock(&sdp->sd_jindex_spin);
if (test_bit(SDF_NORECOVERY, &sdp->sd_flags))
......@@ -389,6 +384,20 @@ static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
}
out:
spin_unlock(&sdp->sd_jindex_spin);
return rv;
}
/*
 * recover_store - parse a journal id from @buf and request its recovery
 * @sdp: the filesystem whose journal should be recovered
 * @buf: text expected to begin with an unsigned decimal journal id
 * @len: number of bytes in @buf
 *
 * Returns: -EINVAL if no journal id could be parsed, the error reported by
 *          gfs2_recover_set() if that call fails, otherwise @len.
 */
static ssize_t recover_store(struct gfs2_sbd *sdp, const char *buf, size_t len)
{
	unsigned jid;
	int error;

	if (sscanf(buf, "%u", &jid) != 1)
		return -EINVAL;

	error = gfs2_recover_set(sdp, jid);
	if (error)
		return error;

	return len;
}
......
......@@ -19,5 +19,7 @@ void gfs2_sys_fs_del(struct gfs2_sbd *sdp);
int gfs2_sys_init(void);
void gfs2_sys_uninit(void);
int gfs2_recover_set(struct gfs2_sbd *sdp, unsigned jid);
#endif /* __SYS_DOT_H__ */
......@@ -22,6 +22,8 @@
#define GFS2_LIVE_LOCK 1
#define GFS2_TRANS_LOCK 2
#define GFS2_RENAME_LOCK 3
#define GFS2_CONTROL_LOCK 4
#define GFS2_MOUNTED_LOCK 5
/* Format numbers for various metadata types */
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment