Commit 84d8cd69 authored by David Teigland's avatar David Teigland Committed by Steven Whitehouse

[DLM] timeout fixes

Various fixes related to the new timeout feature:
- add_timeout() missed setting TIMEWARN flag on lkb's when the
  TIMEOUT flag was already set
- clear_proc_locks should remove a dead process's locks from the
  timeout list
- the end-of-life calculation for user locks needs to consider that
  ETIMEDOUT is equivalent to -DLM_ECANCEL
- make initial default timewarn_cs config value visible in configfs
- change bit position of TIMEOUT_CANCEL flag so it's not copied to
  a remote master node
- set timestamp on remote lkb's so a lock dump will display the time
  they've been waiting
Signed-off-by: default avatarDavid Teigland <teigland@redhat.com>
Signed-off-by: default avatarSteven Whitehouse <swhiteho@redhat.com>
parent b3cab7b9
......@@ -433,6 +433,7 @@ static struct config_group *make_cluster(struct config_group *g,
cl->cl_toss_secs = dlm_config.ci_toss_secs;
cl->cl_scan_secs = dlm_config.ci_scan_secs;
cl->cl_log_debug = dlm_config.ci_log_debug;
cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs;
space_list = &sps->ss_group;
comm_list = &cms->cs_group;
......
......@@ -215,9 +215,9 @@ struct dlm_args {
#define DLM_IFL_OVERLAP_CANCEL 0x00100000
#define DLM_IFL_ENDOFLIFE 0x00200000
#define DLM_IFL_WATCH_TIMEWARN 0x00400000
#define DLM_IFL_TIMEOUT_CANCEL 0x00800000
#define DLM_IFL_USER 0x00000001
#define DLM_IFL_ORPHAN 0x00000002
#define DLM_IFL_TIMEOUT_CANCEL 0x00000004
struct dlm_lkb {
struct dlm_rsb *lkb_resource; /* the rsb */
......
......@@ -1010,17 +1010,18 @@ static void add_timeout(struct dlm_lkb *lkb)
{
struct dlm_ls *ls = lkb->lkb_resource->res_ls;
if (is_master_copy(lkb))
if (is_master_copy(lkb)) {
lkb->lkb_timestamp = jiffies;
return;
if (lkb->lkb_exflags & DLM_LKF_TIMEOUT)
goto add_it;
}
if (test_bit(LSFL_TIMEWARN, &ls->ls_flags) &&
!(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) {
lkb->lkb_flags |= DLM_IFL_WATCH_TIMEWARN;
goto add_it;
}
if (lkb->lkb_exflags & DLM_LKF_TIMEOUT)
goto add_it;
return;
add_it:
......@@ -3510,7 +3511,6 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
case -DLM_ECANCEL:
receive_flags_reply(lkb, ms);
revert_lock_pc(r, lkb);
if (ms->m_result)
queue_cast(r, lkb, -DLM_ECANCEL);
break;
case 0:
......@@ -4534,6 +4534,7 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
lkb = del_proc_lock(ls, proc);
if (!lkb)
break;
del_timeout(lkb);
if (lkb->lkb_exflags & DLM_LKF_PERSISTENT)
orphan_proc_lock(ls, lkb);
else
......
......@@ -133,8 +133,6 @@ void dlm_timeout_warn(struct dlm_lkb *lkb)
size_t size;
int rv;
log_debug(lkb->lkb_resource->res_ls, "timeout_warn %x", lkb->lkb_id);
size = nla_total_size(sizeof(struct dlm_lock_data)) +
nla_total_size(0); /* why this? */
......
......@@ -138,6 +138,35 @@ static void compat_output(struct dlm_lock_result *res,
}
#endif
/* Figure out if this lock is at the end of its life and no longer
available for the application to use. The lkb still exists until
the final ast is read. A lock becomes EOL in three situations:
1. a noqueue request fails with EAGAIN
2. an unlock completes with EUNLOCK
3. a cancel of a waiting request completes with ECANCEL/EDEADLK
An EOL lock needs to be removed from the process's list of locks.
And we can't allow any new operation on an EOL lock. This is
not related to the lifetime of the lkb struct which is managed
entirely by refcount. */
static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type)
{
switch (sb_status) {
case -DLM_EUNLOCK:
return 1;
case -DLM_ECANCEL:
case -ETIMEDOUT:
if (lkb->lkb_grmode == DLM_LOCK_IV)
return 1;
break;
case -EAGAIN:
if (type == AST_COMP && lkb->lkb_grmode == DLM_LOCK_IV)
return 1;
break;
}
return 0;
}
/* we could possibly check if the cancel of an orphan has resulted in the lkb
being removed and then remove that lkb from the orphans list and free it */
......@@ -184,25 +213,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
log_debug(ls, "ast overlap %x status %x %x",
lkb->lkb_id, ua->lksb.sb_status, lkb->lkb_flags);
/* Figure out if this lock is at the end of its life and no longer
available for the application to use. The lkb still exists until
the final ast is read. A lock becomes EOL in three situations:
1. a noqueue request fails with EAGAIN
2. an unlock completes with EUNLOCK
3. a cancel of a waiting request completes with ECANCEL
An EOL lock needs to be removed from the process's list of locks.
And we can't allow any new operation on an EOL lock. This is
not related to the lifetime of the lkb struct which is managed
entirely by refcount. */
if (type == AST_COMP &&
lkb->lkb_grmode == DLM_LOCK_IV &&
ua->lksb.sb_status == -EAGAIN)
eol = 1;
else if (ua->lksb.sb_status == -DLM_EUNLOCK ||
(ua->lksb.sb_status == -DLM_ECANCEL &&
lkb->lkb_grmode == DLM_LOCK_IV))
eol = 1;
eol = lkb_is_endoflife(lkb, ua->lksb.sb_status, type);
if (eol) {
lkb->lkb_ast_type &= ~AST_BAST;
lkb->lkb_flags |= DLM_IFL_ENDOFLIFE;
......
Markdown is supported
0%
or
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment