[DLM] timeout fixes
Various fixes related to the new timeout feature: - add_timeout() missed setting TIMEWARN flag on lkb's when the TIMEOUT flag was already set - clear_proc_locks should remove a dead process's locks from the timeout list - the end-of-life calculation for user locks needs to consider that ETIMEDOUT is equivalent to -DLM_ECANCEL - make initial default timewarn_cs config value visible in configfs - change bit position of TIMEOUT_CANCEL flag so it's not copied to a remote master node - set timestamp on remote lkb's so a lock dump will display the time they've been waiting Signed-off-by: David Teigland <teigland@redhat.com> Signed-off-by: Steven Whitehouse <swhiteho@redhat.com>
This commit is contained in:
parent
b3cab7b9a3
commit
84d8cd69a8
|
@ -433,6 +433,7 @@ static struct config_group *make_cluster(struct config_group *g,
|
|||
cl->cl_toss_secs = dlm_config.ci_toss_secs;
|
||||
cl->cl_scan_secs = dlm_config.ci_scan_secs;
|
||||
cl->cl_log_debug = dlm_config.ci_log_debug;
|
||||
cl->cl_timewarn_cs = dlm_config.ci_timewarn_cs;
|
||||
|
||||
space_list = &sps->ss_group;
|
||||
comm_list = &cms->cs_group;
|
||||
|
|
|
@ -215,9 +215,9 @@ struct dlm_args {
|
|||
#define DLM_IFL_OVERLAP_CANCEL 0x00100000
|
||||
#define DLM_IFL_ENDOFLIFE 0x00200000
|
||||
#define DLM_IFL_WATCH_TIMEWARN 0x00400000
|
||||
#define DLM_IFL_TIMEOUT_CANCEL 0x00800000
|
||||
#define DLM_IFL_USER 0x00000001
|
||||
#define DLM_IFL_ORPHAN 0x00000002
|
||||
#define DLM_IFL_TIMEOUT_CANCEL 0x00000004
|
||||
|
||||
struct dlm_lkb {
|
||||
struct dlm_rsb *lkb_resource; /* the rsb */
|
||||
|
|
|
@ -1010,17 +1010,18 @@ static void add_timeout(struct dlm_lkb *lkb)
|
|||
{
|
||||
struct dlm_ls *ls = lkb->lkb_resource->res_ls;
|
||||
|
||||
if (is_master_copy(lkb))
|
||||
if (is_master_copy(lkb)) {
|
||||
lkb->lkb_timestamp = jiffies;
|
||||
return;
|
||||
|
||||
if (lkb->lkb_exflags & DLM_LKF_TIMEOUT)
|
||||
goto add_it;
|
||||
}
|
||||
|
||||
if (test_bit(LSFL_TIMEWARN, &ls->ls_flags) &&
|
||||
!(lkb->lkb_exflags & DLM_LKF_NODLCKWT)) {
|
||||
lkb->lkb_flags |= DLM_IFL_WATCH_TIMEWARN;
|
||||
goto add_it;
|
||||
}
|
||||
if (lkb->lkb_exflags & DLM_LKF_TIMEOUT)
|
||||
goto add_it;
|
||||
return;
|
||||
|
||||
add_it:
|
||||
|
@ -3510,8 +3511,7 @@ static void _receive_cancel_reply(struct dlm_lkb *lkb, struct dlm_message *ms)
|
|||
case -DLM_ECANCEL:
|
||||
receive_flags_reply(lkb, ms);
|
||||
revert_lock_pc(r, lkb);
|
||||
if (ms->m_result)
|
||||
queue_cast(r, lkb, -DLM_ECANCEL);
|
||||
queue_cast(r, lkb, -DLM_ECANCEL);
|
||||
break;
|
||||
case 0:
|
||||
break;
|
||||
|
@ -4534,6 +4534,7 @@ void dlm_clear_proc_locks(struct dlm_ls *ls, struct dlm_user_proc *proc)
|
|||
lkb = del_proc_lock(ls, proc);
|
||||
if (!lkb)
|
||||
break;
|
||||
del_timeout(lkb);
|
||||
if (lkb->lkb_exflags & DLM_LKF_PERSISTENT)
|
||||
orphan_proc_lock(ls, lkb);
|
||||
else
|
||||
|
|
|
@ -133,8 +133,6 @@ void dlm_timeout_warn(struct dlm_lkb *lkb)
|
|||
size_t size;
|
||||
int rv;
|
||||
|
||||
log_debug(lkb->lkb_resource->res_ls, "timeout_warn %x", lkb->lkb_id);
|
||||
|
||||
size = nla_total_size(sizeof(struct dlm_lock_data)) +
|
||||
nla_total_size(0); /* why this? */
|
||||
|
||||
|
|
|
@ -138,6 +138,35 @@ static void compat_output(struct dlm_lock_result *res,
|
|||
}
|
||||
#endif
|
||||
|
||||
/* Figure out if this lock is at the end of its life and no longer
|
||||
available for the application to use. The lkb still exists until
|
||||
the final ast is read. A lock becomes EOL in three situations:
|
||||
1. a noqueue request fails with EAGAIN
|
||||
2. an unlock completes with EUNLOCK
|
||||
3. a cancel of a waiting request completes with ECANCEL/EDEADLK
|
||||
An EOL lock needs to be removed from the process's list of locks.
|
||||
And we can't allow any new operation on an EOL lock. This is
|
||||
not related to the lifetime of the lkb struct which is managed
|
||||
entirely by refcount. */
|
||||
|
||||
static int lkb_is_endoflife(struct dlm_lkb *lkb, int sb_status, int type)
|
||||
{
|
||||
switch (sb_status) {
|
||||
case -DLM_EUNLOCK:
|
||||
return 1;
|
||||
case -DLM_ECANCEL:
|
||||
case -ETIMEDOUT:
|
||||
if (lkb->lkb_grmode == DLM_LOCK_IV)
|
||||
return 1;
|
||||
break;
|
||||
case -EAGAIN:
|
||||
if (type == AST_COMP && lkb->lkb_grmode == DLM_LOCK_IV)
|
||||
return 1;
|
||||
break;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
/* we could possibly check if the cancel of an orphan has resulted in the lkb
|
||||
being removed and then remove that lkb from the orphans list and free it */
|
||||
|
||||
|
@ -184,25 +213,7 @@ void dlm_user_add_ast(struct dlm_lkb *lkb, int type)
|
|||
log_debug(ls, "ast overlap %x status %x %x",
|
||||
lkb->lkb_id, ua->lksb.sb_status, lkb->lkb_flags);
|
||||
|
||||
/* Figure out if this lock is at the end of its life and no longer
|
||||
available for the application to use. The lkb still exists until
|
||||
the final ast is read. A lock becomes EOL in three situations:
|
||||
1. a noqueue request fails with EAGAIN
|
||||
2. an unlock completes with EUNLOCK
|
||||
3. a cancel of a waiting request completes with ECANCEL
|
||||
An EOL lock needs to be removed from the process's list of locks.
|
||||
And we can't allow any new operation on an EOL lock. This is
|
||||
not related to the lifetime of the lkb struct which is managed
|
||||
entirely by refcount. */
|
||||
|
||||
if (type == AST_COMP &&
|
||||
lkb->lkb_grmode == DLM_LOCK_IV &&
|
||||
ua->lksb.sb_status == -EAGAIN)
|
||||
eol = 1;
|
||||
else if (ua->lksb.sb_status == -DLM_EUNLOCK ||
|
||||
(ua->lksb.sb_status == -DLM_ECANCEL &&
|
||||
lkb->lkb_grmode == DLM_LOCK_IV))
|
||||
eol = 1;
|
||||
eol = lkb_is_endoflife(lkb, ua->lksb.sb_status, type);
|
||||
if (eol) {
|
||||
lkb->lkb_ast_type &= ~AST_BAST;
|
||||
lkb->lkb_flags |= DLM_IFL_ENDOFLIFE;
|
||||
|
|
Loading…
Reference in New Issue