2019-05-24 18:04:05 +08:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0-or-later */
|
2021-05-07 09:06:44 +08:00
|
|
|
/*
|
2005-12-16 06:31:24 +08:00
|
|
|
* dlmglue.h
|
|
|
|
*
|
|
|
|
* description here
|
|
|
|
*
|
|
|
|
* Copyright (C) 2002, 2004 Oracle. All rights reserved.
|
|
|
|
*/
|
|
|
|
|
|
|
|
|
|
|
|
#ifndef DLMGLUE_H
|
|
|
|
#define DLMGLUE_H
|
|
|
|
|
2006-09-09 05:14:34 +08:00
|
|
|
#include "dcache.h"
|
|
|
|
|
2007-09-08 04:58:15 +08:00
|
|
|
/*
 * Layout version that accompanies struct ocfs2_meta_lvb below.
 * NOTE(review): presumably written to / compared against lvb_version so a
 * reader can reject a block with a different layout -- confirm in the users
 * of this header.
 */
#define OCFS2_LVB_VERSION 5
|
2005-12-16 06:31:24 +08:00
|
|
|
|
|
|
|
/*
 * Fixed-layout record: one version byte, one reserved byte, then big-endian
 * (__be16/__be32/__be64) members whose names suggest cached inode attributes
 * (dynamic features, clusters, uid, gid, packed a/c/m times, size, mode,
 * link count, attr flags, generation) plus a trailing reserved word.
 *
 * With the nominal widths of the member types the fields add up to 64 bytes
 * and each one falls on its natural alignment, so no compiler padding is
 * expected between them. Adding, removing or reordering members changes the
 * layout and should go together with a new OCFS2_LVB_VERSION.
 *
 * NOTE(review): presumably this is the lock value block (LVB) payload shared
 * between cluster nodes for inode metadata locks -- confirm against the
 * implementation file.
 */
struct ocfs2_meta_lvb {
|
2006-09-13 06:22:18 +08:00
|
|
|
__u8 lvb_version;
|
2006-09-23 08:28:19 +08:00
|
|
|
__u8 lvb_reserved0;
|
2007-09-08 04:58:15 +08:00
|
|
|
__be16 lvb_idynfeatures;
|
2005-12-16 06:31:24 +08:00
|
|
|
__be32 lvb_iclusters;
|
|
|
|
__be32 lvb_iuid;
|
|
|
|
__be32 lvb_igid;
|
|
|
|
__be64 lvb_iatime_packed;
|
|
|
|
__be64 lvb_ictime_packed;
|
|
|
|
__be64 lvb_imtime_packed;
|
|
|
|
__be64 lvb_isize;
|
|
|
|
__be16 lvb_imode;
|
|
|
|
__be16 lvb_inlink;
|
2006-07-04 08:27:12 +08:00
|
|
|
__be32 lvb_iattr;
|
2006-09-13 06:35:49 +08:00
|
|
|
__be32 lvb_igeneration;
|
|
|
|
__be32 lvb_reserved2;
|
2005-12-16 06:31:24 +08:00
|
|
|
};
|
|
|
|
|
2008-08-26 01:56:50 +08:00
|
|
|
/*
 * Layout version that accompanies struct ocfs2_qinfo_lvb below.
 * NOTE(review): presumably stored in lvb_version -- confirm in the users of
 * this header.
 */
#define OCFS2_QINFO_LVB_VERSION 1
|
|
|
|
|
|
|
|
/*
 * Fixed-layout record: a version byte, three reserved bytes (which bring the
 * following members to a 4-byte boundary), then six big-endian 32-bit values.
 * With the nominal member widths the total is 28 bytes with no padding.
 *
 * NOTE(review): the member names (bgrace/igrace/syncms/blocks/free_blk/
 * free_entry) suggest quota-file information (grace times, sync interval,
 * block accounting); the exact units are not visible here -- confirm against
 * the quota code that fills this in.
 */
struct ocfs2_qinfo_lvb {
|
|
|
|
__u8 lvb_version;
|
|
|
|
__u8 lvb_reserved[3];
|
|
|
|
__be32 lvb_bgrace;
|
|
|
|
__be32 lvb_igrace;
|
|
|
|
__be32 lvb_syncms;
|
|
|
|
__be32 lvb_blocks;
|
|
|
|
__be32 lvb_free_blk;
|
|
|
|
__be32 lvb_free_entry;
|
|
|
|
};
|
|
|
|
|
2009-06-04 08:02:55 +08:00
|
|
|
/*
 * Layout version that accompanies struct ocfs2_orphan_scan_lvb below.
 * NOTE(review): presumably stored in lvb_version -- confirm in the users of
 * this header.
 */
#define OCFS2_ORPHAN_LVB_VERSION 1
|
|
|
|
|
|
|
|
/*
 * Fixed-layout record: a version byte, three reserved bytes, and a single
 * big-endian 32-bit sequence number (8 bytes in total with the nominal
 * member widths).
 *
 * NOTE(review): lvb_os_seqno is presumably the value exchanged through the
 * u32 seqno arguments of ocfs2_orphan_scan_lock()/ocfs2_orphan_scan_unlock()
 * declared in this header -- confirm in the implementation.
 */
struct ocfs2_orphan_scan_lvb {
|
|
|
|
__u8 lvb_version;
|
|
|
|
__u8 lvb_reserved[3];
|
|
|
|
__be32 lvb_os_seqno;
|
|
|
|
};
|
|
|
|
|
2018-02-01 08:15:10 +08:00
|
|
|
/*
 * Layout version that accompanies struct ocfs2_trim_fs_lvb below.
 * NOTE(review): presumably stored in lvb_version -- confirm in the users of
 * this header.
 */
#define OCFS2_TRIMFS_LVB_VERSION 1
|
|
|
|
|
|
|
|
/*
 * Fixed-layout record describing a trim operation: a version byte, a success
 * byte, two reserved bytes, a big-endian 32-bit node number, and four
 * big-endian 64-bit values (start, len, minlen, trimlen). With the nominal
 * member widths this is 40 bytes and the 64-bit members start at offset 8,
 * so no padding is expected.
 *
 * Apart from lvb_version/lvb_reserved, every member has a same-named tf_*
 * counterpart in struct ocfs2_trim_fs_info.
 * NOTE(review): presumably the implementation converts between the two when
 * taking/dropping the trim-fs lock -- confirm.
 */
struct ocfs2_trim_fs_lvb {
|
|
|
|
__u8 lvb_version;
|
|
|
|
__u8 lvb_success;
|
|
|
|
__u8 lvb_reserved[2];
|
|
|
|
__be32 lvb_nodenum;
|
|
|
|
__be64 lvb_start;
|
|
|
|
__be64 lvb_len;
|
|
|
|
__be64 lvb_minlen;
|
|
|
|
__be64 lvb_trimlen;
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Trim state as passed to ocfs2_trim_fs_lock()/ocfs2_trim_fs_unlock().
 *
 * Uses plain u8/u32/u64 members rather than the __be* types of
 * struct ocfs2_trim_fs_lvb; tf_success, tf_nodenum, tf_start, tf_len,
 * tf_minlen and tf_trimlen have same-named lvb_* counterparts there, while
 * tf_valid has none.
 */
struct ocfs2_trim_fs_info {
|
|
|
|
u8 tf_valid; /* lvb is valid, or not */
|
|
|
|
u8 tf_success; /* trim is successful, or not */
|
|
|
|
u32 tf_nodenum; /* osb node number */
|
|
|
|
u64 tf_start; /* trim start offset in clusters */
|
|
|
|
/*
 * NOTE(review): the field name (and its lvb_len counterpart) suggests a
 * length in clusters rather than an end offset -- confirm against the code
 * that fills this in before relying on the comment below.
 */
u64 tf_len; /* trim end offset in clusters */
|
|
|
|
u64 tf_minlen; /* trim minimum contiguous free clusters */
|
|
|
|
u64 tf_trimlen; /* trimmed length in bytes */
|
|
|
|
};
|
|
|
|
|
ocfs2/dlmglue: prepare tracking logic to avoid recursive cluster lock
We are in the situation that we have to avoid recursive cluster locking,
but there is no way to check if a cluster lock has already been taken by a
process.
Mostly, we can avoid recursive locking by writing code carefully.
However, we found that it's very hard to handle the routines that are
invoked directly by vfs code. For instance:
const struct inode_operations ocfs2_file_iops = {
.permission = ocfs2_permission,
.get_acl = ocfs2_iop_get_acl,
.set_acl = ocfs2_iop_set_acl,
};
Both ocfs2_permission() and ocfs2_iop_get_acl() call ocfs2_inode_lock(PR):
do_sys_open
may_open
inode_permission
ocfs2_permission
ocfs2_inode_lock() <=== first time
generic_permission
get_acl
ocfs2_iop_get_acl
ocfs2_inode_lock() <=== recursive one
A deadlock will occur if a remote EX request comes in between the two
calls to ocfs2_inode_lock(). Briefly, here is how the deadlock is formed:
On one hand, the OCFS2_LOCK_BLOCKED flag of this lockres is set in
BAST (ocfs2_generic_handle_bast) when downconvert is started on behalf of
the remote EX lock request. On the other hand, the recursive cluster lock
(the second one) will be blocked in __ocfs2_cluster_lock() because of
OCFS2_LOCK_BLOCKED. But the downconvert never completes. Why? Because
there is no chance for the first cluster lock on this node to be
unlocked - we block ourselves in the code path.
The idea to fix this issue is mostly taken from gfs2 code.
1. introduce a new field: struct ocfs2_lock_res.l_holders, to keep track
of the processes' pid who has taken the cluster lock of this lock
resource;
2. introduce a new flag for ocfs2_inode_lock_full:
OCFS2_META_LOCK_GETBH; it means just getting back disk inode bh for
us if we've got cluster lock.
3. export a helper: ocfs2_is_locked_by_me() is used to check if we have
got the cluster lock in the upper code path.
The tracking logic should be used by some of the ocfs2 vfs callbacks,
to solve the recursive locking issue caused by the fact that vfs
routines can call into each other.
The performance penalty of processing the holder list should only be
seen at a few cases where the tracking logic is used, such as get/set
acl.
You may ask: what if the first time we got a PR lock, and the second time
we want an EX lock? Fortunately, this case never happens in the real
world, as far as I can see, including permission check,
(get|set)_(acl|attr), and the gfs2 code also does so.
[sfr@canb.auug.org.au remove some inlines]
Link: http://lkml.kernel.org/r/20170117100948.11657-2-zren@suse.com
Signed-off-by: Eric Ren <zren@suse.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Joseph Qi <jiangqi903@gmail.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Mark Fasheh <mfasheh@versity.com>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-02-23 07:40:41 +08:00
|
|
|
struct ocfs2_lock_holder {
|
|
|
|
struct list_head oh_list;
|
|
|
|
struct pid *oh_owner_pid;
|
ocfs2: ocfs2_inode_lock_tracker does not distinguish lock level
ocfs2_inode_lock_tracker as a variant of ocfs2_inode_lock, is used to
prevent deadlock due to recursive lock acquisition.
But this function does not distinguish whether the requested level is EX
or PR.
If a PR lock has been obtained, this function will immediately return
success afterwards even if an EX lock is requested.
But the return value does not actually mean that the process got an EX
lock, because ocfs2_inode_lock has not been called.
When taking lock levels into account, we face some different situations:
1. no lock is held
In this case, just lock the inode and return 0
2. We are holding a lock
For this situation, things diverge into several cases
wanted holding what to do
ex ex see 2.1 below
ex pr see 2.2 below
pr ex see 2.1 below
pr pr see 2.1 below
2.1 The lock level that is being held is compatible
with the wanted level, so no lock action will be taken.
2.2 Otherwise, an upgrade is needed, but it is forbidden.
The reason why upgrade within a process is forbidden is that lock upgrade
may cause deadlock. The following illustrates how it happens.
process 1 process 2
ocfs2_inode_lock_tracker(ex=0)
<====== ocfs2_inode_lock_tracker(ex=1)
ocfs2_inode_lock_tracker(ex=1)
For the status quo of ocfs2, without this patch, neither a bug nor
end-user impact will be caused because the wrong logic is avoided.
But I'm afraid this generic interface may be called by other developers
in the future and used in this situation.
a process
ocfs2_inode_lock_tracker(ex=0)
ocfs2_inode_lock_tracker(ex=1)
Link: http://lkml.kernel.org/r/20180510053230.17217-1-lchen@suse.com
Signed-off-by: Larry Chen <lchen@suse.com>
Reviewed-by: Gang He <ghe@suse.com>
Cc: Mark Fasheh <mark@fasheh.com>
Cc: Joel Becker <jlbec@evilplan.org>
Cc: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Joseph Qi <jiangqi903@gmail.com>
Cc: Changwei Ge <ge.changwei@h3c.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-06-08 08:04:43 +08:00
|
|
|
int oh_ex;
|
ocfs2/dlmglue: prepare tracking logic to avoid recursive cluster lock
We are in the situation that we have to avoid recursive cluster locking,
but there is no way to check if a cluster lock has already been taken by a
process.
Mostly, we can avoid recursive locking by writing code carefully.
However, we found that it's very hard to handle the routines that are
invoked directly by vfs code. For instance:
const struct inode_operations ocfs2_file_iops = {
.permission = ocfs2_permission,
.get_acl = ocfs2_iop_get_acl,
.set_acl = ocfs2_iop_set_acl,
};
Both ocfs2_permission() and ocfs2_iop_get_acl() call ocfs2_inode_lock(PR):
do_sys_open
may_open
inode_permission
ocfs2_permission
ocfs2_inode_lock() <=== first time
generic_permission
get_acl
ocfs2_iop_get_acl
ocfs2_inode_lock() <=== recursive one
A deadlock will occur if a remote EX request comes in between the two
calls to ocfs2_inode_lock(). Briefly, here is how the deadlock is formed:
On one hand, the OCFS2_LOCK_BLOCKED flag of this lockres is set in
BAST (ocfs2_generic_handle_bast) when downconvert is started on behalf of
the remote EX lock request. On the other hand, the recursive cluster lock
(the second one) will be blocked in __ocfs2_cluster_lock() because of
OCFS2_LOCK_BLOCKED. But the downconvert never completes. Why? Because
there is no chance for the first cluster lock on this node to be
unlocked - we block ourselves in the code path.
The idea to fix this issue is mostly taken from gfs2 code.
1. introduce a new field: struct ocfs2_lock_res.l_holders, to keep track
of the processes' pid who has taken the cluster lock of this lock
resource;
2. introduce a new flag for ocfs2_inode_lock_full:
OCFS2_META_LOCK_GETBH; it means just getting back disk inode bh for
us if we've got cluster lock.
3. export a helper: ocfs2_is_locked_by_me() is used to check if we have
got the cluster lock in the upper code path.
The tracking logic should be used by some of the ocfs2 vfs callbacks,
to solve the recursive locking issue caused by the fact that vfs
routines can call into each other.
The performance penalty of processing the holder list should only be
seen at a few cases where the tracking logic is used, such as get/set
acl.
You may ask: what if the first time we got a PR lock, and the second time
we want an EX lock? Fortunately, this case never happens in the real
world, as far as I can see, including permission check,
(get|set)_(acl|attr), and the gfs2 code also does so.
[sfr@canb.auug.org.au remove some inlines]
Link: http://lkml.kernel.org/r/20170117100948.11657-2-zren@suse.com
Signed-off-by: Eric Ren <zren@suse.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Joseph Qi <jiangqi903@gmail.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Mark Fasheh <mfasheh@versity.com>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-02-23 07:40:41 +08:00
|
|
|
};
|
|
|
|
|
2007-10-19 06:30:42 +08:00
|
|
|
/*
 * ocfs2_inode_lock_full() 'arg_flags' flags
 *
 * Each value below is a distinct single bit (0x01, 0x02, 0x04), so several
 * of them can be OR-ed together into one 'arg_flags' argument.
 */
|
2005-12-16 06:31:24 +08:00
|
|
|
/* don't wait on recovery. */
|
|
|
|
#define OCFS2_META_LOCK_RECOVERY (0x01)
|
|
|
|
/*
 * Instruct the dlm not to queue ourselves on the other node.
 * This is the flag the ocfs2_try_inode_lock() macro in this header passes.
 */
|
|
|
|
#define OCFS2_META_LOCK_NOQUEUE (0x02)
|
2007-09-25 06:56:19 +08:00
|
|
|
/* don't block waiting for the downconvert thread, instead return -EAGAIN */
|
2005-12-16 06:31:24 +08:00
|
|
|
#define OCFS2_LOCK_NONBLOCK (0x04)
|
ocfs2/dlmglue: prepare tracking logic to avoid recursive cluster lock
We are in the situation that we have to avoid recursive cluster locking,
but there is no way to check if a cluster lock has already been taken by a
process.
Mostly, we can avoid recursive locking by writing code carefully.
However, we found that it's very hard to handle the routines that are
invoked directly by vfs code. For instance:
const struct inode_operations ocfs2_file_iops = {
.permission = ocfs2_permission,
.get_acl = ocfs2_iop_get_acl,
.set_acl = ocfs2_iop_set_acl,
};
Both ocfs2_permission() and ocfs2_iop_get_acl() call ocfs2_inode_lock(PR):
do_sys_open
may_open
inode_permission
ocfs2_permission
ocfs2_inode_lock() <=== first time
generic_permission
get_acl
ocfs2_iop_get_acl
ocfs2_inode_lock() <=== recursive one
A deadlock will occur if a remote EX request comes in between the two
calls to ocfs2_inode_lock(). Briefly, here is how the deadlock is formed:
On one hand, the OCFS2_LOCK_BLOCKED flag of this lockres is set in
BAST (ocfs2_generic_handle_bast) when downconvert is started on behalf of
the remote EX lock request. On the other hand, the recursive cluster lock
(the second one) will be blocked in __ocfs2_cluster_lock() because of
OCFS2_LOCK_BLOCKED. But the downconvert never completes. Why? Because
there is no chance for the first cluster lock on this node to be
unlocked - we block ourselves in the code path.
The idea to fix this issue is mostly taken from gfs2 code.
1. introduce a new field: struct ocfs2_lock_res.l_holders, to keep track
of the processes' pid who has taken the cluster lock of this lock
resource;
2. introduce a new flag for ocfs2_inode_lock_full:
OCFS2_META_LOCK_GETBH; it means just getting back disk inode bh for
us if we've got cluster lock.
3. export a helper: ocfs2_is_locked_by_me() is used to check if we have
got the cluster lock in the upper code path.
The tracking logic should be used by some of the ocfs2 vfs callbacks,
to solve the recursive locking issue caused by the fact that vfs
routines can call into each other.
The performance penalty of processing the holder list should only be
seen at a few cases where the tracking logic is used, such as get/set
acl.
You may ask: what if the first time we got a PR lock, and the second time
we want an EX lock? Fortunately, this case never happens in the real
world, as far as I can see, including permission check,
(get|set)_(acl|attr), and the gfs2 code also does so.
[sfr@canb.auug.org.au remove some inlines]
Link: http://lkml.kernel.org/r/20170117100948.11657-2-zren@suse.com
Signed-off-by: Eric Ren <zren@suse.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Joseph Qi <jiangqi903@gmail.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Mark Fasheh <mfasheh@versity.com>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-02-23 07:40:41 +08:00
|
|
|
/*
 * just get back disk inode bh if we've got cluster lock.
 *
 * Another single-bit 'arg_flags' value (0x08), distinct from the 0x01/0x02/
 * 0x04 flags defined earlier in this header.
 */
|
|
|
|
#define OCFS2_META_LOCK_GETBH (0x08)
|
2005-12-16 06:31:24 +08:00
|
|
|
|
2009-06-04 21:26:50 +08:00
|
|
|
/*
 * Locking subclasses of inode cluster lock
 *
 * These are the values accepted by the 'subclass' argument of
 * ocfs2_inode_lock_full_nested(); the plain ocfs2_inode_lock*() wrapper
 * macros in this header pass OI_LS_NORMAL. Only the first enumerator has an
 * explicit value, so the rest number consecutively: OI_LS_PARENT = 1,
 * OI_LS_RENAME1 = 2, OI_LS_RENAME2 = 3, OI_LS_REFLINK_TARGET = 4.
 * NOTE(review): presumably these map to lockdep subclasses for nested inode
 * locking -- confirm in the implementation.
 */
|
|
|
|
enum {
|
|
|
|
OI_LS_NORMAL = 0,
|
|
|
|
OI_LS_PARENT,
|
|
|
|
OI_LS_RENAME1,
|
|
|
|
OI_LS_RENAME2,
|
2010-09-07 13:30:06 +08:00
|
|
|
OI_LS_REFLINK_TARGET,
|
2009-06-04 21:26:50 +08:00
|
|
|
};
|
|
|
|
|
2005-12-16 06:31:24 +08:00
|
|
|
int ocfs2_dlm_init(struct ocfs2_super *osb);
|
2008-02-02 07:03:57 +08:00
|
|
|
void ocfs2_dlm_shutdown(struct ocfs2_super *osb, int hangup_pending);
|
2005-12-16 06:31:24 +08:00
|
|
|
void ocfs2_lock_res_init_once(struct ocfs2_lock_res *res);
|
|
|
|
void ocfs2_inode_lock_res_init(struct ocfs2_lock_res *res,
|
|
|
|
enum ocfs2_lock_type type,
|
2006-09-23 08:28:19 +08:00
|
|
|
unsigned int generation,
|
2005-12-16 06:31:24 +08:00
|
|
|
struct inode *inode);
|
2006-09-09 05:14:34 +08:00
|
|
|
void ocfs2_dentry_lock_res_init(struct ocfs2_dentry_lock *dl,
|
|
|
|
u64 parent, struct inode *inode);
|
2007-12-21 08:43:10 +08:00
|
|
|
struct ocfs2_file_private;
|
|
|
|
void ocfs2_file_lock_res_init(struct ocfs2_lock_res *lockres,
|
|
|
|
struct ocfs2_file_private *fp);
|
2008-08-26 01:56:50 +08:00
|
|
|
struct ocfs2_mem_dqinfo;
|
|
|
|
void ocfs2_qinfo_lock_res_init(struct ocfs2_lock_res *lockres,
|
|
|
|
struct ocfs2_mem_dqinfo *info);
|
2009-08-18 11:19:58 +08:00
|
|
|
void ocfs2_refcount_lock_res_init(struct ocfs2_lock_res *lockres,
|
|
|
|
struct ocfs2_super *osb, u64 ref_blkno,
|
|
|
|
unsigned int generation);
|
2005-12-16 06:31:24 +08:00
|
|
|
void ocfs2_lock_res_free(struct ocfs2_lock_res *res);
|
|
|
|
int ocfs2_create_new_inode_locks(struct inode *inode);
|
|
|
|
int ocfs2_drop_inode_locks(struct inode *inode);
|
|
|
|
int ocfs2_rw_lock(struct inode *inode, int write);
|
2018-02-01 08:15:17 +08:00
|
|
|
int ocfs2_try_rw_lock(struct inode *inode, int write);
|
2005-12-16 06:31:24 +08:00
|
|
|
void ocfs2_rw_unlock(struct inode *inode, int write);
|
2007-03-21 07:01:38 +08:00
|
|
|
int ocfs2_open_lock(struct inode *inode);
|
|
|
|
int ocfs2_try_open_lock(struct inode *inode, int write);
|
|
|
|
void ocfs2_open_unlock(struct inode *inode);
|
2007-10-19 06:30:42 +08:00
|
|
|
int ocfs2_inode_lock_atime(struct inode *inode,
|
2006-11-15 15:48:42 +08:00
|
|
|
struct vfsmount *vfsmnt,
|
2018-02-01 08:15:25 +08:00
|
|
|
int *level, int wait);
|
2009-06-04 21:26:50 +08:00
|
|
|
int ocfs2_inode_lock_full_nested(struct inode *inode,
|
2005-12-16 06:31:24 +08:00
|
|
|
struct buffer_head **ret_bh,
|
|
|
|
int ex,
|
2009-06-04 21:26:50 +08:00
|
|
|
int arg_flags,
|
|
|
|
int subclass);
|
2007-10-19 06:30:42 +08:00
|
|
|
int ocfs2_inode_lock_with_page(struct inode *inode,
|
2005-12-16 06:31:24 +08:00
|
|
|
struct buffer_head **ret_bh,
|
|
|
|
int ex,
|
|
|
|
struct page *page);
|
2009-06-04 21:26:50 +08:00
|
|
|
/*
 * Variants without special locking class or flags
 *
 * All four macros below expand to a single call of
 * ocfs2_inode_lock_full_nested(inode, ret_bh, ex, arg_flags, subclass) and
 * differ only in which of the last two arguments they fix.
 */
|
|
|
|
/* Caller chooses the flags (f); the subclass is fixed to OI_LS_NORMAL. */
#define ocfs2_inode_lock_full(i, r, e, f)\
|
|
|
|
ocfs2_inode_lock_full_nested(i, r, e, f, OI_LS_NORMAL)
|
|
|
|
/* Caller chooses the subclass (s); the flags are fixed to 0. */
#define ocfs2_inode_lock_nested(i, b, e, s)\
|
|
|
|
ocfs2_inode_lock_full_nested(i, b, e, 0, s)
|
2005-12-16 06:31:24 +08:00
|
|
|
/* 99% of the time we don't want to supply any additional flags --
|
|
|
|
* those are for very specific cases only. */
|
2009-06-04 21:26:50 +08:00
|
|
|
/* The common form: flags fixed to 0 and subclass fixed to OI_LS_NORMAL. */
#define ocfs2_inode_lock(i, b, e) ocfs2_inode_lock_full_nested(i, b, e, 0, OI_LS_NORMAL)
|
2018-02-01 08:15:17 +08:00
|
|
|
/*
 * Like ocfs2_inode_lock() but passes OCFS2_META_LOCK_NOQUEUE as the flags.
 * NOTE(review): presumably this makes the call fail instead of waiting when
 * the lock is contended -- confirm the returned error code in the
 * implementation.
 */
#define ocfs2_try_inode_lock(i, b, e)\
|
|
|
|
ocfs2_inode_lock_full_nested(i, b, e, OCFS2_META_LOCK_NOQUEUE,\
|
|
|
|
OI_LS_NORMAL)
|
2007-10-19 06:30:42 +08:00
|
|
|
void ocfs2_inode_unlock(struct inode *inode,
|
2005-12-16 06:31:24 +08:00
|
|
|
int ex);
|
|
|
|
int ocfs2_super_lock(struct ocfs2_super *osb,
|
|
|
|
int ex);
|
|
|
|
void ocfs2_super_unlock(struct ocfs2_super *osb,
|
|
|
|
int ex);
|
2009-06-23 02:40:07 +08:00
|
|
|
int ocfs2_orphan_scan_lock(struct ocfs2_super *osb, u32 *seqno);
|
|
|
|
void ocfs2_orphan_scan_unlock(struct ocfs2_super *osb, u32 seqno);
|
2009-06-04 08:02:55 +08:00
|
|
|
|
2005-12-16 06:31:24 +08:00
|
|
|
int ocfs2_rename_lock(struct ocfs2_super *osb);
|
|
|
|
void ocfs2_rename_unlock(struct ocfs2_super *osb);
|
2009-03-06 21:29:10 +08:00
|
|
|
int ocfs2_nfs_sync_lock(struct ocfs2_super *osb, int ex);
|
|
|
|
void ocfs2_nfs_sync_unlock(struct ocfs2_super *osb, int ex);
|
2018-02-01 08:15:10 +08:00
|
|
|
void ocfs2_trim_fs_lock_res_init(struct ocfs2_super *osb);
|
|
|
|
void ocfs2_trim_fs_lock_res_uninit(struct ocfs2_super *osb);
|
|
|
|
int ocfs2_trim_fs_lock(struct ocfs2_super *osb,
|
|
|
|
struct ocfs2_trim_fs_info *info, int trylock);
|
|
|
|
void ocfs2_trim_fs_unlock(struct ocfs2_super *osb,
|
|
|
|
struct ocfs2_trim_fs_info *info);
|
2006-09-09 05:14:34 +08:00
|
|
|
int ocfs2_dentry_lock(struct dentry *dentry, int ex);
|
|
|
|
void ocfs2_dentry_unlock(struct dentry *dentry, int ex);
|
2007-12-21 08:43:10 +08:00
|
|
|
int ocfs2_file_lock(struct file *file, int ex, int trylock);
|
|
|
|
void ocfs2_file_unlock(struct file *file);
|
2008-08-26 01:56:50 +08:00
|
|
|
int ocfs2_qinfo_lock(struct ocfs2_mem_dqinfo *oinfo, int ex);
|
|
|
|
void ocfs2_qinfo_unlock(struct ocfs2_mem_dqinfo *oinfo, int ex);
|
2009-08-18 11:19:58 +08:00
|
|
|
struct ocfs2_refcount_tree;
|
|
|
|
int ocfs2_refcount_lock(struct ocfs2_refcount_tree *ref_tree, int ex);
|
|
|
|
void ocfs2_refcount_unlock(struct ocfs2_refcount_tree *ref_tree, int ex);
|
2008-08-26 01:56:50 +08:00
|
|
|
|
2006-09-09 05:14:34 +08:00
|
|
|
|
2014-04-04 05:46:57 +08:00
|
|
|
void ocfs2_mark_lockres_freeing(struct ocfs2_super *osb,
|
|
|
|
struct ocfs2_lock_res *lockres);
|
2006-09-09 05:14:34 +08:00
|
|
|
void ocfs2_simple_drop_lockres(struct ocfs2_super *osb,
|
|
|
|
struct ocfs2_lock_res *lockres);
|
2005-12-16 06:31:24 +08:00
|
|
|
|
2007-09-25 06:56:19 +08:00
|
|
|
/* for the downconvert thread */
|
|
|
|
void ocfs2_wake_downconvert_thread(struct ocfs2_super *osb);
|
2005-12-16 06:31:24 +08:00
|
|
|
|
|
|
|
struct ocfs2_dlm_debug *ocfs2_new_dlm_debug(void);
|
|
|
|
void ocfs2_put_dlm_debug(struct ocfs2_dlm_debug *dlm_debug);
|
|
|
|
|
2008-01-31 08:58:36 +08:00
|
|
|
/* To set the locking protocol on module initialization */
|
|
|
|
void ocfs2_set_locking_protocol(void);
|
ocfs2/dlmglue: prepare tracking logic to avoid recursive cluster lock
We are in the situation that we have to avoid recursive cluster locking,
but there is no way to check if a cluster lock has already been taken by a
process.
Mostly, we can avoid recursive locking by writing code carefully.
However, we found that it's very hard to handle the routines that are
invoked directly by vfs code. For instance:
const struct inode_operations ocfs2_file_iops = {
.permission = ocfs2_permission,
.get_acl = ocfs2_iop_get_acl,
.set_acl = ocfs2_iop_set_acl,
};
Both ocfs2_permission() and ocfs2_iop_get_acl() call ocfs2_inode_lock(PR):
do_sys_open
may_open
inode_permission
ocfs2_permission
ocfs2_inode_lock() <=== first time
generic_permission
get_acl
ocfs2_iop_get_acl
ocfs2_inode_lock() <=== recursive one
A deadlock will occur if a remote EX request comes in between the two
calls to ocfs2_inode_lock(). Briefly, here is how the deadlock is formed:
On one hand, the OCFS2_LOCK_BLOCKED flag of this lockres is set in
BAST (ocfs2_generic_handle_bast) when downconvert is started on behalf of
the remote EX lock request. On the other hand, the recursive cluster lock
(the second one) will be blocked in __ocfs2_cluster_lock() because of
OCFS2_LOCK_BLOCKED. But the downconvert never completes. Why? Because
there is no chance for the first cluster lock on this node to be
unlocked - we block ourselves in the code path.
The idea to fix this issue is mostly taken from gfs2 code.
1. introduce a new field: struct ocfs2_lock_res.l_holders, to keep track
of the processes' pid who has taken the cluster lock of this lock
resource;
2. introduce a new flag for ocfs2_inode_lock_full:
OCFS2_META_LOCK_GETBH; it means just getting back disk inode bh for
us if we've got cluster lock.
3. export a helper: ocfs2_is_locked_by_me() is used to check if we have
got the cluster lock in the upper code path.
The tracking logic should be used by some of the ocfs2 vfs callbacks,
to solve the recursive locking issue caused by the fact that vfs
routines can call into each other.
The performance penalty of processing the holder list should only be
seen at a few cases where the tracking logic is used, such as get/set
acl.
You may ask: what if the first time we got a PR lock, and the second time
we want an EX lock? Fortunately, this case never happens in the real
world, as far as I can see, including permission check,
(get|set)_(acl|attr), and the gfs2 code also does so.
[sfr@canb.auug.org.au remove some inlines]
Link: http://lkml.kernel.org/r/20170117100948.11657-2-zren@suse.com
Signed-off-by: Eric Ren <zren@suse.com>
Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com>
Reviewed-by: Joseph Qi <jiangqi903@gmail.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Cc: Mark Fasheh <mfasheh@versity.com>
Cc: Joel Becker <jlbec@evilplan.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2017-02-23 07:40:41 +08:00
|
|
|
|
|
|
|
/*
 * The _tracker pair is used to avoid recursive cluster locking: it lets a
 * code path that may already hold the inode cluster lock (for example vfs
 * callbacks that call into each other) ask for it again without blocking on
 * itself.
 *
 * ocfs2_inode_lock_tracker() takes the same inode/ret_bh/ex arguments as
 * ocfs2_inode_lock() plus a caller-provided struct ocfs2_lock_holder.
 * ocfs2_inode_unlock_tracker() takes the same holder and a had_lock value.
 *
 * NOTE(review): presumably the lock call's return value tells whether the
 * caller already held the lock, and that value is what must be passed back
 * as had_lock so the unlock side only drops a lock this call actually took
 * -- confirm the exact return convention in the implementation.
 * NOTE(review): requesting ex=1 while the same process holds the lock with
 * ex=0 would be a lock upgrade; confirm how the implementation rejects it.
 */
|
|
|
|
int ocfs2_inode_lock_tracker(struct inode *inode,
|
|
|
|
struct buffer_head **ret_bh,
|
|
|
|
int ex,
|
|
|
|
struct ocfs2_lock_holder *oh);
|
|
|
|
void ocfs2_inode_unlock_tracker(struct inode *inode,
|
|
|
|
int ex,
|
|
|
|
struct ocfs2_lock_holder *oh,
|
|
|
|
int had_lock);
|
|
|
|
|
2005-12-16 06:31:24 +08:00
|
|
|
#endif /* DLMGLUE_H */
|