From ee860b6a650360c91f5d5f9a94262aad9be90015 Mon Sep 17 00:00:00 2001 From: Sachin Prabhu Date: Wed, 10 Mar 2010 10:28:40 -0500 Subject: [PATCH 01/11] [PATCH] Skip check for mandatory locks when unlocking ocfs2_lock() will skip locks on file which has mode set to 02666. This is a problem in cases where the mode of the file is changed after a process has obtained a lock on the file. ocfs2_lock() should skip the check for mandatory locks when unlocking a file. Signed-off-by: Sachin Prabhu Signed-off-by: Joel Becker --- fs/ocfs2/locks.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/locks.c b/fs/ocfs2/locks.c index 544ac6245175..b5cb3ede9408 100644 --- a/fs/ocfs2/locks.c +++ b/fs/ocfs2/locks.c @@ -133,7 +133,7 @@ int ocfs2_lock(struct file *file, int cmd, struct file_lock *fl) if (!(fl->fl_flags & FL_POSIX)) return -ENOLCK; - if (__mandatory_lock(inode)) + if (__mandatory_lock(inode) && fl->fl_type != F_UNLCK) return -ENOLCK; return ocfs2_plock(osb->cconn, OCFS2_I(inode)->ip_blkno, file, cmd, fl); From 78c37eb0d5e6a9727b12ea0f1821795ffaa66cfe Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Wed, 3 Mar 2010 11:26:27 +0800 Subject: [PATCH 02/11] ocfs2: Change bg_chain check for ocfs2_validate_gd_parent. In ocfs2_validate_gd_parent, we check bg_chain against the cl_next_free_rec of the dinode. Actually in resize, we have the chance of bg_chain == cl_next_free_rec. So add some additional condition check for it. I also rename paramter "clean_error" to "resize", since the old one is not clearly enough to indicate that we should only meet with this case in resize. btw, the correpsonding bug is http://oss.oracle.com/bugzilla/show_bug.cgi?id=1230. Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/suballoc.c | 13 ++++++++----- 1 file changed, 8 insertions(+), 5 deletions(-) diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index c3c60bc3e072..0016503d83ef 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -152,7 +152,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl) #define do_error(fmt, ...) \ do{ \ - if (clean_error) \ + if (resize) \ mlog(ML_ERROR, fmt "\n", ##__VA_ARGS__); \ else \ ocfs2_error(sb, fmt, ##__VA_ARGS__); \ @@ -160,7 +160,7 @@ static u32 ocfs2_bits_per_group(struct ocfs2_chain_list *cl) static int ocfs2_validate_gd_self(struct super_block *sb, struct buffer_head *bh, - int clean_error) + int resize) { struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; @@ -211,7 +211,7 @@ static int ocfs2_validate_gd_self(struct super_block *sb, static int ocfs2_validate_gd_parent(struct super_block *sb, struct ocfs2_dinode *di, struct buffer_head *bh, - int clean_error) + int resize) { unsigned int max_bits; struct ocfs2_group_desc *gd = (struct ocfs2_group_desc *)bh->b_data; @@ -233,8 +233,11 @@ static int ocfs2_validate_gd_parent(struct super_block *sb, return -EINVAL; } - if (le16_to_cpu(gd->bg_chain) >= - le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) { + /* In resize, we may meet the case bg_chain == cl_next_free_rec. */ + if ((le16_to_cpu(gd->bg_chain) > + le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) || + ((le16_to_cpu(gd->bg_chain) == + le16_to_cpu(di->id2.i_chain.cl_next_free_rec)) && !resize)) { do_error("Group descriptor #%llu has bad chain %u", (unsigned long long)bh->b_blocknr, le16_to_cpu(gd->bg_chain)); From 6527f8f848ec84b9daf1cb07601266126b8507ab Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Wed, 10 Mar 2010 09:56:52 +0800 Subject: [PATCH 03/11] ocfs2: Update i_blocks in reflink operations. In reflink, we need to upate i_blocks for the target inode. Reported-by: Jie Liu Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/refcounttree.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index 9e96921dffda..29405f2ff616 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4075,6 +4075,7 @@ static int ocfs2_complete_reflink(struct inode *s_inode, OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features; spin_unlock(&OCFS2_I(t_inode)->ip_lock); i_size_write(t_inode, size); + t_inode->i_blocks = s_inode->i_blocks; di->i_xattr_inline_size = s_di->i_xattr_inline_size; di->i_clusters = s_di->i_clusters; From fcefd25ac89239cb57fa198f125a79ff85468c75 Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Mon, 15 Mar 2010 15:39:00 -0700 Subject: [PATCH 04/11] ocfs2: set i_mode on disk during acl operations ocfs2_set_acl() and ocfs2_init_acl() were setting i_mode on the in-memory inode, but never setting it on the disk copy. Thus, acls were some times not getting propagated between nodes. This patch fixes the issue by adding a helper function ocfs2_acl_set_mode() which does this the right way. ocfs2_set_acl() and ocfs2_init_acl() are then updated to call ocfs2_acl_set_mode(). Signed-off-by: Mark Fasheh Signed-off-by: Joel Becker --- fs/ocfs2/acl.c | 77 ++++++++++++++++++++++++++++++++++++++++++++++---- 1 file changed, 72 insertions(+), 5 deletions(-) diff --git a/fs/ocfs2/acl.c b/fs/ocfs2/acl.c index 0501974bedd0..8ccf0f8c9cc8 100644 --- a/fs/ocfs2/acl.c +++ b/fs/ocfs2/acl.c @@ -30,6 +30,8 @@ #include "alloc.h" #include "dlmglue.h" #include "file.h" +#include "inode.h" +#include "journal.h" #include "ocfs2_fs.h" #include "xattr.h" @@ -165,6 +167,60 @@ static struct posix_acl *ocfs2_get_acl(struct inode *inode, int type) return acl; } +/* + * Helper function to set i_mode in memory and disk. Some call paths + * will not have di_bh or a journal handle to pass, in which case it + * will create it's own. + */ +static int ocfs2_acl_set_mode(struct inode *inode, struct buffer_head *di_bh, + handle_t *handle, umode_t new_mode) +{ + int ret, commit_handle = 0; + struct ocfs2_dinode *di; + + if (di_bh == NULL) { + ret = ocfs2_read_inode_block(inode, &di_bh); + if (ret) { + mlog_errno(ret); + goto out; + } + } else + get_bh(di_bh); + + if (handle == NULL) { + handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb), + OCFS2_INODE_UPDATE_CREDITS); + if (IS_ERR(handle)) { + ret = PTR_ERR(handle); + mlog_errno(ret); + goto out_brelse; + } + + commit_handle = 1; + } + + di = (struct ocfs2_dinode *)di_bh->b_data; + ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (ret) { + mlog_errno(ret); + goto out_commit; + } + + inode->i_mode = new_mode; + di->i_mode = cpu_to_le16(inode->i_mode); + + ocfs2_journal_dirty(handle, di_bh); + +out_commit: + if (commit_handle) + ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle); +out_brelse: + brelse(di_bh); +out: + return ret; +} + /* * Set the access or default ACL of an inode. */ @@ -193,9 +249,14 @@ static int ocfs2_set_acl(handle_t *handle, if (ret < 0) return ret; else { - inode->i_mode = mode; if (ret == 0) acl = NULL; + + ret = ocfs2_acl_set_mode(inode, di_bh, + handle, mode); + if (ret) + return ret; + } } break; @@ -283,6 +344,7 @@ int ocfs2_init_acl(handle_t *handle, struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); struct posix_acl *acl = NULL; int ret = 0; + mode_t mode; if (!S_ISLNK(inode->i_mode)) { if (osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) { @@ -291,12 +353,17 @@ int ocfs2_init_acl(handle_t *handle, if (IS_ERR(acl)) return PTR_ERR(acl); } - if (!acl) - inode->i_mode &= ~current_umask(); + if (!acl) { + mode = inode->i_mode & ~current_umask(); + ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode); + if (ret) { + mlog_errno(ret); + goto cleanup; + } + } } if ((osb->s_mount_opt & OCFS2_MOUNT_POSIX_ACL) && acl) { struct posix_acl *clone; - mode_t mode; if (S_ISDIR(inode->i_mode)) { ret = ocfs2_set_acl(handle, inode, di_bh, @@ -313,7 +380,7 @@ int ocfs2_init_acl(handle_t *handle, mode = inode->i_mode; ret = posix_acl_create_masq(clone, &mode); if (ret >= 0) { - inode->i_mode = mode; + ret = ocfs2_acl_set_mode(inode, di_bh, handle, mode); if (ret > 0) { ret = ocfs2_set_acl(handle, inode, di_bh, ACL_TYPE_ACCESS, From b22b63ebafb97b66d1054e69941ee049d790c6cf Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Thu, 11 Mar 2010 18:43:46 -0800 Subject: [PATCH 05/11] ocfs2: Always try for maximum bits with new local alloc windows What we were doing before was to ask for the current window size as the maximum allocation. This had the effect of limiting the amount of allocation we could get for the local alloc during times when the window size was shrunk due to fragmentation. In some cases, that could actually *increase* fragmentation by artificially limiting the number of bits we can accept. So while we still want to ask for a minimum number of bits equal to window size, there is no reason why we should limit the number of bits the local alloc should accept. Hence always allow the maximum number of local alloc bits. Signed-off-by: Mark Fasheh Signed-off-by: Joel Becker --- fs/ocfs2/localalloc.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index ca992d91f511..171c691b42a0 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -984,8 +984,7 @@ static int ocfs2_local_alloc_reserve_for_window(struct ocfs2_super *osb, } retry_enospc: - (*ac)->ac_bits_wanted = osb->local_alloc_bits; - + (*ac)->ac_bits_wanted = osb->local_alloc_default_bits; status = ocfs2_reserve_cluster_bitmap_bits(osb, *ac); if (status == -ENOSPC) { if (ocfs2_recalc_la_window(osb, OCFS2_LA_EVENT_ENOSPC) == @@ -1061,6 +1060,7 @@ retry_enospc: OCFS2_LA_DISABLED) goto bail; + ac->ac_bits_wanted = osb->local_alloc_default_bits; status = ocfs2_claim_clusters(osb, handle, ac, osb->local_alloc_bits, &cluster_off, From dfe4d3d6a6f707fff1dbfd4b8fce65e64a91b809 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Fri, 19 Mar 2010 15:04:23 +0800 Subject: [PATCH 06/11] ocfs2: Fix the update of name_offset when removing xattrs When replacing a xattr's value, in some case we wipe its name/value first and then re-add it. The wipe is done by ocfs2_xa_block_wipe_namevalue() when the xattr is in the inode or block. We currently adjust name_offset for all the entries which have (offset < name_offset). This does not adjust the entrie we're replacing. Since we are replacing the entry, we don't adjust the total entry count. When we calculate a new namevalue location, we trust the entries now-wrong offset in ocfs2_xa_get_free_start(). The solution is to also adjust the name_offset for the replaced entry, allowing ocfs2_xa_get_free_start() to calculate the new namevalue location correctly. The following script can trigger a kernel panic easily. echo 'y'|mkfs.ocfs2 --fs-features=local,xattr -b 4K $DEVICE mount -t ocfs2 $DEVICE $MNT_DIR FILE=$MNT_DIR/$RANDOM for((i=0;i<76;i++)) do string_76="a$string_76" done string_78="aa$string_76" string_82="aaaa$string_78" touch $FILE setfattr -n 'user.test1234567890' -v $string_76 $FILE setfattr -n 'user.test1234567890' -v $string_78 $FILE setfattr -n 'user.test1234567890' -v $string_82 $FILE Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/xattr.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index d1b0d386f6d1..82c2a0b53eb4 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -1622,7 +1622,7 @@ static void ocfs2_xa_block_wipe_namevalue(struct ocfs2_xa_loc *loc) /* Now tell xh->xh_entries about it */ for (i = 0; i < count; i++) { offset = le16_to_cpu(xh->xh_entries[i].xe_name_offset); - if (offset < namevalue_offset) + if (offset <= namevalue_offset) le16_add_cpu(&xh->xh_entries[i].xe_name_offset, namevalue_size); } From b23179681c90a55e2a2083e1dde9f727ecffb2b7 Mon Sep 17 00:00:00 2001 From: Tao Ma Date: Fri, 19 Mar 2010 15:04:24 +0800 Subject: [PATCH 07/11] ocfs2: Init meta_ac properly in ocfs2_create_empty_xattr_block. You can't store a pointer that you haven't filled in yet and expect it to work. Signed-off-by: Tao Ma Signed-off-by: Joel Becker --- fs/ocfs2/xattr.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/ocfs2/xattr.c b/fs/ocfs2/xattr.c index 82c2a0b53eb4..3e7773089b96 100644 --- a/fs/ocfs2/xattr.c +++ b/fs/ocfs2/xattr.c @@ -6528,13 +6528,11 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode, int indexed) { int ret; - struct ocfs2_alloc_context *meta_ac; struct ocfs2_super *osb = OCFS2_SB(inode->i_sb); - struct ocfs2_xattr_set_ctxt ctxt = { - .meta_ac = meta_ac, - }; + struct ocfs2_xattr_set_ctxt ctxt; - ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac); + memset(&ctxt, 0, sizeof(ctxt)); + ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &ctxt.meta_ac); if (ret < 0) { mlog_errno(ret); return ret; @@ -6556,7 +6554,7 @@ static int ocfs2_create_empty_xattr_block(struct inode *inode, ocfs2_commit_trans(osb, ctxt.handle); out: - ocfs2_free_alloc_context(meta_ac); + ocfs2_free_alloc_context(ctxt.meta_ac); return ret; } From b4414eea0e7b9c134262c801a87e338bf675962c Mon Sep 17 00:00:00 2001 From: Mark Fasheh Date: Thu, 11 Mar 2010 18:31:09 -0800 Subject: [PATCH 08/11] ocfs2: Clear undo bits when local alloc is freed When the local alloc file changes windows, unused bits are freed back to the global bitmap. By defnition, those bits can not be in use by any file. Also, the local alloc will never have been able to allocate those bits if they were part of a previous truncate. Therefore it makes sense that we should clear unused local alloc bits in the undo buffer so that they can be used immediatly. [ Modified to call it ocfs2_release_clusters() -- Joel ] Signed-off-by: Mark Fasheh Signed-off-by: Joel Becker --- fs/ocfs2/localalloc.c | 6 ++- fs/ocfs2/ocfs2.h | 14 ++++- fs/ocfs2/suballoc.c | 116 +++++++++++++++++++++++++++--------------- fs/ocfs2/suballoc.h | 5 ++ 4 files changed, 95 insertions(+), 46 deletions(-) diff --git a/fs/ocfs2/localalloc.c b/fs/ocfs2/localalloc.c index 171c691b42a0..c983715d8d8c 100644 --- a/fs/ocfs2/localalloc.c +++ b/fs/ocfs2/localalloc.c @@ -872,8 +872,10 @@ static int ocfs2_sync_local_to_main(struct ocfs2_super *osb, (unsigned long long)la_start_blk, (unsigned long long)blkno); - status = ocfs2_free_clusters(handle, main_bm_inode, - main_bm_bh, blkno, count); + status = ocfs2_release_clusters(handle, + main_bm_inode, + main_bm_bh, blkno, + count); if (status < 0) { mlog_errno(status); goto bail; diff --git a/fs/ocfs2/ocfs2.h b/fs/ocfs2/ocfs2.h index 1238b491db90..adf5e2ebc2c4 100644 --- a/fs/ocfs2/ocfs2.h +++ b/fs/ocfs2/ocfs2.h @@ -763,8 +763,18 @@ static inline unsigned int ocfs2_megabytes_to_clusters(struct super_block *sb, return megs << (20 - OCFS2_SB(sb)->s_clustersize_bits); } -#define ocfs2_set_bit ext2_set_bit -#define ocfs2_clear_bit ext2_clear_bit +static inline void _ocfs2_set_bit(unsigned int bit, unsigned long *bitmap) +{ + ext2_set_bit(bit, bitmap); +} +#define ocfs2_set_bit(bit, addr) _ocfs2_set_bit((bit), (unsigned long *)(addr)) + +static inline void _ocfs2_clear_bit(unsigned int bit, unsigned long *bitmap) +{ + ext2_clear_bit(bit, bitmap); +} +#define ocfs2_clear_bit(bit, addr) _ocfs2_clear_bit((bit), (unsigned long *)(addr)) + #define ocfs2_test_bit ext2_test_bit #define ocfs2_find_next_zero_bit ext2_find_next_zero_bit #define ocfs2_find_next_bit ext2_find_next_bit diff --git a/fs/ocfs2/suballoc.c b/fs/ocfs2/suballoc.c index 0016503d83ef..19ba00f28547 100644 --- a/fs/ocfs2/suballoc.c +++ b/fs/ocfs2/suballoc.c @@ -95,13 +95,6 @@ static inline int ocfs2_block_group_set_bits(handle_t *handle, struct buffer_head *group_bh, unsigned int bit_off, unsigned int num_bits); -static inline int ocfs2_block_group_clear_bits(handle_t *handle, - struct inode *alloc_inode, - struct ocfs2_group_desc *bg, - struct buffer_head *group_bh, - unsigned int bit_off, - unsigned int num_bits); - static int ocfs2_relink_block_group(handle_t *handle, struct inode *alloc_inode, struct buffer_head *fe_bh, @@ -1978,18 +1971,18 @@ int ocfs2_claim_clusters(struct ocfs2_super *osb, bits_wanted, cluster_start, num_clusters); } -static inline int ocfs2_block_group_clear_bits(handle_t *handle, - struct inode *alloc_inode, - struct ocfs2_group_desc *bg, - struct buffer_head *group_bh, - unsigned int bit_off, - unsigned int num_bits) +static int ocfs2_block_group_clear_bits(handle_t *handle, + struct inode *alloc_inode, + struct ocfs2_group_desc *bg, + struct buffer_head *group_bh, + unsigned int bit_off, + unsigned int num_bits, + void (*undo_fn)(unsigned int bit, + unsigned long *bmap)) { int status; unsigned int tmp; - int journal_type = OCFS2_JOURNAL_ACCESS_WRITE; struct ocfs2_group_desc *undo_bg = NULL; - int cluster_bitmap = 0; mlog_entry_void(); @@ -1999,20 +1992,18 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle, mlog(0, "off = %u, num = %u\n", bit_off, num_bits); - if (ocfs2_is_cluster_bitmap(alloc_inode)) - journal_type = OCFS2_JOURNAL_ACCESS_UNDO; - + BUG_ON(undo_fn && !ocfs2_is_cluster_bitmap(alloc_inode)); status = ocfs2_journal_access_gd(handle, INODE_CACHE(alloc_inode), - group_bh, journal_type); + group_bh, + undo_fn ? + OCFS2_JOURNAL_ACCESS_UNDO : + OCFS2_JOURNAL_ACCESS_WRITE); if (status < 0) { mlog_errno(status); goto bail; } - if (ocfs2_is_cluster_bitmap(alloc_inode)) - cluster_bitmap = 1; - - if (cluster_bitmap) { + if (undo_fn) { jbd_lock_bh_state(group_bh); undo_bg = (struct ocfs2_group_desc *) bh2jh(group_bh)->b_committed_data; @@ -2023,13 +2014,13 @@ static inline int ocfs2_block_group_clear_bits(handle_t *handle, while(tmp--) { ocfs2_clear_bit((bit_off + tmp), (unsigned long *) bg->bg_bitmap); - if (cluster_bitmap) - ocfs2_set_bit(bit_off + tmp, - (unsigned long *) undo_bg->bg_bitmap); + if (undo_fn) + undo_fn(bit_off + tmp, + (unsigned long *) undo_bg->bg_bitmap); } le16_add_cpu(&bg->bg_free_bits_count, num_bits); - if (cluster_bitmap) + if (undo_fn) jbd_unlock_bh_state(group_bh); status = ocfs2_journal_dirty(handle, group_bh); @@ -2042,12 +2033,14 @@ bail: /* * expects the suballoc inode to already be locked. */ -int ocfs2_free_suballoc_bits(handle_t *handle, - struct inode *alloc_inode, - struct buffer_head *alloc_bh, - unsigned int start_bit, - u64 bg_blkno, - unsigned int count) +static int _ocfs2_free_suballoc_bits(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *alloc_bh, + unsigned int start_bit, + u64 bg_blkno, + unsigned int count, + void (*undo_fn)(unsigned int bit, + unsigned long *bitmap)) { int status = 0; u32 tmp_used; @@ -2082,7 +2075,7 @@ int ocfs2_free_suballoc_bits(handle_t *handle, status = ocfs2_block_group_clear_bits(handle, alloc_inode, group, group_bh, - start_bit, count); + start_bit, count, undo_fn); if (status < 0) { mlog_errno(status); goto bail; @@ -2113,6 +2106,17 @@ bail: return status; } +int ocfs2_free_suballoc_bits(handle_t *handle, + struct inode *alloc_inode, + struct buffer_head *alloc_bh, + unsigned int start_bit, + u64 bg_blkno, + unsigned int count) +{ + return _ocfs2_free_suballoc_bits(handle, alloc_inode, alloc_bh, + start_bit, bg_blkno, count, NULL); +} + int ocfs2_free_dinode(handle_t *handle, struct inode *inode_alloc_inode, struct buffer_head *inode_alloc_bh, @@ -2126,11 +2130,13 @@ int ocfs2_free_dinode(handle_t *handle, inode_alloc_bh, bit, bg_blkno, 1); } -int ocfs2_free_clusters(handle_t *handle, - struct inode *bitmap_inode, - struct buffer_head *bitmap_bh, - u64 start_blk, - unsigned int num_clusters) +static int _ocfs2_free_clusters(handle_t *handle, + struct inode *bitmap_inode, + struct buffer_head *bitmap_bh, + u64 start_blk, + unsigned int num_clusters, + void (*undo_fn)(unsigned int bit, + unsigned long *bitmap)) { int status; u16 bg_start_bit; @@ -2157,9 +2163,9 @@ int ocfs2_free_clusters(handle_t *handle, mlog(0, "bg_blkno = %llu, bg_start_bit = %u\n", (unsigned long long)bg_blkno, bg_start_bit); - status = ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, - bg_start_bit, bg_blkno, - num_clusters); + status = _ocfs2_free_suballoc_bits(handle, bitmap_inode, bitmap_bh, + bg_start_bit, bg_blkno, + num_clusters, undo_fn); if (status < 0) { mlog_errno(status); goto out; @@ -2173,6 +2179,32 @@ out: return status; } +int ocfs2_free_clusters(handle_t *handle, + struct inode *bitmap_inode, + struct buffer_head *bitmap_bh, + u64 start_blk, + unsigned int num_clusters) +{ + return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh, + start_blk, num_clusters, + _ocfs2_set_bit); +} + +/* + * Give never-used clusters back to the global bitmap. We don't need + * to protect these bits in the undo buffer. + */ +int ocfs2_release_clusters(handle_t *handle, + struct inode *bitmap_inode, + struct buffer_head *bitmap_bh, + u64 start_blk, + unsigned int num_clusters) +{ + return _ocfs2_free_clusters(handle, bitmap_inode, bitmap_bh, + start_blk, num_clusters, + _ocfs2_clear_bit); +} + static inline void ocfs2_debug_bg(struct ocfs2_group_desc *bg) { printk("Block Group:\n"); diff --git a/fs/ocfs2/suballoc.h b/fs/ocfs2/suballoc.h index fa60723c43e8..e0f46df357e6 100644 --- a/fs/ocfs2/suballoc.h +++ b/fs/ocfs2/suballoc.h @@ -127,6 +127,11 @@ int ocfs2_free_clusters(handle_t *handle, struct buffer_head *bitmap_bh, u64 start_blk, unsigned int num_clusters); +int ocfs2_release_clusters(handle_t *handle, + struct inode *bitmap_inode, + struct buffer_head *bitmap_bh, + u64 start_blk, + unsigned int num_clusters); static inline u64 ocfs2_which_suballoc_group(u64 block, unsigned int bit) { From 3939fda4b389993caf8741df5739b3e49f33a263 Mon Sep 17 00:00:00 2001 From: Tristan Ye Date: Fri, 19 Mar 2010 09:21:09 +0800 Subject: [PATCH 09/11] Ocfs2: Journaling i_flags and i_orphaned_slot when adding inode to orphan dir. Currently, some callers were missing to journal the dirty inode after adding it to orphan dir. Now we're going to journal such modifications within the ocfs2_orphan_add() itself, It's safe to do so, though some existing caller may duplicate this, and it makes the logic look more straightforward anyway. Signed-off-by: Tristan Ye Signed-off-by: Joel Becker --- fs/ocfs2/namei.c | 28 +++++++++++++++++++++++----- 1 file changed, 23 insertions(+), 5 deletions(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index d9cd4e373a53..b1eb50ae4097 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -84,7 +84,7 @@ static int ocfs2_prepare_orphan_dir(struct ocfs2_super *osb, static int ocfs2_orphan_add(struct ocfs2_super *osb, handle_t *handle, struct inode *inode, - struct ocfs2_dinode *fe, + struct buffer_head *fe_bh, char *name, struct ocfs2_dir_lookup_result *lookup, struct inode *orphan_dir_inode); @@ -879,7 +879,7 @@ static int ocfs2_unlink(struct inode *dir, fe = (struct ocfs2_dinode *) fe_bh->b_data; if (inode_is_unlinkable(inode)) { - status = ocfs2_orphan_add(osb, handle, inode, fe, orphan_name, + status = ocfs2_orphan_add(osb, handle, inode, fe_bh, orphan_name, &orphan_insert, orphan_dir); if (status < 0) { mlog_errno(status); @@ -1300,7 +1300,7 @@ static int ocfs2_rename(struct inode *old_dir, if (S_ISDIR(new_inode->i_mode) || (ocfs2_read_links_count(newfe) == 1)) { status = ocfs2_orphan_add(osb, handle, new_inode, - newfe, orphan_name, + newfe_bh, orphan_name, &orphan_insert, orphan_dir); if (status < 0) { mlog_errno(status); @@ -1911,7 +1911,7 @@ leave: static int ocfs2_orphan_add(struct ocfs2_super *osb, handle_t *handle, struct inode *inode, - struct ocfs2_dinode *fe, + struct buffer_head *fe_bh, char *name, struct ocfs2_dir_lookup_result *lookup, struct inode *orphan_dir_inode) @@ -1919,6 +1919,7 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, struct buffer_head *orphan_dir_bh = NULL; int status = 0; struct ocfs2_dinode *orphan_fe; + struct ocfs2_dinode *fe = (struct ocfs2_dinode *) fe_bh->b_data; mlog_entry("(inode->i_ino = %lu)\n", inode->i_ino); @@ -1959,6 +1960,21 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, goto leave; } + /* + * We're going to journal the change of i_flags and i_orphaned_slot. + * It's safe anyway, though some callers may duplicate the journaling. + * Journaling within the func just make the logic look more + * straightforward. + */ + status = ocfs2_journal_access_di(handle, + INODE_CACHE(inode), + fe_bh, + OCFS2_JOURNAL_ACCESS_WRITE); + if (status < 0) { + mlog_errno(status); + goto leave; + } + le32_add_cpu(&fe->i_flags, OCFS2_ORPHANED_FL); /* Record which orphan dir our inode now resides @@ -1966,6 +1982,8 @@ static int ocfs2_orphan_add(struct ocfs2_super *osb, * dir to lock. */ fe->i_orphaned_slot = cpu_to_le16(osb->slot_num); + ocfs2_journal_dirty(handle, fe_bh); + mlog(0, "Inode %llu orphaned in slot %d\n", (unsigned long long)OCFS2_I(inode)->ip_blkno, osb->slot_num); @@ -2123,7 +2141,7 @@ int ocfs2_create_inode_in_orphan(struct inode *dir, } di = (struct ocfs2_dinode *)new_di_bh->b_data; - status = ocfs2_orphan_add(osb, handle, inode, di, orphan_name, + status = ocfs2_orphan_add(osb, handle, inode, new_di_bh, orphan_name, &orphan_insert, orphan_dir); if (status < 0) { mlog_errno(status); From b54c2ca475fa7d7450a45b6d778dae9dbe0bcbfe Mon Sep 17 00:00:00 2001 From: Tristan Ye Date: Fri, 19 Mar 2010 09:21:10 +0800 Subject: [PATCH 10/11] Ocfs2: Handle deletion of reflinked oprhan inodes correctly. The rule is that all inodes in the orphan dir have ORPHANED_FL, otherwise we treated it as an ERROR. This rule works well except for some rare cases of reflink operation: http://oss.oracle.com/bugzilla/show_bug.cgi?id=1215 The problem is caused by how reflink and our orphan_scan thread interact. * The orphan scan pulls the orphans into a queue first, then runs the queue at a later time. We only hold the orphan_dir's lock during scanning. * Reflink create a oprhaned target in orphan_dir as its first step. It removes the target and clears the flag as the final step. These two steps take the orphan_dir's lock, but it is not held for the duration. Based on the above semantics, a reflink inode can be moved out of the orphan dir and have its ORPHANED_FL cleared before the queue of orphans is run. This leads to a ERROR in ocfs2_query_wipde_inode(). This patch teaches ocfs2_query_wipe_inode() to detect previously orphaned reflink targets. If a reflink fails or a crash occurs during the relfink operation, the inode will retain ORPHANED_FL and will be properly wiped. Signed-off-by: Tristan Ye Signed-off-by: Joel Becker --- fs/ocfs2/inode.c | 15 +++++++++++++++ 1 file changed, 15 insertions(+) diff --git a/fs/ocfs2/inode.c b/fs/ocfs2/inode.c index 278a223aae14..ab207901d32a 100644 --- a/fs/ocfs2/inode.c +++ b/fs/ocfs2/inode.c @@ -891,6 +891,21 @@ static int ocfs2_query_inode_wipe(struct inode *inode, /* Do some basic inode verification... */ di = (struct ocfs2_dinode *) di_bh->b_data; if (!(di->i_flags & cpu_to_le32(OCFS2_ORPHANED_FL))) { + /* + * Inodes in the orphan dir must have ORPHANED_FL. The only + * inodes that come back out of the orphan dir are reflink + * targets. A reflink target may be moved out of the orphan + * dir between the time we scan the directory and the time we + * process it. This would lead to HAS_REFCOUNT_FL being set but + * ORPHANED_FL not. + */ + if (di->i_dyn_features & cpu_to_le16(OCFS2_HAS_REFCOUNT_FL)) { + mlog(0, "Reflinked inode %llu is no longer orphaned. " + "it shouldn't be deleted\n", + (unsigned long long)oi->ip_blkno); + goto bail; + } + /* for lack of a better error? */ status = -EEXIST; mlog(ML_ERROR, From 14741472a05245ed5778aa0aec055e1f920b6ef8 Mon Sep 17 00:00:00 2001 From: Srinivas Eeda Date: Mon, 22 Mar 2010 16:50:47 -0700 Subject: [PATCH 11/11] ocfs2: Fix a race in o2dlm lockres mastery In o2dlm, the master of a lock resource keeps a map of all interested nodes. This prevents the master from purging the resource before an interested node can create a lock. A race between the mastery thread and the mastery handler allowed an interested node to discover who the master is without informing the master directly. This is easily fixed by holding the dlm spinlock a little longer in the mastery handler. Signed-off-by: Srinivas Eeda Signed-off-by: Joel Becker --- fs/ocfs2/dlm/dlmmaster.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/ocfs2/dlm/dlmmaster.c b/fs/ocfs2/dlm/dlmmaster.c index a659606dcb95..9289b4357d27 100644 --- a/fs/ocfs2/dlm/dlmmaster.c +++ b/fs/ocfs2/dlm/dlmmaster.c @@ -1875,7 +1875,6 @@ int dlm_assert_master_handler(struct o2net_msg *msg, u32 len, void *data, ok: spin_unlock(&res->spinlock); } - spin_unlock(&dlm->spinlock); // mlog(0, "woo! got an assert_master from node %u!\n", // assert->node_idx); @@ -1926,7 +1925,6 @@ ok: /* master is known, detach if not already detached. * ensures that only one assert_master call will happen * on this mle. */ - spin_lock(&dlm->spinlock); spin_lock(&dlm->master_lock); rr = atomic_read(&mle->mle_refs.refcount); @@ -1959,7 +1957,6 @@ ok: __dlm_put_mle(mle); } spin_unlock(&dlm->master_lock); - spin_unlock(&dlm->spinlock); } else if (res) { if (res->owner != assert->node_idx) { mlog(0, "assert_master from %u, but current " @@ -1967,6 +1964,7 @@ ok: res->owner, namelen, name); } } + spin_unlock(&dlm->spinlock); done: ret = 0;