From f3e0f3da1b65e84ea82176c1cda03a4b694c9911 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 16 Jun 2015 02:35:14 -0400 Subject: [PATCH 01/65] ufs: kill more lock_ufs() calls a) move it inside ufs_truncate() b) ufs_free_inode() doesn't need it - it's serialized on ->s_lock c) ufs_write_inode() doesn't need it either (and can be called without it anyway). Signed-off-by: Al Viro --- fs/ufs/inode.c | 13 ++----------- fs/ufs/truncate.c | 4 ++-- 2 files changed, 4 insertions(+), 13 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index f913a6924b23..0e4d88e0e709 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -867,11 +867,7 @@ static int ufs_update_inode(struct inode * inode, int do_sync) int ufs_write_inode(struct inode *inode, struct writeback_control *wbc) { - int ret; - lock_ufs(inode->i_sb); - ret = ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); - unlock_ufs(inode->i_sb); - return ret; + return ufs_update_inode(inode, wbc->sync_mode == WB_SYNC_ALL); } int ufs_sync_inode (struct inode *inode) @@ -890,22 +886,17 @@ void ufs_evict_inode(struct inode * inode) if (want_delete) { loff_t old_i_size; /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/ - lock_ufs(inode->i_sb); mark_inode_dirty(inode); ufs_update_inode(inode, IS_SYNC(inode)); old_i_size = inode->i_size; inode->i_size = 0; if (inode->i_blocks && ufs_truncate(inode, old_i_size)) ufs_warning(inode->i_sb, __func__, "ufs_truncate failed\n"); - unlock_ufs(inode->i_sb); } invalidate_inode_buffers(inode); clear_inode(inode); - if (want_delete) { - lock_ufs(inode->i_sb); + if (want_delete) ufs_free_inode(inode); - unlock_ufs(inode->i_sb); - } } diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index 21154704c168..90cf3a76c500 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -457,6 +457,7 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size) if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return -EPERM; + lock_ufs(sb); err = ufs_alloc_lastblock(inode); if (err) { @@ -486,6 +487,7 @@ int 
ufs_truncate(struct inode *inode, loff_t old_i_size) ufsi->i_lastfrag = DIRECT_FRAGMENT; mark_inode_dirty(inode); out: + unlock_ufs(sb); UFSD("EXIT: err %d\n", err); return err; } @@ -506,9 +508,7 @@ int ufs_setattr(struct dentry *dentry, struct iattr *attr) /* XXX(truncate): truncate_setsize should be called last */ truncate_setsize(inode, attr->ia_size); - lock_ufs(inode->i_sb); error = ufs_truncate(inode, old_i_size); - unlock_ufs(inode->i_sb); if (error) return error; } From d622f167b8435c856376edec130053fb56bf83e4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 16 Jun 2015 18:04:16 -0400 Subject: [PATCH 02/65] ufs: switch ufs_evict_inode() to trimmed-down variant of ufs_truncate() Signed-off-by: Al Viro --- fs/ufs/inode.c | 9 ++----- fs/ufs/truncate.c | 60 ++++++++++++++++++++++++++++++++--------------- fs/ufs/ufs.h | 2 +- 3 files changed, 44 insertions(+), 27 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 0e4d88e0e709..282b0ced6272 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -884,14 +884,9 @@ void ufs_evict_inode(struct inode * inode) truncate_inode_pages_final(&inode->i_data); if (want_delete) { - loff_t old_i_size; - /*UFS_I(inode)->i_dtime = CURRENT_TIME;*/ - mark_inode_dirty(inode); - ufs_update_inode(inode, IS_SYNC(inode)); - old_i_size = inode->i_size; inode->i_size = 0; - if (inode->i_blocks && ufs_truncate(inode, old_i_size)) - ufs_warning(inode->i_sb, __func__, "ufs_truncate failed\n"); + if (inode->i_blocks) + ufs_truncate_blocks(inode); } invalidate_inode_buffers(inode); diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index 90cf3a76c500..5a2e7082a0ae 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -440,12 +440,36 @@ out: return err; } -int ufs_truncate(struct inode *inode, loff_t old_i_size) +static void __ufs_truncate_blocks(struct inode *inode) { struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - int retry, err = 0; + 
int retry; + + while (1) { + retry = ufs_trunc_direct(inode); + retry |= ufs_trunc_indirect(inode, UFS_IND_BLOCK, + ufs_get_direct_data_ptr(uspi, ufsi, + UFS_IND_BLOCK)); + retry |= ufs_trunc_dindirect(inode, UFS_IND_BLOCK + uspi->s_apb, + ufs_get_direct_data_ptr(uspi, ufsi, + UFS_DIND_BLOCK)); + retry |= ufs_trunc_tindirect (inode); + if (!retry) + break; + if (IS_SYNC(inode) && (inode->i_state & I_DIRTY)) + ufs_sync_inode (inode); + yield(); + } + + ufsi->i_lastfrag = DIRECT_FRAGMENT; +} + +int ufs_truncate(struct inode *inode, loff_t old_i_size) +{ + struct super_block *sb = inode->i_sb; + int err = 0; UFSD("ENTER: ino %lu, i_size: %llu, old_i_size: %llu\n", inode->i_ino, (unsigned long long)i_size_read(inode), @@ -467,24 +491,8 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size) block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block); - while (1) { - retry = ufs_trunc_direct(inode); - retry |= ufs_trunc_indirect(inode, UFS_IND_BLOCK, - ufs_get_direct_data_ptr(uspi, ufsi, - UFS_IND_BLOCK)); - retry |= ufs_trunc_dindirect(inode, UFS_IND_BLOCK + uspi->s_apb, - ufs_get_direct_data_ptr(uspi, ufsi, - UFS_DIND_BLOCK)); - retry |= ufs_trunc_tindirect (inode); - if (!retry) - break; - if (IS_SYNC(inode) && (inode->i_state & I_DIRTY)) - ufs_sync_inode (inode); - yield(); - } - + __ufs_truncate_blocks(inode); inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; - ufsi->i_lastfrag = DIRECT_FRAGMENT; mark_inode_dirty(inode); out: unlock_ufs(sb); @@ -492,6 +500,20 @@ out: return err; } +void ufs_truncate_blocks(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + + lock_ufs(sb); + __ufs_truncate_blocks(inode); + unlock_ufs(sb); +} + int ufs_setattr(struct dentry *dentry, struct iattr *attr) { struct inode *inode = d_inode(dentry); diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 
2e31ea2e35a3..43fcab381de1 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -141,7 +141,7 @@ extern const struct inode_operations ufs_fast_symlink_inode_operations; extern const struct inode_operations ufs_symlink_inode_operations; /* truncate.c */ -extern int ufs_truncate (struct inode *, loff_t); +extern void ufs_truncate_blocks(struct inode *); extern int ufs_setattr(struct dentry *dentry, struct iattr *attr); static inline struct ufs_sb_info *UFS_SB(struct super_block *sb) From 3b7a3a05e8b006a73c406230b3d2d3da920779d9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 16 Jun 2015 18:06:40 -0400 Subject: [PATCH 03/65] ufs: free excessive blocks upon ->write_begin() failure/short copy Broken in "[PATCH] ufs: truncate should allocate block for last byte"; all the way back in 2006. ufs_setattr() hadn't been the only user of vmtruncate() and eliminating ->truncate() method required corrections in a bunch of places. Eventually those places had migrated into ->write_begin() failure exit and ->write_end() after short copy... 
Signed-off-by: Al Viro --- fs/ufs/inode.c | 18 ++++++++++++++++-- 1 file changed, 16 insertions(+), 2 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 282b0ced6272..a4fc3adfdc4c 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -530,8 +530,10 @@ static void ufs_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; - if (to > inode->i_size) + if (to > inode->i_size) { truncate_pagecache(inode, inode->i_size); + ufs_truncate_blocks(inode); + } } static int ufs_write_begin(struct file *file, struct address_space *mapping, @@ -548,6 +550,18 @@ static int ufs_write_begin(struct file *file, struct address_space *mapping, return ret; } +static int ufs_write_end(struct file *file, struct address_space *mapping, + loff_t pos, unsigned len, unsigned copied, + struct page *page, void *fsdata) +{ + int ret; + + ret = generic_write_end(file, mapping, pos, len, copied, page, fsdata); + if (ret < len) + ufs_write_failed(mapping, pos + len); + return ret; +} + static sector_t ufs_bmap(struct address_space *mapping, sector_t block) { return generic_block_bmap(mapping,block,ufs_getfrag_block); @@ -557,7 +571,7 @@ const struct address_space_operations ufs_aops = { .readpage = ufs_readpage, .writepage = ufs_writepage, .write_begin = ufs_write_begin, - .write_end = generic_write_end, + .write_end = ufs_write_end, .bmap = ufs_bmap }; From 2401aa29ab5c42cc34853a5c1457fbf66593690f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 16 Jun 2015 18:15:07 -0400 Subject: [PATCH 04/65] ufs: move truncate_setsize() down into ufs_truncate() just prior to __ufs_truncate_blocks(), with matching change of calling conventions Signed-off-by: Al Viro --- fs/ufs/truncate.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-) diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index 5a2e7082a0ae..6f56036ff724 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -370,7 +370,7 @@ static int ufs_trunc_tindirect(struct 
inode *inode) return retry; } -static int ufs_alloc_lastblock(struct inode *inode) +static int ufs_alloc_lastblock(struct inode *inode, loff_t size) { int err = 0; struct super_block *sb = inode->i_sb; @@ -382,7 +382,7 @@ static int ufs_alloc_lastblock(struct inode *inode) struct buffer_head *bh; u64 phys64; - lastfrag = (i_size_read(inode) + uspi->s_fsize - 1) >> uspi->s_fshift; + lastfrag = (size + uspi->s_fsize - 1) >> uspi->s_fshift; if (!lastfrag) goto out; @@ -466,14 +466,14 @@ static void __ufs_truncate_blocks(struct inode *inode) ufsi->i_lastfrag = DIRECT_FRAGMENT; } -int ufs_truncate(struct inode *inode, loff_t old_i_size) +int ufs_truncate(struct inode *inode, loff_t size) { struct super_block *sb = inode->i_sb; int err = 0; UFSD("ENTER: ino %lu, i_size: %llu, old_i_size: %llu\n", - inode->i_ino, (unsigned long long)i_size_read(inode), - (unsigned long long)old_i_size); + inode->i_ino, (unsigned long long)size, + (unsigned long long)i_size_read(inode)); if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) @@ -482,14 +482,14 @@ int ufs_truncate(struct inode *inode, loff_t old_i_size) return -EPERM; lock_ufs(sb); - err = ufs_alloc_lastblock(inode); + err = ufs_alloc_lastblock(inode, size); - if (err) { - i_size_write(inode, old_i_size); + if (err) goto out; - } - block_truncate_page(inode->i_mapping, inode->i_size, ufs_getfrag_block); + block_truncate_page(inode->i_mapping, size, ufs_getfrag_block); + + truncate_setsize(inode, size); __ufs_truncate_blocks(inode); inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; @@ -525,12 +525,7 @@ int ufs_setattr(struct dentry *dentry, struct iattr *attr) return error; if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { - loff_t old_i_size = inode->i_size; - - /* XXX(truncate): truncate_setsize should be called last */ - truncate_setsize(inode, attr->ia_size); - - error = ufs_truncate(inode, old_i_size); + error = ufs_truncate(inode, attr->ia_size); if (error) return error; } From 
493b4537a26b104fb3bd07ff4a46b6ede4288e76 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 16 Jun 2015 18:17:28 -0400 Subject: [PATCH 05/65] ufs: move lock_ufs() down into __ufs_truncate_blocks() Signed-off-by: Al Viro --- fs/ufs/truncate.c | 9 ++------- 1 file changed, 2 insertions(+), 7 deletions(-) diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index 6f56036ff724..155e13aea80c 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -447,6 +447,7 @@ static void __ufs_truncate_blocks(struct inode *inode) struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; int retry; + lock_ufs(sb); while (1) { retry = ufs_trunc_direct(inode); retry |= ufs_trunc_indirect(inode, UFS_IND_BLOCK, @@ -464,11 +465,11 @@ static void __ufs_truncate_blocks(struct inode *inode) } ufsi->i_lastfrag = DIRECT_FRAGMENT; + unlock_ufs(sb); } int ufs_truncate(struct inode *inode, loff_t size) { - struct super_block *sb = inode->i_sb; int err = 0; UFSD("ENTER: ino %lu, i_size: %llu, old_i_size: %llu\n", @@ -481,7 +482,6 @@ int ufs_truncate(struct inode *inode, loff_t size) if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return -EPERM; - lock_ufs(sb); err = ufs_alloc_lastblock(inode, size); if (err) @@ -495,23 +495,18 @@ int ufs_truncate(struct inode *inode, loff_t size) inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); out: - unlock_ufs(sb); UFSD("EXIT: err %d\n", err); return err; } void ufs_truncate_blocks(struct inode *inode) { - struct super_block *sb = inode->i_sb; if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) return; if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) return; - - lock_ufs(sb); __ufs_truncate_blocks(inode); - unlock_ufs(sb); } int ufs_setattr(struct dentry *dentry, struct iattr *attr) From 4af7b2c080715b9452fdaefb7ada72b4dc79593e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 17 Jun 2015 19:26:18 -0400 Subject: [PATCH 06/65] ufs: bforget() indirect blocks before freeing them right now it doesn't matter (lock_ufs() 
serializes everything), but when we switch to per-inode locking, it will be needed. Signed-off-by: Al Viro --- fs/ufs/truncate.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index 155e13aea80c..9908a6045d7a 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -237,9 +237,9 @@ static int ufs_trunc_indirect(struct inode *inode, u64 offset, void *p) tmp = ufs_data_ptr_to_cpu(sb, p); ufs_data_ptr_clear(uspi, p); + ubh_bforget(ind_ubh); ufs_free_blocks (inode, tmp, uspi->s_fpb); mark_inode_dirty(inode); - ubh_bforget(ind_ubh); ind_ubh = NULL; } if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh)) @@ -299,9 +299,9 @@ static int ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p) tmp = ufs_data_ptr_to_cpu(sb, p); ufs_data_ptr_clear(uspi, p); + ubh_bforget(dind_bh); ufs_free_blocks(inode, tmp, uspi->s_fpb); mark_inode_dirty(inode); - ubh_bforget(dind_bh); dind_bh = NULL; } if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh)) @@ -357,9 +357,9 @@ static int ufs_trunc_tindirect(struct inode *inode) tmp = ufs_data_ptr_to_cpu(sb, p); ufs_data_ptr_clear(uspi, p); + ubh_bforget(tind_bh); ufs_free_blocks(inode, tmp, uspi->s_fpb); mark_inode_dirty(inode); - ubh_bforget(tind_bh); tind_bh = NULL; } if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh)) From 724bb09fdc06d4ff03757b25d6dba9ef1b133e8f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 17 Jun 2015 12:02:56 -0400 Subject: [PATCH 07/65] ufs: don't use lock_ufs() for block pointers tree protection * stores to block pointers are under per-inode seqlock (meta_lock) and mutex (truncate_mutex) * fetches of block pointers are either under truncate_mutex, or wrapped into seqretry loop on meta_lock * all changes of ->i_size are under truncate_mutex and i_mutex * all changes of ->i_lastfrag are under truncate_mutex It's similar to what ext2 is doing; the main difference is that unlike ext2 we can't rely upon the atomicity of stores into 
block pointers - on UFS2 they are 64bit. So we can't cut the corner when switching a pointer from NULL to non-NULL as we could in ext2_splice_branch() and need to use meta_lock on all modifications. We use seqlock where ext2 uses rwlock; ext2 could probably also benefit from such change... Another non-trivial difference is that with UFS we *cannot* have reader grab truncate_mutex in case of race - it has to keep retrying. That might be possible to change, but not until we lift tail unpacking several levels up in call chain. After that commit we do *NOT* hold fs-wide serialization on accesses to block pointers anymore. Moreover, lock_ufs() can become a normal mutex now - it's only used on statfs, remount and sync_fs and none of those uses are recursive. As a matter of fact, *now* it can be collapsed with ->s_lock, and be eventually replaced with saner per-cylinder-group spinlocks, but that's a separate story. Signed-off-by: Al Viro --- fs/ufs/balloc.c | 4 ++ fs/ufs/inode.c | 138 +++++++++++++++++++++++++++++++--------------- fs/ufs/super.c | 2 + fs/ufs/truncate.c | 22 +++++++- fs/ufs/ufs.h | 2 + 5 files changed, 121 insertions(+), 47 deletions(-) diff --git a/fs/ufs/balloc.c b/fs/ufs/balloc.c index a7106eda5024..fb8b54eb77c5 100644 --- a/fs/ufs/balloc.c +++ b/fs/ufs/balloc.c @@ -417,7 +417,9 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, if (oldcount == 0) { result = ufs_alloc_fragments (inode, cgno, goal, count, err); if (result) { + write_seqlock(&UFS_I(inode)->meta_lock); ufs_cpu_to_data_ptr(sb, p, result); + write_sequnlock(&UFS_I(inode)->meta_lock); *err = 0; UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, fragment + count); @@ -473,7 +475,9 @@ u64 ufs_new_fragments(struct inode *inode, void *p, u64 fragment, ufs_change_blocknr(inode, fragment - oldcount, oldcount, uspi->s_sbbase + tmp, uspi->s_sbbase + result, locked_page); + write_seqlock(&UFS_I(inode)->meta_lock); ufs_cpu_to_data_ptr(sb, p, result); + 
write_sequnlock(&UFS_I(inode)->meta_lock); *err = 0; UFS_I(inode)->i_lastfrag = max(UFS_I(inode)->i_lastfrag, fragment + count); diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index a4fc3adfdc4c..100f93c6b309 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -41,8 +41,6 @@ #include "swab.h" #include "util.h" -static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock); - static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4]) { struct ufs_sb_private_info *uspi = UFS_SB(inode->i_sb)->s_uspi; @@ -75,12 +73,53 @@ static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t off return n; } +typedef struct { + void *p; + union { + __fs32 key32; + __fs64 key64; + }; + struct buffer_head *bh; +} Indirect; + +static inline int grow_chain32(struct ufs_inode_info *ufsi, + struct buffer_head *bh, __fs32 *v, + Indirect *from, Indirect *to) +{ + Indirect *p; + unsigned seq; + to->bh = bh; + do { + seq = read_seqbegin(&ufsi->meta_lock); + to->key32 = *(__fs32 *)(to->p = v); + for (p = from; p <= to && p->key32 == *(__fs32 *)p->p; p++) + ; + } while (read_seqretry(&ufsi->meta_lock, seq)); + return (p > to); +} + +static inline int grow_chain64(struct ufs_inode_info *ufsi, + struct buffer_head *bh, __fs64 *v, + Indirect *from, Indirect *to) +{ + Indirect *p; + unsigned seq; + to->bh = bh; + do { + seq = read_seqbegin(&ufsi->meta_lock); + to->key64 = *(__fs64 *)(to->p = v); + for (p = from; p <= to && p->key64 == *(__fs64 *)p->p; p++) + ; + } while (read_seqretry(&ufsi->meta_lock, seq)); + return (p > to); +} + /* * Returns the location of the fragment from * the beginning of the filesystem. 
*/ -static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock) +static u64 ufs_frag_map(struct inode *inode, sector_t frag) { struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block *sb = inode->i_sb; @@ -88,12 +127,10 @@ static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock) u64 mask = (u64) uspi->s_apbmask>>uspi->s_fpbshift; int shift = uspi->s_apbshift-uspi->s_fpbshift; sector_t offsets[4], *p; + Indirect chain[4], *q = chain; int depth = ufs_block_to_path(inode, frag >> uspi->s_fpbshift, offsets); - u64 ret = 0L; - __fs32 block; - __fs64 u2_block = 0L; unsigned flags = UFS_SB(sb)->s_flags; - u64 temp = 0L; + u64 res = 0; UFSD(": frag = %llu depth = %d\n", (unsigned long long)frag, depth); UFSD(": uspi->s_fpbshift = %d ,uspi->s_apbmask = %x, mask=%llx\n", @@ -101,59 +138,73 @@ static u64 ufs_frag_map(struct inode *inode, sector_t frag, bool needs_lock) (unsigned long long)mask); if (depth == 0) - return 0; + goto no_block; +again: p = offsets; - if (needs_lock) - lock_ufs(sb); if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) goto ufs2; - block = ufsi->i_u1.i_data[*p++]; - if (!block) - goto out; + if (!grow_chain32(ufsi, NULL, &ufsi->i_u1.i_data[*p++], chain, q)) + goto changed; + if (!q->key32) + goto no_block; while (--depth) { + __fs32 *ptr; struct buffer_head *bh; sector_t n = *p++; - bh = sb_bread(sb, uspi->s_sbbase + fs32_to_cpu(sb, block)+(n>>shift)); + bh = sb_bread(sb, uspi->s_sbbase + + fs32_to_cpu(sb, q->key32) + (n>>shift)); if (!bh) - goto out; - block = ((__fs32 *) bh->b_data)[n & mask]; - brelse (bh); - if (!block) - goto out; + goto no_block; + ptr = (__fs32 *)bh->b_data + (n & mask); + if (!grow_chain32(ufsi, bh, ptr, chain, ++q)) + goto changed; + if (!q->key32) + goto no_block; } - ret = (u64) (uspi->s_sbbase + fs32_to_cpu(sb, block) + (frag & uspi->s_fpbmask)); - goto out; + res = fs32_to_cpu(sb, q->key32); + goto found; + ufs2: - u2_block = ufsi->i_u1.u2_i_data[*p++]; - if (!u2_block) - goto out; - 
+ if (!grow_chain64(ufsi, NULL, &ufsi->i_u1.u2_i_data[*p++], chain, q)) + goto changed; + if (!q->key64) + goto no_block; while (--depth) { + __fs64 *ptr; struct buffer_head *bh; sector_t n = *p++; - - temp = (u64)(uspi->s_sbbase) + fs64_to_cpu(sb, u2_block); - bh = sb_bread(sb, temp +(u64) (n>>shift)); + bh = sb_bread(sb, uspi->s_sbbase + + fs64_to_cpu(sb, q->key64) + (n>>shift)); if (!bh) - goto out; - u2_block = ((__fs64 *)bh->b_data)[n & mask]; - brelse(bh); - if (!u2_block) - goto out; + goto no_block; + ptr = (__fs64 *)bh->b_data + (n & mask); + if (!grow_chain64(ufsi, bh, ptr, chain, ++q)) + goto changed; + if (!q->key64) + goto no_block; } - temp = (u64)uspi->s_sbbase + fs64_to_cpu(sb, u2_block); - ret = temp + (u64) (frag & uspi->s_fpbmask); + res = fs64_to_cpu(sb, q->key64); +found: + res += uspi->s_sbbase + (frag & uspi->s_fpbmask); +no_block: + while (q > chain) { + brelse(q->bh); + q--; + } + return res; -out: - if (needs_lock) - unlock_ufs(sb); - return ret; +changed: + while (q > chain) { + brelse(q->bh); + q--; + } + goto again; } /** @@ -421,10 +472,9 @@ int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head int ret, err, new; unsigned long ptr,phys; u64 phys64 = 0; - bool needs_lock = (sbi->mutex_owner != current); if (!create) { - phys64 = ufs_frag_map(inode, fragment, needs_lock); + phys64 = ufs_frag_map(inode, fragment); UFSD("phys64 = %llu\n", (unsigned long long)phys64); if (phys64) map_bh(bh_result, sb, phys64); @@ -438,8 +488,7 @@ int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head ret = 0; bh = NULL; - if (needs_lock) - lock_ufs(sb); + mutex_lock(&UFS_I(inode)->truncate_mutex); UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment); if (fragment > @@ -501,8 +550,7 @@ out: set_buffer_new(bh_result); map_bh(bh_result, sb, phys); abort: - if (needs_lock) - unlock_ufs(sb); + mutex_unlock(&UFS_I(inode)->truncate_mutex); return err; diff --git a/fs/ufs/super.c 
b/fs/ufs/super.c index 250579a80d90..15cd3338340c 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -1429,6 +1429,8 @@ static struct inode *ufs_alloc_inode(struct super_block *sb) return NULL; ei->vfs_inode.i_version = 1; + seqlock_init(&ei->meta_lock); + mutex_init(&ei->truncate_mutex); return &ei->vfs_inode; } diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index 9908a6045d7a..ad34b7f4b499 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -128,7 +128,9 @@ next1: tmp = ufs_data_ptr_to_cpu(sb, p); if (!tmp) continue; + write_seqlock(&ufsi->meta_lock); ufs_data_ptr_clear(uspi, p); + write_sequnlock(&ufsi->meta_lock); if (free_count == 0) { frag_to_free = tmp; @@ -157,7 +159,9 @@ next1: if (!tmp ) ufs_panic(sb, "ufs_truncate_direct", "internal error"); frag4 = ufs_fragnum (frag4); + write_seqlock(&ufsi->meta_lock); ufs_data_ptr_clear(uspi, p); + write_sequnlock(&ufsi->meta_lock); ufs_free_fragments (inode, tmp, frag4); mark_inode_dirty(inode); @@ -199,7 +203,9 @@ static int ufs_trunc_indirect(struct inode *inode, u64 offset, void *p) return 1; } if (!ind_ubh) { + write_seqlock(&UFS_I(inode)->meta_lock); ufs_data_ptr_clear(uspi, p); + write_sequnlock(&UFS_I(inode)->meta_lock); return 0; } @@ -210,7 +216,9 @@ static int ufs_trunc_indirect(struct inode *inode, u64 offset, void *p) if (!tmp) continue; + write_seqlock(&UFS_I(inode)->meta_lock); ufs_data_ptr_clear(uspi, ind); + write_sequnlock(&UFS_I(inode)->meta_lock); ubh_mark_buffer_dirty(ind_ubh); if (free_count == 0) { frag_to_free = tmp; @@ -235,7 +243,9 @@ static int ufs_trunc_indirect(struct inode *inode, u64 offset, void *p) break; if (i >= uspi->s_apb) { tmp = ufs_data_ptr_to_cpu(sb, p); + write_seqlock(&UFS_I(inode)->meta_lock); ufs_data_ptr_clear(uspi, p); + write_sequnlock(&UFS_I(inode)->meta_lock); ubh_bforget(ind_ubh); ufs_free_blocks (inode, tmp, uspi->s_fpb); @@ -278,7 +288,9 @@ static int ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p) return 1; } if (!dind_bh) { + 
write_seqlock(&UFS_I(inode)->meta_lock); ufs_data_ptr_clear(uspi, p); + write_sequnlock(&UFS_I(inode)->meta_lock); return 0; } @@ -297,7 +309,9 @@ static int ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p) break; if (i >= uspi->s_apb) { tmp = ufs_data_ptr_to_cpu(sb, p); + write_seqlock(&UFS_I(inode)->meta_lock); ufs_data_ptr_clear(uspi, p); + write_sequnlock(&UFS_I(inode)->meta_lock); ubh_bforget(dind_bh); ufs_free_blocks(inode, tmp, uspi->s_fpb); @@ -339,7 +353,9 @@ static int ufs_trunc_tindirect(struct inode *inode) return 1; } if (!tind_bh) { + write_seqlock(&ufsi->meta_lock); ufs_data_ptr_clear(uspi, p); + write_sequnlock(&ufsi->meta_lock); return 0; } @@ -355,7 +371,9 @@ static int ufs_trunc_tindirect(struct inode *inode) break; if (i >= uspi->s_apb) { tmp = ufs_data_ptr_to_cpu(sb, p); + write_seqlock(&ufsi->meta_lock); ufs_data_ptr_clear(uspi, p); + write_sequnlock(&ufsi->meta_lock); ubh_bforget(tind_bh); ufs_free_blocks(inode, tmp, uspi->s_fpb); @@ -447,7 +465,7 @@ static void __ufs_truncate_blocks(struct inode *inode) struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; int retry; - lock_ufs(sb); + mutex_lock(&ufsi->truncate_mutex); while (1) { retry = ufs_trunc_direct(inode); retry |= ufs_trunc_indirect(inode, UFS_IND_BLOCK, @@ -465,7 +483,7 @@ static void __ufs_truncate_blocks(struct inode *inode) } ufsi->i_lastfrag = DIRECT_FRAGMENT; - unlock_ufs(sb); + mutex_unlock(&ufsi->truncate_mutex); } int ufs_truncate(struct inode *inode, loff_t size) diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 43fcab381de1..ea28b73a8b74 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -46,6 +46,8 @@ struct ufs_inode_info { __u32 i_oeftflag; __u16 i_osync; __u64 i_lastfrag; + seqlock_t meta_lock; + struct mutex truncate_mutex; __u32 i_dir_start_lookup; struct inode vfs_inode; }; From dff7cfd36e305488421d82a0ed3dd0209c333745 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 16 Jun 2015 04:27:05 -0400 Subject: [PATCH 08/65] ufs: kill lock_ufs() There were 3 
remaining users; in two of them we took ->s_lock immediately after lock_ufs() and held it until just before unlock_ufs(); the third one (statfs) could not be called from itself or from other two (remount and sync_fs). Just use ->s_lock in statfs and don't bother with lock_ufs at all. Signed-off-by: Al Viro --- fs/ufs/super.c | 34 ++-------------------------------- fs/ufs/ufs.h | 5 ----- 2 files changed, 2 insertions(+), 37 deletions(-) diff --git a/fs/ufs/super.c b/fs/ufs/super.c index 15cd3338340c..f6390eec02ca 100644 --- a/fs/ufs/super.c +++ b/fs/ufs/super.c @@ -94,22 +94,6 @@ #include "swab.h" #include "util.h" -void lock_ufs(struct super_block *sb) -{ - struct ufs_sb_info *sbi = UFS_SB(sb); - - mutex_lock(&sbi->mutex); - sbi->mutex_owner = current; -} - -void unlock_ufs(struct super_block *sb) -{ - struct ufs_sb_info *sbi = UFS_SB(sb); - - sbi->mutex_owner = NULL; - mutex_unlock(&sbi->mutex); -} - static struct inode *ufs_nfs_get_inode(struct super_block *sb, u64 ino, u32 generation) { struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; @@ -694,7 +678,6 @@ static int ufs_sync_fs(struct super_block *sb, int wait) struct ufs_super_block_third * usb3; unsigned flags; - lock_ufs(sb); mutex_lock(&UFS_SB(sb)->s_lock); UFSD("ENTER\n"); @@ -714,7 +697,6 @@ static int ufs_sync_fs(struct super_block *sb, int wait) UFSD("EXIT\n"); mutex_unlock(&UFS_SB(sb)->s_lock); - unlock_ufs(sb); return 0; } @@ -758,7 +740,6 @@ static void ufs_put_super(struct super_block *sb) ubh_brelse_uspi (sbi->s_uspi); kfree (sbi->s_uspi); - mutex_destroy(&sbi->mutex); kfree (sbi); sb->s_fs_info = NULL; UFSD("EXIT\n"); @@ -801,7 +782,6 @@ static int ufs_fill_super(struct super_block *sb, void *data, int silent) UFSD("flag %u\n", (int)(sb->s_flags & MS_RDONLY)); - mutex_init(&sbi->mutex); mutex_init(&sbi->s_lock); spin_lock_init(&sbi->work_lock); INIT_DELAYED_WORK(&sbi->sync_work, delayed_sync_fs); @@ -1257,7 +1237,6 @@ magic_found: return 0; failed: - mutex_destroy(&sbi->mutex); if (ubh) 
ubh_brelse_uspi (uspi); kfree (uspi); @@ -1280,7 +1259,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) unsigned flags; sync_filesystem(sb); - lock_ufs(sb); mutex_lock(&UFS_SB(sb)->s_lock); uspi = UFS_SB(sb)->s_uspi; flags = UFS_SB(sb)->s_flags; @@ -1296,7 +1274,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) ufs_set_opt (new_mount_opt, ONERROR_LOCK); if (!ufs_parse_options (data, &new_mount_opt)) { mutex_unlock(&UFS_SB(sb)->s_lock); - unlock_ufs(sb); return -EINVAL; } if (!(new_mount_opt & UFS_MOUNT_UFSTYPE)) { @@ -1304,14 +1281,12 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) } else if ((new_mount_opt & UFS_MOUNT_UFSTYPE) != ufstype) { pr_err("ufstype can't be changed during remount\n"); mutex_unlock(&UFS_SB(sb)->s_lock); - unlock_ufs(sb); return -EINVAL; } if ((*mount_flags & MS_RDONLY) == (sb->s_flags & MS_RDONLY)) { UFS_SB(sb)->s_mount_opt = new_mount_opt; mutex_unlock(&UFS_SB(sb)->s_lock); - unlock_ufs(sb); return 0; } @@ -1335,7 +1310,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) #ifndef CONFIG_UFS_FS_WRITE pr_err("ufs was compiled with read-only support, can't be mounted as read-write\n"); mutex_unlock(&UFS_SB(sb)->s_lock); - unlock_ufs(sb); return -EINVAL; #else if (ufstype != UFS_MOUNT_UFSTYPE_SUN && @@ -1345,13 +1319,11 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) ufstype != UFS_MOUNT_UFSTYPE_UFS2) { pr_err("this ufstype is read-only supported\n"); mutex_unlock(&UFS_SB(sb)->s_lock); - unlock_ufs(sb); return -EINVAL; } if (!ufs_read_cylinder_structures(sb)) { pr_err("failed during remounting\n"); mutex_unlock(&UFS_SB(sb)->s_lock); - unlock_ufs(sb); return -EPERM; } sb->s_flags &= ~MS_RDONLY; @@ -1359,7 +1331,6 @@ static int ufs_remount (struct super_block *sb, int *mount_flags, char *data) } UFS_SB(sb)->s_mount_opt = new_mount_opt; mutex_unlock(&UFS_SB(sb)->s_lock); - 
unlock_ufs(sb); return 0; } @@ -1391,8 +1362,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf) struct ufs_super_block_third *usb3; u64 id = huge_encode_dev(sb->s_bdev->bd_dev); - lock_ufs(sb); - + mutex_lock(&UFS_SB(sb)->s_lock); usb3 = ubh_get_usb_third(uspi); if ((flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) { @@ -1413,7 +1383,7 @@ static int ufs_statfs(struct dentry *dentry, struct kstatfs *buf) buf->f_fsid.val[0] = (u32)id; buf->f_fsid.val[1] = (u32)(id >> 32); - unlock_ufs(sb); + mutex_unlock(&UFS_SB(sb)->s_lock); return 0; } diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index ea28b73a8b74..478f35b493a6 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -24,8 +24,6 @@ struct ufs_sb_info { unsigned s_cgno[UFS_MAX_GROUP_LOADED]; unsigned short s_cg_loaded; unsigned s_mount_opt; - struct mutex mutex; - struct task_struct *mutex_owner; struct super_block *sb; int work_queued; /* non-zero if the delayed work is queued */ struct delayed_work sync_work; /* FS sync delayed work */ @@ -172,7 +170,4 @@ static inline u32 ufs_dtogd(struct ufs_sb_private_info * uspi, u64 b) return do_div(b, uspi->s_fpg); } -extern void lock_ufs(struct super_block *sb); -extern void unlock_ufs(struct super_block *sb); - #endif /* _UFS_UFS_H */ From 6a799d3514217d217b4e74a1ee4f016428582dc5 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 16 Jun 2015 18:43:08 -0400 Subject: [PATCH 09/65] ufs: ufs_trunc_direct() always returns 0 make it return void Signed-off-by: Al Viro --- fs/ufs/truncate.c | 9 +++------ 1 file changed, 3 insertions(+), 6 deletions(-) diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index ad34b7f4b499..c56f4ef1cb7a 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -63,7 +63,7 @@ #define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift) -static int ufs_trunc_direct(struct inode *inode) +static void ufs_trunc_direct(struct inode *inode) { struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block * sb; @@ -72,7 +72,6 @@ static int 
ufs_trunc_direct(struct inode *inode) u64 frag1, frag2, frag3, frag4, block1, block2; unsigned frag_to_free, free_count; unsigned i, tmp; - int retry; UFSD("ENTER: ino %lu\n", inode->i_ino); @@ -81,7 +80,6 @@ static int ufs_trunc_direct(struct inode *inode) frag_to_free = 0; free_count = 0; - retry = 0; frag1 = DIRECT_FRAGMENT; frag4 = min_t(u64, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag); @@ -168,7 +166,6 @@ next1: next3: UFSD("EXIT: ino %lu\n", inode->i_ino); - return retry; } @@ -467,8 +464,8 @@ static void __ufs_truncate_blocks(struct inode *inode) mutex_lock(&ufsi->truncate_mutex); while (1) { - retry = ufs_trunc_direct(inode); - retry |= ufs_trunc_indirect(inode, UFS_IND_BLOCK, + ufs_trunc_direct(inode); + retry = ufs_trunc_indirect(inode, UFS_IND_BLOCK, ufs_get_direct_data_ptr(uspi, ufsi, UFS_IND_BLOCK)); retry |= ufs_trunc_dindirect(inode, UFS_IND_BLOCK + uspi->s_apb, From 687857930d9294100a4636e45b78a244e6ba4125 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 16 Jun 2015 18:45:21 -0400 Subject: [PATCH 10/65] ufs: ufs_trunc_...() has exclusion with everything that might cause allocations Currently - on lock_ufs(), eventually - on per-inode mutex. lock_ufs() used to be mere BKL, which is much weaker, so it needed those rechecks. BKL doesn't provide any exclusion once we lose CPU; its blind replacement, OTOH, _does_. Making that per-filesystem was an atrocity, but at least we can simplify life here. And yes, we certainly need to make that sucker per-inode - these days inode.c and truncate.c uses are needed only to protect the block pointers. 
Signed-off-by: Al Viro --- fs/ufs/truncate.c | 12 ------------ 1 file changed, 12 deletions(-) diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index c56f4ef1cb7a..3beaa848e30a 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -195,10 +195,6 @@ static int ufs_trunc_indirect(struct inode *inode, u64 offset, void *p) if (!tmp) return 0; ind_ubh = ubh_bread(sb, tmp, uspi->s_bsize); - if (tmp != ufs_data_ptr_to_cpu(sb, p)) { - ubh_brelse (ind_ubh); - return 1; - } if (!ind_ubh) { write_seqlock(&UFS_I(inode)->meta_lock); ufs_data_ptr_clear(uspi, p); @@ -280,10 +276,6 @@ static int ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p) if (!tmp) return 0; dind_bh = ubh_bread(sb, tmp, uspi->s_bsize); - if (tmp != ufs_data_ptr_to_cpu(sb, p)) { - ubh_brelse (dind_bh); - return 1; - } if (!dind_bh) { write_seqlock(&UFS_I(inode)->meta_lock); ufs_data_ptr_clear(uspi, p); @@ -345,10 +337,6 @@ static int ufs_trunc_tindirect(struct inode *inode) if (!(tmp = ufs_data_ptr_to_cpu(sb, p))) return 0; tind_bh = ubh_bread (sb, tmp, uspi->s_bsize); - if (tmp != ufs_data_ptr_to_cpu(sb, p)) { - ubh_brelse (tind_bh); - return 1; - } if (!tind_bh) { write_seqlock(&ufsi->meta_lock); ufs_data_ptr_clear(uspi, p); From 0d23cf7616253b7960edeae720b9f5dfdccee445 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 16 Jun 2015 18:52:28 -0400 Subject: [PATCH 11/65] ufs: no retries are needed on truncate Signed-off-by: Al Viro --- fs/ufs/truncate.c | 57 ++++++++++++++--------------------------------- 1 file changed, 17 insertions(+), 40 deletions(-) diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c index 3beaa848e30a..f84dd3078929 100644 --- a/fs/ufs/truncate.c +++ b/fs/ufs/truncate.c @@ -169,7 +169,7 @@ next1: } -static int ufs_trunc_indirect(struct inode *inode, u64 offset, void *p) +static void ufs_trunc_indirect(struct inode *inode, u64 offset, void *p) { struct super_block * sb; struct ufs_sb_private_info * uspi; @@ -177,7 +177,6 @@ static int ufs_trunc_indirect(struct inode 
*inode, u64 offset, void *p) void *ind; u64 tmp, indirect_block, i, frag_to_free; unsigned free_count; - int retry; UFSD("ENTER: ino %lu, offset %llu, p: %p\n", inode->i_ino, (unsigned long long)offset, p); @@ -189,17 +188,16 @@ static int ufs_trunc_indirect(struct inode *inode, u64 offset, void *p) frag_to_free = 0; free_count = 0; - retry = 0; tmp = ufs_data_ptr_to_cpu(sb, p); if (!tmp) - return 0; + return; ind_ubh = ubh_bread(sb, tmp, uspi->s_bsize); if (!ind_ubh) { write_seqlock(&UFS_I(inode)->meta_lock); ufs_data_ptr_clear(uspi, p); write_sequnlock(&UFS_I(inode)->meta_lock); - return 0; + return; } indirect_block = (DIRECT_BLOCK > offset) ? (DIRECT_BLOCK - offset) : 0; @@ -250,18 +248,15 @@ static int ufs_trunc_indirect(struct inode *inode, u64 offset, void *p) ubh_brelse (ind_ubh); UFSD("EXIT: ino %lu\n", inode->i_ino); - - return retry; } -static int ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p) +static void ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p) { struct super_block * sb; struct ufs_sb_private_info * uspi; struct ufs_buffer_head *dind_bh; u64 i, tmp, dindirect_block; void *dind; - int retry = 0; UFSD("ENTER: ino %lu\n", inode->i_ino); @@ -270,17 +265,16 @@ static int ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p) dindirect_block = (DIRECT_BLOCK > offset) ? 
((DIRECT_BLOCK - offset) >> uspi->s_apbshift) : 0; - retry = 0; tmp = ufs_data_ptr_to_cpu(sb, p); if (!tmp) - return 0; + return; dind_bh = ubh_bread(sb, tmp, uspi->s_bsize); if (!dind_bh) { write_seqlock(&UFS_I(inode)->meta_lock); ufs_data_ptr_clear(uspi, p); write_sequnlock(&UFS_I(inode)->meta_lock); - return 0; + return; } for (i = dindirect_block ; i < uspi->s_apb ; i++) { @@ -288,7 +282,7 @@ static int ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p) tmp = ufs_data_ptr_to_cpu(sb, dind); if (!tmp) continue; - retry |= ufs_trunc_indirect (inode, offset + (i << uspi->s_apbshift), dind); + ufs_trunc_indirect (inode, offset + (i << uspi->s_apbshift), dind); ubh_mark_buffer_dirty(dind_bh); } @@ -312,11 +306,9 @@ static int ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p) ubh_brelse (dind_bh); UFSD("EXIT: ino %lu\n", inode->i_ino); - - return retry; } -static int ufs_trunc_tindirect(struct inode *inode) +static void ufs_trunc_tindirect(struct inode *inode) { struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; @@ -324,29 +316,26 @@ static int ufs_trunc_tindirect(struct inode *inode) struct ufs_buffer_head * tind_bh; u64 tindirect_block, tmp, i; void *tind, *p; - int retry; UFSD("ENTER: ino %lu\n", inode->i_ino); - retry = 0; - tindirect_block = (DIRECT_BLOCK > (UFS_NDADDR + uspi->s_apb + uspi->s_2apb)) ? 
((DIRECT_BLOCK - UFS_NDADDR - uspi->s_apb - uspi->s_2apb) >> uspi->s_2apbshift) : 0; p = ufs_get_direct_data_ptr(uspi, ufsi, UFS_TIND_BLOCK); if (!(tmp = ufs_data_ptr_to_cpu(sb, p))) - return 0; + return; tind_bh = ubh_bread (sb, tmp, uspi->s_bsize); if (!tind_bh) { write_seqlock(&ufsi->meta_lock); ufs_data_ptr_clear(uspi, p); write_sequnlock(&ufsi->meta_lock); - return 0; + return; } for (i = tindirect_block ; i < uspi->s_apb ; i++) { tind = ubh_get_data_ptr(uspi, tind_bh, i); - retry |= ufs_trunc_dindirect(inode, UFS_NDADDR + + ufs_trunc_dindirect(inode, UFS_NDADDR + uspi->s_apb + ((i + 1) << uspi->s_2apbshift), tind); ubh_mark_buffer_dirty(tind_bh); } @@ -370,7 +359,6 @@ static int ufs_trunc_tindirect(struct inode *inode) ubh_brelse (tind_bh); UFSD("EXIT: ino %lu\n", inode->i_ino); - return retry; } static int ufs_alloc_lastblock(struct inode *inode, loff_t size) @@ -448,25 +436,14 @@ static void __ufs_truncate_blocks(struct inode *inode) struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - int retry; mutex_lock(&ufsi->truncate_mutex); - while (1) { - ufs_trunc_direct(inode); - retry = ufs_trunc_indirect(inode, UFS_IND_BLOCK, - ufs_get_direct_data_ptr(uspi, ufsi, - UFS_IND_BLOCK)); - retry |= ufs_trunc_dindirect(inode, UFS_IND_BLOCK + uspi->s_apb, - ufs_get_direct_data_ptr(uspi, ufsi, - UFS_DIND_BLOCK)); - retry |= ufs_trunc_tindirect (inode); - if (!retry) - break; - if (IS_SYNC(inode) && (inode->i_state & I_DIRTY)) - ufs_sync_inode (inode); - yield(); - } - + ufs_trunc_direct(inode); + ufs_trunc_indirect(inode, UFS_IND_BLOCK, + ufs_get_direct_data_ptr(uspi, ufsi, UFS_IND_BLOCK)); + ufs_trunc_dindirect(inode, UFS_IND_BLOCK + uspi->s_apb, + ufs_get_direct_data_ptr(uspi, ufsi, UFS_DIND_BLOCK)); + ufs_trunc_tindirect(inode); ufsi->i_lastfrag = DIRECT_FRAGMENT; mutex_unlock(&ufsi->truncate_mutex); } From 010d331fc315c96607aa6ecdfebb9fcdd349fc9b Mon Sep 17 00:00:00 2001 From: Al 
Viro Date: Wed, 17 Jun 2015 12:44:14 -0400 Subject: [PATCH 12/65] ufs: move truncate code into inode.c It is closely tied to block pointers handling there, can benefit from existing helpers, etc. - no point keeping them apart. Trimmed the trailing whitespaces in inode.c at the same time. Signed-off-by: Al Viro --- fs/ufs/Makefile | 2 +- fs/ufs/inode.c | 480 ++++++++++++++++++++++++++++++++++++++++-- fs/ufs/truncate.c | 515 ---------------------------------------------- fs/ufs/ufs.h | 6 +- 4 files changed, 470 insertions(+), 533 deletions(-) delete mode 100644 fs/ufs/truncate.c diff --git a/fs/ufs/Makefile b/fs/ufs/Makefile index 4d0e02b022b3..392db25c0b56 100644 --- a/fs/ufs/Makefile +++ b/fs/ufs/Makefile @@ -5,5 +5,5 @@ obj-$(CONFIG_UFS_FS) += ufs.o ufs-objs := balloc.o cylinder.o dir.o file.o ialloc.o inode.o \ - namei.o super.o symlink.o truncate.o util.o + namei.o super.o symlink.o util.o ccflags-$(CONFIG_UFS_DEBUG) += -DDEBUG diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 100f93c6b309..ec758edbda47 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -126,7 +126,7 @@ static u64 ufs_frag_map(struct inode *inode, sector_t frag) struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; u64 mask = (u64) uspi->s_apbmask>>uspi->s_fpbshift; int shift = uspi->s_apbshift-uspi->s_fpbshift; - sector_t offsets[4], *p; + unsigned offsets[4], *p; Indirect chain[4], *q = chain; int depth = ufs_block_to_path(inode, frag >> uspi->s_fpbshift, offsets); unsigned flags = UFS_SB(sb)->s_flags; @@ -290,14 +290,14 @@ repeat: return NULL; } lastfrag = ufsi->i_lastfrag; - + } tmp = ufs_data_ptr_to_cpu(sb, ufs_get_direct_data_ptr(uspi, ufsi, lastblock)); if (tmp) goal = tmp + uspi->s_fpb; - tmp = ufs_new_fragments (inode, p, fragment - blockoff, + tmp = ufs_new_fragments (inode, p, fragment - blockoff, goal, required + blockoff, err, phys != NULL ? 
locked_page : NULL); @@ -436,7 +436,7 @@ repeat: if (ufs_data_ptr_to_cpu(sb, p)) goto repeat; goto out; - } + } if (!phys) { @@ -463,7 +463,7 @@ out: * readpage, writepage and so on */ -int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create) +static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create) { struct super_block * sb = inode->i_sb; struct ufs_sb_info * sbi = UFS_SB(sb); @@ -472,7 +472,7 @@ int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head int ret, err, new; unsigned long ptr,phys; u64 phys64 = 0; - + if (!create) { phys64 = ufs_frag_map(inode, fragment); UFSD("phys64 = %llu\n", (unsigned long long)phys64); @@ -498,7 +498,7 @@ int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head err = 0; ptr = fragment; - + /* * ok, these macros clean the logic up a bit and make * it much more readable: @@ -574,6 +574,8 @@ int ufs_prepare_chunk(struct page *page, loff_t pos, unsigned len) return __block_write_begin(page, pos, len, ufs_getfrag_block); } +static void ufs_truncate_blocks(struct inode *); + static void ufs_write_failed(struct address_space *mapping, loff_t to) { struct inode *inode = mapping->host; @@ -661,7 +663,7 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode) ufs_error (sb, "ufs_read_inode", "inode %lu has zero nlink\n", inode->i_ino); return -1; } - + /* * Linux now has 32-bit uid and gid, so we can support EFT. 
*/ @@ -681,7 +683,7 @@ static int ufs1_read_inode(struct inode *inode, struct ufs_inode *ufs_inode) ufsi->i_shadow = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_shadow); ufsi->i_oeftflag = fs32_to_cpu(sb, ufs_inode->ui_u3.ui_sun.ui_oeftflag); - + if (S_ISCHR(mode) || S_ISBLK(mode) || inode->i_blocks) { memcpy(ufsi->i_u1.i_data, &ufs_inode->ui_u2.ui_addr, sizeof(ufs_inode->ui_u2.ui_addr)); @@ -815,7 +817,7 @@ static void ufs1_update_inode(struct inode *inode, struct ufs_inode *ufs_inode) ufs_set_inode_uid(sb, ufs_inode, i_uid_read(inode)); ufs_set_inode_gid(sb, ufs_inode, i_gid_read(inode)); - + ufs_inode->ui_size = cpu_to_fs64(sb, inode->i_size); ufs_inode->ui_atime.tv_sec = cpu_to_fs32(sb, inode->i_atime.tv_sec); ufs_inode->ui_atime.tv_usec = 0; @@ -917,12 +919,12 @@ static int ufs_update_inode(struct inode * inode, int do_sync) ufs1_update_inode(inode, ufs_inode + ufs_inotofsbo(inode->i_ino)); } - + mark_buffer_dirty(bh); if (do_sync) sync_dirty_buffer(bh); brelse (bh); - + UFSD("EXIT\n"); return 0; } @@ -957,3 +959,457 @@ void ufs_evict_inode(struct inode * inode) if (want_delete) ufs_free_inode(inode); } + +#define DIRECT_BLOCK ((inode->i_size + uspi->s_bsize - 1) >> uspi->s_bshift) +#define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift) + +static void ufs_trunc_direct(struct inode *inode) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block * sb; + struct ufs_sb_private_info * uspi; + void *p; + u64 frag1, frag2, frag3, frag4, block1, block2; + unsigned frag_to_free, free_count; + unsigned i, tmp; + + UFSD("ENTER: ino %lu\n", inode->i_ino); + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + + frag_to_free = 0; + free_count = 0; + + frag1 = DIRECT_FRAGMENT; + frag4 = min_t(u64, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag); + frag2 = ((frag1 & uspi->s_fpbmask) ? 
((frag1 | uspi->s_fpbmask) + 1) : frag1); + frag3 = frag4 & ~uspi->s_fpbmask; + block1 = block2 = 0; + if (frag2 > frag3) { + frag2 = frag4; + frag3 = frag4 = 0; + } else if (frag2 < frag3) { + block1 = ufs_fragstoblks (frag2); + block2 = ufs_fragstoblks (frag3); + } + + UFSD("ino %lu, frag1 %llu, frag2 %llu, block1 %llu, block2 %llu," + " frag3 %llu, frag4 %llu\n", inode->i_ino, + (unsigned long long)frag1, (unsigned long long)frag2, + (unsigned long long)block1, (unsigned long long)block2, + (unsigned long long)frag3, (unsigned long long)frag4); + + if (frag1 >= frag2) + goto next1; + + /* + * Free first free fragments + */ + p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag1)); + tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp ) + ufs_panic (sb, "ufs_trunc_direct", "internal error"); + frag2 -= frag1; + frag1 = ufs_fragnum (frag1); + + ufs_free_fragments(inode, tmp + frag1, frag2); + mark_inode_dirty(inode); + frag_to_free = tmp + frag1; + +next1: + /* + * Free whole blocks + */ + for (i = block1 ; i < block2; i++) { + p = ufs_get_direct_data_ptr(uspi, ufsi, i); + tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp) + continue; + write_seqlock(&ufsi->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&ufsi->meta_lock); + + if (free_count == 0) { + frag_to_free = tmp; + free_count = uspi->s_fpb; + } else if (free_count > 0 && frag_to_free == tmp - free_count) + free_count += uspi->s_fpb; + else { + ufs_free_blocks (inode, frag_to_free, free_count); + frag_to_free = tmp; + free_count = uspi->s_fpb; + } + mark_inode_dirty(inode); + } + + if (free_count > 0) + ufs_free_blocks (inode, frag_to_free, free_count); + + if (frag3 >= frag4) + goto next3; + + /* + * Free last free fragments + */ + p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag3)); + tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp ) + ufs_panic(sb, "ufs_truncate_direct", "internal error"); + frag4 = ufs_fragnum (frag4); + write_seqlock(&ufsi->meta_lock); + ufs_data_ptr_clear(uspi, p); + 
write_sequnlock(&ufsi->meta_lock); + + ufs_free_fragments (inode, tmp, frag4); + mark_inode_dirty(inode); + next3: + + UFSD("EXIT: ino %lu\n", inode->i_ino); +} + + +static void ufs_trunc_indirect(struct inode *inode, u64 offset, void *p) +{ + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct ufs_buffer_head * ind_ubh; + void *ind; + u64 tmp, indirect_block, i, frag_to_free; + unsigned free_count; + + UFSD("ENTER: ino %lu, offset %llu, p: %p\n", + inode->i_ino, (unsigned long long)offset, p); + + BUG_ON(!p); + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + + frag_to_free = 0; + free_count = 0; + + tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp) + return; + ind_ubh = ubh_bread(sb, tmp, uspi->s_bsize); + if (!ind_ubh) { + write_seqlock(&UFS_I(inode)->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&UFS_I(inode)->meta_lock); + return; + } + + indirect_block = (DIRECT_BLOCK > offset) ? (DIRECT_BLOCK - offset) : 0; + for (i = indirect_block; i < uspi->s_apb; i++) { + ind = ubh_get_data_ptr(uspi, ind_ubh, i); + tmp = ufs_data_ptr_to_cpu(sb, ind); + if (!tmp) + continue; + + write_seqlock(&UFS_I(inode)->meta_lock); + ufs_data_ptr_clear(uspi, ind); + write_sequnlock(&UFS_I(inode)->meta_lock); + ubh_mark_buffer_dirty(ind_ubh); + if (free_count == 0) { + frag_to_free = tmp; + free_count = uspi->s_fpb; + } else if (free_count > 0 && frag_to_free == tmp - free_count) + free_count += uspi->s_fpb; + else { + ufs_free_blocks (inode, frag_to_free, free_count); + frag_to_free = tmp; + free_count = uspi->s_fpb; + } + + mark_inode_dirty(inode); + } + + if (free_count > 0) { + ufs_free_blocks (inode, frag_to_free, free_count); + } + for (i = 0; i < uspi->s_apb; i++) + if (!ufs_is_data_ptr_zero(uspi, + ubh_get_data_ptr(uspi, ind_ubh, i))) + break; + if (i >= uspi->s_apb) { + tmp = ufs_data_ptr_to_cpu(sb, p); + write_seqlock(&UFS_I(inode)->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&UFS_I(inode)->meta_lock); + + ubh_bforget(ind_ubh); 
+ ufs_free_blocks (inode, tmp, uspi->s_fpb); + mark_inode_dirty(inode); + ind_ubh = NULL; + } + if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh)) + ubh_sync_block(ind_ubh); + ubh_brelse (ind_ubh); + + UFSD("EXIT: ino %lu\n", inode->i_ino); +} + +static void ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p) +{ + struct super_block * sb; + struct ufs_sb_private_info * uspi; + struct ufs_buffer_head *dind_bh; + u64 i, tmp, dindirect_block; + void *dind; + + UFSD("ENTER: ino %lu\n", inode->i_ino); + + sb = inode->i_sb; + uspi = UFS_SB(sb)->s_uspi; + + dindirect_block = (DIRECT_BLOCK > offset) + ? ((DIRECT_BLOCK - offset) >> uspi->s_apbshift) : 0; + + tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp) + return; + dind_bh = ubh_bread(sb, tmp, uspi->s_bsize); + if (!dind_bh) { + write_seqlock(&UFS_I(inode)->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&UFS_I(inode)->meta_lock); + return; + } + + for (i = dindirect_block ; i < uspi->s_apb ; i++) { + dind = ubh_get_data_ptr(uspi, dind_bh, i); + tmp = ufs_data_ptr_to_cpu(sb, dind); + if (!tmp) + continue; + ufs_trunc_indirect (inode, offset + (i << uspi->s_apbshift), dind); + ubh_mark_buffer_dirty(dind_bh); + } + + for (i = 0; i < uspi->s_apb; i++) + if (!ufs_is_data_ptr_zero(uspi, + ubh_get_data_ptr(uspi, dind_bh, i))) + break; + if (i >= uspi->s_apb) { + tmp = ufs_data_ptr_to_cpu(sb, p); + write_seqlock(&UFS_I(inode)->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&UFS_I(inode)->meta_lock); + + ubh_bforget(dind_bh); + ufs_free_blocks(inode, tmp, uspi->s_fpb); + mark_inode_dirty(inode); + dind_bh = NULL; + } + if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh)) + ubh_sync_block(dind_bh); + ubh_brelse (dind_bh); + + UFSD("EXIT: ino %lu\n", inode->i_ino); +} + +static void ufs_trunc_tindirect(struct inode *inode) +{ + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + struct ufs_inode_info *ufsi = UFS_I(inode); + struct 
ufs_buffer_head * tind_bh; + u64 tindirect_block, tmp, i; + void *tind, *p; + + UFSD("ENTER: ino %lu\n", inode->i_ino); + + tindirect_block = (DIRECT_BLOCK > (UFS_NDADDR + uspi->s_apb + uspi->s_2apb)) + ? ((DIRECT_BLOCK - UFS_NDADDR - uspi->s_apb - uspi->s_2apb) >> uspi->s_2apbshift) : 0; + + p = ufs_get_direct_data_ptr(uspi, ufsi, UFS_TIND_BLOCK); + if (!(tmp = ufs_data_ptr_to_cpu(sb, p))) + return; + tind_bh = ubh_bread (sb, tmp, uspi->s_bsize); + if (!tind_bh) { + write_seqlock(&ufsi->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&ufsi->meta_lock); + return; + } + + for (i = tindirect_block ; i < uspi->s_apb ; i++) { + tind = ubh_get_data_ptr(uspi, tind_bh, i); + ufs_trunc_dindirect(inode, UFS_NDADDR + + uspi->s_apb + ((i + 1) << uspi->s_2apbshift), tind); + ubh_mark_buffer_dirty(tind_bh); + } + for (i = 0; i < uspi->s_apb; i++) + if (!ufs_is_data_ptr_zero(uspi, + ubh_get_data_ptr(uspi, tind_bh, i))) + break; + if (i >= uspi->s_apb) { + tmp = ufs_data_ptr_to_cpu(sb, p); + write_seqlock(&ufsi->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&ufsi->meta_lock); + + ubh_bforget(tind_bh); + ufs_free_blocks(inode, tmp, uspi->s_fpb); + mark_inode_dirty(inode); + tind_bh = NULL; + } + if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh)) + ubh_sync_block(tind_bh); + ubh_brelse (tind_bh); + + UFSD("EXIT: ino %lu\n", inode->i_ino); +} + +static int ufs_alloc_lastblock(struct inode *inode, loff_t size) +{ + int err = 0; + struct super_block *sb = inode->i_sb; + struct address_space *mapping = inode->i_mapping; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + unsigned i, end; + sector_t lastfrag; + struct page *lastpage; + struct buffer_head *bh; + u64 phys64; + + lastfrag = (size + uspi->s_fsize - 1) >> uspi->s_fshift; + + if (!lastfrag) + goto out; + + lastfrag--; + + lastpage = ufs_get_locked_page(mapping, lastfrag >> + (PAGE_CACHE_SHIFT - inode->i_blkbits)); + if (IS_ERR(lastpage)) { + err = -EIO; + goto out; + } + + end = 
lastfrag & ((1 << (PAGE_CACHE_SHIFT - inode->i_blkbits)) - 1); + bh = page_buffers(lastpage); + for (i = 0; i < end; ++i) + bh = bh->b_this_page; + + + err = ufs_getfrag_block(inode, lastfrag, bh, 1); + + if (unlikely(err)) + goto out_unlock; + + if (buffer_new(bh)) { + clear_buffer_new(bh); + unmap_underlying_metadata(bh->b_bdev, + bh->b_blocknr); + /* + * we do not zeroize fragment, because of + * if it maped to hole, it already contains zeroes + */ + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + set_page_dirty(lastpage); + } + + if (lastfrag >= UFS_IND_FRAGMENT) { + end = uspi->s_fpb - ufs_fragnum(lastfrag) - 1; + phys64 = bh->b_blocknr + 1; + for (i = 0; i < end; ++i) { + bh = sb_getblk(sb, i + phys64); + lock_buffer(bh); + memset(bh->b_data, 0, sb->s_blocksize); + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + sync_dirty_buffer(bh); + brelse(bh); + } + } +out_unlock: + ufs_put_locked_page(lastpage); +out: + return err; +} + +static void __ufs_truncate_blocks(struct inode *inode) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + + mutex_lock(&ufsi->truncate_mutex); + ufs_trunc_direct(inode); + ufs_trunc_indirect(inode, UFS_IND_BLOCK, + ufs_get_direct_data_ptr(uspi, ufsi, UFS_IND_BLOCK)); + ufs_trunc_dindirect(inode, UFS_IND_BLOCK + uspi->s_apb, + ufs_get_direct_data_ptr(uspi, ufsi, UFS_DIND_BLOCK)); + ufs_trunc_tindirect(inode); + ufsi->i_lastfrag = DIRECT_FRAGMENT; + mutex_unlock(&ufsi->truncate_mutex); +} + +static int ufs_truncate(struct inode *inode, loff_t size) +{ + int err = 0; + + UFSD("ENTER: ino %lu, i_size: %llu, old_i_size: %llu\n", + inode->i_ino, (unsigned long long)size, + (unsigned long long)i_size_read(inode)); + + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return -EINVAL; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return -EPERM; + + err = ufs_alloc_lastblock(inode, 
size); + + if (err) + goto out; + + block_truncate_page(inode->i_mapping, size, ufs_getfrag_block); + + truncate_setsize(inode, size); + + __ufs_truncate_blocks(inode); + inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; + mark_inode_dirty(inode); +out: + UFSD("EXIT: err %d\n", err); + return err; +} + +void ufs_truncate_blocks(struct inode *inode) +{ + if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || + S_ISLNK(inode->i_mode))) + return; + if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) + return; + __ufs_truncate_blocks(inode); +} + +int ufs_setattr(struct dentry *dentry, struct iattr *attr) +{ + struct inode *inode = d_inode(dentry); + unsigned int ia_valid = attr->ia_valid; + int error; + + error = inode_change_ok(inode, attr); + if (error) + return error; + + if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { + error = ufs_truncate(inode, attr->ia_size); + if (error) + return error; + } + + setattr_copy(inode, attr); + mark_inode_dirty(inode); + return 0; +} + +const struct inode_operations ufs_file_inode_operations = { + .setattr = ufs_setattr, +}; diff --git a/fs/ufs/truncate.c b/fs/ufs/truncate.c deleted file mode 100644 index f84dd3078929..000000000000 --- a/fs/ufs/truncate.c +++ /dev/null @@ -1,515 +0,0 @@ -/* - * linux/fs/ufs/truncate.c - * - * Copyright (C) 1998 - * Daniel Pirkl - * Charles University, Faculty of Mathematics and Physics - * - * from - * - * linux/fs/ext2/truncate.c - * - * Copyright (C) 1992, 1993, 1994, 1995 - * Remy Card (card@masi.ibp.fr) - * Laboratoire MASI - Institut Blaise Pascal - * Universite Pierre et Marie Curie (Paris VI) - * - * from - * - * linux/fs/minix/truncate.c - * - * Copyright (C) 1991, 1992 Linus Torvalds - * - * Big-endian to little-endian byte-swapping/bitmaps by - * David S. 
Miller (davem@caip.rutgers.edu), 1995 - */ - -/* - * Real random numbers for secure rm added 94/02/18 - * Idea from Pierre del Perugia - */ - -/* - * Adoptation to use page cache and UFS2 write support by - * Evgeniy Dushistov , 2006-2007 - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "ufs_fs.h" -#include "ufs.h" -#include "swab.h" -#include "util.h" - -/* - * Secure deletion currently doesn't work. It interacts very badly - * with buffers shared with memory mappings, and for that reason - * can't be done in the truncate() routines. It should instead be - * done separately in "release()" before calling the truncate routines - * that will release the actual file blocks. - * - * Linus - */ - -#define DIRECT_BLOCK ((inode->i_size + uspi->s_bsize - 1) >> uspi->s_bshift) -#define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift) - - -static void ufs_trunc_direct(struct inode *inode) -{ - struct ufs_inode_info *ufsi = UFS_I(inode); - struct super_block * sb; - struct ufs_sb_private_info * uspi; - void *p; - u64 frag1, frag2, frag3, frag4, block1, block2; - unsigned frag_to_free, free_count; - unsigned i, tmp; - - UFSD("ENTER: ino %lu\n", inode->i_ino); - - sb = inode->i_sb; - uspi = UFS_SB(sb)->s_uspi; - - frag_to_free = 0; - free_count = 0; - - frag1 = DIRECT_FRAGMENT; - frag4 = min_t(u64, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag); - frag2 = ((frag1 & uspi->s_fpbmask) ? 
((frag1 | uspi->s_fpbmask) + 1) : frag1); - frag3 = frag4 & ~uspi->s_fpbmask; - block1 = block2 = 0; - if (frag2 > frag3) { - frag2 = frag4; - frag3 = frag4 = 0; - } else if (frag2 < frag3) { - block1 = ufs_fragstoblks (frag2); - block2 = ufs_fragstoblks (frag3); - } - - UFSD("ino %lu, frag1 %llu, frag2 %llu, block1 %llu, block2 %llu," - " frag3 %llu, frag4 %llu\n", inode->i_ino, - (unsigned long long)frag1, (unsigned long long)frag2, - (unsigned long long)block1, (unsigned long long)block2, - (unsigned long long)frag3, (unsigned long long)frag4); - - if (frag1 >= frag2) - goto next1; - - /* - * Free first free fragments - */ - p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag1)); - tmp = ufs_data_ptr_to_cpu(sb, p); - if (!tmp ) - ufs_panic (sb, "ufs_trunc_direct", "internal error"); - frag2 -= frag1; - frag1 = ufs_fragnum (frag1); - - ufs_free_fragments(inode, tmp + frag1, frag2); - mark_inode_dirty(inode); - frag_to_free = tmp + frag1; - -next1: - /* - * Free whole blocks - */ - for (i = block1 ; i < block2; i++) { - p = ufs_get_direct_data_ptr(uspi, ufsi, i); - tmp = ufs_data_ptr_to_cpu(sb, p); - if (!tmp) - continue; - write_seqlock(&ufsi->meta_lock); - ufs_data_ptr_clear(uspi, p); - write_sequnlock(&ufsi->meta_lock); - - if (free_count == 0) { - frag_to_free = tmp; - free_count = uspi->s_fpb; - } else if (free_count > 0 && frag_to_free == tmp - free_count) - free_count += uspi->s_fpb; - else { - ufs_free_blocks (inode, frag_to_free, free_count); - frag_to_free = tmp; - free_count = uspi->s_fpb; - } - mark_inode_dirty(inode); - } - - if (free_count > 0) - ufs_free_blocks (inode, frag_to_free, free_count); - - if (frag3 >= frag4) - goto next3; - - /* - * Free last free fragments - */ - p = ufs_get_direct_data_ptr(uspi, ufsi, ufs_fragstoblks(frag3)); - tmp = ufs_data_ptr_to_cpu(sb, p); - if (!tmp ) - ufs_panic(sb, "ufs_truncate_direct", "internal error"); - frag4 = ufs_fragnum (frag4); - write_seqlock(&ufsi->meta_lock); - ufs_data_ptr_clear(uspi, p); - 
write_sequnlock(&ufsi->meta_lock); - - ufs_free_fragments (inode, tmp, frag4); - mark_inode_dirty(inode); - next3: - - UFSD("EXIT: ino %lu\n", inode->i_ino); -} - - -static void ufs_trunc_indirect(struct inode *inode, u64 offset, void *p) -{ - struct super_block * sb; - struct ufs_sb_private_info * uspi; - struct ufs_buffer_head * ind_ubh; - void *ind; - u64 tmp, indirect_block, i, frag_to_free; - unsigned free_count; - - UFSD("ENTER: ino %lu, offset %llu, p: %p\n", - inode->i_ino, (unsigned long long)offset, p); - - BUG_ON(!p); - - sb = inode->i_sb; - uspi = UFS_SB(sb)->s_uspi; - - frag_to_free = 0; - free_count = 0; - - tmp = ufs_data_ptr_to_cpu(sb, p); - if (!tmp) - return; - ind_ubh = ubh_bread(sb, tmp, uspi->s_bsize); - if (!ind_ubh) { - write_seqlock(&UFS_I(inode)->meta_lock); - ufs_data_ptr_clear(uspi, p); - write_sequnlock(&UFS_I(inode)->meta_lock); - return; - } - - indirect_block = (DIRECT_BLOCK > offset) ? (DIRECT_BLOCK - offset) : 0; - for (i = indirect_block; i < uspi->s_apb; i++) { - ind = ubh_get_data_ptr(uspi, ind_ubh, i); - tmp = ufs_data_ptr_to_cpu(sb, ind); - if (!tmp) - continue; - - write_seqlock(&UFS_I(inode)->meta_lock); - ufs_data_ptr_clear(uspi, ind); - write_sequnlock(&UFS_I(inode)->meta_lock); - ubh_mark_buffer_dirty(ind_ubh); - if (free_count == 0) { - frag_to_free = tmp; - free_count = uspi->s_fpb; - } else if (free_count > 0 && frag_to_free == tmp - free_count) - free_count += uspi->s_fpb; - else { - ufs_free_blocks (inode, frag_to_free, free_count); - frag_to_free = tmp; - free_count = uspi->s_fpb; - } - - mark_inode_dirty(inode); - } - - if (free_count > 0) { - ufs_free_blocks (inode, frag_to_free, free_count); - } - for (i = 0; i < uspi->s_apb; i++) - if (!ufs_is_data_ptr_zero(uspi, - ubh_get_data_ptr(uspi, ind_ubh, i))) - break; - if (i >= uspi->s_apb) { - tmp = ufs_data_ptr_to_cpu(sb, p); - write_seqlock(&UFS_I(inode)->meta_lock); - ufs_data_ptr_clear(uspi, p); - write_sequnlock(&UFS_I(inode)->meta_lock); - - ubh_bforget(ind_ubh); 
- ufs_free_blocks (inode, tmp, uspi->s_fpb); - mark_inode_dirty(inode); - ind_ubh = NULL; - } - if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh)) - ubh_sync_block(ind_ubh); - ubh_brelse (ind_ubh); - - UFSD("EXIT: ino %lu\n", inode->i_ino); -} - -static void ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p) -{ - struct super_block * sb; - struct ufs_sb_private_info * uspi; - struct ufs_buffer_head *dind_bh; - u64 i, tmp, dindirect_block; - void *dind; - - UFSD("ENTER: ino %lu\n", inode->i_ino); - - sb = inode->i_sb; - uspi = UFS_SB(sb)->s_uspi; - - dindirect_block = (DIRECT_BLOCK > offset) - ? ((DIRECT_BLOCK - offset) >> uspi->s_apbshift) : 0; - - tmp = ufs_data_ptr_to_cpu(sb, p); - if (!tmp) - return; - dind_bh = ubh_bread(sb, tmp, uspi->s_bsize); - if (!dind_bh) { - write_seqlock(&UFS_I(inode)->meta_lock); - ufs_data_ptr_clear(uspi, p); - write_sequnlock(&UFS_I(inode)->meta_lock); - return; - } - - for (i = dindirect_block ; i < uspi->s_apb ; i++) { - dind = ubh_get_data_ptr(uspi, dind_bh, i); - tmp = ufs_data_ptr_to_cpu(sb, dind); - if (!tmp) - continue; - ufs_trunc_indirect (inode, offset + (i << uspi->s_apbshift), dind); - ubh_mark_buffer_dirty(dind_bh); - } - - for (i = 0; i < uspi->s_apb; i++) - if (!ufs_is_data_ptr_zero(uspi, - ubh_get_data_ptr(uspi, dind_bh, i))) - break; - if (i >= uspi->s_apb) { - tmp = ufs_data_ptr_to_cpu(sb, p); - write_seqlock(&UFS_I(inode)->meta_lock); - ufs_data_ptr_clear(uspi, p); - write_sequnlock(&UFS_I(inode)->meta_lock); - - ubh_bforget(dind_bh); - ufs_free_blocks(inode, tmp, uspi->s_fpb); - mark_inode_dirty(inode); - dind_bh = NULL; - } - if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh)) - ubh_sync_block(dind_bh); - ubh_brelse (dind_bh); - - UFSD("EXIT: ino %lu\n", inode->i_ino); -} - -static void ufs_trunc_tindirect(struct inode *inode) -{ - struct super_block *sb = inode->i_sb; - struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - struct ufs_inode_info *ufsi = UFS_I(inode); - struct 
ufs_buffer_head * tind_bh; - u64 tindirect_block, tmp, i; - void *tind, *p; - - UFSD("ENTER: ino %lu\n", inode->i_ino); - - tindirect_block = (DIRECT_BLOCK > (UFS_NDADDR + uspi->s_apb + uspi->s_2apb)) - ? ((DIRECT_BLOCK - UFS_NDADDR - uspi->s_apb - uspi->s_2apb) >> uspi->s_2apbshift) : 0; - - p = ufs_get_direct_data_ptr(uspi, ufsi, UFS_TIND_BLOCK); - if (!(tmp = ufs_data_ptr_to_cpu(sb, p))) - return; - tind_bh = ubh_bread (sb, tmp, uspi->s_bsize); - if (!tind_bh) { - write_seqlock(&ufsi->meta_lock); - ufs_data_ptr_clear(uspi, p); - write_sequnlock(&ufsi->meta_lock); - return; - } - - for (i = tindirect_block ; i < uspi->s_apb ; i++) { - tind = ubh_get_data_ptr(uspi, tind_bh, i); - ufs_trunc_dindirect(inode, UFS_NDADDR + - uspi->s_apb + ((i + 1) << uspi->s_2apbshift), tind); - ubh_mark_buffer_dirty(tind_bh); - } - for (i = 0; i < uspi->s_apb; i++) - if (!ufs_is_data_ptr_zero(uspi, - ubh_get_data_ptr(uspi, tind_bh, i))) - break; - if (i >= uspi->s_apb) { - tmp = ufs_data_ptr_to_cpu(sb, p); - write_seqlock(&ufsi->meta_lock); - ufs_data_ptr_clear(uspi, p); - write_sequnlock(&ufsi->meta_lock); - - ubh_bforget(tind_bh); - ufs_free_blocks(inode, tmp, uspi->s_fpb); - mark_inode_dirty(inode); - tind_bh = NULL; - } - if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh)) - ubh_sync_block(tind_bh); - ubh_brelse (tind_bh); - - UFSD("EXIT: ino %lu\n", inode->i_ino); -} - -static int ufs_alloc_lastblock(struct inode *inode, loff_t size) -{ - int err = 0; - struct super_block *sb = inode->i_sb; - struct address_space *mapping = inode->i_mapping; - struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - unsigned i, end; - sector_t lastfrag; - struct page *lastpage; - struct buffer_head *bh; - u64 phys64; - - lastfrag = (size + uspi->s_fsize - 1) >> uspi->s_fshift; - - if (!lastfrag) - goto out; - - lastfrag--; - - lastpage = ufs_get_locked_page(mapping, lastfrag >> - (PAGE_CACHE_SHIFT - inode->i_blkbits)); - if (IS_ERR(lastpage)) { - err = -EIO; - goto out; - } - - end = 
lastfrag & ((1 << (PAGE_CACHE_SHIFT - inode->i_blkbits)) - 1); - bh = page_buffers(lastpage); - for (i = 0; i < end; ++i) - bh = bh->b_this_page; - - - err = ufs_getfrag_block(inode, lastfrag, bh, 1); - - if (unlikely(err)) - goto out_unlock; - - if (buffer_new(bh)) { - clear_buffer_new(bh); - unmap_underlying_metadata(bh->b_bdev, - bh->b_blocknr); - /* - * we do not zeroize fragment, because of - * if it maped to hole, it already contains zeroes - */ - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); - set_page_dirty(lastpage); - } - - if (lastfrag >= UFS_IND_FRAGMENT) { - end = uspi->s_fpb - ufs_fragnum(lastfrag) - 1; - phys64 = bh->b_blocknr + 1; - for (i = 0; i < end; ++i) { - bh = sb_getblk(sb, i + phys64); - lock_buffer(bh); - memset(bh->b_data, 0, sb->s_blocksize); - set_buffer_uptodate(bh); - mark_buffer_dirty(bh); - unlock_buffer(bh); - sync_dirty_buffer(bh); - brelse(bh); - } - } -out_unlock: - ufs_put_locked_page(lastpage); -out: - return err; -} - -static void __ufs_truncate_blocks(struct inode *inode) -{ - struct ufs_inode_info *ufsi = UFS_I(inode); - struct super_block *sb = inode->i_sb; - struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - - mutex_lock(&ufsi->truncate_mutex); - ufs_trunc_direct(inode); - ufs_trunc_indirect(inode, UFS_IND_BLOCK, - ufs_get_direct_data_ptr(uspi, ufsi, UFS_IND_BLOCK)); - ufs_trunc_dindirect(inode, UFS_IND_BLOCK + uspi->s_apb, - ufs_get_direct_data_ptr(uspi, ufsi, UFS_DIND_BLOCK)); - ufs_trunc_tindirect(inode); - ufsi->i_lastfrag = DIRECT_FRAGMENT; - mutex_unlock(&ufsi->truncate_mutex); -} - -int ufs_truncate(struct inode *inode, loff_t size) -{ - int err = 0; - - UFSD("ENTER: ino %lu, i_size: %llu, old_i_size: %llu\n", - inode->i_ino, (unsigned long long)size, - (unsigned long long)i_size_read(inode)); - - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode))) - return -EINVAL; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return -EPERM; - - err = ufs_alloc_lastblock(inode, size); - 
- if (err) - goto out; - - block_truncate_page(inode->i_mapping, size, ufs_getfrag_block); - - truncate_setsize(inode, size); - - __ufs_truncate_blocks(inode); - inode->i_mtime = inode->i_ctime = CURRENT_TIME_SEC; - mark_inode_dirty(inode); -out: - UFSD("EXIT: err %d\n", err); - return err; -} - -void ufs_truncate_blocks(struct inode *inode) -{ - if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || - S_ISLNK(inode->i_mode))) - return; - if (IS_APPEND(inode) || IS_IMMUTABLE(inode)) - return; - __ufs_truncate_blocks(inode); -} - -int ufs_setattr(struct dentry *dentry, struct iattr *attr) -{ - struct inode *inode = d_inode(dentry); - unsigned int ia_valid = attr->ia_valid; - int error; - - error = inode_change_ok(inode, attr); - if (error) - return error; - - if (ia_valid & ATTR_SIZE && attr->ia_size != inode->i_size) { - error = ufs_truncate(inode, attr->ia_size); - if (error) - return error; - } - - setattr_copy(inode, attr); - mark_inode_dirty(inode); - return 0; -} - -const struct inode_operations ufs_file_inode_operations = { - .setattr = ufs_setattr, -}; diff --git a/fs/ufs/ufs.h b/fs/ufs/ufs.h index 478f35b493a6..7da4aca868c0 100644 --- a/fs/ufs/ufs.h +++ b/fs/ufs/ufs.h @@ -122,7 +122,7 @@ extern struct inode *ufs_iget(struct super_block *, unsigned long); extern int ufs_write_inode (struct inode *, struct writeback_control *); extern int ufs_sync_inode (struct inode *); extern void ufs_evict_inode (struct inode *); -extern int ufs_getfrag_block (struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create); +extern int ufs_setattr(struct dentry *dentry, struct iattr *attr); /* namei.c */ extern const struct file_operations ufs_dir_operations; @@ -140,10 +140,6 @@ void ufs_mark_sb_dirty(struct super_block *sb); extern const struct inode_operations ufs_fast_symlink_inode_operations; extern const struct inode_operations ufs_symlink_inode_operations; -/* truncate.c */ -extern void ufs_truncate_blocks(struct inode *); -extern int 
ufs_setattr(struct dentry *dentry, struct iattr *attr); - static inline struct ufs_sb_info *UFS_SB(struct super_block *sb) { return sb->s_fs_info; From 4e3911f3d704d681477cdb4e1a2bfd52d5e42d23 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 4 Jun 2015 14:13:14 -0400 Subject: [PATCH 13/65] ufs: the offsets ufs_block_to_path() puts into array are not sector_t type makes no sense - those are indices in block number arrays, not block numbers. And no, UFS is not likely to grow indirect blocks with 4Gpointers in them... Signed-off-by: Al Viro --- fs/ufs/inode.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index ec758edbda47..43672183fee3 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -41,7 +41,7 @@ #include "swab.h" #include "util.h" -static int ufs_block_to_path(struct inode *inode, sector_t i_block, sector_t offsets[4]) +static int ufs_block_to_path(struct inode *inode, sector_t i_block, unsigned offsets[4]) { struct ufs_sb_private_info *uspi = UFS_SB(inode->i_sb)->s_uspi; int ptrs = uspi->s_apb; @@ -153,7 +153,7 @@ again: while (--depth) { __fs32 *ptr; struct buffer_head *bh; - sector_t n = *p++; + unsigned n = *p++; bh = sb_bread(sb, uspi->s_sbbase + fs32_to_cpu(sb, q->key32) + (n>>shift)); @@ -177,7 +177,7 @@ ufs2: while (--depth) { __fs64 *ptr; struct buffer_head *bh; - sector_t n = *p++; + unsigned n = *p++; bh = sb_bread(sb, uspi->s_sbbase + fs64_to_cpu(sb, q->key64) + (n>>shift)); From 31cd043e1a09c579c4cd38ea432200fbeae6af1f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 17 Jun 2015 01:10:03 -0400 Subject: [PATCH 14/65] ufs: beginning of __ufs_truncate_block() massage Use ufs_block_to_path() to find the cutoff path in the block pointers' tree. For now just use the information about the depth (to bypass the fully preserved subtrees); subsequent commits will use the information about actual path. 
Signed-off-by: Al Viro --- fs/ufs/inode.c | 16 ++++++++++++---- 1 file changed, 12 insertions(+), 4 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 43672183fee3..afb0f32b921c 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -1336,14 +1336,22 @@ static void __ufs_truncate_blocks(struct inode *inode) struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + unsigned offsets[4]; + int depth = ufs_block_to_path(inode, DIRECT_BLOCK, offsets); mutex_lock(&ufsi->truncate_mutex); - ufs_trunc_direct(inode); - ufs_trunc_indirect(inode, UFS_IND_BLOCK, + switch (depth) { + case 1: + ufs_trunc_direct(inode); + case 2: + ufs_trunc_indirect(inode, UFS_IND_BLOCK, ufs_get_direct_data_ptr(uspi, ufsi, UFS_IND_BLOCK)); - ufs_trunc_dindirect(inode, UFS_IND_BLOCK + uspi->s_apb, + case 3: + ufs_trunc_dindirect(inode, UFS_IND_BLOCK + uspi->s_apb, ufs_get_direct_data_ptr(uspi, ufsi, UFS_DIND_BLOCK)); - ufs_trunc_tindirect(inode); + case 4: + ufs_trunc_tindirect(inode); + } ufsi->i_lastfrag = DIRECT_FRAGMENT; mutex_unlock(&ufsi->truncate_mutex); } From 18ca51d8211065f10672374336cd08d495968c73 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Jun 2015 13:45:07 -0400 Subject: [PATCH 15/65] ufs_trunc_tindirect(): pass the number of blocks to keep IOW, the distance of cutoff from the begining of the branch (in blocks). That (and the fact that block just prior to cutoff is guaranteed to be present) allows to tell whether to free triple indirect block just by looking at the offset. While we are at it, using u64 for index in the block is wrong - those should be unsigned int. 
Signed-off-by: Al Viro --- fs/ufs/inode.c | 28 +++++++++++----------------- 1 file changed, 11 insertions(+), 17 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index afb0f32b921c..5b3f1c44d4b0 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -1208,19 +1208,17 @@ static void ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p) UFSD("EXIT: ino %lu\n", inode->i_ino); } -static void ufs_trunc_tindirect(struct inode *inode) +static void ufs_trunc_tindirect(struct inode *inode, u64 offset) { struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; struct ufs_inode_info *ufsi = UFS_I(inode); struct ufs_buffer_head * tind_bh; - u64 tindirect_block, tmp, i; + u64 tmp; void *tind, *p; - - UFSD("ENTER: ino %lu\n", inode->i_ino); - - tindirect_block = (DIRECT_BLOCK > (UFS_NDADDR + uspi->s_apb + uspi->s_2apb)) - ? ((DIRECT_BLOCK - UFS_NDADDR - uspi->s_apb - uspi->s_2apb) >> uspi->s_2apbshift) : 0; + bool free_it = !offset; + unsigned tindirect_block = offset >> uspi->s_2apbshift; + unsigned i; p = ufs_get_direct_data_ptr(uspi, ufsi, UFS_TIND_BLOCK); if (!(tmp = ufs_data_ptr_to_cpu(sb, p))) @@ -1239,11 +1237,7 @@ static void ufs_trunc_tindirect(struct inode *inode) uspi->s_apb + ((i + 1) << uspi->s_2apbshift), tind); ubh_mark_buffer_dirty(tind_bh); } - for (i = 0; i < uspi->s_apb; i++) - if (!ufs_is_data_ptr_zero(uspi, - ubh_get_data_ptr(uspi, tind_bh, i))) - break; - if (i >= uspi->s_apb) { + if (free_it) { tmp = ufs_data_ptr_to_cpu(sb, p); write_seqlock(&ufsi->meta_lock); ufs_data_ptr_clear(uspi, p); @@ -1252,13 +1246,11 @@ static void ufs_trunc_tindirect(struct inode *inode) ubh_bforget(tind_bh); ufs_free_blocks(inode, tmp, uspi->s_fpb); mark_inode_dirty(inode); - tind_bh = NULL; + return; } - if (IS_SYNC(inode) && tind_bh && ubh_buffer_dirty(tind_bh)) + if (IS_SYNC(inode) && ubh_buffer_dirty(tind_bh)) ubh_sync_block(tind_bh); ubh_brelse (tind_bh); - - UFSD("EXIT: ino %lu\n", inode->i_ino); } static int 
ufs_alloc_lastblock(struct inode *inode, loff_t size) @@ -1349,8 +1341,10 @@ static void __ufs_truncate_blocks(struct inode *inode) case 3: ufs_trunc_dindirect(inode, UFS_IND_BLOCK + uspi->s_apb, ufs_get_direct_data_ptr(uspi, ufsi, UFS_DIND_BLOCK)); + ufs_trunc_tindirect(inode, 0); + break; case 4: - ufs_trunc_tindirect(inode); + ufs_trunc_tindirect(inode, DIRECT_BLOCK - UFS_NDADDR - uspi->s_apb - uspi->s_2apb); } ufsi->i_lastfrag = DIRECT_FRAGMENT; mutex_unlock(&ufsi->truncate_mutex); From 6ac36b8777d934e3cd7eb0f023a5043d5c03b00c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 17 Jun 2015 01:54:58 -0400 Subject: [PATCH 16/65] ufs_trunc_indirect(): pass the index of the first pointer to free ... instead of file offset. Same cleanups as in the tindirect conversion in previous commit. Signed-off-by: Al Viro --- fs/ufs/inode.c | 56 +++++++++++++++++++++----------------------------- 1 file changed, 23 insertions(+), 33 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 5b3f1c44d4b0..9c4471a82d2f 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -1069,25 +1069,16 @@ next1: } -static void ufs_trunc_indirect(struct inode *inode, u64 offset, void *p) +static void ufs_trunc_indirect(struct inode *inode, unsigned from, void *p) { - struct super_block * sb; - struct ufs_sb_private_info * uspi; + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; struct ufs_buffer_head * ind_ubh; void *ind; - u64 tmp, indirect_block, i, frag_to_free; - unsigned free_count; - - UFSD("ENTER: ino %lu, offset %llu, p: %p\n", - inode->i_ino, (unsigned long long)offset, p); - - BUG_ON(!p); - - sb = inode->i_sb; - uspi = UFS_SB(sb)->s_uspi; - - frag_to_free = 0; - free_count = 0; + u64 tmp, frag_to_free = 0; + unsigned free_count = 0; + bool to_free = !from; + unsigned i; tmp = ufs_data_ptr_to_cpu(sb, p); if (!tmp) @@ -1100,8 +1091,7 @@ static void ufs_trunc_indirect(struct inode *inode, u64 offset, void *p) return; } - indirect_block = 
(DIRECT_BLOCK > offset) ? (DIRECT_BLOCK - offset) : 0; - for (i = indirect_block; i < uspi->s_apb; i++) { + for (i = from; i < uspi->s_apb; i++) { ind = ubh_get_data_ptr(uspi, ind_ubh, i); tmp = ufs_data_ptr_to_cpu(sb, ind); if (!tmp) @@ -1128,11 +1118,7 @@ static void ufs_trunc_indirect(struct inode *inode, u64 offset, void *p) if (free_count > 0) { ufs_free_blocks (inode, frag_to_free, free_count); } - for (i = 0; i < uspi->s_apb; i++) - if (!ufs_is_data_ptr_zero(uspi, - ubh_get_data_ptr(uspi, ind_ubh, i))) - break; - if (i >= uspi->s_apb) { + if (to_free) { tmp = ufs_data_ptr_to_cpu(sb, p); write_seqlock(&UFS_I(inode)->meta_lock); ufs_data_ptr_clear(uspi, p); @@ -1141,13 +1127,11 @@ static void ufs_trunc_indirect(struct inode *inode, u64 offset, void *p) ubh_bforget(ind_ubh); ufs_free_blocks (inode, tmp, uspi->s_fpb); mark_inode_dirty(inode); - ind_ubh = NULL; + return; } - if (IS_SYNC(inode) && ind_ubh && ubh_buffer_dirty(ind_ubh)) + if (IS_SYNC(inode) && ubh_buffer_dirty(ind_ubh)) ubh_sync_block(ind_ubh); ubh_brelse (ind_ubh); - - UFSD("EXIT: ino %lu\n", inode->i_ino); } static void ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p) @@ -1157,14 +1141,20 @@ static void ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p) struct ufs_buffer_head *dind_bh; u64 i, tmp, dindirect_block; void *dind; + unsigned from; UFSD("ENTER: ino %lu\n", inode->i_ino); sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; - dindirect_block = (DIRECT_BLOCK > offset) - ? 
((DIRECT_BLOCK - offset) >> uspi->s_apbshift) : 0; + if (DIRECT_BLOCK <= offset) { + dindirect_block = 0; + from = 0; + } else { + dindirect_block = (DIRECT_BLOCK - offset) >> uspi->s_apbshift; + from = (DIRECT_BLOCK - offset) & uspi->s_apbmask; + } tmp = ufs_data_ptr_to_cpu(sb, p); if (!tmp) @@ -1177,12 +1167,12 @@ static void ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p) return; } - for (i = dindirect_block ; i < uspi->s_apb ; i++) { + for (i = dindirect_block ; i < uspi->s_apb ; i++, from = 0) { dind = ubh_get_data_ptr(uspi, dind_bh, i); tmp = ufs_data_ptr_to_cpu(sb, dind); if (!tmp) continue; - ufs_trunc_indirect (inode, offset + (i << uspi->s_apbshift), dind); + ufs_trunc_indirect(inode, from, dind); ubh_mark_buffer_dirty(dind_bh); } @@ -1328,7 +1318,7 @@ static void __ufs_truncate_blocks(struct inode *inode) struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - unsigned offsets[4]; + unsigned offsets[4] = {0,}; int depth = ufs_block_to_path(inode, DIRECT_BLOCK, offsets); mutex_lock(&ufsi->truncate_mutex); @@ -1336,7 +1326,7 @@ static void __ufs_truncate_blocks(struct inode *inode) case 1: ufs_trunc_direct(inode); case 2: - ufs_trunc_indirect(inode, UFS_IND_BLOCK, + ufs_trunc_indirect(inode, offsets[1], ufs_get_direct_data_ptr(uspi, ufsi, UFS_IND_BLOCK)); case 3: ufs_trunc_dindirect(inode, UFS_IND_BLOCK + uspi->s_apb, From 7bad5939fcd04bb83122bdb90981ec5ae2f90e0d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Jun 2015 14:21:09 -0400 Subject: [PATCH 17/65] ufs_trunc_dindirect(): pass the number of blocks to keep same as the previous two. 
Signed-off-by: Al Viro --- fs/ufs/inode.c | 57 +++++++++++++++++++++++--------------------------- 1 file changed, 26 insertions(+), 31 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 9c4471a82d2f..b4d6398a2d54 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -1136,25 +1136,15 @@ static void ufs_trunc_indirect(struct inode *inode, unsigned from, void *p) static void ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p) { - struct super_block * sb; - struct ufs_sb_private_info * uspi; + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; struct ufs_buffer_head *dind_bh; - u64 i, tmp, dindirect_block; + u64 tmp; void *dind; - unsigned from; - - UFSD("ENTER: ino %lu\n", inode->i_ino); - - sb = inode->i_sb; - uspi = UFS_SB(sb)->s_uspi; - - if (DIRECT_BLOCK <= offset) { - dindirect_block = 0; - from = 0; - } else { - dindirect_block = (DIRECT_BLOCK - offset) >> uspi->s_apbshift; - from = (DIRECT_BLOCK - offset) & uspi->s_apbmask; - } + bool free_it = !offset; + unsigned dindirect_block = offset >> uspi->s_apbshift; + unsigned from = offset & uspi->s_apbmask; + unsigned i; tmp = ufs_data_ptr_to_cpu(sb, p); if (!tmp) @@ -1176,11 +1166,7 @@ static void ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p) ubh_mark_buffer_dirty(dind_bh); } - for (i = 0; i < uspi->s_apb; i++) - if (!ufs_is_data_ptr_zero(uspi, - ubh_get_data_ptr(uspi, dind_bh, i))) - break; - if (i >= uspi->s_apb) { + if (free_it) { tmp = ufs_data_ptr_to_cpu(sb, p); write_seqlock(&UFS_I(inode)->meta_lock); ufs_data_ptr_clear(uspi, p); @@ -1189,13 +1175,11 @@ static void ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p) ubh_bforget(dind_bh); ufs_free_blocks(inode, tmp, uspi->s_fpb); mark_inode_dirty(inode); - dind_bh = NULL; + return; } - if (IS_SYNC(inode) && dind_bh && ubh_buffer_dirty(dind_bh)) + if (IS_SYNC(inode) && ubh_buffer_dirty(dind_bh)) ubh_sync_block(dind_bh); ubh_brelse (dind_bh); - - UFSD("EXIT: ino %lu\n", 
inode->i_ino); } static void ufs_trunc_tindirect(struct inode *inode, u64 offset) @@ -1210,6 +1194,8 @@ static void ufs_trunc_tindirect(struct inode *inode, u64 offset) unsigned tindirect_block = offset >> uspi->s_2apbshift; unsigned i; + offset -= tindirect_block << uspi->s_2apbshift; + p = ufs_get_direct_data_ptr(uspi, ufsi, UFS_TIND_BLOCK); if (!(tmp = ufs_data_ptr_to_cpu(sb, p))) return; @@ -1221,10 +1207,9 @@ static void ufs_trunc_tindirect(struct inode *inode, u64 offset) return; } - for (i = tindirect_block ; i < uspi->s_apb ; i++) { + for (i = tindirect_block ; i < uspi->s_apb ; i++, offset = 0) { tind = ubh_get_data_ptr(uspi, tind_bh, i); - ufs_trunc_dindirect(inode, UFS_NDADDR + - uspi->s_apb + ((i + 1) << uspi->s_2apbshift), tind); + ufs_trunc_dindirect(inode, offset, tind); ubh_mark_buffer_dirty(tind_bh); } if (free_it) { @@ -1318,18 +1303,28 @@ static void __ufs_truncate_blocks(struct inode *inode) struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - unsigned offsets[4] = {0,}; + unsigned offsets[4]; int depth = ufs_block_to_path(inode, DIRECT_BLOCK, offsets); mutex_lock(&ufsi->truncate_mutex); switch (depth) { case 1: ufs_trunc_direct(inode); + ufs_trunc_indirect(inode, 0, + ufs_get_direct_data_ptr(uspi, ufsi, UFS_IND_BLOCK)); + ufs_trunc_dindirect(inode, 0, + ufs_get_direct_data_ptr(uspi, ufsi, UFS_DIND_BLOCK)); + ufs_trunc_tindirect(inode, 0); + break; case 2: ufs_trunc_indirect(inode, offsets[1], ufs_get_direct_data_ptr(uspi, ufsi, UFS_IND_BLOCK)); + ufs_trunc_dindirect(inode, 0, + ufs_get_direct_data_ptr(uspi, ufsi, UFS_DIND_BLOCK)); + ufs_trunc_tindirect(inode, 0); + break; case 3: - ufs_trunc_dindirect(inode, UFS_IND_BLOCK + uspi->s_apb, + ufs_trunc_dindirect(inode, DIRECT_BLOCK - UFS_IND_BLOCK - uspi->s_apb, ufs_get_direct_data_ptr(uspi, ufsi, UFS_DIND_BLOCK)); ufs_trunc_tindirect(inode, 0); break; From 7a4fdda72451f094374324a552be9fc7de8f3e8d Mon Sep 17 
00:00:00 2001 From: Al Viro Date: Thu, 18 Jun 2015 14:28:54 -0400 Subject: [PATCH 18/65] __ufs_truncate(); find cutoff distances into branches by offsets[] array Signed-off-by: Al Viro --- fs/ufs/inode.c | 8 ++++++-- 1 file changed, 6 insertions(+), 2 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index b4d6398a2d54..c2544d62adf2 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -1324,12 +1324,16 @@ static void __ufs_truncate_blocks(struct inode *inode) ufs_trunc_tindirect(inode, 0); break; case 3: - ufs_trunc_dindirect(inode, DIRECT_BLOCK - UFS_IND_BLOCK - uspi->s_apb, + ufs_trunc_dindirect(inode, + (offsets[1] << uspi->s_apbshift) + offsets[2], ufs_get_direct_data_ptr(uspi, ufsi, UFS_DIND_BLOCK)); ufs_trunc_tindirect(inode, 0); break; case 4: - ufs_trunc_tindirect(inode, DIRECT_BLOCK - UFS_NDADDR - uspi->s_apb - uspi->s_2apb); + ufs_trunc_tindirect(inode, + (offsets[1] << uspi->s_2apbshift) + + (offsets[2] << uspi->s_apbshift) + + offsets[3]); } ufsi->i_lastfrag = DIRECT_FRAGMENT; mutex_unlock(&ufsi->truncate_mutex); From 85416288bf730cffb61ab6ce8a7b97b17c73458f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Jun 2015 14:36:31 -0400 Subject: [PATCH 19/65] ufs_trunc_...indirect(): pass the array of indices instead of offsets rather than bitslicing the offset just formed as sum of shifted indices, pass the array of those indices itself. NULL is used as equivalent of "all zeroes" (== free the entire branch). 
Signed-off-by: Al Viro --- fs/ufs/inode.c | 50 ++++++++++++++++++++++---------------------------- 1 file changed, 22 insertions(+), 28 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index c2544d62adf2..34d8dac4fe8b 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -1069,7 +1069,7 @@ next1: } -static void ufs_trunc_indirect(struct inode *inode, unsigned from, void *p) +static void ufs_trunc_indirect(struct inode *inode, unsigned *offsets, void *p) { struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; @@ -1077,7 +1077,8 @@ static void ufs_trunc_indirect(struct inode *inode, unsigned from, void *p) void *ind; u64 tmp, frag_to_free = 0; unsigned free_count = 0; - bool to_free = !from; + unsigned from = offsets ? *offsets : 0; + bool to_free = !offsets || !from; unsigned i; tmp = ufs_data_ptr_to_cpu(sb, p); @@ -1134,16 +1135,15 @@ static void ufs_trunc_indirect(struct inode *inode, unsigned from, void *p) ubh_brelse (ind_ubh); } -static void ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p) +static void ufs_trunc_dindirect(struct inode *inode, unsigned *offsets, void *p) { struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; struct ufs_buffer_head *dind_bh; u64 tmp; void *dind; - bool free_it = !offset; - unsigned dindirect_block = offset >> uspi->s_apbshift; - unsigned from = offset & uspi->s_apbmask; + bool free_it = !offsets || !(offsets[0] || offsets[1]); + unsigned dindirect_block = offsets ? 
*offsets++ : 0; unsigned i; tmp = ufs_data_ptr_to_cpu(sb, p); @@ -1157,12 +1157,12 @@ static void ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p) return; } - for (i = dindirect_block ; i < uspi->s_apb ; i++, from = 0) { + for (i = dindirect_block ; i < uspi->s_apb ; i++, offsets = NULL) { dind = ubh_get_data_ptr(uspi, dind_bh, i); tmp = ufs_data_ptr_to_cpu(sb, dind); if (!tmp) continue; - ufs_trunc_indirect(inode, from, dind); + ufs_trunc_indirect(inode, offsets, dind); ubh_mark_buffer_dirty(dind_bh); } @@ -1182,7 +1182,7 @@ static void ufs_trunc_dindirect(struct inode *inode, u64 offset, void *p) ubh_brelse (dind_bh); } -static void ufs_trunc_tindirect(struct inode *inode, u64 offset) +static void ufs_trunc_tindirect(struct inode *inode, unsigned *offsets) { struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; @@ -1190,12 +1190,10 @@ static void ufs_trunc_tindirect(struct inode *inode, u64 offset) struct ufs_buffer_head * tind_bh; u64 tmp; void *tind, *p; - bool free_it = !offset; - unsigned tindirect_block = offset >> uspi->s_2apbshift; + bool free_it = !offsets || !(offsets[0] || offsets[1] || offsets[2]); + unsigned tindirect_block = offsets ? 
*offsets++ : 0; unsigned i; - offset -= tindirect_block << uspi->s_2apbshift; - p = ufs_get_direct_data_ptr(uspi, ufsi, UFS_TIND_BLOCK); if (!(tmp = ufs_data_ptr_to_cpu(sb, p))) return; @@ -1207,9 +1205,9 @@ static void ufs_trunc_tindirect(struct inode *inode, u64 offset) return; } - for (i = tindirect_block ; i < uspi->s_apb ; i++, offset = 0) { + for (i = tindirect_block ; i < uspi->s_apb ; i++, offsets = NULL) { tind = ubh_get_data_ptr(uspi, tind_bh, i); - ufs_trunc_dindirect(inode, offset, tind); + ufs_trunc_dindirect(inode, offsets, tind); ubh_mark_buffer_dirty(tind_bh); } if (free_it) { @@ -1310,30 +1308,26 @@ static void __ufs_truncate_blocks(struct inode *inode) switch (depth) { case 1: ufs_trunc_direct(inode); - ufs_trunc_indirect(inode, 0, + ufs_trunc_indirect(inode, NULL, ufs_get_direct_data_ptr(uspi, ufsi, UFS_IND_BLOCK)); - ufs_trunc_dindirect(inode, 0, + ufs_trunc_dindirect(inode, NULL, ufs_get_direct_data_ptr(uspi, ufsi, UFS_DIND_BLOCK)); - ufs_trunc_tindirect(inode, 0); + ufs_trunc_tindirect(inode, NULL); break; case 2: - ufs_trunc_indirect(inode, offsets[1], + ufs_trunc_indirect(inode, offsets + 1, ufs_get_direct_data_ptr(uspi, ufsi, UFS_IND_BLOCK)); - ufs_trunc_dindirect(inode, 0, + ufs_trunc_dindirect(inode, NULL, ufs_get_direct_data_ptr(uspi, ufsi, UFS_DIND_BLOCK)); - ufs_trunc_tindirect(inode, 0); + ufs_trunc_tindirect(inode, NULL); break; case 3: - ufs_trunc_dindirect(inode, - (offsets[1] << uspi->s_apbshift) + offsets[2], + ufs_trunc_dindirect(inode, offsets + 1, ufs_get_direct_data_ptr(uspi, ufsi, UFS_DIND_BLOCK)); - ufs_trunc_tindirect(inode, 0); + ufs_trunc_tindirect(inode, NULL); break; case 4: - ufs_trunc_tindirect(inode, - (offsets[1] << uspi->s_2apbshift) + - (offsets[2] << uspi->s_apbshift) + - offsets[3]); + ufs_trunc_tindirect(inode, offsets + 1); } ufsi->i_lastfrag = DIRECT_FRAGMENT; mutex_unlock(&ufsi->truncate_mutex); From 6775e24d9ccf6a48ebd1d31ca77db5ebfe00ce43 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Jun 2015 
14:55:50 -0400 Subject: [PATCH 20/65] ufs_trunc_..indirect(): more massage towards unifying Instead of manually checking that the array contains only zeroes, find the position of the last non-zero (in __ufs_truncate(), where we can conveniently do that) and use that to tell if there's any non-zero in the array tail passed to ufs_trunc_...indirect(). The goal of all that clumsiness is to get fold these functions together. Signed-off-by: Al Viro --- fs/ufs/inode.c | 43 ++++++++++++++++++++++++++----------------- 1 file changed, 26 insertions(+), 17 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 34d8dac4fe8b..e90266a221b8 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -1069,7 +1069,7 @@ next1: } -static void ufs_trunc_indirect(struct inode *inode, unsigned *offsets, void *p) +static void ufs_trunc_indirect(struct inode *inode, unsigned *offsets, int depth2, void *p) { struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; @@ -1078,7 +1078,7 @@ static void ufs_trunc_indirect(struct inode *inode, unsigned *offsets, void *p) u64 tmp, frag_to_free = 0; unsigned free_count = 0; unsigned from = offsets ? *offsets : 0; - bool to_free = !offsets || !from; + bool to_free = !offsets || !depth2; unsigned i; tmp = ufs_data_ptr_to_cpu(sb, p); @@ -1135,14 +1135,14 @@ static void ufs_trunc_indirect(struct inode *inode, unsigned *offsets, void *p) ubh_brelse (ind_ubh); } -static void ufs_trunc_dindirect(struct inode *inode, unsigned *offsets, void *p) +static void ufs_trunc_dindirect(struct inode *inode, unsigned *offsets, int depth2, void *p) { struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; struct ufs_buffer_head *dind_bh; u64 tmp; void *dind; - bool free_it = !offsets || !(offsets[0] || offsets[1]); + bool free_it = !offsets || !depth2; unsigned dindirect_block = offsets ? 
*offsets++ : 0; unsigned i; @@ -1162,7 +1162,7 @@ static void ufs_trunc_dindirect(struct inode *inode, unsigned *offsets, void *p) tmp = ufs_data_ptr_to_cpu(sb, dind); if (!tmp) continue; - ufs_trunc_indirect(inode, offsets, dind); + ufs_trunc_indirect(inode, offsets, depth2 - 1, dind); ubh_mark_buffer_dirty(dind_bh); } @@ -1182,7 +1182,7 @@ static void ufs_trunc_dindirect(struct inode *inode, unsigned *offsets, void *p) ubh_brelse (dind_bh); } -static void ufs_trunc_tindirect(struct inode *inode, unsigned *offsets) +static void ufs_trunc_tindirect(struct inode *inode, unsigned *offsets, int depth2) { struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; @@ -1190,7 +1190,7 @@ static void ufs_trunc_tindirect(struct inode *inode, unsigned *offsets) struct ufs_buffer_head * tind_bh; u64 tmp; void *tind, *p; - bool free_it = !offsets || !(offsets[0] || offsets[1] || offsets[2]); + bool free_it = !offsets || !depth2; unsigned tindirect_block = offsets ? 
*offsets++ : 0; unsigned i; @@ -1207,7 +1207,7 @@ static void ufs_trunc_tindirect(struct inode *inode, unsigned *offsets) for (i = tindirect_block ; i < uspi->s_apb ; i++, offsets = NULL) { tind = ubh_get_data_ptr(uspi, tind_bh, i); - ufs_trunc_dindirect(inode, offsets, tind); + ufs_trunc_dindirect(inode, offsets, depth2 - 1, tind); ubh_mark_buffer_dirty(tind_bh); } if (free_it) { @@ -1303,31 +1303,40 @@ static void __ufs_truncate_blocks(struct inode *inode) struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; unsigned offsets[4]; int depth = ufs_block_to_path(inode, DIRECT_BLOCK, offsets); + int depth2; + + if (!depth) + return; + + /* find the last non-zero in offsets[] */ + for (depth2 = depth - 1; depth2; depth2--) + if (offsets[depth2]) + break; mutex_lock(&ufsi->truncate_mutex); switch (depth) { case 1: ufs_trunc_direct(inode); - ufs_trunc_indirect(inode, NULL, + ufs_trunc_indirect(inode, NULL, 0, ufs_get_direct_data_ptr(uspi, ufsi, UFS_IND_BLOCK)); - ufs_trunc_dindirect(inode, NULL, + ufs_trunc_dindirect(inode, NULL, 0, ufs_get_direct_data_ptr(uspi, ufsi, UFS_DIND_BLOCK)); - ufs_trunc_tindirect(inode, NULL); + ufs_trunc_tindirect(inode, NULL, 0); break; case 2: - ufs_trunc_indirect(inode, offsets + 1, + ufs_trunc_indirect(inode, offsets + 1, depth2, ufs_get_direct_data_ptr(uspi, ufsi, UFS_IND_BLOCK)); - ufs_trunc_dindirect(inode, NULL, + ufs_trunc_dindirect(inode, NULL, 0, ufs_get_direct_data_ptr(uspi, ufsi, UFS_DIND_BLOCK)); - ufs_trunc_tindirect(inode, NULL); + ufs_trunc_tindirect(inode, NULL, 0); break; case 3: - ufs_trunc_dindirect(inode, offsets + 1, + ufs_trunc_dindirect(inode, offsets + 1, depth2, ufs_get_direct_data_ptr(uspi, ufsi, UFS_DIND_BLOCK)); - ufs_trunc_tindirect(inode, NULL); + ufs_trunc_tindirect(inode, NULL, 0); break; case 4: - ufs_trunc_tindirect(inode, offsets + 1); + ufs_trunc_tindirect(inode, offsets + 1, depth2); } ufsi->i_lastfrag = DIRECT_FRAGMENT; mutex_unlock(&ufsi->truncate_mutex); From 9e0fbbde2724d5d3bb9edca6b77e26eb28341154 
Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Jun 2015 15:33:47 -0400 Subject: [PATCH 21/65] unify ufs_trunc_..indirect() Signed-off-by: Al Viro --- fs/ufs/inode.c | 200 +++++++++++++++---------------------------------- 1 file changed, 61 insertions(+), 139 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index e90266a221b8..0d57c41b7705 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -1068,147 +1068,65 @@ next1: UFSD("EXIT: ino %lu\n", inode->i_ino); } - -static void ufs_trunc_indirect(struct inode *inode, unsigned *offsets, int depth2, void *p) -{ - struct super_block *sb = inode->i_sb; - struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - struct ufs_buffer_head * ind_ubh; - void *ind; - u64 tmp, frag_to_free = 0; - unsigned free_count = 0; - unsigned from = offsets ? *offsets : 0; - bool to_free = !offsets || !depth2; - unsigned i; - - tmp = ufs_data_ptr_to_cpu(sb, p); - if (!tmp) - return; - ind_ubh = ubh_bread(sb, tmp, uspi->s_bsize); - if (!ind_ubh) { - write_seqlock(&UFS_I(inode)->meta_lock); - ufs_data_ptr_clear(uspi, p); - write_sequnlock(&UFS_I(inode)->meta_lock); - return; - } - - for (i = from; i < uspi->s_apb; i++) { - ind = ubh_get_data_ptr(uspi, ind_ubh, i); - tmp = ufs_data_ptr_to_cpu(sb, ind); - if (!tmp) - continue; - - write_seqlock(&UFS_I(inode)->meta_lock); - ufs_data_ptr_clear(uspi, ind); - write_sequnlock(&UFS_I(inode)->meta_lock); - ubh_mark_buffer_dirty(ind_ubh); - if (free_count == 0) { - frag_to_free = tmp; - free_count = uspi->s_fpb; - } else if (free_count > 0 && frag_to_free == tmp - free_count) - free_count += uspi->s_fpb; - else { - ufs_free_blocks (inode, frag_to_free, free_count); - frag_to_free = tmp; - free_count = uspi->s_fpb; - } - - mark_inode_dirty(inode); - } - - if (free_count > 0) { - ufs_free_blocks (inode, frag_to_free, free_count); - } - if (to_free) { - tmp = ufs_data_ptr_to_cpu(sb, p); - write_seqlock(&UFS_I(inode)->meta_lock); - ufs_data_ptr_clear(uspi, p); - 
write_sequnlock(&UFS_I(inode)->meta_lock); - - ubh_bforget(ind_ubh); - ufs_free_blocks (inode, tmp, uspi->s_fpb); - mark_inode_dirty(inode); - return; - } - if (IS_SYNC(inode) && ubh_buffer_dirty(ind_ubh)) - ubh_sync_block(ind_ubh); - ubh_brelse (ind_ubh); -} - -static void ufs_trunc_dindirect(struct inode *inode, unsigned *offsets, int depth2, void *p) -{ - struct super_block *sb = inode->i_sb; - struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - struct ufs_buffer_head *dind_bh; - u64 tmp; - void *dind; - bool free_it = !offsets || !depth2; - unsigned dindirect_block = offsets ? *offsets++ : 0; - unsigned i; - - tmp = ufs_data_ptr_to_cpu(sb, p); - if (!tmp) - return; - dind_bh = ubh_bread(sb, tmp, uspi->s_bsize); - if (!dind_bh) { - write_seqlock(&UFS_I(inode)->meta_lock); - ufs_data_ptr_clear(uspi, p); - write_sequnlock(&UFS_I(inode)->meta_lock); - return; - } - - for (i = dindirect_block ; i < uspi->s_apb ; i++, offsets = NULL) { - dind = ubh_get_data_ptr(uspi, dind_bh, i); - tmp = ufs_data_ptr_to_cpu(sb, dind); - if (!tmp) - continue; - ufs_trunc_indirect(inode, offsets, depth2 - 1, dind); - ubh_mark_buffer_dirty(dind_bh); - } - - if (free_it) { - tmp = ufs_data_ptr_to_cpu(sb, p); - write_seqlock(&UFS_I(inode)->meta_lock); - ufs_data_ptr_clear(uspi, p); - write_sequnlock(&UFS_I(inode)->meta_lock); - - ubh_bforget(dind_bh); - ufs_free_blocks(inode, tmp, uspi->s_fpb); - mark_inode_dirty(inode); - return; - } - if (IS_SYNC(inode) && ubh_buffer_dirty(dind_bh)) - ubh_sync_block(dind_bh); - ubh_brelse (dind_bh); -} - -static void ufs_trunc_tindirect(struct inode *inode, unsigned *offsets, int depth2) +static void ufs_trunc_branch(struct inode *inode, unsigned *offsets, int depth2, int depth, void *p) { struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; struct ufs_inode_info *ufsi = UFS_I(inode); - struct ufs_buffer_head * tind_bh; + struct ufs_buffer_head *ubh; u64 tmp; - void *tind, *p; bool free_it = !offsets || 
!depth2; - unsigned tindirect_block = offsets ? *offsets++ : 0; + unsigned from = offsets ? *offsets++ : 0; unsigned i; - p = ufs_get_direct_data_ptr(uspi, ufsi, UFS_TIND_BLOCK); - if (!(tmp = ufs_data_ptr_to_cpu(sb, p))) + tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp) return; - tind_bh = ubh_bread (sb, tmp, uspi->s_bsize); - if (!tind_bh) { + ubh = ubh_bread (sb, tmp, uspi->s_bsize); + if (!ubh) { write_seqlock(&ufsi->meta_lock); ufs_data_ptr_clear(uspi, p); write_sequnlock(&ufsi->meta_lock); return; } - for (i = tindirect_block ; i < uspi->s_apb ; i++, offsets = NULL) { - tind = ubh_get_data_ptr(uspi, tind_bh, i); - ufs_trunc_dindirect(inode, offsets, depth2 - 1, tind); - ubh_mark_buffer_dirty(tind_bh); + if (--depth) { + for (i = from ; i < uspi->s_apb ; i++, offsets = NULL) { + void *ind = ubh_get_data_ptr(uspi, ubh, i); + ufs_trunc_branch(inode, offsets, depth2 - 1, depth, ind); + ubh_mark_buffer_dirty(ubh); + } + } else { + u64 frag_to_free = 0; + unsigned free_count = 0; + + for (i = from; i < uspi->s_apb; i++) { + void *ind = ubh_get_data_ptr(uspi, ubh, i); + tmp = ufs_data_ptr_to_cpu(sb, ind); + if (!tmp) + continue; + + write_seqlock(&UFS_I(inode)->meta_lock); + ufs_data_ptr_clear(uspi, ind); + write_sequnlock(&UFS_I(inode)->meta_lock); + ubh_mark_buffer_dirty(ubh); + if (free_count == 0) { + frag_to_free = tmp; + free_count = uspi->s_fpb; + } else if (free_count > 0 && frag_to_free == tmp - free_count) + free_count += uspi->s_fpb; + else { + ufs_free_blocks (inode, frag_to_free, free_count); + frag_to_free = tmp; + free_count = uspi->s_fpb; + } + + mark_inode_dirty(inode); + } + + if (free_count > 0) { + ufs_free_blocks (inode, frag_to_free, free_count); + } } if (free_it) { tmp = ufs_data_ptr_to_cpu(sb, p); @@ -1216,14 +1134,14 @@ static void ufs_trunc_tindirect(struct inode *inode, unsigned *offsets, int dept ufs_data_ptr_clear(uspi, p); write_sequnlock(&ufsi->meta_lock); - ubh_bforget(tind_bh); + ubh_bforget(ubh); ufs_free_blocks(inode, tmp, 
uspi->s_fpb); mark_inode_dirty(inode); return; } - if (IS_SYNC(inode) && ubh_buffer_dirty(tind_bh)) - ubh_sync_block(tind_bh); - ubh_brelse (tind_bh); + if (IS_SYNC(inode) && ubh_buffer_dirty(ubh)) + ubh_sync_block(ubh); + ubh_brelse(ubh); } static int ufs_alloc_lastblock(struct inode *inode, loff_t size) @@ -1317,26 +1235,30 @@ static void __ufs_truncate_blocks(struct inode *inode) switch (depth) { case 1: ufs_trunc_direct(inode); - ufs_trunc_indirect(inode, NULL, 0, + ufs_trunc_branch(inode, NULL, 0, 1, ufs_get_direct_data_ptr(uspi, ufsi, UFS_IND_BLOCK)); - ufs_trunc_dindirect(inode, NULL, 0, + ufs_trunc_branch(inode, NULL, 0, 2, ufs_get_direct_data_ptr(uspi, ufsi, UFS_DIND_BLOCK)); - ufs_trunc_tindirect(inode, NULL, 0); + ufs_trunc_branch(inode, NULL, 0, 3, + ufs_get_direct_data_ptr(uspi, ufsi, UFS_TIND_BLOCK)); break; case 2: - ufs_trunc_indirect(inode, offsets + 1, depth2, + ufs_trunc_branch(inode, offsets + 1, depth2, 1, ufs_get_direct_data_ptr(uspi, ufsi, UFS_IND_BLOCK)); - ufs_trunc_dindirect(inode, NULL, 0, + ufs_trunc_branch(inode, NULL, 0, 2, ufs_get_direct_data_ptr(uspi, ufsi, UFS_DIND_BLOCK)); - ufs_trunc_tindirect(inode, NULL, 0); + ufs_trunc_branch(inode, NULL, 0, 3, + ufs_get_direct_data_ptr(uspi, ufsi, UFS_TIND_BLOCK)); break; case 3: - ufs_trunc_dindirect(inode, offsets + 1, depth2, + ufs_trunc_branch(inode, offsets + 1, depth2, 2, ufs_get_direct_data_ptr(uspi, ufsi, UFS_DIND_BLOCK)); - ufs_trunc_tindirect(inode, NULL, 0); + ufs_trunc_branch(inode, NULL, 0, 3, + ufs_get_direct_data_ptr(uspi, ufsi, UFS_TIND_BLOCK)); break; case 4: - ufs_trunc_tindirect(inode, offsets + 1, depth2); + ufs_trunc_branch(inode, offsets + 1, depth2, 3, + ufs_get_direct_data_ptr(uspi, ufsi, UFS_TIND_BLOCK)); } ufsi->i_lastfrag = DIRECT_FRAGMENT; mutex_unlock(&ufsi->truncate_mutex); From ef3a315d4ca179fd0b56597e695cd262a8b559b7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Jun 2015 15:42:25 -0400 Subject: [PATCH 22/65] __ufs_truncate_blocks(): unify freeing the full 
branches Signed-off-by: Al Viro --- fs/ufs/inode.c | 29 ++++++++++++++--------------- 1 file changed, 14 insertions(+), 15 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 0d57c41b7705..1427d277a690 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -1235,31 +1235,30 @@ static void __ufs_truncate_blocks(struct inode *inode) switch (depth) { case 1: ufs_trunc_direct(inode); - ufs_trunc_branch(inode, NULL, 0, 1, - ufs_get_direct_data_ptr(uspi, ufsi, UFS_IND_BLOCK)); - ufs_trunc_branch(inode, NULL, 0, 2, - ufs_get_direct_data_ptr(uspi, ufsi, UFS_DIND_BLOCK)); - ufs_trunc_branch(inode, NULL, 0, 3, - ufs_get_direct_data_ptr(uspi, ufsi, UFS_TIND_BLOCK)); - break; + goto l1; case 2: ufs_trunc_branch(inode, offsets + 1, depth2, 1, ufs_get_direct_data_ptr(uspi, ufsi, UFS_IND_BLOCK)); - ufs_trunc_branch(inode, NULL, 0, 2, - ufs_get_direct_data_ptr(uspi, ufsi, UFS_DIND_BLOCK)); - ufs_trunc_branch(inode, NULL, 0, 3, - ufs_get_direct_data_ptr(uspi, ufsi, UFS_TIND_BLOCK)); - break; + goto l2; case 3: ufs_trunc_branch(inode, offsets + 1, depth2, 2, ufs_get_direct_data_ptr(uspi, ufsi, UFS_DIND_BLOCK)); - ufs_trunc_branch(inode, NULL, 0, 3, - ufs_get_direct_data_ptr(uspi, ufsi, UFS_TIND_BLOCK)); - break; + goto l3; case 4: ufs_trunc_branch(inode, offsets + 1, depth2, 3, ufs_get_direct_data_ptr(uspi, ufsi, UFS_TIND_BLOCK)); + goto l4; } +l1: + ufs_trunc_branch(inode, NULL, 0, 1, + ufs_get_direct_data_ptr(uspi, ufsi, UFS_IND_BLOCK)); +l2: + ufs_trunc_branch(inode, NULL, 0, 2, + ufs_get_direct_data_ptr(uspi, ufsi, UFS_DIND_BLOCK)); +l3: + ufs_trunc_branch(inode, NULL, 0, 3, + ufs_get_direct_data_ptr(uspi, ufsi, UFS_TIND_BLOCK)); +l4: ufsi->i_lastfrag = DIRECT_FRAGMENT; mutex_unlock(&ufsi->truncate_mutex); } From 42432739b5902f72011f701f5cd5b4227ebe991c Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Jun 2015 15:47:17 -0400 Subject: [PATCH 23/65] __ufs_trunc_blocks(): turn the part after switch into a loop ... 
and turn the switch into if (), since all cases with depth != 1 have just become identical. Signed-off-by: Al Viro --- fs/ufs/inode.c | 35 ++++++++++------------------------- 1 file changed, 10 insertions(+), 25 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 1427d277a690..285eacd02d60 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -1222,6 +1222,7 @@ static void __ufs_truncate_blocks(struct inode *inode) unsigned offsets[4]; int depth = ufs_block_to_path(inode, DIRECT_BLOCK, offsets); int depth2; + unsigned i; if (!depth) return; @@ -1232,33 +1233,17 @@ static void __ufs_truncate_blocks(struct inode *inode) break; mutex_lock(&ufsi->truncate_mutex); - switch (depth) { - case 1: + if (depth == 1) { ufs_trunc_direct(inode); - goto l1; - case 2: - ufs_trunc_branch(inode, offsets + 1, depth2, 1, - ufs_get_direct_data_ptr(uspi, ufsi, UFS_IND_BLOCK)); - goto l2; - case 3: - ufs_trunc_branch(inode, offsets + 1, depth2, 2, - ufs_get_direct_data_ptr(uspi, ufsi, UFS_DIND_BLOCK)); - goto l3; - case 4: - ufs_trunc_branch(inode, offsets + 1, depth2, 3, - ufs_get_direct_data_ptr(uspi, ufsi, UFS_TIND_BLOCK)); - goto l4; + offsets[0] = UFS_IND_BLOCK; + } else { + ufs_trunc_branch(inode, offsets + 1, depth2, depth - 1, + ufs_get_direct_data_ptr(uspi, ufsi, offsets[0]++)); + } + for (i = offsets[0]; i <= UFS_TIND_BLOCK; i++) { + ufs_trunc_branch(inode, NULL, 0, i - UFS_IND_BLOCK + 1, + ufs_get_direct_data_ptr(uspi, ufsi, i)); } -l1: - ufs_trunc_branch(inode, NULL, 0, 1, - ufs_get_direct_data_ptr(uspi, ufsi, UFS_IND_BLOCK)); -l2: - ufs_trunc_branch(inode, NULL, 0, 2, - ufs_get_direct_data_ptr(uspi, ufsi, UFS_DIND_BLOCK)); -l3: - ufs_trunc_branch(inode, NULL, 0, 3, - ufs_get_direct_data_ptr(uspi, ufsi, UFS_TIND_BLOCK)); -l4: ufsi->i_lastfrag = DIRECT_FRAGMENT; mutex_unlock(&ufsi->truncate_mutex); } From 97e0f8f87c918620689ce542664a3115b752649d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Jun 2015 16:05:18 -0400 Subject: [PATCH 24/65] ufs_trunc_branch(): 
never call with offsets != NULL && depth2 == 0 For calls in __ufs_truncate_blocks() it's just a matter of not incrementing offsets[0] and not making that call - immediately following loop will be executed one extra time and we'll be just fine. For recursive call in ufs_trunc_branch() itself, just assign NULL to offsets if we would be about to make such call. Signed-off-by: Al Viro --- fs/ufs/inode.c | 9 ++++++--- 1 file changed, 6 insertions(+), 3 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 285eacd02d60..9e409c12afdf 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -1075,7 +1075,7 @@ static void ufs_trunc_branch(struct inode *inode, unsigned *offsets, int depth2, struct ufs_inode_info *ufsi = UFS_I(inode); struct ufs_buffer_head *ubh; u64 tmp; - bool free_it = !offsets || !depth2; + bool free_it = !offsets; unsigned from = offsets ? *offsets++ : 0; unsigned i; @@ -1091,9 +1091,11 @@ static void ufs_trunc_branch(struct inode *inode, unsigned *offsets, int depth2, } if (--depth) { + if (!--depth2) + offsets = NULL; for (i = from ; i < uspi->s_apb ; i++, offsets = NULL) { void *ind = ubh_get_data_ptr(uspi, ubh, i); - ufs_trunc_branch(inode, offsets, depth2 - 1, depth, ind); + ufs_trunc_branch(inode, offsets, depth2, depth, ind); ubh_mark_buffer_dirty(ubh); } } else { @@ -1237,7 +1239,8 @@ static void __ufs_truncate_blocks(struct inode *inode) ufs_trunc_direct(inode); offsets[0] = UFS_IND_BLOCK; } else { - ufs_trunc_branch(inode, offsets + 1, depth2, depth - 1, + if (depth2) + ufs_trunc_branch(inode, offsets + 1, depth2, depth - 1, ufs_get_direct_data_ptr(uspi, ufsi, offsets[0]++)); } for (i = offsets[0]; i <= UFS_TIND_BLOCK; i++) { From a96574233c5d2e50736d83abf65161ec5fa55852 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Jun 2015 16:13:56 -0400 Subject: [PATCH 25/65] ufs_trunc_branch(): separate the calls with non-NULL offsets Signed-off-by: Al Viro --- fs/ufs/inode.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff
--git a/fs/ufs/inode.c b/fs/ufs/inode.c index 9e409c12afdf..480c34ee1805 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -1091,13 +1091,16 @@ static void ufs_trunc_branch(struct inode *inode, unsigned *offsets, int depth2, } if (--depth) { - if (!--depth2) - offsets = NULL; - for (i = from ; i < uspi->s_apb ; i++, offsets = NULL) { - void *ind = ubh_get_data_ptr(uspi, ubh, i); + if (offsets && --depth2) { + void *ind = ubh_get_data_ptr(uspi, ubh, from++); ufs_trunc_branch(inode, offsets, depth2, depth, ind); ubh_mark_buffer_dirty(ubh); } + for (i = from ; i < uspi->s_apb ; i++) { + void *ind = ubh_get_data_ptr(uspi, ubh, i); + ufs_trunc_branch(inode, NULL, 0, depth, ind); + ubh_mark_buffer_dirty(ubh); + } } else { u64 frag_to_free = 0; unsigned free_count = 0; From a138b4b688c10eb82044451b81534c382d1cddbd Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Jun 2015 02:18:54 -0400 Subject: [PATCH 26/65] ufs: unify the logics for collecting adjacent data blocks to free open-coded in several places... 
Signed-off-by: Al Viro --- fs/ufs/inode.c | 56 ++++++++++++++++++++------------------------------ 1 file changed, 22 insertions(+), 34 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 480c34ee1805..39de7782b7c5 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -960,6 +960,22 @@ void ufs_evict_inode(struct inode * inode) ufs_free_inode(inode); } +struct to_free { + struct inode *inode; + u64 to; + unsigned count; +}; + +static inline void free_data(struct to_free *ctx, u64 from, unsigned count) +{ + if (ctx->count && ctx->to != from) { + ufs_free_blocks(ctx->inode, ctx->to - ctx->count, ctx->count); + ctx->count = 0; + } + ctx->count += count; + ctx->to = from + count; +} + #define DIRECT_BLOCK ((inode->i_size + uspi->s_bsize - 1) >> uspi->s_bshift) #define DIRECT_FRAGMENT ((inode->i_size + uspi->s_fsize - 1) >> uspi->s_fshift) @@ -970,7 +986,7 @@ static void ufs_trunc_direct(struct inode *inode) struct ufs_sb_private_info * uspi; void *p; u64 frag1, frag2, frag3, frag4, block1, block2; - unsigned frag_to_free, free_count; + struct to_free ctx = {.inode = inode}; unsigned i, tmp; UFSD("ENTER: ino %lu\n", inode->i_ino); @@ -978,9 +994,6 @@ static void ufs_trunc_direct(struct inode *inode) sb = inode->i_sb; uspi = UFS_SB(sb)->s_uspi; - frag_to_free = 0; - free_count = 0; - frag1 = DIRECT_FRAGMENT; frag4 = min_t(u64, UFS_NDIR_FRAGMENT, ufsi->i_lastfrag); frag2 = ((frag1 & uspi->s_fpbmask) ? 
((frag1 | uspi->s_fpbmask) + 1) : frag1); @@ -1015,7 +1028,6 @@ static void ufs_trunc_direct(struct inode *inode) ufs_free_fragments(inode, tmp + frag1, frag2); mark_inode_dirty(inode); - frag_to_free = tmp + frag1; next1: /* @@ -1030,21 +1042,11 @@ next1: ufs_data_ptr_clear(uspi, p); write_sequnlock(&ufsi->meta_lock); - if (free_count == 0) { - frag_to_free = tmp; - free_count = uspi->s_fpb; - } else if (free_count > 0 && frag_to_free == tmp - free_count) - free_count += uspi->s_fpb; - else { - ufs_free_blocks (inode, frag_to_free, free_count); - frag_to_free = tmp; - free_count = uspi->s_fpb; - } + free_data(&ctx, tmp, uspi->s_fpb); mark_inode_dirty(inode); } - if (free_count > 0) - ufs_free_blocks (inode, frag_to_free, free_count); + free_data(&ctx, 0, 0); if (frag3 >= frag4) goto next3; @@ -1102,8 +1104,7 @@ static void ufs_trunc_branch(struct inode *inode, unsigned *offsets, int depth2, ubh_mark_buffer_dirty(ubh); } } else { - u64 frag_to_free = 0; - unsigned free_count = 0; + struct to_free ctx = {.inode = inode}; for (i = from; i < uspi->s_apb; i++) { void *ind = ubh_get_data_ptr(uspi, ubh, i); @@ -1115,23 +1116,10 @@ static void ufs_trunc_branch(struct inode *inode, unsigned *offsets, int depth2, ufs_data_ptr_clear(uspi, ind); write_sequnlock(&UFS_I(inode)->meta_lock); ubh_mark_buffer_dirty(ubh); - if (free_count == 0) { - frag_to_free = tmp; - free_count = uspi->s_fpb; - } else if (free_count > 0 && frag_to_free == tmp - free_count) - free_count += uspi->s_fpb; - else { - ufs_free_blocks (inode, frag_to_free, free_count); - frag_to_free = tmp; - free_count = uspi->s_fpb; - } - + free_data(&ctx, tmp, uspi->s_fpb); mark_inode_dirty(inode); } - - if (free_count > 0) { - ufs_free_blocks (inode, frag_to_free, free_count); - } + free_data(&ctx, 0, 0); } if (free_it) { tmp = ufs_data_ptr_to_cpu(sb, p); From 6d1ebbca2b2fe516ff5f279848cffbd23d2b0270 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Jun 2015 17:11:49 -0400 Subject: [PATCH 27/65] split 
ufs_truncate_branch() into full- and partial-branch variants Signed-off-by: Al Viro --- fs/ufs/inode.c | 76 +++++++++++++++++++++++++++++++++++++++----------- 1 file changed, 59 insertions(+), 17 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 39de7782b7c5..c06556558c9b 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -1070,15 +1070,13 @@ next1: UFSD("EXIT: ino %lu\n", inode->i_ino); } -static void ufs_trunc_branch(struct inode *inode, unsigned *offsets, int depth2, int depth, void *p) +static void free_full_branch(struct inode *inode, int depth, void *p) { struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; struct ufs_inode_info *ufsi = UFS_I(inode); struct ufs_buffer_head *ubh; u64 tmp; - bool free_it = !offsets; - unsigned from = offsets ? *offsets++ : 0; unsigned i; tmp = ufs_data_ptr_to_cpu(sb, p); @@ -1093,14 +1091,69 @@ static void ufs_trunc_branch(struct inode *inode, unsigned *offsets, int depth2, } if (--depth) { - if (offsets && --depth2) { + for (i = 0 ; i < uspi->s_apb ; i++) { + void *ind = ubh_get_data_ptr(uspi, ubh, i); + free_full_branch(inode, depth, ind); + ubh_mark_buffer_dirty(ubh); + } + } else { + struct to_free ctx = {.inode = inode}; + + for (i = 0; i < uspi->s_apb; i++) { + void *ind = ubh_get_data_ptr(uspi, ubh, i); + tmp = ufs_data_ptr_to_cpu(sb, ind); + if (!tmp) + continue; + + write_seqlock(&UFS_I(inode)->meta_lock); + ufs_data_ptr_clear(uspi, ind); + write_sequnlock(&UFS_I(inode)->meta_lock); + ubh_mark_buffer_dirty(ubh); + free_data(&ctx, tmp, uspi->s_fpb); + mark_inode_dirty(inode); + } + free_data(&ctx, 0, 0); + } + tmp = ufs_data_ptr_to_cpu(sb, p); + write_seqlock(&ufsi->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&ufsi->meta_lock); + + ubh_bforget(ubh); + ufs_free_blocks(inode, tmp, uspi->s_fpb); + mark_inode_dirty(inode); +} + +static void ufs_trunc_branch(struct inode *inode, unsigned *offsets, int depth2, int depth, void *p) +{ + struct super_block 
*sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + struct ufs_inode_info *ufsi = UFS_I(inode); + struct ufs_buffer_head *ubh; + u64 tmp; + unsigned from = *offsets++; + unsigned i; + + tmp = ufs_data_ptr_to_cpu(sb, p); + if (!tmp) + return; + ubh = ubh_bread (sb, tmp, uspi->s_bsize); + if (!ubh) { + write_seqlock(&ufsi->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&ufsi->meta_lock); + return; + } + + if (--depth) { + if (--depth2) { void *ind = ubh_get_data_ptr(uspi, ubh, from++); ufs_trunc_branch(inode, offsets, depth2, depth, ind); ubh_mark_buffer_dirty(ubh); } for (i = from ; i < uspi->s_apb ; i++) { void *ind = ubh_get_data_ptr(uspi, ubh, i); - ufs_trunc_branch(inode, NULL, 0, depth, ind); + free_full_branch(inode, depth, ind); ubh_mark_buffer_dirty(ubh); } } else { @@ -1121,17 +1174,6 @@ static void ufs_trunc_branch(struct inode *inode, unsigned *offsets, int depth2, } free_data(&ctx, 0, 0); } - if (free_it) { - tmp = ufs_data_ptr_to_cpu(sb, p); - write_seqlock(&ufsi->meta_lock); - ufs_data_ptr_clear(uspi, p); - write_sequnlock(&ufsi->meta_lock); - - ubh_bforget(ubh); - ufs_free_blocks(inode, tmp, uspi->s_fpb); - mark_inode_dirty(inode); - return; - } if (IS_SYNC(inode) && ubh_buffer_dirty(ubh)) ubh_sync_block(ubh); ubh_brelse(ubh); @@ -1235,7 +1277,7 @@ static void __ufs_truncate_blocks(struct inode *inode) ufs_get_direct_data_ptr(uspi, ufsi, offsets[0]++)); } for (i = offsets[0]; i <= UFS_TIND_BLOCK; i++) { - ufs_trunc_branch(inode, NULL, 0, i - UFS_IND_BLOCK + 1, + free_full_branch(inode, i - UFS_IND_BLOCK + 1, ufs_get_direct_data_ptr(uspi, ufsi, i)); } ufsi->i_lastfrag = DIRECT_FRAGMENT; From 6aab6dd37946d0d592105872bd533bb7d2931f3f Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Jun 2015 17:23:34 -0400 Subject: [PATCH 28/65] ufs_trunc_branch(): massage towards killing recursion We always have 0 < depth2 <= depth in there, so if (--depth) { if (--depth2) A B } else { C // not using depth2 } D // not using 
depth2 is equivalent to if (--depth2) A with s/depth/depth - 1/ if (--depth) B else C D Signed-off-by: Al Viro --- fs/ufs/inode.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index c06556558c9b..dac81c318da7 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -1145,12 +1145,12 @@ static void ufs_trunc_branch(struct inode *inode, unsigned *offsets, int depth2, return; } + if (--depth2) { + void *ind = ubh_get_data_ptr(uspi, ubh, from++); + ufs_trunc_branch(inode, offsets, depth2, depth - 1, ind); + ubh_mark_buffer_dirty(ubh); + } if (--depth) { - if (--depth2) { - void *ind = ubh_get_data_ptr(uspi, ubh, from++); - ufs_trunc_branch(inode, offsets, depth2, depth, ind); - ubh_mark_buffer_dirty(ubh); - } for (i = from ; i < uspi->s_apb ; i++) { void *ind = ubh_get_data_ptr(uspi, ubh, i); free_full_branch(inode, depth, ind); From 7b4e4f7f815db0059150a12542b28c787e19c0d7 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Jun 2015 19:13:02 -0400 Subject: [PATCH 29/65] ufs_trunc_branch(): kill recursion turn recursion into a pair of loops Signed-off-by: Al Viro --- fs/ufs/inode.c | 52 +++++++++++++++++++++++++------------------------- 1 file changed, 26 insertions(+), 26 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index dac81c318da7..314caad56d83 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -1124,34 +1124,14 @@ static void free_full_branch(struct inode *inode, int depth, void *p) mark_inode_dirty(inode); } -static void ufs_trunc_branch(struct inode *inode, unsigned *offsets, int depth2, int depth, void *p) +static void free_branch_tail(struct inode *inode, unsigned from, struct ufs_buffer_head *ubh, int depth) { struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - struct ufs_inode_info *ufsi = UFS_I(inode); - struct ufs_buffer_head *ubh; - u64 tmp; - unsigned from = *offsets++; unsigned i; - tmp = ufs_data_ptr_to_cpu(sb, p); - if (!tmp) - return; 
- ubh = ubh_bread (sb, tmp, uspi->s_bsize); - if (!ubh) { - write_seqlock(&ufsi->meta_lock); - ufs_data_ptr_clear(uspi, p); - write_sequnlock(&ufsi->meta_lock); - return; - } - - if (--depth2) { - void *ind = ubh_get_data_ptr(uspi, ubh, from++); - ufs_trunc_branch(inode, offsets, depth2, depth - 1, ind); - ubh_mark_buffer_dirty(ubh); - } if (--depth) { - for (i = from ; i < uspi->s_apb ; i++) { + for (i = from; i < uspi->s_apb ; i++) { void *ind = ubh_get_data_ptr(uspi, ubh, i); free_full_branch(inode, depth, ind); ubh_mark_buffer_dirty(ubh); @@ -1161,7 +1141,7 @@ static void ufs_trunc_branch(struct inode *inode, unsigned *offsets, int depth2, for (i = from; i < uspi->s_apb; i++) { void *ind = ubh_get_data_ptr(uspi, ubh, i); - tmp = ufs_data_ptr_to_cpu(sb, ind); + u64 tmp = ufs_data_ptr_to_cpu(sb, ind); if (!tmp) continue; @@ -1258,6 +1238,9 @@ static void __ufs_truncate_blocks(struct inode *inode) int depth = ufs_block_to_path(inode, DIRECT_BLOCK, offsets); int depth2; unsigned i; + struct ufs_buffer_head *ubh[3]; + void *p; + u64 block; if (!depth) return; @@ -1272,9 +1255,26 @@ static void __ufs_truncate_blocks(struct inode *inode) ufs_trunc_direct(inode); offsets[0] = UFS_IND_BLOCK; } else { - if (depth2) - ufs_trunc_branch(inode, offsets + 1, depth2, depth - 1, - ufs_get_direct_data_ptr(uspi, ufsi, offsets[0]++)); + /* get the blocks that should be partially emptied */ + p = ufs_get_direct_data_ptr(uspi, ufsi, offsets[0]); + for (i = 0; i < depth2; i++) { + offsets[i]++; /* next branch is fully freed */ + block = ufs_data_ptr_to_cpu(sb, p); + if (!block) + break; + ubh[i] = ubh_bread(sb, block, uspi->s_bsize); + if (!ubh[i]) { + write_seqlock(&ufsi->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&ufsi->meta_lock); + break; + } + p = ubh_get_data_ptr(uspi, ubh[i], offsets[i + 1]); + } + while (i--) { + ubh_mark_buffer_dirty(ubh[i]); + free_branch_tail(inode, offsets[i + 1], ubh[i], depth - i - 1); + } } for (i = offsets[0]; i <= UFS_TIND_BLOCK; 
i++) { free_full_branch(inode, i - UFS_IND_BLOCK + 1, From 163073db51930d1f9c2960b8e5660c269164f29b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Jun 2015 20:07:08 -0400 Subject: [PATCH 30/65] free_full_branch(): saner calling conventions Have caller fetch the block number *and* remove it from wherever it was. Pass the block number instead. Signed-off-by: Al Viro --- fs/ufs/inode.c | 100 +++++++++++++++++++++++++------------------------ 1 file changed, 51 insertions(+), 49 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 314caad56d83..efe71e5acb00 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -1070,57 +1070,48 @@ next1: UFSD("EXIT: ino %lu\n", inode->i_ino); } -static void free_full_branch(struct inode *inode, int depth, void *p) +static void free_full_branch(struct inode *inode, u64 ind_block, int depth) { struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - struct ufs_inode_info *ufsi = UFS_I(inode); - struct ufs_buffer_head *ubh; - u64 tmp; + struct ufs_buffer_head *ubh = ubh_bread(sb, ind_block, uspi->s_bsize); unsigned i; - tmp = ufs_data_ptr_to_cpu(sb, p); - if (!tmp) + if (!ubh) return; - ubh = ubh_bread (sb, tmp, uspi->s_bsize); - if (!ubh) { - write_seqlock(&ufsi->meta_lock); - ufs_data_ptr_clear(uspi, p); - write_sequnlock(&ufsi->meta_lock); - return; - } if (--depth) { - for (i = 0 ; i < uspi->s_apb ; i++) { - void *ind = ubh_get_data_ptr(uspi, ubh, i); - free_full_branch(inode, depth, ind); - ubh_mark_buffer_dirty(ubh); + for (i = 0; i < uspi->s_apb; i++) { + void *p = ubh_get_data_ptr(uspi, ubh, i); + u64 block = ufs_data_ptr_to_cpu(sb, p); + if (block) { + write_seqlock(&UFS_I(inode)->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&UFS_I(inode)->meta_lock); + free_full_branch(inode, block, depth); + ubh_mark_buffer_dirty(ubh); + } } } else { struct to_free ctx = {.inode = inode}; for (i = 0; i < uspi->s_apb; i++) { - void *ind = ubh_get_data_ptr(uspi, ubh, i); - tmp = 
ufs_data_ptr_to_cpu(sb, ind); - if (!tmp) - continue; - - write_seqlock(&UFS_I(inode)->meta_lock); - ufs_data_ptr_clear(uspi, ind); - write_sequnlock(&UFS_I(inode)->meta_lock); - ubh_mark_buffer_dirty(ubh); - free_data(&ctx, tmp, uspi->s_fpb); - mark_inode_dirty(inode); + void *p = ubh_get_data_ptr(uspi, ubh, i); + u64 block = ufs_data_ptr_to_cpu(sb, p); + if (block) { + write_seqlock(&UFS_I(inode)->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&UFS_I(inode)->meta_lock); + ubh_mark_buffer_dirty(ubh); + free_data(&ctx, block, uspi->s_fpb); + mark_inode_dirty(inode); + } } free_data(&ctx, 0, 0); } - tmp = ufs_data_ptr_to_cpu(sb, p); - write_seqlock(&ufsi->meta_lock); - ufs_data_ptr_clear(uspi, p); - write_sequnlock(&ufsi->meta_lock); ubh_bforget(ubh); - ufs_free_blocks(inode, tmp, uspi->s_fpb); + ufs_free_blocks(inode, ind_block, uspi->s_fpb); mark_inode_dirty(inode); } @@ -1132,25 +1123,30 @@ static void free_branch_tail(struct inode *inode, unsigned from, struct ufs_buff if (--depth) { for (i = from; i < uspi->s_apb ; i++) { - void *ind = ubh_get_data_ptr(uspi, ubh, i); - free_full_branch(inode, depth, ind); - ubh_mark_buffer_dirty(ubh); + void *p = ubh_get_data_ptr(uspi, ubh, i); + u64 block = ufs_data_ptr_to_cpu(sb, p); + if (block) { + write_seqlock(&UFS_I(inode)->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&UFS_I(inode)->meta_lock); + ubh_mark_buffer_dirty(ubh); + free_full_branch(inode, block, depth); + } } } else { struct to_free ctx = {.inode = inode}; for (i = from; i < uspi->s_apb; i++) { - void *ind = ubh_get_data_ptr(uspi, ubh, i); - u64 tmp = ufs_data_ptr_to_cpu(sb, ind); - if (!tmp) - continue; - - write_seqlock(&UFS_I(inode)->meta_lock); - ufs_data_ptr_clear(uspi, ind); - write_sequnlock(&UFS_I(inode)->meta_lock); - ubh_mark_buffer_dirty(ubh); - free_data(&ctx, tmp, uspi->s_fpb); - mark_inode_dirty(inode); + void *p = ubh_get_data_ptr(uspi, ubh, i); + u64 block = ufs_data_ptr_to_cpu(sb, p); + if (block) { + 
write_seqlock(&UFS_I(inode)->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&UFS_I(inode)->meta_lock); + ubh_mark_buffer_dirty(ubh); + free_data(&ctx, block, uspi->s_fpb); + mark_inode_dirty(inode); + } } free_data(&ctx, 0, 0); } @@ -1277,8 +1273,14 @@ static void __ufs_truncate_blocks(struct inode *inode) } } for (i = offsets[0]; i <= UFS_TIND_BLOCK; i++) { - free_full_branch(inode, i - UFS_IND_BLOCK + 1, - ufs_get_direct_data_ptr(uspi, ufsi, i)); + p = ufs_get_direct_data_ptr(uspi, ufsi, i); + block = ufs_data_ptr_to_cpu(sb, p); + if (block) { + write_seqlock(&ufsi->meta_lock); + ufs_data_ptr_clear(uspi, p); + write_sequnlock(&ufsi->meta_lock); + free_full_branch(inode, block, i - UFS_IND_BLOCK + 1); + } } ufsi->i_lastfrag = DIRECT_FRAGMENT; mutex_unlock(&ufsi->truncate_mutex); From b6eede0ec642d1be17065110718cb4f4ed7ba5e0 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Jun 2015 20:09:39 -0400 Subject: [PATCH 31/65] move marking inode dirty to the end of __ufs_truncate_blocks() Signed-off-by: Al Viro --- fs/ufs/inode.c | 7 +------ 1 file changed, 1 insertion(+), 6 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index efe71e5acb00..26835a80f7dd 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -1027,7 +1027,6 @@ static void ufs_trunc_direct(struct inode *inode) frag1 = ufs_fragnum (frag1); ufs_free_fragments(inode, tmp + frag1, frag2); - mark_inode_dirty(inode); next1: /* @@ -1043,7 +1042,6 @@ next1: write_sequnlock(&ufsi->meta_lock); free_data(&ctx, tmp, uspi->s_fpb); - mark_inode_dirty(inode); } free_data(&ctx, 0, 0); @@ -1064,7 +1062,6 @@ next1: write_sequnlock(&ufsi->meta_lock); ufs_free_fragments (inode, tmp, frag4); - mark_inode_dirty(inode); next3: UFSD("EXIT: ino %lu\n", inode->i_ino); @@ -1104,7 +1101,6 @@ static void free_full_branch(struct inode *inode, u64 ind_block, int depth) write_sequnlock(&UFS_I(inode)->meta_lock); ubh_mark_buffer_dirty(ubh); free_data(&ctx, block, uspi->s_fpb); - mark_inode_dirty(inode); } } 
free_data(&ctx, 0, 0); @@ -1112,7 +1108,6 @@ static void free_full_branch(struct inode *inode, u64 ind_block, int depth) ubh_bforget(ubh); ufs_free_blocks(inode, ind_block, uspi->s_fpb); - mark_inode_dirty(inode); } static void free_branch_tail(struct inode *inode, unsigned from, struct ufs_buffer_head *ubh, int depth) @@ -1145,7 +1140,6 @@ static void free_branch_tail(struct inode *inode, unsigned from, struct ufs_buff write_sequnlock(&UFS_I(inode)->meta_lock); ubh_mark_buffer_dirty(ubh); free_data(&ctx, block, uspi->s_fpb); - mark_inode_dirty(inode); } } free_data(&ctx, 0, 0); @@ -1283,6 +1277,7 @@ static void __ufs_truncate_blocks(struct inode *inode) } } ufsi->i_lastfrag = DIRECT_FRAGMENT; + mark_inode_dirty(inode); mutex_unlock(&ufsi->truncate_mutex); } From cc7231e30916f5326bdde55a7a4c59431e15bc1b Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Jun 2015 20:14:02 -0400 Subject: [PATCH 32/65] free_full_branch(): don't bother modifying the block we are going to free Note that it's already made unreachable from the inode, so we don't have to worry about ufs_frag_map() walking into something already freed. 
Signed-off-by: Al Viro --- fs/ufs/inode.c | 14 ++------------ 1 file changed, 2 insertions(+), 12 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 26835a80f7dd..424949f459c8 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -1081,13 +1081,8 @@ static void free_full_branch(struct inode *inode, u64 ind_block, int depth) for (i = 0; i < uspi->s_apb; i++) { void *p = ubh_get_data_ptr(uspi, ubh, i); u64 block = ufs_data_ptr_to_cpu(sb, p); - if (block) { - write_seqlock(&UFS_I(inode)->meta_lock); - ufs_data_ptr_clear(uspi, p); - write_sequnlock(&UFS_I(inode)->meta_lock); + if (block) free_full_branch(inode, block, depth); - ubh_mark_buffer_dirty(ubh); - } } } else { struct to_free ctx = {.inode = inode}; @@ -1095,13 +1090,8 @@ static void free_full_branch(struct inode *inode, u64 ind_block, int depth) for (i = 0; i < uspi->s_apb; i++) { void *p = ubh_get_data_ptr(uspi, ubh, i); u64 block = ufs_data_ptr_to_cpu(sb, p); - if (block) { - write_seqlock(&UFS_I(inode)->meta_lock); - ufs_data_ptr_clear(uspi, p); - write_sequnlock(&UFS_I(inode)->meta_lock); - ubh_mark_buffer_dirty(ubh); + if (block) free_data(&ctx, block, uspi->s_fpb); - } } free_data(&ctx, 0, 0); } From f53bd1421b3eb84375e9e6964665d23d4190400d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Jun 2015 20:17:32 -0400 Subject: [PATCH 33/65] __ufs_truncate_blocks(): avoid excessive dirtying of indirect blocks There's a case when an indirect block gets dirtied for no good reason - when there's a hole starting in the middle of area covered by it and spanning past its end, and truncate() is done precisely to the beginning of the hole. The block is obviously not modified at all - all removals happen beyond it. However, existing code ends up dirtying it just in case. It's trivial to fix and while it's not a real bug by any stretch of imagination, it makes the damn thing harder to follow. 
Signed-off-by: Al Viro --- fs/ufs/inode.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 424949f459c8..86cc1eea0fb2 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -1251,10 +1251,8 @@ static void __ufs_truncate_blocks(struct inode *inode) } p = ubh_get_data_ptr(uspi, ubh[i], offsets[i + 1]); } - while (i--) { - ubh_mark_buffer_dirty(ubh[i]); + while (i--) free_branch_tail(inode, offsets[i + 1], ubh[i], depth - i - 1); - } } for (i = offsets[0]; i <= UFS_TIND_BLOCK; i++) { p = ufs_get_direct_data_ptr(uspi, ufsi, i); From 5a39c25562aa5eab5a798919855cf41ddeed8b0d Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 18 Jun 2015 22:39:46 -0400 Subject: [PATCH 34/65] ufs_inode_get{frag,block}(): get rid of retries We are holding ->truncate_mutex, so nobody else can alter our block pointers. Rechecks/retries were needed back when we only held BKL there, and had to cope with write_begin/writepage and writepage/truncate races. Can't happen anymore... 
Signed-off-by: Al Viro --- fs/ufs/inode.c | 43 ++++++++----------------------------------- 1 file changed, 8 insertions(+), 35 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 86cc1eea0fb2..95cb0a8f5ec9 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -248,20 +248,12 @@ ufs_inode_getfrag(struct inode *inode, u64 fragment, goal = 0; -repeat: tmp = ufs_data_ptr_to_cpu(sb, p); lastfrag = ufsi->i_lastfrag; if (tmp && fragment < lastfrag) { if (!phys) { - result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); - if (tmp == ufs_data_ptr_to_cpu(sb, p)) { - UFSD("EXIT, result %llu\n", - (unsigned long long)tmp + blockoff); - return result; - } - brelse (result); - goto repeat; + return sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); } else { *phys = uspi->s_sbbase + tmp + blockoff; return NULL; @@ -283,14 +275,9 @@ repeat: ufs_data_ptr_to_cpu(sb, p2), uspi->s_fpb - lastblockoff, err, locked_page); - if (!tmp) { - if (lastfrag != ufsi->i_lastfrag) - goto repeat; - else - return NULL; - } + if (!tmp) + return NULL; lastfrag = ufsi->i_lastfrag; - } tmp = ufs_data_ptr_to_cpu(sb, ufs_get_direct_data_ptr(uspi, ufsi, @@ -325,9 +312,6 @@ repeat: phys != NULL ? locked_page : NULL); } if (!tmp) { - if ((!blockoff && ufs_data_ptr_to_cpu(sb, p)) || - (blockoff && lastfrag != ufsi->i_lastfrag)) - goto repeat; *err = -ENOSPC; return NULL; } @@ -345,7 +329,6 @@ repeat: if (IS_SYNC(inode)) ufs_sync_inode (inode); mark_inode_dirty(inode); - UFSD("EXIT, result %llu\n", (unsigned long long)tmp + blockoff); return result; /* This part : To be implemented .... 
@@ -409,19 +392,14 @@ ufs_inode_getblock(struct inode *inode, struct buffer_head *bh, p = (__fs64 *)bh->b_data + block; else p = (__fs32 *)bh->b_data + block; -repeat: + tmp = ufs_data_ptr_to_cpu(sb, p); if (tmp) { - if (!phys) { + if (!phys) result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); - if (tmp == ufs_data_ptr_to_cpu(sb, p)) - goto out; - brelse (result); - goto repeat; - } else { + else *phys = uspi->s_sbbase + tmp + blockoff; - goto out; - } + goto out; } if (block && (uspi->fs_magic == UFS2_MAGIC ? @@ -432,12 +410,8 @@ repeat: goal = bh->b_blocknr + uspi->s_fpb; tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), goal, uspi->s_fpb, err, locked_page); - if (!tmp) { - if (ufs_data_ptr_to_cpu(sb, p)) - goto repeat; + if (!tmp) goto out; - } - if (!phys) { result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); @@ -451,7 +425,6 @@ repeat: sync_dirty_buffer(bh); inode->i_ctime = CURRENT_TIME_SEC; mark_inode_dirty(inode); - UFSD("result %llu\n", (unsigned long long)tmp + blockoff); out: brelse (bh); UFSD("EXIT\n"); From 4b7068c8b178401637ef2fb068d6256c97d23f4a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 4 Jun 2015 14:27:23 -0400 Subject: [PATCH 35/65] ufs: move calculation of offsets into ufs_getfrag_block() ... and massage ufs_frag_map() to take those instead of fragment number. As it is, we duplicate the damn thing on the write side, open-coded and bloody hard to follow. Signed-off-by: Al Viro --- fs/ufs/inode.c | 17 +++++++++-------- 1 file changed, 9 insertions(+), 8 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 95cb0a8f5ec9..0f0c6dfccd10 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -119,20 +119,18 @@ static inline int grow_chain64(struct ufs_inode_info *ufsi, * the beginning of the filesystem. 
*/ -static u64 ufs_frag_map(struct inode *inode, sector_t frag) +static u64 ufs_frag_map(struct inode *inode, unsigned offsets[4], int depth) { struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; u64 mask = (u64) uspi->s_apbmask>>uspi->s_fpbshift; int shift = uspi->s_apbshift-uspi->s_fpbshift; - unsigned offsets[4], *p; Indirect chain[4], *q = chain; - int depth = ufs_block_to_path(inode, frag >> uspi->s_fpbshift, offsets); + unsigned *p; unsigned flags = UFS_SB(sb)->s_flags; u64 res = 0; - UFSD(": frag = %llu depth = %d\n", (unsigned long long)frag, depth); UFSD(": uspi->s_fpbshift = %d ,uspi->s_apbmask = %x, mask=%llx\n", uspi->s_fpbshift, uspi->s_apbmask, (unsigned long long)mask); @@ -191,7 +189,7 @@ ufs2: } res = fs64_to_cpu(sb, q->key64); found: - res += uspi->s_sbbase + (frag & uspi->s_fpbmask); + res += uspi->s_sbbase; no_block: while (q > chain) { brelse(q->bh); @@ -443,14 +441,17 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff struct ufs_sb_private_info * uspi = sbi->s_uspi; struct buffer_head * bh; int ret, err, new; + unsigned offsets[4]; + int depth = ufs_block_to_path(inode, fragment >> uspi->s_fpbshift, offsets); unsigned long ptr,phys; u64 phys64 = 0; if (!create) { - phys64 = ufs_frag_map(inode, fragment); - UFSD("phys64 = %llu\n", (unsigned long long)phys64); - if (phys64) + phys64 = ufs_frag_map(inode, offsets, depth); + if (phys64) { + phys64 += fragment & uspi->s_fpbmask; map_bh(bh_result, sb, phys64); + } return 0; } From 71dd42846ffb2bd1a90e9ac2c52df0cc2ed92307 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Thu, 4 Jun 2015 14:34:43 -0400 Subject: [PATCH 36/65] ufs: use the branch depth in ufs_getfrag_block() we'd already calculated it... 
Signed-off-by: Al Viro --- fs/ufs/inode.c | 10 ++++------ 1 file changed, 4 insertions(+), 6 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 0f0c6dfccd10..5c4a4abae652 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -465,9 +465,7 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff mutex_lock(&UFS_I(inode)->truncate_mutex); UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment); - if (fragment > - ((UFS_NDADDR + uspi->s_apb + uspi->s_2apb + uspi->s_3apb) - << uspi->s_fpbshift)) + if (!depth) goto abort_too_big; err = 0; @@ -490,17 +488,17 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff ufs_inode_getblock(inode, bh, x, fragment, \ &err, NULL, NULL, NULL) - if (ptr < UFS_NDIR_FRAGMENT) { + if (depth == 1) { bh = GET_INODE_DATABLOCK(ptr); goto out; } ptr -= UFS_NDIR_FRAGMENT; - if (ptr < (1 << (uspi->s_apbshift + uspi->s_fpbshift))) { + if (depth == 2) { bh = GET_INODE_PTR(UFS_IND_FRAGMENT + (ptr >> uspi->s_apbshift)); goto get_indirect; } ptr -= 1 << (uspi->s_apbshift + uspi->s_fpbshift); - if (ptr < (1 << (uspi->s_2apbshift + uspi->s_fpbshift))) { + if (depth == 3) { bh = GET_INODE_PTR(UFS_DIND_FRAGMENT + (ptr >> uspi->s_2apbshift)); goto get_double; } From bbb3eb9d3432ce55a620778ecf5670fa7942090e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 19 Jun 2015 00:10:00 -0400 Subject: [PATCH 37/65] ufs_inode_get{frag,block}(): consolidate success exits These calling conventions are rudiments of pre-2.3 times; they really need to be sanitized. This is the first step; next will be _always_ returning a block number, instead of this "return a pointer to buffer_head, except when we get to the actual data" crap. 
Signed-off-by: Al Viro --- fs/ufs/inode.c | 50 ++++++++++++++++++++++---------------------------- 1 file changed, 22 insertions(+), 28 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 5c4a4abae652..d65a89030c91 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -226,7 +226,6 @@ ufs_inode_getfrag(struct inode *inode, u64 fragment, struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - struct buffer_head * result; unsigned blockoff, lastblockoff; u64 tmp, goal, lastfrag, block, lastblock; void *p, *p2; @@ -249,14 +248,8 @@ ufs_inode_getfrag(struct inode *inode, u64 fragment, tmp = ufs_data_ptr_to_cpu(sb, p); lastfrag = ufsi->i_lastfrag; - if (tmp && fragment < lastfrag) { - if (!phys) { - return sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); - } else { - *phys = uspi->s_sbbase + tmp + blockoff; - return NULL; - } - } + if (tmp && fragment < lastfrag) + goto out; lastblock = ufs_fragstoblks (lastfrag); lastblockoff = ufs_fragnum (lastfrag); @@ -314,20 +307,22 @@ ufs_inode_getfrag(struct inode *inode, u64 fragment, return NULL; } - if (!phys) { - result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); - } else { - *phys = uspi->s_sbbase + tmp + blockoff; - result = NULL; + if (phys) { *err = 0; *new = 1; } - inode->i_ctime = CURRENT_TIME_SEC; if (IS_SYNC(inode)) ufs_sync_inode (inode); mark_inode_dirty(inode); - return result; +out: + tmp += uspi->s_sbbase + blockoff; + if (!phys) { + return sb_getblk(sb, tmp); + } else { + *phys = tmp; + return NULL; + } /* This part : To be implemented .... Required only for writing, not required for READ-ONLY. 
@@ -367,7 +362,7 @@ ufs_inode_getblock(struct inode *inode, struct buffer_head *bh, struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; struct buffer_head * result; unsigned blockoff; - u64 tmp, goal, block; + u64 tmp = 0, goal, block; void *p; block = ufs_fragstoblks (fragment); @@ -392,13 +387,8 @@ ufs_inode_getblock(struct inode *inode, struct buffer_head *bh, p = (__fs32 *)bh->b_data + block; tmp = ufs_data_ptr_to_cpu(sb, p); - if (tmp) { - if (!phys) - result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); - else - *phys = uspi->s_sbbase + tmp + blockoff; + if (tmp) goto out; - } if (block && (uspi->fs_magic == UFS2_MAGIC ? (tmp = fs64_to_cpu(sb, ((__fs64 *)bh->b_data)[block-1])) : @@ -411,12 +401,8 @@ ufs_inode_getblock(struct inode *inode, struct buffer_head *bh, if (!tmp) goto out; - if (!phys) { - result = sb_getblk(sb, uspi->s_sbbase + tmp + blockoff); - } else { - *phys = uspi->s_sbbase + tmp + blockoff; + if (new) *new = 1; - } mark_buffer_dirty(bh); if (IS_SYNC(inode)) @@ -425,6 +411,14 @@ ufs_inode_getblock(struct inode *inode, struct buffer_head *bh, mark_inode_dirty(inode); out: brelse (bh); + if (tmp) { + tmp += uspi->s_sbbase + blockoff; + if (phys) { + *phys = tmp; + } else { + result = sb_getblk(sb, tmp); + } + } UFSD("EXIT\n"); return result; } From 8d9dcf14367388674f4d792f494e6f1d6536ac95 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 19 Jun 2015 00:32:42 -0400 Subject: [PATCH 38/65] ufs_getfrag_block(): get rid of macro jungles Signed-off-by: Al Viro --- fs/ufs/inode.c | 51 ++++++++++++++++++++++---------------------------- 1 file changed, 22 insertions(+), 29 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index d65a89030c91..156ba3c26906 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -465,49 +465,42 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff err = 0; ptr = fragment; - /* - * ok, these macros clean the logic up a bit and make - * it much more readable: - */ -#define 
GET_INODE_DATABLOCK(x) \ - ufs_inode_getfrag(inode, x, fragment, 1, &err, &phys, &new,\ - bh_result->b_page) -#define GET_INODE_PTR(x) \ - ufs_inode_getfrag(inode, x, fragment, uspi->s_fpb, &err, NULL, NULL,\ - bh_result->b_page) -#define GET_INDIRECT_DATABLOCK(x) \ - ufs_inode_getblock(inode, bh, x, fragment, \ - &err, &phys, &new, bh_result->b_page) -#define GET_INDIRECT_PTR(x) \ - ufs_inode_getblock(inode, bh, x, fragment, \ - &err, NULL, NULL, NULL) - if (depth == 1) { - bh = GET_INODE_DATABLOCK(ptr); + bh = ufs_inode_getfrag(inode, ptr, fragment, 1, &err, &phys, + &new, bh_result->b_page); goto out; } ptr -= UFS_NDIR_FRAGMENT; if (depth == 2) { - bh = GET_INODE_PTR(UFS_IND_FRAGMENT + (ptr >> uspi->s_apbshift)); + bh = ufs_inode_getfrag(inode, + UFS_IND_FRAGMENT + (ptr >> uspi->s_apbshift), + fragment, uspi->s_fpb, &err, NULL, NULL, + bh_result->b_page); goto get_indirect; } ptr -= 1 << (uspi->s_apbshift + uspi->s_fpbshift); if (depth == 3) { - bh = GET_INODE_PTR(UFS_DIND_FRAGMENT + (ptr >> uspi->s_2apbshift)); + bh = ufs_inode_getfrag(inode, + UFS_DIND_FRAGMENT + (ptr >> uspi->s_2apbshift), + fragment, uspi->s_fpb, &err, NULL, NULL, + bh_result->b_page); goto get_double; } ptr -= 1 << (uspi->s_2apbshift + uspi->s_fpbshift); - bh = GET_INODE_PTR(UFS_TIND_FRAGMENT + (ptr >> uspi->s_3apbshift)); - bh = GET_INDIRECT_PTR((ptr >> uspi->s_2apbshift) & uspi->s_apbmask); + bh = ufs_inode_getfrag(inode, + UFS_TIND_FRAGMENT + (ptr >> uspi->s_3apbshift), + fragment, uspi->s_fpb, &err, NULL, NULL, + bh_result->b_page); + bh = ufs_inode_getblock(inode, bh, + (ptr >> uspi->s_2apbshift) & uspi->s_apbmask, + fragment, &err, NULL, NULL, NULL); get_double: - bh = GET_INDIRECT_PTR((ptr >> uspi->s_apbshift) & uspi->s_apbmask); + bh = ufs_inode_getblock(inode, bh, + (ptr >> uspi->s_apbshift) & uspi->s_apbmask, + fragment, &err, NULL, NULL, NULL); get_indirect: - bh = GET_INDIRECT_DATABLOCK(ptr & uspi->s_apbmask); - -#undef GET_INODE_DATABLOCK -#undef GET_INODE_PTR -#undef 
GET_INDIRECT_DATABLOCK -#undef GET_INDIRECT_PTR + bh = ufs_inode_getblock(inode, bh, ptr & uspi->s_apbmask, fragment, + &err, &phys, &new, bh_result->b_page); out: if (err) From 177848a018cb2cb196feac2990814ac8d7bb3c8e Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 19 Jun 2015 00:53:06 -0400 Subject: [PATCH 39/65] ufs_inode_get{frag,block}(): leave sb_getblk() to caller just return the damn block number Signed-off-by: Al Viro --- fs/ufs/inode.c | 88 +++++++++++++++++++++++++++++++------------------- 1 file changed, 55 insertions(+), 33 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 156ba3c26906..eeccf45fcd57 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -218,7 +218,7 @@ changed: * @new: we set it if we allocate new block * @locked_page: for ufs_new_fragments() */ -static struct buffer_head * +static u64 ufs_inode_getfrag(struct inode *inode, u64 fragment, sector_t new_fragment, unsigned int required, int *err, long *phys, int *new, struct page *locked_page) @@ -267,7 +267,7 @@ ufs_inode_getfrag(struct inode *inode, u64 fragment, uspi->s_fpb - lastblockoff, err, locked_page); if (!tmp) - return NULL; + return 0; lastfrag = ufsi->i_lastfrag; } tmp = ufs_data_ptr_to_cpu(sb, @@ -304,7 +304,7 @@ ufs_inode_getfrag(struct inode *inode, u64 fragment, } if (!tmp) { *err = -ENOSPC; - return NULL; + return 0; } if (phys) { @@ -316,13 +316,7 @@ ufs_inode_getfrag(struct inode *inode, u64 fragment, ufs_sync_inode (inode); mark_inode_dirty(inode); out: - tmp += uspi->s_sbbase + blockoff; - if (!phys) { - return sb_getblk(sb, tmp); - } else { - *phys = tmp; - return NULL; - } + return tmp + uspi->s_sbbase; /* This part : To be implemented .... Required only for writing, not required for READ-ONLY. 
@@ -353,26 +347,22 @@ repeat2: * @new: see ufs_inode_getfrag() * @locked_page: see ufs_inode_getfrag() */ -static struct buffer_head * +static u64 ufs_inode_getblock(struct inode *inode, struct buffer_head *bh, u64 fragment, sector_t new_fragment, int *err, long *phys, int *new, struct page *locked_page) { struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - struct buffer_head * result; - unsigned blockoff; u64 tmp = 0, goal, block; void *p; block = ufs_fragstoblks (fragment); - blockoff = ufs_fragnum (fragment); UFSD("ENTER, ino %lu, fragment %llu, new_fragment %llu, metadata %d\n", inode->i_ino, (unsigned long long)fragment, (unsigned long long)new_fragment, !phys); - result = NULL; if (!bh) goto out; if (!buffer_uptodate(bh)) { @@ -411,16 +401,10 @@ ufs_inode_getblock(struct inode *inode, struct buffer_head *bh, mark_inode_dirty(inode); out: brelse (bh); - if (tmp) { - tmp += uspi->s_sbbase + blockoff; - if (phys) { - *phys = tmp; - } else { - result = sb_getblk(sb, tmp); - } - } UFSD("EXIT\n"); - return result; + if (tmp) + tmp += uspi->s_sbbase; + return tmp; } /** @@ -439,11 +423,12 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff int depth = ufs_block_to_path(inode, fragment >> uspi->s_fpbshift, offsets); unsigned long ptr,phys; u64 phys64 = 0; + unsigned frag = fragment & uspi->s_fpbmask; if (!create) { phys64 = ufs_frag_map(inode, offsets, depth); if (phys64) { - phys64 += fragment & uspi->s_fpbmask; + phys64 += frag; map_bh(bh_result, sb, phys64); } return 0; @@ -466,42 +451,79 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff ptr = fragment; if (depth == 1) { - bh = ufs_inode_getfrag(inode, ptr, fragment, 1, &err, &phys, + phys64 = ufs_inode_getfrag(inode, ptr, fragment, 1, &err, &phys, &new, bh_result->b_page); + if (phys64) { + phys64 += frag; + phys = phys64; + } goto out; } ptr -= UFS_NDIR_FRAGMENT; if (depth == 2) { - bh = 
ufs_inode_getfrag(inode, + phys64 = ufs_inode_getfrag(inode, UFS_IND_FRAGMENT + (ptr >> uspi->s_apbshift), fragment, uspi->s_fpb, &err, NULL, NULL, bh_result->b_page); + if (phys64) { + phys64 += (ptr >> uspi->s_apbshift) & uspi->s_fpbmask; + bh = sb_getblk(sb, phys64); + } else { + bh = NULL; + } goto get_indirect; } ptr -= 1 << (uspi->s_apbshift + uspi->s_fpbshift); if (depth == 3) { - bh = ufs_inode_getfrag(inode, + phys64 = ufs_inode_getfrag(inode, UFS_DIND_FRAGMENT + (ptr >> uspi->s_2apbshift), fragment, uspi->s_fpb, &err, NULL, NULL, bh_result->b_page); + if (phys64) { + phys64 += (ptr >> uspi->s_2apbshift) & uspi->s_fpbmask; + bh = sb_getblk(sb, phys64); + } else { + bh = NULL; + } goto get_double; } ptr -= 1 << (uspi->s_2apbshift + uspi->s_fpbshift); - bh = ufs_inode_getfrag(inode, + phys64 = ufs_inode_getfrag(inode, UFS_TIND_FRAGMENT + (ptr >> uspi->s_3apbshift), fragment, uspi->s_fpb, &err, NULL, NULL, bh_result->b_page); - bh = ufs_inode_getblock(inode, bh, + if (phys64) { + phys64 += (ptr >> uspi->s_3apbshift) & uspi->s_fpbmask; + bh = sb_getblk(sb, phys64); + } else { + bh = NULL; + } + phys64 = ufs_inode_getblock(inode, bh, (ptr >> uspi->s_2apbshift) & uspi->s_apbmask, fragment, &err, NULL, NULL, NULL); + if (phys64) { + phys64 += (ptr >> uspi->s_2apbshift) & uspi->s_fpbmask, + bh = sb_getblk(sb, phys64); + } else { + bh = NULL; + } get_double: - bh = ufs_inode_getblock(inode, bh, + phys64 = ufs_inode_getblock(inode, bh, (ptr >> uspi->s_apbshift) & uspi->s_apbmask, fragment, &err, NULL, NULL, NULL); + if (phys64) { + phys64 += (ptr >> uspi->s_apbshift) & uspi->s_fpbmask, + bh = sb_getblk(sb, phys64); + } else { + bh = NULL; + } get_indirect: - bh = ufs_inode_getblock(inode, bh, ptr & uspi->s_apbmask, fragment, + phys64 = ufs_inode_getblock(inode, bh, ptr & uspi->s_apbmask, fragment, &err, &phys, &new, bh_result->b_page); - + if (phys64) { + phys64 += frag; + phys = phys64; + } out: if (err) goto abort; From 721435a7679e13f810133dbea769f87ad7bae3a1 Mon 
Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 19 Jun 2015 01:06:21 -0400 Subject: [PATCH 40/65] ufs_inode_getblock(): pass index instead of 'fragment' The value passed to ufs_inode_getblock() as the 3rd argument had lower bits ignored; the upper bits were shifted down and used and they actually make sense - those are _lower_ bits of index in indirect block (i.e. they form the index within a fragment within an indirect block). Pass those as argument. Upper bits of index (i.e. the number of fragment within indirect block) will join them shortly. Signed-off-by: Al Viro --- fs/ufs/inode.c | 32 +++++++++++++------------------- 1 file changed, 13 insertions(+), 19 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index eeccf45fcd57..6866b904f148 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -338,8 +338,7 @@ repeat2: * ufs_inode_getblock() - allocate new block * @inode: pointer to inode * @bh: pointer to block which hold "pointer" to new allocated block - * @fragment: number of `fragment' which hold pointer - * to new allocated block + * @index: number of pointer in the indirect block * @new_fragment: number of new allocated fragment * (block will hold this fragment and also uspi->s_fpb-1) * @err: see ufs_inode_getfrag() @@ -349,20 +348,14 @@ repeat2: */ static u64 ufs_inode_getblock(struct inode *inode, struct buffer_head *bh, - u64 fragment, sector_t new_fragment, int *err, + unsigned index, sector_t new_fragment, int *err, long *phys, int *new, struct page *locked_page) { struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - u64 tmp = 0, goal, block; + u64 tmp = 0, goal; void *p; - block = ufs_fragstoblks (fragment); - - UFSD("ENTER, ino %lu, fragment %llu, new_fragment %llu, metadata %d\n", - inode->i_ino, (unsigned long long)fragment, - (unsigned long long)new_fragment, !phys); - if (!bh) goto out; if (!buffer_uptodate(bh)) { @@ -372,17 +365,17 @@ ufs_inode_getblock(struct inode *inode, struct buffer_head *bh, 
goto out; } if (uspi->fs_magic == UFS2_MAGIC) - p = (__fs64 *)bh->b_data + block; + p = (__fs64 *)bh->b_data + index; else - p = (__fs32 *)bh->b_data + block; + p = (__fs32 *)bh->b_data + index; tmp = ufs_data_ptr_to_cpu(sb, p); if (tmp) goto out; - if (block && (uspi->fs_magic == UFS2_MAGIC ? - (tmp = fs64_to_cpu(sb, ((__fs64 *)bh->b_data)[block-1])) : - (tmp = fs32_to_cpu(sb, ((__fs32 *)bh->b_data)[block-1])))) + if (index && (uspi->fs_magic == UFS2_MAGIC ? + (tmp = fs64_to_cpu(sb, ((__fs64 *)bh->b_data)[index-1])) : + (tmp = fs32_to_cpu(sb, ((__fs32 *)bh->b_data)[index-1])))) goal = tmp + uspi->s_fpb; else goal = bh->b_blocknr + uspi->s_fpb; @@ -424,6 +417,7 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff unsigned long ptr,phys; u64 phys64 = 0; unsigned frag = fragment & uspi->s_fpbmask; + unsigned mask = uspi->s_apbmask >> uspi->s_fpbshift; if (!create) { phys64 = ufs_frag_map(inode, offsets, depth); @@ -499,7 +493,7 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff bh = NULL; } phys64 = ufs_inode_getblock(inode, bh, - (ptr >> uspi->s_2apbshift) & uspi->s_apbmask, + offsets[1] & mask, fragment, &err, NULL, NULL, NULL); if (phys64) { phys64 += (ptr >> uspi->s_2apbshift) & uspi->s_fpbmask, @@ -509,7 +503,7 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff } get_double: phys64 = ufs_inode_getblock(inode, bh, - (ptr >> uspi->s_apbshift) & uspi->s_apbmask, + offsets[depth - 2] & mask, fragment, &err, NULL, NULL, NULL); if (phys64) { phys64 += (ptr >> uspi->s_apbshift) & uspi->s_fpbmask, @@ -518,8 +512,8 @@ get_double: bh = NULL; } get_indirect: - phys64 = ufs_inode_getblock(inode, bh, ptr & uspi->s_apbmask, fragment, - &err, &phys, &new, bh_result->b_page); + phys64 = ufs_inode_getblock(inode, bh, offsets[depth - 1] & mask, + fragment, &err, &phys, &new, bh_result->b_page); if (phys64) { phys64 += frag; phys = phys64; From 619cfac09134b4de7a4f232cf3636cf43728577d 
Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 19 Jun 2015 01:23:08 -0400 Subject: [PATCH 41/65] ufs_inode_getblock(): pass indirect block number and full index ... instead of messing with buffer_head. We can bloody well do sb_bread() in there. Signed-off-by: Al Viro --- fs/ufs/inode.c | 62 +++++++++++++------------------------------------- 1 file changed, 16 insertions(+), 46 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 6866b904f148..25d47df934e2 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -337,8 +337,8 @@ repeat2: /** * ufs_inode_getblock() - allocate new block * @inode: pointer to inode - * @bh: pointer to block which hold "pointer" to new allocated block - * @index: number of pointer in the indirect block + * @ind_block: block number of the indirect block + * @index: number of pointer within the indirect block * @new_fragment: number of new allocated fragment * (block will hold this fragment and also uspi->s_fpb-1) * @err: see ufs_inode_getfrag() @@ -347,23 +347,25 @@ repeat2: * @locked_page: see ufs_inode_getfrag() */ static u64 -ufs_inode_getblock(struct inode *inode, struct buffer_head *bh, +ufs_inode_getblock(struct inode *inode, u64 ind_block, unsigned index, sector_t new_fragment, int *err, long *phys, int *new, struct page *locked_page) { struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + int shift = uspi->s_apbshift - uspi->s_fpbshift; u64 tmp = 0, goal; + struct buffer_head *bh; void *p; - if (!bh) - goto out; - if (!buffer_uptodate(bh)) { - ll_rw_block (READ, 1, &bh); - wait_on_buffer (bh); - if (!buffer_uptodate(bh)) - goto out; - } + if (!ind_block) + return 0; + + bh = sb_bread(sb, ind_block + (index >> shift)); + if (unlikely(!bh)) + return 0; + + index &= uspi->s_apbmask >> uspi->s_fpbshift; if (uspi->fs_magic == UFS2_MAGIC) p = (__fs64 *)bh->b_data + index; else @@ -459,12 +461,6 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff 
UFS_IND_FRAGMENT + (ptr >> uspi->s_apbshift), fragment, uspi->s_fpb, &err, NULL, NULL, bh_result->b_page); - if (phys64) { - phys64 += (ptr >> uspi->s_apbshift) & uspi->s_fpbmask; - bh = sb_getblk(sb, phys64); - } else { - bh = NULL; - } goto get_indirect; } ptr -= 1 << (uspi->s_apbshift + uspi->s_fpbshift); @@ -473,12 +469,6 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff UFS_DIND_FRAGMENT + (ptr >> uspi->s_2apbshift), fragment, uspi->s_fpb, &err, NULL, NULL, bh_result->b_page); - if (phys64) { - phys64 += (ptr >> uspi->s_2apbshift) & uspi->s_fpbmask; - bh = sb_getblk(sb, phys64); - } else { - bh = NULL; - } goto get_double; } ptr -= 1 << (uspi->s_2apbshift + uspi->s_fpbshift); @@ -486,33 +476,13 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff UFS_TIND_FRAGMENT + (ptr >> uspi->s_3apbshift), fragment, uspi->s_fpb, &err, NULL, NULL, bh_result->b_page); - if (phys64) { - phys64 += (ptr >> uspi->s_3apbshift) & uspi->s_fpbmask; - bh = sb_getblk(sb, phys64); - } else { - bh = NULL; - } - phys64 = ufs_inode_getblock(inode, bh, - offsets[1] & mask, + phys64 = ufs_inode_getblock(inode, phys64, offsets[1], fragment, &err, NULL, NULL, NULL); - if (phys64) { - phys64 += (ptr >> uspi->s_2apbshift) & uspi->s_fpbmask, - bh = sb_getblk(sb, phys64); - } else { - bh = NULL; - } get_double: - phys64 = ufs_inode_getblock(inode, bh, - offsets[depth - 2] & mask, + phys64 = ufs_inode_getblock(inode, phys64, offsets[depth - 2], fragment, &err, NULL, NULL, NULL); - if (phys64) { - phys64 += (ptr >> uspi->s_apbshift) & uspi->s_fpbmask, - bh = sb_getblk(sb, phys64); - } else { - bh = NULL; - } get_indirect: - phys64 = ufs_inode_getblock(inode, bh, offsets[depth - 1] & mask, + phys64 = ufs_inode_getblock(inode, phys64, offsets[depth - 1], fragment, &err, &phys, &new, bh_result->b_page); if (phys64) { phys64 += frag; From 0f3c1294bedcc4544c68d6b84699bdaa334b11b8 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 19 Jun 
2015 13:40:25 -0400 Subject: [PATCH 42/65] ufs_inode_getfrag(): split extending the partial blocks off ufs_extend_tail() is handling that now. Signed-off-by: Al Viro --- fs/ufs/inode.c | 130 +++++++++++++++++++++++++------------------------ 1 file changed, 66 insertions(+), 64 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 25d47df934e2..d652f64885fd 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -205,6 +205,40 @@ changed: goto again; } +/* + * Unpacking tails: we have a file with partial final block and + * we had been asked to extend it. If the fragment being written + * is within the same block, we need to extend the tail just to cover + * that fragment. Otherwise the tail is extended to full block. + * + * Note that we might need to create a _new_ tail, but that will + * be handled elsewhere; this is strictly for resizing old + * ones. + */ +static bool +ufs_extend_tail(struct inode *inode, u64 writes_to, + int *err, struct page *locked_page) +{ + struct ufs_inode_info *ufsi = UFS_I(inode); + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + unsigned lastfrag = ufsi->i_lastfrag; /* it's a short file, so unsigned is enough */ + unsigned block = ufs_fragstoblks(lastfrag); + unsigned new_size; + void *p; + u64 tmp; + + if (writes_to < (lastfrag | uspi->s_fpbmask)) + new_size = (writes_to & uspi->s_fpbmask) + 1; + else + new_size = uspi->s_fpb; + + p = ufs_get_direct_data_ptr(uspi, ufsi, block); + tmp = ufs_new_fragments(inode, p, lastfrag, ufs_data_ptr_to_cpu(sb, p), + new_size, err, locked_page); + return tmp != 0; +} + /** * ufs_inode_getfrag() - allocate new fragment(s) * @inode: pointer to inode @@ -226,13 +260,10 @@ ufs_inode_getfrag(struct inode *inode, u64 fragment, struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - unsigned blockoff, lastblockoff; - u64 tmp, goal, lastfrag, block, lastblock; - void *p, 
*p2; - - UFSD("ENTER, ino %lu, fragment %llu, new_fragment %llu, required %u, " - "metadata %d\n", inode->i_ino, (unsigned long long)fragment, - (unsigned long long)new_fragment, required, !phys); + unsigned blockoff; + u64 tmp, goal, lastfrag, block; + unsigned nfrags = uspi->s_fpb; + void *p; /* TODO : to be done for write support if ( (flags & UFS_TYPE_MASK) == UFS_TYPE_UFS2) @@ -242,66 +273,27 @@ ufs_inode_getfrag(struct inode *inode, u64 fragment, block = ufs_fragstoblks (fragment); blockoff = ufs_fragnum (fragment); p = ufs_get_direct_data_ptr(uspi, ufsi, block); - - goal = 0; - tmp = ufs_data_ptr_to_cpu(sb, p); - - lastfrag = ufsi->i_lastfrag; - if (tmp && fragment < lastfrag) + if (tmp) goto out; - lastblock = ufs_fragstoblks (lastfrag); - lastblockoff = ufs_fragnum (lastfrag); - /* - * We will extend file into new block beyond last allocated block - */ - if (lastblock < block) { - /* - * We must reallocate last allocated block - */ - if (lastblockoff) { - p2 = ufs_get_direct_data_ptr(uspi, ufsi, lastblock); - tmp = ufs_new_fragments(inode, p2, lastfrag, - ufs_data_ptr_to_cpu(sb, p2), - uspi->s_fpb - lastblockoff, - err, locked_page); - if (!tmp) - return 0; - lastfrag = ufsi->i_lastfrag; - } - tmp = ufs_data_ptr_to_cpu(sb, - ufs_get_direct_data_ptr(uspi, ufsi, - lastblock)); - if (tmp) - goal = tmp + uspi->s_fpb; - tmp = ufs_new_fragments (inode, p, fragment - blockoff, - goal, required + blockoff, - err, - phys != NULL ? locked_page : NULL); - } else if (lastblock == block) { - /* - * We will extend last allocated block - */ - tmp = ufs_new_fragments(inode, p, fragment - - (blockoff - lastblockoff), - ufs_data_ptr_to_cpu(sb, p), - required + (blockoff - lastblockoff), - err, phys != NULL ? 
locked_page : NULL); - } else /* (lastblock > block) */ { - /* - * We will allocate new block before last allocated block - */ - if (block) { - tmp = ufs_data_ptr_to_cpu(sb, - ufs_get_direct_data_ptr(uspi, ufsi, block - 1)); - if (tmp) - goal = tmp + uspi->s_fpb; - } - tmp = ufs_new_fragments(inode, p, fragment - blockoff, - goal, uspi->s_fpb, err, - phys != NULL ? locked_page : NULL); + lastfrag = ufsi->i_lastfrag; + + /* will that be a new tail? */ + if (new_fragment < UFS_NDIR_FRAGMENT && new_fragment >= lastfrag) + nfrags = (new_fragment & uspi->s_fpbmask) + 1; + + goal = 0; + if (block) { + goal = ufs_data_ptr_to_cpu(sb, + ufs_get_direct_data_ptr(uspi, ufsi, block - 1)); + if (goal) + goal += uspi->s_fpb; } + tmp = ufs_new_fragments(inode, p, fragment - blockoff, + goal, uspi->s_fpb, err, + phys != NULL ? locked_page : NULL); + if (!tmp) { *err = -ENOSPC; return 0; @@ -419,7 +411,6 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff unsigned long ptr,phys; u64 phys64 = 0; unsigned frag = fragment & uspi->s_fpbmask; - unsigned mask = uspi->s_apbmask >> uspi->s_fpbshift; if (!create) { phys64 = ufs_frag_map(inode, offsets, depth); @@ -444,6 +435,17 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff goto abort_too_big; err = 0; + + if (UFS_I(inode)->i_lastfrag < UFS_NDIR_FRAGMENT) { + unsigned lastfrag = UFS_I(inode)->i_lastfrag; + unsigned tailfrags = lastfrag & uspi->s_fpbmask; + if (tailfrags && fragment >= lastfrag) { + if (!ufs_extend_tail(inode, fragment, + &err, bh_result->b_page)) + goto abort; + } + } + ptr = fragment; if (depth == 1) { From 5336970be09becb2b59ac3812718b2cb80d33347 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 19 Jun 2015 13:53:52 -0400 Subject: [PATCH 43/65] ufs_inode_getfrag(): pass index instead of 'fragment' same story as with ufs_inode_getblock() Signed-off-by: Al Viro --- fs/ufs/inode.c | 50 +++++++++++++++++--------------------------------- 1 file changed, 17 
insertions(+), 33 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index d652f64885fd..c05cf14ef8ff 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -242,10 +242,8 @@ ufs_extend_tail(struct inode *inode, u64 writes_to, /** * ufs_inode_getfrag() - allocate new fragment(s) * @inode: pointer to inode - * @fragment: number of `fragment' which hold pointer - * to new allocated fragment(s) + * @index: number of block pointer within the inode's array. * @new_fragment: number of new allocated fragment(s) - * @required: how many fragment(s) we require * @err: we set it if something wrong * @phys: pointer to where we save physical number of new allocated fragments, * NULL if we allocate not data(indirect blocks for example). @@ -253,15 +251,14 @@ ufs_extend_tail(struct inode *inode, u64 writes_to, * @locked_page: for ufs_new_fragments() */ static u64 -ufs_inode_getfrag(struct inode *inode, u64 fragment, - sector_t new_fragment, unsigned int required, int *err, +ufs_inode_getfrag(struct inode *inode, unsigned index, + sector_t new_fragment, int *err, long *phys, int *new, struct page *locked_page) { struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; - unsigned blockoff; - u64 tmp, goal, lastfrag, block; + u64 tmp, goal, lastfrag; unsigned nfrags = uspi->s_fpb; void *p; @@ -270,9 +267,7 @@ ufs_inode_getfrag(struct inode *inode, u64 fragment, goto ufs2; */ - block = ufs_fragstoblks (fragment); - blockoff = ufs_fragnum (fragment); - p = ufs_get_direct_data_ptr(uspi, ufsi, block); + p = ufs_get_direct_data_ptr(uspi, ufsi, index); tmp = ufs_data_ptr_to_cpu(sb, p); if (tmp) goto out; @@ -284,13 +279,13 @@ ufs_inode_getfrag(struct inode *inode, u64 fragment, nfrags = (new_fragment & uspi->s_fpbmask) + 1; goal = 0; - if (block) { + if (index) { goal = ufs_data_ptr_to_cpu(sb, - ufs_get_direct_data_ptr(uspi, ufsi, block - 1)); + ufs_get_direct_data_ptr(uspi, ufsi, index - 1)); if (goal) 
goal += uspi->s_fpb; } - tmp = ufs_new_fragments(inode, p, fragment - blockoff, + tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), goal, uspi->s_fpb, err, phys != NULL ? locked_page : NULL); @@ -408,7 +403,7 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff int ret, err, new; unsigned offsets[4]; int depth = ufs_block_to_path(inode, fragment >> uspi->s_fpbshift, offsets); - unsigned long ptr,phys; + unsigned long phys; u64 phys64 = 0; unsigned frag = fragment & uspi->s_fpbmask; @@ -446,38 +441,27 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff } } - ptr = fragment; - if (depth == 1) { - phys64 = ufs_inode_getfrag(inode, ptr, fragment, 1, &err, &phys, - &new, bh_result->b_page); + phys64 = ufs_inode_getfrag(inode, offsets[0], fragment, + &err, &phys, &new, bh_result->b_page); if (phys64) { phys64 += frag; phys = phys64; } goto out; } - ptr -= UFS_NDIR_FRAGMENT; if (depth == 2) { - phys64 = ufs_inode_getfrag(inode, - UFS_IND_FRAGMENT + (ptr >> uspi->s_apbshift), - fragment, uspi->s_fpb, &err, NULL, NULL, - bh_result->b_page); + phys64 = ufs_inode_getfrag(inode, offsets[0], fragment, + &err, NULL, NULL, bh_result->b_page); goto get_indirect; } - ptr -= 1 << (uspi->s_apbshift + uspi->s_fpbshift); if (depth == 3) { - phys64 = ufs_inode_getfrag(inode, - UFS_DIND_FRAGMENT + (ptr >> uspi->s_2apbshift), - fragment, uspi->s_fpb, &err, NULL, NULL, - bh_result->b_page); + phys64 = ufs_inode_getfrag(inode, offsets[0], fragment, + &err, NULL, NULL, bh_result->b_page); goto get_double; } - ptr -= 1 << (uspi->s_2apbshift + uspi->s_fpbshift); - phys64 = ufs_inode_getfrag(inode, - UFS_TIND_FRAGMENT + (ptr >> uspi->s_3apbshift), - fragment, uspi->s_fpb, &err, NULL, NULL, - bh_result->b_page); + phys64 = ufs_inode_getfrag(inode, offsets[0], fragment, + &err, NULL, NULL, bh_result->b_page); phys64 = ufs_inode_getblock(inode, phys64, offsets[1], fragment, &err, NULL, NULL, NULL); get_double: From 
4eeff4c9326878ff58ef6fe68d2bf22ef877e5a2 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 19 Jun 2015 14:08:16 -0400 Subject: [PATCH 44/65] ufs_getfrag_block(): turn following indirects into a loop Signed-off-by: Al Viro --- fs/ufs/inode.c | 32 ++++++++------------------------ 1 file changed, 8 insertions(+), 24 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index c05cf14ef8ff..f2d8cc2166af 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -444,37 +444,21 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff if (depth == 1) { phys64 = ufs_inode_getfrag(inode, offsets[0], fragment, &err, &phys, &new, bh_result->b_page); - if (phys64) { - phys64 += frag; - phys = phys64; - } - goto out; - } - if (depth == 2) { + } else { + int i; phys64 = ufs_inode_getfrag(inode, offsets[0], fragment, &err, NULL, NULL, bh_result->b_page); - goto get_indirect; + for (i = 1; i < depth - 1; i++) + phys64 = ufs_inode_getblock(inode, phys64, offsets[i], + fragment, &err, NULL, NULL, NULL); + phys64 = ufs_inode_getblock(inode, phys64, offsets[depth - 1], + fragment, &err, &phys, &new, bh_result->b_page); } - if (depth == 3) { - phys64 = ufs_inode_getfrag(inode, offsets[0], fragment, - &err, NULL, NULL, bh_result->b_page); - goto get_double; - } - phys64 = ufs_inode_getfrag(inode, offsets[0], fragment, - &err, NULL, NULL, bh_result->b_page); - phys64 = ufs_inode_getblock(inode, phys64, offsets[1], - fragment, &err, NULL, NULL, NULL); -get_double: - phys64 = ufs_inode_getblock(inode, phys64, offsets[depth - 2], - fragment, &err, NULL, NULL, NULL); -get_indirect: - phys64 = ufs_inode_getblock(inode, phys64, offsets[depth - 1], - fragment, &err, &phys, &new, bh_result->b_page); +out: if (phys64) { phys64 += frag; phys = phys64; } -out: if (err) goto abort; if (new) From 5fbfb238f7a0a5c4633438eb5bdfb4810995c76a Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 19 Jun 2015 14:10:53 -0400 Subject: [PATCH 45/65] ufs_inode_getblock(): failure to read an 
indirect block is -EIO ... and not "write to beginning of the disk", TYVM... Signed-off-by: Al Viro --- fs/ufs/inode.c | 5 +++-- 1 file changed, 3 insertions(+), 2 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index f2d8cc2166af..ed70147e1cb4 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -349,8 +349,10 @@ ufs_inode_getblock(struct inode *inode, u64 ind_block, return 0; bh = sb_bread(sb, ind_block + (index >> shift)); - if (unlikely(!bh)) + if (unlikely(!bh)) { + *err = -EIO; return 0; + } index &= uspi->s_apbmask >> uspi->s_fpbshift; if (uspi->fs_magic == UFS2_MAGIC) @@ -454,7 +456,6 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff phys64 = ufs_inode_getblock(inode, phys64, offsets[depth - 1], fragment, &err, &phys, &new, bh_result->b_page); } -out: if (phys64) { phys64 += frag; phys = phys64; From 0385f1f9e3e5cb17047474037002500383237f47 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 19 Jun 2015 14:20:21 -0400 Subject: [PATCH 46/65] ufs_getfrag_block(): tidy up a bit Signed-off-by: Al Viro --- fs/ufs/inode.c | 48 +++++++++++++++--------------------------------- 1 file changed, 15 insertions(+), 33 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index ed70147e1cb4..7f551b3e3ba4 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -398,40 +398,30 @@ out: static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buffer_head *bh_result, int create) { - struct super_block * sb = inode->i_sb; - struct ufs_sb_info * sbi = UFS_SB(sb); - struct ufs_sb_private_info * uspi = sbi->s_uspi; - struct buffer_head * bh; - int ret, err, new; + struct super_block *sb = inode->i_sb; + struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; + int err = 0, new = 0; unsigned offsets[4]; int depth = ufs_block_to_path(inode, fragment >> uspi->s_fpbshift, offsets); - unsigned long phys; u64 phys64 = 0; + unsigned long phys; unsigned frag = fragment & uspi->s_fpbmask; if (!create) { phys64 = 
ufs_frag_map(inode, offsets, depth); - if (phys64) { - phys64 += frag; - map_bh(bh_result, sb, phys64); - } - return 0; + goto out; } /* This code entered only while writing ....? */ - err = -EIO; - new = 0; - ret = 0; - bh = NULL; - mutex_lock(&UFS_I(inode)->truncate_mutex); UFSD("ENTER, ino %lu, fragment %llu\n", inode->i_ino, (unsigned long long)fragment); - if (!depth) - goto abort_too_big; - - err = 0; + if (unlikely(!depth)) { + ufs_warning(sb, "ufs_get_block", "block > big"); + err = -EIO; + goto out; + } if (UFS_I(inode)->i_lastfrag < UFS_NDIR_FRAGMENT) { unsigned lastfrag = UFS_I(inode)->i_lastfrag; @@ -439,7 +429,7 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff if (tailfrags && fragment >= lastfrag) { if (!ufs_extend_tail(inode, fragment, &err, bh_result->b_page)) - goto abort; + goto out; } } @@ -456,23 +446,15 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff phys64 = ufs_inode_getblock(inode, phys64, offsets[depth - 1], fragment, &err, &phys, &new, bh_result->b_page); } +out: if (phys64) { phys64 += frag; - phys = phys64; + map_bh(bh_result, sb, phys64); + if (new) + set_buffer_new(bh_result); } - if (err) - goto abort; - if (new) - set_buffer_new(bh_result); - map_bh(bh_result, sb, phys); -abort: mutex_unlock(&UFS_I(inode)->truncate_mutex); - return err; - -abort_too_big: - ufs_warning(sb, "ufs_get_block", "block > big"); - goto abort; } static int ufs_writepage(struct page *page, struct writeback_control *wbc) From 4e317ce73aecb735f389ab0d42ae3197a55265e4 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 19 Jun 2015 14:27:10 -0400 Subject: [PATCH 47/65] ufs_inode_get{frag,block}(): get rid of 'phys' argument Just pass NULL as locked_page in case of first block in the indirect chain. Old calling conventions aside, a reason for having 'phys' was that ufs_inode_getfrag() used to be able to do _two_ allocations - indirect block and extending/reallocating a tail. 
We needed locked_page for the latter (it's a data), but we also needed to figure out that indirect block is metadata. So we used to pass non-NULL locked_page in all cases *and* used NULL phys as indication of being asked to allocate an indirect. With tail unpacking taken into a separate function we don't need those convolutions anymore. Signed-off-by: Al Viro --- fs/ufs/inode.c | 23 ++++++++--------------- 1 file changed, 8 insertions(+), 15 deletions(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 7f551b3e3ba4..a064cf44b143 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -245,15 +245,13 @@ ufs_extend_tail(struct inode *inode, u64 writes_to, * @index: number of block pointer within the inode's array. * @new_fragment: number of new allocated fragment(s) * @err: we set it if something wrong - * @phys: pointer to where we save physical number of new allocated fragments, - * NULL if we allocate not data(indirect blocks for example). * @new: we set it if we allocate new block * @locked_page: for ufs_new_fragments() */ static u64 ufs_inode_getfrag(struct inode *inode, unsigned index, sector_t new_fragment, int *err, - long *phys, int *new, struct page *locked_page) + int *new, struct page *locked_page) { struct ufs_inode_info *ufsi = UFS_I(inode); struct super_block *sb = inode->i_sb; @@ -286,18 +284,15 @@ ufs_inode_getfrag(struct inode *inode, unsigned index, goal += uspi->s_fpb; } tmp = ufs_new_fragments(inode, p, ufs_blknum(new_fragment), - goal, uspi->s_fpb, err, - phys != NULL ? 
locked_page : NULL); + goal, uspi->s_fpb, err, locked_page); if (!tmp) { *err = -ENOSPC; return 0; } - if (phys) { - *err = 0; + if (new) *new = 1; - } inode->i_ctime = CURRENT_TIME_SEC; if (IS_SYNC(inode)) ufs_sync_inode (inode); @@ -329,14 +324,13 @@ repeat2: * @new_fragment: number of new allocated fragment * (block will hold this fragment and also uspi->s_fpb-1) * @err: see ufs_inode_getfrag() - * @phys: see ufs_inode_getfrag() * @new: see ufs_inode_getfrag() * @locked_page: see ufs_inode_getfrag() */ static u64 ufs_inode_getblock(struct inode *inode, u64 ind_block, unsigned index, sector_t new_fragment, int *err, - long *phys, int *new, struct page *locked_page) + int *new, struct page *locked_page) { struct super_block *sb = inode->i_sb; struct ufs_sb_private_info *uspi = UFS_SB(sb)->s_uspi; @@ -404,7 +398,6 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff unsigned offsets[4]; int depth = ufs_block_to_path(inode, fragment >> uspi->s_fpbshift, offsets); u64 phys64 = 0; - unsigned long phys; unsigned frag = fragment & uspi->s_fpbmask; if (!create) { @@ -435,16 +428,16 @@ static int ufs_getfrag_block(struct inode *inode, sector_t fragment, struct buff if (depth == 1) { phys64 = ufs_inode_getfrag(inode, offsets[0], fragment, - &err, &phys, &new, bh_result->b_page); + &err, &new, bh_result->b_page); } else { int i; phys64 = ufs_inode_getfrag(inode, offsets[0], fragment, - &err, NULL, NULL, bh_result->b_page); + &err, NULL, NULL); for (i = 1; i < depth - 1; i++) phys64 = ufs_inode_getblock(inode, phys64, offsets[i], - fragment, &err, NULL, NULL, NULL); + fragment, &err, NULL, NULL); phys64 = ufs_inode_getblock(inode, phys64, offsets[depth - 1], - fragment, &err, &phys, &new, bh_result->b_page); + fragment, &err, &new, bh_result->b_page); } out: if (phys64) { From bee9182d955227f01ff3b80c4cb6acca9bb40b11 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Sun, 19 Jul 2015 23:48:20 +0200 Subject: [PATCH 48/65] introduce 
__sb_writers_{acquired,release}() helpers Preparation to hide the sb->s_writers internals from xfs and btrfs. Add 2 trivial define's they can use rather than play with ->s_writers directly. No changes in btrfs/transaction.o and xfs/xfs_aops.o. Signed-off-by: Oleg Nesterov Reviewed-by: Jan Kara --- fs/btrfs/transaction.c | 8 ++------ fs/xfs/xfs_aops.c | 6 ++---- include/linux/fs.h | 5 +++++ 3 files changed, 9 insertions(+), 10 deletions(-) diff --git a/fs/btrfs/transaction.c b/fs/btrfs/transaction.c index f5021fcb154e..a8ab8f5ef38e 100644 --- a/fs/btrfs/transaction.c +++ b/fs/btrfs/transaction.c @@ -1638,9 +1638,7 @@ static void do_async_commit(struct work_struct *work) * Tell lockdep about it. */ if (ac->newtrans->type & __TRANS_FREEZABLE) - rwsem_acquire_read( - &ac->root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], - 0, 1, _THIS_IP_); + __sb_writers_acquired(ac->root->fs_info->sb, SB_FREEZE_FS); current->journal_info = ac->newtrans; @@ -1679,9 +1677,7 @@ int btrfs_commit_transaction_async(struct btrfs_trans_handle *trans, * async commit thread will be the one to unlock it. */ if (ac->newtrans->type & __TRANS_FREEZABLE) - rwsem_release( - &root->fs_info->sb->s_writers.lock_map[SB_FREEZE_FS-1], - 1, _THIS_IP_); + __sb_writers_release(root->fs_info->sb, SB_FREEZE_FS); schedule_work(&ac->work); diff --git a/fs/xfs/xfs_aops.c b/fs/xfs/xfs_aops.c index 3859f5e27a4d..9bbb3507376a 100644 --- a/fs/xfs/xfs_aops.c +++ b/fs/xfs/xfs_aops.c @@ -119,8 +119,7 @@ xfs_setfilesize_trans_alloc( * We may pass freeze protection with a transaction. So tell lockdep * we released it. */ - rwsem_release(&ioend->io_inode->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], - 1, _THIS_IP_); + __sb_writers_release(ioend->io_inode->i_sb, SB_FREEZE_FS); /* * We hand off the transaction to the completion thread now, so * clear the flag here. @@ -171,8 +170,7 @@ xfs_setfilesize_ioend( * Similarly for freeze protection. 
*/ current_set_flags_nested(&tp->t_pflags, PF_FSTRANS); - rwsem_acquire_read(&VFS_I(ip)->i_sb->s_writers.lock_map[SB_FREEZE_FS-1], - 0, 1, _THIS_IP_); + __sb_writers_acquired(VFS_I(ip)->i_sb, SB_FREEZE_FS); return xfs_setfilesize(ip, tp, ioend->io_offset, ioend->io_size); } diff --git a/include/linux/fs.h b/include/linux/fs.h index 84b783f277f7..acb7cad84edd 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1391,6 +1391,11 @@ extern struct timespec current_fs_time(struct super_block *sb); void __sb_end_write(struct super_block *sb, int level); int __sb_start_write(struct super_block *sb, int level, bool wait); +#define __sb_writers_acquired(sb, lev) \ + rwsem_acquire_read(&(sb)->s_writers.lock_map[(lev)-1], 0, 1, _THIS_IP_) +#define __sb_writers_release(sb, lev) \ + rwsem_release(&(sb)->s_writers.lock_map[(lev)-1], 1, _THIS_IP_) + /** * sb_end_write - drop write access to a superblock * @sb: the super we wrote to From f4b554af9931585174d4913b482eacab75858964 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Mon, 20 Jul 2015 00:50:55 +0200 Subject: [PATCH 49/65] fix the broken lockdep logic in __sb_start_write() 1. wait_event(frozen < level) without rwsem_acquire_read() is just wrong from lockdep perspective. If we are going to deadlock because the caller is buggy, lockdep can't detect this problem. 2. __sb_start_write() can race with thaw_super() + freeze_super(), and after "goto retry" the 2nd acquire_freeze_lock() is wrong. 3. The "tell lockdep we are doing trylock" hack doesn't look nice. I think this is correct, but this logic should be more explicit. Yes, the recursive read_lock() is fine if we hold the lock on a higher level. But we do not need to fool lockdep. If we can not deadlock in this case then try-lock must not fail and we can use wait == F throughout this code. Note: as Dave Chinner explains, the "trylock" hack and the fat comment can be probably removed.
But this needs a separate change and it will be trivial: just kill __sb_start_write() and rename do_sb_start_write() back to __sb_start_write(). Signed-off-by: Oleg Nesterov Reviewed-by: Jan Kara --- fs/super.c | 73 ++++++++++++++++++++++++++++++------------------------ 1 file changed, 40 insertions(+), 33 deletions(-) diff --git a/fs/super.c b/fs/super.c index b61372354f2b..24a76bcd62a5 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1158,38 +1158,11 @@ void __sb_end_write(struct super_block *sb, int level) } EXPORT_SYMBOL(__sb_end_write); -#ifdef CONFIG_LOCKDEP -/* - * We want lockdep to tell us about possible deadlocks with freezing but - * it's it bit tricky to properly instrument it. Getting a freeze protection - * works as getting a read lock but there are subtle problems. XFS for example - * gets freeze protection on internal level twice in some cases, which is OK - * only because we already hold a freeze protection also on higher level. Due - * to these cases we have to tell lockdep we are doing trylock when we - * already hold a freeze protection for a higher freeze level. - */ -static void acquire_freeze_lock(struct super_block *sb, int level, bool trylock, +static int do_sb_start_write(struct super_block *sb, int level, bool wait, unsigned long ip) { - int i; - - if (!trylock) { - for (i = 0; i < level - 1; i++) - if (lock_is_held(&sb->s_writers.lock_map[i])) { - trylock = true; - break; - } - } - rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, trylock, ip); -} -#endif - -/* - * This is an internal function, please use sb_start_{write,pagefault,intwrite} - * instead. 
- */ -int __sb_start_write(struct super_block *sb, int level, bool wait) -{ + if (wait) + rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, 0, ip); retry: if (unlikely(sb->s_writers.frozen >= level)) { if (!wait) @@ -1198,9 +1171,6 @@ retry: sb->s_writers.frozen < level); } -#ifdef CONFIG_LOCKDEP - acquire_freeze_lock(sb, level, !wait, _RET_IP_); -#endif percpu_counter_inc(&sb->s_writers.counter[level-1]); /* * Make sure counter is updated before we check for frozen. @@ -1211,8 +1181,45 @@ retry: __sb_end_write(sb, level); goto retry; } + + if (!wait) + rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, 1, ip); return 1; } + +/* + * This is an internal function, please use sb_start_{write,pagefault,intwrite} + * instead. + */ +int __sb_start_write(struct super_block *sb, int level, bool wait) +{ + bool force_trylock = false; + int ret; + +#ifdef CONFIG_LOCKDEP + /* + * We want lockdep to tell us about possible deadlocks with freezing + * but it's it bit tricky to properly instrument it. Getting a freeze + * protection works as getting a read lock but there are subtle + * problems. XFS for example gets freeze protection on internal level + * twice in some cases, which is OK only because we already hold a + * freeze protection also on higher level. Due to these cases we have + * to use wait == F (trylock mode) which must not fail. 
+ */ + if (wait) { + int i; + + for (i = 0; i < level - 1; i++) + if (lock_is_held(&sb->s_writers.lock_map[i])) { + force_trylock = true; + break; + } + } +#endif + ret = do_sb_start_write(sb, level, wait && !force_trylock, _RET_IP_); + WARN_ON(force_trylock & !ret); + return ret; +} EXPORT_SYMBOL(__sb_start_write); /** From 0e28e01f1e73015d8e1b8fa1cda071d0bd9a2600 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 11 Aug 2015 16:28:29 +0200 Subject: [PATCH 50/65] document rwsem_release() in sb_wait_write() Not only we need to avoid the warning from lockdep_sys_exit(), the caller of freeze_super() can never release this lock. Another thread can do this, so there is another reason for rwsem_release(). Plus the comment should explain why we have to fool lockdep. Signed-off-by: Oleg Nesterov Reviewed-by: Jan Kara --- fs/super.c | 14 ++++++++++---- 1 file changed, 10 insertions(+), 4 deletions(-) diff --git a/fs/super.c b/fs/super.c index 24a76bcd62a5..8aa3cbc571d1 100644 --- a/fs/super.c +++ b/fs/super.c @@ -1236,11 +1236,17 @@ static void sb_wait_write(struct super_block *sb, int level) { s64 writers; - /* - * We just cycle-through lockdep here so that it does not complain - * about returning with lock to userspace - */ rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_); + /* + * We are going to return to userspace and forget about this lock, the + * ownership goes to the caller of thaw_super() which does unlock. + * + * FIXME: we should do this before return from freeze_super() after we + * called sync_filesystem(sb) and s_op->freeze_fs(sb), and thaw_super() + * should re-acquire these locks before s_op->unfreeze_fs(sb). However + * this leads to lockdep false-positives, so currently we do the early + * release right after acquire. 
+ */ rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_); do { From 9287f6925ad9d8fb8c6283066b4f77fd87f123a9 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 17:45:57 +0200 Subject: [PATCH 51/65] percpu-rwsem: introduce percpu_down_read_trylock() Add percpu_down_read_trylock(), it will have the user soon. Signed-off-by: Oleg Nesterov --- include/linux/percpu-rwsem.h | 1 + kernel/locking/percpu-rwsem.c | 13 +++++++++++++ 2 files changed, 14 insertions(+) diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 3e88c9a7d57f..16c30cd33501 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -16,6 +16,7 @@ struct percpu_rw_semaphore { }; extern void percpu_down_read(struct percpu_rw_semaphore *); +extern int percpu_down_read_trylock(struct percpu_rw_semaphore *); extern void percpu_up_read(struct percpu_rw_semaphore *); extern void percpu_down_write(struct percpu_rw_semaphore *); diff --git a/kernel/locking/percpu-rwsem.c b/kernel/locking/percpu-rwsem.c index 652a8ee8efe9..f32567254867 100644 --- a/kernel/locking/percpu-rwsem.c +++ b/kernel/locking/percpu-rwsem.c @@ -88,6 +88,19 @@ void percpu_down_read(struct percpu_rw_semaphore *brw) __up_read(&brw->rw_sem); } +int percpu_down_read_trylock(struct percpu_rw_semaphore *brw) +{ + if (unlikely(!update_fast_ctr(brw, +1))) { + if (!__down_read_trylock(&brw->rw_sem)) + return 0; + atomic_inc(&brw->slow_read_ctr); + __up_read(&brw->rw_sem); + } + + rwsem_acquire_read(&brw->rw_sem.dep_map, 0, 1, _RET_IP_); + return 1; +} + void percpu_up_read(struct percpu_rw_semaphore *brw) { rwsem_release(&brw->rw_sem.dep_map, 1, _RET_IP_); From 55cc156505f2e43fa45dbd4bfe8f9c9d848ca44c Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 21 Jul 2015 20:26:44 +0200 Subject: [PATCH 52/65] percpu-rwsem: introduce percpu_rwsem_release() and percpu_rwsem_acquire() Add percpu_rwsem_release() and percpu_rwsem_acquire() for the users which need to return to userspace 
with percpu-rwsem lock held and/or pass the ownership to another thread. TODO: change percpu_rwsem_release() to use rwsem_clear_owner(). We can either fold kernel/locking/rwsem.h into include/linux/rwsem.h, or add the non-inline percpu_rwsem_clear_owner(). Signed-off-by: Oleg Nesterov --- include/linux/percpu-rwsem.h | 19 +++++++++++++++++++ 1 file changed, 19 insertions(+) diff --git a/include/linux/percpu-rwsem.h b/include/linux/percpu-rwsem.h index 16c30cd33501..834c4e52cb2d 100644 --- a/include/linux/percpu-rwsem.h +++ b/include/linux/percpu-rwsem.h @@ -32,4 +32,23 @@ extern void percpu_free_rwsem(struct percpu_rw_semaphore *); __percpu_init_rwsem(brw, #brw, &rwsem_key); \ }) + +#define percpu_rwsem_is_held(sem) lockdep_is_held(&(sem)->rw_sem) + +static inline void percpu_rwsem_release(struct percpu_rw_semaphore *sem, + bool read, unsigned long ip) +{ + lock_release(&sem->rw_sem.dep_map, 1, ip); +#ifdef CONFIG_RWSEM_SPIN_ON_OWNER + if (!read) + sem->rw_sem.owner = NULL; +#endif +} + +static inline void percpu_rwsem_acquire(struct percpu_rw_semaphore *sem, + bool read, unsigned long ip) +{ + lock_acquire(&sem->rw_sem.dep_map, 0, 1, read, 1, NULL, ip); +} + #endif From bf3eac84c42da7017610abc8cfba64921ea92c76 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 11 Aug 2015 17:26:29 +0200 Subject: [PATCH 53/65] percpu-rwsem: kill CONFIG_PERCPU_RWSEM Remove CONFIG_PERCPU_RWSEM, the next patch adds the unconditional user of percpu_rw_semaphore. 
Signed-off-by: Oleg Nesterov --- arch/Kconfig | 1 - init/Kconfig | 1 - kernel/locking/Makefile | 3 +-- lib/Kconfig | 3 --- 4 files changed, 1 insertion(+), 7 deletions(-) diff --git a/arch/Kconfig b/arch/Kconfig index 8a8ea7110de8..8f3638674e05 100644 --- a/arch/Kconfig +++ b/arch/Kconfig @@ -87,7 +87,6 @@ config KPROBES_ON_FTRACE config UPROBES def_bool n - select PERCPU_RWSEM help Uprobes is the user-space counterpart to kprobes: they enable instrumentation applications (such as 'perf probe') diff --git a/init/Kconfig b/init/Kconfig index af09b4fb43d2..288c0122c2a5 100644 --- a/init/Kconfig +++ b/init/Kconfig @@ -925,7 +925,6 @@ config NUMA_BALANCING_DEFAULT_ENABLED menuconfig CGROUPS bool "Control Group support" select KERNFS - select PERCPU_RWSEM help This option adds support for grouping sets of processes together, for use with process control subsystems such as Cpusets, CFS, memory diff --git a/kernel/locking/Makefile b/kernel/locking/Makefile index 7dd5c9918e4c..4c6a97e1a849 100644 --- a/kernel/locking/Makefile +++ b/kernel/locking/Makefile @@ -1,5 +1,5 @@ -obj-y += mutex.o semaphore.o rwsem.o +obj-y += mutex.o semaphore.o rwsem.o percpu-rwsem.o ifdef CONFIG_FUNCTION_TRACER CFLAGS_REMOVE_lockdep.o = $(CC_FLAGS_FTRACE) @@ -25,6 +25,5 @@ obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock.o obj-$(CONFIG_DEBUG_SPINLOCK) += spinlock_debug.o obj-$(CONFIG_RWSEM_GENERIC_SPINLOCK) += rwsem-spinlock.o obj-$(CONFIG_RWSEM_XCHGADD_ALGORITHM) += rwsem-xadd.o -obj-$(CONFIG_PERCPU_RWSEM) += percpu-rwsem.o obj-$(CONFIG_QUEUED_RWLOCKS) += qrwlock.o obj-$(CONFIG_LOCK_TORTURE_TEST) += locktorture.o diff --git a/lib/Kconfig b/lib/Kconfig index 3a2ef67db6c7..f6aa03dc1576 100644 --- a/lib/Kconfig +++ b/lib/Kconfig @@ -53,9 +53,6 @@ config GENERIC_IO config STMP_DEVICE bool -config PERCPU_RWSEM - bool - config ARCH_USE_CMPXCHG_LOCKREF bool From 853b39a7c82826b8413048feec7bf08e98ce7a84 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Wed, 22 Jul 2015 20:21:13 +0200 Subject: [PATCH 54/65] 
shift percpu_counter_destroy() into destroy_super_work() Of course, this patch is ugly as hell. It will be (partially) reverted later. We add it to ensure that other WIP changes in percpu_rw_semaphore won't break fs/super.c. We do not even need this change right now, percpu_free_rwsem() is fine in atomic context. But we are going to change this, it will be might_sleep() after we merge the rcu_sync() patches. And even after that we do not really need destroy_super_work(), we will kill it in any case. Instead, destroy_super_rcu() should just check that rss->cb_state == CB_IDLE and do call_rcu() again in the (very unlikely) case this is not true. So this is just the temporary kludge which helps us to avoid the conflicts with the changes which will be (hopefully) routed via rcu tree. Signed-off-by: Oleg Nesterov Reviewed-by: Jan Kara --- fs/super.c | 23 +++++++++++++++++++---- include/linux/fs.h | 3 ++- 2 files changed, 21 insertions(+), 5 deletions(-) diff --git a/fs/super.c b/fs/super.c index 8aa3cbc571d1..c937bd7b4d33 100644 --- a/fs/super.c +++ b/fs/super.c @@ -135,6 +135,24 @@ static unsigned long super_cache_count(struct shrinker *shrink, return total_objects; } +static void destroy_super_work(struct work_struct *work) +{ + struct super_block *s = container_of(work, struct super_block, + destroy_work); + int i; + + for (i = 0; i < SB_FREEZE_LEVELS; i++) + percpu_counter_destroy(&s->s_writers.counter[i]); + kfree(s); +} + +static void destroy_super_rcu(struct rcu_head *head) +{ + struct super_block *s = container_of(head, struct super_block, rcu); + INIT_WORK(&s->destroy_work, destroy_super_work); + schedule_work(&s->destroy_work); +} + /** * destroy_super - frees a superblock * @s: superblock to free @@ -143,16 +161,13 @@ static unsigned long super_cache_count(struct shrinker *shrink, */ static void destroy_super(struct super_block *s) { - int i; list_lru_destroy(&s->s_dentry_lru); list_lru_destroy(&s->s_inode_lru); - for (i = 0; i < SB_FREEZE_LEVELS; i++) - 
percpu_counter_destroy(&s->s_writers.counter[i]); security_sb_free(s); WARN_ON(!list_empty(&s->s_mounts)); kfree(s->s_subtype); kfree(s->s_options); - kfree_rcu(s, rcu); + call_rcu(&s->rcu, destroy_super_rcu); } /** diff --git a/include/linux/fs.h b/include/linux/fs.h index acb7cad84edd..4bed78966c6b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -30,6 +30,7 @@ #include #include #include +#include #include #include @@ -1375,7 +1376,7 @@ struct super_block { struct list_lru s_dentry_lru ____cacheline_aligned_in_smp; struct list_lru s_inode_lru ____cacheline_aligned_in_smp; struct rcu_head rcu; - + struct work_struct destroy_work; /* * Indicates how deep in a filesystem stack this SB is */ From 8129ed29644bf56ed17ec1bbbeed5c568b43d6a0 Mon Sep 17 00:00:00 2001 From: Oleg Nesterov Date: Tue, 11 Aug 2015 17:05:04 +0200 Subject: [PATCH 55/65] change sb_writers to use percpu_rw_semaphore We can remove everything from struct sb_writers except frozen and add the array of percpu_rw_semaphore's instead. This patch doesn't remove sb_writers->wait_unfrozen yet, we keep it for get_super_thawed(). We will probably remove it later. This change tries to address the following problems: - Firstly, __sb_start_write() looks simply buggy. It does __sb_end_write() if it sees ->frozen, but if it migrates to another CPU before percpu_counter_dec(), sb_wait_write() can wrongly succeed if there is another task which holds the same "semaphore": sb_wait_write() can miss the result of the previous percpu_counter_inc() but see the result of this percpu_counter_dec(). - As Dave Hansen reports, it is suboptimal. The trivial microbenchmark that writes to a tmpfs file in a loop runs 12% faster if we change this code to rely on RCU and kill the memory barriers. - This code doesn't look simple. It would be better to rely on the generic locking code. According to Dave, this change adds the same performance improvement. 
Note: with this change both freeze_super() and thaw_super() will do synchronize_sched_expedited() 3 times. This is just ugly. But: - This will be "fixed" by the rcu_sync changes we are going to merge. After that freeze_super()->percpu_down_write() will use synchronize_sched(), and thaw_super() won't use synchronize() at all. This doesn't need any changes in fs/super.c. - Once we merge rcu_sync changes, we can also change super.c so that all sb_writers->rw_sem's will share the single ->rss in struct sb_writers, then freeze_super() will need only one synchronize_sched(). Signed-off-by: Oleg Nesterov Reviewed-by: Jan Kara --- fs/super.c | 111 ++++++++++++--------------------------------- include/linux/fs.h | 19 +++----- 2 files changed, 36 insertions(+), 94 deletions(-) diff --git a/fs/super.c b/fs/super.c index c937bd7b4d33..767b1e10f6ad 100644 --- a/fs/super.c +++ b/fs/super.c @@ -142,7 +142,7 @@ static void destroy_super_work(struct work_struct *work) int i; for (i = 0; i < SB_FREEZE_LEVELS; i++) - percpu_counter_destroy(&s->s_writers.counter[i]); + percpu_free_rwsem(&s->s_writers.rw_sem[i]); kfree(s); } @@ -193,13 +193,11 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) goto fail; for (i = 0; i < SB_FREEZE_LEVELS; i++) { - if (percpu_counter_init(&s->s_writers.counter[i], 0, - GFP_KERNEL) < 0) + if (__percpu_init_rwsem(&s->s_writers.rw_sem[i], + sb_writers_name[i], + &type->s_writers_key[i])) goto fail; - lockdep_init_map(&s->s_writers.lock_map[i], sb_writers_name[i], - &type->s_writers_key[i], 0); } - init_waitqueue_head(&s->s_writers.wait); init_waitqueue_head(&s->s_writers.wait_unfrozen); s->s_bdi = &noop_backing_dev_info; s->s_flags = flags; @@ -1161,47 +1159,10 @@ out: */ void __sb_end_write(struct super_block *sb, int level) { - percpu_counter_dec(&sb->s_writers.counter[level-1]); - /* - * Make sure s_writers are updated before we wake up waiters in - * freeze_super().
- */ - smp_mb(); - if (waitqueue_active(&sb->s_writers.wait)) - wake_up(&sb->s_writers.wait); - rwsem_release(&sb->s_writers.lock_map[level-1], 1, _RET_IP_); + percpu_up_read(sb->s_writers.rw_sem + level-1); } EXPORT_SYMBOL(__sb_end_write); -static int do_sb_start_write(struct super_block *sb, int level, bool wait, - unsigned long ip) -{ - if (wait) - rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, 0, ip); -retry: - if (unlikely(sb->s_writers.frozen >= level)) { - if (!wait) - return 0; - wait_event(sb->s_writers.wait_unfrozen, - sb->s_writers.frozen < level); - } - - percpu_counter_inc(&sb->s_writers.counter[level-1]); - /* - * Make sure counter is updated before we check for frozen. - * freeze_super() first sets frozen and then checks the counter. - */ - smp_mb(); - if (unlikely(sb->s_writers.frozen >= level)) { - __sb_end_write(sb, level); - goto retry; - } - - if (!wait) - rwsem_acquire_read(&sb->s_writers.lock_map[level-1], 0, 1, ip); - return 1; -} - /* * This is an internal function, please use sb_start_{write,pagefault,intwrite} * instead. 
@@ -1209,7 +1170,7 @@ retry: int __sb_start_write(struct super_block *sb, int level, bool wait) { bool force_trylock = false; - int ret; + int ret = 1; #ifdef CONFIG_LOCKDEP /* @@ -1225,13 +1186,17 @@ int __sb_start_write(struct super_block *sb, int level, bool wait) int i; for (i = 0; i < level - 1; i++) - if (lock_is_held(&sb->s_writers.lock_map[i])) { + if (percpu_rwsem_is_held(sb->s_writers.rw_sem + i)) { force_trylock = true; break; } } #endif - ret = do_sb_start_write(sb, level, wait && !force_trylock, _RET_IP_); + if (wait && !force_trylock) + percpu_down_read(sb->s_writers.rw_sem + level-1); + else + ret = percpu_down_read_trylock(sb->s_writers.rw_sem + level-1); + WARN_ON(force_trylock & !ret); return ret; } @@ -1243,15 +1208,11 @@ EXPORT_SYMBOL(__sb_start_write); * @level: type of writers we wait for (normal vs page fault) * * This function waits until there are no writers of given type to given file - * system. Caller of this function should make sure there can be no new writers - * of type @level before calling this function. Otherwise this function can - * livelock. + * system. */ static void sb_wait_write(struct super_block *sb, int level) { - s64 writers; - - rwsem_acquire(&sb->s_writers.lock_map[level-1], 0, 0, _THIS_IP_); + percpu_down_write(sb->s_writers.rw_sem + level-1); /* * We are going to return to userspace and forget about this lock, the * ownership goes to the caller of thaw_super() which does unlock. @@ -1262,24 +1223,18 @@ static void sb_wait_write(struct super_block *sb, int level) * this leads to lockdep false-positives, so currently we do the early * release right after acquire. 
*/ - rwsem_release(&sb->s_writers.lock_map[level-1], 1, _THIS_IP_); + percpu_rwsem_release(sb->s_writers.rw_sem + level-1, 0, _THIS_IP_); +} - do { - DEFINE_WAIT(wait); +static void sb_freeze_unlock(struct super_block *sb) +{ + int level; - /* - * We use a barrier in prepare_to_wait() to separate setting - * of frozen and checking of the counter - */ - prepare_to_wait(&sb->s_writers.wait, &wait, - TASK_UNINTERRUPTIBLE); + for (level = 0; level < SB_FREEZE_LEVELS; ++level) + percpu_rwsem_acquire(sb->s_writers.rw_sem + level, 0, _THIS_IP_); - writers = percpu_counter_sum(&sb->s_writers.counter[level-1]); - if (writers) - schedule(); - - finish_wait(&sb->s_writers.wait, &wait); - } while (writers); + for (level = SB_FREEZE_LEVELS - 1; level >= 0; level--) + percpu_up_write(sb->s_writers.rw_sem + level); } /** @@ -1338,20 +1293,14 @@ int freeze_super(struct super_block *sb) return 0; } - /* From now on, no new normal writers can start */ sb->s_writers.frozen = SB_FREEZE_WRITE; - smp_wmb(); - /* Release s_umount to preserve sb_start_write -> s_umount ordering */ up_write(&sb->s_umount); - sb_wait_write(sb, SB_FREEZE_WRITE); + down_write(&sb->s_umount); /* Now we go and block page faults... 
*/ - down_write(&sb->s_umount); sb->s_writers.frozen = SB_FREEZE_PAGEFAULT; - smp_wmb(); - sb_wait_write(sb, SB_FREEZE_PAGEFAULT); /* All writers are done so after syncing there won't be dirty data */ @@ -1359,7 +1308,6 @@ int freeze_super(struct super_block *sb) /* Now wait for internal filesystem counter */ sb->s_writers.frozen = SB_FREEZE_FS; - smp_wmb(); sb_wait_write(sb, SB_FREEZE_FS); if (sb->s_op->freeze_fs) { @@ -1368,7 +1316,7 @@ int freeze_super(struct super_block *sb) printk(KERN_ERR "VFS:Filesystem freeze failed\n"); sb->s_writers.frozen = SB_UNFROZEN; - smp_wmb(); + sb_freeze_unlock(sb); wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); return ret; @@ -1400,8 +1348,10 @@ int thaw_super(struct super_block *sb) return -EINVAL; } - if (sb->s_flags & MS_RDONLY) + if (sb->s_flags & MS_RDONLY) { + sb->s_writers.frozen = SB_UNFROZEN; goto out; + } if (sb->s_op->unfreeze_fs) { error = sb->s_op->unfreeze_fs(sb); @@ -1413,12 +1363,11 @@ int thaw_super(struct super_block *sb) } } -out: sb->s_writers.frozen = SB_UNFROZEN; - smp_wmb(); + sb_freeze_unlock(sb); +out: wake_up(&sb->s_writers.wait_unfrozen); deactivate_locked_super(sb); - return 0; } EXPORT_SYMBOL(thaw_super); diff --git a/include/linux/fs.h b/include/linux/fs.h index 4bed78966c6b..ce356f66cc2a 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1,7 +1,6 @@ #ifndef _LINUX_FS_H #define _LINUX_FS_H - #include #include #include @@ -31,6 +30,7 @@ #include #include #include +#include #include #include @@ -1275,16 +1275,9 @@ enum { #define SB_FREEZE_LEVELS (SB_FREEZE_COMPLETE - 1) struct sb_writers { - /* Counters for counting writers at each level */ - struct percpu_counter counter[SB_FREEZE_LEVELS]; - wait_queue_head_t wait; /* queue for waiting for - writers / faults to finish */ - int frozen; /* Is sb frozen? 
*/ - wait_queue_head_t wait_unfrozen; /* queue for waiting for - sb to be thawed */ -#ifdef CONFIG_DEBUG_LOCK_ALLOC - struct lockdep_map lock_map[SB_FREEZE_LEVELS]; -#endif + int frozen; /* Is sb frozen? */ + wait_queue_head_t wait_unfrozen; /* for get_super_thawed() */ + struct percpu_rw_semaphore rw_sem[SB_FREEZE_LEVELS]; }; struct super_block { @@ -1393,9 +1386,9 @@ void __sb_end_write(struct super_block *sb, int level); int __sb_start_write(struct super_block *sb, int level, bool wait); #define __sb_writers_acquired(sb, lev) \ - rwsem_acquire_read(&(sb)->s_writers.lock_map[(lev)-1], 0, 1, _THIS_IP_) + percpu_rwsem_acquire(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) #define __sb_writers_release(sb, lev) \ - rwsem_release(&(sb)->s_writers.lock_map[(lev)-1], 1, _THIS_IP_) + percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_) /** * sb_end_write - drop write access to a superblock From d353d7587d02116b9732d5c06615aed75a4d3a47 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 4 Mar 2015 11:16:36 -0500 Subject: [PATCH 56/65] writeback: plug writeback at a high level Doing writeback on lots of little files causes terrible IOPS storms because of the per-mapping writeback plugging we do. This essentially causes immediate dispatch of IO for each mapping, regardless of the context in which writeback is occurring. IOWs, running a concurrent write-lots-of-small 4k files using fsmark on XFS results in a huge number of IOPS being issued for data writes. Metadata writes are sorted and plugged at a high level by XFS, so aggregate nicely into large IOs. However, data writeback IOs are dispatched in individual 4k IOs, even when the blocks of two consecutively written files are adjacent. Test VM: 8p, 8GB RAM, 4xSSD in RAID0, 100TB sparse XFS filesystem, metadata CRCs enabled.
Kernel: 3.10-rc5 + xfsdev + my 3.11 xfs queue (~70 patches) Test: $ ./fs_mark -D 10000 -S0 -n 10000 -s 4096 -L 120 -d /mnt/scratch/0 -d /mnt/scratch/1 -d /mnt/scratch/2 -d /mnt/scratch/3 -d /mnt/scratch/4 -d /mnt/scratch/5 -d /mnt/scratch/6 -d /mnt/scratch/7 Result: wall sys create rate Physical write IO time CPU (avg files/s) IOPS Bandwidth ----- ----- ------------ ------ --------- unpatched 6m56s 15m47s 24,000+/-500 26,000 130MB/s patched 5m06s 13m28s 32,800+/-600 1,500 180MB/s improvement -26.44% -14.68% +36.67% -94.23% +38.46% If I use zero length files, this workload runs at about 500 IOPS, so plugging drops the data IOs from roughly 25,500/s to 1000/s. 3 lines of code, 35% better throughput for 15% less CPU. The benefits of plugging at this layer are likely to be higher for spinning media as the IO patterns for this workload are going to make a much bigger difference on high IO latency devices..... Signed-off-by: Dave Chinner Signed-off-by: Josef Bacik Reviewed-by: Jan Kara Tested-by: Dave Chinner Reviewed-by: Christoph Hellwig --- fs/fs-writeback.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 518c6294bf6c..d98e37bbf417 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -1439,7 +1439,9 @@ static long writeback_sb_inodes(struct super_block *sb, unsigned long start_time = jiffies; long write_chunk; long wrote = 0; /* count both pages and inodes */ + struct blk_plug plug; + blk_start_plug(&plug); while (!list_empty(&wb->b_io)) { struct inode *inode = wb_inode(wb->b_io.prev); @@ -1537,6 +1539,7 @@ static long writeback_sb_inodes(struct super_block *sb, break; } } + blk_finish_plug(&plug); return wrote; } From cbedaac63481dea52327127a9f1c60f092bd6b07 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Thu, 12 Mar 2015 08:19:11 -0400 Subject: [PATCH 57/65] inode: add hlist_fake to avoid the inode hash lock in evict Some filesystems don't use the VFS inode hash and fake the fact they are hashed so that all the
writeback code works correctly. However, this means the evict() path still tries to remove the inode from the hash, meaning that the inode_hash_lock() needs to be taken unnecessarily. Hence under certain workloads the inode_hash_lock can be contended even if the inode is never actually hashed. To avoid this add hlist_fake to test if the inode isn't actually hashed to avoid taking the hash lock on inodes that have never been hashed. Based on Dave Chinner's inode: add IOP_NOTHASHED to avoid inode hash lock in evict based on Al's suggestions. Thanks, Signed-off-by: Josef Bacik Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Tested-by: Dave Chinner --- include/linux/fs.h | 2 +- include/linux/list.h | 5 +++++ 2 files changed, 6 insertions(+), 1 deletion(-) diff --git a/include/linux/fs.h b/include/linux/fs.h index 84b783f277f7..4a40fa843040 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -2608,7 +2608,7 @@ static inline void insert_inode_hash(struct inode *inode) extern void __remove_inode_hash(struct inode *); static inline void remove_inode_hash(struct inode *inode) { - if (!inode_unhashed(inode)) + if (!inode_unhashed(inode) && !hlist_fake(&inode->i_hash)) __remove_inode_hash(inode); } diff --git a/include/linux/list.h b/include/linux/list.h index feb773c76ee0..3e3e64a61002 100644 --- a/include/linux/list.h +++ b/include/linux/list.h @@ -672,6 +672,11 @@ static inline void hlist_add_fake(struct hlist_node *n) n->pprev = &n->next; } +static inline bool hlist_fake(struct hlist_node *h) +{ + return h->pprev == &h->next; +} + /* * Move a list from one list head to another. Fixup the pprev * reference of the first entry if it exists.
From 74278da9f70d84d715601fe794567a6d2bfdf078 Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 4 Mar 2015 12:37:22 -0500 Subject: [PATCH 58/65] inode: convert inode_sb_list_lock to per-sb The process of reducing contention on per-superblock inode lists starts with moving the locking to match the per-superblock inode list. This takes the global lock out of the picture and reduces the contention problems to within a single filesystem. This doesn't get rid of contention as the locks still have global CPU scope, but it does isolate operations on different superblocks from each other. Signed-off-by: Dave Chinner Signed-off-by: Josef Bacik Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Tested-by: Dave Chinner --- fs/block_dev.c | 12 ++++++------ fs/drop_caches.c | 10 ++++++---- fs/fs-writeback.c | 12 ++++++------ fs/inode.c | 28 +++++++++++++--------------- fs/internal.h | 1 - fs/notify/inode_mark.c | 20 ++++++++++---------- fs/quota/dquot.c | 16 ++++++++-------- fs/super.c | 3 ++- include/linux/fs.h | 5 ++++- include/linux/fsnotify_backend.h | 4 ++-- 10 files changed, 57 insertions(+), 54 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index 198243717da5..33b813e04f79 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -1769,7 +1769,7 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) { struct inode *inode, *old_inode = NULL; - spin_lock(&inode_sb_list_lock); + spin_lock(&blockdev_superblock->s_inode_list_lock); list_for_each_entry(inode, &blockdev_superblock->s_inodes, i_sb_list) { struct address_space *mapping = inode->i_mapping; @@ -1781,13 +1781,13 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&inode_sb_list_lock); + spin_unlock(&blockdev_superblock->s_inode_list_lock); /* * We hold a reference to 'inode' so it couldn't have been * removed from s_inodes list while we dropped the - * inode_sb_list_lock.
We cannot iput the inode now as we can + * s_inode_list_lock We cannot iput the inode now as we can * be holding the last reference and we cannot iput it under - * inode_sb_list_lock. So we keep the reference and iput it + * s_inode_list_lock. So we keep the reference and iput it * later. */ iput(old_inode); @@ -1795,8 +1795,8 @@ void iterate_bdevs(void (*func)(struct block_device *, void *), void *arg) func(I_BDEV(inode), arg); - spin_lock(&inode_sb_list_lock); + spin_lock(&blockdev_superblock->s_inode_list_lock); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&blockdev_superblock->s_inode_list_lock); iput(old_inode); } diff --git a/fs/drop_caches.c b/fs/drop_caches.c index 5718cb9f7273..d72d52b90433 100644 --- a/fs/drop_caches.c +++ b/fs/drop_caches.c @@ -17,7 +17,7 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) { struct inode *inode, *toput_inode = NULL; - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || @@ -27,13 +27,15 @@ static void drop_pagecache_sb(struct super_block *sb, void *unused) } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); + invalidate_mapping_pages(inode->i_mapping, 0, -1); iput(toput_inode); toput_inode = inode; - spin_lock(&inode_sb_list_lock); + + spin_lock(&sb->s_inode_list_lock); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); iput(toput_inode); } diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index d98e37bbf417..f45bf876579f 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2124,7 +2124,7 @@ static void wait_sb_inodes(struct super_block *sb) */ WARN_ON(!rwsem_is_locked(&sb->s_umount)); - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); /* * Data integrity sync. 
Must wait for all pages under writeback, @@ -2144,14 +2144,14 @@ static void wait_sb_inodes(struct super_block *sb) } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); /* * We hold a reference to 'inode' so it couldn't have been * removed from s_inodes list while we dropped the - * inode_sb_list_lock. We cannot iput the inode now as we can + * s_inode_list_lock. We cannot iput the inode now as we can * be holding the last reference and we cannot iput it under - * inode_sb_list_lock. So we keep the reference and iput it + * s_inode_list_lock. So we keep the reference and iput it * later. */ iput(old_inode); @@ -2161,9 +2161,9 @@ static void wait_sb_inodes(struct super_block *sb) cond_resched(); - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); iput(old_inode); } diff --git a/fs/inode.c b/fs/inode.c index d30640f7a193..a2de294f6b77 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -28,8 +28,8 @@ * inode->i_state, inode->i_hash, __iget() * Inode LRU list locks protect: * inode->i_sb->s_inode_lru, inode->i_lru - * inode_sb_list_lock protects: - * sb->s_inodes, inode->i_sb_list + * inode->i_sb->s_inode_list_lock protects: + * inode->i_sb->s_inodes, inode->i_sb_list * bdi->wb.list_lock protects: * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list * inode_hash_lock protects: @@ -37,7 +37,7 @@ * * Lock ordering: * - * inode_sb_list_lock + * inode->i_sb->s_inode_list_lock * inode->i_lock * Inode LRU list locks * @@ -45,7 +45,7 @@ * inode->i_lock * * inode_hash_lock - * inode_sb_list_lock + * inode->i_sb->s_inode_list_lock * inode->i_lock * * iunique_lock @@ -57,8 +57,6 @@ static unsigned int i_hash_shift __read_mostly; static struct hlist_head *inode_hashtable __read_mostly; static __cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_hash_lock); -__cacheline_aligned_in_smp DEFINE_SPINLOCK(inode_sb_list_lock); - 
/* * Empty aops. Can be used for the cases where the user does not * define any of the address_space operations. @@ -426,18 +424,18 @@ static void inode_lru_list_del(struct inode *inode) */ void inode_sb_list_add(struct inode *inode) { - spin_lock(&inode_sb_list_lock); + spin_lock(&inode->i_sb->s_inode_list_lock); list_add(&inode->i_sb_list, &inode->i_sb->s_inodes); - spin_unlock(&inode_sb_list_lock); + spin_unlock(&inode->i_sb->s_inode_list_lock); } EXPORT_SYMBOL_GPL(inode_sb_list_add); static inline void inode_sb_list_del(struct inode *inode) { if (!list_empty(&inode->i_sb_list)) { - spin_lock(&inode_sb_list_lock); + spin_lock(&inode->i_sb->s_inode_list_lock); list_del_init(&inode->i_sb_list); - spin_unlock(&inode_sb_list_lock); + spin_unlock(&inode->i_sb->s_inode_list_lock); } } @@ -594,7 +592,7 @@ void evict_inodes(struct super_block *sb) struct inode *inode, *next; LIST_HEAD(dispose); - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { if (atomic_read(&inode->i_count)) continue; @@ -610,7 +608,7 @@ void evict_inodes(struct super_block *sb) spin_unlock(&inode->i_lock); list_add(&inode->i_lru, &dispose); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); dispose_list(&dispose); } @@ -631,7 +629,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) struct inode *inode, *next; LIST_HEAD(dispose); - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); if (inode->i_state & (I_NEW | I_FREEING | I_WILL_FREE)) { @@ -654,7 +652,7 @@ int invalidate_inodes(struct super_block *sb, bool kill_dirty) spin_unlock(&inode->i_lock); list_add(&inode->i_lru, &dispose); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); dispose_list(&dispose); @@ -890,7 +888,7 @@ struct inode *new_inode(struct super_block *sb) { struct inode 
*inode; - spin_lock_prefetch(&inode_sb_list_lock); + spin_lock_prefetch(&sb->s_inode_list_lock); inode = new_inode_pseudo(sb); if (inode) diff --git a/fs/internal.h b/fs/internal.h index 4d5af583ab03..ee1209c54eb1 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -112,7 +112,6 @@ extern int vfs_open(const struct path *, struct file *, const struct cred *); /* * inode.c */ -extern spinlock_t inode_sb_list_lock; extern long prune_icache_sb(struct super_block *sb, struct shrink_control *sc); extern void inode_add_lru(struct inode *inode); diff --git a/fs/notify/inode_mark.c b/fs/notify/inode_mark.c index 3daf513ee99e..a4e1a8f6c329 100644 --- a/fs/notify/inode_mark.c +++ b/fs/notify/inode_mark.c @@ -163,17 +163,17 @@ int fsnotify_add_inode_mark(struct fsnotify_mark *mark, /** * fsnotify_unmount_inodes - an sb is unmounting. handle any watched inodes. - * @list: list of inodes being unmounted (sb->s_inodes) + * @sb: superblock being unmounted. * * Called during unmount with no locks held, so needs to be safe against - * concurrent modifiers. We temporarily drop inode_sb_list_lock and CAN block. + * concurrent modifiers. We temporarily drop sb->s_inode_list_lock and CAN block. */ -void fsnotify_unmount_inodes(struct list_head *list) +void fsnotify_unmount_inodes(struct super_block *sb) { struct inode *inode, *next_i, *need_iput = NULL; - spin_lock(&inode_sb_list_lock); - list_for_each_entry_safe(inode, next_i, list, i_sb_list) { + spin_lock(&sb->s_inode_list_lock); + list_for_each_entry_safe(inode, next_i, &sb->s_inodes, i_sb_list) { struct inode *need_iput_tmp; /* @@ -209,7 +209,7 @@ void fsnotify_unmount_inodes(struct list_head *list) spin_unlock(&inode->i_lock); /* In case the dropping of a reference would nuke next_i. 
*/ - while (&next_i->i_sb_list != list) { + while (&next_i->i_sb_list != &sb->s_inodes) { spin_lock(&next_i->i_lock); if (!(next_i->i_state & (I_FREEING | I_WILL_FREE)) && atomic_read(&next_i->i_count)) { @@ -224,12 +224,12 @@ void fsnotify_unmount_inodes(struct list_head *list) } /* - * We can safely drop inode_sb_list_lock here because either + * We can safely drop s_inode_list_lock here because either * we actually hold references on both inode and next_i or * end of list. Also no new inodes will be added since the * umount has begun. */ - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); if (need_iput_tmp) iput(need_iput_tmp); @@ -241,7 +241,7 @@ void fsnotify_unmount_inodes(struct list_head *list) iput(inode); - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); } diff --git a/fs/quota/dquot.c b/fs/quota/dquot.c index 20d1f74561cf..2863ec6cbadf 100644 --- a/fs/quota/dquot.c +++ b/fs/quota/dquot.c @@ -923,7 +923,7 @@ static void add_dquot_ref(struct super_block *sb, int type) int reserved = 0; #endif - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { spin_lock(&inode->i_lock); if ((inode->i_state & (I_FREEING|I_WILL_FREE|I_NEW)) || @@ -934,7 +934,7 @@ static void add_dquot_ref(struct super_block *sb, int type) } __iget(inode); spin_unlock(&inode->i_lock); - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); #ifdef CONFIG_QUOTA_DEBUG if (unlikely(inode_get_rsv_space(inode) > 0)) @@ -946,15 +946,15 @@ static void add_dquot_ref(struct super_block *sb, int type) /* * We hold a reference to 'inode' so it couldn't have been * removed from s_inodes list while we dropped the - * inode_sb_list_lock We cannot iput the inode now as we can be + * s_inode_list_lock. 
We cannot iput the inode now as we can be * holding the last reference and we cannot iput it under - * inode_sb_list_lock. So we keep the reference and iput it + * s_inode_list_lock. So we keep the reference and iput it * later. */ old_inode = inode; - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); iput(old_inode); #ifdef CONFIG_QUOTA_DEBUG @@ -1023,7 +1023,7 @@ static void remove_dquot_ref(struct super_block *sb, int type, struct inode *inode; int reserved = 0; - spin_lock(&inode_sb_list_lock); + spin_lock(&sb->s_inode_list_lock); list_for_each_entry(inode, &sb->s_inodes, i_sb_list) { /* * We have to scan also I_NEW inodes because they can already @@ -1039,7 +1039,7 @@ static void remove_dquot_ref(struct super_block *sb, int type, } spin_unlock(&dq_data_lock); } - spin_unlock(&inode_sb_list_lock); + spin_unlock(&sb->s_inode_list_lock); #ifdef CONFIG_QUOTA_DEBUG if (reserved) { printk(KERN_WARNING "VFS (%s): Writes happened after quota" diff --git a/fs/super.c b/fs/super.c index b61372354f2b..c808183554a2 100644 --- a/fs/super.c +++ b/fs/super.c @@ -191,6 +191,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_anon); INIT_LIST_HEAD(&s->s_inodes); + spin_lock_init(&s->s_inode_list_lock); if (list_lru_init_memcg(&s->s_dentry_lru)) goto fail; @@ -399,7 +400,7 @@ void generic_shutdown_super(struct super_block *sb) sync_filesystem(sb); sb->s_flags &= ~MS_ACTIVE; - fsnotify_unmount_inodes(&sb->s_inodes); + fsnotify_unmount_inodes(sb); evict_inodes(sb); diff --git a/include/linux/fs.h b/include/linux/fs.h index 4a40fa843040..09bbd38485f9 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1309,7 +1309,6 @@ struct super_block { #endif const struct xattr_handler **s_xattr; - struct list_head s_inodes; /* all inodes */ struct hlist_bl_head s_anon; /* anonymous dentries for 
(nfs) exporting */ struct list_head s_mounts; /* list of mounts; _not_ for fs use */ struct block_device *s_bdev; @@ -1380,6 +1379,10 @@ struct super_block { * Indicates how deep in a filesystem stack this SB is */ int s_stack_depth; + + /* s_inode_list_lock protects s_inodes */ + spinlock_t s_inode_list_lock ____cacheline_aligned_in_smp; + struct list_head s_inodes; /* all inodes */ }; extern struct timespec current_fs_time(struct super_block *sb); diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h index 65a517dd32f7..0390ee69c439 100644 --- a/include/linux/fsnotify_backend.h +++ b/include/linux/fsnotify_backend.h @@ -357,7 +357,7 @@ extern void fsnotify_clear_marks_by_group_flags(struct fsnotify_group *group, un extern void fsnotify_clear_marks_by_group(struct fsnotify_group *group); extern void fsnotify_get_mark(struct fsnotify_mark *mark); extern void fsnotify_put_mark(struct fsnotify_mark *mark); -extern void fsnotify_unmount_inodes(struct list_head *list); +extern void fsnotify_unmount_inodes(struct super_block *sb); /* put here because inotify does some weird stuff when destroying watches */ extern void fsnotify_init_event(struct fsnotify_event *event, @@ -393,7 +393,7 @@ static inline u32 fsnotify_get_cookie(void) return 0; } -static inline void fsnotify_unmount_inodes(struct list_head *list) +static inline void fsnotify_unmount_inodes(struct super_block *sb) {} #endif /* CONFIG_FSNOTIFY */ From e97fedb9ef9868ff24d588be781906cf7c1b59ae Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 4 Mar 2015 13:40:00 -0500 Subject: [PATCH 59/65] sync: serialise per-superblock sync operations When competing sync(2) calls walk the same filesystem, they need to walk the list of inodes on the superblock to find all the inodes that we need to wait for IO completion on. However, when multiple wait_sb_inodes() calls do this at the same time, they contend on the inode_sb_list_lock and the contention causes system wide slowdowns.
In effect, concurrent sync(2) calls can take longer and burn more CPU than if they were serialised. Stop the worst of the contention by adding a per-sb mutex to wrap around wait_sb_inodes() so that we only execute one sync(2) IO completion walk per superblock at a time and hence avoid contention being triggered by concurrent sync(2) calls. Signed-off-by: Dave Chinner Signed-off-by: Josef Bacik Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Tested-by: Dave Chinner --- fs/fs-writeback.c | 11 +++++++++++ fs/super.c | 1 + include/linux/fs.h | 2 ++ 3 files changed, 14 insertions(+) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index f45bf876579f..3c974442bdf0 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -2114,6 +2114,15 @@ out_unlock_inode: } EXPORT_SYMBOL(__mark_inode_dirty); +/* + * The @s_sync_lock is used to serialise concurrent sync operations + * to avoid lock contention problems with concurrent wait_sb_inodes() calls. + * Concurrent callers will block on the s_sync_lock rather than doing contending + * walks. The queueing maintains sync(2) required behaviour as all the IO that + * has been issued up to the time this function is enter is guaranteed to be + * completed by the time we have gained the lock and waited for all IO that is + * in progress regardless of the order callers are granted the lock.
+ */ static void wait_sb_inodes(struct super_block *sb) { struct inode *inode, *old_inode = NULL; @@ -2124,6 +2133,7 @@ static void wait_sb_inodes(struct super_block *sb) */ WARN_ON(!rwsem_is_locked(&sb->s_umount)); + mutex_lock(&sb->s_sync_lock); spin_lock(&sb->s_inode_list_lock); /* @@ -2165,6 +2175,7 @@ static void wait_sb_inodes(struct super_block *sb) } spin_unlock(&sb->s_inode_list_lock); iput(old_inode); + mutex_unlock(&sb->s_sync_lock); } static void __writeback_inodes_sb_nr(struct super_block *sb, unsigned long nr, diff --git a/fs/super.c b/fs/super.c index c808183554a2..fd427ec0b372 100644 --- a/fs/super.c +++ b/fs/super.c @@ -190,6 +190,7 @@ static struct super_block *alloc_super(struct file_system_type *type, int flags) s->s_flags = flags; INIT_HLIST_NODE(&s->s_instances); INIT_HLIST_BL_HEAD(&s->s_anon); + mutex_init(&s->s_sync_lock); INIT_LIST_HEAD(&s->s_inodes); spin_lock_init(&s->s_inode_list_lock); diff --git a/include/linux/fs.h b/include/linux/fs.h index 09bbd38485f9..82dfc5519b4b 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1375,6 +1375,8 @@ struct super_block { struct list_lru s_inode_lru ____cacheline_aligned_in_smp; struct rcu_head rcu; + struct mutex s_sync_lock; /* sync serialisation lock */ + /* * Indicates how deep in a filesystem stack this SB is */ From c7f5408493aeb01532927b2276316797a03ed6ee Mon Sep 17 00:00:00 2001 From: Dave Chinner Date: Wed, 4 Mar 2015 14:07:22 -0500 Subject: [PATCH 60/65] inode: rename i_wb_list to i_io_list There's a small consistency problem between the inode and writeback naming. Writeback calls the "for IO" inode queues b_io and b_more_io, but the inode calls these the "writeback list" or i_wb_list. This makes it hard to add a new "under writeback" list to the inode, or call it an "under IO" list on the bdi because either way we'll have writeback on IO and IO on writeback and it'll just be confusing. I'm getting confused just writing this!
So, rename the inode "for IO" list variable to i_io_list so we can add a new "writeback list" in a subsequent patch. Signed-off-by: Dave Chinner Signed-off-by: Josef Bacik Reviewed-by: Jan Kara Reviewed-by: Christoph Hellwig Tested-by: Dave Chinner --- fs/fs-writeback.c | 46 +++++++++++++++++++++++----------------------- fs/inode.c | 8 ++++---- fs/internal.h | 2 +- include/linux/fs.h | 2 +- mm/backing-dev.c | 8 ++++---- 5 files changed, 33 insertions(+), 33 deletions(-) diff --git a/fs/fs-writeback.c b/fs/fs-writeback.c index 3c974442bdf0..63e00f11022e 100644 --- a/fs/fs-writeback.c +++ b/fs/fs-writeback.c @@ -88,7 +88,7 @@ unsigned int dirtytime_expire_interval = 12 * 60 * 60; static inline struct inode *wb_inode(struct list_head *head) { - return list_entry(head, struct inode, i_wb_list); + return list_entry(head, struct inode, i_io_list); } /* @@ -125,22 +125,22 @@ static void wb_io_lists_depopulated(struct bdi_writeback *wb) } /** - * inode_wb_list_move_locked - move an inode onto a bdi_writeback IO list + * inode_io_list_move_locked - move an inode onto a bdi_writeback IO list * @inode: inode to be moved * @wb: target bdi_writeback * @head: one of @wb->b_{dirty|io|more_io} * - * Move @inode->i_wb_list to @list of @wb and set %WB_has_dirty_io. + * Move @inode->i_io_list to @list of @wb and set %WB_has_dirty_io. * Returns %true if @inode is the first occupant of the !dirty_time IO * lists; otherwise, %false. 
*/ -static bool inode_wb_list_move_locked(struct inode *inode, +static bool inode_io_list_move_locked(struct inode *inode, struct bdi_writeback *wb, struct list_head *head) { assert_spin_locked(&wb->list_lock); - list_move(&inode->i_wb_list, head); + list_move(&inode->i_io_list, head); /* dirty_time doesn't count as dirty_io until expiration */ if (head != &wb->b_dirty_time) @@ -151,19 +151,19 @@ static bool inode_wb_list_move_locked(struct inode *inode, } /** - * inode_wb_list_del_locked - remove an inode from its bdi_writeback IO list + * inode_io_list_del_locked - remove an inode from its bdi_writeback IO list * @inode: inode to be removed * @wb: bdi_writeback @inode is being removed from * * Remove @inode which may be on one of @wb->b_{dirty|io|more_io} lists and * clear %WB_has_dirty_io if all are empty afterwards. */ -static void inode_wb_list_del_locked(struct inode *inode, +static void inode_io_list_del_locked(struct inode *inode, struct bdi_writeback *wb) { assert_spin_locked(&wb->list_lock); - list_del_init(&inode->i_wb_list); + list_del_init(&inode->i_io_list); wb_io_lists_depopulated(wb); } @@ -351,7 +351,7 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) /* * Once I_FREEING is visible under i_lock, the eviction path owns - * the inode and we shouldn't modify ->i_wb_list. + * the inode and we shouldn't modify ->i_io_list. */ if (unlikely(inode->i_state & I_FREEING)) goto skip_switch; @@ -390,16 +390,16 @@ static void inode_switch_wbs_work_fn(struct work_struct *work) * is always correct including from ->b_dirty_time. The transfer * preserves @inode->dirtied_when ordering. 
*/ - if (!list_empty(&inode->i_wb_list)) { + if (!list_empty(&inode->i_io_list)) { struct inode *pos; - inode_wb_list_del_locked(inode, old_wb); + inode_io_list_del_locked(inode, old_wb); inode->i_wb = new_wb; - list_for_each_entry(pos, &new_wb->b_dirty, i_wb_list) + list_for_each_entry(pos, &new_wb->b_dirty, i_io_list) if (time_after_eq(inode->dirtied_when, pos->dirtied_when)) break; - inode_wb_list_move_locked(inode, new_wb, pos->i_wb_list.prev); + inode_io_list_move_locked(inode, new_wb, pos->i_io_list.prev); } else { inode->i_wb = new_wb; } @@ -961,12 +961,12 @@ void wb_start_background_writeback(struct bdi_writeback *wb) /* * Remove the inode from the writeback list it is on. */ -void inode_wb_list_del(struct inode *inode) +void inode_io_list_del(struct inode *inode) { struct bdi_writeback *wb; wb = inode_to_wb_and_lock_list(inode); - inode_wb_list_del_locked(inode, wb); + inode_io_list_del_locked(inode, wb); spin_unlock(&wb->list_lock); } @@ -988,7 +988,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) if (time_before(inode->dirtied_when, tail->dirtied_when)) inode->dirtied_when = jiffies; } - inode_wb_list_move_locked(inode, wb, &wb->b_dirty); + inode_io_list_move_locked(inode, wb, &wb->b_dirty); } /* @@ -996,7 +996,7 @@ static void redirty_tail(struct inode *inode, struct bdi_writeback *wb) */ static void requeue_io(struct inode *inode, struct bdi_writeback *wb) { - inode_wb_list_move_locked(inode, wb, &wb->b_more_io); + inode_io_list_move_locked(inode, wb, &wb->b_more_io); } static void inode_sync_complete(struct inode *inode) @@ -1055,7 +1055,7 @@ static int move_expired_inodes(struct list_head *delaying_queue, if (older_than_this && inode_dirtied_after(inode, *older_than_this)) break; - list_move(&inode->i_wb_list, &tmp); + list_move(&inode->i_io_list, &tmp); moved++; if (flags & EXPIRE_DIRTY_ATIME) set_bit(__I_DIRTY_TIME_EXPIRED, &inode->i_state); @@ -1078,7 +1078,7 @@ static int move_expired_inodes(struct list_head 
*delaying_queue, list_for_each_prev_safe(pos, node, &tmp) { inode = wb_inode(pos); if (inode->i_sb == sb) - list_move(&inode->i_wb_list, dispatch_queue); + list_move(&inode->i_io_list, dispatch_queue); } } out: @@ -1232,10 +1232,10 @@ static void requeue_inode(struct inode *inode, struct bdi_writeback *wb, redirty_tail(inode, wb); } else if (inode->i_state & I_DIRTY_TIME) { inode->dirtied_when = jiffies; - inode_wb_list_move_locked(inode, wb, &wb->b_dirty_time); + inode_io_list_move_locked(inode, wb, &wb->b_dirty_time); } else { /* The inode is clean. Remove from writeback lists. */ - inode_wb_list_del_locked(inode, wb); + inode_io_list_del_locked(inode, wb); } } @@ -1378,7 +1378,7 @@ writeback_single_inode(struct inode *inode, struct bdi_writeback *wb, * touch it. See comment above for explanation. */ if (!(inode->i_state & I_DIRTY_ALL)) - inode_wb_list_del_locked(inode, wb); + inode_io_list_del_locked(inode, wb); spin_unlock(&wb->list_lock); inode_sync_complete(inode); out: @@ -2091,7 +2091,7 @@ void __mark_inode_dirty(struct inode *inode, int flags) else dirty_list = &wb->b_dirty_time; - wakeup_bdi = inode_wb_list_move_locked(inode, wb, + wakeup_bdi = inode_io_list_move_locked(inode, wb, dirty_list); spin_unlock(&wb->list_lock); diff --git a/fs/inode.c b/fs/inode.c index a2de294f6b77..f09148e07198 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -31,7 +31,7 @@ * inode->i_sb->s_inode_list_lock protects: * inode->i_sb->s_inodes, inode->i_sb_list * bdi->wb.list_lock protects: - * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_wb_list + * bdi->wb.b_{dirty,io,more_io,dirty_time}, inode->i_io_list * inode_hash_lock protects: * inode_hashtable, inode->i_hash * @@ -357,7 +357,7 @@ void inode_init_once(struct inode *inode) memset(inode, 0, sizeof(*inode)); INIT_HLIST_NODE(&inode->i_hash); INIT_LIST_HEAD(&inode->i_devices); - INIT_LIST_HEAD(&inode->i_wb_list); + INIT_LIST_HEAD(&inode->i_io_list); INIT_LIST_HEAD(&inode->i_lru); address_space_init_once(&inode->i_data); 
i_size_ordered_init(inode); @@ -525,8 +525,8 @@ static void evict(struct inode *inode) BUG_ON(!(inode->i_state & I_FREEING)); BUG_ON(!list_empty(&inode->i_lru)); - if (!list_empty(&inode->i_wb_list)) - inode_wb_list_del(inode); + if (!list_empty(&inode->i_io_list)) + inode_io_list_del(inode); inode_sb_list_del(inode); diff --git a/fs/internal.h b/fs/internal.h index ee1209c54eb1..71859c4d0b41 100644 --- a/fs/internal.h +++ b/fs/internal.h @@ -118,7 +118,7 @@ extern void inode_add_lru(struct inode *inode); /* * fs-writeback.c */ -extern void inode_wb_list_del(struct inode *inode); +extern void inode_io_list_del(struct inode *inode); extern long get_nr_dirty_inodes(void); extern void evict_inodes(struct super_block *); diff --git a/include/linux/fs.h b/include/linux/fs.h index 82dfc5519b4b..34cfa60db678 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -636,7 +636,7 @@ struct inode { unsigned long dirtied_time_when; struct hlist_node i_hash; - struct list_head i_wb_list; /* backing dev IO list */ + struct list_head i_io_list; /* backing dev IO list */ #ifdef CONFIG_CGROUP_WRITEBACK struct bdi_writeback *i_wb; /* the associated cgroup wb */ diff --git a/mm/backing-dev.c b/mm/backing-dev.c index dac5bf59309d..ee8d7fd07be3 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -55,13 +55,13 @@ static int bdi_debug_stats_show(struct seq_file *m, void *v) nr_dirty = nr_io = nr_more_io = nr_dirty_time = 0; spin_lock(&wb->list_lock); - list_for_each_entry(inode, &wb->b_dirty, i_wb_list) + list_for_each_entry(inode, &wb->b_dirty, i_io_list) nr_dirty++; - list_for_each_entry(inode, &wb->b_io, i_wb_list) + list_for_each_entry(inode, &wb->b_io, i_io_list) nr_io++; - list_for_each_entry(inode, &wb->b_more_io, i_wb_list) + list_for_each_entry(inode, &wb->b_more_io, i_io_list) nr_more_io++; - list_for_each_entry(inode, &wb->b_dirty_time, i_wb_list) + list_for_each_entry(inode, &wb->b_dirty_time, i_io_list) if (inode->i_state & I_DIRTY_TIME) nr_dirty_time++; 
spin_unlock(&wb->list_lock); From ac05fbb40062411ea1b722aa2cede7feaa94f1b4 Mon Sep 17 00:00:00 2001 From: Josef Bacik Date: Wed, 4 Mar 2015 16:52:52 -0500 Subject: [PATCH 61/65] inode: don't softlockup when evicting inodes On a box with a lot of ram (148gb) I can make the box softlockup after running an fs_mark job that creates hundreds of millions of empty files. This is because we never generate enough memory pressure to keep the number of inodes on our unused list low, so when we go to unmount we have to evict ~100 million inodes. This makes one processor a very unhappy person, so add a cond_resched() in dispose_list() and if we need a resched when processing the s_inodes list do that and run dispose_list() on what we've currently culled. Thanks, Signed-off-by: Josef Bacik Reviewed-by: Jan Kara --- fs/inode.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/fs/inode.c b/fs/inode.c index f09148e07198..78a17b8859e1 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -575,6 +575,7 @@ static void dispose_list(struct list_head *head) list_del_init(&inode->i_lru); evict(inode); + cond_resched(); } } @@ -592,6 +593,7 @@ void evict_inodes(struct super_block *sb) struct inode *inode, *next; LIST_HEAD(dispose); +again: spin_lock(&sb->s_inode_list_lock); list_for_each_entry_safe(inode, next, &sb->s_inodes, i_sb_list) { if (atomic_read(&inode->i_count)) @@ -607,6 +609,18 @@ void evict_inodes(struct super_block *sb) inode_lru_list_del(inode); spin_unlock(&inode->i_lock); list_add(&inode->i_lru, &dispose); + + /* + * We can have a ton of inodes to evict at unmount time given + * enough memory, check to see if we need to go to sleep for a + * bit so we don't livelock. 
+ */ + if (need_resched()) { + spin_unlock(&sb->s_inode_list_lock); + cond_resched(); + dispose_list(&dispose); + goto again; + } } spin_unlock(&sb->s_inode_list_lock); From 6f179af88f60b32c2855e7f3e16ea8e336a7043f Mon Sep 17 00:00:00 2001 From: Hugh Dickins Date: Mon, 17 Aug 2015 17:34:27 -0700 Subject: [PATCH 62/65] mm: fix potential data race in SyS_swapon While running KernelThreadSanitizer (ktsan) on upstream kernel with trinity, we got a few reports from SyS_swapon, here is one of them: Read of size 8 by thread T307 (K7621): [< inlined >] SyS_swapon+0x3c0/0x1850 SYSC_swapon mm/swapfile.c:2395 [] SyS_swapon+0x3c0/0x1850 mm/swapfile.c:2345 [] ia32_do_call+0x1b/0x25 Looks like the swap_lock should be taken when iterating through the swap_info array on lines 2392 - 2401: q->swap_file may be reset to NULL by another thread before it is dereferenced for f_mapping. But why is that iteration needed at all? Doesn't the claim_swapfile() which follows do all that is needed to check for a duplicate entry - FMODE_EXCL on a bdev, testing IS_SWAPFILE under i_mutex on a regfile? Well, not quite: bd_may_claim() allows the same "holder" to claim the bdev again, so we do need to use a different holder than "sys_swapon"; and we should not replace appropriate -EBUSY by inappropriate -EINVAL. Index i was reused in a cpu loop further down: renamed cpu there. 
Reported-by: Andrey Konovalov Signed-off-by: Hugh Dickins Signed-off-by: Al Viro --- mm/swapfile.c | 25 +++++++------------------ 1 file changed, 7 insertions(+), 18 deletions(-) diff --git a/mm/swapfile.c b/mm/swapfile.c index 41e4581af7c5..aebc2dd6e649 100644 --- a/mm/swapfile.c +++ b/mm/swapfile.c @@ -2143,11 +2143,10 @@ static int claim_swapfile(struct swap_info_struct *p, struct inode *inode) if (S_ISBLK(inode->i_mode)) { p->bdev = bdgrab(I_BDEV(inode)); error = blkdev_get(p->bdev, - FMODE_READ | FMODE_WRITE | FMODE_EXCL, - sys_swapon); + FMODE_READ | FMODE_WRITE | FMODE_EXCL, p); if (error < 0) { p->bdev = NULL; - return -EINVAL; + return error; } p->old_block_size = block_size(p->bdev); error = set_blocksize(p->bdev, PAGE_SIZE); @@ -2348,7 +2347,6 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) struct filename *name; struct file *swap_file = NULL; struct address_space *mapping; - int i; int prio; int error; union swap_header *swap_header; @@ -2388,19 +2386,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) p->swap_file = swap_file; mapping = swap_file->f_mapping; - - for (i = 0; i < nr_swapfiles; i++) { - struct swap_info_struct *q = swap_info[i]; - - if (q == p || !q->swap_file) - continue; - if (mapping == q->swap_file->f_mapping) { - error = -EBUSY; - goto bad_swap; - } - } - inode = mapping->host; + /* If S_ISREG(inode->i_mode) will do mutex_lock(&inode->i_mutex); */ error = claim_swapfile(p, inode); if (unlikely(error)) @@ -2433,6 +2420,8 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) goto bad_swap; } if (p->bdev && blk_queue_nonrot(bdev_get_queue(p->bdev))) { + int cpu; + p->flags |= SWP_SOLIDSTATE; /* * select a random position to start with to help wear leveling @@ -2451,9 +2440,9 @@ SYSCALL_DEFINE2(swapon, const char __user *, specialfile, int, swap_flags) error = -ENOMEM; goto bad_swap; } - for_each_possible_cpu(i) { + for_each_possible_cpu(cpu) { struct 
percpu_cluster *cluster; - cluster = per_cpu_ptr(p->percpu_cluster, i); + cluster = per_cpu_ptr(p->percpu_cluster, cpu); cluster_set_null(&cluster->index); } } From cde93be45a8a90d8c264c776fab63487b5038a65 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 15 Aug 2015 13:36:12 -0500 Subject: [PATCH 63/65] dcache: Handle escaped paths in prepend_path A rename can result in a dentry that by walking up d_parent will never reach its mnt_root. For lack of a better term I call this an escaped path. prepend_path is called by four different functions __d_path, d_absolute_path, d_path, and getcwd. __d_path only wants to see paths that are connected to the root it passes in. So __d_path needs prepend_path to return an error. d_absolute_path similarly wants to see paths that are connected to some root. Escaped paths are not connected to any mnt_root so d_absolute_path needs prepend_path to return an error greater than 1. So escaped paths will be treated like paths on lazily unmounted mounts. getcwd needs to prepend "(unreachable)" so getcwd also needs prepend_path to return an error. d_path is the interesting hold out. d_path just wants to print something, and does not care about the weird cases. Which raises the question what should be printed? Given that <escaped_path>/<anything> should result in -ENOENT I believe it is desirable for escaped paths to be printed as empty paths. As there are not really any meaningful path components when considered from the perspective of a mount tree. So tweak prepend_path to return an empty path with a new error code of 3 when it encounters an escaped path. Signed-off-by: "Eric W. Biederman" Signed-off-by: Al Viro --- fs/dcache.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/dcache.c b/fs/dcache.c index 9b5fe503f6cb..e3b44ca75a1b 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2926,6 +2926,13 @@ restart: if (dentry == vfsmnt->mnt_root || IS_ROOT(dentry)) { struct mount *parent = ACCESS_ONCE(mnt->mnt_parent); + /* Escaped?
*/ + if (dentry != vfsmnt->mnt_root) { + bptr = *buffer; + blen = *buflen; + error = 3; + break; + } /* Global root? */ if (mnt != parent) { dentry = ACCESS_ONCE(mnt->mnt_mountpoint); From a03e283bf5c3d4851b4998122196ce9f849e6dfb Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 15 Aug 2015 13:36:41 -0500 Subject: [PATCH 64/65] dcache: Reduce the scope of i_lock in d_splice_alias i_lock is only needed until __d_find_any_alias calls dget on the alias dentry. After that the reference to new ensures that dentry_kill and d_delete will not remove the inode from the dentry, and remove the dentry from the inode->d_entry list. The inode i_lock came to be held over the __d_move calls in d_splice_alias through a series of introductions of locks with increasingly smaller scope. First it was the dcache_lock, then it was the dcache_inode_lock, and finally inode->i_lock. Furthermore inode->i_lock is not held over any other calls to d_move or __d_move so it can not provide any meaningful rename protection. Signed-off-by: "Eric W. Biederman" Signed-off-by: Al Viro --- fs/dcache.c | 7 +++---- 1 file changed, 3 insertions(+), 4 deletions(-) diff --git a/fs/dcache.c b/fs/dcache.c index e3b44ca75a1b..5c33aeb0f68f 100644 --- a/fs/dcache.c +++ b/fs/dcache.c @@ -2718,7 +2718,7 @@ struct dentry *d_ancestor(struct dentry *p1, struct dentry *p2) * This helper attempts to cope with remotely renamed directories * * It assumes that the caller is already holding - * dentry->d_parent->d_inode->i_mutex, inode->i_lock and rename_lock + * dentry->d_parent->d_inode->i_mutex, and rename_lock * * Note: If ever the locking in lock_rename() changes, then please * remember to update this too...
@@ -2744,7 +2744,6 @@ out_unalias: __d_move(alias, dentry, false); ret = 0; out_err: - spin_unlock(&inode->i_lock); if (m2) mutex_unlock(m2); if (m1) @@ -2790,10 +2789,11 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) if (S_ISDIR(inode->i_mode)) { struct dentry *new = __d_find_any_alias(inode); if (unlikely(new)) { + /* The reference to new ensures it remains an alias */ + spin_unlock(&inode->i_lock); write_seqlock(&rename_lock); if (unlikely(d_ancestor(new, dentry))) { write_sequnlock(&rename_lock); - spin_unlock(&inode->i_lock); dput(new); new = ERR_PTR(-ELOOP); pr_warn_ratelimited( @@ -2812,7 +2812,6 @@ struct dentry *d_splice_alias(struct inode *inode, struct dentry *dentry) } else { __d_move(new, dentry, false); write_sequnlock(&rename_lock); - spin_unlock(&inode->i_lock); security_d_instantiate(new, inode); } iput(inode); From 397d425dc26da728396e66d392d5dcb8dac30c37 Mon Sep 17 00:00:00 2001 From: "Eric W. Biederman" Date: Sat, 15 Aug 2015 20:27:13 -0500 Subject: [PATCH 65/65] vfs: Test for and handle paths that are unreachable from their mnt_root In rare cases a directory can be renamed out from under a bind mount. In those cases without special handling it becomes possible to walk up the directory tree to the root dentry of the filesystem and down from the root dentry to every other file or directory on the filesystem. Like division by zero, .. from an unconnected path can not be given a useful semantic as there is no predicting at which path component the code will realize it is unconnected. We certainly can not match the current behavior as the current behavior is a security hole. Therefore when encountering .. when following an unconnected path return -ENOENT. - Add a function path_connected to verify path->dentry is reachable from path->mnt.mnt_root. AKA to validate that rename did not do something nasty to the bind mount.
To avoid races path_connected must be called after following a path component to its next path component. Signed-off-by: "Eric W. Biederman" Signed-off-by: Al Viro --- fs/namei.c | 27 +++++++++++++++++++++++++-- 1 file changed, 25 insertions(+), 2 deletions(-) diff --git a/fs/namei.c b/fs/namei.c index 1c2105ed20c5..29b927938b8c 100644 --- a/fs/namei.c +++ b/fs/namei.c @@ -560,6 +560,24 @@ static int __nd_alloc_stack(struct nameidata *nd) return 0; } +/** + * path_connected - Verify that a path->dentry is below path->mnt.mnt_root + * @path: nameidate to verify + * + * Rename can sometimes move a file or directory outside of a bind + * mount, path_connected allows those cases to be detected. + */ +static bool path_connected(const struct path *path) +{ + struct vfsmount *mnt = path->mnt; + + /* Only bind mounts can have disconnected paths */ + if (mnt->mnt_root == mnt->mnt_sb->s_root) + return true; + + return is_subdir(path->dentry, mnt->mnt_root); +} + static inline int nd_alloc_stack(struct nameidata *nd) { if (likely(nd->depth != EMBEDDED_LEVELS)) @@ -1296,6 +1314,8 @@ static int follow_dotdot_rcu(struct nameidata *nd) return -ECHILD; nd->path.dentry = parent; nd->seq = seq; + if (unlikely(!path_connected(&nd->path))) + return -ENOENT; break; } else { struct mount *mnt = real_mount(nd->path.mnt); @@ -1396,7 +1416,7 @@ static void follow_mount(struct path *path) } } -static void follow_dotdot(struct nameidata *nd) +static int follow_dotdot(struct nameidata *nd) { if (!nd->root.mnt) set_root(nd); @@ -1412,6 +1432,8 @@ static void follow_dotdot(struct nameidata *nd) /* rare case of legitimate dget_parent()...
*/ nd->path.dentry = dget_parent(nd->path.dentry); dput(old); + if (unlikely(!path_connected(&nd->path))) + return -ENOENT; break; } if (!follow_up(&nd->path)) @@ -1419,6 +1441,7 @@ static void follow_dotdot(struct nameidata *nd) } follow_mount(&nd->path); nd->inode = nd->path.dentry->d_inode; + return 0; } /* @@ -1634,7 +1657,7 @@ static inline int handle_dots(struct nameidata *nd, int type) if (nd->flags & LOOKUP_RCU) { return follow_dotdot_rcu(nd); } else - follow_dotdot(nd); + return follow_dotdot(nd); } return 0; }