From dca3c33652e437ed02c30ed3eca3cecd0cc00838 Mon Sep 17 00:00:00 2001 From: Roman Zippel Date: Tue, 29 Apr 2008 17:02:20 +0200 Subject: [PATCH 1/3] [PATCH] fix reservation discarding in affs - remove affs_put_inode, so preallocations aren't discared unnecessarily often. - remove affs_drop_inode, it's called with a spinlock held, so it can't use a mutex. - make i_opencnt atomic - avoid direct b_count manipulations - a few allocation failure fixes, so that these are more gracefully handled now. Signed-off-by: Roman Zippel Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- fs/affs/affs.h | 4 +--- fs/affs/file.c | 25 +++++++++++++++++-------- fs/affs/inode.c | 34 ++++++++-------------------------- fs/affs/namei.c | 6 ++++-- fs/affs/super.c | 18 +++++++++++------- 5 files changed, 41 insertions(+), 46 deletions(-) diff --git a/fs/affs/affs.h b/fs/affs/affs.h index d5bd497ab9cb..223b1917093e 100644 --- a/fs/affs/affs.h +++ b/fs/affs/affs.h @@ -48,7 +48,7 @@ struct affs_ext_key { * affs fs inode data in memory */ struct affs_inode_info { - u32 i_opencnt; + atomic_t i_opencnt; struct semaphore i_link_lock; /* Protects internal inode access. */ struct semaphore i_ext_lock; /* Protects internal inode access. */ #define i_hash_lock i_ext_lock @@ -170,8 +170,6 @@ extern int affs_rename(struct inode *old_dir, struct dentry *old_dentry, extern unsigned long affs_parent_ino(struct inode *dir); extern struct inode *affs_new_inode(struct inode *dir); extern int affs_notify_change(struct dentry *dentry, struct iattr *attr); -extern void affs_put_inode(struct inode *inode); -extern void affs_drop_inode(struct inode *inode); extern void affs_delete_inode(struct inode *inode); extern void affs_clear_inode(struct inode *inode); extern struct inode *affs_iget(struct super_block *sb, diff --git a/fs/affs/file.c b/fs/affs/file.c index 1a4f092f24ef..6eac7bdeec94 100644 --- a/fs/affs/file.c +++ b/fs/affs/file.c @@ -48,8 +48,9 @@ affs_file_open(struct inode *inode, struct file *filp) { if (atomic_read(&filp->f_count) != 1) return 0; - pr_debug("AFFS: open(%d)\n", AFFS_I(inode)->i_opencnt); - AFFS_I(inode)->i_opencnt++; + pr_debug("AFFS: open(%lu,%d)\n", + inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt)); + atomic_inc(&AFFS_I(inode)->i_opencnt); return 0; } @@ -58,10 +59,16 @@ affs_file_release(struct inode *inode, struct file *filp) { if (atomic_read(&filp->f_count) != 0) return 0; - pr_debug("AFFS: release(%d)\n", AFFS_I(inode)->i_opencnt); - AFFS_I(inode)->i_opencnt--; - if (!AFFS_I(inode)->i_opencnt) + pr_debug("AFFS: release(%lu, %d)\n", + inode->i_ino, atomic_read(&AFFS_I(inode)->i_opencnt)); + + if (atomic_dec_and_test(&AFFS_I(inode)->i_opencnt)) { + mutex_lock(&inode->i_mutex); + if (inode->i_size != AFFS_I(inode)->mmu_private) + affs_truncate(inode); affs_free_prealloc(inode); + mutex_unlock(&inode->i_mutex); + } return 0; } @@ -180,7 +187,7 @@ affs_get_extblock(struct inode *inode, u32 ext) /* inline the simplest case: same extended block as last time */ struct buffer_head *bh = AFFS_I(inode)->i_ext_bh; if (ext == AFFS_I(inode)->i_ext_last) - atomic_inc(&bh->b_count); + get_bh(bh); else /* we have to do more (not inlined) */ bh = affs_get_extblock_slow(inode, ext); @@ -306,7 +313,7 @@ store_ext: affs_brelse(AFFS_I(inode)->i_ext_bh); AFFS_I(inode)->i_ext_last = ext; AFFS_I(inode)->i_ext_bh = bh; - atomic_inc(&bh->b_count); + get_bh(bh); return bh; @@ -324,7 +331,6 @@ affs_get_block(struct inode *inode, sector_t block, struct buffer_head *bh_resul pr_debug("AFFS: get_block(%u, %lu)\n", (u32)inode->i_ino, (unsigned long)block); - BUG_ON(block > (sector_t)0x7fffffffUL); if (block >= AFFS_I(inode)->i_blkcnt) { @@ -827,6 +833,8 @@ affs_truncate(struct inode *inode) res = mapping->a_ops->write_begin(NULL, mapping, size, 0, 0, &page, &fsdata); if (!res) res = mapping->a_ops->write_end(NULL, mapping, size, 0, 0, page, fsdata); + else + inode->i_size = AFFS_I(inode)->mmu_private; mark_inode_dirty(inode); return; } else if (inode->i_size == AFFS_I(inode)->mmu_private) @@ -862,6 +870,7 @@ affs_truncate(struct inode *inode) blk++; } else AFFS_HEAD(ext_bh)->first_data = 0; + AFFS_HEAD(ext_bh)->block_count = cpu_to_be32(i); size = AFFS_SB(sb)->s_hashsize; if (size > blkcnt - blk + i) size = blkcnt - blk + i; diff --git a/fs/affs/inode.c b/fs/affs/inode.c index 27fe6cbe43ae..a13b334a3910 100644 --- a/fs/affs/inode.c +++ b/fs/affs/inode.c @@ -58,7 +58,7 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) AFFS_I(inode)->i_extcnt = 1; AFFS_I(inode)->i_ext_last = ~1; AFFS_I(inode)->i_protect = prot; - AFFS_I(inode)->i_opencnt = 0; + atomic_set(&AFFS_I(inode)->i_opencnt, 0); AFFS_I(inode)->i_blkcnt = 0; AFFS_I(inode)->i_lc = NULL; AFFS_I(inode)->i_lc_size = 0; @@ -108,8 +108,6 @@ struct inode *affs_iget(struct super_block *sb, unsigned long ino) inode->i_mode |= S_IFDIR; } else inode->i_mode = S_IRUGO | S_IXUGO | S_IWUSR | S_IFDIR; - if (tail->link_chain) - inode->i_nlink = 2; /* Maybe it should be controlled by mount parameter? */ //inode->i_mode |= S_ISVTX; inode->i_op = &affs_dir_inode_operations; @@ -244,32 +242,13 @@ out: return error; } -void -affs_put_inode(struct inode *inode) -{ - pr_debug("AFFS: put_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink); - affs_free_prealloc(inode); -} - -void -affs_drop_inode(struct inode *inode) -{ - mutex_lock(&inode->i_mutex); - if (inode->i_size != AFFS_I(inode)->mmu_private) - affs_truncate(inode); - mutex_unlock(&inode->i_mutex); - - generic_drop_inode(inode); -} - void affs_delete_inode(struct inode *inode) { pr_debug("AFFS: delete_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink); truncate_inode_pages(&inode->i_data, 0); inode->i_size = 0; - if (S_ISREG(inode->i_mode)) - affs_truncate(inode); + affs_truncate(inode); clear_inode(inode); affs_free_block(inode->i_sb, inode->i_ino); } @@ -277,9 +256,12 @@ affs_delete_inode(struct inode *inode) void affs_clear_inode(struct inode *inode) { - unsigned long cache_page = (unsigned long) AFFS_I(inode)->i_lc; + unsigned long cache_page; pr_debug("AFFS: clear_inode(ino=%lu, nlink=%u)\n", inode->i_ino, inode->i_nlink); + + affs_free_prealloc(inode); + cache_page = (unsigned long)AFFS_I(inode)->i_lc; if (cache_page) { pr_debug("AFFS: freeing ext cache\n"); AFFS_I(inode)->i_lc = NULL; @@ -316,7 +298,7 @@ affs_new_inode(struct inode *dir) inode->i_ino = block; inode->i_nlink = 1; inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME_SEC; - AFFS_I(inode)->i_opencnt = 0; + atomic_set(&AFFS_I(inode)->i_opencnt, 0); AFFS_I(inode)->i_blkcnt = 0; AFFS_I(inode)->i_lc = NULL; AFFS_I(inode)->i_lc_size = 0; @@ -369,12 +351,12 @@ affs_add_entry(struct inode *dir, struct inode *inode, struct dentry *dentry, s3 switch (type) { case ST_LINKFILE: case ST_LINKDIR: - inode_bh = bh; retval = -ENOSPC; block = affs_alloc_block(dir, dir->i_ino); if (!block) goto err; retval = -EIO; + inode_bh = bh; bh = affs_getzeroblk(sb, block); if (!bh) goto err; diff --git a/fs/affs/namei.c b/fs/affs/namei.c index 2218f1ee71ce..cfcf1b6cf82b 100644 --- a/fs/affs/namei.c +++ b/fs/affs/namei.c @@ -234,7 +234,8 @@ affs_lookup(struct inode *dir, struct dentry *dentry, struct nameidata *nd) int affs_unlink(struct inode *dir, struct dentry *dentry) { - pr_debug("AFFS: unlink(dir=%d, \"%.*s\")\n", (u32)dir->i_ino, + pr_debug("AFFS: unlink(dir=%d, %lu \"%.*s\")\n", (u32)dir->i_ino, + dentry->d_inode->i_ino, (int)dentry->d_name.len, dentry->d_name.name); return affs_remove_header(dentry); @@ -302,7 +303,8 @@ affs_mkdir(struct inode *dir, struct dentry *dentry, int mode) int affs_rmdir(struct inode *dir, struct dentry *dentry) { - pr_debug("AFFS: rmdir(dir=%u, \"%.*s\")\n", (u32)dir->i_ino, + pr_debug("AFFS: rmdir(dir=%u, %lu \"%.*s\")\n", (u32)dir->i_ino, + dentry->d_inode->i_ino, (int)dentry->d_name.len, dentry->d_name.name); return affs_remove_header(dentry); diff --git a/fs/affs/super.c b/fs/affs/super.c index 01d25d532541..d214837d5e42 100644 --- a/fs/affs/super.c +++ b/fs/affs/super.c @@ -71,12 +71,18 @@ static struct kmem_cache * affs_inode_cachep; static struct inode *affs_alloc_inode(struct super_block *sb) { - struct affs_inode_info *ei; - ei = (struct affs_inode_info *)kmem_cache_alloc(affs_inode_cachep, GFP_KERNEL); - if (!ei) + struct affs_inode_info *i; + + i = kmem_cache_alloc(affs_inode_cachep, GFP_KERNEL); + if (!i) return NULL; - ei->vfs_inode.i_version = 1; - return &ei->vfs_inode; + + i->vfs_inode.i_version = 1; + i->i_lc = NULL; + i->i_ext_bh = NULL; + i->i_pa_cnt = 0; + + return &i->vfs_inode; } static void affs_destroy_inode(struct inode *inode) @@ -114,8 +120,6 @@ static const struct super_operations affs_sops = { .alloc_inode = affs_alloc_inode, .destroy_inode = affs_destroy_inode, .write_inode = affs_write_inode, - .put_inode = affs_put_inode, - .drop_inode = affs_drop_inode, .delete_inode = affs_delete_inode, .clear_inode = affs_clear_inode, .put_super = affs_put_super, From 33dcdac2df54e66c447ae03f58c95c7251aa5649 Mon Sep 17 00:00:00 2001 From: Christoph Hellwig Date: Tue, 29 Apr 2008 17:46:26 +0200 Subject: [PATCH 2/3] [PATCH] kill ->put_inode And with that last patch to affs killing the last put_inode instance we can finally, after many years of transition kill this racy and awkward interface. (It's kinda funny that even the description in Documentation/filesystems/vfs.txt was entirely wrong..) Also remove a very misleading comment above the defintion of struct super_operations. Signed-off-by: Christoph Hellwig Signed-off-by: Al Viro --- Documentation/filesystems/Locking | 2 -- Documentation/filesystems/vfs.txt | 4 ---- fs/inode.c | 3 --- include/linux/fs.h | 5 ----- 4 files changed, 14 deletions(-) diff --git a/Documentation/filesystems/Locking b/Documentation/filesystems/Locking index c2992bc54f2f..8b22d7d8b991 100644 --- a/Documentation/filesystems/Locking +++ b/Documentation/filesystems/Locking @@ -92,7 +92,6 @@ prototypes: void (*destroy_inode)(struct inode *); void (*dirty_inode) (struct inode *); int (*write_inode) (struct inode *, int); - void (*put_inode) (struct inode *); void (*drop_inode) (struct inode *); void (*delete_inode) (struct inode *); void (*put_super) (struct super_block *); @@ -115,7 +114,6 @@ alloc_inode: no no no destroy_inode: no dirty_inode: no (must not sleep) write_inode: no -put_inode: no drop_inode: no !!!inode_lock!!! delete_inode: no put_super: yes yes no diff --git a/Documentation/filesystems/vfs.txt b/Documentation/filesystems/vfs.txt index 81e5be6e6e35..b7522c6cbae3 100644 --- a/Documentation/filesystems/vfs.txt +++ b/Documentation/filesystems/vfs.txt @@ -205,7 +205,6 @@ struct super_operations { void (*dirty_inode) (struct inode *); int (*write_inode) (struct inode *, int); - void (*put_inode) (struct inode *); void (*drop_inode) (struct inode *); void (*delete_inode) (struct inode *); void (*put_super) (struct super_block *); @@ -246,9 +245,6 @@ or bottom half). inode to disc. The second parameter indicates whether the write should be synchronous or not, not all filesystems check this flag. - put_inode: called when the VFS inode is removed from the inode - cache. - drop_inode: called when the last access to the inode is dropped, with the inode_lock spinlock held. diff --git a/fs/inode.c b/fs/inode.c index bf6478130424..18bdce14b70c 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -1153,9 +1153,6 @@ void iput(struct inode *inode) BUG_ON(inode->i_state == I_CLEAR); - if (op && op->put_inode) - op->put_inode(inode); - if (atomic_dec_and_lock(&inode->i_count, &inode_lock)) iput_final(inode); } diff --git a/include/linux/fs.h b/include/linux/fs.h index a1ba005d08e7..7e0fa9e64479 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -1289,17 +1289,12 @@ extern ssize_t vfs_readv(struct file *, const struct iovec __user *, extern ssize_t vfs_writev(struct file *, const struct iovec __user *, unsigned long, loff_t *); -/* - * NOTE: write_inode, delete_inode, clear_inode, put_inode can be called - * without the big kernel lock held in all filesystems. - */ struct super_operations { struct inode *(*alloc_inode)(struct super_block *sb); void (*destroy_inode)(struct inode *); void (*dirty_inode) (struct inode *); int (*write_inode) (struct inode *, int); - void (*put_inode) (struct inode *); void (*drop_inode) (struct inode *); void (*delete_inode) (struct inode *); void (*put_super) (struct super_block *); From 0b2bac2f1ea0d33a3621b27ca68b9ae760fca2e9 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 6 May 2008 13:58:34 -0400 Subject: [PATCH 3/3] [PATCH] fix SMP ordering hole in fcntl_setlk() fcntl_setlk()/close() race prevention has a subtle hole - we need to make sure that if we *do* have an fcntl/close race on SMP box, the access to descriptor table and inode->i_flock won't get reordered. As it is, we get STORE inode->i_flock, LOAD descriptor table entry vs. STORE descriptor table entry, LOAD inode->i_flock with not a single lock in common on both sides. We do have BKL around the first STORE, but check in locks_remove_posix() is outside of BKL and for a good reason - we don't want BKL on common path of close(2). Solution is to hold ->file_lock around fcheck() in there; that orders us wrt removal from descriptor table that preceded locks_remove_posix() on close path and we either come first (in which case eviction will be handled by the close side) or we'll see the effect of close and do eviction ourselves. Note that even though it's read-only access, we do need ->file_lock here - rcu_read_lock() won't be enough to order the things. Signed-off-by: Al Viro --- fs/locks.c | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/fs/locks.c b/fs/locks.c index 663c069b59b3..0ac6b92cb0b6 100644 --- a/fs/locks.c +++ b/fs/locks.c @@ -1753,6 +1753,7 @@ int fcntl_setlk(unsigned int fd, struct file *filp, unsigned int cmd, struct file_lock *file_lock = locks_alloc_lock(); struct flock flock; struct inode *inode; + struct file *f; int error; if (file_lock == NULL) @@ -1825,7 +1826,15 @@ again: * Attempt to detect a close/fcntl race and recover by * releasing the lock that was just acquired. */ - if (!error && fcheck(fd) != filp && flock.l_type != F_UNLCK) { + /* + * we need that spin_lock here - it prevents reordering between + * update of inode->i_flock and check for it done in close(). + * rcu_read_lock() wouldn't do. + */ + spin_lock(¤t->files->file_lock); + f = fcheck(fd); + spin_unlock(¤t->files->file_lock); + if (!error && f != filp && flock.l_type != F_UNLCK) { flock.l_type = F_UNLCK; goto again; } @@ -1881,6 +1890,7 @@ int fcntl_setlk64(unsigned int fd, struct file *filp, unsigned int cmd, struct file_lock *file_lock = locks_alloc_lock(); struct flock64 flock; struct inode *inode; + struct file *f; int error; if (file_lock == NULL) @@ -1953,7 +1963,10 @@ again: * Attempt to detect a close/fcntl race and recover by * releasing the lock that was just acquired. */ - if (!error && fcheck(fd) != filp && flock.l_type != F_UNLCK) { + spin_lock(¤t->files->file_lock); + f = fcheck(fd); + spin_unlock(¤t->files->file_lock); + if (!error && f != filp && flock.l_type != F_UNLCK) { flock.l_type = F_UNLCK; goto again; }