OpenCloudOS-Kernel/fs/nilfs2/inode.c

1118 lines
30 KiB
C
Raw Normal View History

/*
* inode.c - NILFS inode operations.
*
* Copyright (C) 2005-2008 Nippon Telegraph and Telephone Corporation.
*
* This program is free software; you can redistribute it and/or modify
* it under the terms of the GNU General Public License as published by
* the Free Software Foundation; either version 2 of the License, or
* (at your option) any later version.
*
* This program is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
* GNU General Public License for more details.
*
* You should have received a copy of the GNU General Public License
* along with this program; if not, write to the Free Software
* Foundation, Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
*
* Written by Ryusuke Konishi <ryusuke@osrg.net>
*
*/
#include <linux/buffer_head.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 16:04:11 +08:00
#include <linux/gfp.h>
#include <linux/mpage.h>
nilfs2: fix data loss with mmap() This bug leads to reproducible silent data loss, despite the use of msync(), sync() and a clean unmount of the file system. It is easily reproducible with the following script: ----------------[BEGIN SCRIPT]-------------------- mkfs.nilfs2 -f /dev/sdb mount /dev/sdb /mnt dd if=/dev/zero bs=1M count=30 of=/mnt/testfile umount /mnt mount /dev/sdb /mnt CHECKSUM_BEFORE="$(md5sum /mnt/testfile)" /root/mmaptest/mmaptest /mnt/testfile 30 10 5 sync CHECKSUM_AFTER="$(md5sum /mnt/testfile)" umount /mnt mount /dev/sdb /mnt CHECKSUM_AFTER_REMOUNT="$(md5sum /mnt/testfile)" umount /mnt echo "BEFORE MMAP:\t$CHECKSUM_BEFORE" echo "AFTER MMAP:\t$CHECKSUM_AFTER" echo "AFTER REMOUNT:\t$CHECKSUM_AFTER_REMOUNT" ----------------[END SCRIPT]-------------------- The mmaptest tool looks something like this (very simplified, with error checking removed): ----------------[BEGIN mmaptest]-------------------- data = mmap(NULL, file_size - file_offset, PROT_READ | PROT_WRITE, MAP_SHARED, fd, file_offset); for (i = 0; i < write_count; ++i) { memcpy(data + i * 4096, buf, sizeof(buf)); msync(data, file_size - file_offset, MS_SYNC)) } ----------------[END mmaptest]-------------------- The output of the script looks something like this: BEFORE MMAP: 281ed1d5ae50e8419f9b978aab16de83 /mnt/testfile AFTER MMAP: 6604a1c31f10780331a6850371b3a313 /mnt/testfile AFTER REMOUNT: 281ed1d5ae50e8419f9b978aab16de83 /mnt/testfile So it is clear, that the changes done using mmap() do not survive a remount. This can be reproduced a 100% of the time. The problem was introduced in commit 136e8770cd5d ("nilfs2: fix issue of nilfs_set_page_dirty() for page at EOF boundary"). If the page was read with mpage_readpage() or mpage_readpages() for example, then it has no buffers attached to it. In that case page_has_buffers(page) in nilfs_set_page_dirty() will be false. Therefore nilfs_set_file_dirty() is never called and the pages are never collected and never written to disk. This patch fixes the problem by also calling nilfs_set_file_dirty() if the page has no buffers attached to it. [akpm@linux-foundation.org: s/PAGE_SHIFT/PAGE_CACHE_SHIFT/] Signed-off-by: Andreas Rohner <andreas.rohner@gmx.net> Tested-by: Andreas Rohner <andreas.rohner@gmx.net> Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-09-26 07:05:14 +08:00
#include <linux/pagemap.h>
#include <linux/writeback.h>
#include <linux/uio.h>
#include "nilfs.h"
#include "btnode.h"
#include "segment.h"
#include "page.h"
#include "mdt.h"
#include "cpfile.h"
#include "ifile.h"
/**
* struct nilfs_iget_args - arguments used during comparison between inodes
* @ino: inode number
* @cno: checkpoint number
* @root: pointer on NILFS root object (mounted checkpoint)
* @for_gc: inode for GC flag
*/
struct nilfs_iget_args {
u64 ino;
__u64 cno;
struct nilfs_root *root;
int for_gc;
};
static int nilfs_iget_test(struct inode *inode, void *opaque);
void nilfs_inode_add_blocks(struct inode *inode, int n)
{
struct nilfs_root *root = NILFS_I(inode)->i_root;
inode_add_bytes(inode, (1 << inode->i_blkbits) * n);
if (root)
atomic64_add(n, &root->blocks_count);
}
void nilfs_inode_sub_blocks(struct inode *inode, int n)
{
struct nilfs_root *root = NILFS_I(inode)->i_root;
inode_sub_bytes(inode, (1 << inode->i_blkbits) * n);
if (root)
atomic64_sub(n, &root->blocks_count);
}
/**
* nilfs_get_block() - get a file block on the filesystem (callback function)
* @inode - inode struct of the target file
* @blkoff - file block number
* @bh_result - buffer head to be mapped on
* @create - indicate whether allocating the block or not when it has not
* been allocated yet.
*
* This function does not issue actual read request of the specified data
* block. It is done by VFS.
*/
int nilfs_get_block(struct inode *inode, sector_t blkoff,
struct buffer_head *bh_result, int create)
{
struct nilfs_inode_info *ii = NILFS_I(inode);
struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
__u64 blknum = 0;
int err = 0, ret;
unsigned maxblocks = bh_result->b_size >> inode->i_blkbits;
down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
ret = nilfs_bmap_lookup_contig(ii->i_bmap, blkoff, &blknum, maxblocks);
up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
if (ret >= 0) { /* found */
map_bh(bh_result, inode->i_sb, blknum);
if (ret > 0)
bh_result->b_size = (ret << inode->i_blkbits);
goto out;
}
/* data block was not found */
if (ret == -ENOENT && create) {
struct nilfs_transaction_info ti;
bh_result->b_blocknr = 0;
err = nilfs_transaction_begin(inode->i_sb, &ti, 1);
if (unlikely(err))
goto out;
err = nilfs_bmap_insert(ii->i_bmap, blkoff,
(unsigned long)bh_result);
if (unlikely(err != 0)) {
if (err == -EEXIST) {
/*
* The get_block() function could be called
* from multiple callers for an inode.
* However, the page having this block must
* be locked in this case.
*/
printk(KERN_WARNING
"nilfs_get_block: a race condition "
"while inserting a data block. "
"(inode number=%lu, file block "
"offset=%llu)\n",
inode->i_ino,
(unsigned long long)blkoff);
err = 0;
}
nilfs_transaction_abort(inode->i_sb);
goto out;
}
nilfs_mark_inode_dirty_sync(inode);
nilfs_transaction_commit(inode->i_sb); /* never fails */
/* Error handling should be detailed */
set_buffer_new(bh_result);
set_buffer_delay(bh_result);
map_bh(bh_result, inode->i_sb, 0); /* dbn must be changed
to proper value */
} else if (ret == -ENOENT) {
/* not found is not error (e.g. hole); must return without
the mapped state flag. */
;
} else {
err = ret;
}
out:
return err;
}
/**
* nilfs_readpage() - implement readpage() method of nilfs_aops {}
* address_space_operations.
* @file - file struct of the file to be read
* @page - the page to be read
*/
static int nilfs_readpage(struct file *file, struct page *page)
{
return mpage_readpage(page, nilfs_get_block);
}
/**
* nilfs_readpages() - implement readpages() method of nilfs_aops {}
* address_space_operations.
* @file - file struct of the file to be read
* @mapping - address_space struct used for reading multiple pages
* @pages - the pages to be read
* @nr_pages - number of pages to be read
*/
static int nilfs_readpages(struct file *file, struct address_space *mapping,
struct list_head *pages, unsigned nr_pages)
{
return mpage_readpages(mapping, pages, nr_pages, nilfs_get_block);
}
static int nilfs_writepages(struct address_space *mapping,
struct writeback_control *wbc)
{
struct inode *inode = mapping->host;
int err = 0;
nilfs2: fix issue with flush kernel thread after remount in RO mode because of driver's internal error or metadata corruption The NILFS2 driver remounts itself in RO mode in the case of discovering metadata corruption (for example, discovering a broken bmap). But usually, this takes place when there have been file system operations before remounting in RO mode. Thereby, NILFS2 driver can be in RO mode with presence of dirty pages in modified inodes' address spaces. It results in flush kernel thread's infinite trying to flush dirty pages in RO mode. As a result, it is possible to see such side effects as: (1) flush kernel thread occupies 50% - 99% of CPU time; (2) system can't be shutdowned without manual power switch off. SYMPTOMS: (1) System log contains error message: "Remounting filesystem read-only". (2) The flush kernel thread occupies 50% - 99% of CPU time. (3) The system can't be shutdowned without manual power switch off. REPRODUCTION PATH: (1) Create volume group with name "unencrypted" by means of vgcreate utility. (2) Run script (prepared by Anthony Doggett <Anthony2486@interfaces.org.uk>): ----------------[BEGIN SCRIPT]-------------------- #!/bin/bash VG=unencrypted #apt-get install nilfs-tools darcs lvcreate --size 2G --name ntest $VG mkfs.nilfs2 -b 1024 -B 8192 /dev/mapper/$VG-ntest mkdir /var/tmp/n mkdir /var/tmp/n/ntest mount /dev/mapper/$VG-ntest /var/tmp/n/ntest mkdir /var/tmp/n/ntest/thedir cd /var/tmp/n/ntest/thedir sleep 2 date darcs init sleep 2 dmesg|tail -n 5 date darcs whatsnew || true date sleep 2 dmesg|tail -n 5 ----------------[END SCRIPT]-------------------- (3) Try to shutdown the system. REPRODUCIBILITY: 100% FIX: This patch implements checking mount state of NILFS2 driver in nilfs_writepage(), nilfs_writepages() and nilfs_mdt_write_page() methods. If it is detected the RO mount state then all dirty pages are simply discarded with warning messages is written in system log. [akpm@linux-foundation.org: fix printk warning] Signed-off-by: Vyacheslav Dubeyko <slava@dubeyko.com> Acked-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> Cc: Anthony Doggett <Anthony2486@interfaces.org.uk> Cc: ARAI Shun-ichi <hermes@ceres.dti.ne.jp> Cc: Piotr Szymaniak <szarpaj@grubelek.pl> Cc: Zahid Chowdhury <zahid.chowdhury@starsolutions.com> Cc: Elmer Zhang <freeboy6716@gmail.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-05-01 06:27:48 +08:00
if (inode->i_sb->s_flags & MS_RDONLY) {
nilfs_clear_dirty_pages(mapping, false);
return -EROFS;
}
if (wbc->sync_mode == WB_SYNC_ALL)
err = nilfs_construct_dsync_segment(inode->i_sb, inode,
wbc->range_start,
wbc->range_end);
return err;
}
static int nilfs_writepage(struct page *page, struct writeback_control *wbc)
{
struct inode *inode = page->mapping->host;
int err;
if (inode->i_sb->s_flags & MS_RDONLY) {
nilfs2: fix issue with flush kernel thread after remount in RO mode because of driver's internal error or metadata corruption The NILFS2 driver remounts itself in RO mode in the case of discovering metadata corruption (for example, discovering a broken bmap). But usually, this takes place when there have been file system operations before remounting in RO mode. Thereby, NILFS2 driver can be in RO mode with presence of dirty pages in modified inodes' address spaces. It results in flush kernel thread's infinite trying to flush dirty pages in RO mode. As a result, it is possible to see such side effects as: (1) flush kernel thread occupies 50% - 99% of CPU time; (2) system can't be shutdowned without manual power switch off. SYMPTOMS: (1) System log contains error message: "Remounting filesystem read-only". (2) The flush kernel thread occupies 50% - 99% of CPU time. (3) The system can't be shutdowned without manual power switch off. REPRODUCTION PATH: (1) Create volume group with name "unencrypted" by means of vgcreate utility. (2) Run script (prepared by Anthony Doggett <Anthony2486@interfaces.org.uk>): ----------------[BEGIN SCRIPT]-------------------- #!/bin/bash VG=unencrypted #apt-get install nilfs-tools darcs lvcreate --size 2G --name ntest $VG mkfs.nilfs2 -b 1024 -B 8192 /dev/mapper/$VG-ntest mkdir /var/tmp/n mkdir /var/tmp/n/ntest mount /dev/mapper/$VG-ntest /var/tmp/n/ntest mkdir /var/tmp/n/ntest/thedir cd /var/tmp/n/ntest/thedir sleep 2 date darcs init sleep 2 dmesg|tail -n 5 date darcs whatsnew || true date sleep 2 dmesg|tail -n 5 ----------------[END SCRIPT]-------------------- (3) Try to shutdown the system. REPRODUCIBILITY: 100% FIX: This patch implements checking mount state of NILFS2 driver in nilfs_writepage(), nilfs_writepages() and nilfs_mdt_write_page() methods. If it is detected the RO mount state then all dirty pages are simply discarded with warning messages is written in system log. [akpm@linux-foundation.org: fix printk warning] Signed-off-by: Vyacheslav Dubeyko <slava@dubeyko.com> Acked-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> Cc: Anthony Doggett <Anthony2486@interfaces.org.uk> Cc: ARAI Shun-ichi <hermes@ceres.dti.ne.jp> Cc: Piotr Szymaniak <szarpaj@grubelek.pl> Cc: Zahid Chowdhury <zahid.chowdhury@starsolutions.com> Cc: Elmer Zhang <freeboy6716@gmail.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-05-01 06:27:48 +08:00
/*
* It means that filesystem was remounted in read-only
* mode because of error or metadata corruption. But we
* have dirty pages that try to be flushed in background.
* So, here we simply discard this dirty page.
*/
nilfs_clear_dirty_page(page, false);
unlock_page(page);
return -EROFS;
}
redirty_page_for_writepage(wbc, page);
unlock_page(page);
if (wbc->sync_mode == WB_SYNC_ALL) {
err = nilfs_construct_segment(inode->i_sb);
if (unlikely(err))
return err;
} else if (wbc->for_reclaim)
nilfs_flush_segment(inode->i_sb, inode->i_ino);
return 0;
}
static int nilfs_set_page_dirty(struct page *page)
{
nilfs2: fix data loss with mmap() This bug leads to reproducible silent data loss, despite the use of msync(), sync() and a clean unmount of the file system. It is easily reproducible with the following script: ----------------[BEGIN SCRIPT]-------------------- mkfs.nilfs2 -f /dev/sdb mount /dev/sdb /mnt dd if=/dev/zero bs=1M count=30 of=/mnt/testfile umount /mnt mount /dev/sdb /mnt CHECKSUM_BEFORE="$(md5sum /mnt/testfile)" /root/mmaptest/mmaptest /mnt/testfile 30 10 5 sync CHECKSUM_AFTER="$(md5sum /mnt/testfile)" umount /mnt mount /dev/sdb /mnt CHECKSUM_AFTER_REMOUNT="$(md5sum /mnt/testfile)" umount /mnt echo "BEFORE MMAP:\t$CHECKSUM_BEFORE" echo "AFTER MMAP:\t$CHECKSUM_AFTER" echo "AFTER REMOUNT:\t$CHECKSUM_AFTER_REMOUNT" ----------------[END SCRIPT]-------------------- The mmaptest tool looks something like this (very simplified, with error checking removed): ----------------[BEGIN mmaptest]-------------------- data = mmap(NULL, file_size - file_offset, PROT_READ | PROT_WRITE, MAP_SHARED, fd, file_offset); for (i = 0; i < write_count; ++i) { memcpy(data + i * 4096, buf, sizeof(buf)); msync(data, file_size - file_offset, MS_SYNC)) } ----------------[END mmaptest]-------------------- The output of the script looks something like this: BEFORE MMAP: 281ed1d5ae50e8419f9b978aab16de83 /mnt/testfile AFTER MMAP: 6604a1c31f10780331a6850371b3a313 /mnt/testfile AFTER REMOUNT: 281ed1d5ae50e8419f9b978aab16de83 /mnt/testfile So it is clear, that the changes done using mmap() do not survive a remount. This can be reproduced a 100% of the time. The problem was introduced in commit 136e8770cd5d ("nilfs2: fix issue of nilfs_set_page_dirty() for page at EOF boundary"). If the page was read with mpage_readpage() or mpage_readpages() for example, then it has no buffers attached to it. In that case page_has_buffers(page) in nilfs_set_page_dirty() will be false. Therefore nilfs_set_file_dirty() is never called and the pages are never collected and never written to disk. This patch fixes the problem by also calling nilfs_set_file_dirty() if the page has no buffers attached to it. [akpm@linux-foundation.org: s/PAGE_SHIFT/PAGE_CACHE_SHIFT/] Signed-off-by: Andreas Rohner <andreas.rohner@gmx.net> Tested-by: Andreas Rohner <andreas.rohner@gmx.net> Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-09-26 07:05:14 +08:00
struct inode *inode = page->mapping->host;
nilfs2: fix issue of nilfs_set_page_dirty() for page at EOF boundary nilfs2: fix issue of nilfs_set_page_dirty for page at EOF boundary DESCRIPTION: There are use-cases when NILFS2 file system (formatted with block size lesser than 4 KB) can be remounted in RO mode because of encountering of "broken bmap" issue. The issue was reported by Anthony Doggett <Anthony2486@interfaces.org.uk>: "The machine I've been trialling nilfs on is running Debian Testing, Linux version 3.2.0-4-686-pae (debian-kernel@lists.debian.org) (gcc version 4.6.3 (Debian 4.6.3-14) ) #1 SMP Debian 3.2.35-2), but I've also reproduced it (identically) with Debian Unstable amd64 and Debian Experimental (using the 3.8-trunk kernel). The problematic partitions were formatted with "mkfs.nilfs2 -b 1024 -B 8192"." SYMPTOMS: (1) System log contains error messages likewise: [63102.496756] nilfs_direct_assign: invalid pointer: 0 [63102.496786] NILFS error (device dm-17): nilfs_bmap_assign: broken bmap (inode number=28) [63102.496798] [63102.524403] Remounting filesystem read-only (2) The NILFS2 file system is remounted in RO mode. REPRODUSING PATH: (1) Create volume group with name "unencrypted" by means of vgcreate utility. (2) Run script (prepared by Anthony Doggett <Anthony2486@interfaces.org.uk>): ----------------[BEGIN SCRIPT]-------------------- VG=unencrypted lvcreate --size 2G --name ntest $VG mkfs.nilfs2 -b 1024 -B 8192 /dev/mapper/$VG-ntest mkdir /var/tmp/n mkdir /var/tmp/n/ntest mount /dev/mapper/$VG-ntest /var/tmp/n/ntest mkdir /var/tmp/n/ntest/thedir cd /var/tmp/n/ntest/thedir sleep 2 date darcs init sleep 2 dmesg|tail -n 5 date darcs whatsnew || true date sleep 2 dmesg|tail -n 5 ----------------[END SCRIPT]-------------------- REPRODUCIBILITY: 100% INVESTIGATION: As it was discovered, the issue takes place during segment construction after executing such sequence of user-space operations: open("_darcs/index", O_RDWR|O_CREAT|O_NOCTTY, 0666) = 7 fstat(7, {st_mode=S_IFREG|0644, st_size=0, ...}) = 0 ftruncate(7, 60) The error message "NILFS error (device dm-17): nilfs_bmap_assign: broken bmap (inode number=28)" takes place because of trying to get block number for third block of the file with logical offset #3072 bytes. As it is possible to see from above output, the file has 60 bytes of the whole size. So, it is enough one block (1 KB in size) allocation for the whole file. Trying to operate with several blocks instead of one takes place because of discovering several dirty buffers for this file in nilfs_segctor_scan_file() method. The root cause of this issue is in nilfs_set_page_dirty function which is called just before writing to an mmapped page. When nilfs_page_mkwrite function handles a page at EOF boundary, it fills hole blocks only inside EOF through __block_page_mkwrite(). The __block_page_mkwrite() function calls set_page_dirty() after filling hole blocks, thus nilfs_set_page_dirty function (= a_ops->set_page_dirty) is called. However, the current implementation of nilfs_set_page_dirty() wrongly marks all buffers dirty even for page at EOF boundary. As a result, buffers outside EOF are inconsistently marked dirty and queued for write even though they are not mapped with nilfs_get_block function. FIX: This modifies nilfs_set_page_dirty() not to mark hole blocks dirty. Thanks to Vyacheslav Dubeyko for his effort on analysis and proposals for this issue. Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> Reported-by: Anthony Doggett <Anthony2486@interfaces.org.uk> Reported-by: Vyacheslav Dubeyko <slava@dubeyko.com> Cc: Vyacheslav Dubeyko <slava@dubeyko.com> Tested-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-05-25 06:55:29 +08:00
int ret = __set_page_dirty_nobuffers(page);
nilfs2: fix issue of nilfs_set_page_dirty() for page at EOF boundary nilfs2: fix issue of nilfs_set_page_dirty for page at EOF boundary DESCRIPTION: There are use-cases when NILFS2 file system (formatted with block size lesser than 4 KB) can be remounted in RO mode because of encountering of "broken bmap" issue. The issue was reported by Anthony Doggett <Anthony2486@interfaces.org.uk>: "The machine I've been trialling nilfs on is running Debian Testing, Linux version 3.2.0-4-686-pae (debian-kernel@lists.debian.org) (gcc version 4.6.3 (Debian 4.6.3-14) ) #1 SMP Debian 3.2.35-2), but I've also reproduced it (identically) with Debian Unstable amd64 and Debian Experimental (using the 3.8-trunk kernel). The problematic partitions were formatted with "mkfs.nilfs2 -b 1024 -B 8192"." SYMPTOMS: (1) System log contains error messages likewise: [63102.496756] nilfs_direct_assign: invalid pointer: 0 [63102.496786] NILFS error (device dm-17): nilfs_bmap_assign: broken bmap (inode number=28) [63102.496798] [63102.524403] Remounting filesystem read-only (2) The NILFS2 file system is remounted in RO mode. REPRODUSING PATH: (1) Create volume group with name "unencrypted" by means of vgcreate utility. (2) Run script (prepared by Anthony Doggett <Anthony2486@interfaces.org.uk>): ----------------[BEGIN SCRIPT]-------------------- VG=unencrypted lvcreate --size 2G --name ntest $VG mkfs.nilfs2 -b 1024 -B 8192 /dev/mapper/$VG-ntest mkdir /var/tmp/n mkdir /var/tmp/n/ntest mount /dev/mapper/$VG-ntest /var/tmp/n/ntest mkdir /var/tmp/n/ntest/thedir cd /var/tmp/n/ntest/thedir sleep 2 date darcs init sleep 2 dmesg|tail -n 5 date darcs whatsnew || true date sleep 2 dmesg|tail -n 5 ----------------[END SCRIPT]-------------------- REPRODUCIBILITY: 100% INVESTIGATION: As it was discovered, the issue takes place during segment construction after executing such sequence of user-space operations: open("_darcs/index", O_RDWR|O_CREAT|O_NOCTTY, 0666) = 7 fstat(7, {st_mode=S_IFREG|0644, st_size=0, ...}) = 0 ftruncate(7, 60) The error message "NILFS error (device dm-17): nilfs_bmap_assign: broken bmap (inode number=28)" takes place because of trying to get block number for third block of the file with logical offset #3072 bytes. As it is possible to see from above output, the file has 60 bytes of the whole size. So, it is enough one block (1 KB in size) allocation for the whole file. Trying to operate with several blocks instead of one takes place because of discovering several dirty buffers for this file in nilfs_segctor_scan_file() method. The root cause of this issue is in nilfs_set_page_dirty function which is called just before writing to an mmapped page. When nilfs_page_mkwrite function handles a page at EOF boundary, it fills hole blocks only inside EOF through __block_page_mkwrite(). The __block_page_mkwrite() function calls set_page_dirty() after filling hole blocks, thus nilfs_set_page_dirty function (= a_ops->set_page_dirty) is called. However, the current implementation of nilfs_set_page_dirty() wrongly marks all buffers dirty even for page at EOF boundary. As a result, buffers outside EOF are inconsistently marked dirty and queued for write even though they are not mapped with nilfs_get_block function. FIX: This modifies nilfs_set_page_dirty() not to mark hole blocks dirty. Thanks to Vyacheslav Dubeyko for his effort on analysis and proposals for this issue. Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> Reported-by: Anthony Doggett <Anthony2486@interfaces.org.uk> Reported-by: Vyacheslav Dubeyko <slava@dubeyko.com> Cc: Vyacheslav Dubeyko <slava@dubeyko.com> Tested-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-05-25 06:55:29 +08:00
if (page_has_buffers(page)) {
unsigned nr_dirty = 0;
struct buffer_head *bh, *head;
nilfs2: fix issue of nilfs_set_page_dirty() for page at EOF boundary nilfs2: fix issue of nilfs_set_page_dirty for page at EOF boundary DESCRIPTION: There are use-cases when NILFS2 file system (formatted with block size lesser than 4 KB) can be remounted in RO mode because of encountering of "broken bmap" issue. The issue was reported by Anthony Doggett <Anthony2486@interfaces.org.uk>: "The machine I've been trialling nilfs on is running Debian Testing, Linux version 3.2.0-4-686-pae (debian-kernel@lists.debian.org) (gcc version 4.6.3 (Debian 4.6.3-14) ) #1 SMP Debian 3.2.35-2), but I've also reproduced it (identically) with Debian Unstable amd64 and Debian Experimental (using the 3.8-trunk kernel). The problematic partitions were formatted with "mkfs.nilfs2 -b 1024 -B 8192"." SYMPTOMS: (1) System log contains error messages likewise: [63102.496756] nilfs_direct_assign: invalid pointer: 0 [63102.496786] NILFS error (device dm-17): nilfs_bmap_assign: broken bmap (inode number=28) [63102.496798] [63102.524403] Remounting filesystem read-only (2) The NILFS2 file system is remounted in RO mode. REPRODUSING PATH: (1) Create volume group with name "unencrypted" by means of vgcreate utility. (2) Run script (prepared by Anthony Doggett <Anthony2486@interfaces.org.uk>): ----------------[BEGIN SCRIPT]-------------------- VG=unencrypted lvcreate --size 2G --name ntest $VG mkfs.nilfs2 -b 1024 -B 8192 /dev/mapper/$VG-ntest mkdir /var/tmp/n mkdir /var/tmp/n/ntest mount /dev/mapper/$VG-ntest /var/tmp/n/ntest mkdir /var/tmp/n/ntest/thedir cd /var/tmp/n/ntest/thedir sleep 2 date darcs init sleep 2 dmesg|tail -n 5 date darcs whatsnew || true date sleep 2 dmesg|tail -n 5 ----------------[END SCRIPT]-------------------- REPRODUCIBILITY: 100% INVESTIGATION: As it was discovered, the issue takes place during segment construction after executing such sequence of user-space operations: open("_darcs/index", O_RDWR|O_CREAT|O_NOCTTY, 0666) = 7 fstat(7, {st_mode=S_IFREG|0644, st_size=0, ...}) = 0 ftruncate(7, 60) The error message "NILFS error (device dm-17): nilfs_bmap_assign: broken bmap (inode number=28)" takes place because of trying to get block number for third block of the file with logical offset #3072 bytes. As it is possible to see from above output, the file has 60 bytes of the whole size. So, it is enough one block (1 KB in size) allocation for the whole file. Trying to operate with several blocks instead of one takes place because of discovering several dirty buffers for this file in nilfs_segctor_scan_file() method. The root cause of this issue is in nilfs_set_page_dirty function which is called just before writing to an mmapped page. When nilfs_page_mkwrite function handles a page at EOF boundary, it fills hole blocks only inside EOF through __block_page_mkwrite(). The __block_page_mkwrite() function calls set_page_dirty() after filling hole blocks, thus nilfs_set_page_dirty function (= a_ops->set_page_dirty) is called. However, the current implementation of nilfs_set_page_dirty() wrongly marks all buffers dirty even for page at EOF boundary. As a result, buffers outside EOF are inconsistently marked dirty and queued for write even though they are not mapped with nilfs_get_block function. FIX: This modifies nilfs_set_page_dirty() not to mark hole blocks dirty. Thanks to Vyacheslav Dubeyko for his effort on analysis and proposals for this issue. Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> Reported-by: Anthony Doggett <Anthony2486@interfaces.org.uk> Reported-by: Vyacheslav Dubeyko <slava@dubeyko.com> Cc: Vyacheslav Dubeyko <slava@dubeyko.com> Tested-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-05-25 06:55:29 +08:00
/*
* This page is locked by callers, and no other thread
* concurrently marks its buffers dirty since they are
* only dirtied through routines in fs/buffer.c in
* which call sites of mark_buffer_dirty are protected
* by page lock.
*/
bh = head = page_buffers(page);
do {
/* Do not mark hole blocks dirty */
if (buffer_dirty(bh) || !buffer_mapped(bh))
continue;
set_buffer_dirty(bh);
nr_dirty++;
} while (bh = bh->b_this_page, bh != head);
if (nr_dirty)
nilfs_set_file_dirty(inode, nr_dirty);
nilfs2: fix data loss with mmap() This bug leads to reproducible silent data loss, despite the use of msync(), sync() and a clean unmount of the file system. It is easily reproducible with the following script: ----------------[BEGIN SCRIPT]-------------------- mkfs.nilfs2 -f /dev/sdb mount /dev/sdb /mnt dd if=/dev/zero bs=1M count=30 of=/mnt/testfile umount /mnt mount /dev/sdb /mnt CHECKSUM_BEFORE="$(md5sum /mnt/testfile)" /root/mmaptest/mmaptest /mnt/testfile 30 10 5 sync CHECKSUM_AFTER="$(md5sum /mnt/testfile)" umount /mnt mount /dev/sdb /mnt CHECKSUM_AFTER_REMOUNT="$(md5sum /mnt/testfile)" umount /mnt echo "BEFORE MMAP:\t$CHECKSUM_BEFORE" echo "AFTER MMAP:\t$CHECKSUM_AFTER" echo "AFTER REMOUNT:\t$CHECKSUM_AFTER_REMOUNT" ----------------[END SCRIPT]-------------------- The mmaptest tool looks something like this (very simplified, with error checking removed): ----------------[BEGIN mmaptest]-------------------- data = mmap(NULL, file_size - file_offset, PROT_READ | PROT_WRITE, MAP_SHARED, fd, file_offset); for (i = 0; i < write_count; ++i) { memcpy(data + i * 4096, buf, sizeof(buf)); msync(data, file_size - file_offset, MS_SYNC)) } ----------------[END mmaptest]-------------------- The output of the script looks something like this: BEFORE MMAP: 281ed1d5ae50e8419f9b978aab16de83 /mnt/testfile AFTER MMAP: 6604a1c31f10780331a6850371b3a313 /mnt/testfile AFTER REMOUNT: 281ed1d5ae50e8419f9b978aab16de83 /mnt/testfile So it is clear, that the changes done using mmap() do not survive a remount. This can be reproduced a 100% of the time. The problem was introduced in commit 136e8770cd5d ("nilfs2: fix issue of nilfs_set_page_dirty() for page at EOF boundary"). If the page was read with mpage_readpage() or mpage_readpages() for example, then it has no buffers attached to it. In that case page_has_buffers(page) in nilfs_set_page_dirty() will be false. Therefore nilfs_set_file_dirty() is never called and the pages are never collected and never written to disk. This patch fixes the problem by also calling nilfs_set_file_dirty() if the page has no buffers attached to it. [akpm@linux-foundation.org: s/PAGE_SHIFT/PAGE_CACHE_SHIFT/] Signed-off-by: Andreas Rohner <andreas.rohner@gmx.net> Tested-by: Andreas Rohner <andreas.rohner@gmx.net> Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-09-26 07:05:14 +08:00
} else if (ret) {
unsigned nr_dirty = 1 << (PAGE_CACHE_SHIFT - inode->i_blkbits);
nilfs_set_file_dirty(inode, nr_dirty);
}
return ret;
}
void nilfs_write_failed(struct address_space *mapping, loff_t to)
{
struct inode *inode = mapping->host;
if (to > inode->i_size) {
truncate_pagecache(inode, inode->i_size);
nilfs_truncate(inode);
}
}
static int nilfs_write_begin(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned flags,
struct page **pagep, void **fsdata)
{
struct inode *inode = mapping->host;
int err = nilfs_transaction_begin(inode->i_sb, NULL, 1);
if (unlikely(err))
return err;
err = block_write_begin(mapping, pos, len, flags, pagep,
nilfs_get_block);
if (unlikely(err)) {
nilfs_write_failed(mapping, pos + len);
nilfs_transaction_abort(inode->i_sb);
}
return err;
}
static int nilfs_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;
unsigned start = pos & (PAGE_CACHE_SIZE - 1);
unsigned nr_dirty;
int err;
nr_dirty = nilfs_page_count_clean_buffers(page, start,
start + copied);
copied = generic_write_end(file, mapping, pos, len, copied, page,
fsdata);
nilfs_set_file_dirty(inode, nr_dirty);
err = nilfs_transaction_commit(inode->i_sb);
return err ? : copied;
}
static ssize_t
nilfs_direct_IO(struct kiocb *iocb, struct iov_iter *iter, loff_t offset)
{
struct inode *inode = file_inode(iocb->ki_filp);
if (iov_iter_rw(iter) == WRITE)
return 0;
/* Needs synchronization with the cleaner */
return blockdev_direct_IO(iocb, inode, iter, offset, nilfs_get_block);
}
const struct address_space_operations nilfs_aops = {
.writepage = nilfs_writepage,
.readpage = nilfs_readpage,
.writepages = nilfs_writepages,
.set_page_dirty = nilfs_set_page_dirty,
.readpages = nilfs_readpages,
.write_begin = nilfs_write_begin,
.write_end = nilfs_write_end,
/* .releasepage = nilfs_releasepage, */
.invalidatepage = block_invalidatepage,
.direct_IO = nilfs_direct_IO,
NILFS2: Pagecache usage optimization on NILFS2 Hi, I introduced "is_partially_uptodate" aops for NILFS2. A page can have multiple buffers and even if a page is not uptodate, some buffers can be uptodate on pagesize != blocksize environment. This aops checks that all buffers which correspond to a part of a file that we want to read are uptodate. If so, we do not have to issue actual read IO to HDD even if a page is not uptodate because the portion we want to read are uptodate. "block_is_partially_uptodate" function is already used by ext2/3/4. With the following patch random read/write mixed workloads or random read after random write workloads can be optimized and we can get performance improvement. I did a performance test using the sysbench. 1 --file-block-size=8K --file-total-size=2G --file-test-mode=rndrw --file-fsync-freq=0 --fil e-rw-ratio=1 run -2.6.30-rc5 Test execution summary: total time: 151.2907s total number of events: 200000 total time taken by event execution: 2409.8387 per-request statistics: min: 0.0000s avg: 0.0120s max: 0.9306s approx. 95 percentile: 0.0439s Threads fairness: events (avg/stddev): 12500.0000/238.52 execution time (avg/stddev): 150.6149/0.01 -2.6.30-rc5-patched Test execution summary: total time: 140.8828s total number of events: 200000 total time taken by event execution: 2240.8577 per-request statistics: min: 0.0000s avg: 0.0112s max: 0.8750s approx. 95 percentile: 0.0418s Threads fairness: events (avg/stddev): 12500.0000/218.43 execution time (avg/stddev): 140.0536/0.01 arch: ia64 pagesize: 16k Thanks. Signed-off-by: Hisashi Hifumi <hifumi.hisashi@oss.ntt.co.jp> Signed-off-by: Ryusuke Konishi <konishi.ryusuke@lab.ntt.co.jp>
2009-05-13 10:19:40 +08:00
.is_partially_uptodate = block_is_partially_uptodate,
};
static int nilfs_insert_inode_locked(struct inode *inode,
struct nilfs_root *root,
unsigned long ino)
{
struct nilfs_iget_args args = {
.ino = ino, .root = root, .cno = 0, .for_gc = 0
};
return insert_inode_locked4(inode, ino, nilfs_iget_test, &args);
}
struct inode *nilfs_new_inode(struct inode *dir, umode_t mode)
{
struct super_block *sb = dir->i_sb;
struct the_nilfs *nilfs = sb->s_fs_info;
struct inode *inode;
struct nilfs_inode_info *ii;
struct nilfs_root *root;
int err = -ENOMEM;
ino_t ino;
inode = new_inode(sb);
if (unlikely(!inode))
goto failed;
mapping_set_gfp_mask(inode->i_mapping,
mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
root = NILFS_I(dir)->i_root;
ii = NILFS_I(inode);
ii->i_state = 1 << NILFS_I_NEW;
ii->i_root = root;
err = nilfs_ifile_create_inode(root->ifile, &ino, &ii->i_bh);
if (unlikely(err))
goto failed_ifile_create_inode;
/* reference count of i_bh inherits from nilfs_mdt_read_block() */
atomic64_inc(&root->inodes_count);
inode_init_owner(inode, dir, mode);
inode->i_ino = ino;
inode->i_mtime = inode->i_atime = inode->i_ctime = CURRENT_TIME;
if (S_ISREG(mode) || S_ISDIR(mode) || S_ISLNK(mode)) {
err = nilfs_bmap_read(ii->i_bmap, NULL);
if (err < 0)
goto failed_after_creation;
set_bit(NILFS_I_BMAP, &ii->i_state);
/* No lock is needed; iget() ensures it. */
}
ii->i_flags = nilfs_mask_flags(
mode, NILFS_I(dir)->i_flags & NILFS_FL_INHERITED);
/* ii->i_file_acl = 0; */
/* ii->i_dir_acl = 0; */
ii->i_dir_start_lookup = 0;
nilfs_set_inode_flags(inode);
spin_lock(&nilfs->ns_next_gen_lock);
inode->i_generation = nilfs->ns_next_generation++;
spin_unlock(&nilfs->ns_next_gen_lock);
if (nilfs_insert_inode_locked(inode, root, ino) < 0) {
err = -EIO;
goto failed_after_creation;
}
err = nilfs_init_acl(inode, dir);
if (unlikely(err))
goto failed_after_creation; /* never occur. When supporting
nilfs_init_acl(), proper cancellation of
above jobs should be considered */
return inode;
failed_after_creation:
clear_nlink(inode);
unlock_new_inode(inode);
iput(inode); /* raw_inode will be deleted through
nilfs_evict_inode() */
goto failed;
failed_ifile_create_inode:
make_bad_inode(inode);
iput(inode); /* if i_nlink == 1, generic_forget_inode() will be
called */
failed:
return ERR_PTR(err);
}
void nilfs_set_inode_flags(struct inode *inode)
{
unsigned int flags = NILFS_I(inode)->i_flags;
unsigned int new_fl = 0;
if (flags & FS_SYNC_FL)
new_fl |= S_SYNC;
if (flags & FS_APPEND_FL)
new_fl |= S_APPEND;
if (flags & FS_IMMUTABLE_FL)
new_fl |= S_IMMUTABLE;
if (flags & FS_NOATIME_FL)
new_fl |= S_NOATIME;
if (flags & FS_DIRSYNC_FL)
new_fl |= S_DIRSYNC;
inode_set_flags(inode, new_fl, S_SYNC | S_APPEND | S_IMMUTABLE |
S_NOATIME | S_DIRSYNC);
}
int nilfs_read_inode_common(struct inode *inode,
struct nilfs_inode *raw_inode)
{
struct nilfs_inode_info *ii = NILFS_I(inode);
int err;
inode->i_mode = le16_to_cpu(raw_inode->i_mode);
i_uid_write(inode, le32_to_cpu(raw_inode->i_uid));
i_gid_write(inode, le32_to_cpu(raw_inode->i_gid));
set_nlink(inode, le16_to_cpu(raw_inode->i_links_count));
inode->i_size = le64_to_cpu(raw_inode->i_size);
inode->i_atime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
inode->i_ctime.tv_sec = le64_to_cpu(raw_inode->i_ctime);
inode->i_mtime.tv_sec = le64_to_cpu(raw_inode->i_mtime);
inode->i_atime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
inode->i_ctime.tv_nsec = le32_to_cpu(raw_inode->i_ctime_nsec);
inode->i_mtime.tv_nsec = le32_to_cpu(raw_inode->i_mtime_nsec);
if (inode->i_nlink == 0)
return -ESTALE; /* this inode is deleted */
inode->i_blocks = le64_to_cpu(raw_inode->i_blocks);
ii->i_flags = le32_to_cpu(raw_inode->i_flags);
#if 0
ii->i_file_acl = le32_to_cpu(raw_inode->i_file_acl);
ii->i_dir_acl = S_ISREG(inode->i_mode) ?
0 : le32_to_cpu(raw_inode->i_dir_acl);
#endif
ii->i_dir_start_lookup = 0;
inode->i_generation = le32_to_cpu(raw_inode->i_generation);
if (S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) ||
S_ISLNK(inode->i_mode)) {
err = nilfs_bmap_read(ii->i_bmap, raw_inode);
if (err < 0)
return err;
set_bit(NILFS_I_BMAP, &ii->i_state);
/* No lock is needed; iget() ensures it. */
}
return 0;
}
static int __nilfs_read_inode(struct super_block *sb,
struct nilfs_root *root, unsigned long ino,
struct inode *inode)
{
struct the_nilfs *nilfs = sb->s_fs_info;
struct buffer_head *bh;
struct nilfs_inode *raw_inode;
int err;
down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
err = nilfs_ifile_get_inode_block(root->ifile, ino, &bh);
if (unlikely(err))
goto bad_inode;
raw_inode = nilfs_ifile_map_inode(root->ifile, ino, bh);
err = nilfs_read_inode_common(inode, raw_inode);
if (err)
goto failed_unmap;
if (S_ISREG(inode->i_mode)) {
inode->i_op = &nilfs_file_inode_operations;
inode->i_fop = &nilfs_file_operations;
inode->i_mapping->a_ops = &nilfs_aops;
} else if (S_ISDIR(inode->i_mode)) {
inode->i_op = &nilfs_dir_inode_operations;
inode->i_fop = &nilfs_dir_operations;
inode->i_mapping->a_ops = &nilfs_aops;
} else if (S_ISLNK(inode->i_mode)) {
inode->i_op = &nilfs_symlink_inode_operations;
inode->i_mapping->a_ops = &nilfs_aops;
} else {
inode->i_op = &nilfs_special_inode_operations;
init_special_inode(
inode, inode->i_mode,
huge_decode_dev(le64_to_cpu(raw_inode->i_device_code)));
}
nilfs_ifile_unmap_inode(root->ifile, ino, bh);
brelse(bh);
up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
nilfs_set_inode_flags(inode);
mapping_set_gfp_mask(inode->i_mapping,
mapping_gfp_mask(inode->i_mapping) & ~__GFP_FS);
return 0;
failed_unmap:
nilfs_ifile_unmap_inode(root->ifile, ino, bh);
brelse(bh);
bad_inode:
up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
return err;
}
static int nilfs_iget_test(struct inode *inode, void *opaque)
{
struct nilfs_iget_args *args = opaque;
struct nilfs_inode_info *ii;
if (args->ino != inode->i_ino || args->root != NILFS_I(inode)->i_root)
return 0;
ii = NILFS_I(inode);
if (!test_bit(NILFS_I_GCINODE, &ii->i_state))
return !args->for_gc;
return args->for_gc && args->cno == ii->i_cno;
}
static int nilfs_iget_set(struct inode *inode, void *opaque)
{
struct nilfs_iget_args *args = opaque;
inode->i_ino = args->ino;
if (args->for_gc) {
NILFS_I(inode)->i_state = 1 << NILFS_I_GCINODE;
NILFS_I(inode)->i_cno = args->cno;
NILFS_I(inode)->i_root = NULL;
} else {
if (args->root && args->ino == NILFS_ROOT_INO)
nilfs_get_root(args->root);
NILFS_I(inode)->i_root = args->root;
}
return 0;
}
struct inode *nilfs_ilookup(struct super_block *sb, struct nilfs_root *root,
unsigned long ino)
{
struct nilfs_iget_args args = {
.ino = ino, .root = root, .cno = 0, .for_gc = 0
};
return ilookup5(sb, ino, nilfs_iget_test, &args);
}
struct inode *nilfs_iget_locked(struct super_block *sb, struct nilfs_root *root,
unsigned long ino)
{
struct nilfs_iget_args args = {
.ino = ino, .root = root, .cno = 0, .for_gc = 0
};
return iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
}
struct inode *nilfs_iget(struct super_block *sb, struct nilfs_root *root,
unsigned long ino)
{
struct inode *inode;
int err;
inode = nilfs_iget_locked(sb, root, ino);
if (unlikely(!inode))
return ERR_PTR(-ENOMEM);
if (!(inode->i_state & I_NEW))
return inode;
err = __nilfs_read_inode(sb, root, ino, inode);
if (unlikely(err)) {
iget_failed(inode);
return ERR_PTR(err);
}
unlock_new_inode(inode);
return inode;
}
struct inode *nilfs_iget_for_gc(struct super_block *sb, unsigned long ino,
__u64 cno)
{
struct nilfs_iget_args args = {
.ino = ino, .root = NULL, .cno = cno, .for_gc = 1
};
struct inode *inode;
int err;
inode = iget5_locked(sb, ino, nilfs_iget_test, nilfs_iget_set, &args);
if (unlikely(!inode))
return ERR_PTR(-ENOMEM);
if (!(inode->i_state & I_NEW))
return inode;
err = nilfs_init_gcinode(inode);
if (unlikely(err)) {
iget_failed(inode);
return ERR_PTR(err);
}
unlock_new_inode(inode);
return inode;
}
void nilfs_write_inode_common(struct inode *inode,
struct nilfs_inode *raw_inode, int has_bmap)
{
struct nilfs_inode_info *ii = NILFS_I(inode);
raw_inode->i_mode = cpu_to_le16(inode->i_mode);
raw_inode->i_uid = cpu_to_le32(i_uid_read(inode));
raw_inode->i_gid = cpu_to_le32(i_gid_read(inode));
raw_inode->i_links_count = cpu_to_le16(inode->i_nlink);
raw_inode->i_size = cpu_to_le64(inode->i_size);
raw_inode->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
raw_inode->i_mtime = cpu_to_le64(inode->i_mtime.tv_sec);
raw_inode->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
raw_inode->i_mtime_nsec = cpu_to_le32(inode->i_mtime.tv_nsec);
raw_inode->i_blocks = cpu_to_le64(inode->i_blocks);
raw_inode->i_flags = cpu_to_le32(ii->i_flags);
raw_inode->i_generation = cpu_to_le32(inode->i_generation);
if (NILFS_ROOT_METADATA_FILE(inode->i_ino)) {
struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
/* zero-fill unused portion in the case of super root block */
raw_inode->i_xattr = 0;
raw_inode->i_pad = 0;
memset((void *)raw_inode + sizeof(*raw_inode), 0,
nilfs->ns_inode_size - sizeof(*raw_inode));
}
if (has_bmap)
nilfs_bmap_write(ii->i_bmap, raw_inode);
else if (S_ISCHR(inode->i_mode) || S_ISBLK(inode->i_mode))
raw_inode->i_device_code =
cpu_to_le64(huge_encode_dev(inode->i_rdev));
/* When extending inode, nilfs->ns_inode_size should be checked
for substitutions of appended fields */
}
void nilfs_update_inode(struct inode *inode, struct buffer_head *ibh, int flags)
{
ino_t ino = inode->i_ino;
struct nilfs_inode_info *ii = NILFS_I(inode);
struct inode *ifile = ii->i_root->ifile;
struct nilfs_inode *raw_inode;
raw_inode = nilfs_ifile_map_inode(ifile, ino, ibh);
if (test_and_clear_bit(NILFS_I_NEW, &ii->i_state))
memset(raw_inode, 0, NILFS_MDT(ifile)->mi_entry_size);
if (flags & I_DIRTY_DATASYNC)
set_bit(NILFS_I_INODE_SYNC, &ii->i_state);
nilfs_write_inode_common(inode, raw_inode, 0);
/* XXX: call with has_bmap = 0 is a workaround to avoid
deadlock of bmap. This delays update of i_bmap to just
before writing */
nilfs_ifile_unmap_inode(ifile, ino, ibh);
}
#define NILFS_MAX_TRUNCATE_BLOCKS 16384 /* 64MB for 4KB block */
static void nilfs_truncate_bmap(struct nilfs_inode_info *ii,
unsigned long from)
{
__u64 b;
int ret;
if (!test_bit(NILFS_I_BMAP, &ii->i_state))
return;
repeat:
ret = nilfs_bmap_last_key(ii->i_bmap, &b);
if (ret == -ENOENT)
return;
else if (ret < 0)
goto failed;
if (b < from)
return;
b -= min_t(__u64, NILFS_MAX_TRUNCATE_BLOCKS, b - from);
ret = nilfs_bmap_truncate(ii->i_bmap, b);
nilfs_relax_pressure_in_lock(ii->vfs_inode.i_sb);
if (!ret || (ret == -ENOMEM &&
nilfs_bmap_truncate(ii->i_bmap, b) == 0))
goto repeat;
failed:
nilfs_warning(ii->vfs_inode.i_sb, __func__,
"failed to truncate bmap (ino=%lu, err=%d)",
ii->vfs_inode.i_ino, ret);
}
void nilfs_truncate(struct inode *inode)
{
unsigned long blkoff;
unsigned int blocksize;
struct nilfs_transaction_info ti;
struct super_block *sb = inode->i_sb;
struct nilfs_inode_info *ii = NILFS_I(inode);
if (!test_bit(NILFS_I_BMAP, &ii->i_state))
return;
if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
return;
blocksize = sb->s_blocksize;
blkoff = (inode->i_size + blocksize - 1) >> sb->s_blocksize_bits;
nilfs_transaction_begin(sb, &ti, 0); /* never fails */
block_truncate_page(inode->i_mapping, inode->i_size, nilfs_get_block);
nilfs_truncate_bmap(ii, blkoff);
inode->i_mtime = inode->i_ctime = CURRENT_TIME;
if (IS_SYNC(inode))
nilfs_set_transaction_flag(NILFS_TI_SYNC);
nilfs_mark_inode_dirty(inode);
nilfs_set_file_dirty(inode, 0);
nilfs_transaction_commit(sb);
/* May construct a logical segment and may fail in sync mode.
But truncate has no return value. */
}
static void nilfs_clear_inode(struct inode *inode)
{
struct nilfs_inode_info *ii = NILFS_I(inode);
struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
/*
* Free resources allocated in nilfs_read_inode(), here.
*/
BUG_ON(!list_empty(&ii->i_dirty));
brelse(ii->i_bh);
ii->i_bh = NULL;
if (mdi && mdi->mi_palloc_cache)
nilfs_palloc_destroy_cache(inode);
if (test_bit(NILFS_I_BMAP, &ii->i_state))
nilfs_bmap_clear(ii->i_bmap);
nilfs_btnode_cache_clear(&ii->i_btnode_cache);
if (ii->i_root && inode->i_ino == NILFS_ROOT_INO)
nilfs_put_root(ii->i_root);
}
void nilfs_evict_inode(struct inode *inode)
{
struct nilfs_transaction_info ti;
struct super_block *sb = inode->i_sb;
struct nilfs_inode_info *ii = NILFS_I(inode);
int ret;
if (inode->i_nlink || !ii->i_root || unlikely(is_bad_inode(inode))) {
mm + fs: store shadow entries in page cache Reclaim will be leaving shadow entries in the page cache radix tree upon evicting the real page. As those pages are found from the LRU, an iput() can lead to the inode being freed concurrently. At this point, reclaim must no longer install shadow pages because the inode freeing code needs to ensure the page tree is really empty. Add an address_space flag, AS_EXITING, that the inode freeing code sets under the tree lock before doing the final truncate. Reclaim will check for this flag before installing shadow pages. Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> Reviewed-by: Rik van Riel <riel@redhat.com> Reviewed-by: Minchan Kim <minchan@kernel.org> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Bob Liu <bob.liu@oracle.com> Cc: Christoph Hellwig <hch@infradead.org> Cc: Dave Chinner <david@fromorbit.com> Cc: Greg Thelen <gthelen@google.com> Cc: Hugh Dickins <hughd@google.com> Cc: Jan Kara <jack@suse.cz> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Luigi Semenzato <semenzato@google.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Metin Doslu <metin@citusdata.com> Cc: Michel Lespinasse <walken@google.com> Cc: Ozgun Erdogan <ozgun@citusdata.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Roman Gushchin <klamm@yandex-team.ru> Cc: Ryan Mallon <rmallon@gmail.com> Cc: Tejun Heo <tj@kernel.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-04-04 05:47:49 +08:00
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
nilfs_clear_inode(inode);
return;
}
nilfs_transaction_begin(sb, &ti, 0); /* never fails */
mm + fs: store shadow entries in page cache Reclaim will be leaving shadow entries in the page cache radix tree upon evicting the real page. As those pages are found from the LRU, an iput() can lead to the inode being freed concurrently. At this point, reclaim must no longer install shadow pages because the inode freeing code needs to ensure the page tree is really empty. Add an address_space flag, AS_EXITING, that the inode freeing code sets under the tree lock before doing the final truncate. Reclaim will check for this flag before installing shadow pages. Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> Reviewed-by: Rik van Riel <riel@redhat.com> Reviewed-by: Minchan Kim <minchan@kernel.org> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Bob Liu <bob.liu@oracle.com> Cc: Christoph Hellwig <hch@infradead.org> Cc: Dave Chinner <david@fromorbit.com> Cc: Greg Thelen <gthelen@google.com> Cc: Hugh Dickins <hughd@google.com> Cc: Jan Kara <jack@suse.cz> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Luigi Semenzato <semenzato@google.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Metin Doslu <metin@citusdata.com> Cc: Michel Lespinasse <walken@google.com> Cc: Ozgun Erdogan <ozgun@citusdata.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Roman Gushchin <klamm@yandex-team.ru> Cc: Ryan Mallon <rmallon@gmail.com> Cc: Tejun Heo <tj@kernel.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-04-04 05:47:49 +08:00
truncate_inode_pages_final(&inode->i_data);
/* TODO: some of the following operations may fail. */
nilfs_truncate_bmap(ii, 0);
nilfs_mark_inode_dirty(inode);
clear_inode(inode);
ret = nilfs_ifile_delete_inode(ii->i_root->ifile, inode->i_ino);
if (!ret)
atomic64_dec(&ii->i_root->inodes_count);
nilfs_clear_inode(inode);
if (IS_SYNC(inode))
nilfs_set_transaction_flag(NILFS_TI_SYNC);
nilfs_transaction_commit(sb);
/* May construct a logical segment and may fail in sync mode.
But delete_inode has no return value. */
}
int nilfs_setattr(struct dentry *dentry, struct iattr *iattr)
{
struct nilfs_transaction_info ti;
struct inode *inode = d_inode(dentry);
struct super_block *sb = inode->i_sb;
int err;
err = inode_change_ok(inode, iattr);
if (err)
return err;
err = nilfs_transaction_begin(sb, &ti, 0);
if (unlikely(err))
return err;
if ((iattr->ia_valid & ATTR_SIZE) &&
iattr->ia_size != i_size_read(inode)) {
inode_dio_wait(inode);
truncate_setsize(inode, iattr->ia_size);
nilfs_truncate(inode);
}
setattr_copy(inode, iattr);
mark_inode_dirty(inode);
if (iattr->ia_valid & ATTR_MODE) {
err = nilfs_acl_chmod(inode);
if (unlikely(err))
goto out_err;
}
return nilfs_transaction_commit(sb);
out_err:
nilfs_transaction_abort(sb);
return err;
}
int nilfs_permission(struct inode *inode, int mask)
{
struct nilfs_root *root = NILFS_I(inode)->i_root;
if ((mask & MAY_WRITE) && root &&
root->cno != NILFS_CPTREE_CURRENT_CNO)
return -EROFS; /* snapshot is not writable */
return generic_permission(inode, mask);
}
int nilfs_load_inode_block(struct inode *inode, struct buffer_head **pbh)
{
struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
struct nilfs_inode_info *ii = NILFS_I(inode);
int err;
spin_lock(&nilfs->ns_inode_lock);
if (ii->i_bh == NULL) {
spin_unlock(&nilfs->ns_inode_lock);
err = nilfs_ifile_get_inode_block(ii->i_root->ifile,
inode->i_ino, pbh);
if (unlikely(err))
return err;
spin_lock(&nilfs->ns_inode_lock);
if (ii->i_bh == NULL)
ii->i_bh = *pbh;
else {
brelse(*pbh);
*pbh = ii->i_bh;
}
} else
*pbh = ii->i_bh;
get_bh(*pbh);
spin_unlock(&nilfs->ns_inode_lock);
return 0;
}
int nilfs_inode_dirty(struct inode *inode)
{
struct nilfs_inode_info *ii = NILFS_I(inode);
struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
int ret = 0;
if (!list_empty(&ii->i_dirty)) {
spin_lock(&nilfs->ns_inode_lock);
ret = test_bit(NILFS_I_DIRTY, &ii->i_state) ||
test_bit(NILFS_I_BUSY, &ii->i_state);
spin_unlock(&nilfs->ns_inode_lock);
}
return ret;
}
int nilfs_set_file_dirty(struct inode *inode, unsigned nr_dirty)
{
struct nilfs_inode_info *ii = NILFS_I(inode);
struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
atomic_add(nr_dirty, &nilfs->ns_ndirtyblks);
if (test_and_set_bit(NILFS_I_DIRTY, &ii->i_state))
return 0;
spin_lock(&nilfs->ns_inode_lock);
if (!test_bit(NILFS_I_QUEUED, &ii->i_state) &&
!test_bit(NILFS_I_BUSY, &ii->i_state)) {
/* Because this routine may race with nilfs_dispose_list(),
we have to check NILFS_I_QUEUED here, too. */
if (list_empty(&ii->i_dirty) && igrab(inode) == NULL) {
/* This will happen when somebody is freeing
this inode. */
nilfs_warning(inode->i_sb, __func__,
"cannot get inode (ino=%lu)\n",
inode->i_ino);
spin_unlock(&nilfs->ns_inode_lock);
return -EINVAL; /* NILFS_I_DIRTY may remain for
freeing inode */
}
list_move_tail(&ii->i_dirty, &nilfs->ns_dirty_files);
set_bit(NILFS_I_QUEUED, &ii->i_state);
}
spin_unlock(&nilfs->ns_inode_lock);
return 0;
}
int __nilfs_mark_inode_dirty(struct inode *inode, int flags)
{
struct buffer_head *ibh;
int err;
err = nilfs_load_inode_block(inode, &ibh);
if (unlikely(err)) {
nilfs_warning(inode->i_sb, __func__,
"failed to reget inode block.\n");
return err;
}
nilfs_update_inode(inode, ibh, flags);
mark_buffer_dirty(ibh);
nilfs_mdt_mark_dirty(NILFS_I(inode)->i_root->ifile);
brelse(ibh);
return 0;
}
/**
* nilfs_dirty_inode - reflect changes on given inode to an inode block.
* @inode: inode of the file to be registered.
*
* nilfs_dirty_inode() loads a inode block containing the specified
* @inode and copies data from a nilfs_inode to a corresponding inode
* entry in the inode block. This operation is excluded from the segment
* construction. This function can be called both as a single operation
* and as a part of indivisible file operations.
*/
void nilfs_dirty_inode(struct inode *inode, int flags)
{
struct nilfs_transaction_info ti;
struct nilfs_mdt_info *mdi = NILFS_MDT(inode);
if (is_bad_inode(inode)) {
nilfs_warning(inode->i_sb, __func__,
"tried to mark bad_inode dirty. ignored.\n");
dump_stack();
return;
}
if (mdi) {
nilfs_mdt_mark_dirty(inode);
return;
}
nilfs_transaction_begin(inode->i_sb, &ti, 0);
__nilfs_mark_inode_dirty(inode, flags);
nilfs_transaction_commit(inode->i_sb); /* never fails */
}
int nilfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
__u64 start, __u64 len)
{
struct the_nilfs *nilfs = inode->i_sb->s_fs_info;
__u64 logical = 0, phys = 0, size = 0;
__u32 flags = 0;
loff_t isize;
sector_t blkoff, end_blkoff;
sector_t delalloc_blkoff;
unsigned long delalloc_blklen;
unsigned int blkbits = inode->i_blkbits;
int ret, n;
ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC);
if (ret)
return ret;
mutex_lock(&inode->i_mutex);
isize = i_size_read(inode);
blkoff = start >> blkbits;
end_blkoff = (start + len - 1) >> blkbits;
delalloc_blklen = nilfs_find_uncommitted_extent(inode, blkoff,
&delalloc_blkoff);
do {
__u64 blkphy;
unsigned int maxblocks;
if (delalloc_blklen && blkoff == delalloc_blkoff) {
if (size) {
/* End of the current extent */
ret = fiemap_fill_next_extent(
fieinfo, logical, phys, size, flags);
if (ret)
break;
}
if (blkoff > end_blkoff)
break;
flags = FIEMAP_EXTENT_MERGED | FIEMAP_EXTENT_DELALLOC;
logical = blkoff << blkbits;
phys = 0;
size = delalloc_blklen << blkbits;
blkoff = delalloc_blkoff + delalloc_blklen;
delalloc_blklen = nilfs_find_uncommitted_extent(
inode, blkoff, &delalloc_blkoff);
continue;
}
/*
* Limit the number of blocks that we look up so as
* not to get into the next delayed allocation extent.
*/
maxblocks = INT_MAX;
if (delalloc_blklen)
maxblocks = min_t(sector_t, delalloc_blkoff - blkoff,
maxblocks);
blkphy = 0;
down_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
n = nilfs_bmap_lookup_contig(
NILFS_I(inode)->i_bmap, blkoff, &blkphy, maxblocks);
up_read(&NILFS_MDT(nilfs->ns_dat)->mi_sem);
if (n < 0) {
int past_eof;
if (unlikely(n != -ENOENT))
break; /* error */
/* HOLE */
blkoff++;
past_eof = ((blkoff << blkbits) >= isize);
if (size) {
/* End of the current extent */
if (past_eof)
flags |= FIEMAP_EXTENT_LAST;
ret = fiemap_fill_next_extent(
fieinfo, logical, phys, size, flags);
if (ret)
break;
size = 0;
}
if (blkoff > end_blkoff || past_eof)
break;
} else {
if (size) {
if (phys && blkphy << blkbits == phys + size) {
/* The current extent goes on */
size += n << blkbits;
} else {
/* Terminate the current extent */
ret = fiemap_fill_next_extent(
fieinfo, logical, phys, size,
flags);
if (ret || blkoff > end_blkoff)
break;
/* Start another extent */
flags = FIEMAP_EXTENT_MERGED;
logical = blkoff << blkbits;
phys = blkphy << blkbits;
size = n << blkbits;
}
} else {
/* Start a new extent */
flags = FIEMAP_EXTENT_MERGED;
logical = blkoff << blkbits;
phys = blkphy << blkbits;
size = n << blkbits;
}
blkoff += n;
}
cond_resched();
} while (true);
/* If ret is 1 then we just hit the end of the extent array */
if (ret == 1)
ret = 0;
mutex_unlock(&inode->i_mutex);
return ret;
}