2018-09-12 09:16:07 +08:00
|
|
|
// SPDX-License-Identifier: GPL-2.0
|
2012-11-29 12:28:09 +08:00
|
|
|
/*
|
2012-11-02 16:10:12 +08:00
|
|
|
* fs/f2fs/data.c
|
|
|
|
*
|
|
|
|
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
|
|
|
|
* http://www.samsung.com/
|
|
|
|
*/
|
|
|
|
#include <linux/fs.h>
|
|
|
|
#include <linux/f2fs_fs.h>
|
|
|
|
#include <linux/buffer_head.h>
|
|
|
|
#include <linux/mpage.h>
|
|
|
|
#include <linux/writeback.h>
|
|
|
|
#include <linux/backing-dev.h>
|
f2fs: expose f2fs_write_cache_pages
If there are gced dirty pages and normal dirty pages in the mapping
of one inode, we might writeback them alternately with discontinuous
block address, resulting in low performance.
This patch introduces f2fs_write_cache_pages with codes copied from
write_cache_pages in mm/page-writeback.c.
In this function, we refactor flow with two steps:
1) writeback all cold type pages.
2) writeback all non-cold type pages.
By using this method, f2fs will writeback dirty pages with the same
temperature in bunch mode, it makes writeouted block being with
more continuous address, so they can be merged as much as possible
in f2fs bio cache, and also it will reduce the chance of submiting
small IO from block layer.
Test environment: 8g nokia sd card (very old sd card, but it shows
better effect when testing with this patch, and with a 32g kingston
sd card, I didn't see much more improvement).
Test step:
1. touch testfile;
2. truncate -s 512K testfile;
3. write all pages with odd index;
4. trigger gc by ioctl;
5. write all pages with even index;
6. time fsync testfile.
before:
real 0m0.402s
user 0m0.000s
sys 0m0.000s
after:
real 0m0.143s
user 0m0.004s
sys 0m0.004s
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2015-07-14 18:56:10 +08:00
|
|
|
#include <linux/pagevec.h>
|
2012-11-02 16:10:12 +08:00
|
|
|
#include <linux/blkdev.h>
|
|
|
|
#include <linux/bio.h>
|
2019-07-02 10:15:29 +08:00
|
|
|
#include <linux/swap.h>
|
2012-12-20 05:19:30 +08:00
|
|
|
#include <linux/prefetch.h>
|
2015-02-23 00:58:50 +08:00
|
|
|
#include <linux/uio.h>
|
2015-04-10 02:20:42 +08:00
|
|
|
#include <linux/cleancache.h>
|
2017-02-03 02:15:33 +08:00
|
|
|
#include <linux/sched/signal.h>
|
2012-11-02 16:10:12 +08:00
|
|
|
|
|
|
|
#include "f2fs.h"
|
|
|
|
#include "node.h"
|
|
|
|
#include "segment.h"
|
2014-12-18 12:04:08 +08:00
|
|
|
#include "trace.h"
|
2013-04-23 15:38:02 +08:00
|
|
|
#include <trace/events/f2fs.h>
|
2012-11-02 16:10:12 +08:00
|
|
|
|
f2fs: refactor read path to allow multiple postprocessing steps
Currently f2fs's ->readpage() and ->readpages() assume that either the
data undergoes no postprocessing, or decryption only. But with
fs-verity, there will be an additional authenticity verification step,
and it may be needed either by itself, or combined with decryption.
To support this, store a 'struct bio_post_read_ctx' in ->bi_private
which contains a work struct, a bitmask of postprocessing steps that are
enabled, and an indicator of the current step. The bio completion
routine, if there was no I/O error, enqueues the first postprocessing
step. When that completes, it continues to the next step. Pages that
fail any postprocessing step have PageError set. Once all steps have
completed, pages without PageError set are set Uptodate, and all pages
are unlocked.
Also replace f2fs_encrypted_file() with a new function
f2fs_post_read_required() in places like direct I/O and garbage
collection that really should be testing whether the file needs special
I/O processing, not whether it is encrypted specifically.
This may also be useful for other future f2fs features such as
compression.
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-04-19 02:09:48 +08:00
|
|
|
#define NUM_PREALLOC_POST_READ_CTXS 128
|
|
|
|
|
|
|
|
static struct kmem_cache *bio_post_read_ctx_cache;
|
|
|
|
static mempool_t *bio_post_read_ctx_pool;
|
|
|
|
|
f2fs: don't wait writeback for datas during checkpoint
Normally, while committing checkpoint, we will wait on all pages to be
writebacked no matter the page is data or metadata, so in scenario where
there are lots of data IO being submitted with metadata, we may suffer
long latency for waiting writeback during checkpoint.
Indeed, we only care about persistence for pages with metadata, but not
pages with data, as file system consistent are only related to metadate,
so in order to avoid encountering long latency in above scenario, let's
recognize and reference metadata in submitted IOs, wait writeback only
for metadatas.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-11-16 10:41:20 +08:00
|
|
|
static bool __is_cp_guaranteed(struct page *page)
|
|
|
|
{
|
|
|
|
struct address_space *mapping = page->mapping;
|
|
|
|
struct inode *inode;
|
|
|
|
struct f2fs_sb_info *sbi;
|
|
|
|
|
|
|
|
if (!mapping)
|
|
|
|
return false;
|
|
|
|
|
|
|
|
inode = mapping->host;
|
|
|
|
sbi = F2FS_I_SB(inode);
|
|
|
|
|
|
|
|
if (inode->i_ino == F2FS_META_INO(sbi) ||
|
|
|
|
inode->i_ino == F2FS_NODE_INO(sbi) ||
|
|
|
|
S_ISDIR(inode->i_mode) ||
|
2018-05-08 14:06:03 +08:00
|
|
|
(S_ISREG(inode->i_mode) &&
|
f2fs: guarantee journalled quota data by checkpoint
For journalled quota mode, let checkpoint to flush dquot dirty data
and quota file data to guarntee persistence of all quota sysfile in
last checkpoint, by this way, we can avoid corrupting quota sysfile
when encountering SPO.
The implementation is as below:
1. add a global state SBI_QUOTA_NEED_FLUSH to indicate that there is
cached dquot metadata changes in quota subsystem, and later checkpoint
should:
a) flush dquot metadata into quota file.
b) flush quota file to storage to keep file usage be consistent.
2. add a global state SBI_QUOTA_NEED_REPAIR to indicate that quota
operation failed due to -EIO or -ENOSPC, so later,
a) checkpoint will skip syncing dquot metadata.
b) CP_QUOTA_NEED_FSCK_FLAG will be set in last cp pack to give a
hint for fsck repairing.
3. add a global state SBI_QUOTA_SKIP_FLUSH, in checkpoint, if quota
data updating is very heavy, it may cause hungtask in block_operation().
To avoid this, if our retry time exceed threshold, let's just skip
flushing and retry in next checkpoint().
Signed-off-by: Weichao Guo <guoweichao@huawei.com>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
[Jaegeuk Kim: avoid warnings and set fsck flag]
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-09-20 20:05:00 +08:00
|
|
|
(f2fs_is_atomic_file(inode) || IS_NOQUOTA(inode))) ||
|
f2fs: don't wait writeback for datas during checkpoint
Normally, while committing checkpoint, we will wait on all pages to be
writebacked no matter the page is data or metadata, so in scenario where
there are lots of data IO being submitted with metadata, we may suffer
long latency for waiting writeback during checkpoint.
Indeed, we only care about persistence for pages with metadata, but not
pages with data, as file system consistent are only related to metadate,
so in order to avoid encountering long latency in above scenario, let's
recognize and reference metadata in submitted IOs, wait writeback only
for metadatas.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-11-16 10:41:20 +08:00
|
|
|
is_cold_data(page))
|
|
|
|
return true;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2018-10-17 01:20:53 +08:00
|
|
|
static enum count_type __read_io_type(struct page *page)
|
|
|
|
{
|
2019-07-02 10:15:29 +08:00
|
|
|
struct address_space *mapping = page_file_mapping(page);
|
2018-10-17 01:20:53 +08:00
|
|
|
|
|
|
|
if (mapping) {
|
|
|
|
struct inode *inode = mapping->host;
|
|
|
|
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
|
|
|
|
|
|
|
|
if (inode->i_ino == F2FS_META_INO(sbi))
|
|
|
|
return F2FS_RD_META;
|
|
|
|
|
|
|
|
if (inode->i_ino == F2FS_NODE_INO(sbi))
|
|
|
|
return F2FS_RD_NODE;
|
|
|
|
}
|
|
|
|
return F2FS_RD_DATA;
|
|
|
|
}
|
|
|
|
|
f2fs: refactor read path to allow multiple postprocessing steps
Currently f2fs's ->readpage() and ->readpages() assume that either the
data undergoes no postprocessing, or decryption only. But with
fs-verity, there will be an additional authenticity verification step,
and it may be needed either by itself, or combined with decryption.
To support this, store a 'struct bio_post_read_ctx' in ->bi_private
which contains a work struct, a bitmask of postprocessing steps that are
enabled, and an indicator of the current step. The bio completion
routine, if there was no I/O error, enqueues the first postprocessing
step. When that completes, it continues to the next step. Pages that
fail any postprocessing step have PageError set. Once all steps have
completed, pages without PageError set are set Uptodate, and all pages
are unlocked.
Also replace f2fs_encrypted_file() with a new function
f2fs_post_read_required() in places like direct I/O and garbage
collection that really should be testing whether the file needs special
I/O processing, not whether it is encrypted specifically.
This may also be useful for other future f2fs features such as
compression.
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-04-19 02:09:48 +08:00
|
|
|
/* postprocessing steps for read bios */
|
|
|
|
enum bio_post_read_step {
|
|
|
|
STEP_INITIAL = 0,
|
|
|
|
STEP_DECRYPT,
|
f2fs: add fs-verity support
Add fs-verity support to f2fs. fs-verity is a filesystem feature that
enables transparent integrity protection and authentication of read-only
files. It uses a dm-verity like mechanism at the file level: a Merkle
tree is used to verify any block in the file in log(filesize) time. It
is implemented mainly by helper functions in fs/verity/. See
Documentation/filesystems/fsverity.rst for the full documentation.
The f2fs support for fs-verity consists of:
- Adding a filesystem feature flag and an inode flag for fs-verity.
- Implementing the fsverity_operations to support enabling verity on an
inode and reading/writing the verity metadata.
- Updating ->readpages() to verify data as it's read from verity files
and to support reading verity metadata pages.
- Updating ->write_begin(), ->write_end(), and ->writepages() to support
writing verity metadata pages.
- Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl().
Like ext4, f2fs stores the verity metadata (Merkle tree and
fsverity_descriptor) past the end of the file, starting at the first 64K
boundary beyond i_size. This approach works because (a) verity files
are readonly, and (b) pages fully beyond i_size aren't visible to
userspace but can be read/written internally by f2fs with only some
relatively small changes to f2fs. Extended attributes cannot be used
because (a) f2fs limits the total size of an inode's xattr entries to
4096 bytes, which wouldn't be enough for even a single Merkle tree
block, and (b) f2fs encryption doesn't encrypt xattrs, yet the verity
metadata *must* be encrypted when the file is because it contains hashes
of the plaintext data.
Acked-by: Jaegeuk Kim <jaegeuk@kernel.org>
Acked-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
2019-07-23 00:26:24 +08:00
|
|
|
STEP_VERITY,
|
f2fs: refactor read path to allow multiple postprocessing steps
Currently f2fs's ->readpage() and ->readpages() assume that either the
data undergoes no postprocessing, or decryption only. But with
fs-verity, there will be an additional authenticity verification step,
and it may be needed either by itself, or combined with decryption.
To support this, store a 'struct bio_post_read_ctx' in ->bi_private
which contains a work struct, a bitmask of postprocessing steps that are
enabled, and an indicator of the current step. The bio completion
routine, if there was no I/O error, enqueues the first postprocessing
step. When that completes, it continues to the next step. Pages that
fail any postprocessing step have PageError set. Once all steps have
completed, pages without PageError set are set Uptodate, and all pages
are unlocked.
Also replace f2fs_encrypted_file() with a new function
f2fs_post_read_required() in places like direct I/O and garbage
collection that really should be testing whether the file needs special
I/O processing, not whether it is encrypted specifically.
This may also be useful for other future f2fs features such as
compression.
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-04-19 02:09:48 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
struct bio_post_read_ctx {
|
|
|
|
struct bio *bio;
|
|
|
|
struct work_struct work;
|
|
|
|
unsigned int cur_step;
|
|
|
|
unsigned int enabled_steps;
|
|
|
|
};
|
|
|
|
|
|
|
|
static void __read_end_io(struct bio *bio)
|
2013-11-30 11:51:14 +08:00
|
|
|
{
|
f2fs: refactor read path to allow multiple postprocessing steps
Currently f2fs's ->readpage() and ->readpages() assume that either the
data undergoes no postprocessing, or decryption only. But with
fs-verity, there will be an additional authenticity verification step,
and it may be needed either by itself, or combined with decryption.
To support this, store a 'struct bio_post_read_ctx' in ->bi_private
which contains a work struct, a bitmask of postprocessing steps that are
enabled, and an indicator of the current step. The bio completion
routine, if there was no I/O error, enqueues the first postprocessing
step. When that completes, it continues to the next step. Pages that
fail any postprocessing step have PageError set. Once all steps have
completed, pages without PageError set are set Uptodate, and all pages
are unlocked.
Also replace f2fs_encrypted_file() with a new function
f2fs_post_read_required() in places like direct I/O and garbage
collection that really should be testing whether the file needs special
I/O processing, not whether it is encrypted specifically.
This may also be useful for other future f2fs features such as
compression.
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-04-19 02:09:48 +08:00
|
|
|
struct page *page;
|
|
|
|
struct bio_vec *bv;
|
2019-02-15 19:13:19 +08:00
|
|
|
struct bvec_iter_all iter_all;
|
2013-11-30 11:51:14 +08:00
|
|
|
|
2019-04-25 15:03:00 +08:00
|
|
|
bio_for_each_segment_all(bv, bio, iter_all) {
|
f2fs: refactor read path to allow multiple postprocessing steps
Currently f2fs's ->readpage() and ->readpages() assume that either the
data undergoes no postprocessing, or decryption only. But with
fs-verity, there will be an additional authenticity verification step,
and it may be needed either by itself, or combined with decryption.
To support this, store a 'struct bio_post_read_ctx' in ->bi_private
which contains a work struct, a bitmask of postprocessing steps that are
enabled, and an indicator of the current step. The bio completion
routine, if there was no I/O error, enqueues the first postprocessing
step. When that completes, it continues to the next step. Pages that
fail any postprocessing step have PageError set. Once all steps have
completed, pages without PageError set are set Uptodate, and all pages
are unlocked.
Also replace f2fs_encrypted_file() with a new function
f2fs_post_read_required() in places like direct I/O and garbage
collection that really should be testing whether the file needs special
I/O processing, not whether it is encrypted specifically.
This may also be useful for other future f2fs features such as
compression.
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-04-19 02:09:48 +08:00
|
|
|
page = bv->bv_page;
|
|
|
|
|
|
|
|
/* PG_error was set if any post_read step failed */
|
|
|
|
if (bio->bi_status || PageError(page)) {
|
|
|
|
ClearPageUptodate(page);
|
2018-09-26 04:54:33 +08:00
|
|
|
/* will re-read again later */
|
|
|
|
ClearPageError(page);
|
f2fs: refactor read path to allow multiple postprocessing steps
Currently f2fs's ->readpage() and ->readpages() assume that either the
data undergoes no postprocessing, or decryption only. But with
fs-verity, there will be an additional authenticity verification step,
and it may be needed either by itself, or combined with decryption.
To support this, store a 'struct bio_post_read_ctx' in ->bi_private
which contains a work struct, a bitmask of postprocessing steps that are
enabled, and an indicator of the current step. The bio completion
routine, if there was no I/O error, enqueues the first postprocessing
step. When that completes, it continues to the next step. Pages that
fail any postprocessing step have PageError set. Once all steps have
completed, pages without PageError set are set Uptodate, and all pages
are unlocked.
Also replace f2fs_encrypted_file() with a new function
f2fs_post_read_required() in places like direct I/O and garbage
collection that really should be testing whether the file needs special
I/O processing, not whether it is encrypted specifically.
This may also be useful for other future f2fs features such as
compression.
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-04-19 02:09:48 +08:00
|
|
|
} else {
|
|
|
|
SetPageUptodate(page);
|
|
|
|
}
|
2018-10-17 01:20:53 +08:00
|
|
|
dec_page_count(F2FS_P_SB(page), __read_io_type(page));
|
f2fs: refactor read path to allow multiple postprocessing steps
Currently f2fs's ->readpage() and ->readpages() assume that either the
data undergoes no postprocessing, or decryption only. But with
fs-verity, there will be an additional authenticity verification step,
and it may be needed either by itself, or combined with decryption.
To support this, store a 'struct bio_post_read_ctx' in ->bi_private
which contains a work struct, a bitmask of postprocessing steps that are
enabled, and an indicator of the current step. The bio completion
routine, if there was no I/O error, enqueues the first postprocessing
step. When that completes, it continues to the next step. Pages that
fail any postprocessing step have PageError set. Once all steps have
completed, pages without PageError set are set Uptodate, and all pages
are unlocked.
Also replace f2fs_encrypted_file() with a new function
f2fs_post_read_required() in places like direct I/O and garbage
collection that really should be testing whether the file needs special
I/O processing, not whether it is encrypted specifically.
This may also be useful for other future f2fs features such as
compression.
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-04-19 02:09:48 +08:00
|
|
|
unlock_page(page);
|
|
|
|
}
|
|
|
|
if (bio->bi_private)
|
|
|
|
mempool_free(bio->bi_private, bio_post_read_ctx_pool);
|
|
|
|
bio_put(bio);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void bio_post_read_processing(struct bio_post_read_ctx *ctx);
|
|
|
|
|
|
|
|
static void decrypt_work(struct work_struct *work)
|
|
|
|
{
|
|
|
|
struct bio_post_read_ctx *ctx =
|
|
|
|
container_of(work, struct bio_post_read_ctx, work);
|
|
|
|
|
|
|
|
fscrypt_decrypt_bio(ctx->bio);
|
|
|
|
|
|
|
|
bio_post_read_processing(ctx);
|
|
|
|
}
|
|
|
|
|
f2fs: add fs-verity support
Add fs-verity support to f2fs. fs-verity is a filesystem feature that
enables transparent integrity protection and authentication of read-only
files. It uses a dm-verity like mechanism at the file level: a Merkle
tree is used to verify any block in the file in log(filesize) time. It
is implemented mainly by helper functions in fs/verity/. See
Documentation/filesystems/fsverity.rst for the full documentation.
The f2fs support for fs-verity consists of:
- Adding a filesystem feature flag and an inode flag for fs-verity.
- Implementing the fsverity_operations to support enabling verity on an
inode and reading/writing the verity metadata.
- Updating ->readpages() to verify data as it's read from verity files
and to support reading verity metadata pages.
- Updating ->write_begin(), ->write_end(), and ->writepages() to support
writing verity metadata pages.
- Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl().
Like ext4, f2fs stores the verity metadata (Merkle tree and
fsverity_descriptor) past the end of the file, starting at the first 64K
boundary beyond i_size. This approach works because (a) verity files
are readonly, and (b) pages fully beyond i_size aren't visible to
userspace but can be read/written internally by f2fs with only some
relatively small changes to f2fs. Extended attributes cannot be used
because (a) f2fs limits the total size of an inode's xattr entries to
4096 bytes, which wouldn't be enough for even a single Merkle tree
block, and (b) f2fs encryption doesn't encrypt xattrs, yet the verity
metadata *must* be encrypted when the file is because it contains hashes
of the plaintext data.
Acked-by: Jaegeuk Kim <jaegeuk@kernel.org>
Acked-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
2019-07-23 00:26:24 +08:00
|
|
|
static void verity_work(struct work_struct *work)
|
|
|
|
{
|
|
|
|
struct bio_post_read_ctx *ctx =
|
|
|
|
container_of(work, struct bio_post_read_ctx, work);
|
|
|
|
|
|
|
|
fsverity_verify_bio(ctx->bio);
|
|
|
|
|
|
|
|
bio_post_read_processing(ctx);
|
|
|
|
}
|
|
|
|
|
f2fs: refactor read path to allow multiple postprocessing steps
Currently f2fs's ->readpage() and ->readpages() assume that either the
data undergoes no postprocessing, or decryption only. But with
fs-verity, there will be an additional authenticity verification step,
and it may be needed either by itself, or combined with decryption.
To support this, store a 'struct bio_post_read_ctx' in ->bi_private
which contains a work struct, a bitmask of postprocessing steps that are
enabled, and an indicator of the current step. The bio completion
routine, if there was no I/O error, enqueues the first postprocessing
step. When that completes, it continues to the next step. Pages that
fail any postprocessing step have PageError set. Once all steps have
completed, pages without PageError set are set Uptodate, and all pages
are unlocked.
Also replace f2fs_encrypted_file() with a new function
f2fs_post_read_required() in places like direct I/O and garbage
collection that really should be testing whether the file needs special
I/O processing, not whether it is encrypted specifically.
This may also be useful for other future f2fs features such as
compression.
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-04-19 02:09:48 +08:00
|
|
|
static void bio_post_read_processing(struct bio_post_read_ctx *ctx)
|
|
|
|
{
|
f2fs: add fs-verity support
Add fs-verity support to f2fs. fs-verity is a filesystem feature that
enables transparent integrity protection and authentication of read-only
files. It uses a dm-verity like mechanism at the file level: a Merkle
tree is used to verify any block in the file in log(filesize) time. It
is implemented mainly by helper functions in fs/verity/. See
Documentation/filesystems/fsverity.rst for the full documentation.
The f2fs support for fs-verity consists of:
- Adding a filesystem feature flag and an inode flag for fs-verity.
- Implementing the fsverity_operations to support enabling verity on an
inode and reading/writing the verity metadata.
- Updating ->readpages() to verify data as it's read from verity files
and to support reading verity metadata pages.
- Updating ->write_begin(), ->write_end(), and ->writepages() to support
writing verity metadata pages.
- Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl().
Like ext4, f2fs stores the verity metadata (Merkle tree and
fsverity_descriptor) past the end of the file, starting at the first 64K
boundary beyond i_size. This approach works because (a) verity files
are readonly, and (b) pages fully beyond i_size aren't visible to
userspace but can be read/written internally by f2fs with only some
relatively small changes to f2fs. Extended attributes cannot be used
because (a) f2fs limits the total size of an inode's xattr entries to
4096 bytes, which wouldn't be enough for even a single Merkle tree
block, and (b) f2fs encryption doesn't encrypt xattrs, yet the verity
metadata *must* be encrypted when the file is because it contains hashes
of the plaintext data.
Acked-by: Jaegeuk Kim <jaegeuk@kernel.org>
Acked-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
2019-07-23 00:26:24 +08:00
|
|
|
/*
|
|
|
|
* We use different work queues for decryption and for verity because
|
|
|
|
* verity may require reading metadata pages that need decryption, and
|
|
|
|
* we shouldn't recurse to the same workqueue.
|
|
|
|
*/
|
f2fs: refactor read path to allow multiple postprocessing steps
Currently f2fs's ->readpage() and ->readpages() assume that either the
data undergoes no postprocessing, or decryption only. But with
fs-verity, there will be an additional authenticity verification step,
and it may be needed either by itself, or combined with decryption.
To support this, store a 'struct bio_post_read_ctx' in ->bi_private
which contains a work struct, a bitmask of postprocessing steps that are
enabled, and an indicator of the current step. The bio completion
routine, if there was no I/O error, enqueues the first postprocessing
step. When that completes, it continues to the next step. Pages that
fail any postprocessing step have PageError set. Once all steps have
completed, pages without PageError set are set Uptodate, and all pages
are unlocked.
Also replace f2fs_encrypted_file() with a new function
f2fs_post_read_required() in places like direct I/O and garbage
collection that really should be testing whether the file needs special
I/O processing, not whether it is encrypted specifically.
This may also be useful for other future f2fs features such as
compression.
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-04-19 02:09:48 +08:00
|
|
|
switch (++ctx->cur_step) {
|
|
|
|
case STEP_DECRYPT:
|
|
|
|
if (ctx->enabled_steps & (1 << STEP_DECRYPT)) {
|
|
|
|
INIT_WORK(&ctx->work, decrypt_work);
|
|
|
|
fscrypt_enqueue_decrypt_work(&ctx->work);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
ctx->cur_step++;
|
|
|
|
/* fall-through */
|
f2fs: add fs-verity support
Add fs-verity support to f2fs. fs-verity is a filesystem feature that
enables transparent integrity protection and authentication of read-only
files. It uses a dm-verity like mechanism at the file level: a Merkle
tree is used to verify any block in the file in log(filesize) time. It
is implemented mainly by helper functions in fs/verity/. See
Documentation/filesystems/fsverity.rst for the full documentation.
The f2fs support for fs-verity consists of:
- Adding a filesystem feature flag and an inode flag for fs-verity.
- Implementing the fsverity_operations to support enabling verity on an
inode and reading/writing the verity metadata.
- Updating ->readpages() to verify data as it's read from verity files
and to support reading verity metadata pages.
- Updating ->write_begin(), ->write_end(), and ->writepages() to support
writing verity metadata pages.
- Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl().
Like ext4, f2fs stores the verity metadata (Merkle tree and
fsverity_descriptor) past the end of the file, starting at the first 64K
boundary beyond i_size. This approach works because (a) verity files
are readonly, and (b) pages fully beyond i_size aren't visible to
userspace but can be read/written internally by f2fs with only some
relatively small changes to f2fs. Extended attributes cannot be used
because (a) f2fs limits the total size of an inode's xattr entries to
4096 bytes, which wouldn't be enough for even a single Merkle tree
block, and (b) f2fs encryption doesn't encrypt xattrs, yet the verity
metadata *must* be encrypted when the file is because it contains hashes
of the plaintext data.
Acked-by: Jaegeuk Kim <jaegeuk@kernel.org>
Acked-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
2019-07-23 00:26:24 +08:00
|
|
|
case STEP_VERITY:
|
|
|
|
if (ctx->enabled_steps & (1 << STEP_VERITY)) {
|
|
|
|
INIT_WORK(&ctx->work, verity_work);
|
|
|
|
fsverity_enqueue_verify_work(&ctx->work);
|
|
|
|
return;
|
|
|
|
}
|
|
|
|
ctx->cur_step++;
|
|
|
|
/* fall-through */
|
f2fs: refactor read path to allow multiple postprocessing steps
Currently f2fs's ->readpage() and ->readpages() assume that either the
data undergoes no postprocessing, or decryption only. But with
fs-verity, there will be an additional authenticity verification step,
and it may be needed either by itself, or combined with decryption.
To support this, store a 'struct bio_post_read_ctx' in ->bi_private
which contains a work struct, a bitmask of postprocessing steps that are
enabled, and an indicator of the current step. The bio completion
routine, if there was no I/O error, enqueues the first postprocessing
step. When that completes, it continues to the next step. Pages that
fail any postprocessing step have PageError set. Once all steps have
completed, pages without PageError set are set Uptodate, and all pages
are unlocked.
Also replace f2fs_encrypted_file() with a new function
f2fs_post_read_required() in places like direct I/O and garbage
collection that really should be testing whether the file needs special
I/O processing, not whether it is encrypted specifically.
This may also be useful for other future f2fs features such as
compression.
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-04-19 02:09:48 +08:00
|
|
|
default:
|
|
|
|
__read_end_io(ctx->bio);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool f2fs_bio_post_read_required(struct bio *bio)
|
|
|
|
{
|
|
|
|
return bio->bi_private && !bio->bi_status;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void f2fs_read_end_io(struct bio *bio)
|
|
|
|
{
|
2024-06-12 13:13:20 +08:00
|
|
|
struct f2fs_sb_info *sbi = F2FS_P_SB(bio_first_page_all(bio));
|
|
|
|
|
|
|
|
if (time_to_inject(sbi, FAULT_READ_IO)) {
|
|
|
|
f2fs_show_injection_info(sbi, FAULT_READ_IO);
|
2017-06-03 15:38:06 +08:00
|
|
|
bio->bi_status = BLK_STS_IOERR;
|
2017-02-25 11:08:28 +08:00
|
|
|
}
|
2016-09-18 23:30:07 +08:00
|
|
|
|
f2fs: refactor read path to allow multiple postprocessing steps
Currently f2fs's ->readpage() and ->readpages() assume that either the
data undergoes no postprocessing, or decryption only. But with
fs-verity, there will be an additional authenticity verification step,
and it may be needed either by itself, or combined with decryption.
To support this, store a 'struct bio_post_read_ctx' in ->bi_private
which contains a work struct, a bitmask of postprocessing steps that are
enabled, and an indicator of the current step. The bio completion
routine, if there was no I/O error, enqueues the first postprocessing
step. When that completes, it continues to the next step. Pages that
fail any postprocessing step have PageError set. Once all steps have
completed, pages without PageError set are set Uptodate, and all pages
are unlocked.
Also replace f2fs_encrypted_file() with a new function
f2fs_post_read_required() in places like direct I/O and garbage
collection that really should be testing whether the file needs special
I/O processing, not whether it is encrypted specifically.
This may also be useful for other future f2fs features such as
compression.
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-04-19 02:09:48 +08:00
|
|
|
if (f2fs_bio_post_read_required(bio)) {
|
|
|
|
struct bio_post_read_ctx *ctx = bio->bi_private;
|
2015-04-10 02:20:42 +08:00
|
|
|
|
f2fs: refactor read path to allow multiple postprocessing steps
Currently f2fs's ->readpage() and ->readpages() assume that either the
data undergoes no postprocessing, or decryption only. But with
fs-verity, there will be an additional authenticity verification step,
and it may be needed either by itself, or combined with decryption.
To support this, store a 'struct bio_post_read_ctx' in ->bi_private
which contains a work struct, a bitmask of postprocessing steps that are
enabled, and an indicator of the current step. The bio completion
routine, if there was no I/O error, enqueues the first postprocessing
step. When that completes, it continues to the next step. Pages that
fail any postprocessing step have PageError set. Once all steps have
completed, pages without PageError set are set Uptodate, and all pages
are unlocked.
Also replace f2fs_encrypted_file() with a new function
f2fs_post_read_required() in places like direct I/O and garbage
collection that really should be testing whether the file needs special
I/O processing, not whether it is encrypted specifically.
This may also be useful for other future f2fs features such as
compression.
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-04-19 02:09:48 +08:00
|
|
|
ctx->cur_step = STEP_INITIAL;
|
|
|
|
bio_post_read_processing(ctx);
|
|
|
|
return;
|
2015-04-10 02:20:42 +08:00
|
|
|
}
|
f2fs: refactor read path to allow multiple postprocessing steps
Currently f2fs's ->readpage() and ->readpages() assume that either the
data undergoes no postprocessing, or decryption only. But with
fs-verity, there will be an additional authenticity verification step,
and it may be needed either by itself, or combined with decryption.
To support this, store a 'struct bio_post_read_ctx' in ->bi_private
which contains a work struct, a bitmask of postprocessing steps that are
enabled, and an indicator of the current step. The bio completion
routine, if there was no I/O error, enqueues the first postprocessing
step. When that completes, it continues to the next step. Pages that
fail any postprocessing step have PageError set. Once all steps have
completed, pages without PageError set are set Uptodate, and all pages
are unlocked.
Also replace f2fs_encrypted_file() with a new function
f2fs_post_read_required() in places like direct I/O and garbage
collection that really should be testing whether the file needs special
I/O processing, not whether it is encrypted specifically.
This may also be useful for other future f2fs features such as
compression.
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-04-19 02:09:48 +08:00
|
|
|
|
|
|
|
__read_end_io(bio);
|
2015-04-10 02:20:42 +08:00
|
|
|
}
|
|
|
|
|
2015-07-20 21:29:37 +08:00
|
|
|
static void f2fs_write_end_io(struct bio *bio)
|
2013-11-30 11:51:14 +08:00
|
|
|
{
|
2014-02-03 09:50:22 +08:00
|
|
|
struct f2fs_sb_info *sbi = bio->bi_private;
|
2014-01-31 03:19:05 +08:00
|
|
|
struct bio_vec *bvec;
|
2019-02-15 19:13:19 +08:00
|
|
|
struct bvec_iter_all iter_all;
|
2013-11-30 11:51:14 +08:00
|
|
|
|
2018-09-12 09:22:29 +08:00
|
|
|
if (time_to_inject(sbi, FAULT_WRITE_IO)) {
|
2024-06-12 13:13:20 +08:00
|
|
|
f2fs_show_injection_info(sbi, FAULT_WRITE_IO);
|
2018-09-12 09:22:29 +08:00
|
|
|
bio->bi_status = BLK_STS_IOERR;
|
|
|
|
}
|
|
|
|
|
2019-04-25 15:03:00 +08:00
|
|
|
bio_for_each_segment_all(bvec, bio, iter_all) {
|
2013-11-30 11:51:14 +08:00
|
|
|
struct page *page = bvec->bv_page;
|
f2fs: don't wait writeback for datas during checkpoint
Normally, while committing checkpoint, we will wait on all pages to be
writebacked no matter the page is data or metadata, so in scenario where
there are lots of data IO being submitted with metadata, we may suffer
long latency for waiting writeback during checkpoint.
Indeed, we only care about persistence for pages with metadata, but not
pages with data, as file system consistent are only related to metadate,
so in order to avoid encountering long latency in above scenario, let's
recognize and reference metadata in submitted IOs, wait writeback only
for metadatas.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-11-16 10:41:20 +08:00
|
|
|
enum count_type type = WB_DATA_TYPE(page);
|
2013-11-30 11:51:14 +08:00
|
|
|
|
2016-12-15 02:12:56 +08:00
|
|
|
if (IS_DUMMY_WRITTEN_PAGE(page)) {
|
|
|
|
set_page_private(page, (unsigned long)NULL);
|
|
|
|
ClearPagePrivate(page);
|
|
|
|
unlock_page(page);
|
|
|
|
mempool_free(page, sbi->write_io_dummy);
|
|
|
|
|
2017-06-03 15:38:06 +08:00
|
|
|
if (unlikely(bio->bi_status))
|
2016-12-15 02:12:56 +08:00
|
|
|
f2fs_stop_checkpoint(sbi, true);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2019-05-21 00:29:39 +08:00
|
|
|
fscrypt_finalize_bounce_page(&page);
|
2015-04-24 03:04:33 +08:00
|
|
|
|
2017-06-03 15:38:06 +08:00
|
|
|
if (unlikely(bio->bi_status)) {
|
2016-10-12 04:56:01 +08:00
|
|
|
mapping_set_error(page->mapping, -EIO);
|
2018-01-01 08:26:38 +08:00
|
|
|
if (type == F2FS_WB_CP_DATA)
|
|
|
|
f2fs_stop_checkpoint(sbi, true);
|
2013-11-30 11:51:14 +08:00
|
|
|
}
|
2018-01-11 14:19:32 +08:00
|
|
|
|
|
|
|
f2fs_bug_on(sbi, page->mapping == NODE_MAPPING(sbi) &&
|
|
|
|
page->index != nid_of_node(page));
|
|
|
|
|
f2fs: don't wait writeback for datas during checkpoint
Normally, while committing checkpoint, we will wait on all pages to be
writebacked no matter the page is data or metadata, so in scenario where
there are lots of data IO being submitted with metadata, we may suffer
long latency for waiting writeback during checkpoint.
Indeed, we only care about persistence for pages with metadata, but not
pages with data, as file system consistent are only related to metadate,
so in order to avoid encountering long latency in above scenario, let's
recognize and reference metadata in submitted IOs, wait writeback only
for metadatas.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-11-16 10:41:20 +08:00
|
|
|
dec_page_count(sbi, type);
|
f2fs: fix to avoid broken of dnode block list
f2fs recovery flow is relying on dnode block link list, it means fsynced
file recovery depends on previous dnode's persistence in the list, so
during fsync() we should wait on all regular inode's dnode writebacked
before issuing flush.
By this way, we can avoid dnode block list being broken by out-of-order
IO submission due to IO scheduler or driver.
Sheng Yong helps to do the test with this patch:
Target:/data (f2fs, -)
64MB / 32768KB / 4KB / 8
1 / PERSIST / Index
Base:
SEQ-RD(MB/s) SEQ-WR(MB/s) RND-RD(IOPS) RND-WR(IOPS) Insert(TPS) Update(TPS) Delete(TPS)
1 867.82 204.15 41440.03 41370.54 680.8 1025.94 1031.08
2 871.87 205.87 41370.3 40275.2 791.14 1065.84 1101.7
3 866.52 205.69 41795.67 40596.16 694.69 1037.16 1031.48
Avg 868.7366667 205.2366667 41535.33333 40747.3 722.21 1042.98 1054.753333
After:
SEQ-RD(MB/s) SEQ-WR(MB/s) RND-RD(IOPS) RND-WR(IOPS) Insert(TPS) Update(TPS) Delete(TPS)
1 798.81 202.5 41143 40613.87 602.71 838.08 913.83
2 805.79 206.47 40297.2 41291.46 604.44 840.75 924.27
3 814.83 206.17 41209.57 40453.62 602.85 834.66 927.91
Avg 806.4766667 205.0466667 40883.25667 40786.31667 603.3333333 837.83 922.0033333
Patched/Original:
0.928332713 0.999074239 0.984300676 1.000957528 0.835398753 0.803303994 0.874141189
It looks like atomic write will suffer performance regression.
I suspect that the criminal is that we forcing to wait all dnode being in
storage cache before we issue PREFLUSH+FUA.
BTW, will commit ("f2fs: don't need to wait for node writes for atomic write")
cause the problem: we will lose data of last transaction after SPO, even if
atomic write return no error:
- atomic_open();
- write() P1, P2, P3;
- atomic_commit();
- writeback data: P1, P2, P3;
- writeback node: N1, N2, N3; <--- If N1, N2 is not writebacked, N3 with fsync_mark is
writebacked, In SPOR, we won't find N3 since node chain is broken, turns out that losing
last transaction.
- preflush + fua;
- power-cut
If we don't wait dnode writeback for atomic_write:
SEQ-RD(MB/s) SEQ-WR(MB/s) RND-RD(IOPS) RND-WR(IOPS) Insert(TPS) Update(TPS) Delete(TPS)
1 779.91 206.03 41621.5 40333.16 716.9 1038.21 1034.85
2 848.51 204.35 40082.44 39486.17 791.83 1119.96 1083.77
3 772.12 206.27 41335.25 41599.65 723.29 1055.07 971.92
Avg 800.18 205.55 41013.06333 40472.99333 744.0066667 1071.08 1030.18
Patched/Original:
0.92108464 1.001526693 0.987425886 0.993268102 1.030180511 1.026942031 0.976702294
SQLite's performance recovers.
Jaegeuk:
"Practically, I don't see db corruption becase of this. We can excuse to lose
the last transaction."
Finally, we decide to keep original implementation of atomic write interface
sematics that we don't wait all dnode writeback before preflush+fua submission.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-08-02 23:03:19 +08:00
|
|
|
if (f2fs_in_warm_node_list(sbi, page))
|
|
|
|
f2fs_del_fsync_node_entry(sbi, page);
|
f2fs: don't wait writeback for datas during checkpoint
Normally, while committing checkpoint, we will wait on all pages to be
writebacked no matter the page is data or metadata, so in scenario where
there are lots of data IO being submitted with metadata, we may suffer
long latency for waiting writeback during checkpoint.
Indeed, we only care about persistence for pages with metadata, but not
pages with data, as file system consistent are only related to metadate,
so in order to avoid encountering long latency in above scenario, let's
recognize and reference metadata in submitted IOs, wait writeback only
for metadatas.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-11-16 10:41:20 +08:00
|
|
|
clear_cold_data(page);
|
2013-11-30 11:51:14 +08:00
|
|
|
end_page_writeback(page);
|
2014-01-31 03:19:05 +08:00
|
|
|
}
|
f2fs: don't wait writeback for datas during checkpoint
Normally, while committing checkpoint, we will wait on all pages to be
writebacked no matter the page is data or metadata, so in scenario where
there are lots of data IO being submitted with metadata, we may suffer
long latency for waiting writeback during checkpoint.
Indeed, we only care about persistence for pages with metadata, but not
pages with data, as file system consistent are only related to metadate,
so in order to avoid encountering long latency in above scenario, let's
recognize and reference metadata in submitted IOs, wait writeback only
for metadatas.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-11-16 10:41:20 +08:00
|
|
|
if (!get_pages(sbi, F2FS_WB_CP_DATA) &&
|
2016-05-18 07:23:36 +08:00
|
|
|
wq_has_sleeper(&sbi->cp_wait))
|
2013-11-30 11:51:14 +08:00
|
|
|
wake_up(&sbi->cp_wait);
|
|
|
|
|
|
|
|
bio_put(bio);
|
|
|
|
}
|
|
|
|
|
2016-10-07 10:02:05 +08:00
|
|
|
/*
|
|
|
|
* Return true, if pre_bio's bdev is same as its target device.
|
|
|
|
*/
|
|
|
|
struct block_device *f2fs_target_device(struct f2fs_sb_info *sbi,
|
|
|
|
block_t blk_addr, struct bio *bio)
|
|
|
|
{
|
|
|
|
struct block_device *bdev = sbi->sb->s_bdev;
|
|
|
|
int i;
|
|
|
|
|
2019-03-16 08:13:06 +08:00
|
|
|
if (f2fs_is_multi_device(sbi)) {
|
|
|
|
for (i = 0; i < sbi->s_ndevs; i++) {
|
|
|
|
if (FDEV(i).start_blk <= blk_addr &&
|
|
|
|
FDEV(i).end_blk >= blk_addr) {
|
|
|
|
blk_addr -= FDEV(i).start_blk;
|
|
|
|
bdev = FDEV(i).bdev;
|
|
|
|
break;
|
|
|
|
}
|
2016-10-07 10:02:05 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
if (bio) {
|
2017-08-24 01:10:32 +08:00
|
|
|
bio_set_dev(bio, bdev);
|
2016-10-07 10:02:05 +08:00
|
|
|
bio->bi_iter.bi_sector = SECTOR_FROM_BLOCK(blk_addr);
|
|
|
|
}
|
|
|
|
return bdev;
|
|
|
|
}
|
|
|
|
|
|
|
|
int f2fs_target_device_index(struct f2fs_sb_info *sbi, block_t blkaddr)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
2019-03-16 08:13:06 +08:00
|
|
|
if (!f2fs_is_multi_device(sbi))
|
|
|
|
return 0;
|
|
|
|
|
2016-10-07 10:02:05 +08:00
|
|
|
for (i = 0; i < sbi->s_ndevs; i++)
|
|
|
|
if (FDEV(i).start_blk <= blkaddr && FDEV(i).end_blk >= blkaddr)
|
|
|
|
return i;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool __same_bdev(struct f2fs_sb_info *sbi,
|
|
|
|
block_t blk_addr, struct bio *bio)
|
|
|
|
{
|
2017-08-24 01:10:32 +08:00
|
|
|
struct block_device *b = f2fs_target_device(sbi, blk_addr, NULL);
|
|
|
|
return bio->bi_disk == b->bd_disk && bio->bi_partno == b->bd_partno;
|
2016-10-07 10:02:05 +08:00
|
|
|
}
|
|
|
|
|
2013-12-20 17:39:59 +08:00
|
|
|
/*
|
|
|
|
* Low-level block read/write IO operations.
|
|
|
|
*/
|
2019-08-23 17:58:35 +08:00
|
|
|
static struct bio *__bio_alloc(struct f2fs_io_info *fio, int npages)
|
2013-12-20 17:39:59 +08:00
|
|
|
{
|
2019-08-23 17:58:35 +08:00
|
|
|
struct f2fs_sb_info *sbi = fio->sbi;
|
2013-12-20 17:39:59 +08:00
|
|
|
struct bio *bio;
|
|
|
|
|
2017-10-28 16:52:31 +08:00
|
|
|
bio = f2fs_bio_alloc(sbi, npages, true);
|
2013-12-20 17:39:59 +08:00
|
|
|
|
2019-08-23 17:58:35 +08:00
|
|
|
f2fs_target_device(sbi, fio->new_blkaddr, bio);
|
|
|
|
if (is_read_io(fio->op)) {
|
2018-01-31 10:36:57 +08:00
|
|
|
bio->bi_end_io = f2fs_read_end_io;
|
|
|
|
bio->bi_private = NULL;
|
|
|
|
} else {
|
|
|
|
bio->bi_end_io = f2fs_write_end_io;
|
|
|
|
bio->bi_private = sbi;
|
2019-08-23 17:58:35 +08:00
|
|
|
bio->bi_write_hint = f2fs_io_type_to_rw_hint(sbi,
|
|
|
|
fio->type, fio->temp);
|
2018-01-31 10:36:57 +08:00
|
|
|
}
|
2019-08-23 17:58:35 +08:00
|
|
|
if (fio->io_wbc)
|
|
|
|
wbc_init_bio(fio->io_wbc, bio);
|
2013-12-20 17:39:59 +08:00
|
|
|
|
|
|
|
return bio;
|
|
|
|
}
|
|
|
|
|
2016-07-28 01:36:31 +08:00
|
|
|
static inline void __submit_bio(struct f2fs_sb_info *sbi,
|
|
|
|
struct bio *bio, enum page_type type)
|
2016-05-18 07:23:36 +08:00
|
|
|
{
|
2016-07-28 01:36:31 +08:00
|
|
|
if (!is_read_io(bio_op(bio))) {
|
2016-12-15 02:12:56 +08:00
|
|
|
unsigned int start;
|
|
|
|
|
|
|
|
if (type != DATA && type != NODE)
|
|
|
|
goto submit_io;
|
|
|
|
|
2018-07-12 23:09:28 +08:00
|
|
|
if (test_opt(sbi, LFS) && current->plug)
|
2018-02-06 08:21:45 +08:00
|
|
|
blk_finish_plug(current->plug);
|
|
|
|
|
2024-06-11 20:26:44 +08:00
|
|
|
if (!F2FS_IO_ALIGNED(sbi))
|
2019-08-28 17:33:38 +08:00
|
|
|
goto submit_io;
|
|
|
|
|
2016-12-15 02:12:56 +08:00
|
|
|
start = bio->bi_iter.bi_size >> F2FS_BLKSIZE_BITS;
|
|
|
|
start %= F2FS_IO_SIZE(sbi);
|
|
|
|
|
|
|
|
if (start == 0)
|
|
|
|
goto submit_io;
|
|
|
|
|
|
|
|
/* fill dummy pages */
|
|
|
|
for (; start < F2FS_IO_SIZE(sbi); start++) {
|
|
|
|
struct page *page =
|
|
|
|
mempool_alloc(sbi->write_io_dummy,
|
2019-02-19 10:31:52 +08:00
|
|
|
GFP_NOIO | __GFP_NOFAIL);
|
2016-12-15 02:12:56 +08:00
|
|
|
f2fs_bug_on(sbi, !page);
|
|
|
|
|
2019-02-19 10:31:52 +08:00
|
|
|
zero_user_segment(page, 0, PAGE_SIZE);
|
2016-12-15 02:12:56 +08:00
|
|
|
SetPagePrivate(page);
|
|
|
|
set_page_private(page, (unsigned long)DUMMY_WRITTEN_PAGE);
|
|
|
|
lock_page(page);
|
|
|
|
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE)
|
|
|
|
f2fs_bug_on(sbi, 1);
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* In the NODE case, we lose next block address chain. So, we
|
|
|
|
* need to do checkpoint in f2fs_sync_file.
|
|
|
|
*/
|
|
|
|
if (type == NODE)
|
|
|
|
set_sbi_flag(sbi, SBI_NEED_CP);
|
2016-06-05 05:25:24 +08:00
|
|
|
}
|
2016-12-15 02:12:56 +08:00
|
|
|
submit_io:
|
2016-12-22 04:13:03 +08:00
|
|
|
if (is_read_io(bio_op(bio)))
|
|
|
|
trace_f2fs_submit_read_bio(sbi->sb, type, bio);
|
|
|
|
else
|
|
|
|
trace_f2fs_submit_write_bio(sbi->sb, type, bio);
|
2016-06-06 03:31:41 +08:00
|
|
|
submit_bio(bio);
|
2016-05-18 07:23:36 +08:00
|
|
|
}
|
|
|
|
|
2013-12-11 12:54:01 +08:00
|
|
|
static void __submit_merged_bio(struct f2fs_bio_info *io)
|
2013-11-30 11:51:14 +08:00
|
|
|
{
|
2013-12-11 12:54:01 +08:00
|
|
|
struct f2fs_io_info *fio = &io->fio;
|
2013-11-30 11:51:14 +08:00
|
|
|
|
|
|
|
if (!io->bio)
|
|
|
|
return;
|
|
|
|
|
2016-12-22 04:13:03 +08:00
|
|
|
bio_set_op_attrs(io->bio, fio->op, fio->op_flags);
|
|
|
|
|
2016-06-06 03:31:55 +08:00
|
|
|
if (is_read_io(fio->op))
|
2016-12-22 04:13:03 +08:00
|
|
|
trace_f2fs_prepare_read_bio(io->sbi->sb, fio->type, io->bio);
|
2014-10-30 05:37:22 +08:00
|
|
|
else
|
2016-12-22 04:13:03 +08:00
|
|
|
trace_f2fs_prepare_write_bio(io->sbi->sb, fio->type, io->bio);
|
2016-06-06 03:31:55 +08:00
|
|
|
|
2016-07-28 01:36:31 +08:00
|
|
|
__submit_bio(io->sbi, io->bio, fio->type);
|
2013-11-30 11:51:14 +08:00
|
|
|
io->bio = NULL;
|
|
|
|
}
|
|
|
|
|
f2fs: add bio cache for IPU
SQLite in Wal mode may trigger sequential IPU write in db-wal file, after
commit d1b3e72d5490 ("f2fs: submit bio of in-place-update pages"), we
lost the chance of merging page in inner managed bio cache, result in
submitting more small-sized IO.
So let's add temporary bio in writepages() to cache mergeable write IO as
much as possible.
Test case:
1. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
2. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
Before:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65552, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65560, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65568, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65576, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65584, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65592, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65600, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65608, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65616, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65624, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65632, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65640, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65648, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65656, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65664, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57352, size = 4096
After:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 65536
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57368, size = 4096
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-02-19 16:15:29 +08:00
|
|
|
static bool __has_merged_page(struct bio *bio, struct inode *inode,
|
2018-09-27 23:41:16 +08:00
|
|
|
struct page *page, nid_t ino)
|
2016-01-18 18:24:59 +08:00
|
|
|
{
|
|
|
|
struct bio_vec *bvec;
|
|
|
|
struct page *target;
|
2019-02-15 19:13:19 +08:00
|
|
|
struct bvec_iter_all iter_all;
|
2016-01-18 18:24:59 +08:00
|
|
|
|
f2fs: add bio cache for IPU
SQLite in Wal mode may trigger sequential IPU write in db-wal file, after
commit d1b3e72d5490 ("f2fs: submit bio of in-place-update pages"), we
lost the chance of merging page in inner managed bio cache, result in
submitting more small-sized IO.
So let's add temporary bio in writepages() to cache mergeable write IO as
much as possible.
Test case:
1. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
2. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
Before:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65552, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65560, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65568, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65576, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65584, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65592, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65600, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65608, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65616, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65624, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65632, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65640, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65648, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65656, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65664, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57352, size = 4096
After:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 65536
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57368, size = 4096
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-02-19 16:15:29 +08:00
|
|
|
if (!bio)
|
2016-01-18 18:24:59 +08:00
|
|
|
return false;
|
f2fs: introduce f2fs_submit_merged_bio_cond
f2fs use single bio buffer per type data (META/NODE/DATA) for caching
writes locating in continuous block address as many as possible, after
submitting, these writes may be still cached in bio buffer, so we have
to flush cached writes in bio buffer by calling f2fs_submit_merged_bio.
Unfortunately, in the scenario of high concurrency, bio buffer could be
flushed by someone else before we submit it as below reasons:
a) there is no space in bio buffer.
b) add a request of different type (SYNC, ASYNC).
c) add a discontinuous block address.
For this condition, f2fs_submit_merged_bio will be devastating, because
it could break the following merging of writes in bio buffer, split one
big bio into two smaller one.
This patch introduces f2fs_submit_merged_bio_cond which can do a
conditional submitting with bio buffer, before submitting it will judge
whether:
- page in DATA type bio buffer is matching with specified page;
- page in DATA type bio buffer is belong to specified inode;
- page in NODE type bio buffer is belong to specified inode;
If there is no eligible page in bio buffer, we will skip submitting step,
result in gaining more chance to merge consecutive block IOs in bio cache.
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-01-18 18:28:11 +08:00
|
|
|
|
2018-09-27 23:41:16 +08:00
|
|
|
if (!inode && !page && !ino)
|
f2fs: introduce f2fs_submit_merged_bio_cond
f2fs use single bio buffer per type data (META/NODE/DATA) for caching
writes locating in continuous block address as many as possible, after
submitting, these writes may be still cached in bio buffer, so we have
to flush cached writes in bio buffer by calling f2fs_submit_merged_bio.
Unfortunately, in the scenario of high concurrency, bio buffer could be
flushed by someone else before we submit it as below reasons:
a) there is no space in bio buffer.
b) add a request of different type (SYNC, ASYNC).
c) add a discontinuous block address.
For this condition, f2fs_submit_merged_bio will be devastating, because
it could break the following merging of writes in bio buffer, split one
big bio into two smaller one.
This patch introduces f2fs_submit_merged_bio_cond which can do a
conditional submitting with bio buffer, before submitting it will judge
whether:
- page in DATA type bio buffer is matching with specified page;
- page in DATA type bio buffer is belong to specified inode;
- page in NODE type bio buffer is belong to specified inode;
If there is no eligible page in bio buffer, we will skip submitting step,
result in gaining more chance to merge consecutive block IOs in bio cache.
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-01-18 18:28:11 +08:00
|
|
|
return true;
|
2016-01-18 18:24:59 +08:00
|
|
|
|
f2fs: add bio cache for IPU
SQLite in Wal mode may trigger sequential IPU write in db-wal file, after
commit d1b3e72d5490 ("f2fs: submit bio of in-place-update pages"), we
lost the chance of merging page in inner managed bio cache, result in
submitting more small-sized IO.
So let's add temporary bio in writepages() to cache mergeable write IO as
much as possible.
Test case:
1. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
2. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
Before:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65552, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65560, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65568, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65576, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65584, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65592, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65600, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65608, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65616, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65624, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65632, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65640, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65648, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65656, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65664, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57352, size = 4096
After:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 65536
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57368, size = 4096
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-02-19 16:15:29 +08:00
|
|
|
bio_for_each_segment_all(bvec, bio, iter_all) {
|
2016-01-18 18:24:59 +08:00
|
|
|
|
2019-05-21 00:29:39 +08:00
|
|
|
target = bvec->bv_page;
|
|
|
|
if (fscrypt_is_bounce_page(target))
|
|
|
|
target = fscrypt_pagecache_page(target);
|
2016-01-18 18:24:59 +08:00
|
|
|
|
f2fs: introduce f2fs_submit_merged_bio_cond
f2fs use single bio buffer per type data (META/NODE/DATA) for caching
writes locating in continuous block address as many as possible, after
submitting, these writes may be still cached in bio buffer, so we have
to flush cached writes in bio buffer by calling f2fs_submit_merged_bio.
Unfortunately, in the scenario of high concurrency, bio buffer could be
flushed by someone else before we submit it as below reasons:
a) there is no space in bio buffer.
b) add a request of different type (SYNC, ASYNC).
c) add a discontinuous block address.
For this condition, f2fs_submit_merged_bio will be devastating, because
it could break the following merging of writes in bio buffer, split one
big bio into two smaller one.
This patch introduces f2fs_submit_merged_bio_cond which can do a
conditional submitting with bio buffer, before submitting it will judge
whether:
- page in DATA type bio buffer is matching with specified page;
- page in DATA type bio buffer is belong to specified inode;
- page in NODE type bio buffer is belong to specified inode;
If there is no eligible page in bio buffer, we will skip submitting step,
result in gaining more chance to merge consecutive block IOs in bio cache.
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-01-18 18:28:11 +08:00
|
|
|
if (inode && inode == target->mapping->host)
|
|
|
|
return true;
|
2018-09-27 23:41:16 +08:00
|
|
|
if (page && page == target)
|
|
|
|
return true;
|
f2fs: introduce f2fs_submit_merged_bio_cond
f2fs use single bio buffer per type data (META/NODE/DATA) for caching
writes locating in continuous block address as many as possible, after
submitting, these writes may be still cached in bio buffer, so we have
to flush cached writes in bio buffer by calling f2fs_submit_merged_bio.
Unfortunately, in the scenario of high concurrency, bio buffer could be
flushed by someone else before we submit it as below reasons:
a) there is no space in bio buffer.
b) add a request of different type (SYNC, ASYNC).
c) add a discontinuous block address.
For this condition, f2fs_submit_merged_bio will be devastating, because
it could break the following merging of writes in bio buffer, split one
big bio into two smaller one.
This patch introduces f2fs_submit_merged_bio_cond which can do a
conditional submitting with bio buffer, before submitting it will judge
whether:
- page in DATA type bio buffer is matching with specified page;
- page in DATA type bio buffer is belong to specified inode;
- page in NODE type bio buffer is belong to specified inode;
If there is no eligible page in bio buffer, we will skip submitting step,
result in gaining more chance to merge consecutive block IOs in bio cache.
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-01-18 18:28:11 +08:00
|
|
|
if (ino && ino == ino_of_node(target))
|
2016-01-18 18:24:59 +08:00
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2017-05-11 02:28:38 +08:00
|
|
|
static void __f2fs_submit_merged_write(struct f2fs_sb_info *sbi,
|
2017-05-11 02:18:25 +08:00
|
|
|
enum page_type type, enum temp_type temp)
|
2013-11-30 11:51:14 +08:00
|
|
|
{
|
|
|
|
enum page_type btype = PAGE_TYPE_OF_BIO(type);
|
2017-05-11 02:18:25 +08:00
|
|
|
struct f2fs_bio_info *io = sbi->write_io[btype] + temp;
|
2013-11-30 11:51:14 +08:00
|
|
|
|
2014-03-22 14:57:23 +08:00
|
|
|
down_write(&io->io_rwsem);
|
2013-12-11 12:54:01 +08:00
|
|
|
|
|
|
|
/* change META to META_FLUSH in the checkpoint procedure */
|
|
|
|
if (type >= META_FLUSH) {
|
|
|
|
io->fio.type = META_FLUSH;
|
2016-06-06 03:31:55 +08:00
|
|
|
io->fio.op = REQ_OP_WRITE;
|
2017-05-02 23:03:47 +08:00
|
|
|
io->fio.op_flags = REQ_META | REQ_PRIO | REQ_SYNC;
|
2016-11-01 21:40:10 +08:00
|
|
|
if (!test_opt(sbi, NOBARRIER))
|
2017-02-07 05:57:58 +08:00
|
|
|
io->fio.op_flags |= REQ_PREFLUSH | REQ_FUA;
|
2013-12-11 12:54:01 +08:00
|
|
|
}
|
|
|
|
__submit_merged_bio(io);
|
2014-03-22 14:57:23 +08:00
|
|
|
up_write(&io->io_rwsem);
|
2013-11-30 11:51:14 +08:00
|
|
|
}
|
|
|
|
|
2017-05-11 02:18:25 +08:00
|
|
|
static void __submit_merged_write_cond(struct f2fs_sb_info *sbi,
|
2018-09-27 23:41:16 +08:00
|
|
|
struct inode *inode, struct page *page,
|
|
|
|
nid_t ino, enum page_type type, bool force)
|
f2fs: introduce f2fs_submit_merged_bio_cond
f2fs use single bio buffer per type data (META/NODE/DATA) for caching
writes locating in continuous block address as many as possible, after
submitting, these writes may be still cached in bio buffer, so we have
to flush cached writes in bio buffer by calling f2fs_submit_merged_bio.
Unfortunately, in the scenario of high concurrency, bio buffer could be
flushed by someone else before we submit it as below reasons:
a) there is no space in bio buffer.
b) add a request of different type (SYNC, ASYNC).
c) add a discontinuous block address.
For this condition, f2fs_submit_merged_bio will be devastating, because
it could break the following merging of writes in bio buffer, split one
big bio into two smaller one.
This patch introduces f2fs_submit_merged_bio_cond which can do a
conditional submitting with bio buffer, before submitting it will judge
whether:
- page in DATA type bio buffer is matching with specified page;
- page in DATA type bio buffer is belong to specified inode;
- page in NODE type bio buffer is belong to specified inode;
If there is no eligible page in bio buffer, we will skip submitting step,
result in gaining more chance to merge consecutive block IOs in bio cache.
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-01-18 18:28:11 +08:00
|
|
|
{
|
2017-05-11 02:18:25 +08:00
|
|
|
enum temp_type temp;
|
2018-11-13 11:57:32 +08:00
|
|
|
bool ret = true;
|
2017-05-11 02:18:25 +08:00
|
|
|
|
|
|
|
for (temp = HOT; temp < NR_TEMP_TYPE; temp++) {
|
2018-11-13 11:57:32 +08:00
|
|
|
if (!force) {
|
|
|
|
enum page_type btype = PAGE_TYPE_OF_BIO(type);
|
|
|
|
struct f2fs_bio_info *io = sbi->write_io[btype] + temp;
|
2017-05-11 02:18:25 +08:00
|
|
|
|
2018-11-13 11:57:32 +08:00
|
|
|
down_read(&io->io_rwsem);
|
f2fs: add bio cache for IPU
SQLite in Wal mode may trigger sequential IPU write in db-wal file, after
commit d1b3e72d5490 ("f2fs: submit bio of in-place-update pages"), we
lost the chance of merging page in inner managed bio cache, result in
submitting more small-sized IO.
So let's add temporary bio in writepages() to cache mergeable write IO as
much as possible.
Test case:
1. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
2. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
Before:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65552, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65560, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65568, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65576, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65584, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65592, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65600, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65608, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65616, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65624, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65632, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65640, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65648, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65656, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65664, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57352, size = 4096
After:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 65536
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57368, size = 4096
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-02-19 16:15:29 +08:00
|
|
|
ret = __has_merged_page(io->bio, inode, page, ino);
|
2018-11-13 11:57:32 +08:00
|
|
|
up_read(&io->io_rwsem);
|
|
|
|
}
|
|
|
|
if (ret)
|
|
|
|
__f2fs_submit_merged_write(sbi, type, temp);
|
2017-05-11 02:18:25 +08:00
|
|
|
|
|
|
|
/* TODO: use HOT temp only for meta pages now. */
|
|
|
|
if (type >= META)
|
|
|
|
break;
|
|
|
|
}
|
f2fs: introduce f2fs_submit_merged_bio_cond
f2fs use single bio buffer per type data (META/NODE/DATA) for caching
writes locating in continuous block address as many as possible, after
submitting, these writes may be still cached in bio buffer, so we have
to flush cached writes in bio buffer by calling f2fs_submit_merged_bio.
Unfortunately, in the scenario of high concurrency, bio buffer could be
flushed by someone else before we submit it as below reasons:
a) there is no space in bio buffer.
b) add a request of different type (SYNC, ASYNC).
c) add a discontinuous block address.
For this condition, f2fs_submit_merged_bio will be devastating, because
it could break the following merging of writes in bio buffer, split one
big bio into two smaller one.
This patch introduces f2fs_submit_merged_bio_cond which can do a
conditional submitting with bio buffer, before submitting it will judge
whether:
- page in DATA type bio buffer is matching with specified page;
- page in DATA type bio buffer is belong to specified inode;
- page in NODE type bio buffer is belong to specified inode;
If there is no eligible page in bio buffer, we will skip submitting step,
result in gaining more chance to merge consecutive block IOs in bio cache.
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-01-18 18:28:11 +08:00
|
|
|
}
|
|
|
|
|
2017-05-11 02:28:38 +08:00
|
|
|
void f2fs_submit_merged_write(struct f2fs_sb_info *sbi, enum page_type type)
|
f2fs: introduce f2fs_submit_merged_bio_cond
f2fs use single bio buffer per type data (META/NODE/DATA) for caching
writes locating in continuous block address as many as possible, after
submitting, these writes may be still cached in bio buffer, so we have
to flush cached writes in bio buffer by calling f2fs_submit_merged_bio.
Unfortunately, in the scenario of high concurrency, bio buffer could be
flushed by someone else before we submit it as below reasons:
a) there is no space in bio buffer.
b) add a request of different type (SYNC, ASYNC).
c) add a discontinuous block address.
For this condition, f2fs_submit_merged_bio will be devastating, because
it could break the following merging of writes in bio buffer, split one
big bio into two smaller one.
This patch introduces f2fs_submit_merged_bio_cond which can do a
conditional submitting with bio buffer, before submitting it will judge
whether:
- page in DATA type bio buffer is matching with specified page;
- page in DATA type bio buffer is belong to specified inode;
- page in NODE type bio buffer is belong to specified inode;
If there is no eligible page in bio buffer, we will skip submitting step,
result in gaining more chance to merge consecutive block IOs in bio cache.
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-01-18 18:28:11 +08:00
|
|
|
{
|
2019-04-06 18:59:36 +08:00
|
|
|
__submit_merged_write_cond(sbi, NULL, NULL, 0, type, true);
|
f2fs: introduce f2fs_submit_merged_bio_cond
f2fs use single bio buffer per type data (META/NODE/DATA) for caching
writes locating in continuous block address as many as possible, after
submitting, these writes may be still cached in bio buffer, so we have
to flush cached writes in bio buffer by calling f2fs_submit_merged_bio.
Unfortunately, in the scenario of high concurrency, bio buffer could be
flushed by someone else before we submit it as below reasons:
a) there is no space in bio buffer.
b) add a request of different type (SYNC, ASYNC).
c) add a discontinuous block address.
For this condition, f2fs_submit_merged_bio will be devastating, because
it could break the following merging of writes in bio buffer, split one
big bio into two smaller one.
This patch introduces f2fs_submit_merged_bio_cond which can do a
conditional submitting with bio buffer, before submitting it will judge
whether:
- page in DATA type bio buffer is matching with specified page;
- page in DATA type bio buffer is belong to specified inode;
- page in NODE type bio buffer is belong to specified inode;
If there is no eligible page in bio buffer, we will skip submitting step,
result in gaining more chance to merge consecutive block IOs in bio cache.
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-01-18 18:28:11 +08:00
|
|
|
}
|
|
|
|
|
2017-05-11 02:28:38 +08:00
|
|
|
void f2fs_submit_merged_write_cond(struct f2fs_sb_info *sbi,
|
2018-09-27 23:41:16 +08:00
|
|
|
struct inode *inode, struct page *page,
|
|
|
|
nid_t ino, enum page_type type)
|
f2fs: introduce f2fs_submit_merged_bio_cond
f2fs use single bio buffer per type data (META/NODE/DATA) for caching
writes locating in continuous block address as many as possible, after
submitting, these writes may be still cached in bio buffer, so we have
to flush cached writes in bio buffer by calling f2fs_submit_merged_bio.
Unfortunately, in the scenario of high concurrency, bio buffer could be
flushed by someone else before we submit it as below reasons:
a) there is no space in bio buffer.
b) add a request of different type (SYNC, ASYNC).
c) add a discontinuous block address.
For this condition, f2fs_submit_merged_bio will be devastating, because
it could break the following merging of writes in bio buffer, split one
big bio into two smaller one.
This patch introduces f2fs_submit_merged_bio_cond which can do a
conditional submitting with bio buffer, before submitting it will judge
whether:
- page in DATA type bio buffer is matching with specified page;
- page in DATA type bio buffer is belong to specified inode;
- page in NODE type bio buffer is belong to specified inode;
If there is no eligible page in bio buffer, we will skip submitting step,
result in gaining more chance to merge consecutive block IOs in bio cache.
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-01-18 18:28:11 +08:00
|
|
|
{
|
2018-09-27 23:41:16 +08:00
|
|
|
__submit_merged_write_cond(sbi, inode, page, ino, type, false);
|
f2fs: introduce f2fs_submit_merged_bio_cond
f2fs use single bio buffer per type data (META/NODE/DATA) for caching
writes locating in continuous block address as many as possible, after
submitting, these writes may be still cached in bio buffer, so we have
to flush cached writes in bio buffer by calling f2fs_submit_merged_bio.
Unfortunately, in the scenario of high concurrency, bio buffer could be
flushed by someone else before we submit it as below reasons:
a) there is no space in bio buffer.
b) add a request of different type (SYNC, ASYNC).
c) add a discontinuous block address.
For this condition, f2fs_submit_merged_bio will be devastating, because
it could break the following merging of writes in bio buffer, split one
big bio into two smaller one.
This patch introduces f2fs_submit_merged_bio_cond which can do a
conditional submitting with bio buffer, before submitting it will judge
whether:
- page in DATA type bio buffer is matching with specified page;
- page in DATA type bio buffer is belong to specified inode;
- page in NODE type bio buffer is belong to specified inode;
If there is no eligible page in bio buffer, we will skip submitting step,
result in gaining more chance to merge consecutive block IOs in bio cache.
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-01-18 18:28:11 +08:00
|
|
|
}
|
|
|
|
|
2017-05-11 02:28:38 +08:00
|
|
|
void f2fs_flush_merged_writes(struct f2fs_sb_info *sbi)
|
2016-02-24 17:17:55 +08:00
|
|
|
{
|
2017-05-11 02:28:38 +08:00
|
|
|
f2fs_submit_merged_write(sbi, DATA);
|
|
|
|
f2fs_submit_merged_write(sbi, NODE);
|
|
|
|
f2fs_submit_merged_write(sbi, META);
|
2016-02-24 17:17:55 +08:00
|
|
|
}
|
|
|
|
|
2013-11-30 11:51:14 +08:00
|
|
|
/*
|
|
|
|
* Fill the locked page with data located in the block address.
|
2017-04-06 03:49:44 +08:00
|
|
|
* A caller needs to unlock the page on failure.
|
2013-11-30 11:51:14 +08:00
|
|
|
*/
|
2015-04-24 05:38:15 +08:00
|
|
|
int f2fs_submit_page_bio(struct f2fs_io_info *fio)
|
2013-11-30 11:51:14 +08:00
|
|
|
{
|
|
|
|
struct bio *bio;
|
2015-05-16 07:26:10 +08:00
|
|
|
struct page *page = fio->encrypted_page ?
|
|
|
|
fio->encrypted_page : fio->page;
|
2013-11-30 11:51:14 +08:00
|
|
|
|
f2fs: fix to do sanity check with block address in main area
This patch add to do sanity check with below field:
- cp_pack_total_block_count
- blkaddr of data/node
- extent info
- Overview
BUG() in verify_block_addr() when writing to a corrupted f2fs image
- Reproduce (4.18 upstream kernel)
- POC (poc.c)
static void activity(char *mpoint) {
char *foo_bar_baz;
int err;
static int buf[8192];
memset(buf, 0, sizeof(buf));
err = asprintf(&foo_bar_baz, "%s/foo/bar/baz", mpoint);
int fd = open(foo_bar_baz, O_RDWR | O_TRUNC, 0777);
if (fd >= 0) {
write(fd, (char *)buf, sizeof(buf));
fdatasync(fd);
close(fd);
}
}
int main(int argc, char *argv[]) {
activity(argv[1]);
return 0;
}
- Kernel message
[ 689.349473] F2FS-fs (loop0): Mounted with checkpoint version = 3
[ 699.728662] WARNING: CPU: 0 PID: 1309 at fs/f2fs/segment.c:2860 f2fs_inplace_write_data+0x232/0x240
[ 699.728670] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy
[ 699.729056] CPU: 0 PID: 1309 Comm: a.out Not tainted 4.18.0-rc1+ #4
[ 699.729064] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.729074] RIP: 0010:f2fs_inplace_write_data+0x232/0x240
[ 699.729076] Code: ff e9 cf fe ff ff 49 8d 7d 10 e8 39 45 ad ff 4d 8b 7d 10 be 04 00 00 00 49 8d 7f 48 e8 07 49 ad ff 45 8b 7f 48 e9 fb fe ff ff <0f> 0b f0 41 80 4d 48 04 e9 65 fe ff ff 90 66 66 66 66 90 55 48 8d
[ 699.729130] RSP: 0018:ffff8801f43af568 EFLAGS: 00010202
[ 699.729139] RAX: 000000000000003f RBX: ffff8801f43af7b8 RCX: ffffffffb88c9113
[ 699.729142] RDX: 0000000000000003 RSI: dffffc0000000000 RDI: ffff8802024e5540
[ 699.729144] RBP: ffff8801f43af590 R08: 0000000000000009 R09: ffffffffffffffe8
[ 699.729147] R10: 0000000000000001 R11: ffffed0039b0596a R12: ffff8802024e5540
[ 699.729149] R13: ffff8801f0335500 R14: ffff8801e3e7a700 R15: ffff8801e1ee4450
[ 699.729154] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.729156] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.729159] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.729171] Call Trace:
[ 699.729192] f2fs_do_write_data_page+0x2e2/0xe00
[ 699.729203] ? f2fs_should_update_outplace+0xd0/0xd0
[ 699.729238] ? memcg_drain_all_list_lrus+0x280/0x280
[ 699.729269] ? __radix_tree_replace+0xa3/0x120
[ 699.729276] __write_data_page+0x5c7/0xe30
[ 699.729291] ? kasan_check_read+0x11/0x20
[ 699.729310] ? page_mapped+0x8a/0x110
[ 699.729321] ? page_mkclean+0xe9/0x160
[ 699.729327] ? f2fs_do_write_data_page+0xe00/0xe00
[ 699.729331] ? invalid_page_referenced_vma+0x130/0x130
[ 699.729345] ? clear_page_dirty_for_io+0x332/0x450
[ 699.729351] f2fs_write_cache_pages+0x4ca/0x860
[ 699.729358] ? __write_data_page+0xe30/0xe30
[ 699.729374] ? percpu_counter_add_batch+0x22/0xa0
[ 699.729380] ? kasan_check_write+0x14/0x20
[ 699.729391] ? _raw_spin_lock+0x17/0x40
[ 699.729403] ? f2fs_mark_inode_dirty_sync.part.18+0x16/0x30
[ 699.729413] ? iov_iter_advance+0x113/0x640
[ 699.729418] ? f2fs_write_end+0x133/0x2e0
[ 699.729423] ? balance_dirty_pages_ratelimited+0x239/0x640
[ 699.729428] f2fs_write_data_pages+0x329/0x520
[ 699.729433] ? generic_perform_write+0x250/0x320
[ 699.729438] ? f2fs_write_cache_pages+0x860/0x860
[ 699.729454] ? current_time+0x110/0x110
[ 699.729459] ? f2fs_preallocate_blocks+0x1ef/0x370
[ 699.729464] do_writepages+0x37/0xb0
[ 699.729468] ? f2fs_write_cache_pages+0x860/0x860
[ 699.729472] ? do_writepages+0x37/0xb0
[ 699.729478] __filemap_fdatawrite_range+0x19a/0x1f0
[ 699.729483] ? delete_from_page_cache_batch+0x4e0/0x4e0
[ 699.729496] ? __vfs_write+0x2b2/0x410
[ 699.729501] file_write_and_wait_range+0x66/0xb0
[ 699.729506] f2fs_do_sync_file+0x1f9/0xd90
[ 699.729511] ? truncate_partial_data_page+0x290/0x290
[ 699.729521] ? __sb_end_write+0x30/0x50
[ 699.729526] ? vfs_write+0x20f/0x260
[ 699.729530] f2fs_sync_file+0x9a/0xb0
[ 699.729534] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.729548] vfs_fsync_range+0x68/0x100
[ 699.729554] ? __fget_light+0xc9/0xe0
[ 699.729558] do_fsync+0x3d/0x70
[ 699.729562] __x64_sys_fdatasync+0x24/0x30
[ 699.729585] do_syscall_64+0x78/0x170
[ 699.729595] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 699.729613] RIP: 0033:0x7f9bf930d800
[ 699.729615] Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 83 3d 49 bf 2c 00 00 75 10 b8 4b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 be 78 01 00 48 89 04 24
[ 699.729668] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.729673] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.729675] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.729678] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.729680] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.729683] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.729687] ---[ end trace 4ce02f25ff7d3df5 ]---
[ 699.729782] ------------[ cut here ]------------
[ 699.729785] kernel BUG at fs/f2fs/segment.h:654!
[ 699.731055] invalid opcode: 0000 [#1] SMP KASAN PTI
[ 699.732104] CPU: 0 PID: 1309 Comm: a.out Tainted: G W 4.18.0-rc1+ #4
[ 699.733684] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.735611] RIP: 0010:f2fs_submit_page_bio+0x29b/0x730
[ 699.736649] Code: 54 49 8d bd 18 04 00 00 e8 b2 59 af ff 41 8b 8d 18 04 00 00 8b 45 b8 41 d3 e6 44 01 f0 4c 8d 73 14 41 39 c7 0f 82 37 fe ff ff <0f> 0b 65 8b 05 2c 04 77 47 89 c0 48 0f a3 05 52 c1 d5 01 0f 92 c0
[ 699.740524] RSP: 0018:ffff8801f43af508 EFLAGS: 00010283
[ 699.741573] RAX: 0000000000000000 RBX: ffff8801f43af7b8 RCX: ffffffffb88a7cef
[ 699.743006] RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff8801e3e7a64c
[ 699.744426] RBP: ffff8801f43af558 R08: ffffed003e066b55 R09: ffffed003e066b55
[ 699.745833] R10: 0000000000000001 R11: ffffed003e066b54 R12: ffffea0007876940
[ 699.747256] R13: ffff8801f0335500 R14: ffff8801e3e7a600 R15: 0000000000000001
[ 699.748683] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.750293] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.751462] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.752874] Call Trace:
[ 699.753386] ? f2fs_inplace_write_data+0x93/0x240
[ 699.754341] f2fs_inplace_write_data+0xd2/0x240
[ 699.755271] f2fs_do_write_data_page+0x2e2/0xe00
[ 699.756214] ? f2fs_should_update_outplace+0xd0/0xd0
[ 699.757215] ? memcg_drain_all_list_lrus+0x280/0x280
[ 699.758209] ? __radix_tree_replace+0xa3/0x120
[ 699.759164] __write_data_page+0x5c7/0xe30
[ 699.760002] ? kasan_check_read+0x11/0x20
[ 699.760823] ? page_mapped+0x8a/0x110
[ 699.761573] ? page_mkclean+0xe9/0x160
[ 699.762345] ? f2fs_do_write_data_page+0xe00/0xe00
[ 699.763332] ? invalid_page_referenced_vma+0x130/0x130
[ 699.764374] ? clear_page_dirty_for_io+0x332/0x450
[ 699.765347] f2fs_write_cache_pages+0x4ca/0x860
[ 699.766276] ? __write_data_page+0xe30/0xe30
[ 699.767161] ? percpu_counter_add_batch+0x22/0xa0
[ 699.768112] ? kasan_check_write+0x14/0x20
[ 699.768951] ? _raw_spin_lock+0x17/0x40
[ 699.769739] ? f2fs_mark_inode_dirty_sync.part.18+0x16/0x30
[ 699.770885] ? iov_iter_advance+0x113/0x640
[ 699.771743] ? f2fs_write_end+0x133/0x2e0
[ 699.772569] ? balance_dirty_pages_ratelimited+0x239/0x640
[ 699.773680] f2fs_write_data_pages+0x329/0x520
[ 699.774603] ? generic_perform_write+0x250/0x320
[ 699.775544] ? f2fs_write_cache_pages+0x860/0x860
[ 699.776510] ? current_time+0x110/0x110
[ 699.777299] ? f2fs_preallocate_blocks+0x1ef/0x370
[ 699.778279] do_writepages+0x37/0xb0
[ 699.779026] ? f2fs_write_cache_pages+0x860/0x860
[ 699.779978] ? do_writepages+0x37/0xb0
[ 699.780755] __filemap_fdatawrite_range+0x19a/0x1f0
[ 699.781746] ? delete_from_page_cache_batch+0x4e0/0x4e0
[ 699.782820] ? __vfs_write+0x2b2/0x410
[ 699.783597] file_write_and_wait_range+0x66/0xb0
[ 699.784540] f2fs_do_sync_file+0x1f9/0xd90
[ 699.785381] ? truncate_partial_data_page+0x290/0x290
[ 699.786415] ? __sb_end_write+0x30/0x50
[ 699.787204] ? vfs_write+0x20f/0x260
[ 699.787941] f2fs_sync_file+0x9a/0xb0
[ 699.788694] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.789572] vfs_fsync_range+0x68/0x100
[ 699.790360] ? __fget_light+0xc9/0xe0
[ 699.791128] do_fsync+0x3d/0x70
[ 699.791779] __x64_sys_fdatasync+0x24/0x30
[ 699.792614] do_syscall_64+0x78/0x170
[ 699.793371] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 699.794406] RIP: 0033:0x7f9bf930d800
[ 699.795134] Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 83 3d 49 bf 2c 00 00 75 10 b8 4b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 be 78 01 00 48 89 04 24
[ 699.798960] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.800483] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.801923] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.803373] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.804798] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.806233] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.807667] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy
[ 699.817079] ---[ end trace 4ce02f25ff7d3df6 ]---
[ 699.818068] RIP: 0010:f2fs_submit_page_bio+0x29b/0x730
[ 699.819114] Code: 54 49 8d bd 18 04 00 00 e8 b2 59 af ff 41 8b 8d 18 04 00 00 8b 45 b8 41 d3 e6 44 01 f0 4c 8d 73 14 41 39 c7 0f 82 37 fe ff ff <0f> 0b 65 8b 05 2c 04 77 47 89 c0 48 0f a3 05 52 c1 d5 01 0f 92 c0
[ 699.822919] RSP: 0018:ffff8801f43af508 EFLAGS: 00010283
[ 699.823977] RAX: 0000000000000000 RBX: ffff8801f43af7b8 RCX: ffffffffb88a7cef
[ 699.825436] RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff8801e3e7a64c
[ 699.826881] RBP: ffff8801f43af558 R08: ffffed003e066b55 R09: ffffed003e066b55
[ 699.828292] R10: 0000000000000001 R11: ffffed003e066b54 R12: ffffea0007876940
[ 699.829750] R13: ffff8801f0335500 R14: ffff8801e3e7a600 R15: 0000000000000001
[ 699.831192] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.832793] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.833981] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.835556] ==================================================================
[ 699.837029] BUG: KASAN: stack-out-of-bounds in update_stack_state+0x38c/0x3e0
[ 699.838462] Read of size 8 at addr ffff8801f43af970 by task a.out/1309
[ 699.840086] CPU: 0 PID: 1309 Comm: a.out Tainted: G D W 4.18.0-rc1+ #4
[ 699.841603] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.843475] Call Trace:
[ 699.843982] dump_stack+0x7b/0xb5
[ 699.844661] print_address_description+0x70/0x290
[ 699.845607] kasan_report+0x291/0x390
[ 699.846351] ? update_stack_state+0x38c/0x3e0
[ 699.853831] __asan_load8+0x54/0x90
[ 699.854569] update_stack_state+0x38c/0x3e0
[ 699.855428] ? __read_once_size_nocheck.constprop.7+0x20/0x20
[ 699.856601] ? __save_stack_trace+0x5e/0x100
[ 699.857476] unwind_next_frame.part.5+0x18e/0x490
[ 699.858448] ? unwind_dump+0x290/0x290
[ 699.859217] ? clear_page_dirty_for_io+0x332/0x450
[ 699.860185] __unwind_start+0x106/0x190
[ 699.860974] __save_stack_trace+0x5e/0x100
[ 699.861808] ? __save_stack_trace+0x5e/0x100
[ 699.862691] ? unlink_anon_vmas+0xba/0x2c0
[ 699.863525] save_stack_trace+0x1f/0x30
[ 699.864312] save_stack+0x46/0xd0
[ 699.864993] ? __alloc_pages_slowpath+0x1420/0x1420
[ 699.865990] ? flush_tlb_mm_range+0x15e/0x220
[ 699.866889] ? kasan_check_write+0x14/0x20
[ 699.867724] ? __dec_node_state+0x92/0xb0
[ 699.868543] ? lock_page_memcg+0x85/0xf0
[ 699.869350] ? unlock_page_memcg+0x16/0x80
[ 699.870185] ? page_remove_rmap+0x198/0x520
[ 699.871048] ? mark_page_accessed+0x133/0x200
[ 699.871930] ? _cond_resched+0x1a/0x50
[ 699.872700] ? unmap_page_range+0xcd4/0xe50
[ 699.873551] ? rb_next+0x58/0x80
[ 699.874217] ? rb_next+0x58/0x80
[ 699.874895] __kasan_slab_free+0x13c/0x1a0
[ 699.875734] ? unlink_anon_vmas+0xba/0x2c0
[ 699.876563] kasan_slab_free+0xe/0x10
[ 699.877315] kmem_cache_free+0x89/0x1e0
[ 699.878095] unlink_anon_vmas+0xba/0x2c0
[ 699.878913] free_pgtables+0x101/0x1b0
[ 699.879677] exit_mmap+0x146/0x2a0
[ 699.880378] ? __ia32_sys_munmap+0x50/0x50
[ 699.881214] ? kasan_check_read+0x11/0x20
[ 699.882052] ? mm_update_next_owner+0x322/0x380
[ 699.882985] mmput+0x8b/0x1d0
[ 699.883602] do_exit+0x43a/0x1390
[ 699.884288] ? mm_update_next_owner+0x380/0x380
[ 699.885212] ? f2fs_sync_file+0x9a/0xb0
[ 699.885995] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.886877] ? vfs_fsync_range+0x68/0x100
[ 699.887694] ? __fget_light+0xc9/0xe0
[ 699.888442] ? do_fsync+0x3d/0x70
[ 699.889118] ? __x64_sys_fdatasync+0x24/0x30
[ 699.889996] rewind_stack_do_exit+0x17/0x20
[ 699.890860] RIP: 0033:0x7f9bf930d800
[ 699.891585] Code: Bad RIP value.
[ 699.892268] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.893781] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.895220] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.896643] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.898069] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.899505] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.901241] The buggy address belongs to the page:
[ 699.902215] page:ffffea0007d0ebc0 count:0 mapcount:0 mapping:0000000000000000 index:0x0
[ 699.903811] flags: 0x2ffff0000000000()
[ 699.904585] raw: 02ffff0000000000 0000000000000000 ffffffff07d00101 0000000000000000
[ 699.906125] raw: 0000000000000000 0000000000240000 00000000ffffffff 0000000000000000
[ 699.907673] page dumped because: kasan: bad access detected
[ 699.909108] Memory state around the buggy address:
[ 699.910077] ffff8801f43af800: 00 f1 f1 f1 f1 00 f4 f4 f4 f3 f3 f3 f3 00 00 00
[ 699.911528] ffff8801f43af880: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[ 699.912953] >ffff8801f43af900: 00 00 00 00 00 00 00 00 f1 01 f4 f4 f4 f2 f2 f2
[ 699.914392] ^
[ 699.915758] ffff8801f43af980: f2 00 f4 f4 00 00 00 00 f2 00 00 00 00 00 00 00
[ 699.917193] ffff8801f43afa00: 00 00 00 00 00 00 00 00 00 f3 f3 f3 00 00 00 00
[ 699.918634] ==================================================================
- Location
https://elixir.bootlin.com/linux/v4.18-rc1/source/fs/f2fs/segment.h#L644
Reported-by Wen Xu <wen.xu@gatech.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-08-01 19:13:44 +08:00
|
|
|
if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr,
|
f2fs: introduce DATA_GENERIC_ENHANCE
Previously, f2fs_is_valid_blkaddr(, blkaddr, DATA_GENERIC) will check
whether @blkaddr locates in main area or not.
That check is weak, since the block address in range of main area can
point to the address which is not valid in segment info table, and we
can not detect such condition, we may suffer worse corruption as system
continues running.
So this patch introduce DATA_GENERIC_ENHANCE to enhance the sanity check
which trigger SIT bitmap check rather than only range check.
This patch did below changes as wel:
- set SBI_NEED_FSCK in f2fs_is_valid_blkaddr().
- get rid of is_valid_data_blkaddr() to avoid panic if blkaddr is invalid.
- introduce verify_fio_blkaddr() to wrap fio {new,old}_blkaddr validation check.
- spread blkaddr check in:
* f2fs_get_node_info()
* __read_out_blkaddrs()
* f2fs_submit_page_read()
* ra_data_block()
* do_recover_data()
This patch can fix bug reported from bugzilla below:
https://bugzilla.kernel.org/show_bug.cgi?id=203215
https://bugzilla.kernel.org/show_bug.cgi?id=203223
https://bugzilla.kernel.org/show_bug.cgi?id=203231
https://bugzilla.kernel.org/show_bug.cgi?id=203235
https://bugzilla.kernel.org/show_bug.cgi?id=203241
= Update by Jaegeuk Kim =
DATA_GENERIC_ENHANCE enhanced to validate block addresses on read/write paths.
But, xfstest/generic/446 compalins some generated kernel messages saying invalid
bitmap was detected when reading a block. The reaons is, when we get the
block addresses from extent_cache, there is no lock to synchronize it from
truncating the blocks in parallel.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-04-15 15:26:32 +08:00
|
|
|
fio->is_por ? META_POR : (__is_meta_io(fio) ?
|
|
|
|
META_GENERIC : DATA_GENERIC_ENHANCE)))
|
2019-06-20 11:36:14 +08:00
|
|
|
return -EFSCORRUPTED;
|
f2fs: fix to do sanity check with block address in main area
This patch add to do sanity check with below field:
- cp_pack_total_block_count
- blkaddr of data/node
- extent info
- Overview
BUG() in verify_block_addr() when writing to a corrupted f2fs image
- Reproduce (4.18 upstream kernel)
- POC (poc.c)
static void activity(char *mpoint) {
char *foo_bar_baz;
int err;
static int buf[8192];
memset(buf, 0, sizeof(buf));
err = asprintf(&foo_bar_baz, "%s/foo/bar/baz", mpoint);
int fd = open(foo_bar_baz, O_RDWR | O_TRUNC, 0777);
if (fd >= 0) {
write(fd, (char *)buf, sizeof(buf));
fdatasync(fd);
close(fd);
}
}
int main(int argc, char *argv[]) {
activity(argv[1]);
return 0;
}
- Kernel message
[ 689.349473] F2FS-fs (loop0): Mounted with checkpoint version = 3
[ 699.728662] WARNING: CPU: 0 PID: 1309 at fs/f2fs/segment.c:2860 f2fs_inplace_write_data+0x232/0x240
[ 699.728670] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy
[ 699.729056] CPU: 0 PID: 1309 Comm: a.out Not tainted 4.18.0-rc1+ #4
[ 699.729064] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.729074] RIP: 0010:f2fs_inplace_write_data+0x232/0x240
[ 699.729076] Code: ff e9 cf fe ff ff 49 8d 7d 10 e8 39 45 ad ff 4d 8b 7d 10 be 04 00 00 00 49 8d 7f 48 e8 07 49 ad ff 45 8b 7f 48 e9 fb fe ff ff <0f> 0b f0 41 80 4d 48 04 e9 65 fe ff ff 90 66 66 66 66 90 55 48 8d
[ 699.729130] RSP: 0018:ffff8801f43af568 EFLAGS: 00010202
[ 699.729139] RAX: 000000000000003f RBX: ffff8801f43af7b8 RCX: ffffffffb88c9113
[ 699.729142] RDX: 0000000000000003 RSI: dffffc0000000000 RDI: ffff8802024e5540
[ 699.729144] RBP: ffff8801f43af590 R08: 0000000000000009 R09: ffffffffffffffe8
[ 699.729147] R10: 0000000000000001 R11: ffffed0039b0596a R12: ffff8802024e5540
[ 699.729149] R13: ffff8801f0335500 R14: ffff8801e3e7a700 R15: ffff8801e1ee4450
[ 699.729154] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.729156] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.729159] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.729171] Call Trace:
[ 699.729192] f2fs_do_write_data_page+0x2e2/0xe00
[ 699.729203] ? f2fs_should_update_outplace+0xd0/0xd0
[ 699.729238] ? memcg_drain_all_list_lrus+0x280/0x280
[ 699.729269] ? __radix_tree_replace+0xa3/0x120
[ 699.729276] __write_data_page+0x5c7/0xe30
[ 699.729291] ? kasan_check_read+0x11/0x20
[ 699.729310] ? page_mapped+0x8a/0x110
[ 699.729321] ? page_mkclean+0xe9/0x160
[ 699.729327] ? f2fs_do_write_data_page+0xe00/0xe00
[ 699.729331] ? invalid_page_referenced_vma+0x130/0x130
[ 699.729345] ? clear_page_dirty_for_io+0x332/0x450
[ 699.729351] f2fs_write_cache_pages+0x4ca/0x860
[ 699.729358] ? __write_data_page+0xe30/0xe30
[ 699.729374] ? percpu_counter_add_batch+0x22/0xa0
[ 699.729380] ? kasan_check_write+0x14/0x20
[ 699.729391] ? _raw_spin_lock+0x17/0x40
[ 699.729403] ? f2fs_mark_inode_dirty_sync.part.18+0x16/0x30
[ 699.729413] ? iov_iter_advance+0x113/0x640
[ 699.729418] ? f2fs_write_end+0x133/0x2e0
[ 699.729423] ? balance_dirty_pages_ratelimited+0x239/0x640
[ 699.729428] f2fs_write_data_pages+0x329/0x520
[ 699.729433] ? generic_perform_write+0x250/0x320
[ 699.729438] ? f2fs_write_cache_pages+0x860/0x860
[ 699.729454] ? current_time+0x110/0x110
[ 699.729459] ? f2fs_preallocate_blocks+0x1ef/0x370
[ 699.729464] do_writepages+0x37/0xb0
[ 699.729468] ? f2fs_write_cache_pages+0x860/0x860
[ 699.729472] ? do_writepages+0x37/0xb0
[ 699.729478] __filemap_fdatawrite_range+0x19a/0x1f0
[ 699.729483] ? delete_from_page_cache_batch+0x4e0/0x4e0
[ 699.729496] ? __vfs_write+0x2b2/0x410
[ 699.729501] file_write_and_wait_range+0x66/0xb0
[ 699.729506] f2fs_do_sync_file+0x1f9/0xd90
[ 699.729511] ? truncate_partial_data_page+0x290/0x290
[ 699.729521] ? __sb_end_write+0x30/0x50
[ 699.729526] ? vfs_write+0x20f/0x260
[ 699.729530] f2fs_sync_file+0x9a/0xb0
[ 699.729534] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.729548] vfs_fsync_range+0x68/0x100
[ 699.729554] ? __fget_light+0xc9/0xe0
[ 699.729558] do_fsync+0x3d/0x70
[ 699.729562] __x64_sys_fdatasync+0x24/0x30
[ 699.729585] do_syscall_64+0x78/0x170
[ 699.729595] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 699.729613] RIP: 0033:0x7f9bf930d800
[ 699.729615] Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 83 3d 49 bf 2c 00 00 75 10 b8 4b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 be 78 01 00 48 89 04 24
[ 699.729668] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.729673] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.729675] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.729678] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.729680] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.729683] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.729687] ---[ end trace 4ce02f25ff7d3df5 ]---
[ 699.729782] ------------[ cut here ]------------
[ 699.729785] kernel BUG at fs/f2fs/segment.h:654!
[ 699.731055] invalid opcode: 0000 [#1] SMP KASAN PTI
[ 699.732104] CPU: 0 PID: 1309 Comm: a.out Tainted: G W 4.18.0-rc1+ #4
[ 699.733684] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.735611] RIP: 0010:f2fs_submit_page_bio+0x29b/0x730
[ 699.736649] Code: 54 49 8d bd 18 04 00 00 e8 b2 59 af ff 41 8b 8d 18 04 00 00 8b 45 b8 41 d3 e6 44 01 f0 4c 8d 73 14 41 39 c7 0f 82 37 fe ff ff <0f> 0b 65 8b 05 2c 04 77 47 89 c0 48 0f a3 05 52 c1 d5 01 0f 92 c0
[ 699.740524] RSP: 0018:ffff8801f43af508 EFLAGS: 00010283
[ 699.741573] RAX: 0000000000000000 RBX: ffff8801f43af7b8 RCX: ffffffffb88a7cef
[ 699.743006] RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff8801e3e7a64c
[ 699.744426] RBP: ffff8801f43af558 R08: ffffed003e066b55 R09: ffffed003e066b55
[ 699.745833] R10: 0000000000000001 R11: ffffed003e066b54 R12: ffffea0007876940
[ 699.747256] R13: ffff8801f0335500 R14: ffff8801e3e7a600 R15: 0000000000000001
[ 699.748683] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.750293] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.751462] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.752874] Call Trace:
[ 699.753386] ? f2fs_inplace_write_data+0x93/0x240
[ 699.754341] f2fs_inplace_write_data+0xd2/0x240
[ 699.755271] f2fs_do_write_data_page+0x2e2/0xe00
[ 699.756214] ? f2fs_should_update_outplace+0xd0/0xd0
[ 699.757215] ? memcg_drain_all_list_lrus+0x280/0x280
[ 699.758209] ? __radix_tree_replace+0xa3/0x120
[ 699.759164] __write_data_page+0x5c7/0xe30
[ 699.760002] ? kasan_check_read+0x11/0x20
[ 699.760823] ? page_mapped+0x8a/0x110
[ 699.761573] ? page_mkclean+0xe9/0x160
[ 699.762345] ? f2fs_do_write_data_page+0xe00/0xe00
[ 699.763332] ? invalid_page_referenced_vma+0x130/0x130
[ 699.764374] ? clear_page_dirty_for_io+0x332/0x450
[ 699.765347] f2fs_write_cache_pages+0x4ca/0x860
[ 699.766276] ? __write_data_page+0xe30/0xe30
[ 699.767161] ? percpu_counter_add_batch+0x22/0xa0
[ 699.768112] ? kasan_check_write+0x14/0x20
[ 699.768951] ? _raw_spin_lock+0x17/0x40
[ 699.769739] ? f2fs_mark_inode_dirty_sync.part.18+0x16/0x30
[ 699.770885] ? iov_iter_advance+0x113/0x640
[ 699.771743] ? f2fs_write_end+0x133/0x2e0
[ 699.772569] ? balance_dirty_pages_ratelimited+0x239/0x640
[ 699.773680] f2fs_write_data_pages+0x329/0x520
[ 699.774603] ? generic_perform_write+0x250/0x320
[ 699.775544] ? f2fs_write_cache_pages+0x860/0x860
[ 699.776510] ? current_time+0x110/0x110
[ 699.777299] ? f2fs_preallocate_blocks+0x1ef/0x370
[ 699.778279] do_writepages+0x37/0xb0
[ 699.779026] ? f2fs_write_cache_pages+0x860/0x860
[ 699.779978] ? do_writepages+0x37/0xb0
[ 699.780755] __filemap_fdatawrite_range+0x19a/0x1f0
[ 699.781746] ? delete_from_page_cache_batch+0x4e0/0x4e0
[ 699.782820] ? __vfs_write+0x2b2/0x410
[ 699.783597] file_write_and_wait_range+0x66/0xb0
[ 699.784540] f2fs_do_sync_file+0x1f9/0xd90
[ 699.785381] ? truncate_partial_data_page+0x290/0x290
[ 699.786415] ? __sb_end_write+0x30/0x50
[ 699.787204] ? vfs_write+0x20f/0x260
[ 699.787941] f2fs_sync_file+0x9a/0xb0
[ 699.788694] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.789572] vfs_fsync_range+0x68/0x100
[ 699.790360] ? __fget_light+0xc9/0xe0
[ 699.791128] do_fsync+0x3d/0x70
[ 699.791779] __x64_sys_fdatasync+0x24/0x30
[ 699.792614] do_syscall_64+0x78/0x170
[ 699.793371] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 699.794406] RIP: 0033:0x7f9bf930d800
[ 699.795134] Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 83 3d 49 bf 2c 00 00 75 10 b8 4b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 be 78 01 00 48 89 04 24
[ 699.798960] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.800483] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.801923] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.803373] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.804798] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.806233] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.807667] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy
[ 699.817079] ---[ end trace 4ce02f25ff7d3df6 ]---
[ 699.818068] RIP: 0010:f2fs_submit_page_bio+0x29b/0x730
[ 699.819114] Code: 54 49 8d bd 18 04 00 00 e8 b2 59 af ff 41 8b 8d 18 04 00 00 8b 45 b8 41 d3 e6 44 01 f0 4c 8d 73 14 41 39 c7 0f 82 37 fe ff ff <0f> 0b 65 8b 05 2c 04 77 47 89 c0 48 0f a3 05 52 c1 d5 01 0f 92 c0
[ 699.822919] RSP: 0018:ffff8801f43af508 EFLAGS: 00010283
[ 699.823977] RAX: 0000000000000000 RBX: ffff8801f43af7b8 RCX: ffffffffb88a7cef
[ 699.825436] RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff8801e3e7a64c
[ 699.826881] RBP: ffff8801f43af558 R08: ffffed003e066b55 R09: ffffed003e066b55
[ 699.828292] R10: 0000000000000001 R11: ffffed003e066b54 R12: ffffea0007876940
[ 699.829750] R13: ffff8801f0335500 R14: ffff8801e3e7a600 R15: 0000000000000001
[ 699.831192] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.832793] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.833981] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.835556] ==================================================================
[ 699.837029] BUG: KASAN: stack-out-of-bounds in update_stack_state+0x38c/0x3e0
[ 699.838462] Read of size 8 at addr ffff8801f43af970 by task a.out/1309
[ 699.840086] CPU: 0 PID: 1309 Comm: a.out Tainted: G D W 4.18.0-rc1+ #4
[ 699.841603] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.843475] Call Trace:
[ 699.843982] dump_stack+0x7b/0xb5
[ 699.844661] print_address_description+0x70/0x290
[ 699.845607] kasan_report+0x291/0x390
[ 699.846351] ? update_stack_state+0x38c/0x3e0
[ 699.853831] __asan_load8+0x54/0x90
[ 699.854569] update_stack_state+0x38c/0x3e0
[ 699.855428] ? __read_once_size_nocheck.constprop.7+0x20/0x20
[ 699.856601] ? __save_stack_trace+0x5e/0x100
[ 699.857476] unwind_next_frame.part.5+0x18e/0x490
[ 699.858448] ? unwind_dump+0x290/0x290
[ 699.859217] ? clear_page_dirty_for_io+0x332/0x450
[ 699.860185] __unwind_start+0x106/0x190
[ 699.860974] __save_stack_trace+0x5e/0x100
[ 699.861808] ? __save_stack_trace+0x5e/0x100
[ 699.862691] ? unlink_anon_vmas+0xba/0x2c0
[ 699.863525] save_stack_trace+0x1f/0x30
[ 699.864312] save_stack+0x46/0xd0
[ 699.864993] ? __alloc_pages_slowpath+0x1420/0x1420
[ 699.865990] ? flush_tlb_mm_range+0x15e/0x220
[ 699.866889] ? kasan_check_write+0x14/0x20
[ 699.867724] ? __dec_node_state+0x92/0xb0
[ 699.868543] ? lock_page_memcg+0x85/0xf0
[ 699.869350] ? unlock_page_memcg+0x16/0x80
[ 699.870185] ? page_remove_rmap+0x198/0x520
[ 699.871048] ? mark_page_accessed+0x133/0x200
[ 699.871930] ? _cond_resched+0x1a/0x50
[ 699.872700] ? unmap_page_range+0xcd4/0xe50
[ 699.873551] ? rb_next+0x58/0x80
[ 699.874217] ? rb_next+0x58/0x80
[ 699.874895] __kasan_slab_free+0x13c/0x1a0
[ 699.875734] ? unlink_anon_vmas+0xba/0x2c0
[ 699.876563] kasan_slab_free+0xe/0x10
[ 699.877315] kmem_cache_free+0x89/0x1e0
[ 699.878095] unlink_anon_vmas+0xba/0x2c0
[ 699.878913] free_pgtables+0x101/0x1b0
[ 699.879677] exit_mmap+0x146/0x2a0
[ 699.880378] ? __ia32_sys_munmap+0x50/0x50
[ 699.881214] ? kasan_check_read+0x11/0x20
[ 699.882052] ? mm_update_next_owner+0x322/0x380
[ 699.882985] mmput+0x8b/0x1d0
[ 699.883602] do_exit+0x43a/0x1390
[ 699.884288] ? mm_update_next_owner+0x380/0x380
[ 699.885212] ? f2fs_sync_file+0x9a/0xb0
[ 699.885995] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.886877] ? vfs_fsync_range+0x68/0x100
[ 699.887694] ? __fget_light+0xc9/0xe0
[ 699.888442] ? do_fsync+0x3d/0x70
[ 699.889118] ? __x64_sys_fdatasync+0x24/0x30
[ 699.889996] rewind_stack_do_exit+0x17/0x20
[ 699.890860] RIP: 0033:0x7f9bf930d800
[ 699.891585] Code: Bad RIP value.
[ 699.892268] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.893781] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.895220] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.896643] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.898069] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.899505] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.901241] The buggy address belongs to the page:
[ 699.902215] page:ffffea0007d0ebc0 count:0 mapcount:0 mapping:0000000000000000 index:0x0
[ 699.903811] flags: 0x2ffff0000000000()
[ 699.904585] raw: 02ffff0000000000 0000000000000000 ffffffff07d00101 0000000000000000
[ 699.906125] raw: 0000000000000000 0000000000240000 00000000ffffffff 0000000000000000
[ 699.907673] page dumped because: kasan: bad access detected
[ 699.909108] Memory state around the buggy address:
[ 699.910077] ffff8801f43af800: 00 f1 f1 f1 f1 00 f4 f4 f4 f3 f3 f3 f3 00 00 00
[ 699.911528] ffff8801f43af880: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[ 699.912953] >ffff8801f43af900: 00 00 00 00 00 00 00 00 f1 01 f4 f4 f4 f2 f2 f2
[ 699.914392] ^
[ 699.915758] ffff8801f43af980: f2 00 f4 f4 00 00 00 00 f2 00 00 00 00 00 00 00
[ 699.917193] ffff8801f43afa00: 00 00 00 00 00 00 00 00 00 f3 f3 f3 00 00 00 00
[ 699.918634] ==================================================================
- Location
https://elixir.bootlin.com/linux/v4.18-rc1/source/fs/f2fs/segment.h#L644
Reported-by Wen Xu <wen.xu@gatech.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-08-01 19:13:44 +08:00
|
|
|
|
2014-12-24 16:08:14 +08:00
|
|
|
trace_f2fs_submit_page_bio(page, fio);
|
2015-04-24 05:38:15 +08:00
|
|
|
f2fs_trace_ios(fio, 0);
|
2013-11-30 11:51:14 +08:00
|
|
|
|
|
|
|
/* Allocate a new bio */
|
2019-08-23 17:58:35 +08:00
|
|
|
bio = __bio_alloc(fio, 1);
|
2013-11-30 11:51:14 +08:00
|
|
|
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
|
2013-11-30 11:51:14 +08:00
|
|
|
bio_put(bio);
|
|
|
|
return -EFAULT;
|
|
|
|
}
|
2018-10-22 23:24:28 +08:00
|
|
|
|
|
|
|
if (fio->io_wbc && !is_read_io(fio->op))
|
2024-06-12 13:13:20 +08:00
|
|
|
wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
|
2018-10-22 23:24:28 +08:00
|
|
|
|
2016-06-06 03:31:55 +08:00
|
|
|
bio_set_op_attrs(bio, fio->op, fio->op_flags);
|
2013-11-30 11:51:14 +08:00
|
|
|
|
2018-10-17 01:20:53 +08:00
|
|
|
inc_page_count(fio->sbi, is_read_io(fio->op) ?
|
|
|
|
__read_io_type(page): WB_DATA_TYPE(fio->page));
|
2018-10-22 09:12:51 +08:00
|
|
|
|
|
|
|
__submit_bio(fio->sbi, bio, fio->type);
|
2013-11-30 11:51:14 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2019-07-12 16:55:41 +08:00
|
|
|
static bool page_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio,
|
|
|
|
block_t last_blkaddr, block_t cur_blkaddr)
|
|
|
|
{
|
|
|
|
if (last_blkaddr + 1 != cur_blkaddr)
|
|
|
|
return false;
|
|
|
|
return __same_bdev(sbi, cur_blkaddr, bio);
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool io_type_is_mergeable(struct f2fs_bio_info *io,
|
|
|
|
struct f2fs_io_info *fio)
|
|
|
|
{
|
|
|
|
if (io->fio.op != fio->op)
|
|
|
|
return false;
|
|
|
|
return io->fio.op_flags == fio->op_flags;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool io_is_mergeable(struct f2fs_sb_info *sbi, struct bio *bio,
|
|
|
|
struct f2fs_bio_info *io,
|
|
|
|
struct f2fs_io_info *fio,
|
|
|
|
block_t last_blkaddr,
|
|
|
|
block_t cur_blkaddr)
|
|
|
|
{
|
2019-07-12 16:55:42 +08:00
|
|
|
if (F2FS_IO_ALIGNED(sbi) && (fio->type == DATA || fio->type == NODE)) {
|
|
|
|
unsigned int filled_blocks =
|
|
|
|
F2FS_BYTES_TO_BLK(bio->bi_iter.bi_size);
|
|
|
|
unsigned int io_size = F2FS_IO_SIZE(sbi);
|
|
|
|
unsigned int left_vecs = bio->bi_max_vecs - bio->bi_vcnt;
|
|
|
|
|
|
|
|
/* IOs in bio is aligned and left space of vectors is not enough */
|
|
|
|
if (!(filled_blocks % io_size) && left_vecs < io_size)
|
|
|
|
return false;
|
|
|
|
}
|
2019-07-12 16:55:41 +08:00
|
|
|
if (!page_is_mergeable(sbi, bio, last_blkaddr, cur_blkaddr))
|
|
|
|
return false;
|
|
|
|
return io_type_is_mergeable(io, fio);
|
|
|
|
}
|
|
|
|
|
f2fs: add bio cache for IPU
SQLite in Wal mode may trigger sequential IPU write in db-wal file, after
commit d1b3e72d5490 ("f2fs: submit bio of in-place-update pages"), we
lost the chance of merging page in inner managed bio cache, result in
submitting more small-sized IO.
So let's add temporary bio in writepages() to cache mergeable write IO as
much as possible.
Test case:
1. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
2. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
Before:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65552, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65560, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65568, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65576, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65584, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65592, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65600, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65608, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65616, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65624, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65632, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65640, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65648, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65656, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65664, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57352, size = 4096
After:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 65536
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57368, size = 4096
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-02-19 16:15:29 +08:00
|
|
|
int f2fs_merge_page_bio(struct f2fs_io_info *fio)
|
|
|
|
{
|
|
|
|
struct bio *bio = *fio->bio;
|
|
|
|
struct page *page = fio->encrypted_page ?
|
|
|
|
fio->encrypted_page : fio->page;
|
|
|
|
|
|
|
|
if (!f2fs_is_valid_blkaddr(fio->sbi, fio->new_blkaddr,
|
|
|
|
__is_meta_io(fio) ? META_GENERIC : DATA_GENERIC))
|
2019-06-20 11:36:14 +08:00
|
|
|
return -EFSCORRUPTED;
|
f2fs: add bio cache for IPU
SQLite in Wal mode may trigger sequential IPU write in db-wal file, after
commit d1b3e72d5490 ("f2fs: submit bio of in-place-update pages"), we
lost the chance of merging page in inner managed bio cache, result in
submitting more small-sized IO.
So let's add temporary bio in writepages() to cache mergeable write IO as
much as possible.
Test case:
1. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
2. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
Before:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65552, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65560, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65568, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65576, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65584, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65592, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65600, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65608, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65616, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65624, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65632, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65640, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65648, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65656, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65664, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57352, size = 4096
After:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 65536
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57368, size = 4096
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-02-19 16:15:29 +08:00
|
|
|
|
|
|
|
trace_f2fs_submit_page_bio(page, fio);
|
|
|
|
f2fs_trace_ios(fio, 0);
|
|
|
|
|
2019-07-12 16:55:41 +08:00
|
|
|
if (bio && !page_is_mergeable(fio->sbi, bio, *fio->last_block,
|
|
|
|
fio->new_blkaddr)) {
|
f2fs: add bio cache for IPU
SQLite in Wal mode may trigger sequential IPU write in db-wal file, after
commit d1b3e72d5490 ("f2fs: submit bio of in-place-update pages"), we
lost the chance of merging page in inner managed bio cache, result in
submitting more small-sized IO.
So let's add temporary bio in writepages() to cache mergeable write IO as
much as possible.
Test case:
1. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
2. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
Before:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65552, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65560, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65568, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65576, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65584, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65592, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65600, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65608, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65616, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65624, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65632, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65640, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65648, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65656, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65664, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57352, size = 4096
After:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 65536
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57368, size = 4096
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-02-19 16:15:29 +08:00
|
|
|
__submit_bio(fio->sbi, bio, fio->type);
|
|
|
|
bio = NULL;
|
|
|
|
}
|
|
|
|
alloc_new:
|
|
|
|
if (!bio) {
|
2019-08-23 17:58:35 +08:00
|
|
|
bio = __bio_alloc(fio, BIO_MAX_PAGES);
|
f2fs: add bio cache for IPU
SQLite in Wal mode may trigger sequential IPU write in db-wal file, after
commit d1b3e72d5490 ("f2fs: submit bio of in-place-update pages"), we
lost the chance of merging page in inner managed bio cache, result in
submitting more small-sized IO.
So let's add temporary bio in writepages() to cache mergeable write IO as
much as possible.
Test case:
1. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
2. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
Before:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65552, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65560, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65568, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65576, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65584, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65592, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65600, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65608, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65616, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65624, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65632, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65640, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65648, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65656, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65664, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57352, size = 4096
After:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 65536
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57368, size = 4096
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-02-19 16:15:29 +08:00
|
|
|
bio_set_op_attrs(bio, fio->op, fio->op_flags);
|
|
|
|
}
|
|
|
|
|
|
|
|
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
|
|
|
|
__submit_bio(fio->sbi, bio, fio->type);
|
|
|
|
bio = NULL;
|
|
|
|
goto alloc_new;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (fio->io_wbc)
|
2024-06-12 13:13:20 +08:00
|
|
|
wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
|
f2fs: add bio cache for IPU
SQLite in Wal mode may trigger sequential IPU write in db-wal file, after
commit d1b3e72d5490 ("f2fs: submit bio of in-place-update pages"), we
lost the chance of merging page in inner managed bio cache, result in
submitting more small-sized IO.
So let's add temporary bio in writepages() to cache mergeable write IO as
much as possible.
Test case:
1. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
2. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
Before:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65552, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65560, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65568, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65576, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65584, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65592, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65600, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65608, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65616, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65624, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65632, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65640, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65648, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65656, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65664, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57352, size = 4096
After:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 65536
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57368, size = 4096
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-02-19 16:15:29 +08:00
|
|
|
|
|
|
|
inc_page_count(fio->sbi, WB_DATA_TYPE(page));
|
|
|
|
|
|
|
|
*fio->last_block = fio->new_blkaddr;
|
|
|
|
*fio->bio = bio;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void f2fs_submit_ipu_bio(struct f2fs_sb_info *sbi, struct bio **bio,
|
|
|
|
struct page *page)
|
|
|
|
{
|
|
|
|
if (!bio)
|
|
|
|
return;
|
|
|
|
|
|
|
|
if (!__has_merged_page(*bio, NULL, page, 0))
|
|
|
|
return;
|
|
|
|
|
|
|
|
__submit_bio(sbi, *bio, DATA);
|
|
|
|
*bio = NULL;
|
|
|
|
}
|
|
|
|
|
2018-05-28 23:47:18 +08:00
|
|
|
void f2fs_submit_page_write(struct f2fs_io_info *fio)
|
2013-11-30 11:51:14 +08:00
|
|
|
{
|
2015-04-24 05:38:15 +08:00
|
|
|
struct f2fs_sb_info *sbi = fio->sbi;
|
2013-12-11 12:54:01 +08:00
|
|
|
enum page_type btype = PAGE_TYPE_OF_BIO(fio->type);
|
2017-05-11 02:18:25 +08:00
|
|
|
struct f2fs_bio_info *io = sbi->write_io[btype] + fio->temp;
|
2015-04-24 03:04:33 +08:00
|
|
|
struct page *bio_page;
|
2013-11-30 11:51:14 +08:00
|
|
|
|
2017-05-11 02:28:38 +08:00
|
|
|
f2fs_bug_on(sbi, is_read_io(fio->op));
|
2013-11-30 11:51:14 +08:00
|
|
|
|
2017-05-19 23:37:01 +08:00
|
|
|
down_write(&io->io_rwsem);
|
|
|
|
next:
|
|
|
|
if (fio->in_list) {
|
|
|
|
spin_lock(&io->io_lock);
|
|
|
|
if (list_empty(&io->io_list)) {
|
|
|
|
spin_unlock(&io->io_lock);
|
2018-05-28 23:47:18 +08:00
|
|
|
goto out;
|
2017-05-19 23:37:01 +08:00
|
|
|
}
|
|
|
|
fio = list_first_entry(&io->io_list,
|
|
|
|
struct f2fs_io_info, list);
|
|
|
|
list_del(&fio->list);
|
|
|
|
spin_unlock(&io->io_lock);
|
|
|
|
}
|
2013-11-30 11:51:14 +08:00
|
|
|
|
f2fs: introduce DATA_GENERIC_ENHANCE
Previously, f2fs_is_valid_blkaddr(, blkaddr, DATA_GENERIC) will check
whether @blkaddr locates in main area or not.
That check is weak, since the block address in range of main area can
point to the address which is not valid in segment info table, and we
can not detect such condition, we may suffer worse corruption as system
continues running.
So this patch introduce DATA_GENERIC_ENHANCE to enhance the sanity check
which trigger SIT bitmap check rather than only range check.
This patch did below changes as wel:
- set SBI_NEED_FSCK in f2fs_is_valid_blkaddr().
- get rid of is_valid_data_blkaddr() to avoid panic if blkaddr is invalid.
- introduce verify_fio_blkaddr() to wrap fio {new,old}_blkaddr validation check.
- spread blkaddr check in:
* f2fs_get_node_info()
* __read_out_blkaddrs()
* f2fs_submit_page_read()
* ra_data_block()
* do_recover_data()
This patch can fix bug reported from bugzilla below:
https://bugzilla.kernel.org/show_bug.cgi?id=203215
https://bugzilla.kernel.org/show_bug.cgi?id=203223
https://bugzilla.kernel.org/show_bug.cgi?id=203231
https://bugzilla.kernel.org/show_bug.cgi?id=203235
https://bugzilla.kernel.org/show_bug.cgi?id=203241
= Update by Jaegeuk Kim =
DATA_GENERIC_ENHANCE enhanced to validate block addresses on read/write paths.
But, xfstest/generic/446 compalins some generated kernel messages saying invalid
bitmap was detected when reading a block. The reaons is, when we get the
block addresses from extent_cache, there is no lock to synchronize it from
truncating the blocks in parallel.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-04-15 15:26:32 +08:00
|
|
|
verify_fio_blkaddr(fio);
|
2013-11-30 11:51:14 +08:00
|
|
|
|
f2fs: don't wait writeback for datas during checkpoint
Normally, while committing checkpoint, we will wait on all pages to be
writebacked no matter the page is data or metadata, so in scenario where
there are lots of data IO being submitted with metadata, we may suffer
long latency for waiting writeback during checkpoint.
Indeed, we only care about persistence for pages with metadata, but not
pages with data, as file system consistent are only related to metadate,
so in order to avoid encountering long latency in above scenario, let's
recognize and reference metadata in submitted IOs, wait writeback only
for metadatas.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-11-16 10:41:20 +08:00
|
|
|
bio_page = fio->encrypted_page ? fio->encrypted_page : fio->page;
|
|
|
|
|
2017-10-07 22:02:21 +08:00
|
|
|
/* set submitted = true as a return value */
|
|
|
|
fio->submitted = true;
|
2017-02-04 09:44:04 +08:00
|
|
|
|
2017-05-11 02:28:38 +08:00
|
|
|
inc_page_count(sbi, WB_DATA_TYPE(bio_page));
|
2013-11-30 11:51:14 +08:00
|
|
|
|
2019-07-12 16:55:41 +08:00
|
|
|
if (io->bio && !io_is_mergeable(sbi, io->bio, io, fio,
|
|
|
|
io->last_block_in_bio, fio->new_blkaddr))
|
2013-12-11 12:54:01 +08:00
|
|
|
__submit_merged_bio(io);
|
2013-11-30 11:51:14 +08:00
|
|
|
alloc_new:
|
|
|
|
if (io->bio == NULL) {
|
2019-08-28 17:33:38 +08:00
|
|
|
if (F2FS_IO_ALIGNED(sbi) &&
|
|
|
|
(fio->type == DATA || fio->type == NODE) &&
|
2016-12-15 02:12:56 +08:00
|
|
|
fio->new_blkaddr & F2FS_IO_SIZE_MASK(sbi)) {
|
2017-05-11 02:28:38 +08:00
|
|
|
dec_page_count(sbi, WB_DATA_TYPE(bio_page));
|
2018-05-28 23:47:18 +08:00
|
|
|
fio->retry = true;
|
|
|
|
goto skip;
|
2016-12-15 02:12:56 +08:00
|
|
|
}
|
2019-08-23 17:58:35 +08:00
|
|
|
io->bio = __bio_alloc(fio, BIO_MAX_PAGES);
|
2013-12-11 12:54:01 +08:00
|
|
|
io->fio = *fio;
|
2013-11-30 11:51:14 +08:00
|
|
|
}
|
|
|
|
|
2017-05-11 02:18:25 +08:00
|
|
|
if (bio_add_page(io->bio, bio_page, PAGE_SIZE, 0) < PAGE_SIZE) {
|
2013-12-11 12:54:01 +08:00
|
|
|
__submit_merged_bio(io);
|
2013-11-30 11:51:14 +08:00
|
|
|
goto alloc_new;
|
|
|
|
}
|
|
|
|
|
2018-01-09 19:33:39 +08:00
|
|
|
if (fio->io_wbc)
|
2024-06-12 13:13:20 +08:00
|
|
|
wbc_account_cgroup_owner(fio->io_wbc, fio->page, PAGE_SIZE);
|
2018-01-09 19:33:39 +08:00
|
|
|
|
f2fs: trace old block address for CoWed page
This patch enables to trace old block address of CoWed page for better
debugging.
f2fs_submit_page_mbio: dev = (1,0), ino = 1, page_index = 0x1d4f0, oldaddr = 0xfe8ab, newaddr = 0xfee90 rw = WRITE_SYNC, type = NODE
f2fs_submit_page_mbio: dev = (1,0), ino = 1, page_index = 0x1d4f8, oldaddr = 0xfe8b0, newaddr = 0xfee91 rw = WRITE_SYNC, type = NODE
f2fs_submit_page_mbio: dev = (1,0), ino = 1, page_index = 0x1d4fa, oldaddr = 0xfe8ae, newaddr = 0xfee92 rw = WRITE_SYNC, type = NODE
f2fs_submit_page_mbio: dev = (1,0), ino = 134824, page_index = 0x96, oldaddr = 0xf049b, newaddr = 0x2bbe rw = WRITE, type = DATA
f2fs_submit_page_mbio: dev = (1,0), ino = 134824, page_index = 0x97, oldaddr = 0xf049c, newaddr = 0x2bbf rw = WRITE, type = DATA
f2fs_submit_page_mbio: dev = (1,0), ino = 134824, page_index = 0x98, oldaddr = 0xf049d, newaddr = 0x2bc0 rw = WRITE, type = DATA
f2fs_submit_page_mbio: dev = (1,0), ino = 135260, page_index = 0x47, oldaddr = 0xffffffff, newaddr = 0xf2631 rw = WRITE, type = DATA
f2fs_submit_page_mbio: dev = (1,0), ino = 135260, page_index = 0x48, oldaddr = 0xffffffff, newaddr = 0xf2632 rw = WRITE, type = DATA
f2fs_submit_page_mbio: dev = (1,0), ino = 135260, page_index = 0x49, oldaddr = 0xffffffff, newaddr = 0xf2633 rw = WRITE, type = DATA
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-02-22 18:36:38 +08:00
|
|
|
io->last_block_in_bio = fio->new_blkaddr;
|
2015-04-24 05:38:15 +08:00
|
|
|
f2fs_trace_ios(fio, 0);
|
2017-05-19 23:37:01 +08:00
|
|
|
|
|
|
|
trace_f2fs_submit_page_write(fio->page, fio);
|
2018-05-28 23:47:18 +08:00
|
|
|
skip:
|
2017-05-19 23:37:01 +08:00
|
|
|
if (fio->in_list)
|
|
|
|
goto next;
|
2018-05-28 23:47:18 +08:00
|
|
|
out:
|
2018-08-21 10:21:43 +08:00
|
|
|
if (is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) ||
|
2019-08-23 17:58:36 +08:00
|
|
|
!f2fs_is_checkpoint_ready(sbi))
|
2018-09-07 02:40:12 +08:00
|
|
|
__submit_merged_bio(io);
|
2014-03-22 14:57:23 +08:00
|
|
|
up_write(&io->io_rwsem);
|
2013-11-30 11:51:14 +08:00
|
|
|
}
|
|
|
|
|
f2fs: add fs-verity support
Add fs-verity support to f2fs. fs-verity is a filesystem feature that
enables transparent integrity protection and authentication of read-only
files. It uses a dm-verity like mechanism at the file level: a Merkle
tree is used to verify any block in the file in log(filesize) time. It
is implemented mainly by helper functions in fs/verity/. See
Documentation/filesystems/fsverity.rst for the full documentation.
The f2fs support for fs-verity consists of:
- Adding a filesystem feature flag and an inode flag for fs-verity.
- Implementing the fsverity_operations to support enabling verity on an
inode and reading/writing the verity metadata.
- Updating ->readpages() to verify data as it's read from verity files
and to support reading verity metadata pages.
- Updating ->write_begin(), ->write_end(), and ->writepages() to support
writing verity metadata pages.
- Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl().
Like ext4, f2fs stores the verity metadata (Merkle tree and
fsverity_descriptor) past the end of the file, starting at the first 64K
boundary beyond i_size. This approach works because (a) verity files
are readonly, and (b) pages fully beyond i_size aren't visible to
userspace but can be read/written internally by f2fs with only some
relatively small changes to f2fs. Extended attributes cannot be used
because (a) f2fs limits the total size of an inode's xattr entries to
4096 bytes, which wouldn't be enough for even a single Merkle tree
block, and (b) f2fs encryption doesn't encrypt xattrs, yet the verity
metadata *must* be encrypted when the file is because it contains hashes
of the plaintext data.
Acked-by: Jaegeuk Kim <jaegeuk@kernel.org>
Acked-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
2019-07-23 00:26:24 +08:00
|
|
|
static inline bool f2fs_need_verity(const struct inode *inode, pgoff_t idx)
|
|
|
|
{
|
|
|
|
return fsverity_active(inode) &&
|
|
|
|
idx < DIV_ROUND_UP(inode->i_size, PAGE_SIZE);
|
|
|
|
}
|
|
|
|
|
2017-09-07 12:04:44 +08:00
|
|
|
static struct bio *f2fs_grab_read_bio(struct inode *inode, block_t blkaddr,
|
f2fs: add fs-verity support
Add fs-verity support to f2fs. fs-verity is a filesystem feature that
enables transparent integrity protection and authentication of read-only
files. It uses a dm-verity like mechanism at the file level: a Merkle
tree is used to verify any block in the file in log(filesize) time. It
is implemented mainly by helper functions in fs/verity/. See
Documentation/filesystems/fsverity.rst for the full documentation.
The f2fs support for fs-verity consists of:
- Adding a filesystem feature flag and an inode flag for fs-verity.
- Implementing the fsverity_operations to support enabling verity on an
inode and reading/writing the verity metadata.
- Updating ->readpages() to verify data as it's read from verity files
and to support reading verity metadata pages.
- Updating ->write_begin(), ->write_end(), and ->writepages() to support
writing verity metadata pages.
- Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl().
Like ext4, f2fs stores the verity metadata (Merkle tree and
fsverity_descriptor) past the end of the file, starting at the first 64K
boundary beyond i_size. This approach works because (a) verity files
are readonly, and (b) pages fully beyond i_size aren't visible to
userspace but can be read/written internally by f2fs with only some
relatively small changes to f2fs. Extended attributes cannot be used
because (a) f2fs limits the total size of an inode's xattr entries to
4096 bytes, which wouldn't be enough for even a single Merkle tree
block, and (b) f2fs encryption doesn't encrypt xattrs, yet the verity
metadata *must* be encrypted when the file is because it contains hashes
of the plaintext data.
Acked-by: Jaegeuk Kim <jaegeuk@kernel.org>
Acked-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
2019-07-23 00:26:24 +08:00
|
|
|
unsigned nr_pages, unsigned op_flag,
|
|
|
|
pgoff_t first_idx)
|
2017-09-07 12:04:44 +08:00
|
|
|
{
|
|
|
|
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
|
|
|
|
struct bio *bio;
|
f2fs: refactor read path to allow multiple postprocessing steps
Currently f2fs's ->readpage() and ->readpages() assume that either the
data undergoes no postprocessing, or decryption only. But with
fs-verity, there will be an additional authenticity verification step,
and it may be needed either by itself, or combined with decryption.
To support this, store a 'struct bio_post_read_ctx' in ->bi_private
which contains a work struct, a bitmask of postprocessing steps that are
enabled, and an indicator of the current step. The bio completion
routine, if there was no I/O error, enqueues the first postprocessing
step. When that completes, it continues to the next step. Pages that
fail any postprocessing step have PageError set. Once all steps have
completed, pages without PageError set are set Uptodate, and all pages
are unlocked.
Also replace f2fs_encrypted_file() with a new function
f2fs_post_read_required() in places like direct I/O and garbage
collection that really should be testing whether the file needs special
I/O processing, not whether it is encrypted specifically.
This may also be useful for other future f2fs features such as
compression.
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-04-19 02:09:48 +08:00
|
|
|
struct bio_post_read_ctx *ctx;
|
|
|
|
unsigned int post_read_steps = 0;
|
2017-09-07 12:04:44 +08:00
|
|
|
|
2017-10-28 16:52:31 +08:00
|
|
|
bio = f2fs_bio_alloc(sbi, min_t(int, nr_pages, BIO_MAX_PAGES), false);
|
f2fs: refactor read path to allow multiple postprocessing steps
Currently f2fs's ->readpage() and ->readpages() assume that either the
data undergoes no postprocessing, or decryption only. But with
fs-verity, there will be an additional authenticity verification step,
and it may be needed either by itself, or combined with decryption.
To support this, store a 'struct bio_post_read_ctx' in ->bi_private
which contains a work struct, a bitmask of postprocessing steps that are
enabled, and an indicator of the current step. The bio completion
routine, if there was no I/O error, enqueues the first postprocessing
step. When that completes, it continues to the next step. Pages that
fail any postprocessing step have PageError set. Once all steps have
completed, pages without PageError set are set Uptodate, and all pages
are unlocked.
Also replace f2fs_encrypted_file() with a new function
f2fs_post_read_required() in places like direct I/O and garbage
collection that really should be testing whether the file needs special
I/O processing, not whether it is encrypted specifically.
This may also be useful for other future f2fs features such as
compression.
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-04-19 02:09:48 +08:00
|
|
|
if (!bio)
|
2017-09-07 12:04:44 +08:00
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
f2fs_target_device(sbi, blkaddr, bio);
|
|
|
|
bio->bi_end_io = f2fs_read_end_io;
|
2018-06-22 02:29:43 +08:00
|
|
|
bio_set_op_attrs(bio, REQ_OP_READ, op_flag);
|
2017-09-07 12:04:44 +08:00
|
|
|
|
f2fs: refactor read path to allow multiple postprocessing steps
Currently f2fs's ->readpage() and ->readpages() assume that either the
data undergoes no postprocessing, or decryption only. But with
fs-verity, there will be an additional authenticity verification step,
and it may be needed either by itself, or combined with decryption.
To support this, store a 'struct bio_post_read_ctx' in ->bi_private
which contains a work struct, a bitmask of postprocessing steps that are
enabled, and an indicator of the current step. The bio completion
routine, if there was no I/O error, enqueues the first postprocessing
step. When that completes, it continues to the next step. Pages that
fail any postprocessing step have PageError set. Once all steps have
completed, pages without PageError set are set Uptodate, and all pages
are unlocked.
Also replace f2fs_encrypted_file() with a new function
f2fs_post_read_required() in places like direct I/O and garbage
collection that really should be testing whether the file needs special
I/O processing, not whether it is encrypted specifically.
This may also be useful for other future f2fs features such as
compression.
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-04-19 02:09:48 +08:00
|
|
|
if (f2fs_encrypted_file(inode))
|
|
|
|
post_read_steps |= 1 << STEP_DECRYPT;
|
f2fs: add fs-verity support
Add fs-verity support to f2fs. fs-verity is a filesystem feature that
enables transparent integrity protection and authentication of read-only
files. It uses a dm-verity like mechanism at the file level: a Merkle
tree is used to verify any block in the file in log(filesize) time. It
is implemented mainly by helper functions in fs/verity/. See
Documentation/filesystems/fsverity.rst for the full documentation.
The f2fs support for fs-verity consists of:
- Adding a filesystem feature flag and an inode flag for fs-verity.
- Implementing the fsverity_operations to support enabling verity on an
inode and reading/writing the verity metadata.
- Updating ->readpages() to verify data as it's read from verity files
and to support reading verity metadata pages.
- Updating ->write_begin(), ->write_end(), and ->writepages() to support
writing verity metadata pages.
- Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl().
Like ext4, f2fs stores the verity metadata (Merkle tree and
fsverity_descriptor) past the end of the file, starting at the first 64K
boundary beyond i_size. This approach works because (a) verity files
are readonly, and (b) pages fully beyond i_size aren't visible to
userspace but can be read/written internally by f2fs with only some
relatively small changes to f2fs. Extended attributes cannot be used
because (a) f2fs limits the total size of an inode's xattr entries to
4096 bytes, which wouldn't be enough for even a single Merkle tree
block, and (b) f2fs encryption doesn't encrypt xattrs, yet the verity
metadata *must* be encrypted when the file is because it contains hashes
of the plaintext data.
Acked-by: Jaegeuk Kim <jaegeuk@kernel.org>
Acked-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
2019-07-23 00:26:24 +08:00
|
|
|
|
|
|
|
if (f2fs_need_verity(inode, first_idx))
|
|
|
|
post_read_steps |= 1 << STEP_VERITY;
|
|
|
|
|
f2fs: refactor read path to allow multiple postprocessing steps
Currently f2fs's ->readpage() and ->readpages() assume that either the
data undergoes no postprocessing, or decryption only. But with
fs-verity, there will be an additional authenticity verification step,
and it may be needed either by itself, or combined with decryption.
To support this, store a 'struct bio_post_read_ctx' in ->bi_private
which contains a work struct, a bitmask of postprocessing steps that are
enabled, and an indicator of the current step. The bio completion
routine, if there was no I/O error, enqueues the first postprocessing
step. When that completes, it continues to the next step. Pages that
fail any postprocessing step have PageError set. Once all steps have
completed, pages without PageError set are set Uptodate, and all pages
are unlocked.
Also replace f2fs_encrypted_file() with a new function
f2fs_post_read_required() in places like direct I/O and garbage
collection that really should be testing whether the file needs special
I/O processing, not whether it is encrypted specifically.
This may also be useful for other future f2fs features such as
compression.
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-04-19 02:09:48 +08:00
|
|
|
if (post_read_steps) {
|
|
|
|
ctx = mempool_alloc(bio_post_read_ctx_pool, GFP_NOFS);
|
|
|
|
if (!ctx) {
|
|
|
|
bio_put(bio);
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
}
|
|
|
|
ctx->bio = bio;
|
|
|
|
ctx->enabled_steps = post_read_steps;
|
|
|
|
bio->bi_private = ctx;
|
|
|
|
}
|
|
|
|
|
2017-09-07 12:04:44 +08:00
|
|
|
return bio;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* This can handle encryption stuffs */
|
|
|
|
static int f2fs_submit_page_read(struct inode *inode, struct page *page,
|
|
|
|
block_t blkaddr)
|
|
|
|
{
|
f2fs: introduce DATA_GENERIC_ENHANCE
Previously, f2fs_is_valid_blkaddr(, blkaddr, DATA_GENERIC) will check
whether @blkaddr locates in main area or not.
That check is weak, since the block address in range of main area can
point to the address which is not valid in segment info table, and we
can not detect such condition, we may suffer worse corruption as system
continues running.
So this patch introduce DATA_GENERIC_ENHANCE to enhance the sanity check
which trigger SIT bitmap check rather than only range check.
This patch did below changes as wel:
- set SBI_NEED_FSCK in f2fs_is_valid_blkaddr().
- get rid of is_valid_data_blkaddr() to avoid panic if blkaddr is invalid.
- introduce verify_fio_blkaddr() to wrap fio {new,old}_blkaddr validation check.
- spread blkaddr check in:
* f2fs_get_node_info()
* __read_out_blkaddrs()
* f2fs_submit_page_read()
* ra_data_block()
* do_recover_data()
This patch can fix bug reported from bugzilla below:
https://bugzilla.kernel.org/show_bug.cgi?id=203215
https://bugzilla.kernel.org/show_bug.cgi?id=203223
https://bugzilla.kernel.org/show_bug.cgi?id=203231
https://bugzilla.kernel.org/show_bug.cgi?id=203235
https://bugzilla.kernel.org/show_bug.cgi?id=203241
= Update by Jaegeuk Kim =
DATA_GENERIC_ENHANCE enhanced to validate block addresses on read/write paths.
But, xfstest/generic/446 compalins some generated kernel messages saying invalid
bitmap was detected when reading a block. The reaons is, when we get the
block addresses from extent_cache, there is no lock to synchronize it from
truncating the blocks in parallel.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-04-15 15:26:32 +08:00
|
|
|
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
|
|
|
|
struct bio *bio;
|
2017-09-07 12:04:44 +08:00
|
|
|
|
f2fs: add fs-verity support
Add fs-verity support to f2fs. fs-verity is a filesystem feature that
enables transparent integrity protection and authentication of read-only
files. It uses a dm-verity like mechanism at the file level: a Merkle
tree is used to verify any block in the file in log(filesize) time. It
is implemented mainly by helper functions in fs/verity/. See
Documentation/filesystems/fsverity.rst for the full documentation.
The f2fs support for fs-verity consists of:
- Adding a filesystem feature flag and an inode flag for fs-verity.
- Implementing the fsverity_operations to support enabling verity on an
inode and reading/writing the verity metadata.
- Updating ->readpages() to verify data as it's read from verity files
and to support reading verity metadata pages.
- Updating ->write_begin(), ->write_end(), and ->writepages() to support
writing verity metadata pages.
- Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl().
Like ext4, f2fs stores the verity metadata (Merkle tree and
fsverity_descriptor) past the end of the file, starting at the first 64K
boundary beyond i_size. This approach works because (a) verity files
are readonly, and (b) pages fully beyond i_size aren't visible to
userspace but can be read/written internally by f2fs with only some
relatively small changes to f2fs. Extended attributes cannot be used
because (a) f2fs limits the total size of an inode's xattr entries to
4096 bytes, which wouldn't be enough for even a single Merkle tree
block, and (b) f2fs encryption doesn't encrypt xattrs, yet the verity
metadata *must* be encrypted when the file is because it contains hashes
of the plaintext data.
Acked-by: Jaegeuk Kim <jaegeuk@kernel.org>
Acked-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
2019-07-23 00:26:24 +08:00
|
|
|
bio = f2fs_grab_read_bio(inode, blkaddr, 1, 0, page->index);
|
2017-09-07 12:04:44 +08:00
|
|
|
if (IS_ERR(bio))
|
|
|
|
return PTR_ERR(bio);
|
|
|
|
|
2018-08-23 12:18:00 +08:00
|
|
|
/* wait for GCed page writeback via META_MAPPING */
|
|
|
|
f2fs_wait_on_block_writeback(inode, blkaddr);
|
|
|
|
|
2017-09-07 12:04:44 +08:00
|
|
|
if (bio_add_page(bio, page, PAGE_SIZE, 0) < PAGE_SIZE) {
|
|
|
|
bio_put(bio);
|
|
|
|
return -EFAULT;
|
|
|
|
}
|
2018-09-26 04:54:33 +08:00
|
|
|
ClearPageError(page);
|
f2fs: introduce DATA_GENERIC_ENHANCE
Previously, f2fs_is_valid_blkaddr(, blkaddr, DATA_GENERIC) will check
whether @blkaddr locates in main area or not.
That check is weak, since the block address in range of main area can
point to the address which is not valid in segment info table, and we
can not detect such condition, we may suffer worse corruption as system
continues running.
So this patch introduce DATA_GENERIC_ENHANCE to enhance the sanity check
which trigger SIT bitmap check rather than only range check.
This patch did below changes as wel:
- set SBI_NEED_FSCK in f2fs_is_valid_blkaddr().
- get rid of is_valid_data_blkaddr() to avoid panic if blkaddr is invalid.
- introduce verify_fio_blkaddr() to wrap fio {new,old}_blkaddr validation check.
- spread blkaddr check in:
* f2fs_get_node_info()
* __read_out_blkaddrs()
* f2fs_submit_page_read()
* ra_data_block()
* do_recover_data()
This patch can fix bug reported from bugzilla below:
https://bugzilla.kernel.org/show_bug.cgi?id=203215
https://bugzilla.kernel.org/show_bug.cgi?id=203223
https://bugzilla.kernel.org/show_bug.cgi?id=203231
https://bugzilla.kernel.org/show_bug.cgi?id=203235
https://bugzilla.kernel.org/show_bug.cgi?id=203241
= Update by Jaegeuk Kim =
DATA_GENERIC_ENHANCE enhanced to validate block addresses on read/write paths.
But, xfstest/generic/446 compalins some generated kernel messages saying invalid
bitmap was detected when reading a block. The reaons is, when we get the
block addresses from extent_cache, there is no lock to synchronize it from
truncating the blocks in parallel.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-04-15 15:26:32 +08:00
|
|
|
inc_page_count(sbi, F2FS_RD_DATA);
|
|
|
|
__submit_bio(sbi, bio, DATA);
|
2017-09-07 12:04:44 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2016-05-09 19:56:30 +08:00
|
|
|
static void __set_data_blkaddr(struct dnode_of_data *dn)
|
|
|
|
{
|
|
|
|
struct f2fs_node *rn = F2FS_NODE(dn->node_page);
|
|
|
|
__le32 *addr_array;
|
2017-07-19 00:19:06 +08:00
|
|
|
int base = 0;
|
|
|
|
|
|
|
|
if (IS_INODE(dn->node_page) && f2fs_has_extra_attr(dn->inode))
|
|
|
|
base = get_extra_isize(dn->inode);
|
2016-05-09 19:56:30 +08:00
|
|
|
|
|
|
|
/* Get physical address of data block */
|
|
|
|
addr_array = blkaddr_in_node(rn);
|
2017-07-19 00:19:06 +08:00
|
|
|
addr_array[base + dn->ofs_in_node] = cpu_to_le32(dn->data_blkaddr);
|
2016-05-09 19:56:30 +08:00
|
|
|
}
|
|
|
|
|
2012-11-29 12:28:09 +08:00
|
|
|
/*
|
2012-11-02 16:10:12 +08:00
|
|
|
* Lock ordering for the change of data block address:
|
|
|
|
* ->data_page
|
|
|
|
* ->node_page
|
|
|
|
* update block addresses in the node page
|
|
|
|
*/
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
void f2fs_set_data_blkaddr(struct dnode_of_data *dn)
|
2012-11-02 16:10:12 +08:00
|
|
|
{
|
2018-12-25 17:43:42 +08:00
|
|
|
f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true);
|
2016-05-09 19:56:30 +08:00
|
|
|
__set_data_blkaddr(dn);
|
|
|
|
if (set_page_dirty(dn->node_page))
|
2016-01-08 05:23:12 +08:00
|
|
|
dn->node_changed = true;
|
2012-11-02 16:10:12 +08:00
|
|
|
}
|
|
|
|
|
2016-02-24 17:16:47 +08:00
|
|
|
void f2fs_update_data_blkaddr(struct dnode_of_data *dn, block_t blkaddr)
|
|
|
|
{
|
|
|
|
dn->data_blkaddr = blkaddr;
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
f2fs_set_data_blkaddr(dn);
|
2016-02-24 17:16:47 +08:00
|
|
|
f2fs_update_extent_cache(dn);
|
|
|
|
}
|
|
|
|
|
2016-05-09 19:56:30 +08:00
|
|
|
/* dn->ofs_in_node will be returned with up-to-date last block pointer */
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
int f2fs_reserve_new_blocks(struct dnode_of_data *dn, blkcnt_t count)
|
2012-11-02 16:10:12 +08:00
|
|
|
{
|
2014-09-03 06:31:18 +08:00
|
|
|
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
|
2017-07-09 00:13:07 +08:00
|
|
|
int err;
|
2012-11-02 16:10:12 +08:00
|
|
|
|
2016-05-09 19:56:30 +08:00
|
|
|
if (!count)
|
|
|
|
return 0;
|
|
|
|
|
2016-05-21 01:13:22 +08:00
|
|
|
if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC)))
|
2012-11-02 16:10:12 +08:00
|
|
|
return -EPERM;
|
2017-07-09 00:13:07 +08:00
|
|
|
if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count))))
|
|
|
|
return err;
|
2012-11-02 16:10:12 +08:00
|
|
|
|
2016-05-09 19:56:30 +08:00
|
|
|
trace_f2fs_reserve_new_blocks(dn->inode, dn->nid,
|
|
|
|
dn->ofs_in_node, count);
|
|
|
|
|
2018-12-25 17:43:42 +08:00
|
|
|
f2fs_wait_on_page_writeback(dn->node_page, NODE, true, true);
|
2016-05-09 19:56:30 +08:00
|
|
|
|
|
|
|
for (; count > 0; dn->ofs_in_node++) {
|
2017-07-19 00:19:06 +08:00
|
|
|
block_t blkaddr = datablock_addr(dn->inode,
|
|
|
|
dn->node_page, dn->ofs_in_node);
|
2016-05-09 19:56:30 +08:00
|
|
|
if (blkaddr == NULL_ADDR) {
|
|
|
|
dn->data_blkaddr = NEW_ADDR;
|
|
|
|
__set_data_blkaddr(dn);
|
|
|
|
count--;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
if (set_page_dirty(dn->node_page))
|
|
|
|
dn->node_changed = true;
|
2012-11-02 16:10:12 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2016-05-09 19:56:30 +08:00
|
|
|
/* Should keep dn->ofs_in_node unchanged */
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
int f2fs_reserve_new_block(struct dnode_of_data *dn)
|
2016-05-09 19:56:30 +08:00
|
|
|
{
|
|
|
|
unsigned int ofs_in_node = dn->ofs_in_node;
|
|
|
|
int ret;
|
|
|
|
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
ret = f2fs_reserve_new_blocks(dn, 1);
|
2016-05-09 19:56:30 +08:00
|
|
|
dn->ofs_in_node = ofs_in_node;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2013-11-10 23:13:18 +08:00
|
|
|
int f2fs_reserve_block(struct dnode_of_data *dn, pgoff_t index)
|
|
|
|
{
|
|
|
|
bool need_put = dn->inode_page ? false : true;
|
|
|
|
int err;
|
|
|
|
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
err = f2fs_get_dnode_of_data(dn, index, ALLOC_NODE);
|
2013-11-10 23:13:18 +08:00
|
|
|
if (err)
|
|
|
|
return err;
|
2013-12-27 16:04:17 +08:00
|
|
|
|
2013-11-10 23:13:18 +08:00
|
|
|
if (dn->data_blkaddr == NULL_ADDR)
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
err = f2fs_reserve_new_block(dn);
|
2013-12-27 16:04:17 +08:00
|
|
|
if (err || need_put)
|
2013-11-10 23:13:18 +08:00
|
|
|
f2fs_put_dnode(dn);
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2015-08-05 15:52:16 +08:00
|
|
|
int f2fs_get_block(struct dnode_of_data *dn, pgoff_t index)
|
2012-11-02 16:10:12 +08:00
|
|
|
{
|
2017-02-23 17:18:05 +08:00
|
|
|
struct extent_info ei = {0,0,0};
|
2015-08-05 15:52:16 +08:00
|
|
|
struct inode *inode = dn->inode;
|
2015-03-19 19:26:02 +08:00
|
|
|
|
2015-08-05 15:52:16 +08:00
|
|
|
if (f2fs_lookup_extent_cache(inode, index, &ei)) {
|
|
|
|
dn->data_blkaddr = ei.blk + index - ei.fofs;
|
|
|
|
return 0;
|
2015-02-05 17:54:31 +08:00
|
|
|
}
|
2015-03-19 19:26:02 +08:00
|
|
|
|
2015-08-05 15:52:16 +08:00
|
|
|
return f2fs_reserve_block(dn, index);
|
2012-11-02 16:10:12 +08:00
|
|
|
}
|
|
|
|
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
struct page *f2fs_get_read_data_page(struct inode *inode, pgoff_t index,
|
2016-06-06 03:31:55 +08:00
|
|
|
int op_flags, bool for_write)
|
2012-11-02 16:10:12 +08:00
|
|
|
{
|
|
|
|
struct address_space *mapping = inode->i_mapping;
|
|
|
|
struct dnode_of_data dn;
|
|
|
|
struct page *page;
|
2017-02-23 17:18:05 +08:00
|
|
|
struct extent_info ei = {0,0,0};
|
2012-11-02 16:10:12 +08:00
|
|
|
int err;
|
2015-04-24 03:04:33 +08:00
|
|
|
|
2015-10-10 06:11:38 +08:00
|
|
|
page = f2fs_grab_cache_page(mapping, index, for_write);
|
2013-05-13 07:38:35 +08:00
|
|
|
if (!page)
|
|
|
|
return ERR_PTR(-ENOMEM);
|
|
|
|
|
2015-02-05 18:03:40 +08:00
|
|
|
if (f2fs_lookup_extent_cache(inode, index, &ei)) {
|
|
|
|
dn.data_blkaddr = ei.blk + index - ei.fofs;
|
f2fs: introduce DATA_GENERIC_ENHANCE
Previously, f2fs_is_valid_blkaddr(, blkaddr, DATA_GENERIC) will check
whether @blkaddr locates in main area or not.
That check is weak, since the block address in range of main area can
point to the address which is not valid in segment info table, and we
can not detect such condition, we may suffer worse corruption as system
continues running.
So this patch introduce DATA_GENERIC_ENHANCE to enhance the sanity check
which trigger SIT bitmap check rather than only range check.
This patch did below changes as wel:
- set SBI_NEED_FSCK in f2fs_is_valid_blkaddr().
- get rid of is_valid_data_blkaddr() to avoid panic if blkaddr is invalid.
- introduce verify_fio_blkaddr() to wrap fio {new,old}_blkaddr validation check.
- spread blkaddr check in:
* f2fs_get_node_info()
* __read_out_blkaddrs()
* f2fs_submit_page_read()
* ra_data_block()
* do_recover_data()
This patch can fix bug reported from bugzilla below:
https://bugzilla.kernel.org/show_bug.cgi?id=203215
https://bugzilla.kernel.org/show_bug.cgi?id=203223
https://bugzilla.kernel.org/show_bug.cgi?id=203231
https://bugzilla.kernel.org/show_bug.cgi?id=203235
https://bugzilla.kernel.org/show_bug.cgi?id=203241
= Update by Jaegeuk Kim =
DATA_GENERIC_ENHANCE enhanced to validate block addresses on read/write paths.
But, xfstest/generic/446 compalins some generated kernel messages saying invalid
bitmap was detected when reading a block. The reaons is, when we get the
block addresses from extent_cache, there is no lock to synchronize it from
truncating the blocks in parallel.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-04-15 15:26:32 +08:00
|
|
|
if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), dn.data_blkaddr,
|
|
|
|
DATA_GENERIC_ENHANCE_READ)) {
|
2019-06-20 11:36:14 +08:00
|
|
|
err = -EFSCORRUPTED;
|
f2fs: introduce DATA_GENERIC_ENHANCE
Previously, f2fs_is_valid_blkaddr(, blkaddr, DATA_GENERIC) will check
whether @blkaddr locates in main area or not.
That check is weak, since the block address in range of main area can
point to the address which is not valid in segment info table, and we
can not detect such condition, we may suffer worse corruption as system
continues running.
So this patch introduce DATA_GENERIC_ENHANCE to enhance the sanity check
which trigger SIT bitmap check rather than only range check.
This patch did below changes as wel:
- set SBI_NEED_FSCK in f2fs_is_valid_blkaddr().
- get rid of is_valid_data_blkaddr() to avoid panic if blkaddr is invalid.
- introduce verify_fio_blkaddr() to wrap fio {new,old}_blkaddr validation check.
- spread blkaddr check in:
* f2fs_get_node_info()
* __read_out_blkaddrs()
* f2fs_submit_page_read()
* ra_data_block()
* do_recover_data()
This patch can fix bug reported from bugzilla below:
https://bugzilla.kernel.org/show_bug.cgi?id=203215
https://bugzilla.kernel.org/show_bug.cgi?id=203223
https://bugzilla.kernel.org/show_bug.cgi?id=203231
https://bugzilla.kernel.org/show_bug.cgi?id=203235
https://bugzilla.kernel.org/show_bug.cgi?id=203241
= Update by Jaegeuk Kim =
DATA_GENERIC_ENHANCE enhanced to validate block addresses on read/write paths.
But, xfstest/generic/446 compalins some generated kernel messages saying invalid
bitmap was detected when reading a block. The reaons is, when we get the
block addresses from extent_cache, there is no lock to synchronize it from
truncating the blocks in parallel.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-04-15 15:26:32 +08:00
|
|
|
goto put_err;
|
|
|
|
}
|
2015-02-05 18:03:40 +08:00
|
|
|
goto got_it;
|
|
|
|
}
|
|
|
|
|
2012-11-02 16:10:12 +08:00
|
|
|
set_new_dnode(&dn, inode, NULL, NULL, 0);
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
|
2015-07-16 04:08:21 +08:00
|
|
|
if (err)
|
|
|
|
goto put_err;
|
2012-11-02 16:10:12 +08:00
|
|
|
f2fs_put_dnode(&dn);
|
|
|
|
|
2013-12-06 14:00:58 +08:00
|
|
|
if (unlikely(dn.data_blkaddr == NULL_ADDR)) {
|
2015-07-16 04:08:21 +08:00
|
|
|
err = -ENOENT;
|
|
|
|
goto put_err;
|
2013-05-13 07:38:35 +08:00
|
|
|
}
|
f2fs: introduce DATA_GENERIC_ENHANCE
Previously, f2fs_is_valid_blkaddr(, blkaddr, DATA_GENERIC) will check
whether @blkaddr locates in main area or not.
That check is weak, since the block address in range of main area can
point to the address which is not valid in segment info table, and we
can not detect such condition, we may suffer worse corruption as system
continues running.
So this patch introduce DATA_GENERIC_ENHANCE to enhance the sanity check
which trigger SIT bitmap check rather than only range check.
This patch did below changes as wel:
- set SBI_NEED_FSCK in f2fs_is_valid_blkaddr().
- get rid of is_valid_data_blkaddr() to avoid panic if blkaddr is invalid.
- introduce verify_fio_blkaddr() to wrap fio {new,old}_blkaddr validation check.
- spread blkaddr check in:
* f2fs_get_node_info()
* __read_out_blkaddrs()
* f2fs_submit_page_read()
* ra_data_block()
* do_recover_data()
This patch can fix bug reported from bugzilla below:
https://bugzilla.kernel.org/show_bug.cgi?id=203215
https://bugzilla.kernel.org/show_bug.cgi?id=203223
https://bugzilla.kernel.org/show_bug.cgi?id=203231
https://bugzilla.kernel.org/show_bug.cgi?id=203235
https://bugzilla.kernel.org/show_bug.cgi?id=203241
= Update by Jaegeuk Kim =
DATA_GENERIC_ENHANCE enhanced to validate block addresses on read/write paths.
But, xfstest/generic/446 compalins some generated kernel messages saying invalid
bitmap was detected when reading a block. The reaons is, when we get the
block addresses from extent_cache, there is no lock to synchronize it from
truncating the blocks in parallel.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-04-15 15:26:32 +08:00
|
|
|
if (dn.data_blkaddr != NEW_ADDR &&
|
|
|
|
!f2fs_is_valid_blkaddr(F2FS_I_SB(inode),
|
|
|
|
dn.data_blkaddr,
|
|
|
|
DATA_GENERIC_ENHANCE)) {
|
2019-06-20 11:36:14 +08:00
|
|
|
err = -EFSCORRUPTED;
|
f2fs: introduce DATA_GENERIC_ENHANCE
Previously, f2fs_is_valid_blkaddr(, blkaddr, DATA_GENERIC) will check
whether @blkaddr locates in main area or not.
That check is weak, since the block address in range of main area can
point to the address which is not valid in segment info table, and we
can not detect such condition, we may suffer worse corruption as system
continues running.
So this patch introduce DATA_GENERIC_ENHANCE to enhance the sanity check
which trigger SIT bitmap check rather than only range check.
This patch did below changes as wel:
- set SBI_NEED_FSCK in f2fs_is_valid_blkaddr().
- get rid of is_valid_data_blkaddr() to avoid panic if blkaddr is invalid.
- introduce verify_fio_blkaddr() to wrap fio {new,old}_blkaddr validation check.
- spread blkaddr check in:
* f2fs_get_node_info()
* __read_out_blkaddrs()
* f2fs_submit_page_read()
* ra_data_block()
* do_recover_data()
This patch can fix bug reported from bugzilla below:
https://bugzilla.kernel.org/show_bug.cgi?id=203215
https://bugzilla.kernel.org/show_bug.cgi?id=203223
https://bugzilla.kernel.org/show_bug.cgi?id=203231
https://bugzilla.kernel.org/show_bug.cgi?id=203235
https://bugzilla.kernel.org/show_bug.cgi?id=203241
= Update by Jaegeuk Kim =
DATA_GENERIC_ENHANCE enhanced to validate block addresses on read/write paths.
But, xfstest/generic/446 compalins some generated kernel messages saying invalid
bitmap was detected when reading a block. The reaons is, when we get the
block addresses from extent_cache, there is no lock to synchronize it from
truncating the blocks in parallel.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-04-15 15:26:32 +08:00
|
|
|
goto put_err;
|
|
|
|
}
|
2015-02-05 18:03:40 +08:00
|
|
|
got_it:
|
2015-05-01 08:00:33 +08:00
|
|
|
if (PageUptodate(page)) {
|
|
|
|
unlock_page(page);
|
2012-11-02 16:10:12 +08:00
|
|
|
return page;
|
2015-05-01 08:00:33 +08:00
|
|
|
}
|
2012-11-02 16:10:12 +08:00
|
|
|
|
2013-08-20 18:13:07 +08:00
|
|
|
/*
|
|
|
|
* A new dentry page is allocated but not able to be written, since its
|
|
|
|
* new inode page couldn't be allocated due to -ENOSPC.
|
|
|
|
* In such the case, its blkaddr can be remained as NEW_ADDR.
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
* see, f2fs_add_link -> f2fs_get_new_data_page ->
|
|
|
|
* f2fs_init_inode_metadata.
|
2013-08-20 18:13:07 +08:00
|
|
|
*/
|
|
|
|
if (dn.data_blkaddr == NEW_ADDR) {
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
zero_user_segment(page, 0, PAGE_SIZE);
|
2016-07-01 09:49:15 +08:00
|
|
|
if (!PageUptodate(page))
|
|
|
|
SetPageUptodate(page);
|
2015-05-01 08:00:33 +08:00
|
|
|
unlock_page(page);
|
2013-08-20 18:13:07 +08:00
|
|
|
return page;
|
|
|
|
}
|
2012-11-02 16:10:12 +08:00
|
|
|
|
2017-09-07 12:04:44 +08:00
|
|
|
err = f2fs_submit_page_read(inode, page, dn.data_blkaddr);
|
2013-03-08 20:29:23 +08:00
|
|
|
if (err)
|
2015-07-16 04:08:21 +08:00
|
|
|
goto put_err;
|
2015-05-01 08:00:33 +08:00
|
|
|
return page;
|
2015-07-16 04:08:21 +08:00
|
|
|
|
|
|
|
put_err:
|
|
|
|
f2fs_put_page(page, 1);
|
|
|
|
return ERR_PTR(err);
|
2015-05-01 08:00:33 +08:00
|
|
|
}
|
|
|
|
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
struct page *f2fs_find_data_page(struct inode *inode, pgoff_t index)
|
2015-05-01 08:00:33 +08:00
|
|
|
{
|
|
|
|
struct address_space *mapping = inode->i_mapping;
|
|
|
|
struct page *page;
|
|
|
|
|
|
|
|
page = find_get_page(mapping, index);
|
|
|
|
if (page && PageUptodate(page))
|
|
|
|
return page;
|
|
|
|
f2fs_put_page(page, 0);
|
|
|
|
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
page = f2fs_get_read_data_page(inode, index, 0, false);
|
2015-05-01 08:00:33 +08:00
|
|
|
if (IS_ERR(page))
|
|
|
|
return page;
|
|
|
|
|
|
|
|
if (PageUptodate(page))
|
|
|
|
return page;
|
|
|
|
|
|
|
|
wait_on_page_locked(page);
|
|
|
|
if (unlikely(!PageUptodate(page))) {
|
|
|
|
f2fs_put_page(page, 0);
|
|
|
|
return ERR_PTR(-EIO);
|
|
|
|
}
|
|
|
|
return page;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If it tries to access a hole, return an error.
|
|
|
|
* Because, the callers, functions in dir.c and GC, should be able to know
|
|
|
|
* whether this page exists or not.
|
|
|
|
*/
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
struct page *f2fs_get_lock_data_page(struct inode *inode, pgoff_t index,
|
2015-10-10 06:11:38 +08:00
|
|
|
bool for_write)
|
2015-05-01 08:00:33 +08:00
|
|
|
{
|
|
|
|
struct address_space *mapping = inode->i_mapping;
|
|
|
|
struct page *page;
|
|
|
|
repeat:
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
page = f2fs_get_read_data_page(inode, index, 0, for_write);
|
2015-05-01 08:00:33 +08:00
|
|
|
if (IS_ERR(page))
|
|
|
|
return page;
|
2013-03-08 20:29:23 +08:00
|
|
|
|
2015-05-01 08:00:33 +08:00
|
|
|
/* wait for read completion */
|
2013-03-08 20:29:23 +08:00
|
|
|
lock_page(page);
|
2013-12-06 14:00:58 +08:00
|
|
|
if (unlikely(page->mapping != mapping)) {
|
2013-04-26 10:55:17 +08:00
|
|
|
f2fs_put_page(page, 1);
|
|
|
|
goto repeat;
|
2012-11-02 16:10:12 +08:00
|
|
|
}
|
2016-07-03 22:05:12 +08:00
|
|
|
if (unlikely(!PageUptodate(page))) {
|
|
|
|
f2fs_put_page(page, 1);
|
|
|
|
return ERR_PTR(-EIO);
|
|
|
|
}
|
2012-11-02 16:10:12 +08:00
|
|
|
return page;
|
|
|
|
}
|
|
|
|
|
2012-11-29 12:28:09 +08:00
|
|
|
/*
|
2012-11-02 16:10:12 +08:00
|
|
|
* Caller ensures that this data page is never allocated.
|
|
|
|
* A new zero-filled data page is allocated in the page cache.
|
f2fs: introduce a new global lock scheme
In the previous version, f2fs uses global locks according to the usage types,
such as directory operations, block allocation, block write, and so on.
Reference the following lock types in f2fs.h.
enum lock_type {
RENAME, /* for renaming operations */
DENTRY_OPS, /* for directory operations */
DATA_WRITE, /* for data write */
DATA_NEW, /* for data allocation */
DATA_TRUNC, /* for data truncate */
NODE_NEW, /* for node allocation */
NODE_TRUNC, /* for node truncate */
NODE_WRITE, /* for node write */
NR_LOCK_TYPE,
};
In that case, we lose the performance under the multi-threading environment,
since every types of operations must be conducted one at a time.
In order to address the problem, let's share the locks globally with a mutex
array regardless of any types.
So, let users grab a mutex and perform their jobs in parallel as much as
possbile.
For this, I propose a new global lock scheme as follows.
0. Data structure
- f2fs_sb_info -> mutex_lock[NR_GLOBAL_LOCKS]
- f2fs_sb_info -> node_write
1. mutex_lock_op(sbi)
- try to get an avaiable lock from the array.
- returns the index of the gottern lock variable.
2. mutex_unlock_op(sbi, index of the lock)
- unlock the given index of the lock.
3. mutex_lock_all(sbi)
- grab all the locks in the array before the checkpoint.
4. mutex_unlock_all(sbi)
- release all the locks in the array after checkpoint.
5. block_operations()
- call mutex_lock_all()
- sync_dirty_dir_inodes()
- grab node_write
- sync_node_pages()
Note that,
the pairs of mutex_lock_op()/mutex_unlock_op() and
mutex_lock_all()/mutex_unlock_all() should be used together.
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-11-22 15:21:29 +08:00
|
|
|
*
|
2013-12-21 18:02:14 +08:00
|
|
|
* Also, caller should grab and release a rwsem by calling f2fs_lock_op() and
|
|
|
|
* f2fs_unlock_op().
|
2015-07-14 18:14:06 +08:00
|
|
|
* Note that, ipage is set only by make_empty_dir, and if any error occur,
|
|
|
|
* ipage should be released by this function.
|
2012-11-02 16:10:12 +08:00
|
|
|
*/
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
struct page *f2fs_get_new_data_page(struct inode *inode,
|
2013-12-27 16:04:17 +08:00
|
|
|
struct page *ipage, pgoff_t index, bool new_i_size)
|
2012-11-02 16:10:12 +08:00
|
|
|
{
|
|
|
|
struct address_space *mapping = inode->i_mapping;
|
|
|
|
struct page *page;
|
|
|
|
struct dnode_of_data dn;
|
|
|
|
int err;
|
2016-01-02 14:03:47 +08:00
|
|
|
|
2015-10-10 06:11:38 +08:00
|
|
|
page = f2fs_grab_cache_page(mapping, index, true);
|
2015-07-14 18:14:06 +08:00
|
|
|
if (!page) {
|
|
|
|
/*
|
|
|
|
* before exiting, we should make sure ipage will be released
|
|
|
|
* if any error occur.
|
|
|
|
*/
|
|
|
|
f2fs_put_page(ipage, 1);
|
2015-04-30 02:18:42 +08:00
|
|
|
return ERR_PTR(-ENOMEM);
|
2015-07-14 18:14:06 +08:00
|
|
|
}
|
2012-11-02 16:10:12 +08:00
|
|
|
|
2013-12-27 16:04:17 +08:00
|
|
|
set_new_dnode(&dn, inode, ipage, NULL, 0);
|
2013-11-10 23:13:18 +08:00
|
|
|
err = f2fs_reserve_block(&dn, index);
|
2015-04-30 02:18:42 +08:00
|
|
|
if (err) {
|
|
|
|
f2fs_put_page(page, 1);
|
2012-11-02 16:10:12 +08:00
|
|
|
return ERR_PTR(err);
|
2013-12-27 16:04:17 +08:00
|
|
|
}
|
2015-04-30 02:18:42 +08:00
|
|
|
if (!ipage)
|
|
|
|
f2fs_put_dnode(&dn);
|
2012-11-02 16:10:12 +08:00
|
|
|
|
|
|
|
if (PageUptodate(page))
|
2015-04-30 02:18:42 +08:00
|
|
|
goto got_it;
|
2012-11-02 16:10:12 +08:00
|
|
|
|
|
|
|
if (dn.data_blkaddr == NEW_ADDR) {
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
zero_user_segment(page, 0, PAGE_SIZE);
|
2016-07-01 09:49:15 +08:00
|
|
|
if (!PageUptodate(page))
|
|
|
|
SetPageUptodate(page);
|
2012-11-02 16:10:12 +08:00
|
|
|
} else {
|
2015-04-24 03:04:33 +08:00
|
|
|
f2fs_put_page(page, 1);
|
2013-12-27 16:04:17 +08:00
|
|
|
|
2016-01-02 14:03:47 +08:00
|
|
|
/* if ipage exists, blkaddr should be NEW_ADDR */
|
|
|
|
f2fs_bug_on(F2FS_I_SB(inode), ipage);
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
page = f2fs_get_lock_data_page(inode, index, true);
|
2015-04-24 03:04:33 +08:00
|
|
|
if (IS_ERR(page))
|
2016-01-02 14:03:47 +08:00
|
|
|
return page;
|
2012-11-02 16:10:12 +08:00
|
|
|
}
|
2015-04-30 02:18:42 +08:00
|
|
|
got_it:
|
2015-09-11 14:43:52 +08:00
|
|
|
if (new_i_size && i_size_read(inode) <
|
2016-05-21 07:32:49 +08:00
|
|
|
((loff_t)(index + 1) << PAGE_SHIFT))
|
2016-05-21 00:22:03 +08:00
|
|
|
f2fs_i_size_write(inode, ((loff_t)(index + 1) << PAGE_SHIFT));
|
2012-11-02 16:10:12 +08:00
|
|
|
return page;
|
|
|
|
}
|
|
|
|
|
2017-11-28 08:23:00 +08:00
|
|
|
static int __allocate_data_block(struct dnode_of_data *dn, int seg_type)
|
2013-12-16 18:04:05 +08:00
|
|
|
{
|
2014-09-03 06:31:18 +08:00
|
|
|
struct f2fs_sb_info *sbi = F2FS_I_SB(dn->inode);
|
2013-12-16 18:04:05 +08:00
|
|
|
struct f2fs_summary sum;
|
|
|
|
struct node_info ni;
|
f2fs: readahead encrypted block during GC
During GC, for each encrypted block, we will read block synchronously
into meta page, and then submit it into current cold data log area.
So this block read model with 4k granularity can make poor performance,
like migrating non-encrypted block, let's readahead encrypted block
as well to improve migration performance.
To implement this, we choose meta page that its index is old block
address of the encrypted block, and readahead ciphertext into this
page, later, if readaheaded page is still updated, we will load its
data into target meta page, and submit the write IO.
Note that for OPU, truncation, deletion, we need to invalid meta
page after we invalid old block address, to make sure we won't load
invalid data from target meta page during encrypted block migration.
for ((i = 0; i < 1000; i++))
do {
xfs_io -f /mnt/f2fs/dir/$i -c "pwrite 0 128k" -c "fsync";
} done
for ((i = 0; i < 1000; i+=2))
do {
rm /mnt/f2fs/dir/$i;
} done
ret = ioctl(fd, F2FS_IOC_GARBAGE_COLLECT, 0);
Before:
gc-6549 [001] d..1 214682.212797: block_rq_insert: 8,32 RA 32768 () 786400 + 64 [gc]
gc-6549 [001] d..1 214682.212802: block_unplug: [gc] 1
gc-6549 [001] .... 214682.213892: block_bio_queue: 8,32 R 67494144 + 8 [gc]
gc-6549 [001] .... 214682.213899: block_getrq: 8,32 R 67494144 + 8 [gc]
gc-6549 [001] .... 214682.213902: block_plug: [gc]
gc-6549 [001] d..1 214682.213905: block_rq_insert: 8,32 R 4096 () 67494144 + 8 [gc]
gc-6549 [001] d..1 214682.213908: block_unplug: [gc] 1
gc-6549 [001] .... 214682.226405: block_bio_queue: 8,32 R 67494152 + 8 [gc]
gc-6549 [001] .... 214682.226412: block_getrq: 8,32 R 67494152 + 8 [gc]
gc-6549 [001] .... 214682.226414: block_plug: [gc]
gc-6549 [001] d..1 214682.226417: block_rq_insert: 8,32 R 4096 () 67494152 + 8 [gc]
gc-6549 [001] d..1 214682.226420: block_unplug: [gc] 1
gc-6549 [001] .... 214682.226904: block_bio_queue: 8,32 R 67494160 + 8 [gc]
gc-6549 [001] .... 214682.226910: block_getrq: 8,32 R 67494160 + 8 [gc]
gc-6549 [001] .... 214682.226911: block_plug: [gc]
gc-6549 [001] d..1 214682.226914: block_rq_insert: 8,32 R 4096 () 67494160 + 8 [gc]
gc-6549 [001] d..1 214682.226916: block_unplug: [gc] 1
After:
gc-5678 [003] .... 214327.025906: block_bio_queue: 8,32 R 67493824 + 8 [gc]
gc-5678 [003] .... 214327.025908: block_bio_backmerge: 8,32 R 67493824 + 8 [gc]
gc-5678 [003] .... 214327.025915: block_bio_queue: 8,32 R 67493832 + 8 [gc]
gc-5678 [003] .... 214327.025917: block_bio_backmerge: 8,32 R 67493832 + 8 [gc]
gc-5678 [003] .... 214327.025923: block_bio_queue: 8,32 R 67493840 + 8 [gc]
gc-5678 [003] .... 214327.025925: block_bio_backmerge: 8,32 R 67493840 + 8 [gc]
gc-5678 [003] .... 214327.025932: block_bio_queue: 8,32 R 67493848 + 8 [gc]
gc-5678 [003] .... 214327.025934: block_bio_backmerge: 8,32 R 67493848 + 8 [gc]
gc-5678 [003] .... 214327.025941: block_bio_queue: 8,32 R 67493856 + 8 [gc]
gc-5678 [003] .... 214327.025943: block_bio_backmerge: 8,32 R 67493856 + 8 [gc]
gc-5678 [003] .... 214327.025953: block_bio_queue: 8,32 R 67493864 + 8 [gc]
gc-5678 [003] .... 214327.025955: block_bio_backmerge: 8,32 R 67493864 + 8 [gc]
gc-5678 [003] .... 214327.025962: block_bio_queue: 8,32 R 67493872 + 8 [gc]
gc-5678 [003] .... 214327.025964: block_bio_backmerge: 8,32 R 67493872 + 8 [gc]
gc-5678 [003] .... 214327.025970: block_bio_queue: 8,32 R 67493880 + 8 [gc]
gc-5678 [003] .... 214327.025972: block_bio_backmerge: 8,32 R 67493880 + 8 [gc]
gc-5678 [003] .... 214327.026000: block_bio_queue: 8,32 WS 34123776 + 2048 [gc]
gc-5678 [003] .... 214327.026019: block_getrq: 8,32 WS 34123776 + 2048 [gc]
gc-5678 [003] d..1 214327.026021: block_rq_insert: 8,32 R 131072 () 67493632 + 256 [gc]
gc-5678 [003] d..1 214327.026023: block_unplug: [gc] 1
gc-5678 [003] d..1 214327.026026: block_rq_issue: 8,32 R 131072 () 67493632 + 256 [gc]
gc-5678 [003] .... 214327.026046: block_plug: [gc]
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-08-14 22:37:25 +08:00
|
|
|
block_t old_blkaddr;
|
2016-05-09 19:56:30 +08:00
|
|
|
blkcnt_t count = 1;
|
2017-07-09 00:13:07 +08:00
|
|
|
int err;
|
2013-12-16 18:04:05 +08:00
|
|
|
|
2016-05-21 01:13:22 +08:00
|
|
|
if (unlikely(is_inode_flag_set(dn->inode, FI_NO_ALLOC)))
|
2013-12-16 18:04:05 +08:00
|
|
|
return -EPERM;
|
f2fs: preallocate fallocated blocks for direct IO
Normally, due to DIO_SKIP_HOLES flag is set by default, blockdev_direct_IO in
f2fs_direct_IO tries to skip DIO in holes when writing inside i_size, this
makes us falling back to buffered IO which shows lower performance.
So in commit 59b802e5a453 ("f2fs: allocate data blocks in advance for
f2fs_direct_IO"), we improve perfromance by allocating data blocks in advance
if we meet holes no matter in i_size or not, since with it we can avoid falling
back to buffered IO.
But we forget to consider for unwritten fallocated block in this commit.
This patch tries to fix it for fallocate case, this helps to improve
performance.
Test result:
Storage info: sandisk ultra 64G micro sd card.
touch /mnt/f2fs/file
truncate -s 67108864 /mnt/f2fs/file
fallocate -o 0 -l 67108864 /mnt/f2fs/file
time dd if=/dev/zero of=/mnt/f2fs/file bs=1M count=64 conv=notrunc oflag=direct
Time before applying the patch:
67108864 bytes (67 MB) copied, 36.16 s, 1.9 MB/s
real 0m36.162s
user 0m0.000s
sys 0m0.180s
Time after applying the patch:
67108864 bytes (67 MB) copied, 27.7776 s, 2.4 MB/s
real 0m27.780s
user 0m0.000s
sys 0m0.036s
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2015-03-23 10:33:37 +08:00
|
|
|
|
2018-07-17 00:02:17 +08:00
|
|
|
err = f2fs_get_node_info(sbi, dn->nid, &ni);
|
|
|
|
if (err)
|
|
|
|
return err;
|
|
|
|
|
2017-07-19 00:19:06 +08:00
|
|
|
dn->data_blkaddr = datablock_addr(dn->inode,
|
|
|
|
dn->node_page, dn->ofs_in_node);
|
2018-09-27 18:34:52 +08:00
|
|
|
if (dn->data_blkaddr != NULL_ADDR)
|
f2fs: preallocate fallocated blocks for direct IO
Normally, due to DIO_SKIP_HOLES flag is set by default, blockdev_direct_IO in
f2fs_direct_IO tries to skip DIO in holes when writing inside i_size, this
makes us falling back to buffered IO which shows lower performance.
So in commit 59b802e5a453 ("f2fs: allocate data blocks in advance for
f2fs_direct_IO"), we improve perfromance by allocating data blocks in advance
if we meet holes no matter in i_size or not, since with it we can avoid falling
back to buffered IO.
But we forget to consider for unwritten fallocated block in this commit.
This patch tries to fix it for fallocate case, this helps to improve
performance.
Test result:
Storage info: sandisk ultra 64G micro sd card.
touch /mnt/f2fs/file
truncate -s 67108864 /mnt/f2fs/file
fallocate -o 0 -l 67108864 /mnt/f2fs/file
time dd if=/dev/zero of=/mnt/f2fs/file bs=1M count=64 conv=notrunc oflag=direct
Time before applying the patch:
67108864 bytes (67 MB) copied, 36.16 s, 1.9 MB/s
real 0m36.162s
user 0m0.000s
sys 0m0.180s
Time after applying the patch:
67108864 bytes (67 MB) copied, 27.7776 s, 2.4 MB/s
real 0m27.780s
user 0m0.000s
sys 0m0.036s
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2015-03-23 10:33:37 +08:00
|
|
|
goto alloc;
|
|
|
|
|
2017-07-09 00:13:07 +08:00
|
|
|
if (unlikely((err = inc_valid_block_count(sbi, dn->inode, &count))))
|
|
|
|
return err;
|
2013-12-16 18:04:05 +08:00
|
|
|
|
f2fs: preallocate fallocated blocks for direct IO
Normally, due to DIO_SKIP_HOLES flag is set by default, blockdev_direct_IO in
f2fs_direct_IO tries to skip DIO in holes when writing inside i_size, this
makes us falling back to buffered IO which shows lower performance.
So in commit 59b802e5a453 ("f2fs: allocate data blocks in advance for
f2fs_direct_IO"), we improve perfromance by allocating data blocks in advance
if we meet holes no matter in i_size or not, since with it we can avoid falling
back to buffered IO.
But we forget to consider for unwritten fallocated block in this commit.
This patch tries to fix it for fallocate case, this helps to improve
performance.
Test result:
Storage info: sandisk ultra 64G micro sd card.
touch /mnt/f2fs/file
truncate -s 67108864 /mnt/f2fs/file
fallocate -o 0 -l 67108864 /mnt/f2fs/file
time dd if=/dev/zero of=/mnt/f2fs/file bs=1M count=64 conv=notrunc oflag=direct
Time before applying the patch:
67108864 bytes (67 MB) copied, 36.16 s, 1.9 MB/s
real 0m36.162s
user 0m0.000s
sys 0m0.180s
Time after applying the patch:
67108864 bytes (67 MB) copied, 27.7776 s, 2.4 MB/s
real 0m27.780s
user 0m0.000s
sys 0m0.036s
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2015-03-23 10:33:37 +08:00
|
|
|
alloc:
|
2013-12-16 18:04:05 +08:00
|
|
|
set_summary(&sum, dn->nid, dn->ofs_in_node, ni.version);
|
f2fs: readahead encrypted block during GC
During GC, for each encrypted block, we will read block synchronously
into meta page, and then submit it into current cold data log area.
So this block read model with 4k granularity can make poor performance,
like migrating non-encrypted block, let's readahead encrypted block
as well to improve migration performance.
To implement this, we choose meta page that its index is old block
address of the encrypted block, and readahead ciphertext into this
page, later, if readaheaded page is still updated, we will load its
data into target meta page, and submit the write IO.
Note that for OPU, truncation, deletion, we need to invalid meta
page after we invalid old block address, to make sure we won't load
invalid data from target meta page during encrypted block migration.
for ((i = 0; i < 1000; i++))
do {
xfs_io -f /mnt/f2fs/dir/$i -c "pwrite 0 128k" -c "fsync";
} done
for ((i = 0; i < 1000; i+=2))
do {
rm /mnt/f2fs/dir/$i;
} done
ret = ioctl(fd, F2FS_IOC_GARBAGE_COLLECT, 0);
Before:
gc-6549 [001] d..1 214682.212797: block_rq_insert: 8,32 RA 32768 () 786400 + 64 [gc]
gc-6549 [001] d..1 214682.212802: block_unplug: [gc] 1
gc-6549 [001] .... 214682.213892: block_bio_queue: 8,32 R 67494144 + 8 [gc]
gc-6549 [001] .... 214682.213899: block_getrq: 8,32 R 67494144 + 8 [gc]
gc-6549 [001] .... 214682.213902: block_plug: [gc]
gc-6549 [001] d..1 214682.213905: block_rq_insert: 8,32 R 4096 () 67494144 + 8 [gc]
gc-6549 [001] d..1 214682.213908: block_unplug: [gc] 1
gc-6549 [001] .... 214682.226405: block_bio_queue: 8,32 R 67494152 + 8 [gc]
gc-6549 [001] .... 214682.226412: block_getrq: 8,32 R 67494152 + 8 [gc]
gc-6549 [001] .... 214682.226414: block_plug: [gc]
gc-6549 [001] d..1 214682.226417: block_rq_insert: 8,32 R 4096 () 67494152 + 8 [gc]
gc-6549 [001] d..1 214682.226420: block_unplug: [gc] 1
gc-6549 [001] .... 214682.226904: block_bio_queue: 8,32 R 67494160 + 8 [gc]
gc-6549 [001] .... 214682.226910: block_getrq: 8,32 R 67494160 + 8 [gc]
gc-6549 [001] .... 214682.226911: block_plug: [gc]
gc-6549 [001] d..1 214682.226914: block_rq_insert: 8,32 R 4096 () 67494160 + 8 [gc]
gc-6549 [001] d..1 214682.226916: block_unplug: [gc] 1
After:
gc-5678 [003] .... 214327.025906: block_bio_queue: 8,32 R 67493824 + 8 [gc]
gc-5678 [003] .... 214327.025908: block_bio_backmerge: 8,32 R 67493824 + 8 [gc]
gc-5678 [003] .... 214327.025915: block_bio_queue: 8,32 R 67493832 + 8 [gc]
gc-5678 [003] .... 214327.025917: block_bio_backmerge: 8,32 R 67493832 + 8 [gc]
gc-5678 [003] .... 214327.025923: block_bio_queue: 8,32 R 67493840 + 8 [gc]
gc-5678 [003] .... 214327.025925: block_bio_backmerge: 8,32 R 67493840 + 8 [gc]
gc-5678 [003] .... 214327.025932: block_bio_queue: 8,32 R 67493848 + 8 [gc]
gc-5678 [003] .... 214327.025934: block_bio_backmerge: 8,32 R 67493848 + 8 [gc]
gc-5678 [003] .... 214327.025941: block_bio_queue: 8,32 R 67493856 + 8 [gc]
gc-5678 [003] .... 214327.025943: block_bio_backmerge: 8,32 R 67493856 + 8 [gc]
gc-5678 [003] .... 214327.025953: block_bio_queue: 8,32 R 67493864 + 8 [gc]
gc-5678 [003] .... 214327.025955: block_bio_backmerge: 8,32 R 67493864 + 8 [gc]
gc-5678 [003] .... 214327.025962: block_bio_queue: 8,32 R 67493872 + 8 [gc]
gc-5678 [003] .... 214327.025964: block_bio_backmerge: 8,32 R 67493872 + 8 [gc]
gc-5678 [003] .... 214327.025970: block_bio_queue: 8,32 R 67493880 + 8 [gc]
gc-5678 [003] .... 214327.025972: block_bio_backmerge: 8,32 R 67493880 + 8 [gc]
gc-5678 [003] .... 214327.026000: block_bio_queue: 8,32 WS 34123776 + 2048 [gc]
gc-5678 [003] .... 214327.026019: block_getrq: 8,32 WS 34123776 + 2048 [gc]
gc-5678 [003] d..1 214327.026021: block_rq_insert: 8,32 R 131072 () 67493632 + 256 [gc]
gc-5678 [003] d..1 214327.026023: block_unplug: [gc] 1
gc-5678 [003] d..1 214327.026026: block_rq_issue: 8,32 R 131072 () 67493632 + 256 [gc]
gc-5678 [003] .... 214327.026046: block_plug: [gc]
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-08-14 22:37:25 +08:00
|
|
|
old_blkaddr = dn->data_blkaddr;
|
|
|
|
f2fs_allocate_data_block(sbi, NULL, old_blkaddr, &dn->data_blkaddr,
|
2017-11-28 08:23:00 +08:00
|
|
|
&sum, seg_type, NULL, false);
|
f2fs: readahead encrypted block during GC
During GC, for each encrypted block, we will read block synchronously
into meta page, and then submit it into current cold data log area.
So this block read model with 4k granularity can make poor performance,
like migrating non-encrypted block, let's readahead encrypted block
as well to improve migration performance.
To implement this, we choose meta page that its index is old block
address of the encrypted block, and readahead ciphertext into this
page, later, if readaheaded page is still updated, we will load its
data into target meta page, and submit the write IO.
Note that for OPU, truncation, deletion, we need to invalid meta
page after we invalid old block address, to make sure we won't load
invalid data from target meta page during encrypted block migration.
for ((i = 0; i < 1000; i++))
do {
xfs_io -f /mnt/f2fs/dir/$i -c "pwrite 0 128k" -c "fsync";
} done
for ((i = 0; i < 1000; i+=2))
do {
rm /mnt/f2fs/dir/$i;
} done
ret = ioctl(fd, F2FS_IOC_GARBAGE_COLLECT, 0);
Before:
gc-6549 [001] d..1 214682.212797: block_rq_insert: 8,32 RA 32768 () 786400 + 64 [gc]
gc-6549 [001] d..1 214682.212802: block_unplug: [gc] 1
gc-6549 [001] .... 214682.213892: block_bio_queue: 8,32 R 67494144 + 8 [gc]
gc-6549 [001] .... 214682.213899: block_getrq: 8,32 R 67494144 + 8 [gc]
gc-6549 [001] .... 214682.213902: block_plug: [gc]
gc-6549 [001] d..1 214682.213905: block_rq_insert: 8,32 R 4096 () 67494144 + 8 [gc]
gc-6549 [001] d..1 214682.213908: block_unplug: [gc] 1
gc-6549 [001] .... 214682.226405: block_bio_queue: 8,32 R 67494152 + 8 [gc]
gc-6549 [001] .... 214682.226412: block_getrq: 8,32 R 67494152 + 8 [gc]
gc-6549 [001] .... 214682.226414: block_plug: [gc]
gc-6549 [001] d..1 214682.226417: block_rq_insert: 8,32 R 4096 () 67494152 + 8 [gc]
gc-6549 [001] d..1 214682.226420: block_unplug: [gc] 1
gc-6549 [001] .... 214682.226904: block_bio_queue: 8,32 R 67494160 + 8 [gc]
gc-6549 [001] .... 214682.226910: block_getrq: 8,32 R 67494160 + 8 [gc]
gc-6549 [001] .... 214682.226911: block_plug: [gc]
gc-6549 [001] d..1 214682.226914: block_rq_insert: 8,32 R 4096 () 67494160 + 8 [gc]
gc-6549 [001] d..1 214682.226916: block_unplug: [gc] 1
After:
gc-5678 [003] .... 214327.025906: block_bio_queue: 8,32 R 67493824 + 8 [gc]
gc-5678 [003] .... 214327.025908: block_bio_backmerge: 8,32 R 67493824 + 8 [gc]
gc-5678 [003] .... 214327.025915: block_bio_queue: 8,32 R 67493832 + 8 [gc]
gc-5678 [003] .... 214327.025917: block_bio_backmerge: 8,32 R 67493832 + 8 [gc]
gc-5678 [003] .... 214327.025923: block_bio_queue: 8,32 R 67493840 + 8 [gc]
gc-5678 [003] .... 214327.025925: block_bio_backmerge: 8,32 R 67493840 + 8 [gc]
gc-5678 [003] .... 214327.025932: block_bio_queue: 8,32 R 67493848 + 8 [gc]
gc-5678 [003] .... 214327.025934: block_bio_backmerge: 8,32 R 67493848 + 8 [gc]
gc-5678 [003] .... 214327.025941: block_bio_queue: 8,32 R 67493856 + 8 [gc]
gc-5678 [003] .... 214327.025943: block_bio_backmerge: 8,32 R 67493856 + 8 [gc]
gc-5678 [003] .... 214327.025953: block_bio_queue: 8,32 R 67493864 + 8 [gc]
gc-5678 [003] .... 214327.025955: block_bio_backmerge: 8,32 R 67493864 + 8 [gc]
gc-5678 [003] .... 214327.025962: block_bio_queue: 8,32 R 67493872 + 8 [gc]
gc-5678 [003] .... 214327.025964: block_bio_backmerge: 8,32 R 67493872 + 8 [gc]
gc-5678 [003] .... 214327.025970: block_bio_queue: 8,32 R 67493880 + 8 [gc]
gc-5678 [003] .... 214327.025972: block_bio_backmerge: 8,32 R 67493880 + 8 [gc]
gc-5678 [003] .... 214327.026000: block_bio_queue: 8,32 WS 34123776 + 2048 [gc]
gc-5678 [003] .... 214327.026019: block_getrq: 8,32 WS 34123776 + 2048 [gc]
gc-5678 [003] d..1 214327.026021: block_rq_insert: 8,32 R 131072 () 67493632 + 256 [gc]
gc-5678 [003] d..1 214327.026023: block_unplug: [gc] 1
gc-5678 [003] d..1 214327.026026: block_rq_issue: 8,32 R 131072 () 67493632 + 256 [gc]
gc-5678 [003] .... 214327.026046: block_plug: [gc]
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-08-14 22:37:25 +08:00
|
|
|
if (GET_SEGNO(sbi, old_blkaddr) != NULL_SEGNO)
|
|
|
|
invalidate_mapping_pages(META_MAPPING(sbi),
|
|
|
|
old_blkaddr, old_blkaddr);
|
2019-08-28 17:33:35 +08:00
|
|
|
f2fs_update_data_blkaddr(dn, dn->data_blkaddr);
|
2013-12-16 18:04:05 +08:00
|
|
|
|
2018-09-20 06:28:40 +08:00
|
|
|
/*
|
|
|
|
* i_size will be updated by direct_IO. Otherwise, we'll get stale
|
|
|
|
* data from unwritten block via dio_read.
|
|
|
|
*/
|
2013-12-16 18:04:05 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2016-11-12 08:31:56 +08:00
|
|
|
int f2fs_preallocate_blocks(struct kiocb *iocb, struct iov_iter *from)
|
2015-02-10 04:09:53 +08:00
|
|
|
{
|
2016-02-04 05:09:09 +08:00
|
|
|
struct inode *inode = file_inode(iocb->ki_filp);
|
2016-01-26 15:38:29 +08:00
|
|
|
struct f2fs_map_blocks map;
|
2018-01-03 17:30:19 +08:00
|
|
|
int flag;
|
2016-11-12 08:31:56 +08:00
|
|
|
int err = 0;
|
2018-01-03 17:30:19 +08:00
|
|
|
bool direct_io = iocb->ki_flags & IOCB_DIRECT;
|
2015-02-10 04:09:53 +08:00
|
|
|
|
2016-05-07 23:52:57 +08:00
|
|
|
map.m_lblk = F2FS_BLK_ALIGN(iocb->ki_pos);
|
2016-08-20 15:12:01 +08:00
|
|
|
map.m_len = F2FS_BYTES_TO_BLK(iocb->ki_pos + iov_iter_count(from));
|
|
|
|
if (map.m_len > map.m_lblk)
|
|
|
|
map.m_len -= map.m_lblk;
|
|
|
|
else
|
|
|
|
map.m_len = 0;
|
|
|
|
|
2016-01-26 15:42:58 +08:00
|
|
|
map.m_next_pgofs = NULL;
|
2018-01-11 14:42:30 +08:00
|
|
|
map.m_next_extent = NULL;
|
2017-11-28 08:23:00 +08:00
|
|
|
map.m_seg_type = NO_CHECK_TYPE;
|
2018-11-13 14:33:45 +08:00
|
|
|
map.m_may_create = true;
|
2015-12-23 05:23:35 +08:00
|
|
|
|
2018-01-03 17:30:19 +08:00
|
|
|
if (direct_io) {
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
map.m_seg_type = f2fs_rw_hint_to_seg_type(iocb->ki_hint);
|
2018-09-27 18:34:52 +08:00
|
|
|
flag = f2fs_force_buffered_io(inode, iocb, from) ?
|
2018-01-03 17:30:19 +08:00
|
|
|
F2FS_GET_BLOCK_PRE_AIO :
|
|
|
|
F2FS_GET_BLOCK_PRE_DIO;
|
|
|
|
goto map_blocks;
|
2017-11-28 08:23:00 +08:00
|
|
|
}
|
2017-07-19 00:19:05 +08:00
|
|
|
if (iocb->ki_pos + iov_iter_count(from) > MAX_INLINE_DATA(inode)) {
|
2016-11-12 08:31:56 +08:00
|
|
|
err = f2fs_convert_inline_inode(inode);
|
|
|
|
if (err)
|
|
|
|
return err;
|
2016-02-04 05:09:09 +08:00
|
|
|
}
|
2018-01-03 17:30:19 +08:00
|
|
|
if (f2fs_has_inline_data(inode))
|
2017-11-22 18:23:39 +08:00
|
|
|
return err;
|
2018-01-03 17:30:19 +08:00
|
|
|
|
|
|
|
flag = F2FS_GET_BLOCK_PRE_AIO;
|
|
|
|
|
|
|
|
map_blocks:
|
|
|
|
err = f2fs_map_blocks(inode, &map, 1, flag);
|
|
|
|
if (map.m_len > 0 && err == -ENOSPC) {
|
|
|
|
if (!direct_io)
|
|
|
|
set_inode_flag(inode, FI_NO_PREALLOC);
|
|
|
|
err = 0;
|
2017-11-22 18:23:39 +08:00
|
|
|
}
|
2016-11-12 08:31:56 +08:00
|
|
|
return err;
|
2015-02-10 04:09:53 +08:00
|
|
|
}
|
|
|
|
|
2018-09-27 18:33:18 +08:00
|
|
|
void __do_map_lock(struct f2fs_sb_info *sbi, int flag, bool lock)
|
2017-03-13 20:22:18 +08:00
|
|
|
{
|
|
|
|
if (flag == F2FS_GET_BLOCK_PRE_AIO) {
|
|
|
|
if (lock)
|
|
|
|
down_read(&sbi->node_change);
|
|
|
|
else
|
|
|
|
up_read(&sbi->node_change);
|
|
|
|
} else {
|
|
|
|
if (lock)
|
|
|
|
f2fs_lock_op(sbi);
|
|
|
|
else
|
|
|
|
f2fs_unlock_op(sbi);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2012-11-29 12:28:09 +08:00
|
|
|
/*
|
2015-04-07 10:55:34 +08:00
|
|
|
* f2fs_map_blocks() now supported readahead/bmap/rw direct_IO with
|
|
|
|
* f2fs_map_blocks structure.
|
2013-12-21 18:02:14 +08:00
|
|
|
* If original data blocks are allocated, then give them to blockdev.
|
|
|
|
* Otherwise,
|
|
|
|
* a. preallocate requested block addresses
|
|
|
|
* b. do not use extent cache for better performance
|
|
|
|
* c. give the block addresses to blockdev
|
2012-11-02 16:10:12 +08:00
|
|
|
*/
|
2015-10-27 09:53:45 +08:00
|
|
|
int f2fs_map_blocks(struct inode *inode, struct f2fs_map_blocks *map,
|
2015-08-19 19:11:19 +08:00
|
|
|
int create, int flag)
|
2012-11-02 16:10:12 +08:00
|
|
|
{
|
2015-04-07 10:55:34 +08:00
|
|
|
unsigned int maxblocks = map->m_len;
|
2012-11-02 16:10:12 +08:00
|
|
|
struct dnode_of_data dn;
|
2015-09-21 20:17:52 +08:00
|
|
|
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
|
2018-11-13 14:33:45 +08:00
|
|
|
int mode = map->m_may_create ? ALLOC_NODE : LOOKUP_NODE;
|
2016-05-09 19:56:30 +08:00
|
|
|
pgoff_t pgofs, end_offset, end;
|
2013-12-16 18:04:05 +08:00
|
|
|
int err = 0, ofs = 1;
|
2016-05-09 19:56:30 +08:00
|
|
|
unsigned int ofs_in_node, last_ofs_in_node;
|
|
|
|
blkcnt_t prealloc;
|
2017-02-23 17:18:05 +08:00
|
|
|
struct extent_info ei = {0,0,0};
|
2015-12-17 13:20:59 +08:00
|
|
|
block_t blkaddr;
|
2018-01-11 14:42:30 +08:00
|
|
|
unsigned int start_pgofs;
|
2012-11-02 16:10:12 +08:00
|
|
|
|
2016-08-20 15:12:01 +08:00
|
|
|
if (!maxblocks)
|
|
|
|
return 0;
|
|
|
|
|
2015-04-07 10:55:34 +08:00
|
|
|
map->m_len = 0;
|
|
|
|
map->m_flags = 0;
|
|
|
|
|
|
|
|
/* it only supports block size == page size */
|
|
|
|
pgofs = (pgoff_t)map->m_lblk;
|
2016-05-09 19:56:30 +08:00
|
|
|
end = pgofs + maxblocks;
|
2012-11-02 16:10:12 +08:00
|
|
|
|
2016-02-04 05:49:44 +08:00
|
|
|
if (!create && f2fs_lookup_extent_cache(inode, pgofs, &ei)) {
|
2018-11-20 04:29:35 +08:00
|
|
|
if (test_opt(sbi, LFS) && flag == F2FS_GET_BLOCK_DIO &&
|
|
|
|
map->m_may_create)
|
|
|
|
goto next_dnode;
|
|
|
|
|
2015-04-07 10:55:34 +08:00
|
|
|
map->m_pblk = ei.blk + pgofs - ei.fofs;
|
|
|
|
map->m_len = min((pgoff_t)maxblocks, ei.fofs + ei.len - pgofs);
|
|
|
|
map->m_flags = F2FS_MAP_MAPPED;
|
2018-01-11 14:42:30 +08:00
|
|
|
if (map->m_next_extent)
|
|
|
|
*map->m_next_extent = pgofs + map->m_len;
|
2018-10-10 13:26:22 +08:00
|
|
|
|
|
|
|
/* for hardware encryption, but to avoid potential issue in future */
|
|
|
|
if (flag == F2FS_GET_BLOCK_DIO)
|
|
|
|
f2fs_wait_on_block_writeback_range(inode,
|
|
|
|
map->m_pblk, map->m_len);
|
2013-12-16 18:04:05 +08:00
|
|
|
goto out;
|
2015-02-05 17:50:30 +08:00
|
|
|
}
|
2013-12-16 18:04:05 +08:00
|
|
|
|
2016-01-26 15:37:38 +08:00
|
|
|
next_dnode:
|
2018-11-13 14:33:45 +08:00
|
|
|
if (map->m_may_create)
|
2017-03-13 20:22:18 +08:00
|
|
|
__do_map_lock(sbi, flag, true);
|
2012-11-02 16:10:12 +08:00
|
|
|
|
|
|
|
/* When reading holes, we need its node page */
|
|
|
|
set_new_dnode(&dn, inode, NULL, NULL, 0);
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
err = f2fs_get_dnode_of_data(&dn, pgofs, mode);
|
2013-12-26 15:55:22 +08:00
|
|
|
if (err) {
|
2016-05-05 19:13:02 +08:00
|
|
|
if (flag == F2FS_GET_BLOCK_BMAP)
|
|
|
|
map->m_pblk = 0;
|
2024-06-12 13:13:20 +08:00
|
|
|
|
2016-01-26 15:42:58 +08:00
|
|
|
if (err == -ENOENT) {
|
2024-06-12 13:13:20 +08:00
|
|
|
/*
|
|
|
|
* There is one exceptional case that read_node_page()
|
|
|
|
* may return -ENOENT due to filesystem has been
|
|
|
|
* shutdown or cp_error, so force to convert error
|
|
|
|
* number to EIO for such case.
|
|
|
|
*/
|
|
|
|
if (map->m_may_create &&
|
|
|
|
(is_sbi_flag_set(sbi, SBI_IS_SHUTDOWN) ||
|
|
|
|
f2fs_cp_error(sbi))) {
|
|
|
|
err = -EIO;
|
|
|
|
goto unlock_out;
|
|
|
|
}
|
|
|
|
|
2013-12-16 18:04:05 +08:00
|
|
|
err = 0;
|
2016-01-26 15:42:58 +08:00
|
|
|
if (map->m_next_pgofs)
|
|
|
|
*map->m_next_pgofs =
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
f2fs_get_next_page_offset(&dn, pgofs);
|
2018-01-11 14:42:30 +08:00
|
|
|
if (map->m_next_extent)
|
|
|
|
*map->m_next_extent =
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
f2fs_get_next_page_offset(&dn, pgofs);
|
2016-01-26 15:42:58 +08:00
|
|
|
}
|
2013-12-16 18:04:05 +08:00
|
|
|
goto unlock_out;
|
2013-04-23 15:38:02 +08:00
|
|
|
}
|
2015-09-18 16:51:51 +08:00
|
|
|
|
2018-01-11 14:42:30 +08:00
|
|
|
start_pgofs = pgofs;
|
2016-05-09 19:56:30 +08:00
|
|
|
prealloc = 0;
|
2016-11-02 21:52:15 +08:00
|
|
|
last_ofs_in_node = ofs_in_node = dn.ofs_in_node;
|
2016-01-26 15:39:35 +08:00
|
|
|
end_offset = ADDRS_PER_PAGE(dn.node_page, inode);
|
2016-01-26 15:37:38 +08:00
|
|
|
|
|
|
|
next_block:
|
2017-07-19 00:19:06 +08:00
|
|
|
blkaddr = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node);
|
2016-01-26 15:37:38 +08:00
|
|
|
|
f2fs: fix to do sanity check with block address in main area
This patch add to do sanity check with below field:
- cp_pack_total_block_count
- blkaddr of data/node
- extent info
- Overview
BUG() in verify_block_addr() when writing to a corrupted f2fs image
- Reproduce (4.18 upstream kernel)
- POC (poc.c)
static void activity(char *mpoint) {
char *foo_bar_baz;
int err;
static int buf[8192];
memset(buf, 0, sizeof(buf));
err = asprintf(&foo_bar_baz, "%s/foo/bar/baz", mpoint);
int fd = open(foo_bar_baz, O_RDWR | O_TRUNC, 0777);
if (fd >= 0) {
write(fd, (char *)buf, sizeof(buf));
fdatasync(fd);
close(fd);
}
}
int main(int argc, char *argv[]) {
activity(argv[1]);
return 0;
}
- Kernel message
[ 689.349473] F2FS-fs (loop0): Mounted with checkpoint version = 3
[ 699.728662] WARNING: CPU: 0 PID: 1309 at fs/f2fs/segment.c:2860 f2fs_inplace_write_data+0x232/0x240
[ 699.728670] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy
[ 699.729056] CPU: 0 PID: 1309 Comm: a.out Not tainted 4.18.0-rc1+ #4
[ 699.729064] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.729074] RIP: 0010:f2fs_inplace_write_data+0x232/0x240
[ 699.729076] Code: ff e9 cf fe ff ff 49 8d 7d 10 e8 39 45 ad ff 4d 8b 7d 10 be 04 00 00 00 49 8d 7f 48 e8 07 49 ad ff 45 8b 7f 48 e9 fb fe ff ff <0f> 0b f0 41 80 4d 48 04 e9 65 fe ff ff 90 66 66 66 66 90 55 48 8d
[ 699.729130] RSP: 0018:ffff8801f43af568 EFLAGS: 00010202
[ 699.729139] RAX: 000000000000003f RBX: ffff8801f43af7b8 RCX: ffffffffb88c9113
[ 699.729142] RDX: 0000000000000003 RSI: dffffc0000000000 RDI: ffff8802024e5540
[ 699.729144] RBP: ffff8801f43af590 R08: 0000000000000009 R09: ffffffffffffffe8
[ 699.729147] R10: 0000000000000001 R11: ffffed0039b0596a R12: ffff8802024e5540
[ 699.729149] R13: ffff8801f0335500 R14: ffff8801e3e7a700 R15: ffff8801e1ee4450
[ 699.729154] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.729156] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.729159] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.729171] Call Trace:
[ 699.729192] f2fs_do_write_data_page+0x2e2/0xe00
[ 699.729203] ? f2fs_should_update_outplace+0xd0/0xd0
[ 699.729238] ? memcg_drain_all_list_lrus+0x280/0x280
[ 699.729269] ? __radix_tree_replace+0xa3/0x120
[ 699.729276] __write_data_page+0x5c7/0xe30
[ 699.729291] ? kasan_check_read+0x11/0x20
[ 699.729310] ? page_mapped+0x8a/0x110
[ 699.729321] ? page_mkclean+0xe9/0x160
[ 699.729327] ? f2fs_do_write_data_page+0xe00/0xe00
[ 699.729331] ? invalid_page_referenced_vma+0x130/0x130
[ 699.729345] ? clear_page_dirty_for_io+0x332/0x450
[ 699.729351] f2fs_write_cache_pages+0x4ca/0x860
[ 699.729358] ? __write_data_page+0xe30/0xe30
[ 699.729374] ? percpu_counter_add_batch+0x22/0xa0
[ 699.729380] ? kasan_check_write+0x14/0x20
[ 699.729391] ? _raw_spin_lock+0x17/0x40
[ 699.729403] ? f2fs_mark_inode_dirty_sync.part.18+0x16/0x30
[ 699.729413] ? iov_iter_advance+0x113/0x640
[ 699.729418] ? f2fs_write_end+0x133/0x2e0
[ 699.729423] ? balance_dirty_pages_ratelimited+0x239/0x640
[ 699.729428] f2fs_write_data_pages+0x329/0x520
[ 699.729433] ? generic_perform_write+0x250/0x320
[ 699.729438] ? f2fs_write_cache_pages+0x860/0x860
[ 699.729454] ? current_time+0x110/0x110
[ 699.729459] ? f2fs_preallocate_blocks+0x1ef/0x370
[ 699.729464] do_writepages+0x37/0xb0
[ 699.729468] ? f2fs_write_cache_pages+0x860/0x860
[ 699.729472] ? do_writepages+0x37/0xb0
[ 699.729478] __filemap_fdatawrite_range+0x19a/0x1f0
[ 699.729483] ? delete_from_page_cache_batch+0x4e0/0x4e0
[ 699.729496] ? __vfs_write+0x2b2/0x410
[ 699.729501] file_write_and_wait_range+0x66/0xb0
[ 699.729506] f2fs_do_sync_file+0x1f9/0xd90
[ 699.729511] ? truncate_partial_data_page+0x290/0x290
[ 699.729521] ? __sb_end_write+0x30/0x50
[ 699.729526] ? vfs_write+0x20f/0x260
[ 699.729530] f2fs_sync_file+0x9a/0xb0
[ 699.729534] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.729548] vfs_fsync_range+0x68/0x100
[ 699.729554] ? __fget_light+0xc9/0xe0
[ 699.729558] do_fsync+0x3d/0x70
[ 699.729562] __x64_sys_fdatasync+0x24/0x30
[ 699.729585] do_syscall_64+0x78/0x170
[ 699.729595] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 699.729613] RIP: 0033:0x7f9bf930d800
[ 699.729615] Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 83 3d 49 bf 2c 00 00 75 10 b8 4b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 be 78 01 00 48 89 04 24
[ 699.729668] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.729673] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.729675] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.729678] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.729680] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.729683] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.729687] ---[ end trace 4ce02f25ff7d3df5 ]---
[ 699.729782] ------------[ cut here ]------------
[ 699.729785] kernel BUG at fs/f2fs/segment.h:654!
[ 699.731055] invalid opcode: 0000 [#1] SMP KASAN PTI
[ 699.732104] CPU: 0 PID: 1309 Comm: a.out Tainted: G W 4.18.0-rc1+ #4
[ 699.733684] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.735611] RIP: 0010:f2fs_submit_page_bio+0x29b/0x730
[ 699.736649] Code: 54 49 8d bd 18 04 00 00 e8 b2 59 af ff 41 8b 8d 18 04 00 00 8b 45 b8 41 d3 e6 44 01 f0 4c 8d 73 14 41 39 c7 0f 82 37 fe ff ff <0f> 0b 65 8b 05 2c 04 77 47 89 c0 48 0f a3 05 52 c1 d5 01 0f 92 c0
[ 699.740524] RSP: 0018:ffff8801f43af508 EFLAGS: 00010283
[ 699.741573] RAX: 0000000000000000 RBX: ffff8801f43af7b8 RCX: ffffffffb88a7cef
[ 699.743006] RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff8801e3e7a64c
[ 699.744426] RBP: ffff8801f43af558 R08: ffffed003e066b55 R09: ffffed003e066b55
[ 699.745833] R10: 0000000000000001 R11: ffffed003e066b54 R12: ffffea0007876940
[ 699.747256] R13: ffff8801f0335500 R14: ffff8801e3e7a600 R15: 0000000000000001
[ 699.748683] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.750293] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.751462] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.752874] Call Trace:
[ 699.753386] ? f2fs_inplace_write_data+0x93/0x240
[ 699.754341] f2fs_inplace_write_data+0xd2/0x240
[ 699.755271] f2fs_do_write_data_page+0x2e2/0xe00
[ 699.756214] ? f2fs_should_update_outplace+0xd0/0xd0
[ 699.757215] ? memcg_drain_all_list_lrus+0x280/0x280
[ 699.758209] ? __radix_tree_replace+0xa3/0x120
[ 699.759164] __write_data_page+0x5c7/0xe30
[ 699.760002] ? kasan_check_read+0x11/0x20
[ 699.760823] ? page_mapped+0x8a/0x110
[ 699.761573] ? page_mkclean+0xe9/0x160
[ 699.762345] ? f2fs_do_write_data_page+0xe00/0xe00
[ 699.763332] ? invalid_page_referenced_vma+0x130/0x130
[ 699.764374] ? clear_page_dirty_for_io+0x332/0x450
[ 699.765347] f2fs_write_cache_pages+0x4ca/0x860
[ 699.766276] ? __write_data_page+0xe30/0xe30
[ 699.767161] ? percpu_counter_add_batch+0x22/0xa0
[ 699.768112] ? kasan_check_write+0x14/0x20
[ 699.768951] ? _raw_spin_lock+0x17/0x40
[ 699.769739] ? f2fs_mark_inode_dirty_sync.part.18+0x16/0x30
[ 699.770885] ? iov_iter_advance+0x113/0x640
[ 699.771743] ? f2fs_write_end+0x133/0x2e0
[ 699.772569] ? balance_dirty_pages_ratelimited+0x239/0x640
[ 699.773680] f2fs_write_data_pages+0x329/0x520
[ 699.774603] ? generic_perform_write+0x250/0x320
[ 699.775544] ? f2fs_write_cache_pages+0x860/0x860
[ 699.776510] ? current_time+0x110/0x110
[ 699.777299] ? f2fs_preallocate_blocks+0x1ef/0x370
[ 699.778279] do_writepages+0x37/0xb0
[ 699.779026] ? f2fs_write_cache_pages+0x860/0x860
[ 699.779978] ? do_writepages+0x37/0xb0
[ 699.780755] __filemap_fdatawrite_range+0x19a/0x1f0
[ 699.781746] ? delete_from_page_cache_batch+0x4e0/0x4e0
[ 699.782820] ? __vfs_write+0x2b2/0x410
[ 699.783597] file_write_and_wait_range+0x66/0xb0
[ 699.784540] f2fs_do_sync_file+0x1f9/0xd90
[ 699.785381] ? truncate_partial_data_page+0x290/0x290
[ 699.786415] ? __sb_end_write+0x30/0x50
[ 699.787204] ? vfs_write+0x20f/0x260
[ 699.787941] f2fs_sync_file+0x9a/0xb0
[ 699.788694] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.789572] vfs_fsync_range+0x68/0x100
[ 699.790360] ? __fget_light+0xc9/0xe0
[ 699.791128] do_fsync+0x3d/0x70
[ 699.791779] __x64_sys_fdatasync+0x24/0x30
[ 699.792614] do_syscall_64+0x78/0x170
[ 699.793371] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 699.794406] RIP: 0033:0x7f9bf930d800
[ 699.795134] Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 83 3d 49 bf 2c 00 00 75 10 b8 4b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 be 78 01 00 48 89 04 24
[ 699.798960] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.800483] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.801923] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.803373] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.804798] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.806233] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.807667] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy
[ 699.817079] ---[ end trace 4ce02f25ff7d3df6 ]---
[ 699.818068] RIP: 0010:f2fs_submit_page_bio+0x29b/0x730
[ 699.819114] Code: 54 49 8d bd 18 04 00 00 e8 b2 59 af ff 41 8b 8d 18 04 00 00 8b 45 b8 41 d3 e6 44 01 f0 4c 8d 73 14 41 39 c7 0f 82 37 fe ff ff <0f> 0b 65 8b 05 2c 04 77 47 89 c0 48 0f a3 05 52 c1 d5 01 0f 92 c0
[ 699.822919] RSP: 0018:ffff8801f43af508 EFLAGS: 00010283
[ 699.823977] RAX: 0000000000000000 RBX: ffff8801f43af7b8 RCX: ffffffffb88a7cef
[ 699.825436] RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff8801e3e7a64c
[ 699.826881] RBP: ffff8801f43af558 R08: ffffed003e066b55 R09: ffffed003e066b55
[ 699.828292] R10: 0000000000000001 R11: ffffed003e066b54 R12: ffffea0007876940
[ 699.829750] R13: ffff8801f0335500 R14: ffff8801e3e7a600 R15: 0000000000000001
[ 699.831192] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.832793] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.833981] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.835556] ==================================================================
[ 699.837029] BUG: KASAN: stack-out-of-bounds in update_stack_state+0x38c/0x3e0
[ 699.838462] Read of size 8 at addr ffff8801f43af970 by task a.out/1309
[ 699.840086] CPU: 0 PID: 1309 Comm: a.out Tainted: G D W 4.18.0-rc1+ #4
[ 699.841603] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.843475] Call Trace:
[ 699.843982] dump_stack+0x7b/0xb5
[ 699.844661] print_address_description+0x70/0x290
[ 699.845607] kasan_report+0x291/0x390
[ 699.846351] ? update_stack_state+0x38c/0x3e0
[ 699.853831] __asan_load8+0x54/0x90
[ 699.854569] update_stack_state+0x38c/0x3e0
[ 699.855428] ? __read_once_size_nocheck.constprop.7+0x20/0x20
[ 699.856601] ? __save_stack_trace+0x5e/0x100
[ 699.857476] unwind_next_frame.part.5+0x18e/0x490
[ 699.858448] ? unwind_dump+0x290/0x290
[ 699.859217] ? clear_page_dirty_for_io+0x332/0x450
[ 699.860185] __unwind_start+0x106/0x190
[ 699.860974] __save_stack_trace+0x5e/0x100
[ 699.861808] ? __save_stack_trace+0x5e/0x100
[ 699.862691] ? unlink_anon_vmas+0xba/0x2c0
[ 699.863525] save_stack_trace+0x1f/0x30
[ 699.864312] save_stack+0x46/0xd0
[ 699.864993] ? __alloc_pages_slowpath+0x1420/0x1420
[ 699.865990] ? flush_tlb_mm_range+0x15e/0x220
[ 699.866889] ? kasan_check_write+0x14/0x20
[ 699.867724] ? __dec_node_state+0x92/0xb0
[ 699.868543] ? lock_page_memcg+0x85/0xf0
[ 699.869350] ? unlock_page_memcg+0x16/0x80
[ 699.870185] ? page_remove_rmap+0x198/0x520
[ 699.871048] ? mark_page_accessed+0x133/0x200
[ 699.871930] ? _cond_resched+0x1a/0x50
[ 699.872700] ? unmap_page_range+0xcd4/0xe50
[ 699.873551] ? rb_next+0x58/0x80
[ 699.874217] ? rb_next+0x58/0x80
[ 699.874895] __kasan_slab_free+0x13c/0x1a0
[ 699.875734] ? unlink_anon_vmas+0xba/0x2c0
[ 699.876563] kasan_slab_free+0xe/0x10
[ 699.877315] kmem_cache_free+0x89/0x1e0
[ 699.878095] unlink_anon_vmas+0xba/0x2c0
[ 699.878913] free_pgtables+0x101/0x1b0
[ 699.879677] exit_mmap+0x146/0x2a0
[ 699.880378] ? __ia32_sys_munmap+0x50/0x50
[ 699.881214] ? kasan_check_read+0x11/0x20
[ 699.882052] ? mm_update_next_owner+0x322/0x380
[ 699.882985] mmput+0x8b/0x1d0
[ 699.883602] do_exit+0x43a/0x1390
[ 699.884288] ? mm_update_next_owner+0x380/0x380
[ 699.885212] ? f2fs_sync_file+0x9a/0xb0
[ 699.885995] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.886877] ? vfs_fsync_range+0x68/0x100
[ 699.887694] ? __fget_light+0xc9/0xe0
[ 699.888442] ? do_fsync+0x3d/0x70
[ 699.889118] ? __x64_sys_fdatasync+0x24/0x30
[ 699.889996] rewind_stack_do_exit+0x17/0x20
[ 699.890860] RIP: 0033:0x7f9bf930d800
[ 699.891585] Code: Bad RIP value.
[ 699.892268] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.893781] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.895220] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.896643] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.898069] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.899505] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.901241] The buggy address belongs to the page:
[ 699.902215] page:ffffea0007d0ebc0 count:0 mapcount:0 mapping:0000000000000000 index:0x0
[ 699.903811] flags: 0x2ffff0000000000()
[ 699.904585] raw: 02ffff0000000000 0000000000000000 ffffffff07d00101 0000000000000000
[ 699.906125] raw: 0000000000000000 0000000000240000 00000000ffffffff 0000000000000000
[ 699.907673] page dumped because: kasan: bad access detected
[ 699.909108] Memory state around the buggy address:
[ 699.910077] ffff8801f43af800: 00 f1 f1 f1 f1 00 f4 f4 f4 f3 f3 f3 f3 00 00 00
[ 699.911528] ffff8801f43af880: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[ 699.912953] >ffff8801f43af900: 00 00 00 00 00 00 00 00 f1 01 f4 f4 f4 f2 f2 f2
[ 699.914392] ^
[ 699.915758] ffff8801f43af980: f2 00 f4 f4 00 00 00 00 f2 00 00 00 00 00 00 00
[ 699.917193] ffff8801f43afa00: 00 00 00 00 00 00 00 00 00 f3 f3 f3 00 00 00 00
[ 699.918634] ==================================================================
- Location
https://elixir.bootlin.com/linux/v4.18-rc1/source/fs/f2fs/segment.h#L644
Reported-by Wen Xu <wen.xu@gatech.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-08-01 19:13:44 +08:00
|
|
|
if (__is_valid_data_blkaddr(blkaddr) &&
|
f2fs: introduce DATA_GENERIC_ENHANCE
Previously, f2fs_is_valid_blkaddr(, blkaddr, DATA_GENERIC) will check
whether @blkaddr locates in main area or not.
That check is weak, since the block address in range of main area can
point to the address which is not valid in segment info table, and we
can not detect such condition, we may suffer worse corruption as system
continues running.
So this patch introduce DATA_GENERIC_ENHANCE to enhance the sanity check
which trigger SIT bitmap check rather than only range check.
This patch did below changes as wel:
- set SBI_NEED_FSCK in f2fs_is_valid_blkaddr().
- get rid of is_valid_data_blkaddr() to avoid panic if blkaddr is invalid.
- introduce verify_fio_blkaddr() to wrap fio {new,old}_blkaddr validation check.
- spread blkaddr check in:
* f2fs_get_node_info()
* __read_out_blkaddrs()
* f2fs_submit_page_read()
* ra_data_block()
* do_recover_data()
This patch can fix bug reported from bugzilla below:
https://bugzilla.kernel.org/show_bug.cgi?id=203215
https://bugzilla.kernel.org/show_bug.cgi?id=203223
https://bugzilla.kernel.org/show_bug.cgi?id=203231
https://bugzilla.kernel.org/show_bug.cgi?id=203235
https://bugzilla.kernel.org/show_bug.cgi?id=203241
= Update by Jaegeuk Kim =
DATA_GENERIC_ENHANCE enhanced to validate block addresses on read/write paths.
But, xfstest/generic/446 compalins some generated kernel messages saying invalid
bitmap was detected when reading a block. The reaons is, when we get the
block addresses from extent_cache, there is no lock to synchronize it from
truncating the blocks in parallel.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-04-15 15:26:32 +08:00
|
|
|
!f2fs_is_valid_blkaddr(sbi, blkaddr, DATA_GENERIC_ENHANCE)) {
|
2019-06-20 11:36:14 +08:00
|
|
|
err = -EFSCORRUPTED;
|
f2fs: fix to do sanity check with block address in main area
This patch add to do sanity check with below field:
- cp_pack_total_block_count
- blkaddr of data/node
- extent info
- Overview
BUG() in verify_block_addr() when writing to a corrupted f2fs image
- Reproduce (4.18 upstream kernel)
- POC (poc.c)
static void activity(char *mpoint) {
char *foo_bar_baz;
int err;
static int buf[8192];
memset(buf, 0, sizeof(buf));
err = asprintf(&foo_bar_baz, "%s/foo/bar/baz", mpoint);
int fd = open(foo_bar_baz, O_RDWR | O_TRUNC, 0777);
if (fd >= 0) {
write(fd, (char *)buf, sizeof(buf));
fdatasync(fd);
close(fd);
}
}
int main(int argc, char *argv[]) {
activity(argv[1]);
return 0;
}
- Kernel message
[ 689.349473] F2FS-fs (loop0): Mounted with checkpoint version = 3
[ 699.728662] WARNING: CPU: 0 PID: 1309 at fs/f2fs/segment.c:2860 f2fs_inplace_write_data+0x232/0x240
[ 699.728670] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy
[ 699.729056] CPU: 0 PID: 1309 Comm: a.out Not tainted 4.18.0-rc1+ #4
[ 699.729064] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.729074] RIP: 0010:f2fs_inplace_write_data+0x232/0x240
[ 699.729076] Code: ff e9 cf fe ff ff 49 8d 7d 10 e8 39 45 ad ff 4d 8b 7d 10 be 04 00 00 00 49 8d 7f 48 e8 07 49 ad ff 45 8b 7f 48 e9 fb fe ff ff <0f> 0b f0 41 80 4d 48 04 e9 65 fe ff ff 90 66 66 66 66 90 55 48 8d
[ 699.729130] RSP: 0018:ffff8801f43af568 EFLAGS: 00010202
[ 699.729139] RAX: 000000000000003f RBX: ffff8801f43af7b8 RCX: ffffffffb88c9113
[ 699.729142] RDX: 0000000000000003 RSI: dffffc0000000000 RDI: ffff8802024e5540
[ 699.729144] RBP: ffff8801f43af590 R08: 0000000000000009 R09: ffffffffffffffe8
[ 699.729147] R10: 0000000000000001 R11: ffffed0039b0596a R12: ffff8802024e5540
[ 699.729149] R13: ffff8801f0335500 R14: ffff8801e3e7a700 R15: ffff8801e1ee4450
[ 699.729154] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.729156] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.729159] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.729171] Call Trace:
[ 699.729192] f2fs_do_write_data_page+0x2e2/0xe00
[ 699.729203] ? f2fs_should_update_outplace+0xd0/0xd0
[ 699.729238] ? memcg_drain_all_list_lrus+0x280/0x280
[ 699.729269] ? __radix_tree_replace+0xa3/0x120
[ 699.729276] __write_data_page+0x5c7/0xe30
[ 699.729291] ? kasan_check_read+0x11/0x20
[ 699.729310] ? page_mapped+0x8a/0x110
[ 699.729321] ? page_mkclean+0xe9/0x160
[ 699.729327] ? f2fs_do_write_data_page+0xe00/0xe00
[ 699.729331] ? invalid_page_referenced_vma+0x130/0x130
[ 699.729345] ? clear_page_dirty_for_io+0x332/0x450
[ 699.729351] f2fs_write_cache_pages+0x4ca/0x860
[ 699.729358] ? __write_data_page+0xe30/0xe30
[ 699.729374] ? percpu_counter_add_batch+0x22/0xa0
[ 699.729380] ? kasan_check_write+0x14/0x20
[ 699.729391] ? _raw_spin_lock+0x17/0x40
[ 699.729403] ? f2fs_mark_inode_dirty_sync.part.18+0x16/0x30
[ 699.729413] ? iov_iter_advance+0x113/0x640
[ 699.729418] ? f2fs_write_end+0x133/0x2e0
[ 699.729423] ? balance_dirty_pages_ratelimited+0x239/0x640
[ 699.729428] f2fs_write_data_pages+0x329/0x520
[ 699.729433] ? generic_perform_write+0x250/0x320
[ 699.729438] ? f2fs_write_cache_pages+0x860/0x860
[ 699.729454] ? current_time+0x110/0x110
[ 699.729459] ? f2fs_preallocate_blocks+0x1ef/0x370
[ 699.729464] do_writepages+0x37/0xb0
[ 699.729468] ? f2fs_write_cache_pages+0x860/0x860
[ 699.729472] ? do_writepages+0x37/0xb0
[ 699.729478] __filemap_fdatawrite_range+0x19a/0x1f0
[ 699.729483] ? delete_from_page_cache_batch+0x4e0/0x4e0
[ 699.729496] ? __vfs_write+0x2b2/0x410
[ 699.729501] file_write_and_wait_range+0x66/0xb0
[ 699.729506] f2fs_do_sync_file+0x1f9/0xd90
[ 699.729511] ? truncate_partial_data_page+0x290/0x290
[ 699.729521] ? __sb_end_write+0x30/0x50
[ 699.729526] ? vfs_write+0x20f/0x260
[ 699.729530] f2fs_sync_file+0x9a/0xb0
[ 699.729534] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.729548] vfs_fsync_range+0x68/0x100
[ 699.729554] ? __fget_light+0xc9/0xe0
[ 699.729558] do_fsync+0x3d/0x70
[ 699.729562] __x64_sys_fdatasync+0x24/0x30
[ 699.729585] do_syscall_64+0x78/0x170
[ 699.729595] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 699.729613] RIP: 0033:0x7f9bf930d800
[ 699.729615] Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 83 3d 49 bf 2c 00 00 75 10 b8 4b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 be 78 01 00 48 89 04 24
[ 699.729668] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.729673] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.729675] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.729678] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.729680] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.729683] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.729687] ---[ end trace 4ce02f25ff7d3df5 ]---
[ 699.729782] ------------[ cut here ]------------
[ 699.729785] kernel BUG at fs/f2fs/segment.h:654!
[ 699.731055] invalid opcode: 0000 [#1] SMP KASAN PTI
[ 699.732104] CPU: 0 PID: 1309 Comm: a.out Tainted: G W 4.18.0-rc1+ #4
[ 699.733684] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.735611] RIP: 0010:f2fs_submit_page_bio+0x29b/0x730
[ 699.736649] Code: 54 49 8d bd 18 04 00 00 e8 b2 59 af ff 41 8b 8d 18 04 00 00 8b 45 b8 41 d3 e6 44 01 f0 4c 8d 73 14 41 39 c7 0f 82 37 fe ff ff <0f> 0b 65 8b 05 2c 04 77 47 89 c0 48 0f a3 05 52 c1 d5 01 0f 92 c0
[ 699.740524] RSP: 0018:ffff8801f43af508 EFLAGS: 00010283
[ 699.741573] RAX: 0000000000000000 RBX: ffff8801f43af7b8 RCX: ffffffffb88a7cef
[ 699.743006] RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff8801e3e7a64c
[ 699.744426] RBP: ffff8801f43af558 R08: ffffed003e066b55 R09: ffffed003e066b55
[ 699.745833] R10: 0000000000000001 R11: ffffed003e066b54 R12: ffffea0007876940
[ 699.747256] R13: ffff8801f0335500 R14: ffff8801e3e7a600 R15: 0000000000000001
[ 699.748683] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.750293] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.751462] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.752874] Call Trace:
[ 699.753386] ? f2fs_inplace_write_data+0x93/0x240
[ 699.754341] f2fs_inplace_write_data+0xd2/0x240
[ 699.755271] f2fs_do_write_data_page+0x2e2/0xe00
[ 699.756214] ? f2fs_should_update_outplace+0xd0/0xd0
[ 699.757215] ? memcg_drain_all_list_lrus+0x280/0x280
[ 699.758209] ? __radix_tree_replace+0xa3/0x120
[ 699.759164] __write_data_page+0x5c7/0xe30
[ 699.760002] ? kasan_check_read+0x11/0x20
[ 699.760823] ? page_mapped+0x8a/0x110
[ 699.761573] ? page_mkclean+0xe9/0x160
[ 699.762345] ? f2fs_do_write_data_page+0xe00/0xe00
[ 699.763332] ? invalid_page_referenced_vma+0x130/0x130
[ 699.764374] ? clear_page_dirty_for_io+0x332/0x450
[ 699.765347] f2fs_write_cache_pages+0x4ca/0x860
[ 699.766276] ? __write_data_page+0xe30/0xe30
[ 699.767161] ? percpu_counter_add_batch+0x22/0xa0
[ 699.768112] ? kasan_check_write+0x14/0x20
[ 699.768951] ? _raw_spin_lock+0x17/0x40
[ 699.769739] ? f2fs_mark_inode_dirty_sync.part.18+0x16/0x30
[ 699.770885] ? iov_iter_advance+0x113/0x640
[ 699.771743] ? f2fs_write_end+0x133/0x2e0
[ 699.772569] ? balance_dirty_pages_ratelimited+0x239/0x640
[ 699.773680] f2fs_write_data_pages+0x329/0x520
[ 699.774603] ? generic_perform_write+0x250/0x320
[ 699.775544] ? f2fs_write_cache_pages+0x860/0x860
[ 699.776510] ? current_time+0x110/0x110
[ 699.777299] ? f2fs_preallocate_blocks+0x1ef/0x370
[ 699.778279] do_writepages+0x37/0xb0
[ 699.779026] ? f2fs_write_cache_pages+0x860/0x860
[ 699.779978] ? do_writepages+0x37/0xb0
[ 699.780755] __filemap_fdatawrite_range+0x19a/0x1f0
[ 699.781746] ? delete_from_page_cache_batch+0x4e0/0x4e0
[ 699.782820] ? __vfs_write+0x2b2/0x410
[ 699.783597] file_write_and_wait_range+0x66/0xb0
[ 699.784540] f2fs_do_sync_file+0x1f9/0xd90
[ 699.785381] ? truncate_partial_data_page+0x290/0x290
[ 699.786415] ? __sb_end_write+0x30/0x50
[ 699.787204] ? vfs_write+0x20f/0x260
[ 699.787941] f2fs_sync_file+0x9a/0xb0
[ 699.788694] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.789572] vfs_fsync_range+0x68/0x100
[ 699.790360] ? __fget_light+0xc9/0xe0
[ 699.791128] do_fsync+0x3d/0x70
[ 699.791779] __x64_sys_fdatasync+0x24/0x30
[ 699.792614] do_syscall_64+0x78/0x170
[ 699.793371] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 699.794406] RIP: 0033:0x7f9bf930d800
[ 699.795134] Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 83 3d 49 bf 2c 00 00 75 10 b8 4b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 be 78 01 00 48 89 04 24
[ 699.798960] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.800483] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.801923] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.803373] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.804798] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.806233] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.807667] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy
[ 699.817079] ---[ end trace 4ce02f25ff7d3df6 ]---
[ 699.818068] RIP: 0010:f2fs_submit_page_bio+0x29b/0x730
[ 699.819114] Code: 54 49 8d bd 18 04 00 00 e8 b2 59 af ff 41 8b 8d 18 04 00 00 8b 45 b8 41 d3 e6 44 01 f0 4c 8d 73 14 41 39 c7 0f 82 37 fe ff ff <0f> 0b 65 8b 05 2c 04 77 47 89 c0 48 0f a3 05 52 c1 d5 01 0f 92 c0
[ 699.822919] RSP: 0018:ffff8801f43af508 EFLAGS: 00010283
[ 699.823977] RAX: 0000000000000000 RBX: ffff8801f43af7b8 RCX: ffffffffb88a7cef
[ 699.825436] RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff8801e3e7a64c
[ 699.826881] RBP: ffff8801f43af558 R08: ffffed003e066b55 R09: ffffed003e066b55
[ 699.828292] R10: 0000000000000001 R11: ffffed003e066b54 R12: ffffea0007876940
[ 699.829750] R13: ffff8801f0335500 R14: ffff8801e3e7a600 R15: 0000000000000001
[ 699.831192] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.832793] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.833981] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.835556] ==================================================================
[ 699.837029] BUG: KASAN: stack-out-of-bounds in update_stack_state+0x38c/0x3e0
[ 699.838462] Read of size 8 at addr ffff8801f43af970 by task a.out/1309
[ 699.840086] CPU: 0 PID: 1309 Comm: a.out Tainted: G D W 4.18.0-rc1+ #4
[ 699.841603] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.843475] Call Trace:
[ 699.843982] dump_stack+0x7b/0xb5
[ 699.844661] print_address_description+0x70/0x290
[ 699.845607] kasan_report+0x291/0x390
[ 699.846351] ? update_stack_state+0x38c/0x3e0
[ 699.853831] __asan_load8+0x54/0x90
[ 699.854569] update_stack_state+0x38c/0x3e0
[ 699.855428] ? __read_once_size_nocheck.constprop.7+0x20/0x20
[ 699.856601] ? __save_stack_trace+0x5e/0x100
[ 699.857476] unwind_next_frame.part.5+0x18e/0x490
[ 699.858448] ? unwind_dump+0x290/0x290
[ 699.859217] ? clear_page_dirty_for_io+0x332/0x450
[ 699.860185] __unwind_start+0x106/0x190
[ 699.860974] __save_stack_trace+0x5e/0x100
[ 699.861808] ? __save_stack_trace+0x5e/0x100
[ 699.862691] ? unlink_anon_vmas+0xba/0x2c0
[ 699.863525] save_stack_trace+0x1f/0x30
[ 699.864312] save_stack+0x46/0xd0
[ 699.864993] ? __alloc_pages_slowpath+0x1420/0x1420
[ 699.865990] ? flush_tlb_mm_range+0x15e/0x220
[ 699.866889] ? kasan_check_write+0x14/0x20
[ 699.867724] ? __dec_node_state+0x92/0xb0
[ 699.868543] ? lock_page_memcg+0x85/0xf0
[ 699.869350] ? unlock_page_memcg+0x16/0x80
[ 699.870185] ? page_remove_rmap+0x198/0x520
[ 699.871048] ? mark_page_accessed+0x133/0x200
[ 699.871930] ? _cond_resched+0x1a/0x50
[ 699.872700] ? unmap_page_range+0xcd4/0xe50
[ 699.873551] ? rb_next+0x58/0x80
[ 699.874217] ? rb_next+0x58/0x80
[ 699.874895] __kasan_slab_free+0x13c/0x1a0
[ 699.875734] ? unlink_anon_vmas+0xba/0x2c0
[ 699.876563] kasan_slab_free+0xe/0x10
[ 699.877315] kmem_cache_free+0x89/0x1e0
[ 699.878095] unlink_anon_vmas+0xba/0x2c0
[ 699.878913] free_pgtables+0x101/0x1b0
[ 699.879677] exit_mmap+0x146/0x2a0
[ 699.880378] ? __ia32_sys_munmap+0x50/0x50
[ 699.881214] ? kasan_check_read+0x11/0x20
[ 699.882052] ? mm_update_next_owner+0x322/0x380
[ 699.882985] mmput+0x8b/0x1d0
[ 699.883602] do_exit+0x43a/0x1390
[ 699.884288] ? mm_update_next_owner+0x380/0x380
[ 699.885212] ? f2fs_sync_file+0x9a/0xb0
[ 699.885995] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.886877] ? vfs_fsync_range+0x68/0x100
[ 699.887694] ? __fget_light+0xc9/0xe0
[ 699.888442] ? do_fsync+0x3d/0x70
[ 699.889118] ? __x64_sys_fdatasync+0x24/0x30
[ 699.889996] rewind_stack_do_exit+0x17/0x20
[ 699.890860] RIP: 0033:0x7f9bf930d800
[ 699.891585] Code: Bad RIP value.
[ 699.892268] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.893781] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.895220] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.896643] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.898069] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.899505] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.901241] The buggy address belongs to the page:
[ 699.902215] page:ffffea0007d0ebc0 count:0 mapcount:0 mapping:0000000000000000 index:0x0
[ 699.903811] flags: 0x2ffff0000000000()
[ 699.904585] raw: 02ffff0000000000 0000000000000000 ffffffff07d00101 0000000000000000
[ 699.906125] raw: 0000000000000000 0000000000240000 00000000ffffffff 0000000000000000
[ 699.907673] page dumped because: kasan: bad access detected
[ 699.909108] Memory state around the buggy address:
[ 699.910077] ffff8801f43af800: 00 f1 f1 f1 f1 00 f4 f4 f4 f3 f3 f3 f3 00 00 00
[ 699.911528] ffff8801f43af880: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[ 699.912953] >ffff8801f43af900: 00 00 00 00 00 00 00 00 f1 01 f4 f4 f4 f2 f2 f2
[ 699.914392] ^
[ 699.915758] ffff8801f43af980: f2 00 f4 f4 00 00 00 00 f2 00 00 00 00 00 00 00
[ 699.917193] ffff8801f43afa00: 00 00 00 00 00 00 00 00 00 f3 f3 f3 00 00 00 00
[ 699.918634] ==================================================================
- Location
https://elixir.bootlin.com/linux/v4.18-rc1/source/fs/f2fs/segment.h#L644
Reported-by Wen Xu <wen.xu@gatech.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-08-01 19:13:44 +08:00
|
|
|
goto sync_out;
|
|
|
|
}
|
|
|
|
|
f2fs: introduce DATA_GENERIC_ENHANCE
Previously, f2fs_is_valid_blkaddr(, blkaddr, DATA_GENERIC) will check
whether @blkaddr locates in main area or not.
That check is weak, since the block address in range of main area can
point to the address which is not valid in segment info table, and we
can not detect such condition, we may suffer worse corruption as system
continues running.
So this patch introduce DATA_GENERIC_ENHANCE to enhance the sanity check
which trigger SIT bitmap check rather than only range check.
This patch did below changes as wel:
- set SBI_NEED_FSCK in f2fs_is_valid_blkaddr().
- get rid of is_valid_data_blkaddr() to avoid panic if blkaddr is invalid.
- introduce verify_fio_blkaddr() to wrap fio {new,old}_blkaddr validation check.
- spread blkaddr check in:
* f2fs_get_node_info()
* __read_out_blkaddrs()
* f2fs_submit_page_read()
* ra_data_block()
* do_recover_data()
This patch can fix bug reported from bugzilla below:
https://bugzilla.kernel.org/show_bug.cgi?id=203215
https://bugzilla.kernel.org/show_bug.cgi?id=203223
https://bugzilla.kernel.org/show_bug.cgi?id=203231
https://bugzilla.kernel.org/show_bug.cgi?id=203235
https://bugzilla.kernel.org/show_bug.cgi?id=203241
= Update by Jaegeuk Kim =
DATA_GENERIC_ENHANCE enhanced to validate block addresses on read/write paths.
But, xfstest/generic/446 compalins some generated kernel messages saying invalid
bitmap was detected when reading a block. The reaons is, when we get the
block addresses from extent_cache, there is no lock to synchronize it from
truncating the blocks in parallel.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-04-15 15:26:32 +08:00
|
|
|
if (__is_valid_data_blkaddr(blkaddr)) {
|
2018-09-27 18:34:52 +08:00
|
|
|
/* use out-place-update for driect IO under LFS mode */
|
2018-11-13 14:33:45 +08:00
|
|
|
if (test_opt(sbi, LFS) && flag == F2FS_GET_BLOCK_DIO &&
|
|
|
|
map->m_may_create) {
|
2018-09-27 18:34:52 +08:00
|
|
|
err = __allocate_data_block(&dn, map->m_seg_type);
|
2019-08-28 17:33:36 +08:00
|
|
|
if (err)
|
|
|
|
goto sync_out;
|
|
|
|
blkaddr = dn.data_blkaddr;
|
|
|
|
set_inode_flag(inode, FI_APPEND_WRITE);
|
2018-09-27 18:34:52 +08:00
|
|
|
}
|
|
|
|
} else {
|
2015-09-18 16:51:51 +08:00
|
|
|
if (create) {
|
2015-09-21 20:17:52 +08:00
|
|
|
if (unlikely(f2fs_cp_error(sbi))) {
|
|
|
|
err = -EIO;
|
2016-01-26 15:37:38 +08:00
|
|
|
goto sync_out;
|
2015-09-21 20:17:52 +08:00
|
|
|
}
|
2016-02-04 05:49:44 +08:00
|
|
|
if (flag == F2FS_GET_BLOCK_PRE_AIO) {
|
2016-05-09 19:56:30 +08:00
|
|
|
if (blkaddr == NULL_ADDR) {
|
|
|
|
prealloc++;
|
|
|
|
last_ofs_in_node = dn.ofs_in_node;
|
|
|
|
}
|
2016-02-04 05:49:44 +08:00
|
|
|
} else {
|
2018-09-20 06:28:40 +08:00
|
|
|
WARN_ON(flag != F2FS_GET_BLOCK_PRE_DIO &&
|
|
|
|
flag != F2FS_GET_BLOCK_DIO);
|
2017-11-28 08:23:00 +08:00
|
|
|
err = __allocate_data_block(&dn,
|
|
|
|
map->m_seg_type);
|
2016-10-11 22:57:03 +08:00
|
|
|
if (!err)
|
2016-05-21 01:13:22 +08:00
|
|
|
set_inode_flag(inode, FI_APPEND_WRITE);
|
2016-02-04 05:49:44 +08:00
|
|
|
}
|
2015-09-18 16:51:51 +08:00
|
|
|
if (err)
|
2016-01-26 15:37:38 +08:00
|
|
|
goto sync_out;
|
2017-02-23 19:55:05 +08:00
|
|
|
map->m_flags |= F2FS_MAP_NEW;
|
2016-01-26 15:37:38 +08:00
|
|
|
blkaddr = dn.data_blkaddr;
|
2015-09-18 16:51:51 +08:00
|
|
|
} else {
|
2016-05-05 19:13:02 +08:00
|
|
|
if (flag == F2FS_GET_BLOCK_BMAP) {
|
|
|
|
map->m_pblk = 0;
|
|
|
|
goto sync_out;
|
|
|
|
}
|
2018-01-11 14:42:30 +08:00
|
|
|
if (flag == F2FS_GET_BLOCK_PRECACHE)
|
|
|
|
goto sync_out;
|
2016-01-26 15:42:58 +08:00
|
|
|
if (flag == F2FS_GET_BLOCK_FIEMAP &&
|
|
|
|
blkaddr == NULL_ADDR) {
|
|
|
|
if (map->m_next_pgofs)
|
|
|
|
*map->m_next_pgofs = pgofs + 1;
|
2017-11-30 19:28:22 +08:00
|
|
|
goto sync_out;
|
2016-01-26 15:42:58 +08:00
|
|
|
}
|
2018-01-10 18:18:52 +08:00
|
|
|
if (flag != F2FS_GET_BLOCK_FIEMAP) {
|
|
|
|
/* for defragment case */
|
|
|
|
if (map->m_next_pgofs)
|
|
|
|
*map->m_next_pgofs = pgofs + 1;
|
2016-01-26 15:37:38 +08:00
|
|
|
goto sync_out;
|
2018-01-10 18:18:52 +08:00
|
|
|
}
|
2015-08-19 19:11:19 +08:00
|
|
|
}
|
|
|
|
}
|
2012-11-02 16:10:12 +08:00
|
|
|
|
2016-05-09 19:56:30 +08:00
|
|
|
if (flag == F2FS_GET_BLOCK_PRE_AIO)
|
|
|
|
goto skip;
|
|
|
|
|
2016-01-26 15:37:38 +08:00
|
|
|
if (map->m_len == 0) {
|
|
|
|
/* preallocated unwritten block should be mapped for fiemap. */
|
|
|
|
if (blkaddr == NEW_ADDR)
|
|
|
|
map->m_flags |= F2FS_MAP_UNWRITTEN;
|
|
|
|
map->m_flags |= F2FS_MAP_MAPPED;
|
|
|
|
|
|
|
|
map->m_pblk = blkaddr;
|
|
|
|
map->m_len = 1;
|
|
|
|
} else if ((map->m_pblk != NEW_ADDR &&
|
|
|
|
blkaddr == (map->m_pblk + ofs)) ||
|
2016-02-04 05:09:09 +08:00
|
|
|
(map->m_pblk == NEW_ADDR && blkaddr == NEW_ADDR) ||
|
2016-05-09 19:56:30 +08:00
|
|
|
flag == F2FS_GET_BLOCK_PRE_DIO) {
|
2016-01-26 15:37:38 +08:00
|
|
|
ofs++;
|
|
|
|
map->m_len++;
|
|
|
|
} else {
|
|
|
|
goto sync_out;
|
|
|
|
}
|
2013-12-16 18:04:05 +08:00
|
|
|
|
2016-05-09 19:56:30 +08:00
|
|
|
skip:
|
2013-12-16 18:04:05 +08:00
|
|
|
dn.ofs_in_node++;
|
|
|
|
pgofs++;
|
|
|
|
|
2016-05-09 19:56:30 +08:00
|
|
|
/* preallocate blocks in batch for one dnode page */
|
|
|
|
if (flag == F2FS_GET_BLOCK_PRE_AIO &&
|
|
|
|
(pgofs == end || dn.ofs_in_node == end_offset)) {
|
2015-12-17 13:20:59 +08:00
|
|
|
|
2016-05-09 19:56:30 +08:00
|
|
|
dn.ofs_in_node = ofs_in_node;
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
err = f2fs_reserve_new_blocks(&dn, prealloc);
|
2016-05-09 19:56:30 +08:00
|
|
|
if (err)
|
|
|
|
goto sync_out;
|
2013-12-16 18:04:05 +08:00
|
|
|
|
2016-05-09 19:56:30 +08:00
|
|
|
map->m_len += dn.ofs_in_node - ofs_in_node;
|
|
|
|
if (prealloc && dn.ofs_in_node != last_ofs_in_node + 1) {
|
|
|
|
err = -ENOSPC;
|
|
|
|
goto sync_out;
|
2015-12-23 17:11:43 +08:00
|
|
|
}
|
2016-05-09 19:56:30 +08:00
|
|
|
dn.ofs_in_node = end_offset;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (pgofs >= end)
|
|
|
|
goto sync_out;
|
|
|
|
else if (dn.ofs_in_node < end_offset)
|
|
|
|
goto next_block;
|
|
|
|
|
2018-01-11 14:42:30 +08:00
|
|
|
if (flag == F2FS_GET_BLOCK_PRECACHE) {
|
|
|
|
if (map->m_flags & F2FS_MAP_MAPPED) {
|
|
|
|
unsigned int ofs = start_pgofs - map->m_lblk;
|
|
|
|
|
|
|
|
f2fs_update_extent_cache_range(&dn,
|
|
|
|
start_pgofs, map->m_pblk + ofs,
|
|
|
|
map->m_len - ofs);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2016-05-09 19:56:30 +08:00
|
|
|
f2fs_put_dnode(&dn);
|
|
|
|
|
2018-11-13 14:33:45 +08:00
|
|
|
if (map->m_may_create) {
|
2017-03-13 20:22:18 +08:00
|
|
|
__do_map_lock(sbi, flag, false);
|
2016-10-11 22:57:03 +08:00
|
|
|
f2fs_balance_fs(sbi, dn.node_changed);
|
2012-11-02 16:10:12 +08:00
|
|
|
}
|
2016-05-09 19:56:30 +08:00
|
|
|
goto next_dnode;
|
2015-12-17 13:20:59 +08:00
|
|
|
|
2013-12-16 18:04:05 +08:00
|
|
|
sync_out:
|
2018-10-10 13:26:22 +08:00
|
|
|
|
|
|
|
/* for hardware encryption, but to avoid potential issue in future */
|
|
|
|
if (flag == F2FS_GET_BLOCK_DIO && map->m_flags & F2FS_MAP_MAPPED)
|
|
|
|
f2fs_wait_on_block_writeback_range(inode,
|
|
|
|
map->m_pblk, map->m_len);
|
|
|
|
|
2018-01-11 14:42:30 +08:00
|
|
|
if (flag == F2FS_GET_BLOCK_PRECACHE) {
|
|
|
|
if (map->m_flags & F2FS_MAP_MAPPED) {
|
|
|
|
unsigned int ofs = start_pgofs - map->m_lblk;
|
|
|
|
|
|
|
|
f2fs_update_extent_cache_range(&dn,
|
|
|
|
start_pgofs, map->m_pblk + ofs,
|
|
|
|
map->m_len - ofs);
|
|
|
|
}
|
|
|
|
if (map->m_next_extent)
|
|
|
|
*map->m_next_extent = pgofs + 1;
|
|
|
|
}
|
2012-11-02 16:10:12 +08:00
|
|
|
f2fs_put_dnode(&dn);
|
2013-12-16 18:04:05 +08:00
|
|
|
unlock_out:
|
2018-11-13 14:33:45 +08:00
|
|
|
if (map->m_may_create) {
|
2017-03-13 20:22:18 +08:00
|
|
|
__do_map_lock(sbi, flag, false);
|
2016-10-11 22:57:03 +08:00
|
|
|
f2fs_balance_fs(sbi, dn.node_changed);
|
2015-12-23 05:23:35 +08:00
|
|
|
}
|
2013-12-16 18:04:05 +08:00
|
|
|
out:
|
2015-04-07 10:55:34 +08:00
|
|
|
trace_f2fs_map_blocks(inode, map, err);
|
2013-12-16 18:04:05 +08:00
|
|
|
return err;
|
2012-11-02 16:10:12 +08:00
|
|
|
}
|
|
|
|
|
2018-03-08 18:34:38 +08:00
|
|
|
bool f2fs_overwrite_io(struct inode *inode, loff_t pos, size_t len)
|
|
|
|
{
|
|
|
|
struct f2fs_map_blocks map;
|
|
|
|
block_t last_lblk;
|
|
|
|
int err;
|
|
|
|
|
|
|
|
if (pos + len > i_size_read(inode))
|
|
|
|
return false;
|
|
|
|
|
|
|
|
map.m_lblk = F2FS_BYTES_TO_BLK(pos);
|
|
|
|
map.m_next_pgofs = NULL;
|
|
|
|
map.m_next_extent = NULL;
|
|
|
|
map.m_seg_type = NO_CHECK_TYPE;
|
2018-11-20 04:29:35 +08:00
|
|
|
map.m_may_create = false;
|
2018-03-08 18:34:38 +08:00
|
|
|
last_lblk = F2FS_BLK_ALIGN(pos + len);
|
|
|
|
|
|
|
|
while (map.m_lblk < last_lblk) {
|
|
|
|
map.m_len = last_lblk - map.m_lblk;
|
|
|
|
err = f2fs_map_blocks(inode, &map, 0, F2FS_GET_BLOCK_DEFAULT);
|
|
|
|
if (err || map.m_len == 0)
|
|
|
|
return false;
|
|
|
|
map.m_lblk += map.m_len;
|
|
|
|
}
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2015-04-07 10:55:34 +08:00
|
|
|
static int __get_data_block(struct inode *inode, sector_t iblock,
|
2016-01-26 15:42:58 +08:00
|
|
|
struct buffer_head *bh, int create, int flag,
|
2018-11-13 14:33:45 +08:00
|
|
|
pgoff_t *next_pgofs, int seg_type, bool may_write)
|
2015-04-07 10:55:34 +08:00
|
|
|
{
|
|
|
|
struct f2fs_map_blocks map;
|
2016-11-12 08:31:56 +08:00
|
|
|
int err;
|
2015-04-07 10:55:34 +08:00
|
|
|
|
|
|
|
map.m_lblk = iblock;
|
|
|
|
map.m_len = bh->b_size >> inode->i_blkbits;
|
2016-01-26 15:42:58 +08:00
|
|
|
map.m_next_pgofs = next_pgofs;
|
2018-01-11 14:42:30 +08:00
|
|
|
map.m_next_extent = NULL;
|
2017-11-28 08:23:00 +08:00
|
|
|
map.m_seg_type = seg_type;
|
2018-11-13 14:33:45 +08:00
|
|
|
map.m_may_create = may_write;
|
2015-04-07 10:55:34 +08:00
|
|
|
|
2016-11-12 08:31:56 +08:00
|
|
|
err = f2fs_map_blocks(inode, &map, create, flag);
|
|
|
|
if (!err) {
|
2015-04-07 10:55:34 +08:00
|
|
|
map_bh(bh, inode->i_sb, map.m_pblk);
|
|
|
|
bh->b_state = (bh->b_state & ~F2FS_MAP_FLAGS) | map.m_flags;
|
2017-01-22 12:21:02 +08:00
|
|
|
bh->b_size = (u64)map.m_len << inode->i_blkbits;
|
2015-04-07 10:55:34 +08:00
|
|
|
}
|
2016-11-12 08:31:56 +08:00
|
|
|
return err;
|
2015-04-07 10:55:34 +08:00
|
|
|
}
|
|
|
|
|
2014-06-13 12:02:11 +08:00
|
|
|
static int get_data_block(struct inode *inode, sector_t iblock,
|
2016-01-26 15:42:58 +08:00
|
|
|
struct buffer_head *bh_result, int create, int flag,
|
|
|
|
pgoff_t *next_pgofs)
|
2015-08-19 19:11:19 +08:00
|
|
|
{
|
2016-01-26 15:42:58 +08:00
|
|
|
return __get_data_block(inode, iblock, bh_result, create,
|
2017-11-28 08:23:00 +08:00
|
|
|
flag, next_pgofs,
|
2018-11-13 14:33:45 +08:00
|
|
|
NO_CHECK_TYPE, create);
|
|
|
|
}
|
|
|
|
|
|
|
|
static int get_data_block_dio_write(struct inode *inode, sector_t iblock,
|
|
|
|
struct buffer_head *bh_result, int create)
|
|
|
|
{
|
|
|
|
return __get_data_block(inode, iblock, bh_result, create,
|
|
|
|
F2FS_GET_BLOCK_DIO, NULL,
|
|
|
|
f2fs_rw_hint_to_seg_type(inode->i_write_hint),
|
2019-08-01 04:27:05 +08:00
|
|
|
IS_SWAPFILE(inode) ? false : true);
|
2015-08-19 19:11:19 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static int get_data_block_dio(struct inode *inode, sector_t iblock,
|
2014-06-13 12:02:11 +08:00
|
|
|
struct buffer_head *bh_result, int create)
|
|
|
|
{
|
2015-08-19 19:11:19 +08:00
|
|
|
return __get_data_block(inode, iblock, bh_result, create,
|
2018-11-13 14:33:45 +08:00
|
|
|
F2FS_GET_BLOCK_DIO, NULL,
|
|
|
|
f2fs_rw_hint_to_seg_type(inode->i_write_hint),
|
|
|
|
false);
|
2014-06-13 12:02:11 +08:00
|
|
|
}
|
|
|
|
|
2015-08-19 19:11:19 +08:00
|
|
|
static int get_data_block_bmap(struct inode *inode, sector_t iblock,
|
2014-06-13 12:02:11 +08:00
|
|
|
struct buffer_head *bh_result, int create)
|
|
|
|
{
|
2015-12-28 21:48:32 +08:00
|
|
|
/* Block number less than F2FS MAX BLOCKS */
|
2015-12-31 14:35:37 +08:00
|
|
|
if (unlikely(iblock >= F2FS_I_SB(inode)->max_file_blocks))
|
2015-12-28 21:48:32 +08:00
|
|
|
return -EFBIG;
|
|
|
|
|
2015-08-19 19:11:19 +08:00
|
|
|
return __get_data_block(inode, iblock, bh_result, create,
|
2017-11-28 08:23:00 +08:00
|
|
|
F2FS_GET_BLOCK_BMAP, NULL,
|
2018-11-13 14:33:45 +08:00
|
|
|
NO_CHECK_TYPE, create);
|
2014-06-13 12:02:11 +08:00
|
|
|
}
|
|
|
|
|
2015-05-09 10:30:32 +08:00
|
|
|
static inline sector_t logical_to_blk(struct inode *inode, loff_t offset)
|
|
|
|
{
|
|
|
|
return (offset >> inode->i_blkbits);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline loff_t blk_to_logical(struct inode *inode, sector_t blk)
|
|
|
|
{
|
|
|
|
return (blk << inode->i_blkbits);
|
|
|
|
}
|
|
|
|
|
2018-01-11 14:39:57 +08:00
|
|
|
static int f2fs_xattr_fiemap(struct inode *inode,
|
|
|
|
struct fiemap_extent_info *fieinfo)
|
|
|
|
{
|
|
|
|
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
|
|
|
|
struct page *page;
|
|
|
|
struct node_info ni;
|
|
|
|
__u64 phys = 0, len;
|
|
|
|
__u32 flags;
|
|
|
|
nid_t xnid = F2FS_I(inode)->i_xattr_nid;
|
|
|
|
int err = 0;
|
|
|
|
|
|
|
|
if (f2fs_has_inline_xattr(inode)) {
|
|
|
|
int offset;
|
|
|
|
|
|
|
|
page = f2fs_grab_cache_page(NODE_MAPPING(sbi),
|
|
|
|
inode->i_ino, false);
|
|
|
|
if (!page)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2018-07-17 00:02:17 +08:00
|
|
|
err = f2fs_get_node_info(sbi, inode->i_ino, &ni);
|
|
|
|
if (err) {
|
|
|
|
f2fs_put_page(page, 1);
|
|
|
|
return err;
|
|
|
|
}
|
2018-01-11 14:39:57 +08:00
|
|
|
|
|
|
|
phys = (__u64)blk_to_logical(inode, ni.blk_addr);
|
|
|
|
offset = offsetof(struct f2fs_inode, i_addr) +
|
|
|
|
sizeof(__le32) * (DEF_ADDRS_PER_INODE -
|
2018-01-17 16:31:36 +08:00
|
|
|
get_inline_xattr_addrs(inode));
|
2018-01-11 14:39:57 +08:00
|
|
|
|
|
|
|
phys += offset;
|
|
|
|
len = inline_xattr_size(inode);
|
|
|
|
|
|
|
|
f2fs_put_page(page, 1);
|
|
|
|
|
|
|
|
flags = FIEMAP_EXTENT_DATA_INLINE | FIEMAP_EXTENT_NOT_ALIGNED;
|
|
|
|
|
|
|
|
if (!xnid)
|
|
|
|
flags |= FIEMAP_EXTENT_LAST;
|
|
|
|
|
|
|
|
err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags);
|
|
|
|
if (err || err == 1)
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (xnid) {
|
|
|
|
page = f2fs_grab_cache_page(NODE_MAPPING(sbi), xnid, false);
|
|
|
|
if (!page)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2018-07-17 00:02:17 +08:00
|
|
|
err = f2fs_get_node_info(sbi, xnid, &ni);
|
|
|
|
if (err) {
|
|
|
|
f2fs_put_page(page, 1);
|
|
|
|
return err;
|
|
|
|
}
|
2018-01-11 14:39:57 +08:00
|
|
|
|
|
|
|
phys = (__u64)blk_to_logical(inode, ni.blk_addr);
|
|
|
|
len = inode->i_sb->s_blocksize;
|
|
|
|
|
|
|
|
f2fs_put_page(page, 1);
|
|
|
|
|
|
|
|
flags = FIEMAP_EXTENT_LAST;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (phys)
|
|
|
|
err = fiemap_fill_next_extent(fieinfo, 0, phys, len, flags);
|
|
|
|
|
|
|
|
return (err < 0 ? err : 0);
|
|
|
|
}
|
|
|
|
|
2014-06-08 03:30:14 +08:00
|
|
|
int f2fs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
|
|
|
|
u64 start, u64 len)
|
|
|
|
{
|
2015-05-09 10:30:32 +08:00
|
|
|
struct buffer_head map_bh;
|
|
|
|
sector_t start_blk, last_blk;
|
2016-01-26 15:42:58 +08:00
|
|
|
pgoff_t next_pgofs;
|
2015-05-09 10:30:32 +08:00
|
|
|
u64 logical = 0, phys = 0, size = 0;
|
|
|
|
u32 flags = 0;
|
|
|
|
int ret = 0;
|
|
|
|
|
2018-01-11 14:42:30 +08:00
|
|
|
if (fieinfo->fi_flags & FIEMAP_FLAG_CACHE) {
|
|
|
|
ret = f2fs_precache_extents(inode);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2018-01-11 14:39:57 +08:00
|
|
|
ret = fiemap_check_flags(fieinfo, FIEMAP_FLAG_SYNC | FIEMAP_FLAG_XATTR);
|
2015-05-09 10:30:32 +08:00
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
2018-01-11 14:37:35 +08:00
|
|
|
inode_lock(inode);
|
|
|
|
|
2018-01-11 14:39:57 +08:00
|
|
|
if (fieinfo->fi_flags & FIEMAP_FLAG_XATTR) {
|
|
|
|
ret = f2fs_xattr_fiemap(inode, fieinfo);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2019-07-22 18:03:50 +08:00
|
|
|
if (f2fs_has_inline_data(inode) || f2fs_has_inline_dentry(inode)) {
|
2015-10-16 02:34:49 +08:00
|
|
|
ret = f2fs_inline_data_fiemap(inode, fieinfo, start, len);
|
|
|
|
if (ret != -EAGAIN)
|
2018-01-11 14:37:35 +08:00
|
|
|
goto out;
|
2015-10-16 02:34:49 +08:00
|
|
|
}
|
|
|
|
|
2015-05-09 10:30:32 +08:00
|
|
|
if (logical_to_blk(inode, len) == 0)
|
|
|
|
len = blk_to_logical(inode, 1);
|
|
|
|
|
|
|
|
start_blk = logical_to_blk(inode, start);
|
|
|
|
last_blk = logical_to_blk(inode, start + len - 1);
|
2015-12-26 18:07:41 +08:00
|
|
|
|
2015-05-09 10:30:32 +08:00
|
|
|
next:
|
|
|
|
memset(&map_bh, 0, sizeof(struct buffer_head));
|
|
|
|
map_bh.b_size = len;
|
|
|
|
|
2015-08-19 19:11:19 +08:00
|
|
|
ret = get_data_block(inode, start_blk, &map_bh, 0,
|
2016-01-26 15:42:58 +08:00
|
|
|
F2FS_GET_BLOCK_FIEMAP, &next_pgofs);
|
2015-05-09 10:30:32 +08:00
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
/* HOLE */
|
|
|
|
if (!buffer_mapped(&map_bh)) {
|
2016-01-26 15:42:58 +08:00
|
|
|
start_blk = next_pgofs;
|
2016-10-11 22:57:04 +08:00
|
|
|
|
|
|
|
if (blk_to_logical(inode, start_blk) < blk_to_logical(inode,
|
|
|
|
F2FS_I_SB(inode)->max_file_blocks))
|
2015-12-26 18:07:41 +08:00
|
|
|
goto prep_next;
|
2016-10-11 22:57:04 +08:00
|
|
|
|
2015-12-26 18:07:41 +08:00
|
|
|
flags |= FIEMAP_EXTENT_LAST;
|
|
|
|
}
|
2015-05-09 10:30:32 +08:00
|
|
|
|
2016-01-08 20:19:27 +08:00
|
|
|
if (size) {
|
2018-12-12 17:50:11 +08:00
|
|
|
if (IS_ENCRYPTED(inode))
|
2016-01-08 20:19:27 +08:00
|
|
|
flags |= FIEMAP_EXTENT_DATA_ENCRYPTED;
|
|
|
|
|
2015-12-26 18:07:41 +08:00
|
|
|
ret = fiemap_fill_next_extent(fieinfo, logical,
|
|
|
|
phys, size, flags);
|
2016-01-08 20:19:27 +08:00
|
|
|
}
|
2015-05-09 10:30:32 +08:00
|
|
|
|
2015-12-26 18:07:41 +08:00
|
|
|
if (start_blk > last_blk || ret)
|
|
|
|
goto out;
|
2015-05-09 10:30:32 +08:00
|
|
|
|
2015-12-26 18:07:41 +08:00
|
|
|
logical = blk_to_logical(inode, start_blk);
|
|
|
|
phys = blk_to_logical(inode, map_bh.b_blocknr);
|
|
|
|
size = map_bh.b_size;
|
|
|
|
flags = 0;
|
|
|
|
if (buffer_unwritten(&map_bh))
|
|
|
|
flags = FIEMAP_EXTENT_UNWRITTEN;
|
2015-05-09 10:30:32 +08:00
|
|
|
|
2015-12-26 18:07:41 +08:00
|
|
|
start_blk += logical_to_blk(inode, size);
|
2015-05-09 10:30:32 +08:00
|
|
|
|
2015-12-26 18:07:41 +08:00
|
|
|
prep_next:
|
2015-05-09 10:30:32 +08:00
|
|
|
cond_resched();
|
|
|
|
if (fatal_signal_pending(current))
|
|
|
|
ret = -EINTR;
|
|
|
|
else
|
|
|
|
goto next;
|
|
|
|
out:
|
|
|
|
if (ret == 1)
|
|
|
|
ret = 0;
|
|
|
|
|
2016-01-23 04:40:57 +08:00
|
|
|
inode_unlock(inode);
|
2015-05-09 10:30:32 +08:00
|
|
|
return ret;
|
2014-06-08 03:30:14 +08:00
|
|
|
}
|
|
|
|
|
f2fs: add fs-verity support
Add fs-verity support to f2fs. fs-verity is a filesystem feature that
enables transparent integrity protection and authentication of read-only
files. It uses a dm-verity like mechanism at the file level: a Merkle
tree is used to verify any block in the file in log(filesize) time. It
is implemented mainly by helper functions in fs/verity/. See
Documentation/filesystems/fsverity.rst for the full documentation.
The f2fs support for fs-verity consists of:
- Adding a filesystem feature flag and an inode flag for fs-verity.
- Implementing the fsverity_operations to support enabling verity on an
inode and reading/writing the verity metadata.
- Updating ->readpages() to verify data as it's read from verity files
and to support reading verity metadata pages.
- Updating ->write_begin(), ->write_end(), and ->writepages() to support
writing verity metadata pages.
- Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl().
Like ext4, f2fs stores the verity metadata (Merkle tree and
fsverity_descriptor) past the end of the file, starting at the first 64K
boundary beyond i_size. This approach works because (a) verity files
are readonly, and (b) pages fully beyond i_size aren't visible to
userspace but can be read/written internally by f2fs with only some
relatively small changes to f2fs. Extended attributes cannot be used
because (a) f2fs limits the total size of an inode's xattr entries to
4096 bytes, which wouldn't be enough for even a single Merkle tree
block, and (b) f2fs encryption doesn't encrypt xattrs, yet the verity
metadata *must* be encrypted when the file is because it contains hashes
of the plaintext data.
Acked-by: Jaegeuk Kim <jaegeuk@kernel.org>
Acked-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
2019-07-23 00:26:24 +08:00
|
|
|
static inline loff_t f2fs_readpage_limit(struct inode *inode)
|
|
|
|
{
|
|
|
|
if (IS_ENABLED(CONFIG_FS_VERITY) &&
|
|
|
|
(IS_VERITY(inode) || f2fs_verity_in_progress(inode)))
|
|
|
|
return inode->i_sb->s_maxbytes;
|
|
|
|
|
|
|
|
return i_size_read(inode);
|
|
|
|
}
|
|
|
|
|
2019-03-25 21:07:30 +08:00
|
|
|
static int f2fs_read_single_page(struct inode *inode, struct page *page,
|
|
|
|
unsigned nr_pages,
|
|
|
|
struct f2fs_map_blocks *map,
|
|
|
|
struct bio **bio_ret,
|
|
|
|
sector_t *last_block_in_bio,
|
|
|
|
bool is_readahead)
|
|
|
|
{
|
|
|
|
struct bio *bio = *bio_ret;
|
|
|
|
const unsigned blkbits = inode->i_blkbits;
|
|
|
|
const unsigned blocksize = 1 << blkbits;
|
|
|
|
sector_t block_in_file;
|
|
|
|
sector_t last_block;
|
|
|
|
sector_t last_block_in_file;
|
|
|
|
sector_t block_nr;
|
|
|
|
int ret = 0;
|
|
|
|
|
2019-07-02 10:15:29 +08:00
|
|
|
block_in_file = (sector_t)page_index(page);
|
2019-03-25 21:07:30 +08:00
|
|
|
last_block = block_in_file + nr_pages;
|
f2fs: add fs-verity support
Add fs-verity support to f2fs. fs-verity is a filesystem feature that
enables transparent integrity protection and authentication of read-only
files. It uses a dm-verity like mechanism at the file level: a Merkle
tree is used to verify any block in the file in log(filesize) time. It
is implemented mainly by helper functions in fs/verity/. See
Documentation/filesystems/fsverity.rst for the full documentation.
The f2fs support for fs-verity consists of:
- Adding a filesystem feature flag and an inode flag for fs-verity.
- Implementing the fsverity_operations to support enabling verity on an
inode and reading/writing the verity metadata.
- Updating ->readpages() to verify data as it's read from verity files
and to support reading verity metadata pages.
- Updating ->write_begin(), ->write_end(), and ->writepages() to support
writing verity metadata pages.
- Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl().
Like ext4, f2fs stores the verity metadata (Merkle tree and
fsverity_descriptor) past the end of the file, starting at the first 64K
boundary beyond i_size. This approach works because (a) verity files
are readonly, and (b) pages fully beyond i_size aren't visible to
userspace but can be read/written internally by f2fs with only some
relatively small changes to f2fs. Extended attributes cannot be used
because (a) f2fs limits the total size of an inode's xattr entries to
4096 bytes, which wouldn't be enough for even a single Merkle tree
block, and (b) f2fs encryption doesn't encrypt xattrs, yet the verity
metadata *must* be encrypted when the file is because it contains hashes
of the plaintext data.
Acked-by: Jaegeuk Kim <jaegeuk@kernel.org>
Acked-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
2019-07-23 00:26:24 +08:00
|
|
|
last_block_in_file = (f2fs_readpage_limit(inode) + blocksize - 1) >>
|
2019-03-25 21:07:30 +08:00
|
|
|
blkbits;
|
|
|
|
if (last_block > last_block_in_file)
|
|
|
|
last_block = last_block_in_file;
|
|
|
|
|
|
|
|
/* just zeroing out page which is beyond EOF */
|
|
|
|
if (block_in_file >= last_block)
|
|
|
|
goto zero_out;
|
|
|
|
/*
|
|
|
|
* Map blocks using the previous result first.
|
|
|
|
*/
|
|
|
|
if ((map->m_flags & F2FS_MAP_MAPPED) &&
|
|
|
|
block_in_file > map->m_lblk &&
|
|
|
|
block_in_file < (map->m_lblk + map->m_len))
|
|
|
|
goto got_it;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Then do more f2fs_map_blocks() calls until we are
|
|
|
|
* done with this page.
|
|
|
|
*/
|
|
|
|
map->m_lblk = block_in_file;
|
|
|
|
map->m_len = last_block - block_in_file;
|
|
|
|
|
|
|
|
ret = f2fs_map_blocks(inode, map, 0, F2FS_GET_BLOCK_DEFAULT);
|
|
|
|
if (ret)
|
|
|
|
goto out;
|
|
|
|
got_it:
|
|
|
|
if ((map->m_flags & F2FS_MAP_MAPPED)) {
|
|
|
|
block_nr = map->m_pblk + block_in_file - map->m_lblk;
|
|
|
|
SetPageMappedToDisk(page);
|
|
|
|
|
2019-07-02 10:15:29 +08:00
|
|
|
if (!PageUptodate(page) && (!PageSwapCache(page) &&
|
|
|
|
!cleancache_get_page(page))) {
|
2019-03-25 21:07:30 +08:00
|
|
|
SetPageUptodate(page);
|
|
|
|
goto confused;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!f2fs_is_valid_blkaddr(F2FS_I_SB(inode), block_nr,
|
f2fs: introduce DATA_GENERIC_ENHANCE
Previously, f2fs_is_valid_blkaddr(, blkaddr, DATA_GENERIC) will check
whether @blkaddr locates in main area or not.
That check is weak, since the block address in range of main area can
point to the address which is not valid in segment info table, and we
can not detect such condition, we may suffer worse corruption as system
continues running.
So this patch introduce DATA_GENERIC_ENHANCE to enhance the sanity check
which trigger SIT bitmap check rather than only range check.
This patch did below changes as wel:
- set SBI_NEED_FSCK in f2fs_is_valid_blkaddr().
- get rid of is_valid_data_blkaddr() to avoid panic if blkaddr is invalid.
- introduce verify_fio_blkaddr() to wrap fio {new,old}_blkaddr validation check.
- spread blkaddr check in:
* f2fs_get_node_info()
* __read_out_blkaddrs()
* f2fs_submit_page_read()
* ra_data_block()
* do_recover_data()
This patch can fix bug reported from bugzilla below:
https://bugzilla.kernel.org/show_bug.cgi?id=203215
https://bugzilla.kernel.org/show_bug.cgi?id=203223
https://bugzilla.kernel.org/show_bug.cgi?id=203231
https://bugzilla.kernel.org/show_bug.cgi?id=203235
https://bugzilla.kernel.org/show_bug.cgi?id=203241
= Update by Jaegeuk Kim =
DATA_GENERIC_ENHANCE enhanced to validate block addresses on read/write paths.
But, xfstest/generic/446 compalins some generated kernel messages saying invalid
bitmap was detected when reading a block. The reaons is, when we get the
block addresses from extent_cache, there is no lock to synchronize it from
truncating the blocks in parallel.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-04-15 15:26:32 +08:00
|
|
|
DATA_GENERIC_ENHANCE_READ)) {
|
2019-06-20 11:36:14 +08:00
|
|
|
ret = -EFSCORRUPTED;
|
2019-03-25 21:07:30 +08:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
} else {
|
|
|
|
zero_out:
|
|
|
|
zero_user_segment(page, 0, PAGE_SIZE);
|
f2fs: add fs-verity support
Add fs-verity support to f2fs. fs-verity is a filesystem feature that
enables transparent integrity protection and authentication of read-only
files. It uses a dm-verity like mechanism at the file level: a Merkle
tree is used to verify any block in the file in log(filesize) time. It
is implemented mainly by helper functions in fs/verity/. See
Documentation/filesystems/fsverity.rst for the full documentation.
The f2fs support for fs-verity consists of:
- Adding a filesystem feature flag and an inode flag for fs-verity.
- Implementing the fsverity_operations to support enabling verity on an
inode and reading/writing the verity metadata.
- Updating ->readpages() to verify data as it's read from verity files
and to support reading verity metadata pages.
- Updating ->write_begin(), ->write_end(), and ->writepages() to support
writing verity metadata pages.
- Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl().
Like ext4, f2fs stores the verity metadata (Merkle tree and
fsverity_descriptor) past the end of the file, starting at the first 64K
boundary beyond i_size. This approach works because (a) verity files
are readonly, and (b) pages fully beyond i_size aren't visible to
userspace but can be read/written internally by f2fs with only some
relatively small changes to f2fs. Extended attributes cannot be used
because (a) f2fs limits the total size of an inode's xattr entries to
4096 bytes, which wouldn't be enough for even a single Merkle tree
block, and (b) f2fs encryption doesn't encrypt xattrs, yet the verity
metadata *must* be encrypted when the file is because it contains hashes
of the plaintext data.
Acked-by: Jaegeuk Kim <jaegeuk@kernel.org>
Acked-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
2019-07-23 00:26:24 +08:00
|
|
|
if (f2fs_need_verity(inode, page->index) &&
|
|
|
|
!fsverity_verify_page(page)) {
|
|
|
|
ret = -EIO;
|
|
|
|
goto out;
|
|
|
|
}
|
2019-03-25 21:07:30 +08:00
|
|
|
if (!PageUptodate(page))
|
|
|
|
SetPageUptodate(page);
|
|
|
|
unlock_page(page);
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This page will go to BIO. Do we need to send this
|
|
|
|
* BIO off first?
|
|
|
|
*/
|
2019-07-12 16:55:41 +08:00
|
|
|
if (bio && !page_is_mergeable(F2FS_I_SB(inode), bio,
|
|
|
|
*last_block_in_bio, block_nr)) {
|
2019-03-25 21:07:30 +08:00
|
|
|
submit_and_realloc:
|
|
|
|
__submit_bio(F2FS_I_SB(inode), bio, DATA);
|
|
|
|
bio = NULL;
|
|
|
|
}
|
|
|
|
if (bio == NULL) {
|
|
|
|
bio = f2fs_grab_read_bio(inode, block_nr, nr_pages,
|
f2fs: add fs-verity support
Add fs-verity support to f2fs. fs-verity is a filesystem feature that
enables transparent integrity protection and authentication of read-only
files. It uses a dm-verity like mechanism at the file level: a Merkle
tree is used to verify any block in the file in log(filesize) time. It
is implemented mainly by helper functions in fs/verity/. See
Documentation/filesystems/fsverity.rst for the full documentation.
The f2fs support for fs-verity consists of:
- Adding a filesystem feature flag and an inode flag for fs-verity.
- Implementing the fsverity_operations to support enabling verity on an
inode and reading/writing the verity metadata.
- Updating ->readpages() to verify data as it's read from verity files
and to support reading verity metadata pages.
- Updating ->write_begin(), ->write_end(), and ->writepages() to support
writing verity metadata pages.
- Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl().
Like ext4, f2fs stores the verity metadata (Merkle tree and
fsverity_descriptor) past the end of the file, starting at the first 64K
boundary beyond i_size. This approach works because (a) verity files
are readonly, and (b) pages fully beyond i_size aren't visible to
userspace but can be read/written internally by f2fs with only some
relatively small changes to f2fs. Extended attributes cannot be used
because (a) f2fs limits the total size of an inode's xattr entries to
4096 bytes, which wouldn't be enough for even a single Merkle tree
block, and (b) f2fs encryption doesn't encrypt xattrs, yet the verity
metadata *must* be encrypted when the file is because it contains hashes
of the plaintext data.
Acked-by: Jaegeuk Kim <jaegeuk@kernel.org>
Acked-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
2019-07-23 00:26:24 +08:00
|
|
|
is_readahead ? REQ_RAHEAD : 0, page->index);
|
2019-03-25 21:07:30 +08:00
|
|
|
if (IS_ERR(bio)) {
|
|
|
|
ret = PTR_ERR(bio);
|
|
|
|
bio = NULL;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the page is under writeback, we need to wait for
|
|
|
|
* its completion to see the correct decrypted data.
|
|
|
|
*/
|
|
|
|
f2fs_wait_on_block_writeback(inode, block_nr);
|
|
|
|
|
|
|
|
if (bio_add_page(bio, page, blocksize, 0) < blocksize)
|
|
|
|
goto submit_and_realloc;
|
|
|
|
|
|
|
|
inc_page_count(F2FS_I_SB(inode), F2FS_RD_DATA);
|
|
|
|
ClearPageError(page);
|
|
|
|
*last_block_in_bio = block_nr;
|
|
|
|
goto out;
|
|
|
|
confused:
|
|
|
|
if (bio) {
|
|
|
|
__submit_bio(F2FS_I_SB(inode), bio, DATA);
|
|
|
|
bio = NULL;
|
|
|
|
}
|
|
|
|
unlock_page(page);
|
|
|
|
out:
|
|
|
|
*bio_ret = bio;
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2015-04-10 02:20:42 +08:00
|
|
|
/*
|
|
|
|
* This function was originally taken from fs/mpage.c, and customized for f2fs.
|
|
|
|
* Major change was from block_size == page_size in f2fs by default.
|
2018-06-22 02:29:43 +08:00
|
|
|
*
|
|
|
|
* Note that the aops->readpages() function is ONLY used for read-ahead. If
|
|
|
|
* this function ever deviates from doing just read-ahead, it should either
|
|
|
|
* use ->readpage() or do the necessary surgery to decouple ->readpages()
|
|
|
|
* from read-ahead.
|
2015-04-10 02:20:42 +08:00
|
|
|
*/
|
|
|
|
static int f2fs_mpage_readpages(struct address_space *mapping,
|
|
|
|
struct list_head *pages, struct page *page,
|
2018-06-22 02:29:43 +08:00
|
|
|
unsigned nr_pages, bool is_readahead)
|
2015-04-10 02:20:42 +08:00
|
|
|
{
|
|
|
|
struct bio *bio = NULL;
|
|
|
|
sector_t last_block_in_bio = 0;
|
|
|
|
struct inode *inode = mapping->host;
|
|
|
|
struct f2fs_map_blocks map;
|
2019-03-25 21:07:30 +08:00
|
|
|
int ret = 0;
|
2015-04-10 02:20:42 +08:00
|
|
|
|
|
|
|
map.m_pblk = 0;
|
|
|
|
map.m_lblk = 0;
|
|
|
|
map.m_len = 0;
|
|
|
|
map.m_flags = 0;
|
2016-01-26 15:42:58 +08:00
|
|
|
map.m_next_pgofs = NULL;
|
2018-01-11 14:42:30 +08:00
|
|
|
map.m_next_extent = NULL;
|
2017-11-28 08:23:00 +08:00
|
|
|
map.m_seg_type = NO_CHECK_TYPE;
|
2018-11-13 14:33:45 +08:00
|
|
|
map.m_may_create = false;
|
2015-04-10 02:20:42 +08:00
|
|
|
|
2017-11-25 11:46:18 +08:00
|
|
|
for (; nr_pages; nr_pages--) {
|
2015-04-10 02:20:42 +08:00
|
|
|
if (pages) {
|
2017-01-07 18:49:42 +08:00
|
|
|
page = list_last_entry(pages, struct page, lru);
|
2017-03-13 16:35:13 +08:00
|
|
|
|
|
|
|
prefetchw(&page->flags);
|
2015-04-10 02:20:42 +08:00
|
|
|
list_del(&page->lru);
|
|
|
|
if (add_to_page_cache_lru(page, mapping,
|
2019-07-02 10:15:29 +08:00
|
|
|
page_index(page),
|
2016-07-27 06:24:53 +08:00
|
|
|
readahead_gfp_mask(mapping)))
|
2015-04-10 02:20:42 +08:00
|
|
|
goto next_page;
|
|
|
|
}
|
|
|
|
|
2019-03-25 21:07:30 +08:00
|
|
|
ret = f2fs_read_single_page(inode, page, nr_pages, &map, &bio,
|
|
|
|
&last_block_in_bio, is_readahead);
|
|
|
|
if (ret) {
|
|
|
|
SetPageError(page);
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
zero_user_segment(page, 0, PAGE_SIZE);
|
2015-04-10 02:20:42 +08:00
|
|
|
unlock_page(page);
|
|
|
|
}
|
|
|
|
next_page:
|
|
|
|
if (pages)
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
put_page(page);
|
2015-04-10 02:20:42 +08:00
|
|
|
}
|
|
|
|
BUG_ON(pages && !list_empty(pages));
|
|
|
|
if (bio)
|
2016-07-28 01:36:31 +08:00
|
|
|
__submit_bio(F2FS_I_SB(inode), bio, DATA);
|
2019-03-25 21:07:30 +08:00
|
|
|
return pages ? 0 : ret;
|
2015-04-10 02:20:42 +08:00
|
|
|
}
|
|
|
|
|
2012-11-02 16:10:12 +08:00
|
|
|
static int f2fs_read_data_page(struct file *file, struct page *page)
|
|
|
|
{
|
2019-07-02 10:15:29 +08:00
|
|
|
struct inode *inode = page_file_mapping(page)->host;
|
2014-10-24 10:48:09 +08:00
|
|
|
int ret = -EAGAIN;
|
f2fs: handle inline data operations
Hook inline data read/write, truncate, fallocate, setattr, etc.
Files need meet following 2 requirement to inline:
1) file size is not greater than MAX_INLINE_DATA;
2) file doesn't pre-allocate data blocks by fallocate().
FI_INLINE_DATA will not be set while creating a new regular inode because
most of the files are bigger than ~3.4K. Set FI_INLINE_DATA only when
data is submitted to block layer, ranther than set it while creating a new
inode, this also avoids converting data from inline to normal data block
and vice versa.
While writting inline data to inode block, the first data block should be
released if the file has a block indexed by i_addr[0].
On the other hand, when a file operation is appied to a file with inline
data, we need to test if this file can remain inline by doing this
operation, otherwise it should be convert into normal file by reserving
a new data block, copying inline data to this new block and clear
FI_INLINE_DATA flag. Because reserve a new data block here will make use
of i_addr[0], if we save inline data in i_addr[0..872], then the first
4 bytes would be overwriten. This problem can be avoided simply by
not using i_addr[0] for inline data.
Signed-off-by: Huajun Li <huajun.li@intel.com>
Signed-off-by: Haicheng Li <haicheng.li@linux.intel.com>
Signed-off-by: Weihong Xu <weihong.xu@intel.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-11-10 23:13:20 +08:00
|
|
|
|
2014-05-06 16:53:08 +08:00
|
|
|
trace_f2fs_readpage(page, DATA);
|
|
|
|
|
2014-08-06 22:22:50 +08:00
|
|
|
/* If the file has inline data, try to read it directly */
|
f2fs: handle inline data operations
Hook inline data read/write, truncate, fallocate, setattr, etc.
Files need meet following 2 requirement to inline:
1) file size is not greater than MAX_INLINE_DATA;
2) file doesn't pre-allocate data blocks by fallocate().
FI_INLINE_DATA will not be set while creating a new regular inode because
most of the files are bigger than ~3.4K. Set FI_INLINE_DATA only when
data is submitted to block layer, ranther than set it while creating a new
inode, this also avoids converting data from inline to normal data block
and vice versa.
While writting inline data to inode block, the first data block should be
released if the file has a block indexed by i_addr[0].
On the other hand, when a file operation is appied to a file with inline
data, we need to test if this file can remain inline by doing this
operation, otherwise it should be convert into normal file by reserving
a new data block, copying inline data to this new block and clear
FI_INLINE_DATA flag. Because reserve a new data block here will make use
of i_addr[0], if we save inline data in i_addr[0..872], then the first
4 bytes would be overwriten. This problem can be avoided simply by
not using i_addr[0] for inline data.
Signed-off-by: Huajun Li <huajun.li@intel.com>
Signed-off-by: Haicheng Li <haicheng.li@linux.intel.com>
Signed-off-by: Weihong Xu <weihong.xu@intel.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-11-10 23:13:20 +08:00
|
|
|
if (f2fs_has_inline_data(inode))
|
|
|
|
ret = f2fs_read_inline_data(inode, page);
|
2014-10-24 10:48:09 +08:00
|
|
|
if (ret == -EAGAIN)
|
2019-07-02 10:15:29 +08:00
|
|
|
ret = f2fs_mpage_readpages(page_file_mapping(page),
|
|
|
|
NULL, page, 1, false);
|
f2fs: handle inline data operations
Hook inline data read/write, truncate, fallocate, setattr, etc.
Files need meet following 2 requirement to inline:
1) file size is not greater than MAX_INLINE_DATA;
2) file doesn't pre-allocate data blocks by fallocate().
FI_INLINE_DATA will not be set while creating a new regular inode because
most of the files are bigger than ~3.4K. Set FI_INLINE_DATA only when
data is submitted to block layer, ranther than set it while creating a new
inode, this also avoids converting data from inline to normal data block
and vice versa.
While writting inline data to inode block, the first data block should be
released if the file has a block indexed by i_addr[0].
On the other hand, when a file operation is appied to a file with inline
data, we need to test if this file can remain inline by doing this
operation, otherwise it should be convert into normal file by reserving
a new data block, copying inline data to this new block and clear
FI_INLINE_DATA flag. Because reserve a new data block here will make use
of i_addr[0], if we save inline data in i_addr[0..872], then the first
4 bytes would be overwriten. This problem can be avoided simply by
not using i_addr[0] for inline data.
Signed-off-by: Huajun Li <huajun.li@intel.com>
Signed-off-by: Haicheng Li <haicheng.li@linux.intel.com>
Signed-off-by: Weihong Xu <weihong.xu@intel.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-11-10 23:13:20 +08:00
|
|
|
return ret;
|
2012-11-02 16:10:12 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static int f2fs_read_data_pages(struct file *file,
|
|
|
|
struct address_space *mapping,
|
|
|
|
struct list_head *pages, unsigned nr_pages)
|
|
|
|
{
|
2017-09-24 02:45:42 +08:00
|
|
|
struct inode *inode = mapping->host;
|
2017-01-07 18:49:42 +08:00
|
|
|
struct page *page = list_last_entry(pages, struct page, lru);
|
2015-10-12 17:02:26 +08:00
|
|
|
|
|
|
|
trace_f2fs_readpages(inode, page, nr_pages);
|
f2fs: handle inline data operations
Hook inline data read/write, truncate, fallocate, setattr, etc.
Files need meet following 2 requirement to inline:
1) file size is not greater than MAX_INLINE_DATA;
2) file doesn't pre-allocate data blocks by fallocate().
FI_INLINE_DATA will not be set while creating a new regular inode because
most of the files are bigger than ~3.4K. Set FI_INLINE_DATA only when
data is submitted to block layer, ranther than set it while creating a new
inode, this also avoids converting data from inline to normal data block
and vice versa.
While writting inline data to inode block, the first data block should be
released if the file has a block indexed by i_addr[0].
On the other hand, when a file operation is appied to a file with inline
data, we need to test if this file can remain inline by doing this
operation, otherwise it should be convert into normal file by reserving
a new data block, copying inline data to this new block and clear
FI_INLINE_DATA flag. Because reserve a new data block here will make use
of i_addr[0], if we save inline data in i_addr[0..872], then the first
4 bytes would be overwriten. This problem can be avoided simply by
not using i_addr[0] for inline data.
Signed-off-by: Huajun Li <huajun.li@intel.com>
Signed-off-by: Haicheng Li <haicheng.li@linux.intel.com>
Signed-off-by: Weihong Xu <weihong.xu@intel.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-11-10 23:13:20 +08:00
|
|
|
|
|
|
|
/* If the file has inline data, skip readpages */
|
|
|
|
if (f2fs_has_inline_data(inode))
|
|
|
|
return 0;
|
|
|
|
|
2018-06-22 02:29:43 +08:00
|
|
|
return f2fs_mpage_readpages(mapping, pages, NULL, nr_pages, true);
|
2012-11-02 16:10:12 +08:00
|
|
|
}
|
|
|
|
|
2017-04-25 20:45:12 +08:00
|
|
|
static int encrypt_one_page(struct f2fs_io_info *fio)
|
|
|
|
{
|
|
|
|
struct inode *inode = fio->page->mapping->host;
|
f2fs: readahead encrypted block during GC
During GC, for each encrypted block, we will read block synchronously
into meta page, and then submit it into current cold data log area.
So this block read model with 4k granularity can make poor performance,
like migrating non-encrypted block, let's readahead encrypted block
as well to improve migration performance.
To implement this, we choose meta page that its index is old block
address of the encrypted block, and readahead ciphertext into this
page, later, if readaheaded page is still updated, we will load its
data into target meta page, and submit the write IO.
Note that for OPU, truncation, deletion, we need to invalid meta
page after we invalid old block address, to make sure we won't load
invalid data from target meta page during encrypted block migration.
for ((i = 0; i < 1000; i++))
do {
xfs_io -f /mnt/f2fs/dir/$i -c "pwrite 0 128k" -c "fsync";
} done
for ((i = 0; i < 1000; i+=2))
do {
rm /mnt/f2fs/dir/$i;
} done
ret = ioctl(fd, F2FS_IOC_GARBAGE_COLLECT, 0);
Before:
gc-6549 [001] d..1 214682.212797: block_rq_insert: 8,32 RA 32768 () 786400 + 64 [gc]
gc-6549 [001] d..1 214682.212802: block_unplug: [gc] 1
gc-6549 [001] .... 214682.213892: block_bio_queue: 8,32 R 67494144 + 8 [gc]
gc-6549 [001] .... 214682.213899: block_getrq: 8,32 R 67494144 + 8 [gc]
gc-6549 [001] .... 214682.213902: block_plug: [gc]
gc-6549 [001] d..1 214682.213905: block_rq_insert: 8,32 R 4096 () 67494144 + 8 [gc]
gc-6549 [001] d..1 214682.213908: block_unplug: [gc] 1
gc-6549 [001] .... 214682.226405: block_bio_queue: 8,32 R 67494152 + 8 [gc]
gc-6549 [001] .... 214682.226412: block_getrq: 8,32 R 67494152 + 8 [gc]
gc-6549 [001] .... 214682.226414: block_plug: [gc]
gc-6549 [001] d..1 214682.226417: block_rq_insert: 8,32 R 4096 () 67494152 + 8 [gc]
gc-6549 [001] d..1 214682.226420: block_unplug: [gc] 1
gc-6549 [001] .... 214682.226904: block_bio_queue: 8,32 R 67494160 + 8 [gc]
gc-6549 [001] .... 214682.226910: block_getrq: 8,32 R 67494160 + 8 [gc]
gc-6549 [001] .... 214682.226911: block_plug: [gc]
gc-6549 [001] d..1 214682.226914: block_rq_insert: 8,32 R 4096 () 67494160 + 8 [gc]
gc-6549 [001] d..1 214682.226916: block_unplug: [gc] 1
After:
gc-5678 [003] .... 214327.025906: block_bio_queue: 8,32 R 67493824 + 8 [gc]
gc-5678 [003] .... 214327.025908: block_bio_backmerge: 8,32 R 67493824 + 8 [gc]
gc-5678 [003] .... 214327.025915: block_bio_queue: 8,32 R 67493832 + 8 [gc]
gc-5678 [003] .... 214327.025917: block_bio_backmerge: 8,32 R 67493832 + 8 [gc]
gc-5678 [003] .... 214327.025923: block_bio_queue: 8,32 R 67493840 + 8 [gc]
gc-5678 [003] .... 214327.025925: block_bio_backmerge: 8,32 R 67493840 + 8 [gc]
gc-5678 [003] .... 214327.025932: block_bio_queue: 8,32 R 67493848 + 8 [gc]
gc-5678 [003] .... 214327.025934: block_bio_backmerge: 8,32 R 67493848 + 8 [gc]
gc-5678 [003] .... 214327.025941: block_bio_queue: 8,32 R 67493856 + 8 [gc]
gc-5678 [003] .... 214327.025943: block_bio_backmerge: 8,32 R 67493856 + 8 [gc]
gc-5678 [003] .... 214327.025953: block_bio_queue: 8,32 R 67493864 + 8 [gc]
gc-5678 [003] .... 214327.025955: block_bio_backmerge: 8,32 R 67493864 + 8 [gc]
gc-5678 [003] .... 214327.025962: block_bio_queue: 8,32 R 67493872 + 8 [gc]
gc-5678 [003] .... 214327.025964: block_bio_backmerge: 8,32 R 67493872 + 8 [gc]
gc-5678 [003] .... 214327.025970: block_bio_queue: 8,32 R 67493880 + 8 [gc]
gc-5678 [003] .... 214327.025972: block_bio_backmerge: 8,32 R 67493880 + 8 [gc]
gc-5678 [003] .... 214327.026000: block_bio_queue: 8,32 WS 34123776 + 2048 [gc]
gc-5678 [003] .... 214327.026019: block_getrq: 8,32 WS 34123776 + 2048 [gc]
gc-5678 [003] d..1 214327.026021: block_rq_insert: 8,32 R 131072 () 67493632 + 256 [gc]
gc-5678 [003] d..1 214327.026023: block_unplug: [gc] 1
gc-5678 [003] d..1 214327.026026: block_rq_issue: 8,32 R 131072 () 67493632 + 256 [gc]
gc-5678 [003] .... 214327.026046: block_plug: [gc]
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-08-14 22:37:25 +08:00
|
|
|
struct page *mpage;
|
2017-04-25 20:45:12 +08:00
|
|
|
gfp_t gfp_flags = GFP_NOFS;
|
|
|
|
|
2017-09-06 07:54:24 +08:00
|
|
|
if (!f2fs_encrypted_file(inode))
|
2017-04-25 20:45:12 +08:00
|
|
|
return 0;
|
|
|
|
|
f2fs: refactor read path to allow multiple postprocessing steps
Currently f2fs's ->readpage() and ->readpages() assume that either the
data undergoes no postprocessing, or decryption only. But with
fs-verity, there will be an additional authenticity verification step,
and it may be needed either by itself, or combined with decryption.
To support this, store a 'struct bio_post_read_ctx' in ->bi_private
which contains a work struct, a bitmask of postprocessing steps that are
enabled, and an indicator of the current step. The bio completion
routine, if there was no I/O error, enqueues the first postprocessing
step. When that completes, it continues to the next step. Pages that
fail any postprocessing step have PageError set. Once all steps have
completed, pages without PageError set are set Uptodate, and all pages
are unlocked.
Also replace f2fs_encrypted_file() with a new function
f2fs_post_read_required() in places like direct I/O and garbage
collection that really should be testing whether the file needs special
I/O processing, not whether it is encrypted specifically.
This may also be useful for other future f2fs features such as
compression.
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-04-19 02:09:48 +08:00
|
|
|
/* wait for GCed page writeback via META_MAPPING */
|
2018-08-23 12:18:00 +08:00
|
|
|
f2fs_wait_on_block_writeback(inode, fio->old_blkaddr);
|
2017-04-25 20:45:12 +08:00
|
|
|
|
|
|
|
retry_encrypt:
|
2019-05-21 00:29:44 +08:00
|
|
|
fio->encrypted_page = fscrypt_encrypt_pagecache_blocks(fio->page,
|
|
|
|
PAGE_SIZE, 0,
|
|
|
|
gfp_flags);
|
f2fs: readahead encrypted block during GC
During GC, for each encrypted block, we will read block synchronously
into meta page, and then submit it into current cold data log area.
So this block read model with 4k granularity can make poor performance,
like migrating non-encrypted block, let's readahead encrypted block
as well to improve migration performance.
To implement this, we choose meta page that its index is old block
address of the encrypted block, and readahead ciphertext into this
page, later, if readaheaded page is still updated, we will load its
data into target meta page, and submit the write IO.
Note that for OPU, truncation, deletion, we need to invalid meta
page after we invalid old block address, to make sure we won't load
invalid data from target meta page during encrypted block migration.
for ((i = 0; i < 1000; i++))
do {
xfs_io -f /mnt/f2fs/dir/$i -c "pwrite 0 128k" -c "fsync";
} done
for ((i = 0; i < 1000; i+=2))
do {
rm /mnt/f2fs/dir/$i;
} done
ret = ioctl(fd, F2FS_IOC_GARBAGE_COLLECT, 0);
Before:
gc-6549 [001] d..1 214682.212797: block_rq_insert: 8,32 RA 32768 () 786400 + 64 [gc]
gc-6549 [001] d..1 214682.212802: block_unplug: [gc] 1
gc-6549 [001] .... 214682.213892: block_bio_queue: 8,32 R 67494144 + 8 [gc]
gc-6549 [001] .... 214682.213899: block_getrq: 8,32 R 67494144 + 8 [gc]
gc-6549 [001] .... 214682.213902: block_plug: [gc]
gc-6549 [001] d..1 214682.213905: block_rq_insert: 8,32 R 4096 () 67494144 + 8 [gc]
gc-6549 [001] d..1 214682.213908: block_unplug: [gc] 1
gc-6549 [001] .... 214682.226405: block_bio_queue: 8,32 R 67494152 + 8 [gc]
gc-6549 [001] .... 214682.226412: block_getrq: 8,32 R 67494152 + 8 [gc]
gc-6549 [001] .... 214682.226414: block_plug: [gc]
gc-6549 [001] d..1 214682.226417: block_rq_insert: 8,32 R 4096 () 67494152 + 8 [gc]
gc-6549 [001] d..1 214682.226420: block_unplug: [gc] 1
gc-6549 [001] .... 214682.226904: block_bio_queue: 8,32 R 67494160 + 8 [gc]
gc-6549 [001] .... 214682.226910: block_getrq: 8,32 R 67494160 + 8 [gc]
gc-6549 [001] .... 214682.226911: block_plug: [gc]
gc-6549 [001] d..1 214682.226914: block_rq_insert: 8,32 R 4096 () 67494160 + 8 [gc]
gc-6549 [001] d..1 214682.226916: block_unplug: [gc] 1
After:
gc-5678 [003] .... 214327.025906: block_bio_queue: 8,32 R 67493824 + 8 [gc]
gc-5678 [003] .... 214327.025908: block_bio_backmerge: 8,32 R 67493824 + 8 [gc]
gc-5678 [003] .... 214327.025915: block_bio_queue: 8,32 R 67493832 + 8 [gc]
gc-5678 [003] .... 214327.025917: block_bio_backmerge: 8,32 R 67493832 + 8 [gc]
gc-5678 [003] .... 214327.025923: block_bio_queue: 8,32 R 67493840 + 8 [gc]
gc-5678 [003] .... 214327.025925: block_bio_backmerge: 8,32 R 67493840 + 8 [gc]
gc-5678 [003] .... 214327.025932: block_bio_queue: 8,32 R 67493848 + 8 [gc]
gc-5678 [003] .... 214327.025934: block_bio_backmerge: 8,32 R 67493848 + 8 [gc]
gc-5678 [003] .... 214327.025941: block_bio_queue: 8,32 R 67493856 + 8 [gc]
gc-5678 [003] .... 214327.025943: block_bio_backmerge: 8,32 R 67493856 + 8 [gc]
gc-5678 [003] .... 214327.025953: block_bio_queue: 8,32 R 67493864 + 8 [gc]
gc-5678 [003] .... 214327.025955: block_bio_backmerge: 8,32 R 67493864 + 8 [gc]
gc-5678 [003] .... 214327.025962: block_bio_queue: 8,32 R 67493872 + 8 [gc]
gc-5678 [003] .... 214327.025964: block_bio_backmerge: 8,32 R 67493872 + 8 [gc]
gc-5678 [003] .... 214327.025970: block_bio_queue: 8,32 R 67493880 + 8 [gc]
gc-5678 [003] .... 214327.025972: block_bio_backmerge: 8,32 R 67493880 + 8 [gc]
gc-5678 [003] .... 214327.026000: block_bio_queue: 8,32 WS 34123776 + 2048 [gc]
gc-5678 [003] .... 214327.026019: block_getrq: 8,32 WS 34123776 + 2048 [gc]
gc-5678 [003] d..1 214327.026021: block_rq_insert: 8,32 R 131072 () 67493632 + 256 [gc]
gc-5678 [003] d..1 214327.026023: block_unplug: [gc] 1
gc-5678 [003] d..1 214327.026026: block_rq_issue: 8,32 R 131072 () 67493632 + 256 [gc]
gc-5678 [003] .... 214327.026046: block_plug: [gc]
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-08-14 22:37:25 +08:00
|
|
|
if (IS_ERR(fio->encrypted_page)) {
|
|
|
|
/* flush pending IOs and wait for a while in the ENOMEM case */
|
|
|
|
if (PTR_ERR(fio->encrypted_page) == -ENOMEM) {
|
|
|
|
f2fs_flush_merged_writes(fio->sbi);
|
|
|
|
congestion_wait(BLK_RW_ASYNC, HZ/50);
|
|
|
|
gfp_flags |= __GFP_NOFAIL;
|
|
|
|
goto retry_encrypt;
|
|
|
|
}
|
|
|
|
return PTR_ERR(fio->encrypted_page);
|
|
|
|
}
|
2017-04-25 20:45:12 +08:00
|
|
|
|
f2fs: readahead encrypted block during GC
During GC, for each encrypted block, we will read block synchronously
into meta page, and then submit it into current cold data log area.
So this block read model with 4k granularity can make poor performance,
like migrating non-encrypted block, let's readahead encrypted block
as well to improve migration performance.
To implement this, we choose meta page that its index is old block
address of the encrypted block, and readahead ciphertext into this
page, later, if readaheaded page is still updated, we will load its
data into target meta page, and submit the write IO.
Note that for OPU, truncation, deletion, we need to invalid meta
page after we invalid old block address, to make sure we won't load
invalid data from target meta page during encrypted block migration.
for ((i = 0; i < 1000; i++))
do {
xfs_io -f /mnt/f2fs/dir/$i -c "pwrite 0 128k" -c "fsync";
} done
for ((i = 0; i < 1000; i+=2))
do {
rm /mnt/f2fs/dir/$i;
} done
ret = ioctl(fd, F2FS_IOC_GARBAGE_COLLECT, 0);
Before:
gc-6549 [001] d..1 214682.212797: block_rq_insert: 8,32 RA 32768 () 786400 + 64 [gc]
gc-6549 [001] d..1 214682.212802: block_unplug: [gc] 1
gc-6549 [001] .... 214682.213892: block_bio_queue: 8,32 R 67494144 + 8 [gc]
gc-6549 [001] .... 214682.213899: block_getrq: 8,32 R 67494144 + 8 [gc]
gc-6549 [001] .... 214682.213902: block_plug: [gc]
gc-6549 [001] d..1 214682.213905: block_rq_insert: 8,32 R 4096 () 67494144 + 8 [gc]
gc-6549 [001] d..1 214682.213908: block_unplug: [gc] 1
gc-6549 [001] .... 214682.226405: block_bio_queue: 8,32 R 67494152 + 8 [gc]
gc-6549 [001] .... 214682.226412: block_getrq: 8,32 R 67494152 + 8 [gc]
gc-6549 [001] .... 214682.226414: block_plug: [gc]
gc-6549 [001] d..1 214682.226417: block_rq_insert: 8,32 R 4096 () 67494152 + 8 [gc]
gc-6549 [001] d..1 214682.226420: block_unplug: [gc] 1
gc-6549 [001] .... 214682.226904: block_bio_queue: 8,32 R 67494160 + 8 [gc]
gc-6549 [001] .... 214682.226910: block_getrq: 8,32 R 67494160 + 8 [gc]
gc-6549 [001] .... 214682.226911: block_plug: [gc]
gc-6549 [001] d..1 214682.226914: block_rq_insert: 8,32 R 4096 () 67494160 + 8 [gc]
gc-6549 [001] d..1 214682.226916: block_unplug: [gc] 1
After:
gc-5678 [003] .... 214327.025906: block_bio_queue: 8,32 R 67493824 + 8 [gc]
gc-5678 [003] .... 214327.025908: block_bio_backmerge: 8,32 R 67493824 + 8 [gc]
gc-5678 [003] .... 214327.025915: block_bio_queue: 8,32 R 67493832 + 8 [gc]
gc-5678 [003] .... 214327.025917: block_bio_backmerge: 8,32 R 67493832 + 8 [gc]
gc-5678 [003] .... 214327.025923: block_bio_queue: 8,32 R 67493840 + 8 [gc]
gc-5678 [003] .... 214327.025925: block_bio_backmerge: 8,32 R 67493840 + 8 [gc]
gc-5678 [003] .... 214327.025932: block_bio_queue: 8,32 R 67493848 + 8 [gc]
gc-5678 [003] .... 214327.025934: block_bio_backmerge: 8,32 R 67493848 + 8 [gc]
gc-5678 [003] .... 214327.025941: block_bio_queue: 8,32 R 67493856 + 8 [gc]
gc-5678 [003] .... 214327.025943: block_bio_backmerge: 8,32 R 67493856 + 8 [gc]
gc-5678 [003] .... 214327.025953: block_bio_queue: 8,32 R 67493864 + 8 [gc]
gc-5678 [003] .... 214327.025955: block_bio_backmerge: 8,32 R 67493864 + 8 [gc]
gc-5678 [003] .... 214327.025962: block_bio_queue: 8,32 R 67493872 + 8 [gc]
gc-5678 [003] .... 214327.025964: block_bio_backmerge: 8,32 R 67493872 + 8 [gc]
gc-5678 [003] .... 214327.025970: block_bio_queue: 8,32 R 67493880 + 8 [gc]
gc-5678 [003] .... 214327.025972: block_bio_backmerge: 8,32 R 67493880 + 8 [gc]
gc-5678 [003] .... 214327.026000: block_bio_queue: 8,32 WS 34123776 + 2048 [gc]
gc-5678 [003] .... 214327.026019: block_getrq: 8,32 WS 34123776 + 2048 [gc]
gc-5678 [003] d..1 214327.026021: block_rq_insert: 8,32 R 131072 () 67493632 + 256 [gc]
gc-5678 [003] d..1 214327.026023: block_unplug: [gc] 1
gc-5678 [003] d..1 214327.026026: block_rq_issue: 8,32 R 131072 () 67493632 + 256 [gc]
gc-5678 [003] .... 214327.026046: block_plug: [gc]
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-08-14 22:37:25 +08:00
|
|
|
mpage = find_lock_page(META_MAPPING(fio->sbi), fio->old_blkaddr);
|
|
|
|
if (mpage) {
|
|
|
|
if (PageUptodate(mpage))
|
|
|
|
memcpy(page_address(mpage),
|
|
|
|
page_address(fio->encrypted_page), PAGE_SIZE);
|
|
|
|
f2fs_put_page(mpage, 1);
|
2017-04-25 20:45:12 +08:00
|
|
|
}
|
f2fs: readahead encrypted block during GC
During GC, for each encrypted block, we will read block synchronously
into meta page, and then submit it into current cold data log area.
So this block read model with 4k granularity can make poor performance,
like migrating non-encrypted block, let's readahead encrypted block
as well to improve migration performance.
To implement this, we choose meta page that its index is old block
address of the encrypted block, and readahead ciphertext into this
page, later, if readaheaded page is still updated, we will load its
data into target meta page, and submit the write IO.
Note that for OPU, truncation, deletion, we need to invalid meta
page after we invalid old block address, to make sure we won't load
invalid data from target meta page during encrypted block migration.
for ((i = 0; i < 1000; i++))
do {
xfs_io -f /mnt/f2fs/dir/$i -c "pwrite 0 128k" -c "fsync";
} done
for ((i = 0; i < 1000; i+=2))
do {
rm /mnt/f2fs/dir/$i;
} done
ret = ioctl(fd, F2FS_IOC_GARBAGE_COLLECT, 0);
Before:
gc-6549 [001] d..1 214682.212797: block_rq_insert: 8,32 RA 32768 () 786400 + 64 [gc]
gc-6549 [001] d..1 214682.212802: block_unplug: [gc] 1
gc-6549 [001] .... 214682.213892: block_bio_queue: 8,32 R 67494144 + 8 [gc]
gc-6549 [001] .... 214682.213899: block_getrq: 8,32 R 67494144 + 8 [gc]
gc-6549 [001] .... 214682.213902: block_plug: [gc]
gc-6549 [001] d..1 214682.213905: block_rq_insert: 8,32 R 4096 () 67494144 + 8 [gc]
gc-6549 [001] d..1 214682.213908: block_unplug: [gc] 1
gc-6549 [001] .... 214682.226405: block_bio_queue: 8,32 R 67494152 + 8 [gc]
gc-6549 [001] .... 214682.226412: block_getrq: 8,32 R 67494152 + 8 [gc]
gc-6549 [001] .... 214682.226414: block_plug: [gc]
gc-6549 [001] d..1 214682.226417: block_rq_insert: 8,32 R 4096 () 67494152 + 8 [gc]
gc-6549 [001] d..1 214682.226420: block_unplug: [gc] 1
gc-6549 [001] .... 214682.226904: block_bio_queue: 8,32 R 67494160 + 8 [gc]
gc-6549 [001] .... 214682.226910: block_getrq: 8,32 R 67494160 + 8 [gc]
gc-6549 [001] .... 214682.226911: block_plug: [gc]
gc-6549 [001] d..1 214682.226914: block_rq_insert: 8,32 R 4096 () 67494160 + 8 [gc]
gc-6549 [001] d..1 214682.226916: block_unplug: [gc] 1
After:
gc-5678 [003] .... 214327.025906: block_bio_queue: 8,32 R 67493824 + 8 [gc]
gc-5678 [003] .... 214327.025908: block_bio_backmerge: 8,32 R 67493824 + 8 [gc]
gc-5678 [003] .... 214327.025915: block_bio_queue: 8,32 R 67493832 + 8 [gc]
gc-5678 [003] .... 214327.025917: block_bio_backmerge: 8,32 R 67493832 + 8 [gc]
gc-5678 [003] .... 214327.025923: block_bio_queue: 8,32 R 67493840 + 8 [gc]
gc-5678 [003] .... 214327.025925: block_bio_backmerge: 8,32 R 67493840 + 8 [gc]
gc-5678 [003] .... 214327.025932: block_bio_queue: 8,32 R 67493848 + 8 [gc]
gc-5678 [003] .... 214327.025934: block_bio_backmerge: 8,32 R 67493848 + 8 [gc]
gc-5678 [003] .... 214327.025941: block_bio_queue: 8,32 R 67493856 + 8 [gc]
gc-5678 [003] .... 214327.025943: block_bio_backmerge: 8,32 R 67493856 + 8 [gc]
gc-5678 [003] .... 214327.025953: block_bio_queue: 8,32 R 67493864 + 8 [gc]
gc-5678 [003] .... 214327.025955: block_bio_backmerge: 8,32 R 67493864 + 8 [gc]
gc-5678 [003] .... 214327.025962: block_bio_queue: 8,32 R 67493872 + 8 [gc]
gc-5678 [003] .... 214327.025964: block_bio_backmerge: 8,32 R 67493872 + 8 [gc]
gc-5678 [003] .... 214327.025970: block_bio_queue: 8,32 R 67493880 + 8 [gc]
gc-5678 [003] .... 214327.025972: block_bio_backmerge: 8,32 R 67493880 + 8 [gc]
gc-5678 [003] .... 214327.026000: block_bio_queue: 8,32 WS 34123776 + 2048 [gc]
gc-5678 [003] .... 214327.026019: block_getrq: 8,32 WS 34123776 + 2048 [gc]
gc-5678 [003] d..1 214327.026021: block_rq_insert: 8,32 R 131072 () 67493632 + 256 [gc]
gc-5678 [003] d..1 214327.026023: block_unplug: [gc] 1
gc-5678 [003] d..1 214327.026026: block_rq_issue: 8,32 R 131072 () 67493632 + 256 [gc]
gc-5678 [003] .... 214327.026046: block_plug: [gc]
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-08-14 22:37:25 +08:00
|
|
|
return 0;
|
2017-04-25 20:45:12 +08:00
|
|
|
}
|
|
|
|
|
2018-01-17 16:31:38 +08:00
|
|
|
static inline bool check_inplace_update_policy(struct inode *inode,
|
|
|
|
struct f2fs_io_info *fio)
|
2017-04-25 20:45:12 +08:00
|
|
|
{
|
2018-01-17 16:31:38 +08:00
|
|
|
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
|
|
|
|
unsigned int policy = SM_I(sbi)->ipu_policy;
|
2017-04-25 20:45:12 +08:00
|
|
|
|
2018-01-17 16:31:38 +08:00
|
|
|
if (policy & (0x1 << F2FS_IPU_FORCE))
|
|
|
|
return true;
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
if (policy & (0x1 << F2FS_IPU_SSR) && f2fs_need_SSR(sbi))
|
2018-01-17 16:31:38 +08:00
|
|
|
return true;
|
|
|
|
if (policy & (0x1 << F2FS_IPU_UTIL) &&
|
|
|
|
utilization(sbi) > SM_I(sbi)->min_ipu_util)
|
|
|
|
return true;
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
if (policy & (0x1 << F2FS_IPU_SSR_UTIL) && f2fs_need_SSR(sbi) &&
|
2018-01-17 16:31:38 +08:00
|
|
|
utilization(sbi) > SM_I(sbi)->min_ipu_util)
|
|
|
|
return true;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* IPU for rewrite async pages
|
|
|
|
*/
|
|
|
|
if (policy & (0x1 << F2FS_IPU_ASYNC) &&
|
|
|
|
fio && fio->op == REQ_OP_WRITE &&
|
|
|
|
!(fio->op_flags & REQ_SYNC) &&
|
2018-12-12 17:50:11 +08:00
|
|
|
!IS_ENCRYPTED(inode))
|
2018-01-17 16:31:38 +08:00
|
|
|
return true;
|
|
|
|
|
|
|
|
/* this is only set during fdatasync */
|
|
|
|
if (policy & (0x1 << F2FS_IPU_FSYNC) &&
|
|
|
|
is_inode_flag_set(inode, FI_NEED_IPU))
|
|
|
|
return true;
|
|
|
|
|
2018-08-21 10:21:43 +08:00
|
|
|
if (unlikely(fio && is_sbi_flag_set(sbi, SBI_CP_DISABLED) &&
|
|
|
|
!f2fs_is_checkpointed_data(sbi, fio->old_blkaddr)))
|
|
|
|
return true;
|
|
|
|
|
2018-01-17 16:31:38 +08:00
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
bool f2fs_should_update_inplace(struct inode *inode, struct f2fs_io_info *fio)
|
2018-01-17 16:31:38 +08:00
|
|
|
{
|
2017-12-08 08:25:39 +08:00
|
|
|
if (f2fs_is_pinned_file(inode))
|
|
|
|
return true;
|
2018-01-17 16:31:38 +08:00
|
|
|
|
|
|
|
/* if this is cold file, we should overwrite to avoid fragmentation */
|
|
|
|
if (file_is_cold(inode))
|
|
|
|
return true;
|
|
|
|
|
|
|
|
return check_inplace_update_policy(inode, fio);
|
|
|
|
}
|
|
|
|
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
bool f2fs_should_update_outplace(struct inode *inode, struct f2fs_io_info *fio)
|
2018-01-17 16:31:38 +08:00
|
|
|
{
|
|
|
|
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
|
|
|
|
|
|
|
|
if (test_opt(sbi, LFS))
|
|
|
|
return true;
|
|
|
|
if (S_ISDIR(inode->i_mode))
|
|
|
|
return true;
|
f2fs: guarantee journalled quota data by checkpoint
For journalled quota mode, let checkpoint to flush dquot dirty data
and quota file data to guarntee persistence of all quota sysfile in
last checkpoint, by this way, we can avoid corrupting quota sysfile
when encountering SPO.
The implementation is as below:
1. add a global state SBI_QUOTA_NEED_FLUSH to indicate that there is
cached dquot metadata changes in quota subsystem, and later checkpoint
should:
a) flush dquot metadata into quota file.
b) flush quota file to storage to keep file usage be consistent.
2. add a global state SBI_QUOTA_NEED_REPAIR to indicate that quota
operation failed due to -EIO or -ENOSPC, so later,
a) checkpoint will skip syncing dquot metadata.
b) CP_QUOTA_NEED_FSCK_FLAG will be set in last cp pack to give a
hint for fsck repairing.
3. add a global state SBI_QUOTA_SKIP_FLUSH, in checkpoint, if quota
data updating is very heavy, it may cause hungtask in block_operation().
To avoid this, if our retry time exceed threshold, let's just skip
flushing and retry in next checkpoint().
Signed-off-by: Weichao Guo <guoweichao@huawei.com>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
[Jaegeuk Kim: avoid warnings and set fsck flag]
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-09-20 20:05:00 +08:00
|
|
|
if (IS_NOQUOTA(inode))
|
|
|
|
return true;
|
2018-01-17 16:31:38 +08:00
|
|
|
if (f2fs_is_atomic_file(inode))
|
|
|
|
return true;
|
|
|
|
if (fio) {
|
|
|
|
if (is_cold_data(fio->page))
|
|
|
|
return true;
|
|
|
|
if (IS_ATOMIC_WRITTEN_PAGE(fio->page))
|
|
|
|
return true;
|
2018-08-21 10:21:43 +08:00
|
|
|
if (unlikely(is_sbi_flag_set(sbi, SBI_CP_DISABLED) &&
|
|
|
|
f2fs_is_checkpointed_data(sbi, fio->old_blkaddr)))
|
|
|
|
return true;
|
2018-01-17 16:31:38 +08:00
|
|
|
}
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2017-04-25 20:45:12 +08:00
|
|
|
static inline bool need_inplace_update(struct f2fs_io_info *fio)
|
|
|
|
{
|
|
|
|
struct inode *inode = fio->page->mapping->host;
|
|
|
|
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
if (f2fs_should_update_outplace(inode, fio))
|
2017-04-25 20:45:12 +08:00
|
|
|
return false;
|
|
|
|
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
return f2fs_should_update_inplace(inode, fio);
|
2017-04-25 20:45:12 +08:00
|
|
|
}
|
|
|
|
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
int f2fs_do_write_data_page(struct f2fs_io_info *fio)
|
2012-11-02 16:10:12 +08:00
|
|
|
{
|
2015-04-24 05:38:15 +08:00
|
|
|
struct page *page = fio->page;
|
2012-11-02 16:10:12 +08:00
|
|
|
struct inode *inode = page->mapping->host;
|
|
|
|
struct dnode_of_data dn;
|
2017-04-25 20:45:13 +08:00
|
|
|
struct extent_info ei = {0,0,0};
|
2018-07-17 00:02:17 +08:00
|
|
|
struct node_info ni;
|
2017-04-25 20:45:13 +08:00
|
|
|
bool ipu_force = false;
|
2012-11-02 16:10:12 +08:00
|
|
|
int err = 0;
|
|
|
|
|
|
|
|
set_new_dnode(&dn, inode, NULL, NULL, 0);
|
2017-04-25 20:45:13 +08:00
|
|
|
if (need_inplace_update(fio) &&
|
|
|
|
f2fs_lookup_extent_cache(inode, page->index, &ei)) {
|
|
|
|
fio->old_blkaddr = ei.blk + page->index - ei.fofs;
|
2017-04-25 06:20:16 +08:00
|
|
|
|
f2fs: fix to do sanity check with block address in main area
This patch add to do sanity check with below field:
- cp_pack_total_block_count
- blkaddr of data/node
- extent info
- Overview
BUG() in verify_block_addr() when writing to a corrupted f2fs image
- Reproduce (4.18 upstream kernel)
- POC (poc.c)
static void activity(char *mpoint) {
char *foo_bar_baz;
int err;
static int buf[8192];
memset(buf, 0, sizeof(buf));
err = asprintf(&foo_bar_baz, "%s/foo/bar/baz", mpoint);
int fd = open(foo_bar_baz, O_RDWR | O_TRUNC, 0777);
if (fd >= 0) {
write(fd, (char *)buf, sizeof(buf));
fdatasync(fd);
close(fd);
}
}
int main(int argc, char *argv[]) {
activity(argv[1]);
return 0;
}
- Kernel message
[ 689.349473] F2FS-fs (loop0): Mounted with checkpoint version = 3
[ 699.728662] WARNING: CPU: 0 PID: 1309 at fs/f2fs/segment.c:2860 f2fs_inplace_write_data+0x232/0x240
[ 699.728670] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy
[ 699.729056] CPU: 0 PID: 1309 Comm: a.out Not tainted 4.18.0-rc1+ #4
[ 699.729064] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.729074] RIP: 0010:f2fs_inplace_write_data+0x232/0x240
[ 699.729076] Code: ff e9 cf fe ff ff 49 8d 7d 10 e8 39 45 ad ff 4d 8b 7d 10 be 04 00 00 00 49 8d 7f 48 e8 07 49 ad ff 45 8b 7f 48 e9 fb fe ff ff <0f> 0b f0 41 80 4d 48 04 e9 65 fe ff ff 90 66 66 66 66 90 55 48 8d
[ 699.729130] RSP: 0018:ffff8801f43af568 EFLAGS: 00010202
[ 699.729139] RAX: 000000000000003f RBX: ffff8801f43af7b8 RCX: ffffffffb88c9113
[ 699.729142] RDX: 0000000000000003 RSI: dffffc0000000000 RDI: ffff8802024e5540
[ 699.729144] RBP: ffff8801f43af590 R08: 0000000000000009 R09: ffffffffffffffe8
[ 699.729147] R10: 0000000000000001 R11: ffffed0039b0596a R12: ffff8802024e5540
[ 699.729149] R13: ffff8801f0335500 R14: ffff8801e3e7a700 R15: ffff8801e1ee4450
[ 699.729154] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.729156] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.729159] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.729171] Call Trace:
[ 699.729192] f2fs_do_write_data_page+0x2e2/0xe00
[ 699.729203] ? f2fs_should_update_outplace+0xd0/0xd0
[ 699.729238] ? memcg_drain_all_list_lrus+0x280/0x280
[ 699.729269] ? __radix_tree_replace+0xa3/0x120
[ 699.729276] __write_data_page+0x5c7/0xe30
[ 699.729291] ? kasan_check_read+0x11/0x20
[ 699.729310] ? page_mapped+0x8a/0x110
[ 699.729321] ? page_mkclean+0xe9/0x160
[ 699.729327] ? f2fs_do_write_data_page+0xe00/0xe00
[ 699.729331] ? invalid_page_referenced_vma+0x130/0x130
[ 699.729345] ? clear_page_dirty_for_io+0x332/0x450
[ 699.729351] f2fs_write_cache_pages+0x4ca/0x860
[ 699.729358] ? __write_data_page+0xe30/0xe30
[ 699.729374] ? percpu_counter_add_batch+0x22/0xa0
[ 699.729380] ? kasan_check_write+0x14/0x20
[ 699.729391] ? _raw_spin_lock+0x17/0x40
[ 699.729403] ? f2fs_mark_inode_dirty_sync.part.18+0x16/0x30
[ 699.729413] ? iov_iter_advance+0x113/0x640
[ 699.729418] ? f2fs_write_end+0x133/0x2e0
[ 699.729423] ? balance_dirty_pages_ratelimited+0x239/0x640
[ 699.729428] f2fs_write_data_pages+0x329/0x520
[ 699.729433] ? generic_perform_write+0x250/0x320
[ 699.729438] ? f2fs_write_cache_pages+0x860/0x860
[ 699.729454] ? current_time+0x110/0x110
[ 699.729459] ? f2fs_preallocate_blocks+0x1ef/0x370
[ 699.729464] do_writepages+0x37/0xb0
[ 699.729468] ? f2fs_write_cache_pages+0x860/0x860
[ 699.729472] ? do_writepages+0x37/0xb0
[ 699.729478] __filemap_fdatawrite_range+0x19a/0x1f0
[ 699.729483] ? delete_from_page_cache_batch+0x4e0/0x4e0
[ 699.729496] ? __vfs_write+0x2b2/0x410
[ 699.729501] file_write_and_wait_range+0x66/0xb0
[ 699.729506] f2fs_do_sync_file+0x1f9/0xd90
[ 699.729511] ? truncate_partial_data_page+0x290/0x290
[ 699.729521] ? __sb_end_write+0x30/0x50
[ 699.729526] ? vfs_write+0x20f/0x260
[ 699.729530] f2fs_sync_file+0x9a/0xb0
[ 699.729534] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.729548] vfs_fsync_range+0x68/0x100
[ 699.729554] ? __fget_light+0xc9/0xe0
[ 699.729558] do_fsync+0x3d/0x70
[ 699.729562] __x64_sys_fdatasync+0x24/0x30
[ 699.729585] do_syscall_64+0x78/0x170
[ 699.729595] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 699.729613] RIP: 0033:0x7f9bf930d800
[ 699.729615] Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 83 3d 49 bf 2c 00 00 75 10 b8 4b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 be 78 01 00 48 89 04 24
[ 699.729668] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.729673] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.729675] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.729678] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.729680] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.729683] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.729687] ---[ end trace 4ce02f25ff7d3df5 ]---
[ 699.729782] ------------[ cut here ]------------
[ 699.729785] kernel BUG at fs/f2fs/segment.h:654!
[ 699.731055] invalid opcode: 0000 [#1] SMP KASAN PTI
[ 699.732104] CPU: 0 PID: 1309 Comm: a.out Tainted: G W 4.18.0-rc1+ #4
[ 699.733684] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.735611] RIP: 0010:f2fs_submit_page_bio+0x29b/0x730
[ 699.736649] Code: 54 49 8d bd 18 04 00 00 e8 b2 59 af ff 41 8b 8d 18 04 00 00 8b 45 b8 41 d3 e6 44 01 f0 4c 8d 73 14 41 39 c7 0f 82 37 fe ff ff <0f> 0b 65 8b 05 2c 04 77 47 89 c0 48 0f a3 05 52 c1 d5 01 0f 92 c0
[ 699.740524] RSP: 0018:ffff8801f43af508 EFLAGS: 00010283
[ 699.741573] RAX: 0000000000000000 RBX: ffff8801f43af7b8 RCX: ffffffffb88a7cef
[ 699.743006] RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff8801e3e7a64c
[ 699.744426] RBP: ffff8801f43af558 R08: ffffed003e066b55 R09: ffffed003e066b55
[ 699.745833] R10: 0000000000000001 R11: ffffed003e066b54 R12: ffffea0007876940
[ 699.747256] R13: ffff8801f0335500 R14: ffff8801e3e7a600 R15: 0000000000000001
[ 699.748683] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.750293] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.751462] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.752874] Call Trace:
[ 699.753386] ? f2fs_inplace_write_data+0x93/0x240
[ 699.754341] f2fs_inplace_write_data+0xd2/0x240
[ 699.755271] f2fs_do_write_data_page+0x2e2/0xe00
[ 699.756214] ? f2fs_should_update_outplace+0xd0/0xd0
[ 699.757215] ? memcg_drain_all_list_lrus+0x280/0x280
[ 699.758209] ? __radix_tree_replace+0xa3/0x120
[ 699.759164] __write_data_page+0x5c7/0xe30
[ 699.760002] ? kasan_check_read+0x11/0x20
[ 699.760823] ? page_mapped+0x8a/0x110
[ 699.761573] ? page_mkclean+0xe9/0x160
[ 699.762345] ? f2fs_do_write_data_page+0xe00/0xe00
[ 699.763332] ? invalid_page_referenced_vma+0x130/0x130
[ 699.764374] ? clear_page_dirty_for_io+0x332/0x450
[ 699.765347] f2fs_write_cache_pages+0x4ca/0x860
[ 699.766276] ? __write_data_page+0xe30/0xe30
[ 699.767161] ? percpu_counter_add_batch+0x22/0xa0
[ 699.768112] ? kasan_check_write+0x14/0x20
[ 699.768951] ? _raw_spin_lock+0x17/0x40
[ 699.769739] ? f2fs_mark_inode_dirty_sync.part.18+0x16/0x30
[ 699.770885] ? iov_iter_advance+0x113/0x640
[ 699.771743] ? f2fs_write_end+0x133/0x2e0
[ 699.772569] ? balance_dirty_pages_ratelimited+0x239/0x640
[ 699.773680] f2fs_write_data_pages+0x329/0x520
[ 699.774603] ? generic_perform_write+0x250/0x320
[ 699.775544] ? f2fs_write_cache_pages+0x860/0x860
[ 699.776510] ? current_time+0x110/0x110
[ 699.777299] ? f2fs_preallocate_blocks+0x1ef/0x370
[ 699.778279] do_writepages+0x37/0xb0
[ 699.779026] ? f2fs_write_cache_pages+0x860/0x860
[ 699.779978] ? do_writepages+0x37/0xb0
[ 699.780755] __filemap_fdatawrite_range+0x19a/0x1f0
[ 699.781746] ? delete_from_page_cache_batch+0x4e0/0x4e0
[ 699.782820] ? __vfs_write+0x2b2/0x410
[ 699.783597] file_write_and_wait_range+0x66/0xb0
[ 699.784540] f2fs_do_sync_file+0x1f9/0xd90
[ 699.785381] ? truncate_partial_data_page+0x290/0x290
[ 699.786415] ? __sb_end_write+0x30/0x50
[ 699.787204] ? vfs_write+0x20f/0x260
[ 699.787941] f2fs_sync_file+0x9a/0xb0
[ 699.788694] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.789572] vfs_fsync_range+0x68/0x100
[ 699.790360] ? __fget_light+0xc9/0xe0
[ 699.791128] do_fsync+0x3d/0x70
[ 699.791779] __x64_sys_fdatasync+0x24/0x30
[ 699.792614] do_syscall_64+0x78/0x170
[ 699.793371] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 699.794406] RIP: 0033:0x7f9bf930d800
[ 699.795134] Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 83 3d 49 bf 2c 00 00 75 10 b8 4b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 be 78 01 00 48 89 04 24
[ 699.798960] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.800483] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.801923] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.803373] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.804798] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.806233] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.807667] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy
[ 699.817079] ---[ end trace 4ce02f25ff7d3df6 ]---
[ 699.818068] RIP: 0010:f2fs_submit_page_bio+0x29b/0x730
[ 699.819114] Code: 54 49 8d bd 18 04 00 00 e8 b2 59 af ff 41 8b 8d 18 04 00 00 8b 45 b8 41 d3 e6 44 01 f0 4c 8d 73 14 41 39 c7 0f 82 37 fe ff ff <0f> 0b 65 8b 05 2c 04 77 47 89 c0 48 0f a3 05 52 c1 d5 01 0f 92 c0
[ 699.822919] RSP: 0018:ffff8801f43af508 EFLAGS: 00010283
[ 699.823977] RAX: 0000000000000000 RBX: ffff8801f43af7b8 RCX: ffffffffb88a7cef
[ 699.825436] RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff8801e3e7a64c
[ 699.826881] RBP: ffff8801f43af558 R08: ffffed003e066b55 R09: ffffed003e066b55
[ 699.828292] R10: 0000000000000001 R11: ffffed003e066b54 R12: ffffea0007876940
[ 699.829750] R13: ffff8801f0335500 R14: ffff8801e3e7a600 R15: 0000000000000001
[ 699.831192] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.832793] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.833981] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.835556] ==================================================================
[ 699.837029] BUG: KASAN: stack-out-of-bounds in update_stack_state+0x38c/0x3e0
[ 699.838462] Read of size 8 at addr ffff8801f43af970 by task a.out/1309
[ 699.840086] CPU: 0 PID: 1309 Comm: a.out Tainted: G D W 4.18.0-rc1+ #4
[ 699.841603] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.843475] Call Trace:
[ 699.843982] dump_stack+0x7b/0xb5
[ 699.844661] print_address_description+0x70/0x290
[ 699.845607] kasan_report+0x291/0x390
[ 699.846351] ? update_stack_state+0x38c/0x3e0
[ 699.853831] __asan_load8+0x54/0x90
[ 699.854569] update_stack_state+0x38c/0x3e0
[ 699.855428] ? __read_once_size_nocheck.constprop.7+0x20/0x20
[ 699.856601] ? __save_stack_trace+0x5e/0x100
[ 699.857476] unwind_next_frame.part.5+0x18e/0x490
[ 699.858448] ? unwind_dump+0x290/0x290
[ 699.859217] ? clear_page_dirty_for_io+0x332/0x450
[ 699.860185] __unwind_start+0x106/0x190
[ 699.860974] __save_stack_trace+0x5e/0x100
[ 699.861808] ? __save_stack_trace+0x5e/0x100
[ 699.862691] ? unlink_anon_vmas+0xba/0x2c0
[ 699.863525] save_stack_trace+0x1f/0x30
[ 699.864312] save_stack+0x46/0xd0
[ 699.864993] ? __alloc_pages_slowpath+0x1420/0x1420
[ 699.865990] ? flush_tlb_mm_range+0x15e/0x220
[ 699.866889] ? kasan_check_write+0x14/0x20
[ 699.867724] ? __dec_node_state+0x92/0xb0
[ 699.868543] ? lock_page_memcg+0x85/0xf0
[ 699.869350] ? unlock_page_memcg+0x16/0x80
[ 699.870185] ? page_remove_rmap+0x198/0x520
[ 699.871048] ? mark_page_accessed+0x133/0x200
[ 699.871930] ? _cond_resched+0x1a/0x50
[ 699.872700] ? unmap_page_range+0xcd4/0xe50
[ 699.873551] ? rb_next+0x58/0x80
[ 699.874217] ? rb_next+0x58/0x80
[ 699.874895] __kasan_slab_free+0x13c/0x1a0
[ 699.875734] ? unlink_anon_vmas+0xba/0x2c0
[ 699.876563] kasan_slab_free+0xe/0x10
[ 699.877315] kmem_cache_free+0x89/0x1e0
[ 699.878095] unlink_anon_vmas+0xba/0x2c0
[ 699.878913] free_pgtables+0x101/0x1b0
[ 699.879677] exit_mmap+0x146/0x2a0
[ 699.880378] ? __ia32_sys_munmap+0x50/0x50
[ 699.881214] ? kasan_check_read+0x11/0x20
[ 699.882052] ? mm_update_next_owner+0x322/0x380
[ 699.882985] mmput+0x8b/0x1d0
[ 699.883602] do_exit+0x43a/0x1390
[ 699.884288] ? mm_update_next_owner+0x380/0x380
[ 699.885212] ? f2fs_sync_file+0x9a/0xb0
[ 699.885995] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.886877] ? vfs_fsync_range+0x68/0x100
[ 699.887694] ? __fget_light+0xc9/0xe0
[ 699.888442] ? do_fsync+0x3d/0x70
[ 699.889118] ? __x64_sys_fdatasync+0x24/0x30
[ 699.889996] rewind_stack_do_exit+0x17/0x20
[ 699.890860] RIP: 0033:0x7f9bf930d800
[ 699.891585] Code: Bad RIP value.
[ 699.892268] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.893781] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.895220] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.896643] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.898069] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.899505] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.901241] The buggy address belongs to the page:
[ 699.902215] page:ffffea0007d0ebc0 count:0 mapcount:0 mapping:0000000000000000 index:0x0
[ 699.903811] flags: 0x2ffff0000000000()
[ 699.904585] raw: 02ffff0000000000 0000000000000000 ffffffff07d00101 0000000000000000
[ 699.906125] raw: 0000000000000000 0000000000240000 00000000ffffffff 0000000000000000
[ 699.907673] page dumped because: kasan: bad access detected
[ 699.909108] Memory state around the buggy address:
[ 699.910077] ffff8801f43af800: 00 f1 f1 f1 f1 00 f4 f4 f4 f3 f3 f3 f3 00 00 00
[ 699.911528] ffff8801f43af880: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[ 699.912953] >ffff8801f43af900: 00 00 00 00 00 00 00 00 f1 01 f4 f4 f4 f2 f2 f2
[ 699.914392] ^
[ 699.915758] ffff8801f43af980: f2 00 f4 f4 00 00 00 00 f2 00 00 00 00 00 00 00
[ 699.917193] ffff8801f43afa00: 00 00 00 00 00 00 00 00 00 f3 f3 f3 00 00 00 00
[ 699.918634] ==================================================================
- Location
https://elixir.bootlin.com/linux/v4.18-rc1/source/fs/f2fs/segment.h#L644
Reported-by Wen Xu <wen.xu@gatech.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-08-01 19:13:44 +08:00
|
|
|
if (!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr,
|
f2fs: introduce DATA_GENERIC_ENHANCE
Previously, f2fs_is_valid_blkaddr(, blkaddr, DATA_GENERIC) will check
whether @blkaddr locates in main area or not.
That check is weak, since the block address in range of main area can
point to the address which is not valid in segment info table, and we
can not detect such condition, we may suffer worse corruption as system
continues running.
So this patch introduce DATA_GENERIC_ENHANCE to enhance the sanity check
which trigger SIT bitmap check rather than only range check.
This patch did below changes as wel:
- set SBI_NEED_FSCK in f2fs_is_valid_blkaddr().
- get rid of is_valid_data_blkaddr() to avoid panic if blkaddr is invalid.
- introduce verify_fio_blkaddr() to wrap fio {new,old}_blkaddr validation check.
- spread blkaddr check in:
* f2fs_get_node_info()
* __read_out_blkaddrs()
* f2fs_submit_page_read()
* ra_data_block()
* do_recover_data()
This patch can fix bug reported from bugzilla below:
https://bugzilla.kernel.org/show_bug.cgi?id=203215
https://bugzilla.kernel.org/show_bug.cgi?id=203223
https://bugzilla.kernel.org/show_bug.cgi?id=203231
https://bugzilla.kernel.org/show_bug.cgi?id=203235
https://bugzilla.kernel.org/show_bug.cgi?id=203241
= Update by Jaegeuk Kim =
DATA_GENERIC_ENHANCE enhanced to validate block addresses on read/write paths.
But, xfstest/generic/446 compalins some generated kernel messages saying invalid
bitmap was detected when reading a block. The reaons is, when we get the
block addresses from extent_cache, there is no lock to synchronize it from
truncating the blocks in parallel.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-04-15 15:26:32 +08:00
|
|
|
DATA_GENERIC_ENHANCE))
|
2019-06-20 11:36:14 +08:00
|
|
|
return -EFSCORRUPTED;
|
f2fs: fix to do sanity check with block address in main area
This patch add to do sanity check with below field:
- cp_pack_total_block_count
- blkaddr of data/node
- extent info
- Overview
BUG() in verify_block_addr() when writing to a corrupted f2fs image
- Reproduce (4.18 upstream kernel)
- POC (poc.c)
static void activity(char *mpoint) {
char *foo_bar_baz;
int err;
static int buf[8192];
memset(buf, 0, sizeof(buf));
err = asprintf(&foo_bar_baz, "%s/foo/bar/baz", mpoint);
int fd = open(foo_bar_baz, O_RDWR | O_TRUNC, 0777);
if (fd >= 0) {
write(fd, (char *)buf, sizeof(buf));
fdatasync(fd);
close(fd);
}
}
int main(int argc, char *argv[]) {
activity(argv[1]);
return 0;
}
- Kernel message
[ 689.349473] F2FS-fs (loop0): Mounted with checkpoint version = 3
[ 699.728662] WARNING: CPU: 0 PID: 1309 at fs/f2fs/segment.c:2860 f2fs_inplace_write_data+0x232/0x240
[ 699.728670] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy
[ 699.729056] CPU: 0 PID: 1309 Comm: a.out Not tainted 4.18.0-rc1+ #4
[ 699.729064] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.729074] RIP: 0010:f2fs_inplace_write_data+0x232/0x240
[ 699.729076] Code: ff e9 cf fe ff ff 49 8d 7d 10 e8 39 45 ad ff 4d 8b 7d 10 be 04 00 00 00 49 8d 7f 48 e8 07 49 ad ff 45 8b 7f 48 e9 fb fe ff ff <0f> 0b f0 41 80 4d 48 04 e9 65 fe ff ff 90 66 66 66 66 90 55 48 8d
[ 699.729130] RSP: 0018:ffff8801f43af568 EFLAGS: 00010202
[ 699.729139] RAX: 000000000000003f RBX: ffff8801f43af7b8 RCX: ffffffffb88c9113
[ 699.729142] RDX: 0000000000000003 RSI: dffffc0000000000 RDI: ffff8802024e5540
[ 699.729144] RBP: ffff8801f43af590 R08: 0000000000000009 R09: ffffffffffffffe8
[ 699.729147] R10: 0000000000000001 R11: ffffed0039b0596a R12: ffff8802024e5540
[ 699.729149] R13: ffff8801f0335500 R14: ffff8801e3e7a700 R15: ffff8801e1ee4450
[ 699.729154] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.729156] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.729159] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.729171] Call Trace:
[ 699.729192] f2fs_do_write_data_page+0x2e2/0xe00
[ 699.729203] ? f2fs_should_update_outplace+0xd0/0xd0
[ 699.729238] ? memcg_drain_all_list_lrus+0x280/0x280
[ 699.729269] ? __radix_tree_replace+0xa3/0x120
[ 699.729276] __write_data_page+0x5c7/0xe30
[ 699.729291] ? kasan_check_read+0x11/0x20
[ 699.729310] ? page_mapped+0x8a/0x110
[ 699.729321] ? page_mkclean+0xe9/0x160
[ 699.729327] ? f2fs_do_write_data_page+0xe00/0xe00
[ 699.729331] ? invalid_page_referenced_vma+0x130/0x130
[ 699.729345] ? clear_page_dirty_for_io+0x332/0x450
[ 699.729351] f2fs_write_cache_pages+0x4ca/0x860
[ 699.729358] ? __write_data_page+0xe30/0xe30
[ 699.729374] ? percpu_counter_add_batch+0x22/0xa0
[ 699.729380] ? kasan_check_write+0x14/0x20
[ 699.729391] ? _raw_spin_lock+0x17/0x40
[ 699.729403] ? f2fs_mark_inode_dirty_sync.part.18+0x16/0x30
[ 699.729413] ? iov_iter_advance+0x113/0x640
[ 699.729418] ? f2fs_write_end+0x133/0x2e0
[ 699.729423] ? balance_dirty_pages_ratelimited+0x239/0x640
[ 699.729428] f2fs_write_data_pages+0x329/0x520
[ 699.729433] ? generic_perform_write+0x250/0x320
[ 699.729438] ? f2fs_write_cache_pages+0x860/0x860
[ 699.729454] ? current_time+0x110/0x110
[ 699.729459] ? f2fs_preallocate_blocks+0x1ef/0x370
[ 699.729464] do_writepages+0x37/0xb0
[ 699.729468] ? f2fs_write_cache_pages+0x860/0x860
[ 699.729472] ? do_writepages+0x37/0xb0
[ 699.729478] __filemap_fdatawrite_range+0x19a/0x1f0
[ 699.729483] ? delete_from_page_cache_batch+0x4e0/0x4e0
[ 699.729496] ? __vfs_write+0x2b2/0x410
[ 699.729501] file_write_and_wait_range+0x66/0xb0
[ 699.729506] f2fs_do_sync_file+0x1f9/0xd90
[ 699.729511] ? truncate_partial_data_page+0x290/0x290
[ 699.729521] ? __sb_end_write+0x30/0x50
[ 699.729526] ? vfs_write+0x20f/0x260
[ 699.729530] f2fs_sync_file+0x9a/0xb0
[ 699.729534] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.729548] vfs_fsync_range+0x68/0x100
[ 699.729554] ? __fget_light+0xc9/0xe0
[ 699.729558] do_fsync+0x3d/0x70
[ 699.729562] __x64_sys_fdatasync+0x24/0x30
[ 699.729585] do_syscall_64+0x78/0x170
[ 699.729595] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 699.729613] RIP: 0033:0x7f9bf930d800
[ 699.729615] Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 83 3d 49 bf 2c 00 00 75 10 b8 4b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 be 78 01 00 48 89 04 24
[ 699.729668] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.729673] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.729675] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.729678] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.729680] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.729683] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.729687] ---[ end trace 4ce02f25ff7d3df5 ]---
[ 699.729782] ------------[ cut here ]------------
[ 699.729785] kernel BUG at fs/f2fs/segment.h:654!
[ 699.731055] invalid opcode: 0000 [#1] SMP KASAN PTI
[ 699.732104] CPU: 0 PID: 1309 Comm: a.out Tainted: G W 4.18.0-rc1+ #4
[ 699.733684] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.735611] RIP: 0010:f2fs_submit_page_bio+0x29b/0x730
[ 699.736649] Code: 54 49 8d bd 18 04 00 00 e8 b2 59 af ff 41 8b 8d 18 04 00 00 8b 45 b8 41 d3 e6 44 01 f0 4c 8d 73 14 41 39 c7 0f 82 37 fe ff ff <0f> 0b 65 8b 05 2c 04 77 47 89 c0 48 0f a3 05 52 c1 d5 01 0f 92 c0
[ 699.740524] RSP: 0018:ffff8801f43af508 EFLAGS: 00010283
[ 699.741573] RAX: 0000000000000000 RBX: ffff8801f43af7b8 RCX: ffffffffb88a7cef
[ 699.743006] RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff8801e3e7a64c
[ 699.744426] RBP: ffff8801f43af558 R08: ffffed003e066b55 R09: ffffed003e066b55
[ 699.745833] R10: 0000000000000001 R11: ffffed003e066b54 R12: ffffea0007876940
[ 699.747256] R13: ffff8801f0335500 R14: ffff8801e3e7a600 R15: 0000000000000001
[ 699.748683] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.750293] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.751462] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.752874] Call Trace:
[ 699.753386] ? f2fs_inplace_write_data+0x93/0x240
[ 699.754341] f2fs_inplace_write_data+0xd2/0x240
[ 699.755271] f2fs_do_write_data_page+0x2e2/0xe00
[ 699.756214] ? f2fs_should_update_outplace+0xd0/0xd0
[ 699.757215] ? memcg_drain_all_list_lrus+0x280/0x280
[ 699.758209] ? __radix_tree_replace+0xa3/0x120
[ 699.759164] __write_data_page+0x5c7/0xe30
[ 699.760002] ? kasan_check_read+0x11/0x20
[ 699.760823] ? page_mapped+0x8a/0x110
[ 699.761573] ? page_mkclean+0xe9/0x160
[ 699.762345] ? f2fs_do_write_data_page+0xe00/0xe00
[ 699.763332] ? invalid_page_referenced_vma+0x130/0x130
[ 699.764374] ? clear_page_dirty_for_io+0x332/0x450
[ 699.765347] f2fs_write_cache_pages+0x4ca/0x860
[ 699.766276] ? __write_data_page+0xe30/0xe30
[ 699.767161] ? percpu_counter_add_batch+0x22/0xa0
[ 699.768112] ? kasan_check_write+0x14/0x20
[ 699.768951] ? _raw_spin_lock+0x17/0x40
[ 699.769739] ? f2fs_mark_inode_dirty_sync.part.18+0x16/0x30
[ 699.770885] ? iov_iter_advance+0x113/0x640
[ 699.771743] ? f2fs_write_end+0x133/0x2e0
[ 699.772569] ? balance_dirty_pages_ratelimited+0x239/0x640
[ 699.773680] f2fs_write_data_pages+0x329/0x520
[ 699.774603] ? generic_perform_write+0x250/0x320
[ 699.775544] ? f2fs_write_cache_pages+0x860/0x860
[ 699.776510] ? current_time+0x110/0x110
[ 699.777299] ? f2fs_preallocate_blocks+0x1ef/0x370
[ 699.778279] do_writepages+0x37/0xb0
[ 699.779026] ? f2fs_write_cache_pages+0x860/0x860
[ 699.779978] ? do_writepages+0x37/0xb0
[ 699.780755] __filemap_fdatawrite_range+0x19a/0x1f0
[ 699.781746] ? delete_from_page_cache_batch+0x4e0/0x4e0
[ 699.782820] ? __vfs_write+0x2b2/0x410
[ 699.783597] file_write_and_wait_range+0x66/0xb0
[ 699.784540] f2fs_do_sync_file+0x1f9/0xd90
[ 699.785381] ? truncate_partial_data_page+0x290/0x290
[ 699.786415] ? __sb_end_write+0x30/0x50
[ 699.787204] ? vfs_write+0x20f/0x260
[ 699.787941] f2fs_sync_file+0x9a/0xb0
[ 699.788694] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.789572] vfs_fsync_range+0x68/0x100
[ 699.790360] ? __fget_light+0xc9/0xe0
[ 699.791128] do_fsync+0x3d/0x70
[ 699.791779] __x64_sys_fdatasync+0x24/0x30
[ 699.792614] do_syscall_64+0x78/0x170
[ 699.793371] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 699.794406] RIP: 0033:0x7f9bf930d800
[ 699.795134] Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 83 3d 49 bf 2c 00 00 75 10 b8 4b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 be 78 01 00 48 89 04 24
[ 699.798960] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.800483] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.801923] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.803373] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.804798] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.806233] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.807667] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy
[ 699.817079] ---[ end trace 4ce02f25ff7d3df6 ]---
[ 699.818068] RIP: 0010:f2fs_submit_page_bio+0x29b/0x730
[ 699.819114] Code: 54 49 8d bd 18 04 00 00 e8 b2 59 af ff 41 8b 8d 18 04 00 00 8b 45 b8 41 d3 e6 44 01 f0 4c 8d 73 14 41 39 c7 0f 82 37 fe ff ff <0f> 0b 65 8b 05 2c 04 77 47 89 c0 48 0f a3 05 52 c1 d5 01 0f 92 c0
[ 699.822919] RSP: 0018:ffff8801f43af508 EFLAGS: 00010283
[ 699.823977] RAX: 0000000000000000 RBX: ffff8801f43af7b8 RCX: ffffffffb88a7cef
[ 699.825436] RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff8801e3e7a64c
[ 699.826881] RBP: ffff8801f43af558 R08: ffffed003e066b55 R09: ffffed003e066b55
[ 699.828292] R10: 0000000000000001 R11: ffffed003e066b54 R12: ffffea0007876940
[ 699.829750] R13: ffff8801f0335500 R14: ffff8801e3e7a600 R15: 0000000000000001
[ 699.831192] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.832793] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.833981] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.835556] ==================================================================
[ 699.837029] BUG: KASAN: stack-out-of-bounds in update_stack_state+0x38c/0x3e0
[ 699.838462] Read of size 8 at addr ffff8801f43af970 by task a.out/1309
[ 699.840086] CPU: 0 PID: 1309 Comm: a.out Tainted: G D W 4.18.0-rc1+ #4
[ 699.841603] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.843475] Call Trace:
[ 699.843982] dump_stack+0x7b/0xb5
[ 699.844661] print_address_description+0x70/0x290
[ 699.845607] kasan_report+0x291/0x390
[ 699.846351] ? update_stack_state+0x38c/0x3e0
[ 699.853831] __asan_load8+0x54/0x90
[ 699.854569] update_stack_state+0x38c/0x3e0
[ 699.855428] ? __read_once_size_nocheck.constprop.7+0x20/0x20
[ 699.856601] ? __save_stack_trace+0x5e/0x100
[ 699.857476] unwind_next_frame.part.5+0x18e/0x490
[ 699.858448] ? unwind_dump+0x290/0x290
[ 699.859217] ? clear_page_dirty_for_io+0x332/0x450
[ 699.860185] __unwind_start+0x106/0x190
[ 699.860974] __save_stack_trace+0x5e/0x100
[ 699.861808] ? __save_stack_trace+0x5e/0x100
[ 699.862691] ? unlink_anon_vmas+0xba/0x2c0
[ 699.863525] save_stack_trace+0x1f/0x30
[ 699.864312] save_stack+0x46/0xd0
[ 699.864993] ? __alloc_pages_slowpath+0x1420/0x1420
[ 699.865990] ? flush_tlb_mm_range+0x15e/0x220
[ 699.866889] ? kasan_check_write+0x14/0x20
[ 699.867724] ? __dec_node_state+0x92/0xb0
[ 699.868543] ? lock_page_memcg+0x85/0xf0
[ 699.869350] ? unlock_page_memcg+0x16/0x80
[ 699.870185] ? page_remove_rmap+0x198/0x520
[ 699.871048] ? mark_page_accessed+0x133/0x200
[ 699.871930] ? _cond_resched+0x1a/0x50
[ 699.872700] ? unmap_page_range+0xcd4/0xe50
[ 699.873551] ? rb_next+0x58/0x80
[ 699.874217] ? rb_next+0x58/0x80
[ 699.874895] __kasan_slab_free+0x13c/0x1a0
[ 699.875734] ? unlink_anon_vmas+0xba/0x2c0
[ 699.876563] kasan_slab_free+0xe/0x10
[ 699.877315] kmem_cache_free+0x89/0x1e0
[ 699.878095] unlink_anon_vmas+0xba/0x2c0
[ 699.878913] free_pgtables+0x101/0x1b0
[ 699.879677] exit_mmap+0x146/0x2a0
[ 699.880378] ? __ia32_sys_munmap+0x50/0x50
[ 699.881214] ? kasan_check_read+0x11/0x20
[ 699.882052] ? mm_update_next_owner+0x322/0x380
[ 699.882985] mmput+0x8b/0x1d0
[ 699.883602] do_exit+0x43a/0x1390
[ 699.884288] ? mm_update_next_owner+0x380/0x380
[ 699.885212] ? f2fs_sync_file+0x9a/0xb0
[ 699.885995] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.886877] ? vfs_fsync_range+0x68/0x100
[ 699.887694] ? __fget_light+0xc9/0xe0
[ 699.888442] ? do_fsync+0x3d/0x70
[ 699.889118] ? __x64_sys_fdatasync+0x24/0x30
[ 699.889996] rewind_stack_do_exit+0x17/0x20
[ 699.890860] RIP: 0033:0x7f9bf930d800
[ 699.891585] Code: Bad RIP value.
[ 699.892268] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.893781] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.895220] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.896643] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.898069] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.899505] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.901241] The buggy address belongs to the page:
[ 699.902215] page:ffffea0007d0ebc0 count:0 mapcount:0 mapping:0000000000000000 index:0x0
[ 699.903811] flags: 0x2ffff0000000000()
[ 699.904585] raw: 02ffff0000000000 0000000000000000 ffffffff07d00101 0000000000000000
[ 699.906125] raw: 0000000000000000 0000000000240000 00000000ffffffff 0000000000000000
[ 699.907673] page dumped because: kasan: bad access detected
[ 699.909108] Memory state around the buggy address:
[ 699.910077] ffff8801f43af800: 00 f1 f1 f1 f1 00 f4 f4 f4 f3 f3 f3 f3 00 00 00
[ 699.911528] ffff8801f43af880: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[ 699.912953] >ffff8801f43af900: 00 00 00 00 00 00 00 00 f1 01 f4 f4 f4 f2 f2 f2
[ 699.914392] ^
[ 699.915758] ffff8801f43af980: f2 00 f4 f4 00 00 00 00 f2 00 00 00 00 00 00 00
[ 699.917193] ffff8801f43afa00: 00 00 00 00 00 00 00 00 00 f3 f3 f3 00 00 00 00
[ 699.918634] ==================================================================
- Location
https://elixir.bootlin.com/linux/v4.18-rc1/source/fs/f2fs/segment.h#L644
Reported-by Wen Xu <wen.xu@gatech.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-08-01 19:13:44 +08:00
|
|
|
|
|
|
|
ipu_force = true;
|
|
|
|
fio->need_lock = LOCK_DONE;
|
|
|
|
goto got_it;
|
2017-04-25 20:45:13 +08:00
|
|
|
}
|
2017-04-27 00:17:21 +08:00
|
|
|
|
2017-06-22 08:52:39 +08:00
|
|
|
/* Deadlock due to between page->lock and f2fs_lock_op */
|
|
|
|
if (fio->need_lock == LOCK_REQ && !f2fs_trylock_op(fio->sbi))
|
|
|
|
return -EAGAIN;
|
2017-04-27 00:17:21 +08:00
|
|
|
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
err = f2fs_get_dnode_of_data(&dn, page->index, LOOKUP_NODE);
|
2012-11-02 16:10:12 +08:00
|
|
|
if (err)
|
2017-04-27 00:17:21 +08:00
|
|
|
goto out;
|
2012-11-02 16:10:12 +08:00
|
|
|
|
f2fs: support revoking atomic written pages
f2fs support atomic write with following semantics:
1. open db file
2. ioctl start atomic write
3. (write db file) * n
4. ioctl commit atomic write
5. close db file
With this flow we can avoid file becoming corrupted when abnormal power
cut, because we hold data of transaction in referenced pages linked in
inmem_pages list of inode, but without setting them dirty, so these data
won't be persisted unless we commit them in step 4.
But we should still hold journal db file in memory by using volatile
write, because our semantics of 'atomic write support' is incomplete, in
step 4, we could fail to submit all dirty data of transaction, once
partial dirty data was committed in storage, then after a checkpoint &
abnormal power-cut, db file will be corrupted forever.
So this patch tries to improve atomic write flow by adding a revoking flow,
once inner error occurs in committing, this gives another chance to try to
revoke these partial submitted data of current transaction, it makes
committing operation more like aotmical one.
If we're not lucky, once revoking operation was failed, EAGAIN will be
reported to user for suggesting doing the recovery with held journal file,
or retrying current transaction again.
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-02-06 14:40:34 +08:00
|
|
|
fio->old_blkaddr = dn.data_blkaddr;
|
2012-11-02 16:10:12 +08:00
|
|
|
|
|
|
|
/* This page is already truncated */
|
f2fs: trace old block address for CoWed page
This patch enables to trace old block address of CoWed page for better
debugging.
f2fs_submit_page_mbio: dev = (1,0), ino = 1, page_index = 0x1d4f0, oldaddr = 0xfe8ab, newaddr = 0xfee90 rw = WRITE_SYNC, type = NODE
f2fs_submit_page_mbio: dev = (1,0), ino = 1, page_index = 0x1d4f8, oldaddr = 0xfe8b0, newaddr = 0xfee91 rw = WRITE_SYNC, type = NODE
f2fs_submit_page_mbio: dev = (1,0), ino = 1, page_index = 0x1d4fa, oldaddr = 0xfe8ae, newaddr = 0xfee92 rw = WRITE_SYNC, type = NODE
f2fs_submit_page_mbio: dev = (1,0), ino = 134824, page_index = 0x96, oldaddr = 0xf049b, newaddr = 0x2bbe rw = WRITE, type = DATA
f2fs_submit_page_mbio: dev = (1,0), ino = 134824, page_index = 0x97, oldaddr = 0xf049c, newaddr = 0x2bbf rw = WRITE, type = DATA
f2fs_submit_page_mbio: dev = (1,0), ino = 134824, page_index = 0x98, oldaddr = 0xf049d, newaddr = 0x2bc0 rw = WRITE, type = DATA
f2fs_submit_page_mbio: dev = (1,0), ino = 135260, page_index = 0x47, oldaddr = 0xffffffff, newaddr = 0xf2631 rw = WRITE, type = DATA
f2fs_submit_page_mbio: dev = (1,0), ino = 135260, page_index = 0x48, oldaddr = 0xffffffff, newaddr = 0xf2632 rw = WRITE, type = DATA
f2fs_submit_page_mbio: dev = (1,0), ino = 135260, page_index = 0x49, oldaddr = 0xffffffff, newaddr = 0xf2633 rw = WRITE, type = DATA
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-02-22 18:36:38 +08:00
|
|
|
if (fio->old_blkaddr == NULL_ADDR) {
|
2015-02-26 11:25:01 +08:00
|
|
|
ClearPageUptodate(page);
|
2018-07-27 18:15:16 +08:00
|
|
|
clear_cold_data(page);
|
2012-11-02 16:10:12 +08:00
|
|
|
goto out_writepage;
|
2015-02-26 11:25:01 +08:00
|
|
|
}
|
2017-04-25 20:45:13 +08:00
|
|
|
got_it:
|
f2fs: fix to do sanity check with block address in main area
This patch add to do sanity check with below field:
- cp_pack_total_block_count
- blkaddr of data/node
- extent info
- Overview
BUG() in verify_block_addr() when writing to a corrupted f2fs image
- Reproduce (4.18 upstream kernel)
- POC (poc.c)
static void activity(char *mpoint) {
char *foo_bar_baz;
int err;
static int buf[8192];
memset(buf, 0, sizeof(buf));
err = asprintf(&foo_bar_baz, "%s/foo/bar/baz", mpoint);
int fd = open(foo_bar_baz, O_RDWR | O_TRUNC, 0777);
if (fd >= 0) {
write(fd, (char *)buf, sizeof(buf));
fdatasync(fd);
close(fd);
}
}
int main(int argc, char *argv[]) {
activity(argv[1]);
return 0;
}
- Kernel message
[ 689.349473] F2FS-fs (loop0): Mounted with checkpoint version = 3
[ 699.728662] WARNING: CPU: 0 PID: 1309 at fs/f2fs/segment.c:2860 f2fs_inplace_write_data+0x232/0x240
[ 699.728670] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy
[ 699.729056] CPU: 0 PID: 1309 Comm: a.out Not tainted 4.18.0-rc1+ #4
[ 699.729064] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.729074] RIP: 0010:f2fs_inplace_write_data+0x232/0x240
[ 699.729076] Code: ff e9 cf fe ff ff 49 8d 7d 10 e8 39 45 ad ff 4d 8b 7d 10 be 04 00 00 00 49 8d 7f 48 e8 07 49 ad ff 45 8b 7f 48 e9 fb fe ff ff <0f> 0b f0 41 80 4d 48 04 e9 65 fe ff ff 90 66 66 66 66 90 55 48 8d
[ 699.729130] RSP: 0018:ffff8801f43af568 EFLAGS: 00010202
[ 699.729139] RAX: 000000000000003f RBX: ffff8801f43af7b8 RCX: ffffffffb88c9113
[ 699.729142] RDX: 0000000000000003 RSI: dffffc0000000000 RDI: ffff8802024e5540
[ 699.729144] RBP: ffff8801f43af590 R08: 0000000000000009 R09: ffffffffffffffe8
[ 699.729147] R10: 0000000000000001 R11: ffffed0039b0596a R12: ffff8802024e5540
[ 699.729149] R13: ffff8801f0335500 R14: ffff8801e3e7a700 R15: ffff8801e1ee4450
[ 699.729154] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.729156] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.729159] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.729171] Call Trace:
[ 699.729192] f2fs_do_write_data_page+0x2e2/0xe00
[ 699.729203] ? f2fs_should_update_outplace+0xd0/0xd0
[ 699.729238] ? memcg_drain_all_list_lrus+0x280/0x280
[ 699.729269] ? __radix_tree_replace+0xa3/0x120
[ 699.729276] __write_data_page+0x5c7/0xe30
[ 699.729291] ? kasan_check_read+0x11/0x20
[ 699.729310] ? page_mapped+0x8a/0x110
[ 699.729321] ? page_mkclean+0xe9/0x160
[ 699.729327] ? f2fs_do_write_data_page+0xe00/0xe00
[ 699.729331] ? invalid_page_referenced_vma+0x130/0x130
[ 699.729345] ? clear_page_dirty_for_io+0x332/0x450
[ 699.729351] f2fs_write_cache_pages+0x4ca/0x860
[ 699.729358] ? __write_data_page+0xe30/0xe30
[ 699.729374] ? percpu_counter_add_batch+0x22/0xa0
[ 699.729380] ? kasan_check_write+0x14/0x20
[ 699.729391] ? _raw_spin_lock+0x17/0x40
[ 699.729403] ? f2fs_mark_inode_dirty_sync.part.18+0x16/0x30
[ 699.729413] ? iov_iter_advance+0x113/0x640
[ 699.729418] ? f2fs_write_end+0x133/0x2e0
[ 699.729423] ? balance_dirty_pages_ratelimited+0x239/0x640
[ 699.729428] f2fs_write_data_pages+0x329/0x520
[ 699.729433] ? generic_perform_write+0x250/0x320
[ 699.729438] ? f2fs_write_cache_pages+0x860/0x860
[ 699.729454] ? current_time+0x110/0x110
[ 699.729459] ? f2fs_preallocate_blocks+0x1ef/0x370
[ 699.729464] do_writepages+0x37/0xb0
[ 699.729468] ? f2fs_write_cache_pages+0x860/0x860
[ 699.729472] ? do_writepages+0x37/0xb0
[ 699.729478] __filemap_fdatawrite_range+0x19a/0x1f0
[ 699.729483] ? delete_from_page_cache_batch+0x4e0/0x4e0
[ 699.729496] ? __vfs_write+0x2b2/0x410
[ 699.729501] file_write_and_wait_range+0x66/0xb0
[ 699.729506] f2fs_do_sync_file+0x1f9/0xd90
[ 699.729511] ? truncate_partial_data_page+0x290/0x290
[ 699.729521] ? __sb_end_write+0x30/0x50
[ 699.729526] ? vfs_write+0x20f/0x260
[ 699.729530] f2fs_sync_file+0x9a/0xb0
[ 699.729534] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.729548] vfs_fsync_range+0x68/0x100
[ 699.729554] ? __fget_light+0xc9/0xe0
[ 699.729558] do_fsync+0x3d/0x70
[ 699.729562] __x64_sys_fdatasync+0x24/0x30
[ 699.729585] do_syscall_64+0x78/0x170
[ 699.729595] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 699.729613] RIP: 0033:0x7f9bf930d800
[ 699.729615] Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 83 3d 49 bf 2c 00 00 75 10 b8 4b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 be 78 01 00 48 89 04 24
[ 699.729668] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.729673] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.729675] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.729678] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.729680] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.729683] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.729687] ---[ end trace 4ce02f25ff7d3df5 ]---
[ 699.729782] ------------[ cut here ]------------
[ 699.729785] kernel BUG at fs/f2fs/segment.h:654!
[ 699.731055] invalid opcode: 0000 [#1] SMP KASAN PTI
[ 699.732104] CPU: 0 PID: 1309 Comm: a.out Tainted: G W 4.18.0-rc1+ #4
[ 699.733684] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.735611] RIP: 0010:f2fs_submit_page_bio+0x29b/0x730
[ 699.736649] Code: 54 49 8d bd 18 04 00 00 e8 b2 59 af ff 41 8b 8d 18 04 00 00 8b 45 b8 41 d3 e6 44 01 f0 4c 8d 73 14 41 39 c7 0f 82 37 fe ff ff <0f> 0b 65 8b 05 2c 04 77 47 89 c0 48 0f a3 05 52 c1 d5 01 0f 92 c0
[ 699.740524] RSP: 0018:ffff8801f43af508 EFLAGS: 00010283
[ 699.741573] RAX: 0000000000000000 RBX: ffff8801f43af7b8 RCX: ffffffffb88a7cef
[ 699.743006] RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff8801e3e7a64c
[ 699.744426] RBP: ffff8801f43af558 R08: ffffed003e066b55 R09: ffffed003e066b55
[ 699.745833] R10: 0000000000000001 R11: ffffed003e066b54 R12: ffffea0007876940
[ 699.747256] R13: ffff8801f0335500 R14: ffff8801e3e7a600 R15: 0000000000000001
[ 699.748683] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.750293] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.751462] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.752874] Call Trace:
[ 699.753386] ? f2fs_inplace_write_data+0x93/0x240
[ 699.754341] f2fs_inplace_write_data+0xd2/0x240
[ 699.755271] f2fs_do_write_data_page+0x2e2/0xe00
[ 699.756214] ? f2fs_should_update_outplace+0xd0/0xd0
[ 699.757215] ? memcg_drain_all_list_lrus+0x280/0x280
[ 699.758209] ? __radix_tree_replace+0xa3/0x120
[ 699.759164] __write_data_page+0x5c7/0xe30
[ 699.760002] ? kasan_check_read+0x11/0x20
[ 699.760823] ? page_mapped+0x8a/0x110
[ 699.761573] ? page_mkclean+0xe9/0x160
[ 699.762345] ? f2fs_do_write_data_page+0xe00/0xe00
[ 699.763332] ? invalid_page_referenced_vma+0x130/0x130
[ 699.764374] ? clear_page_dirty_for_io+0x332/0x450
[ 699.765347] f2fs_write_cache_pages+0x4ca/0x860
[ 699.766276] ? __write_data_page+0xe30/0xe30
[ 699.767161] ? percpu_counter_add_batch+0x22/0xa0
[ 699.768112] ? kasan_check_write+0x14/0x20
[ 699.768951] ? _raw_spin_lock+0x17/0x40
[ 699.769739] ? f2fs_mark_inode_dirty_sync.part.18+0x16/0x30
[ 699.770885] ? iov_iter_advance+0x113/0x640
[ 699.771743] ? f2fs_write_end+0x133/0x2e0
[ 699.772569] ? balance_dirty_pages_ratelimited+0x239/0x640
[ 699.773680] f2fs_write_data_pages+0x329/0x520
[ 699.774603] ? generic_perform_write+0x250/0x320
[ 699.775544] ? f2fs_write_cache_pages+0x860/0x860
[ 699.776510] ? current_time+0x110/0x110
[ 699.777299] ? f2fs_preallocate_blocks+0x1ef/0x370
[ 699.778279] do_writepages+0x37/0xb0
[ 699.779026] ? f2fs_write_cache_pages+0x860/0x860
[ 699.779978] ? do_writepages+0x37/0xb0
[ 699.780755] __filemap_fdatawrite_range+0x19a/0x1f0
[ 699.781746] ? delete_from_page_cache_batch+0x4e0/0x4e0
[ 699.782820] ? __vfs_write+0x2b2/0x410
[ 699.783597] file_write_and_wait_range+0x66/0xb0
[ 699.784540] f2fs_do_sync_file+0x1f9/0xd90
[ 699.785381] ? truncate_partial_data_page+0x290/0x290
[ 699.786415] ? __sb_end_write+0x30/0x50
[ 699.787204] ? vfs_write+0x20f/0x260
[ 699.787941] f2fs_sync_file+0x9a/0xb0
[ 699.788694] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.789572] vfs_fsync_range+0x68/0x100
[ 699.790360] ? __fget_light+0xc9/0xe0
[ 699.791128] do_fsync+0x3d/0x70
[ 699.791779] __x64_sys_fdatasync+0x24/0x30
[ 699.792614] do_syscall_64+0x78/0x170
[ 699.793371] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 699.794406] RIP: 0033:0x7f9bf930d800
[ 699.795134] Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 83 3d 49 bf 2c 00 00 75 10 b8 4b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 be 78 01 00 48 89 04 24
[ 699.798960] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.800483] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.801923] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.803373] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.804798] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.806233] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.807667] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy
[ 699.817079] ---[ end trace 4ce02f25ff7d3df6 ]---
[ 699.818068] RIP: 0010:f2fs_submit_page_bio+0x29b/0x730
[ 699.819114] Code: 54 49 8d bd 18 04 00 00 e8 b2 59 af ff 41 8b 8d 18 04 00 00 8b 45 b8 41 d3 e6 44 01 f0 4c 8d 73 14 41 39 c7 0f 82 37 fe ff ff <0f> 0b 65 8b 05 2c 04 77 47 89 c0 48 0f a3 05 52 c1 d5 01 0f 92 c0
[ 699.822919] RSP: 0018:ffff8801f43af508 EFLAGS: 00010283
[ 699.823977] RAX: 0000000000000000 RBX: ffff8801f43af7b8 RCX: ffffffffb88a7cef
[ 699.825436] RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff8801e3e7a64c
[ 699.826881] RBP: ffff8801f43af558 R08: ffffed003e066b55 R09: ffffed003e066b55
[ 699.828292] R10: 0000000000000001 R11: ffffed003e066b54 R12: ffffea0007876940
[ 699.829750] R13: ffff8801f0335500 R14: ffff8801e3e7a600 R15: 0000000000000001
[ 699.831192] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.832793] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.833981] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.835556] ==================================================================
[ 699.837029] BUG: KASAN: stack-out-of-bounds in update_stack_state+0x38c/0x3e0
[ 699.838462] Read of size 8 at addr ffff8801f43af970 by task a.out/1309
[ 699.840086] CPU: 0 PID: 1309 Comm: a.out Tainted: G D W 4.18.0-rc1+ #4
[ 699.841603] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.843475] Call Trace:
[ 699.843982] dump_stack+0x7b/0xb5
[ 699.844661] print_address_description+0x70/0x290
[ 699.845607] kasan_report+0x291/0x390
[ 699.846351] ? update_stack_state+0x38c/0x3e0
[ 699.853831] __asan_load8+0x54/0x90
[ 699.854569] update_stack_state+0x38c/0x3e0
[ 699.855428] ? __read_once_size_nocheck.constprop.7+0x20/0x20
[ 699.856601] ? __save_stack_trace+0x5e/0x100
[ 699.857476] unwind_next_frame.part.5+0x18e/0x490
[ 699.858448] ? unwind_dump+0x290/0x290
[ 699.859217] ? clear_page_dirty_for_io+0x332/0x450
[ 699.860185] __unwind_start+0x106/0x190
[ 699.860974] __save_stack_trace+0x5e/0x100
[ 699.861808] ? __save_stack_trace+0x5e/0x100
[ 699.862691] ? unlink_anon_vmas+0xba/0x2c0
[ 699.863525] save_stack_trace+0x1f/0x30
[ 699.864312] save_stack+0x46/0xd0
[ 699.864993] ? __alloc_pages_slowpath+0x1420/0x1420
[ 699.865990] ? flush_tlb_mm_range+0x15e/0x220
[ 699.866889] ? kasan_check_write+0x14/0x20
[ 699.867724] ? __dec_node_state+0x92/0xb0
[ 699.868543] ? lock_page_memcg+0x85/0xf0
[ 699.869350] ? unlock_page_memcg+0x16/0x80
[ 699.870185] ? page_remove_rmap+0x198/0x520
[ 699.871048] ? mark_page_accessed+0x133/0x200
[ 699.871930] ? _cond_resched+0x1a/0x50
[ 699.872700] ? unmap_page_range+0xcd4/0xe50
[ 699.873551] ? rb_next+0x58/0x80
[ 699.874217] ? rb_next+0x58/0x80
[ 699.874895] __kasan_slab_free+0x13c/0x1a0
[ 699.875734] ? unlink_anon_vmas+0xba/0x2c0
[ 699.876563] kasan_slab_free+0xe/0x10
[ 699.877315] kmem_cache_free+0x89/0x1e0
[ 699.878095] unlink_anon_vmas+0xba/0x2c0
[ 699.878913] free_pgtables+0x101/0x1b0
[ 699.879677] exit_mmap+0x146/0x2a0
[ 699.880378] ? __ia32_sys_munmap+0x50/0x50
[ 699.881214] ? kasan_check_read+0x11/0x20
[ 699.882052] ? mm_update_next_owner+0x322/0x380
[ 699.882985] mmput+0x8b/0x1d0
[ 699.883602] do_exit+0x43a/0x1390
[ 699.884288] ? mm_update_next_owner+0x380/0x380
[ 699.885212] ? f2fs_sync_file+0x9a/0xb0
[ 699.885995] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.886877] ? vfs_fsync_range+0x68/0x100
[ 699.887694] ? __fget_light+0xc9/0xe0
[ 699.888442] ? do_fsync+0x3d/0x70
[ 699.889118] ? __x64_sys_fdatasync+0x24/0x30
[ 699.889996] rewind_stack_do_exit+0x17/0x20
[ 699.890860] RIP: 0033:0x7f9bf930d800
[ 699.891585] Code: Bad RIP value.
[ 699.892268] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.893781] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.895220] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.896643] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.898069] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.899505] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.901241] The buggy address belongs to the page:
[ 699.902215] page:ffffea0007d0ebc0 count:0 mapcount:0 mapping:0000000000000000 index:0x0
[ 699.903811] flags: 0x2ffff0000000000()
[ 699.904585] raw: 02ffff0000000000 0000000000000000 ffffffff07d00101 0000000000000000
[ 699.906125] raw: 0000000000000000 0000000000240000 00000000ffffffff 0000000000000000
[ 699.907673] page dumped because: kasan: bad access detected
[ 699.909108] Memory state around the buggy address:
[ 699.910077] ffff8801f43af800: 00 f1 f1 f1 f1 00 f4 f4 f4 f3 f3 f3 f3 00 00 00
[ 699.911528] ffff8801f43af880: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[ 699.912953] >ffff8801f43af900: 00 00 00 00 00 00 00 00 f1 01 f4 f4 f4 f2 f2 f2
[ 699.914392] ^
[ 699.915758] ffff8801f43af980: f2 00 f4 f4 00 00 00 00 f2 00 00 00 00 00 00 00
[ 699.917193] ffff8801f43afa00: 00 00 00 00 00 00 00 00 00 f3 f3 f3 00 00 00 00
[ 699.918634] ==================================================================
- Location
https://elixir.bootlin.com/linux/v4.18-rc1/source/fs/f2fs/segment.h#L644
Reported-by Wen Xu <wen.xu@gatech.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-08-01 19:13:44 +08:00
|
|
|
if (__is_valid_data_blkaddr(fio->old_blkaddr) &&
|
|
|
|
!f2fs_is_valid_blkaddr(fio->sbi, fio->old_blkaddr,
|
f2fs: introduce DATA_GENERIC_ENHANCE
Previously, f2fs_is_valid_blkaddr(, blkaddr, DATA_GENERIC) will check
whether @blkaddr locates in main area or not.
That check is weak, since the block address in range of main area can
point to the address which is not valid in segment info table, and we
can not detect such condition, we may suffer worse corruption as system
continues running.
So this patch introduce DATA_GENERIC_ENHANCE to enhance the sanity check
which trigger SIT bitmap check rather than only range check.
This patch did below changes as wel:
- set SBI_NEED_FSCK in f2fs_is_valid_blkaddr().
- get rid of is_valid_data_blkaddr() to avoid panic if blkaddr is invalid.
- introduce verify_fio_blkaddr() to wrap fio {new,old}_blkaddr validation check.
- spread blkaddr check in:
* f2fs_get_node_info()
* __read_out_blkaddrs()
* f2fs_submit_page_read()
* ra_data_block()
* do_recover_data()
This patch can fix bug reported from bugzilla below:
https://bugzilla.kernel.org/show_bug.cgi?id=203215
https://bugzilla.kernel.org/show_bug.cgi?id=203223
https://bugzilla.kernel.org/show_bug.cgi?id=203231
https://bugzilla.kernel.org/show_bug.cgi?id=203235
https://bugzilla.kernel.org/show_bug.cgi?id=203241
= Update by Jaegeuk Kim =
DATA_GENERIC_ENHANCE enhanced to validate block addresses on read/write paths.
But, xfstest/generic/446 compalins some generated kernel messages saying invalid
bitmap was detected when reading a block. The reaons is, when we get the
block addresses from extent_cache, there is no lock to synchronize it from
truncating the blocks in parallel.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-04-15 15:26:32 +08:00
|
|
|
DATA_GENERIC_ENHANCE)) {
|
2019-06-20 11:36:14 +08:00
|
|
|
err = -EFSCORRUPTED;
|
f2fs: fix to do sanity check with block address in main area
This patch add to do sanity check with below field:
- cp_pack_total_block_count
- blkaddr of data/node
- extent info
- Overview
BUG() in verify_block_addr() when writing to a corrupted f2fs image
- Reproduce (4.18 upstream kernel)
- POC (poc.c)
static void activity(char *mpoint) {
char *foo_bar_baz;
int err;
static int buf[8192];
memset(buf, 0, sizeof(buf));
err = asprintf(&foo_bar_baz, "%s/foo/bar/baz", mpoint);
int fd = open(foo_bar_baz, O_RDWR | O_TRUNC, 0777);
if (fd >= 0) {
write(fd, (char *)buf, sizeof(buf));
fdatasync(fd);
close(fd);
}
}
int main(int argc, char *argv[]) {
activity(argv[1]);
return 0;
}
- Kernel message
[ 689.349473] F2FS-fs (loop0): Mounted with checkpoint version = 3
[ 699.728662] WARNING: CPU: 0 PID: 1309 at fs/f2fs/segment.c:2860 f2fs_inplace_write_data+0x232/0x240
[ 699.728670] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy
[ 699.729056] CPU: 0 PID: 1309 Comm: a.out Not tainted 4.18.0-rc1+ #4
[ 699.729064] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.729074] RIP: 0010:f2fs_inplace_write_data+0x232/0x240
[ 699.729076] Code: ff e9 cf fe ff ff 49 8d 7d 10 e8 39 45 ad ff 4d 8b 7d 10 be 04 00 00 00 49 8d 7f 48 e8 07 49 ad ff 45 8b 7f 48 e9 fb fe ff ff <0f> 0b f0 41 80 4d 48 04 e9 65 fe ff ff 90 66 66 66 66 90 55 48 8d
[ 699.729130] RSP: 0018:ffff8801f43af568 EFLAGS: 00010202
[ 699.729139] RAX: 000000000000003f RBX: ffff8801f43af7b8 RCX: ffffffffb88c9113
[ 699.729142] RDX: 0000000000000003 RSI: dffffc0000000000 RDI: ffff8802024e5540
[ 699.729144] RBP: ffff8801f43af590 R08: 0000000000000009 R09: ffffffffffffffe8
[ 699.729147] R10: 0000000000000001 R11: ffffed0039b0596a R12: ffff8802024e5540
[ 699.729149] R13: ffff8801f0335500 R14: ffff8801e3e7a700 R15: ffff8801e1ee4450
[ 699.729154] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.729156] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.729159] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.729171] Call Trace:
[ 699.729192] f2fs_do_write_data_page+0x2e2/0xe00
[ 699.729203] ? f2fs_should_update_outplace+0xd0/0xd0
[ 699.729238] ? memcg_drain_all_list_lrus+0x280/0x280
[ 699.729269] ? __radix_tree_replace+0xa3/0x120
[ 699.729276] __write_data_page+0x5c7/0xe30
[ 699.729291] ? kasan_check_read+0x11/0x20
[ 699.729310] ? page_mapped+0x8a/0x110
[ 699.729321] ? page_mkclean+0xe9/0x160
[ 699.729327] ? f2fs_do_write_data_page+0xe00/0xe00
[ 699.729331] ? invalid_page_referenced_vma+0x130/0x130
[ 699.729345] ? clear_page_dirty_for_io+0x332/0x450
[ 699.729351] f2fs_write_cache_pages+0x4ca/0x860
[ 699.729358] ? __write_data_page+0xe30/0xe30
[ 699.729374] ? percpu_counter_add_batch+0x22/0xa0
[ 699.729380] ? kasan_check_write+0x14/0x20
[ 699.729391] ? _raw_spin_lock+0x17/0x40
[ 699.729403] ? f2fs_mark_inode_dirty_sync.part.18+0x16/0x30
[ 699.729413] ? iov_iter_advance+0x113/0x640
[ 699.729418] ? f2fs_write_end+0x133/0x2e0
[ 699.729423] ? balance_dirty_pages_ratelimited+0x239/0x640
[ 699.729428] f2fs_write_data_pages+0x329/0x520
[ 699.729433] ? generic_perform_write+0x250/0x320
[ 699.729438] ? f2fs_write_cache_pages+0x860/0x860
[ 699.729454] ? current_time+0x110/0x110
[ 699.729459] ? f2fs_preallocate_blocks+0x1ef/0x370
[ 699.729464] do_writepages+0x37/0xb0
[ 699.729468] ? f2fs_write_cache_pages+0x860/0x860
[ 699.729472] ? do_writepages+0x37/0xb0
[ 699.729478] __filemap_fdatawrite_range+0x19a/0x1f0
[ 699.729483] ? delete_from_page_cache_batch+0x4e0/0x4e0
[ 699.729496] ? __vfs_write+0x2b2/0x410
[ 699.729501] file_write_and_wait_range+0x66/0xb0
[ 699.729506] f2fs_do_sync_file+0x1f9/0xd90
[ 699.729511] ? truncate_partial_data_page+0x290/0x290
[ 699.729521] ? __sb_end_write+0x30/0x50
[ 699.729526] ? vfs_write+0x20f/0x260
[ 699.729530] f2fs_sync_file+0x9a/0xb0
[ 699.729534] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.729548] vfs_fsync_range+0x68/0x100
[ 699.729554] ? __fget_light+0xc9/0xe0
[ 699.729558] do_fsync+0x3d/0x70
[ 699.729562] __x64_sys_fdatasync+0x24/0x30
[ 699.729585] do_syscall_64+0x78/0x170
[ 699.729595] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 699.729613] RIP: 0033:0x7f9bf930d800
[ 699.729615] Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 83 3d 49 bf 2c 00 00 75 10 b8 4b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 be 78 01 00 48 89 04 24
[ 699.729668] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.729673] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.729675] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.729678] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.729680] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.729683] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.729687] ---[ end trace 4ce02f25ff7d3df5 ]---
[ 699.729782] ------------[ cut here ]------------
[ 699.729785] kernel BUG at fs/f2fs/segment.h:654!
[ 699.731055] invalid opcode: 0000 [#1] SMP KASAN PTI
[ 699.732104] CPU: 0 PID: 1309 Comm: a.out Tainted: G W 4.18.0-rc1+ #4
[ 699.733684] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.735611] RIP: 0010:f2fs_submit_page_bio+0x29b/0x730
[ 699.736649] Code: 54 49 8d bd 18 04 00 00 e8 b2 59 af ff 41 8b 8d 18 04 00 00 8b 45 b8 41 d3 e6 44 01 f0 4c 8d 73 14 41 39 c7 0f 82 37 fe ff ff <0f> 0b 65 8b 05 2c 04 77 47 89 c0 48 0f a3 05 52 c1 d5 01 0f 92 c0
[ 699.740524] RSP: 0018:ffff8801f43af508 EFLAGS: 00010283
[ 699.741573] RAX: 0000000000000000 RBX: ffff8801f43af7b8 RCX: ffffffffb88a7cef
[ 699.743006] RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff8801e3e7a64c
[ 699.744426] RBP: ffff8801f43af558 R08: ffffed003e066b55 R09: ffffed003e066b55
[ 699.745833] R10: 0000000000000001 R11: ffffed003e066b54 R12: ffffea0007876940
[ 699.747256] R13: ffff8801f0335500 R14: ffff8801e3e7a600 R15: 0000000000000001
[ 699.748683] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.750293] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.751462] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.752874] Call Trace:
[ 699.753386] ? f2fs_inplace_write_data+0x93/0x240
[ 699.754341] f2fs_inplace_write_data+0xd2/0x240
[ 699.755271] f2fs_do_write_data_page+0x2e2/0xe00
[ 699.756214] ? f2fs_should_update_outplace+0xd0/0xd0
[ 699.757215] ? memcg_drain_all_list_lrus+0x280/0x280
[ 699.758209] ? __radix_tree_replace+0xa3/0x120
[ 699.759164] __write_data_page+0x5c7/0xe30
[ 699.760002] ? kasan_check_read+0x11/0x20
[ 699.760823] ? page_mapped+0x8a/0x110
[ 699.761573] ? page_mkclean+0xe9/0x160
[ 699.762345] ? f2fs_do_write_data_page+0xe00/0xe00
[ 699.763332] ? invalid_page_referenced_vma+0x130/0x130
[ 699.764374] ? clear_page_dirty_for_io+0x332/0x450
[ 699.765347] f2fs_write_cache_pages+0x4ca/0x860
[ 699.766276] ? __write_data_page+0xe30/0xe30
[ 699.767161] ? percpu_counter_add_batch+0x22/0xa0
[ 699.768112] ? kasan_check_write+0x14/0x20
[ 699.768951] ? _raw_spin_lock+0x17/0x40
[ 699.769739] ? f2fs_mark_inode_dirty_sync.part.18+0x16/0x30
[ 699.770885] ? iov_iter_advance+0x113/0x640
[ 699.771743] ? f2fs_write_end+0x133/0x2e0
[ 699.772569] ? balance_dirty_pages_ratelimited+0x239/0x640
[ 699.773680] f2fs_write_data_pages+0x329/0x520
[ 699.774603] ? generic_perform_write+0x250/0x320
[ 699.775544] ? f2fs_write_cache_pages+0x860/0x860
[ 699.776510] ? current_time+0x110/0x110
[ 699.777299] ? f2fs_preallocate_blocks+0x1ef/0x370
[ 699.778279] do_writepages+0x37/0xb0
[ 699.779026] ? f2fs_write_cache_pages+0x860/0x860
[ 699.779978] ? do_writepages+0x37/0xb0
[ 699.780755] __filemap_fdatawrite_range+0x19a/0x1f0
[ 699.781746] ? delete_from_page_cache_batch+0x4e0/0x4e0
[ 699.782820] ? __vfs_write+0x2b2/0x410
[ 699.783597] file_write_and_wait_range+0x66/0xb0
[ 699.784540] f2fs_do_sync_file+0x1f9/0xd90
[ 699.785381] ? truncate_partial_data_page+0x290/0x290
[ 699.786415] ? __sb_end_write+0x30/0x50
[ 699.787204] ? vfs_write+0x20f/0x260
[ 699.787941] f2fs_sync_file+0x9a/0xb0
[ 699.788694] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.789572] vfs_fsync_range+0x68/0x100
[ 699.790360] ? __fget_light+0xc9/0xe0
[ 699.791128] do_fsync+0x3d/0x70
[ 699.791779] __x64_sys_fdatasync+0x24/0x30
[ 699.792614] do_syscall_64+0x78/0x170
[ 699.793371] entry_SYSCALL_64_after_hwframe+0x44/0xa9
[ 699.794406] RIP: 0033:0x7f9bf930d800
[ 699.795134] Code: 00 f7 d8 64 89 01 48 83 c8 ff c3 66 2e 0f 1f 84 00 00 00 00 00 0f 1f 44 00 00 83 3d 49 bf 2c 00 00 75 10 b8 4b 00 00 00 0f 05 <48> 3d 01 f0 ff ff 73 31 c3 48 83 ec 08 e8 be 78 01 00 48 89 04 24
[ 699.798960] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.800483] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.801923] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.803373] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.804798] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.806233] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.807667] Modules linked in: snd_hda_codec_generic snd_hda_intel snd_hda_codec snd_hwdep snd_hda_core snd_pcm snd_timer snd mac_hid i2c_piix4 soundcore ib_iser rdma_cm iw_cm ib_cm ib_core iscsi_tcp libiscsi_tcp libiscsi scsi_transport_iscsi raid10 raid456 async_raid6_recov async_memcpy async_pq async_xor async_tx raid1 raid0 multipath linear 8139too crct10dif_pclmul crc32_pclmul qxl drm_kms_helper syscopyarea aesni_intel sysfillrect sysimgblt fb_sys_fops ttm drm aes_x86_64 crypto_simd cryptd 8139cp glue_helper mii pata_acpi floppy
[ 699.817079] ---[ end trace 4ce02f25ff7d3df6 ]---
[ 699.818068] RIP: 0010:f2fs_submit_page_bio+0x29b/0x730
[ 699.819114] Code: 54 49 8d bd 18 04 00 00 e8 b2 59 af ff 41 8b 8d 18 04 00 00 8b 45 b8 41 d3 e6 44 01 f0 4c 8d 73 14 41 39 c7 0f 82 37 fe ff ff <0f> 0b 65 8b 05 2c 04 77 47 89 c0 48 0f a3 05 52 c1 d5 01 0f 92 c0
[ 699.822919] RSP: 0018:ffff8801f43af508 EFLAGS: 00010283
[ 699.823977] RAX: 0000000000000000 RBX: ffff8801f43af7b8 RCX: ffffffffb88a7cef
[ 699.825436] RDX: 0000000000000007 RSI: dffffc0000000000 RDI: ffff8801e3e7a64c
[ 699.826881] RBP: ffff8801f43af558 R08: ffffed003e066b55 R09: ffffed003e066b55
[ 699.828292] R10: 0000000000000001 R11: ffffed003e066b54 R12: ffffea0007876940
[ 699.829750] R13: ffff8801f0335500 R14: ffff8801e3e7a600 R15: 0000000000000001
[ 699.831192] FS: 00007f9bf97f5700(0000) GS:ffff8801f6e00000(0000) knlGS:0000000000000000
[ 699.832793] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 699.833981] CR2: 00007f9bf925d170 CR3: 00000001f0c34000 CR4: 00000000000006f0
[ 699.835556] ==================================================================
[ 699.837029] BUG: KASAN: stack-out-of-bounds in update_stack_state+0x38c/0x3e0
[ 699.838462] Read of size 8 at addr ffff8801f43af970 by task a.out/1309
[ 699.840086] CPU: 0 PID: 1309 Comm: a.out Tainted: G D W 4.18.0-rc1+ #4
[ 699.841603] Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS Ubuntu-1.8.2-1ubuntu1 04/01/2014
[ 699.843475] Call Trace:
[ 699.843982] dump_stack+0x7b/0xb5
[ 699.844661] print_address_description+0x70/0x290
[ 699.845607] kasan_report+0x291/0x390
[ 699.846351] ? update_stack_state+0x38c/0x3e0
[ 699.853831] __asan_load8+0x54/0x90
[ 699.854569] update_stack_state+0x38c/0x3e0
[ 699.855428] ? __read_once_size_nocheck.constprop.7+0x20/0x20
[ 699.856601] ? __save_stack_trace+0x5e/0x100
[ 699.857476] unwind_next_frame.part.5+0x18e/0x490
[ 699.858448] ? unwind_dump+0x290/0x290
[ 699.859217] ? clear_page_dirty_for_io+0x332/0x450
[ 699.860185] __unwind_start+0x106/0x190
[ 699.860974] __save_stack_trace+0x5e/0x100
[ 699.861808] ? __save_stack_trace+0x5e/0x100
[ 699.862691] ? unlink_anon_vmas+0xba/0x2c0
[ 699.863525] save_stack_trace+0x1f/0x30
[ 699.864312] save_stack+0x46/0xd0
[ 699.864993] ? __alloc_pages_slowpath+0x1420/0x1420
[ 699.865990] ? flush_tlb_mm_range+0x15e/0x220
[ 699.866889] ? kasan_check_write+0x14/0x20
[ 699.867724] ? __dec_node_state+0x92/0xb0
[ 699.868543] ? lock_page_memcg+0x85/0xf0
[ 699.869350] ? unlock_page_memcg+0x16/0x80
[ 699.870185] ? page_remove_rmap+0x198/0x520
[ 699.871048] ? mark_page_accessed+0x133/0x200
[ 699.871930] ? _cond_resched+0x1a/0x50
[ 699.872700] ? unmap_page_range+0xcd4/0xe50
[ 699.873551] ? rb_next+0x58/0x80
[ 699.874217] ? rb_next+0x58/0x80
[ 699.874895] __kasan_slab_free+0x13c/0x1a0
[ 699.875734] ? unlink_anon_vmas+0xba/0x2c0
[ 699.876563] kasan_slab_free+0xe/0x10
[ 699.877315] kmem_cache_free+0x89/0x1e0
[ 699.878095] unlink_anon_vmas+0xba/0x2c0
[ 699.878913] free_pgtables+0x101/0x1b0
[ 699.879677] exit_mmap+0x146/0x2a0
[ 699.880378] ? __ia32_sys_munmap+0x50/0x50
[ 699.881214] ? kasan_check_read+0x11/0x20
[ 699.882052] ? mm_update_next_owner+0x322/0x380
[ 699.882985] mmput+0x8b/0x1d0
[ 699.883602] do_exit+0x43a/0x1390
[ 699.884288] ? mm_update_next_owner+0x380/0x380
[ 699.885212] ? f2fs_sync_file+0x9a/0xb0
[ 699.885995] ? f2fs_do_sync_file+0xd90/0xd90
[ 699.886877] ? vfs_fsync_range+0x68/0x100
[ 699.887694] ? __fget_light+0xc9/0xe0
[ 699.888442] ? do_fsync+0x3d/0x70
[ 699.889118] ? __x64_sys_fdatasync+0x24/0x30
[ 699.889996] rewind_stack_do_exit+0x17/0x20
[ 699.890860] RIP: 0033:0x7f9bf930d800
[ 699.891585] Code: Bad RIP value.
[ 699.892268] RSP: 002b:00007ffee3606c68 EFLAGS: 00000246 ORIG_RAX: 000000000000004b
[ 699.893781] RAX: ffffffffffffffda RBX: 0000000000000000 RCX: 00007f9bf930d800
[ 699.895220] RDX: 0000000000008000 RSI: 00000000006010a0 RDI: 0000000000000003
[ 699.896643] RBP: 00007ffee3606ca0 R08: 0000000001503010 R09: 0000000000000000
[ 699.898069] R10: 00000000000002e8 R11: 0000000000000246 R12: 0000000000400610
[ 699.899505] R13: 00007ffee3606da0 R14: 0000000000000000 R15: 0000000000000000
[ 699.901241] The buggy address belongs to the page:
[ 699.902215] page:ffffea0007d0ebc0 count:0 mapcount:0 mapping:0000000000000000 index:0x0
[ 699.903811] flags: 0x2ffff0000000000()
[ 699.904585] raw: 02ffff0000000000 0000000000000000 ffffffff07d00101 0000000000000000
[ 699.906125] raw: 0000000000000000 0000000000240000 00000000ffffffff 0000000000000000
[ 699.907673] page dumped because: kasan: bad access detected
[ 699.909108] Memory state around the buggy address:
[ 699.910077] ffff8801f43af800: 00 f1 f1 f1 f1 00 f4 f4 f4 f3 f3 f3 f3 00 00 00
[ 699.911528] ffff8801f43af880: 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00 00
[ 699.912953] >ffff8801f43af900: 00 00 00 00 00 00 00 00 f1 01 f4 f4 f4 f2 f2 f2
[ 699.914392] ^
[ 699.915758] ffff8801f43af980: f2 00 f4 f4 00 00 00 00 f2 00 00 00 00 00 00 00
[ 699.917193] ffff8801f43afa00: 00 00 00 00 00 00 00 00 00 f3 f3 f3 00 00 00 00
[ 699.918634] ==================================================================
- Location
https://elixir.bootlin.com/linux/v4.18-rc1/source/fs/f2fs/segment.h#L644
Reported-by Wen Xu <wen.xu@gatech.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-08-01 19:13:44 +08:00
|
|
|
goto out_writepage;
|
|
|
|
}
|
2012-11-02 16:10:12 +08:00
|
|
|
/*
|
|
|
|
* If current allocation needs SSR,
|
|
|
|
* it had better in-place writes for updated data.
|
|
|
|
*/
|
f2fs: introduce DATA_GENERIC_ENHANCE
Previously, f2fs_is_valid_blkaddr(, blkaddr, DATA_GENERIC) will check
whether @blkaddr locates in main area or not.
That check is weak, since the block address in range of main area can
point to the address which is not valid in segment info table, and we
can not detect such condition, we may suffer worse corruption as system
continues running.
So this patch introduce DATA_GENERIC_ENHANCE to enhance the sanity check
which trigger SIT bitmap check rather than only range check.
This patch did below changes as wel:
- set SBI_NEED_FSCK in f2fs_is_valid_blkaddr().
- get rid of is_valid_data_blkaddr() to avoid panic if blkaddr is invalid.
- introduce verify_fio_blkaddr() to wrap fio {new,old}_blkaddr validation check.
- spread blkaddr check in:
* f2fs_get_node_info()
* __read_out_blkaddrs()
* f2fs_submit_page_read()
* ra_data_block()
* do_recover_data()
This patch can fix bug reported from bugzilla below:
https://bugzilla.kernel.org/show_bug.cgi?id=203215
https://bugzilla.kernel.org/show_bug.cgi?id=203223
https://bugzilla.kernel.org/show_bug.cgi?id=203231
https://bugzilla.kernel.org/show_bug.cgi?id=203235
https://bugzilla.kernel.org/show_bug.cgi?id=203241
= Update by Jaegeuk Kim =
DATA_GENERIC_ENHANCE enhanced to validate block addresses on read/write paths.
But, xfstest/generic/446 compalins some generated kernel messages saying invalid
bitmap was detected when reading a block. The reaons is, when we get the
block addresses from extent_cache, there is no lock to synchronize it from
truncating the blocks in parallel.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-04-15 15:26:32 +08:00
|
|
|
if (ipu_force ||
|
|
|
|
(__is_valid_data_blkaddr(fio->old_blkaddr) &&
|
2018-05-23 22:25:08 +08:00
|
|
|
need_inplace_update(fio))) {
|
2017-05-13 04:51:34 +08:00
|
|
|
err = encrypt_one_page(fio);
|
|
|
|
if (err)
|
|
|
|
goto out_writepage;
|
|
|
|
|
|
|
|
set_page_writeback(page);
|
2018-04-12 14:09:04 +08:00
|
|
|
ClearPageError(page);
|
2017-04-27 00:17:21 +08:00
|
|
|
f2fs_put_dnode(&dn);
|
2017-05-13 04:51:34 +08:00
|
|
|
if (fio->need_lock == LOCK_REQ)
|
2017-04-27 00:17:21 +08:00
|
|
|
f2fs_unlock_op(fio->sbi);
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
err = f2fs_inplace_write_data(fio);
|
2019-02-21 20:37:14 +08:00
|
|
|
if (err) {
|
|
|
|
if (f2fs_encrypted_file(inode))
|
2019-05-21 00:29:39 +08:00
|
|
|
fscrypt_finalize_bounce_page(&fio->encrypted_page);
|
2019-02-21 20:37:14 +08:00
|
|
|
if (PageWriteback(page))
|
|
|
|
end_page_writeback(page);
|
2019-04-15 15:30:53 +08:00
|
|
|
} else {
|
|
|
|
set_inode_flag(inode, FI_UPDATE_WRITE);
|
2019-02-21 20:37:14 +08:00
|
|
|
}
|
2017-04-25 20:45:12 +08:00
|
|
|
trace_f2fs_do_write_data_page(fio->page, IPU);
|
2017-04-27 00:17:21 +08:00
|
|
|
return err;
|
2012-11-02 16:10:12 +08:00
|
|
|
}
|
2017-04-27 00:17:21 +08:00
|
|
|
|
2017-05-13 04:51:34 +08:00
|
|
|
if (fio->need_lock == LOCK_RETRY) {
|
|
|
|
if (!f2fs_trylock_op(fio->sbi)) {
|
|
|
|
err = -EAGAIN;
|
|
|
|
goto out_writepage;
|
|
|
|
}
|
|
|
|
fio->need_lock = LOCK_REQ;
|
|
|
|
}
|
|
|
|
|
2018-07-17 00:02:17 +08:00
|
|
|
err = f2fs_get_node_info(fio->sbi, dn.nid, &ni);
|
|
|
|
if (err)
|
|
|
|
goto out_writepage;
|
|
|
|
|
|
|
|
fio->version = ni.version;
|
|
|
|
|
2017-05-13 04:51:34 +08:00
|
|
|
err = encrypt_one_page(fio);
|
|
|
|
if (err)
|
|
|
|
goto out_writepage;
|
|
|
|
|
|
|
|
set_page_writeback(page);
|
2018-04-12 14:09:04 +08:00
|
|
|
ClearPageError(page);
|
2017-05-13 04:51:34 +08:00
|
|
|
|
2017-04-27 00:17:21 +08:00
|
|
|
/* LFS mode write path */
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
f2fs_outplace_write_data(&dn, fio);
|
2017-04-27 00:17:21 +08:00
|
|
|
trace_f2fs_do_write_data_page(page, OPU);
|
|
|
|
set_inode_flag(inode, FI_APPEND_WRITE);
|
|
|
|
if (page->index == 0)
|
|
|
|
set_inode_flag(inode, FI_FIRST_BLOCK_WRITTEN);
|
2012-11-02 16:10:12 +08:00
|
|
|
out_writepage:
|
|
|
|
f2fs_put_dnode(&dn);
|
2017-04-27 00:17:21 +08:00
|
|
|
out:
|
2017-05-13 04:51:34 +08:00
|
|
|
if (fio->need_lock == LOCK_REQ)
|
2017-04-27 00:17:21 +08:00
|
|
|
f2fs_unlock_op(fio->sbi);
|
2012-11-02 16:10:12 +08:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2017-02-04 09:44:04 +08:00
|
|
|
static int __write_data_page(struct page *page, bool *submitted,
|
f2fs: add bio cache for IPU
SQLite in Wal mode may trigger sequential IPU write in db-wal file, after
commit d1b3e72d5490 ("f2fs: submit bio of in-place-update pages"), we
lost the chance of merging page in inner managed bio cache, result in
submitting more small-sized IO.
So let's add temporary bio in writepages() to cache mergeable write IO as
much as possible.
Test case:
1. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
2. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
Before:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65552, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65560, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65568, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65576, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65584, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65592, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65600, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65608, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65616, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65624, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65632, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65640, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65648, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65656, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65664, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57352, size = 4096
After:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 65536
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57368, size = 4096
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-02-19 16:15:29 +08:00
|
|
|
struct bio **bio,
|
|
|
|
sector_t *last_block,
|
2017-08-02 23:21:48 +08:00
|
|
|
struct writeback_control *wbc,
|
|
|
|
enum iostat_type io_type)
|
2012-11-02 16:10:12 +08:00
|
|
|
{
|
|
|
|
struct inode *inode = page->mapping->host;
|
2014-09-03 06:31:18 +08:00
|
|
|
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
|
2012-11-02 16:10:12 +08:00
|
|
|
loff_t i_size = i_size_read(inode);
|
|
|
|
const pgoff_t end_index = ((unsigned long long) i_size)
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
>> PAGE_SHIFT;
|
2024-06-11 20:08:33 +08:00
|
|
|
loff_t psize = (loff_t)(page->index + 1) << PAGE_SHIFT;
|
f2fs: handle inline data operations
Hook inline data read/write, truncate, fallocate, setattr, etc.
Files need meet following 2 requirement to inline:
1) file size is not greater than MAX_INLINE_DATA;
2) file doesn't pre-allocate data blocks by fallocate().
FI_INLINE_DATA will not be set while creating a new regular inode because
most of the files are bigger than ~3.4K. Set FI_INLINE_DATA only when
data is submitted to block layer, ranther than set it while creating a new
inode, this also avoids converting data from inline to normal data block
and vice versa.
While writting inline data to inode block, the first data block should be
released if the file has a block indexed by i_addr[0].
On the other hand, when a file operation is appied to a file with inline
data, we need to test if this file can remain inline by doing this
operation, otherwise it should be convert into normal file by reserving
a new data block, copying inline data to this new block and clear
FI_INLINE_DATA flag. Because reserve a new data block here will make use
of i_addr[0], if we save inline data in i_addr[0..872], then the first
4 bytes would be overwriten. This problem can be avoided simply by
not using i_addr[0] for inline data.
Signed-off-by: Huajun Li <huajun.li@intel.com>
Signed-off-by: Haicheng Li <haicheng.li@linux.intel.com>
Signed-off-by: Weihong Xu <weihong.xu@intel.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-11-10 23:13:20 +08:00
|
|
|
unsigned offset = 0;
|
f2fs: introduce a new global lock scheme
In the previous version, f2fs uses global locks according to the usage types,
such as directory operations, block allocation, block write, and so on.
Reference the following lock types in f2fs.h.
enum lock_type {
RENAME, /* for renaming operations */
DENTRY_OPS, /* for directory operations */
DATA_WRITE, /* for data write */
DATA_NEW, /* for data allocation */
DATA_TRUNC, /* for data truncate */
NODE_NEW, /* for node allocation */
NODE_TRUNC, /* for node truncate */
NODE_WRITE, /* for node write */
NR_LOCK_TYPE,
};
In that case, we lose the performance under the multi-threading environment,
since every types of operations must be conducted one at a time.
In order to address the problem, let's share the locks globally with a mutex
array regardless of any types.
So, let users grab a mutex and perform their jobs in parallel as much as
possbile.
For this, I propose a new global lock scheme as follows.
0. Data structure
- f2fs_sb_info -> mutex_lock[NR_GLOBAL_LOCKS]
- f2fs_sb_info -> node_write
1. mutex_lock_op(sbi)
- try to get an avaiable lock from the array.
- returns the index of the gottern lock variable.
2. mutex_unlock_op(sbi, index of the lock)
- unlock the given index of the lock.
3. mutex_lock_all(sbi)
- grab all the locks in the array before the checkpoint.
4. mutex_unlock_all(sbi)
- release all the locks in the array after checkpoint.
5. block_operations()
- call mutex_lock_all()
- sync_dirty_dir_inodes()
- grab node_write
- sync_node_pages()
Note that,
the pairs of mutex_lock_op()/mutex_unlock_op() and
mutex_lock_all()/mutex_unlock_all() should be used together.
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-11-22 15:21:29 +08:00
|
|
|
bool need_balance_fs = false;
|
2012-11-02 16:10:12 +08:00
|
|
|
int err = 0;
|
2013-12-11 12:54:01 +08:00
|
|
|
struct f2fs_io_info fio = {
|
2015-04-24 05:38:15 +08:00
|
|
|
.sbi = sbi,
|
2017-09-29 13:59:38 +08:00
|
|
|
.ino = inode->i_ino,
|
2013-12-11 12:54:01 +08:00
|
|
|
.type = DATA,
|
2016-06-06 03:31:55 +08:00
|
|
|
.op = REQ_OP_WRITE,
|
2016-11-02 00:00:38 +08:00
|
|
|
.op_flags = wbc_to_write_flags(wbc),
|
2017-04-25 20:45:13 +08:00
|
|
|
.old_blkaddr = NULL_ADDR,
|
2015-04-24 05:38:15 +08:00
|
|
|
.page = page,
|
2015-04-24 03:04:33 +08:00
|
|
|
.encrypted_page = NULL,
|
2017-02-04 09:44:04 +08:00
|
|
|
.submitted = false,
|
2017-05-13 04:51:34 +08:00
|
|
|
.need_lock = LOCK_RETRY,
|
2017-08-02 23:21:48 +08:00
|
|
|
.io_type = io_type,
|
2018-01-09 19:33:39 +08:00
|
|
|
.io_wbc = wbc,
|
f2fs: add bio cache for IPU
SQLite in Wal mode may trigger sequential IPU write in db-wal file, after
commit d1b3e72d5490 ("f2fs: submit bio of in-place-update pages"), we
lost the chance of merging page in inner managed bio cache, result in
submitting more small-sized IO.
So let's add temporary bio in writepages() to cache mergeable write IO as
much as possible.
Test case:
1. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
2. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
Before:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65552, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65560, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65568, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65576, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65584, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65592, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65600, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65608, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65616, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65624, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65632, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65640, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65648, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65656, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65664, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57352, size = 4096
After:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 65536
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57368, size = 4096
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-02-19 16:15:29 +08:00
|
|
|
.bio = bio,
|
|
|
|
.last_block = last_block,
|
2013-12-11 12:54:01 +08:00
|
|
|
};
|
2012-11-02 16:10:12 +08:00
|
|
|
|
2014-05-06 16:48:26 +08:00
|
|
|
trace_f2fs_writepage(page, DATA);
|
|
|
|
|
2018-01-18 17:29:10 +08:00
|
|
|
/* we should bypass data pages to proceed the kworkder jobs */
|
|
|
|
if (unlikely(f2fs_cp_error(sbi))) {
|
|
|
|
mapping_set_error(page->mapping, -EIO);
|
2018-05-28 16:59:26 +08:00
|
|
|
/*
|
|
|
|
* don't drop any dirty dentry pages for keeping lastest
|
|
|
|
* directory structure.
|
|
|
|
*/
|
|
|
|
if (S_ISDIR(inode->i_mode))
|
|
|
|
goto redirty_out;
|
2018-01-18 17:29:10 +08:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2017-06-29 23:20:45 +08:00
|
|
|
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
|
|
|
|
goto redirty_out;
|
|
|
|
|
f2fs: add fs-verity support
Add fs-verity support to f2fs. fs-verity is a filesystem feature that
enables transparent integrity protection and authentication of read-only
files. It uses a dm-verity like mechanism at the file level: a Merkle
tree is used to verify any block in the file in log(filesize) time. It
is implemented mainly by helper functions in fs/verity/. See
Documentation/filesystems/fsverity.rst for the full documentation.
The f2fs support for fs-verity consists of:
- Adding a filesystem feature flag and an inode flag for fs-verity.
- Implementing the fsverity_operations to support enabling verity on an
inode and reading/writing the verity metadata.
- Updating ->readpages() to verify data as it's read from verity files
and to support reading verity metadata pages.
- Updating ->write_begin(), ->write_end(), and ->writepages() to support
writing verity metadata pages.
- Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl().
Like ext4, f2fs stores the verity metadata (Merkle tree and
fsverity_descriptor) past the end of the file, starting at the first 64K
boundary beyond i_size. This approach works because (a) verity files
are readonly, and (b) pages fully beyond i_size aren't visible to
userspace but can be read/written internally by f2fs with only some
relatively small changes to f2fs. Extended attributes cannot be used
because (a) f2fs limits the total size of an inode's xattr entries to
4096 bytes, which wouldn't be enough for even a single Merkle tree
block, and (b) f2fs encryption doesn't encrypt xattrs, yet the verity
metadata *must* be encrypted when the file is because it contains hashes
of the plaintext data.
Acked-by: Jaegeuk Kim <jaegeuk@kernel.org>
Acked-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
2019-07-23 00:26:24 +08:00
|
|
|
if (page->index < end_index || f2fs_verity_in_progress(inode))
|
f2fs: introduce a new global lock scheme
In the previous version, f2fs uses global locks according to the usage types,
such as directory operations, block allocation, block write, and so on.
Reference the following lock types in f2fs.h.
enum lock_type {
RENAME, /* for renaming operations */
DENTRY_OPS, /* for directory operations */
DATA_WRITE, /* for data write */
DATA_NEW, /* for data allocation */
DATA_TRUNC, /* for data truncate */
NODE_NEW, /* for node allocation */
NODE_TRUNC, /* for node truncate */
NODE_WRITE, /* for node write */
NR_LOCK_TYPE,
};
In that case, we lose the performance under the multi-threading environment,
since every types of operations must be conducted one at a time.
In order to address the problem, let's share the locks globally with a mutex
array regardless of any types.
So, let users grab a mutex and perform their jobs in parallel as much as
possbile.
For this, I propose a new global lock scheme as follows.
0. Data structure
- f2fs_sb_info -> mutex_lock[NR_GLOBAL_LOCKS]
- f2fs_sb_info -> node_write
1. mutex_lock_op(sbi)
- try to get an avaiable lock from the array.
- returns the index of the gottern lock variable.
2. mutex_unlock_op(sbi, index of the lock)
- unlock the given index of the lock.
3. mutex_lock_all(sbi)
- grab all the locks in the array before the checkpoint.
4. mutex_unlock_all(sbi)
- release all the locks in the array after checkpoint.
5. block_operations()
- call mutex_lock_all()
- sync_dirty_dir_inodes()
- grab node_write
- sync_node_pages()
Note that,
the pairs of mutex_lock_op()/mutex_unlock_op() and
mutex_lock_all()/mutex_unlock_all() should be used together.
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-11-22 15:21:29 +08:00
|
|
|
goto write;
|
2012-11-02 16:10:12 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If the offset is out-of-range of file size,
|
|
|
|
* this page does not have to be written to disk.
|
|
|
|
*/
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
offset = i_size & (PAGE_SIZE - 1);
|
2014-04-15 15:04:15 +08:00
|
|
|
if ((page->index >= end_index + 1) || !offset)
|
f2fs: introduce a new global lock scheme
In the previous version, f2fs uses global locks according to the usage types,
such as directory operations, block allocation, block write, and so on.
Reference the following lock types in f2fs.h.
enum lock_type {
RENAME, /* for renaming operations */
DENTRY_OPS, /* for directory operations */
DATA_WRITE, /* for data write */
DATA_NEW, /* for data allocation */
DATA_TRUNC, /* for data truncate */
NODE_NEW, /* for node allocation */
NODE_TRUNC, /* for node truncate */
NODE_WRITE, /* for node write */
NR_LOCK_TYPE,
};
In that case, we lose the performance under the multi-threading environment,
since every types of operations must be conducted one at a time.
In order to address the problem, let's share the locks globally with a mutex
array regardless of any types.
So, let users grab a mutex and perform their jobs in parallel as much as
possbile.
For this, I propose a new global lock scheme as follows.
0. Data structure
- f2fs_sb_info -> mutex_lock[NR_GLOBAL_LOCKS]
- f2fs_sb_info -> node_write
1. mutex_lock_op(sbi)
- try to get an avaiable lock from the array.
- returns the index of the gottern lock variable.
2. mutex_unlock_op(sbi, index of the lock)
- unlock the given index of the lock.
3. mutex_lock_all(sbi)
- grab all the locks in the array before the checkpoint.
4. mutex_unlock_all(sbi)
- release all the locks in the array after checkpoint.
5. block_operations()
- call mutex_lock_all()
- sync_dirty_dir_inodes()
- grab node_write
- sync_node_pages()
Note that,
the pairs of mutex_lock_op()/mutex_unlock_op() and
mutex_lock_all()/mutex_unlock_all() should be used together.
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-11-22 15:21:29 +08:00
|
|
|
goto out;
|
2012-11-02 16:10:12 +08:00
|
|
|
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
zero_user_segment(page, offset, PAGE_SIZE);
|
f2fs: introduce a new global lock scheme
In the previous version, f2fs uses global locks according to the usage types,
such as directory operations, block allocation, block write, and so on.
Reference the following lock types in f2fs.h.
enum lock_type {
RENAME, /* for renaming operations */
DENTRY_OPS, /* for directory operations */
DATA_WRITE, /* for data write */
DATA_NEW, /* for data allocation */
DATA_TRUNC, /* for data truncate */
NODE_NEW, /* for node allocation */
NODE_TRUNC, /* for node truncate */
NODE_WRITE, /* for node write */
NR_LOCK_TYPE,
};
In that case, we lose the performance under the multi-threading environment,
since every types of operations must be conducted one at a time.
In order to address the problem, let's share the locks globally with a mutex
array regardless of any types.
So, let users grab a mutex and perform their jobs in parallel as much as
possbile.
For this, I propose a new global lock scheme as follows.
0. Data structure
- f2fs_sb_info -> mutex_lock[NR_GLOBAL_LOCKS]
- f2fs_sb_info -> node_write
1. mutex_lock_op(sbi)
- try to get an avaiable lock from the array.
- returns the index of the gottern lock variable.
2. mutex_unlock_op(sbi, index of the lock)
- unlock the given index of the lock.
3. mutex_lock_all(sbi)
- grab all the locks in the array before the checkpoint.
4. mutex_unlock_all(sbi)
- release all the locks in the array after checkpoint.
5. block_operations()
- call mutex_lock_all()
- sync_dirty_dir_inodes()
- grab node_write
- sync_node_pages()
Note that,
the pairs of mutex_lock_op()/mutex_unlock_op() and
mutex_lock_all()/mutex_unlock_all() should be used together.
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-11-22 15:21:29 +08:00
|
|
|
write:
|
2014-12-09 22:08:59 +08:00
|
|
|
if (f2fs_is_drop_cache(inode))
|
|
|
|
goto out;
|
2016-04-15 07:48:52 +08:00
|
|
|
/* we should not write 0'th page having journal header */
|
|
|
|
if (f2fs_is_volatile_file(inode) && (!page->index ||
|
|
|
|
(!wbc->for_reclaim &&
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
f2fs_available_free_memory(sbi, BASE_CHECK))))
|
2014-12-09 22:08:59 +08:00
|
|
|
goto redirty_out;
|
2012-11-02 16:10:12 +08:00
|
|
|
|
f2fs: introduce a new global lock scheme
In the previous version, f2fs uses global locks according to the usage types,
such as directory operations, block allocation, block write, and so on.
Reference the following lock types in f2fs.h.
enum lock_type {
RENAME, /* for renaming operations */
DENTRY_OPS, /* for directory operations */
DATA_WRITE, /* for data write */
DATA_NEW, /* for data allocation */
DATA_TRUNC, /* for data truncate */
NODE_NEW, /* for node allocation */
NODE_TRUNC, /* for node truncate */
NODE_WRITE, /* for node write */
NR_LOCK_TYPE,
};
In that case, we lose the performance under the multi-threading environment,
since every types of operations must be conducted one at a time.
In order to address the problem, let's share the locks globally with a mutex
array regardless of any types.
So, let users grab a mutex and perform their jobs in parallel as much as
possbile.
For this, I propose a new global lock scheme as follows.
0. Data structure
- f2fs_sb_info -> mutex_lock[NR_GLOBAL_LOCKS]
- f2fs_sb_info -> node_write
1. mutex_lock_op(sbi)
- try to get an avaiable lock from the array.
- returns the index of the gottern lock variable.
2. mutex_unlock_op(sbi, index of the lock)
- unlock the given index of the lock.
3. mutex_lock_all(sbi)
- grab all the locks in the array before the checkpoint.
4. mutex_unlock_all(sbi)
- release all the locks in the array after checkpoint.
5. block_operations()
- call mutex_lock_all()
- sync_dirty_dir_inodes()
- grab node_write
- sync_node_pages()
Note that,
the pairs of mutex_lock_op()/mutex_unlock_op() and
mutex_lock_all()/mutex_unlock_all() should be used together.
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-11-22 15:21:29 +08:00
|
|
|
/* Dentry blocks are controlled by checkpoint */
|
2012-11-02 16:10:12 +08:00
|
|
|
if (S_ISDIR(inode->i_mode)) {
|
2017-05-13 04:51:34 +08:00
|
|
|
fio.need_lock = LOCK_DONE;
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
err = f2fs_do_write_data_page(&fio);
|
2014-02-17 18:29:27 +08:00
|
|
|
goto done;
|
|
|
|
}
|
f2fs: handle inline data operations
Hook inline data read/write, truncate, fallocate, setattr, etc.
Files need meet following 2 requirement to inline:
1) file size is not greater than MAX_INLINE_DATA;
2) file doesn't pre-allocate data blocks by fallocate().
FI_INLINE_DATA will not be set while creating a new regular inode because
most of the files are bigger than ~3.4K. Set FI_INLINE_DATA only when
data is submitted to block layer, ranther than set it while creating a new
inode, this also avoids converting data from inline to normal data block
and vice versa.
While writting inline data to inode block, the first data block should be
released if the file has a block indexed by i_addr[0].
On the other hand, when a file operation is appied to a file with inline
data, we need to test if this file can remain inline by doing this
operation, otherwise it should be convert into normal file by reserving
a new data block, copying inline data to this new block and clear
FI_INLINE_DATA flag. Because reserve a new data block here will make use
of i_addr[0], if we save inline data in i_addr[0..872], then the first
4 bytes would be overwriten. This problem can be avoided simply by
not using i_addr[0] for inline data.
Signed-off-by: Huajun Li <huajun.li@intel.com>
Signed-off-by: Haicheng Li <haicheng.li@linux.intel.com>
Signed-off-by: Weihong Xu <weihong.xu@intel.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-11-10 23:13:20 +08:00
|
|
|
|
2014-02-17 18:29:27 +08:00
|
|
|
if (!wbc->for_reclaim)
|
f2fs: introduce a new global lock scheme
In the previous version, f2fs uses global locks according to the usage types,
such as directory operations, block allocation, block write, and so on.
Reference the following lock types in f2fs.h.
enum lock_type {
RENAME, /* for renaming operations */
DENTRY_OPS, /* for directory operations */
DATA_WRITE, /* for data write */
DATA_NEW, /* for data allocation */
DATA_TRUNC, /* for data truncate */
NODE_NEW, /* for node allocation */
NODE_TRUNC, /* for node truncate */
NODE_WRITE, /* for node write */
NR_LOCK_TYPE,
};
In that case, we lose the performance under the multi-threading environment,
since every types of operations must be conducted one at a time.
In order to address the problem, let's share the locks globally with a mutex
array regardless of any types.
So, let users grab a mutex and perform their jobs in parallel as much as
possbile.
For this, I propose a new global lock scheme as follows.
0. Data structure
- f2fs_sb_info -> mutex_lock[NR_GLOBAL_LOCKS]
- f2fs_sb_info -> node_write
1. mutex_lock_op(sbi)
- try to get an avaiable lock from the array.
- returns the index of the gottern lock variable.
2. mutex_unlock_op(sbi, index of the lock)
- unlock the given index of the lock.
3. mutex_lock_all(sbi)
- grab all the locks in the array before the checkpoint.
4. mutex_unlock_all(sbi)
- release all the locks in the array after checkpoint.
5. block_operations()
- call mutex_lock_all()
- sync_dirty_dir_inodes()
- grab node_write
- sync_node_pages()
Note that,
the pairs of mutex_lock_op()/mutex_unlock_op() and
mutex_lock_all()/mutex_unlock_all() should be used together.
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-11-22 15:21:29 +08:00
|
|
|
need_balance_fs = true;
|
2016-09-02 03:02:51 +08:00
|
|
|
else if (has_not_enough_free_secs(sbi, 0, 0))
|
f2fs: introduce a new global lock scheme
In the previous version, f2fs uses global locks according to the usage types,
such as directory operations, block allocation, block write, and so on.
Reference the following lock types in f2fs.h.
enum lock_type {
RENAME, /* for renaming operations */
DENTRY_OPS, /* for directory operations */
DATA_WRITE, /* for data write */
DATA_NEW, /* for data allocation */
DATA_TRUNC, /* for data truncate */
NODE_NEW, /* for node allocation */
NODE_TRUNC, /* for node truncate */
NODE_WRITE, /* for node write */
NR_LOCK_TYPE,
};
In that case, we lose the performance under the multi-threading environment,
since every types of operations must be conducted one at a time.
In order to address the problem, let's share the locks globally with a mutex
array regardless of any types.
So, let users grab a mutex and perform their jobs in parallel as much as
possbile.
For this, I propose a new global lock scheme as follows.
0. Data structure
- f2fs_sb_info -> mutex_lock[NR_GLOBAL_LOCKS]
- f2fs_sb_info -> node_write
1. mutex_lock_op(sbi)
- try to get an avaiable lock from the array.
- returns the index of the gottern lock variable.
2. mutex_unlock_op(sbi, index of the lock)
- unlock the given index of the lock.
3. mutex_lock_all(sbi)
- grab all the locks in the array before the checkpoint.
4. mutex_unlock_all(sbi)
- release all the locks in the array after checkpoint.
5. block_operations()
- call mutex_lock_all()
- sync_dirty_dir_inodes()
- grab node_write
- sync_node_pages()
Note that,
the pairs of mutex_lock_op()/mutex_unlock_op() and
mutex_lock_all()/mutex_unlock_all() should be used together.
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-11-22 15:21:29 +08:00
|
|
|
goto redirty_out;
|
2017-03-25 08:05:13 +08:00
|
|
|
else
|
|
|
|
set_inode_flag(inode, FI_HOT_DATA);
|
2012-11-02 16:10:12 +08:00
|
|
|
|
2014-10-24 10:48:09 +08:00
|
|
|
err = -EAGAIN;
|
2017-02-23 20:31:20 +08:00
|
|
|
if (f2fs_has_inline_data(inode)) {
|
2014-10-24 10:48:09 +08:00
|
|
|
err = f2fs_write_inline_data(inode, page);
|
2017-02-23 20:31:20 +08:00
|
|
|
if (!err)
|
|
|
|
goto out;
|
|
|
|
}
|
2017-04-27 00:17:21 +08:00
|
|
|
|
2017-05-13 04:51:34 +08:00
|
|
|
if (err == -EAGAIN) {
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
err = f2fs_do_write_data_page(&fio);
|
2017-05-13 04:51:34 +08:00
|
|
|
if (err == -EAGAIN) {
|
|
|
|
fio.need_lock = LOCK_REQ;
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
err = f2fs_do_write_data_page(&fio);
|
2017-05-13 04:51:34 +08:00
|
|
|
}
|
|
|
|
}
|
2017-10-09 17:55:19 +08:00
|
|
|
|
2018-01-17 16:31:37 +08:00
|
|
|
if (err) {
|
|
|
|
file_set_keep_isize(inode);
|
|
|
|
} else {
|
|
|
|
down_write(&F2FS_I(inode)->i_sem);
|
|
|
|
if (F2FS_I(inode)->last_disk_size < psize)
|
|
|
|
F2FS_I(inode)->last_disk_size = psize;
|
|
|
|
up_write(&F2FS_I(inode)->i_sem);
|
|
|
|
}
|
2017-04-27 00:17:21 +08:00
|
|
|
|
2014-02-17 18:29:27 +08:00
|
|
|
done:
|
|
|
|
if (err && err != -ENOENT)
|
|
|
|
goto redirty_out;
|
2012-11-02 16:10:12 +08:00
|
|
|
|
f2fs: introduce a new global lock scheme
In the previous version, f2fs uses global locks according to the usage types,
such as directory operations, block allocation, block write, and so on.
Reference the following lock types in f2fs.h.
enum lock_type {
RENAME, /* for renaming operations */
DENTRY_OPS, /* for directory operations */
DATA_WRITE, /* for data write */
DATA_NEW, /* for data allocation */
DATA_TRUNC, /* for data truncate */
NODE_NEW, /* for node allocation */
NODE_TRUNC, /* for node truncate */
NODE_WRITE, /* for node write */
NR_LOCK_TYPE,
};
In that case, we lose the performance under the multi-threading environment,
since every types of operations must be conducted one at a time.
In order to address the problem, let's share the locks globally with a mutex
array regardless of any types.
So, let users grab a mutex and perform their jobs in parallel as much as
possbile.
For this, I propose a new global lock scheme as follows.
0. Data structure
- f2fs_sb_info -> mutex_lock[NR_GLOBAL_LOCKS]
- f2fs_sb_info -> node_write
1. mutex_lock_op(sbi)
- try to get an avaiable lock from the array.
- returns the index of the gottern lock variable.
2. mutex_unlock_op(sbi, index of the lock)
- unlock the given index of the lock.
3. mutex_lock_all(sbi)
- grab all the locks in the array before the checkpoint.
4. mutex_unlock_all(sbi)
- release all the locks in the array after checkpoint.
5. block_operations()
- call mutex_lock_all()
- sync_dirty_dir_inodes()
- grab node_write
- sync_node_pages()
Note that,
the pairs of mutex_lock_op()/mutex_unlock_op() and
mutex_lock_all()/mutex_unlock_all() should be used together.
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2012-11-22 15:21:29 +08:00
|
|
|
out:
|
2014-09-13 06:53:45 +08:00
|
|
|
inode_dec_dirty_pages(inode);
|
2018-07-27 18:15:16 +08:00
|
|
|
if (err) {
|
2015-02-26 11:25:01 +08:00
|
|
|
ClearPageUptodate(page);
|
2018-07-27 18:15:16 +08:00
|
|
|
clear_cold_data(page);
|
|
|
|
}
|
f2fs: introduce f2fs_submit_merged_bio_cond
f2fs use single bio buffer per type data (META/NODE/DATA) for caching
writes locating in continuous block address as many as possible, after
submitting, these writes may be still cached in bio buffer, so we have
to flush cached writes in bio buffer by calling f2fs_submit_merged_bio.
Unfortunately, in the scenario of high concurrency, bio buffer could be
flushed by someone else before we submit it as below reasons:
a) there is no space in bio buffer.
b) add a request of different type (SYNC, ASYNC).
c) add a discontinuous block address.
For this condition, f2fs_submit_merged_bio will be devastating, because
it could break the following merging of writes in bio buffer, split one
big bio into two smaller one.
This patch introduces f2fs_submit_merged_bio_cond which can do a
conditional submitting with bio buffer, before submitting it will judge
whether:
- page in DATA type bio buffer is matching with specified page;
- page in DATA type bio buffer is belong to specified inode;
- page in NODE type bio buffer is belong to specified inode;
If there is no eligible page in bio buffer, we will skip submitting step,
result in gaining more chance to merge consecutive block IOs in bio cache.
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-01-18 18:28:11 +08:00
|
|
|
|
|
|
|
if (wbc->for_reclaim) {
|
2018-09-27 23:41:16 +08:00
|
|
|
f2fs_submit_merged_write_cond(sbi, NULL, page, 0, DATA);
|
2017-03-25 08:05:13 +08:00
|
|
|
clear_inode_flag(inode, FI_HOT_DATA);
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
f2fs_remove_dirty_inode(inode);
|
2017-02-04 09:44:04 +08:00
|
|
|
submitted = NULL;
|
f2fs: introduce f2fs_submit_merged_bio_cond
f2fs use single bio buffer per type data (META/NODE/DATA) for caching
writes locating in continuous block address as many as possible, after
submitting, these writes may be still cached in bio buffer, so we have
to flush cached writes in bio buffer by calling f2fs_submit_merged_bio.
Unfortunately, in the scenario of high concurrency, bio buffer could be
flushed by someone else before we submit it as below reasons:
a) there is no space in bio buffer.
b) add a request of different type (SYNC, ASYNC).
c) add a discontinuous block address.
For this condition, f2fs_submit_merged_bio will be devastating, because
it could break the following merging of writes in bio buffer, split one
big bio into two smaller one.
This patch introduces f2fs_submit_merged_bio_cond which can do a
conditional submitting with bio buffer, before submitting it will judge
whether:
- page in DATA type bio buffer is matching with specified page;
- page in DATA type bio buffer is belong to specified inode;
- page in NODE type bio buffer is belong to specified inode;
If there is no eligible page in bio buffer, we will skip submitting step,
result in gaining more chance to merge consecutive block IOs in bio cache.
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-01-18 18:28:11 +08:00
|
|
|
}
|
|
|
|
|
2012-11-02 16:10:12 +08:00
|
|
|
unlock_page(page);
|
f2fs: fix potential recursive call when enabling data_flush
As Hagbard Celine reported:
Hi, this is a long standing bug that I've hit before on older kernels,
but I was not able to get the syslog saved because of the nature of
the bug. This time I had booted form a pen-drive, and was able to save
the log to it's efi-partition.
What i did to trigger it was to create a partition and format it f2fs,
then mount it with options:
"rw,relatime,lazytime,background_gc=on,disable_ext_identify,discard,heap,user_xattr,inline_xattr,acl,inline_data,inline_dentry,flush_merge,data_flush,extent_cache,mode=adaptive,active_logs=6,whint_mode=fs-based,alloc_mode=default,fsync_mode=strict".
Then I unpacked a big .tar.xz to the partition (I used a
gentoo-stage3-tarball as I was in process of installing Gentoo).
Same options just without data_flush gives no problems.
Mar 20 20:54:01 usbgentoo kernel: FAT-fs (nvme0n1p4): Volume was not
properly unmounted. Some data may be corrupt. Please run fsck.
Mar 20 21:05:23 usbgentoo kernel: kworker/dying (1588) used greatest
stack depth: 12064 bytes left
Mar 20 21:06:40 usbgentoo kernel: BUG: stack guard page was hit at
00000000a4b0733c (stack is 0000000056016422..0000000096e7463f)
Mar 20 21:06:40 usbgentoo kernel: kernel stack overflow
......
Mar 20 21:06:40 usbgentoo kernel: Call Trace:
Mar 20 21:06:40 usbgentoo kernel: read_node_page+0x71/0xf0
Mar 20 21:06:40 usbgentoo kernel: ? xas_load+0x8/0x50
Mar 20 21:06:40 usbgentoo kernel: __get_node_page+0x73/0x2a0
Mar 20 21:06:40 usbgentoo kernel: f2fs_get_dnode_of_data+0x34e/0x580
Mar 20 21:06:40 usbgentoo kernel: f2fs_write_inline_data+0x5e/0x2a0
Mar 20 21:06:40 usbgentoo kernel: __write_data_page+0x421/0x690
Mar 20 21:06:40 usbgentoo kernel: f2fs_write_cache_pages+0x1cf/0x460
Mar 20 21:06:40 usbgentoo kernel: f2fs_write_data_pages+0x2b3/0x2e0
Mar 20 21:06:40 usbgentoo kernel: ? f2fs_inode_chksum_verify+0x1d/0xc0
Mar 20 21:06:40 usbgentoo kernel: ? read_node_page+0x71/0xf0
Mar 20 21:06:40 usbgentoo kernel: do_writepages+0x3c/0xd0
Mar 20 21:06:40 usbgentoo kernel: __filemap_fdatawrite_range+0x7c/0xb0
Mar 20 21:06:40 usbgentoo kernel: f2fs_sync_dirty_inodes+0xf2/0x200
Mar 20 21:06:40 usbgentoo kernel: f2fs_balance_fs_bg+0x2a3/0x2c0
Mar 20 21:06:40 usbgentoo kernel: ? f2fs_inode_dirtied+0x21/0xc0
Mar 20 21:06:40 usbgentoo kernel: f2fs_balance_fs+0xd6/0x2b0
Mar 20 21:06:40 usbgentoo kernel: __write_data_page+0x4fb/0x690
......
Mar 20 21:06:40 usbgentoo kernel: __writeback_single_inode+0x2a1/0x340
Mar 20 21:06:40 usbgentoo kernel: ? soft_cursor+0x1b4/0x220
Mar 20 21:06:40 usbgentoo kernel: writeback_sb_inodes+0x1d5/0x3e0
Mar 20 21:06:40 usbgentoo kernel: __writeback_inodes_wb+0x58/0xa0
Mar 20 21:06:40 usbgentoo kernel: wb_writeback+0x250/0x2e0
Mar 20 21:06:40 usbgentoo kernel: ? 0xffffffff8c000000
Mar 20 21:06:40 usbgentoo kernel: ? cpumask_next+0x16/0x20
Mar 20 21:06:40 usbgentoo kernel: wb_workfn+0x2f6/0x3b0
Mar 20 21:06:40 usbgentoo kernel: ? __switch_to_asm+0x40/0x70
Mar 20 21:06:40 usbgentoo kernel: process_one_work+0x1f5/0x3f0
Mar 20 21:06:40 usbgentoo kernel: worker_thread+0x28/0x3c0
Mar 20 21:06:40 usbgentoo kernel: ? rescuer_thread+0x330/0x330
Mar 20 21:06:40 usbgentoo kernel: kthread+0x10e/0x130
Mar 20 21:06:40 usbgentoo kernel: ? kthread_create_on_node+0x60/0x60
Mar 20 21:06:40 usbgentoo kernel: ret_from_fork+0x35/0x40
The root cause is that we run into an infinite recursive calling in
between f2fs_balance_fs_bg and writepage() as described below:
- f2fs_write_data_pages --- A
- __write_data_page
- f2fs_balance_fs
- f2fs_balance_fs_bg --- B
- f2fs_sync_dirty_inodes
- filemap_fdatawrite
- f2fs_write_data_pages --- A
...
- f2fs_balance_fs_bg --- B
...
In order to fix this issue, let's detect such condition in __write_data_page()
and just skip calling f2fs_balance_fs() recursively.
Reported-by: Hagbard Celine <hagbardcelin@gmail.com>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-04-02 18:52:19 +08:00
|
|
|
if (!S_ISDIR(inode->i_mode) && !IS_NOQUOTA(inode) &&
|
f2fs: add bio cache for IPU
SQLite in Wal mode may trigger sequential IPU write in db-wal file, after
commit d1b3e72d5490 ("f2fs: submit bio of in-place-update pages"), we
lost the chance of merging page in inner managed bio cache, result in
submitting more small-sized IO.
So let's add temporary bio in writepages() to cache mergeable write IO as
much as possible.
Test case:
1. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
2. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
Before:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65552, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65560, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65568, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65576, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65584, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65592, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65600, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65608, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65616, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65624, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65632, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65640, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65648, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65656, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65664, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57352, size = 4096
After:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 65536
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57368, size = 4096
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-02-19 16:15:29 +08:00
|
|
|
!F2FS_I(inode)->cp_task) {
|
|
|
|
f2fs_submit_ipu_bio(sbi, bio, page);
|
2017-04-21 04:51:57 +08:00
|
|
|
f2fs_balance_fs(sbi, need_balance_fs);
|
f2fs: add bio cache for IPU
SQLite in Wal mode may trigger sequential IPU write in db-wal file, after
commit d1b3e72d5490 ("f2fs: submit bio of in-place-update pages"), we
lost the chance of merging page in inner managed bio cache, result in
submitting more small-sized IO.
So let's add temporary bio in writepages() to cache mergeable write IO as
much as possible.
Test case:
1. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
2. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
Before:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65552, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65560, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65568, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65576, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65584, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65592, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65600, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65608, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65616, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65624, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65632, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65640, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65648, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65656, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65664, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57352, size = 4096
After:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 65536
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57368, size = 4096
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-02-19 16:15:29 +08:00
|
|
|
}
|
f2fs: introduce f2fs_submit_merged_bio_cond
f2fs use single bio buffer per type data (META/NODE/DATA) for caching
writes locating in continuous block address as many as possible, after
submitting, these writes may be still cached in bio buffer, so we have
to flush cached writes in bio buffer by calling f2fs_submit_merged_bio.
Unfortunately, in the scenario of high concurrency, bio buffer could be
flushed by someone else before we submit it as below reasons:
a) there is no space in bio buffer.
b) add a request of different type (SYNC, ASYNC).
c) add a discontinuous block address.
For this condition, f2fs_submit_merged_bio will be devastating, because
it could break the following merging of writes in bio buffer, split one
big bio into two smaller one.
This patch introduces f2fs_submit_merged_bio_cond which can do a
conditional submitting with bio buffer, before submitting it will judge
whether:
- page in DATA type bio buffer is matching with specified page;
- page in DATA type bio buffer is belong to specified inode;
- page in NODE type bio buffer is belong to specified inode;
If there is no eligible page in bio buffer, we will skip submitting step,
result in gaining more chance to merge consecutive block IOs in bio cache.
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-01-18 18:28:11 +08:00
|
|
|
|
2017-02-04 09:44:04 +08:00
|
|
|
if (unlikely(f2fs_cp_error(sbi))) {
|
f2fs: add bio cache for IPU
SQLite in Wal mode may trigger sequential IPU write in db-wal file, after
commit d1b3e72d5490 ("f2fs: submit bio of in-place-update pages"), we
lost the chance of merging page in inner managed bio cache, result in
submitting more small-sized IO.
So let's add temporary bio in writepages() to cache mergeable write IO as
much as possible.
Test case:
1. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
2. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
Before:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65552, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65560, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65568, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65576, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65584, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65592, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65600, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65608, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65616, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65624, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65632, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65640, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65648, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65656, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65664, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57352, size = 4096
After:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 65536
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57368, size = 4096
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-02-19 16:15:29 +08:00
|
|
|
f2fs_submit_ipu_bio(sbi, bio, page);
|
2017-05-11 02:28:38 +08:00
|
|
|
f2fs_submit_merged_write(sbi, DATA);
|
2017-02-04 09:44:04 +08:00
|
|
|
submitted = NULL;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (submitted)
|
|
|
|
*submitted = fio.submitted;
|
f2fs: introduce f2fs_submit_merged_bio_cond
f2fs use single bio buffer per type data (META/NODE/DATA) for caching
writes locating in continuous block address as many as possible, after
submitting, these writes may be still cached in bio buffer, so we have
to flush cached writes in bio buffer by calling f2fs_submit_merged_bio.
Unfortunately, in the scenario of high concurrency, bio buffer could be
flushed by someone else before we submit it as below reasons:
a) there is no space in bio buffer.
b) add a request of different type (SYNC, ASYNC).
c) add a discontinuous block address.
For this condition, f2fs_submit_merged_bio will be devastating, because
it could break the following merging of writes in bio buffer, split one
big bio into two smaller one.
This patch introduces f2fs_submit_merged_bio_cond which can do a
conditional submitting with bio buffer, before submitting it will judge
whether:
- page in DATA type bio buffer is matching with specified page;
- page in DATA type bio buffer is belong to specified inode;
- page in NODE type bio buffer is belong to specified inode;
If there is no eligible page in bio buffer, we will skip submitting step,
result in gaining more chance to merge consecutive block IOs in bio cache.
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-01-18 18:28:11 +08:00
|
|
|
|
2012-11-02 16:10:12 +08:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
redirty_out:
|
2014-04-15 15:04:15 +08:00
|
|
|
redirty_page_for_writepage(wbc, page);
|
2018-05-04 14:26:02 +08:00
|
|
|
/*
|
|
|
|
* pageout() in MM traslates EAGAIN, so calls handle_write_error()
|
|
|
|
* -> mapping_set_error() -> set_bit(AS_EIO, ...).
|
|
|
|
* file_write_and_wait_range() will see EIO error, which is critical
|
|
|
|
* to return value of fsync() followed by atomic_write failure to user.
|
|
|
|
*/
|
|
|
|
if (!err || wbc->for_reclaim)
|
2016-11-29 11:13:43 +08:00
|
|
|
return AOP_WRITEPAGE_ACTIVATE;
|
2016-05-30 12:18:23 +08:00
|
|
|
unlock_page(page);
|
|
|
|
return err;
|
2013-01-15 15:45:24 +08:00
|
|
|
}
|
|
|
|
|
2017-02-04 09:18:00 +08:00
|
|
|
static int f2fs_write_data_page(struct page *page,
|
|
|
|
struct writeback_control *wbc)
|
|
|
|
{
|
f2fs: add bio cache for IPU
SQLite in Wal mode may trigger sequential IPU write in db-wal file, after
commit d1b3e72d5490 ("f2fs: submit bio of in-place-update pages"), we
lost the chance of merging page in inner managed bio cache, result in
submitting more small-sized IO.
So let's add temporary bio in writepages() to cache mergeable write IO as
much as possible.
Test case:
1. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
2. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
Before:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65552, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65560, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65568, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65576, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65584, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65592, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65600, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65608, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65616, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65624, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65632, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65640, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65648, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65656, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65664, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57352, size = 4096
After:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 65536
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57368, size = 4096
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-02-19 16:15:29 +08:00
|
|
|
return __write_data_page(page, NULL, NULL, NULL, wbc, FS_DATA_IO);
|
2017-02-04 09:18:00 +08:00
|
|
|
}
|
|
|
|
|
f2fs: expose f2fs_write_cache_pages
If there are gced dirty pages and normal dirty pages in the mapping
of one inode, we might writeback them alternately with discontinuous
block address, resulting in low performance.
This patch introduces f2fs_write_cache_pages with codes copied from
write_cache_pages in mm/page-writeback.c.
In this function, we refactor flow with two steps:
1) writeback all cold type pages.
2) writeback all non-cold type pages.
By using this method, f2fs will writeback dirty pages with the same
temperature in bunch mode, it makes writeouted block being with
more continuous address, so they can be merged as much as possible
in f2fs bio cache, and also it will reduce the chance of submiting
small IO from block layer.
Test environment: 8g nokia sd card (very old sd card, but it shows
better effect when testing with this patch, and with a 32g kingston
sd card, I didn't see much more improvement).
Test step:
1. touch testfile;
2. truncate -s 512K testfile;
3. write all pages with odd index;
4. trigger gc by ioctl;
5. write all pages with even index;
6. time fsync testfile.
before:
real 0m0.402s
user 0m0.000s
sys 0m0.000s
after:
real 0m0.143s
user 0m0.004s
sys 0m0.004s
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2015-07-14 18:56:10 +08:00
|
|
|
/*
|
|
|
|
* This function was copied from write_cche_pages from mm/page-writeback.c.
|
|
|
|
* The major change is making write step of cold data page separately from
|
|
|
|
* warm/hot data page.
|
|
|
|
*/
|
|
|
|
static int f2fs_write_cache_pages(struct address_space *mapping,
|
2017-08-02 23:21:48 +08:00
|
|
|
struct writeback_control *wbc,
|
|
|
|
enum iostat_type io_type)
|
f2fs: expose f2fs_write_cache_pages
If there are gced dirty pages and normal dirty pages in the mapping
of one inode, we might writeback them alternately with discontinuous
block address, resulting in low performance.
This patch introduces f2fs_write_cache_pages with codes copied from
write_cache_pages in mm/page-writeback.c.
In this function, we refactor flow with two steps:
1) writeback all cold type pages.
2) writeback all non-cold type pages.
By using this method, f2fs will writeback dirty pages with the same
temperature in bunch mode, it makes writeouted block being with
more continuous address, so they can be merged as much as possible
in f2fs bio cache, and also it will reduce the chance of submiting
small IO from block layer.
Test environment: 8g nokia sd card (very old sd card, but it shows
better effect when testing with this patch, and with a 32g kingston
sd card, I didn't see much more improvement).
Test step:
1. touch testfile;
2. truncate -s 512K testfile;
3. write all pages with odd index;
4. trigger gc by ioctl;
5. write all pages with even index;
6. time fsync testfile.
before:
real 0m0.402s
user 0m0.000s
sys 0m0.000s
after:
real 0m0.143s
user 0m0.004s
sys 0m0.004s
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2015-07-14 18:56:10 +08:00
|
|
|
{
|
|
|
|
int ret = 0;
|
|
|
|
int done = 0;
|
|
|
|
struct pagevec pvec;
|
2018-06-04 23:20:36 +08:00
|
|
|
struct f2fs_sb_info *sbi = F2FS_M_SB(mapping);
|
f2fs: add bio cache for IPU
SQLite in Wal mode may trigger sequential IPU write in db-wal file, after
commit d1b3e72d5490 ("f2fs: submit bio of in-place-update pages"), we
lost the chance of merging page in inner managed bio cache, result in
submitting more small-sized IO.
So let's add temporary bio in writepages() to cache mergeable write IO as
much as possible.
Test case:
1. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
2. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
Before:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65552, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65560, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65568, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65576, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65584, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65592, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65600, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65608, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65616, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65624, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65632, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65640, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65648, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65656, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65664, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57352, size = 4096
After:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 65536
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57368, size = 4096
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-02-19 16:15:29 +08:00
|
|
|
struct bio *bio = NULL;
|
|
|
|
sector_t last_block;
|
f2fs: expose f2fs_write_cache_pages
If there are gced dirty pages and normal dirty pages in the mapping
of one inode, we might writeback them alternately with discontinuous
block address, resulting in low performance.
This patch introduces f2fs_write_cache_pages with codes copied from
write_cache_pages in mm/page-writeback.c.
In this function, we refactor flow with two steps:
1) writeback all cold type pages.
2) writeback all non-cold type pages.
By using this method, f2fs will writeback dirty pages with the same
temperature in bunch mode, it makes writeouted block being with
more continuous address, so they can be merged as much as possible
in f2fs bio cache, and also it will reduce the chance of submiting
small IO from block layer.
Test environment: 8g nokia sd card (very old sd card, but it shows
better effect when testing with this patch, and with a 32g kingston
sd card, I didn't see much more improvement).
Test step:
1. touch testfile;
2. truncate -s 512K testfile;
3. write all pages with odd index;
4. trigger gc by ioctl;
5. write all pages with even index;
6. time fsync testfile.
before:
real 0m0.402s
user 0m0.000s
sys 0m0.000s
after:
real 0m0.143s
user 0m0.004s
sys 0m0.004s
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2015-07-14 18:56:10 +08:00
|
|
|
int nr_pages;
|
|
|
|
pgoff_t uninitialized_var(writeback_index);
|
|
|
|
pgoff_t index;
|
|
|
|
pgoff_t end; /* Inclusive */
|
|
|
|
pgoff_t done_index;
|
|
|
|
int cycled;
|
|
|
|
int range_whole = 0;
|
2017-12-06 06:30:38 +08:00
|
|
|
xa_mark_t tag;
|
2018-09-27 23:41:16 +08:00
|
|
|
int nwritten = 0;
|
f2fs: expose f2fs_write_cache_pages
If there are gced dirty pages and normal dirty pages in the mapping
of one inode, we might writeback them alternately with discontinuous
block address, resulting in low performance.
This patch introduces f2fs_write_cache_pages with codes copied from
write_cache_pages in mm/page-writeback.c.
In this function, we refactor flow with two steps:
1) writeback all cold type pages.
2) writeback all non-cold type pages.
By using this method, f2fs will writeback dirty pages with the same
temperature in bunch mode, it makes writeouted block being with
more continuous address, so they can be merged as much as possible
in f2fs bio cache, and also it will reduce the chance of submiting
small IO from block layer.
Test environment: 8g nokia sd card (very old sd card, but it shows
better effect when testing with this patch, and with a 32g kingston
sd card, I didn't see much more improvement).
Test step:
1. touch testfile;
2. truncate -s 512K testfile;
3. write all pages with odd index;
4. trigger gc by ioctl;
5. write all pages with even index;
6. time fsync testfile.
before:
real 0m0.402s
user 0m0.000s
sys 0m0.000s
after:
real 0m0.143s
user 0m0.004s
sys 0m0.004s
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2015-07-14 18:56:10 +08:00
|
|
|
|
2017-11-16 09:37:52 +08:00
|
|
|
pagevec_init(&pvec);
|
2016-05-26 11:57:16 +08:00
|
|
|
|
2017-03-25 08:05:13 +08:00
|
|
|
if (get_dirty_pages(mapping->host) <=
|
|
|
|
SM_I(F2FS_M_SB(mapping))->min_hot_blocks)
|
|
|
|
set_inode_flag(mapping->host, FI_HOT_DATA);
|
|
|
|
else
|
|
|
|
clear_inode_flag(mapping->host, FI_HOT_DATA);
|
|
|
|
|
f2fs: expose f2fs_write_cache_pages
If there are gced dirty pages and normal dirty pages in the mapping
of one inode, we might writeback them alternately with discontinuous
block address, resulting in low performance.
This patch introduces f2fs_write_cache_pages with codes copied from
write_cache_pages in mm/page-writeback.c.
In this function, we refactor flow with two steps:
1) writeback all cold type pages.
2) writeback all non-cold type pages.
By using this method, f2fs will writeback dirty pages with the same
temperature in bunch mode, it makes writeouted block being with
more continuous address, so they can be merged as much as possible
in f2fs bio cache, and also it will reduce the chance of submiting
small IO from block layer.
Test environment: 8g nokia sd card (very old sd card, but it shows
better effect when testing with this patch, and with a 32g kingston
sd card, I didn't see much more improvement).
Test step:
1. touch testfile;
2. truncate -s 512K testfile;
3. write all pages with odd index;
4. trigger gc by ioctl;
5. write all pages with even index;
6. time fsync testfile.
before:
real 0m0.402s
user 0m0.000s
sys 0m0.000s
after:
real 0m0.143s
user 0m0.004s
sys 0m0.004s
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2015-07-14 18:56:10 +08:00
|
|
|
if (wbc->range_cyclic) {
|
|
|
|
writeback_index = mapping->writeback_index; /* prev offset */
|
|
|
|
index = writeback_index;
|
|
|
|
if (index == 0)
|
|
|
|
cycled = 1;
|
|
|
|
else
|
|
|
|
cycled = 0;
|
|
|
|
end = -1;
|
|
|
|
} else {
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
index = wbc->range_start >> PAGE_SHIFT;
|
|
|
|
end = wbc->range_end >> PAGE_SHIFT;
|
f2fs: expose f2fs_write_cache_pages
If there are gced dirty pages and normal dirty pages in the mapping
of one inode, we might writeback them alternately with discontinuous
block address, resulting in low performance.
This patch introduces f2fs_write_cache_pages with codes copied from
write_cache_pages in mm/page-writeback.c.
In this function, we refactor flow with two steps:
1) writeback all cold type pages.
2) writeback all non-cold type pages.
By using this method, f2fs will writeback dirty pages with the same
temperature in bunch mode, it makes writeouted block being with
more continuous address, so they can be merged as much as possible
in f2fs bio cache, and also it will reduce the chance of submiting
small IO from block layer.
Test environment: 8g nokia sd card (very old sd card, but it shows
better effect when testing with this patch, and with a 32g kingston
sd card, I didn't see much more improvement).
Test step:
1. touch testfile;
2. truncate -s 512K testfile;
3. write all pages with odd index;
4. trigger gc by ioctl;
5. write all pages with even index;
6. time fsync testfile.
before:
real 0m0.402s
user 0m0.000s
sys 0m0.000s
after:
real 0m0.143s
user 0m0.004s
sys 0m0.004s
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2015-07-14 18:56:10 +08:00
|
|
|
if (wbc->range_start == 0 && wbc->range_end == LLONG_MAX)
|
|
|
|
range_whole = 1;
|
|
|
|
cycled = 1; /* ignore range_cyclic tests */
|
|
|
|
}
|
|
|
|
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
|
|
|
|
tag = PAGECACHE_TAG_TOWRITE;
|
|
|
|
else
|
|
|
|
tag = PAGECACHE_TAG_DIRTY;
|
|
|
|
retry:
|
|
|
|
if (wbc->sync_mode == WB_SYNC_ALL || wbc->tagged_writepages)
|
|
|
|
tag_pages_for_writeback(mapping, index, end);
|
|
|
|
done_index = index;
|
|
|
|
while (!done && (index <= end)) {
|
|
|
|
int i;
|
|
|
|
|
2017-11-16 09:34:48 +08:00
|
|
|
nr_pages = pagevec_lookup_range_tag(&pvec, mapping, &index, end,
|
2017-11-16 09:35:19 +08:00
|
|
|
tag);
|
f2fs: expose f2fs_write_cache_pages
If there are gced dirty pages and normal dirty pages in the mapping
of one inode, we might writeback them alternately with discontinuous
block address, resulting in low performance.
This patch introduces f2fs_write_cache_pages with codes copied from
write_cache_pages in mm/page-writeback.c.
In this function, we refactor flow with two steps:
1) writeback all cold type pages.
2) writeback all non-cold type pages.
By using this method, f2fs will writeback dirty pages with the same
temperature in bunch mode, it makes writeouted block being with
more continuous address, so they can be merged as much as possible
in f2fs bio cache, and also it will reduce the chance of submiting
small IO from block layer.
Test environment: 8g nokia sd card (very old sd card, but it shows
better effect when testing with this patch, and with a 32g kingston
sd card, I didn't see much more improvement).
Test step:
1. touch testfile;
2. truncate -s 512K testfile;
3. write all pages with odd index;
4. trigger gc by ioctl;
5. write all pages with even index;
6. time fsync testfile.
before:
real 0m0.402s
user 0m0.000s
sys 0m0.000s
after:
real 0m0.143s
user 0m0.004s
sys 0m0.004s
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2015-07-14 18:56:10 +08:00
|
|
|
if (nr_pages == 0)
|
|
|
|
break;
|
|
|
|
|
|
|
|
for (i = 0; i < nr_pages; i++) {
|
|
|
|
struct page *page = pvec.pages[i];
|
2017-02-04 09:44:04 +08:00
|
|
|
bool submitted = false;
|
f2fs: expose f2fs_write_cache_pages
If there are gced dirty pages and normal dirty pages in the mapping
of one inode, we might writeback them alternately with discontinuous
block address, resulting in low performance.
This patch introduces f2fs_write_cache_pages with codes copied from
write_cache_pages in mm/page-writeback.c.
In this function, we refactor flow with two steps:
1) writeback all cold type pages.
2) writeback all non-cold type pages.
By using this method, f2fs will writeback dirty pages with the same
temperature in bunch mode, it makes writeouted block being with
more continuous address, so they can be merged as much as possible
in f2fs bio cache, and also it will reduce the chance of submiting
small IO from block layer.
Test environment: 8g nokia sd card (very old sd card, but it shows
better effect when testing with this patch, and with a 32g kingston
sd card, I didn't see much more improvement).
Test step:
1. touch testfile;
2. truncate -s 512K testfile;
3. write all pages with odd index;
4. trigger gc by ioctl;
5. write all pages with even index;
6. time fsync testfile.
before:
real 0m0.402s
user 0m0.000s
sys 0m0.000s
after:
real 0m0.143s
user 0m0.004s
sys 0m0.004s
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2015-07-14 18:56:10 +08:00
|
|
|
|
2018-05-23 22:25:09 +08:00
|
|
|
/* give a priority to WB_SYNC threads */
|
2018-06-04 23:20:36 +08:00
|
|
|
if (atomic_read(&sbi->wb_sync_req[DATA]) &&
|
2018-05-23 22:25:09 +08:00
|
|
|
wbc->sync_mode == WB_SYNC_NONE) {
|
|
|
|
done = 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
|
f2fs: expose f2fs_write_cache_pages
If there are gced dirty pages and normal dirty pages in the mapping
of one inode, we might writeback them alternately with discontinuous
block address, resulting in low performance.
This patch introduces f2fs_write_cache_pages with codes copied from
write_cache_pages in mm/page-writeback.c.
In this function, we refactor flow with two steps:
1) writeback all cold type pages.
2) writeback all non-cold type pages.
By using this method, f2fs will writeback dirty pages with the same
temperature in bunch mode, it makes writeouted block being with
more continuous address, so they can be merged as much as possible
in f2fs bio cache, and also it will reduce the chance of submiting
small IO from block layer.
Test environment: 8g nokia sd card (very old sd card, but it shows
better effect when testing with this patch, and with a 32g kingston
sd card, I didn't see much more improvement).
Test step:
1. touch testfile;
2. truncate -s 512K testfile;
3. write all pages with odd index;
4. trigger gc by ioctl;
5. write all pages with even index;
6. time fsync testfile.
before:
real 0m0.402s
user 0m0.000s
sys 0m0.000s
after:
real 0m0.143s
user 0m0.004s
sys 0m0.004s
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2015-07-14 18:56:10 +08:00
|
|
|
done_index = page->index;
|
2017-06-22 08:52:39 +08:00
|
|
|
retry_write:
|
f2fs: expose f2fs_write_cache_pages
If there are gced dirty pages and normal dirty pages in the mapping
of one inode, we might writeback them alternately with discontinuous
block address, resulting in low performance.
This patch introduces f2fs_write_cache_pages with codes copied from
write_cache_pages in mm/page-writeback.c.
In this function, we refactor flow with two steps:
1) writeback all cold type pages.
2) writeback all non-cold type pages.
By using this method, f2fs will writeback dirty pages with the same
temperature in bunch mode, it makes writeouted block being with
more continuous address, so they can be merged as much as possible
in f2fs bio cache, and also it will reduce the chance of submiting
small IO from block layer.
Test environment: 8g nokia sd card (very old sd card, but it shows
better effect when testing with this patch, and with a 32g kingston
sd card, I didn't see much more improvement).
Test step:
1. touch testfile;
2. truncate -s 512K testfile;
3. write all pages with odd index;
4. trigger gc by ioctl;
5. write all pages with even index;
6. time fsync testfile.
before:
real 0m0.402s
user 0m0.000s
sys 0m0.000s
after:
real 0m0.143s
user 0m0.004s
sys 0m0.004s
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2015-07-14 18:56:10 +08:00
|
|
|
lock_page(page);
|
|
|
|
|
|
|
|
if (unlikely(page->mapping != mapping)) {
|
|
|
|
continue_unlock:
|
|
|
|
unlock_page(page);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!PageDirty(page)) {
|
|
|
|
/* someone wrote it for us */
|
|
|
|
goto continue_unlock;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (PageWriteback(page)) {
|
f2fs: add bio cache for IPU
SQLite in Wal mode may trigger sequential IPU write in db-wal file, after
commit d1b3e72d5490 ("f2fs: submit bio of in-place-update pages"), we
lost the chance of merging page in inner managed bio cache, result in
submitting more small-sized IO.
So let's add temporary bio in writepages() to cache mergeable write IO as
much as possible.
Test case:
1. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
2. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
Before:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65552, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65560, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65568, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65576, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65584, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65592, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65600, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65608, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65616, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65624, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65632, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65640, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65648, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65656, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65664, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57352, size = 4096
After:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 65536
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57368, size = 4096
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-02-19 16:15:29 +08:00
|
|
|
if (wbc->sync_mode != WB_SYNC_NONE) {
|
2016-01-20 23:43:51 +08:00
|
|
|
f2fs_wait_on_page_writeback(page,
|
2018-12-25 17:43:42 +08:00
|
|
|
DATA, true, true);
|
f2fs: add bio cache for IPU
SQLite in Wal mode may trigger sequential IPU write in db-wal file, after
commit d1b3e72d5490 ("f2fs: submit bio of in-place-update pages"), we
lost the chance of merging page in inner managed bio cache, result in
submitting more small-sized IO.
So let's add temporary bio in writepages() to cache mergeable write IO as
much as possible.
Test case:
1. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
2. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
Before:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65552, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65560, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65568, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65576, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65584, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65592, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65600, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65608, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65616, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65624, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65632, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65640, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65648, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65656, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65664, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57352, size = 4096
After:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 65536
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57368, size = 4096
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-02-19 16:15:29 +08:00
|
|
|
f2fs_submit_ipu_bio(sbi, &bio, page);
|
|
|
|
} else {
|
f2fs: expose f2fs_write_cache_pages
If there are gced dirty pages and normal dirty pages in the mapping
of one inode, we might writeback them alternately with discontinuous
block address, resulting in low performance.
This patch introduces f2fs_write_cache_pages with codes copied from
write_cache_pages in mm/page-writeback.c.
In this function, we refactor flow with two steps:
1) writeback all cold type pages.
2) writeback all non-cold type pages.
By using this method, f2fs will writeback dirty pages with the same
temperature in bunch mode, it makes writeouted block being with
more continuous address, so they can be merged as much as possible
in f2fs bio cache, and also it will reduce the chance of submiting
small IO from block layer.
Test environment: 8g nokia sd card (very old sd card, but it shows
better effect when testing with this patch, and with a 32g kingston
sd card, I didn't see much more improvement).
Test step:
1. touch testfile;
2. truncate -s 512K testfile;
3. write all pages with odd index;
4. trigger gc by ioctl;
5. write all pages with even index;
6. time fsync testfile.
before:
real 0m0.402s
user 0m0.000s
sys 0m0.000s
after:
real 0m0.143s
user 0m0.004s
sys 0m0.004s
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2015-07-14 18:56:10 +08:00
|
|
|
goto continue_unlock;
|
f2fs: add bio cache for IPU
SQLite in Wal mode may trigger sequential IPU write in db-wal file, after
commit d1b3e72d5490 ("f2fs: submit bio of in-place-update pages"), we
lost the chance of merging page in inner managed bio cache, result in
submitting more small-sized IO.
So let's add temporary bio in writepages() to cache mergeable write IO as
much as possible.
Test case:
1. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
2. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
Before:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65552, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65560, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65568, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65576, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65584, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65592, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65600, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65608, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65616, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65624, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65632, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65640, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65648, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65656, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65664, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57352, size = 4096
After:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 65536
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57368, size = 4096
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-02-19 16:15:29 +08:00
|
|
|
}
|
f2fs: expose f2fs_write_cache_pages
If there are gced dirty pages and normal dirty pages in the mapping
of one inode, we might writeback them alternately with discontinuous
block address, resulting in low performance.
This patch introduces f2fs_write_cache_pages with codes copied from
write_cache_pages in mm/page-writeback.c.
In this function, we refactor flow with two steps:
1) writeback all cold type pages.
2) writeback all non-cold type pages.
By using this method, f2fs will writeback dirty pages with the same
temperature in bunch mode, it makes writeouted block being with
more continuous address, so they can be merged as much as possible
in f2fs bio cache, and also it will reduce the chance of submiting
small IO from block layer.
Test environment: 8g nokia sd card (very old sd card, but it shows
better effect when testing with this patch, and with a 32g kingston
sd card, I didn't see much more improvement).
Test step:
1. touch testfile;
2. truncate -s 512K testfile;
3. write all pages with odd index;
4. trigger gc by ioctl;
5. write all pages with even index;
6. time fsync testfile.
before:
real 0m0.402s
user 0m0.000s
sys 0m0.000s
after:
real 0m0.143s
user 0m0.004s
sys 0m0.004s
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2015-07-14 18:56:10 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
if (!clear_page_dirty_for_io(page))
|
|
|
|
goto continue_unlock;
|
|
|
|
|
f2fs: add bio cache for IPU
SQLite in Wal mode may trigger sequential IPU write in db-wal file, after
commit d1b3e72d5490 ("f2fs: submit bio of in-place-update pages"), we
lost the chance of merging page in inner managed bio cache, result in
submitting more small-sized IO.
So let's add temporary bio in writepages() to cache mergeable write IO as
much as possible.
Test case:
1. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
2. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
Before:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65552, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65560, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65568, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65576, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65584, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65592, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65600, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65608, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65616, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65624, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65632, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65640, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65648, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65656, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65664, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57352, size = 4096
After:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 65536
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57368, size = 4096
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-02-19 16:15:29 +08:00
|
|
|
ret = __write_data_page(page, &submitted, &bio,
|
|
|
|
&last_block, wbc, io_type);
|
f2fs: expose f2fs_write_cache_pages
If there are gced dirty pages and normal dirty pages in the mapping
of one inode, we might writeback them alternately with discontinuous
block address, resulting in low performance.
This patch introduces f2fs_write_cache_pages with codes copied from
write_cache_pages in mm/page-writeback.c.
In this function, we refactor flow with two steps:
1) writeback all cold type pages.
2) writeback all non-cold type pages.
By using this method, f2fs will writeback dirty pages with the same
temperature in bunch mode, it makes writeouted block being with
more continuous address, so they can be merged as much as possible
in f2fs bio cache, and also it will reduce the chance of submiting
small IO from block layer.
Test environment: 8g nokia sd card (very old sd card, but it shows
better effect when testing with this patch, and with a 32g kingston
sd card, I didn't see much more improvement).
Test step:
1. touch testfile;
2. truncate -s 512K testfile;
3. write all pages with odd index;
4. trigger gc by ioctl;
5. write all pages with even index;
6. time fsync testfile.
before:
real 0m0.402s
user 0m0.000s
sys 0m0.000s
after:
real 0m0.143s
user 0m0.004s
sys 0m0.004s
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2015-07-14 18:56:10 +08:00
|
|
|
if (unlikely(ret)) {
|
2016-11-29 11:13:43 +08:00
|
|
|
/*
|
|
|
|
* keep nr_to_write, since vfs uses this to
|
|
|
|
* get # of written pages.
|
|
|
|
*/
|
|
|
|
if (ret == AOP_WRITEPAGE_ACTIVATE) {
|
|
|
|
unlock_page(page);
|
|
|
|
ret = 0;
|
|
|
|
continue;
|
2017-06-22 08:52:39 +08:00
|
|
|
} else if (ret == -EAGAIN) {
|
|
|
|
ret = 0;
|
|
|
|
if (wbc->sync_mode == WB_SYNC_ALL) {
|
|
|
|
cond_resched();
|
|
|
|
congestion_wait(BLK_RW_ASYNC,
|
|
|
|
HZ/50);
|
|
|
|
goto retry_write;
|
|
|
|
}
|
|
|
|
continue;
|
2016-11-29 11:13:43 +08:00
|
|
|
}
|
2016-05-30 12:18:23 +08:00
|
|
|
done_index = page->index + 1;
|
|
|
|
done = 1;
|
|
|
|
break;
|
2017-02-04 09:44:04 +08:00
|
|
|
} else if (submitted) {
|
2018-09-27 23:41:16 +08:00
|
|
|
nwritten++;
|
f2fs: expose f2fs_write_cache_pages
If there are gced dirty pages and normal dirty pages in the mapping
of one inode, we might writeback them alternately with discontinuous
block address, resulting in low performance.
This patch introduces f2fs_write_cache_pages with codes copied from
write_cache_pages in mm/page-writeback.c.
In this function, we refactor flow with two steps:
1) writeback all cold type pages.
2) writeback all non-cold type pages.
By using this method, f2fs will writeback dirty pages with the same
temperature in bunch mode, it makes writeouted block being with
more continuous address, so they can be merged as much as possible
in f2fs bio cache, and also it will reduce the chance of submiting
small IO from block layer.
Test environment: 8g nokia sd card (very old sd card, but it shows
better effect when testing with this patch, and with a 32g kingston
sd card, I didn't see much more improvement).
Test step:
1. touch testfile;
2. truncate -s 512K testfile;
3. write all pages with odd index;
4. trigger gc by ioctl;
5. write all pages with even index;
6. time fsync testfile.
before:
real 0m0.402s
user 0m0.000s
sys 0m0.000s
after:
real 0m0.143s
user 0m0.004s
sys 0m0.004s
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2015-07-14 18:56:10 +08:00
|
|
|
}
|
|
|
|
|
2018-05-23 22:25:09 +08:00
|
|
|
if (--wbc->nr_to_write <= 0 &&
|
2017-03-29 09:07:38 +08:00
|
|
|
wbc->sync_mode == WB_SYNC_NONE) {
|
f2fs: expose f2fs_write_cache_pages
If there are gced dirty pages and normal dirty pages in the mapping
of one inode, we might writeback them alternately with discontinuous
block address, resulting in low performance.
This patch introduces f2fs_write_cache_pages with codes copied from
write_cache_pages in mm/page-writeback.c.
In this function, we refactor flow with two steps:
1) writeback all cold type pages.
2) writeback all non-cold type pages.
By using this method, f2fs will writeback dirty pages with the same
temperature in bunch mode, it makes writeouted block being with
more continuous address, so they can be merged as much as possible
in f2fs bio cache, and also it will reduce the chance of submiting
small IO from block layer.
Test environment: 8g nokia sd card (very old sd card, but it shows
better effect when testing with this patch, and with a 32g kingston
sd card, I didn't see much more improvement).
Test step:
1. touch testfile;
2. truncate -s 512K testfile;
3. write all pages with odd index;
4. trigger gc by ioctl;
5. write all pages with even index;
6. time fsync testfile.
before:
real 0m0.402s
user 0m0.000s
sys 0m0.000s
after:
real 0m0.143s
user 0m0.004s
sys 0m0.004s
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2015-07-14 18:56:10 +08:00
|
|
|
done = 1;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
pagevec_release(&pvec);
|
|
|
|
cond_resched();
|
|
|
|
}
|
|
|
|
|
|
|
|
if (!cycled && !done) {
|
|
|
|
cycled = 1;
|
|
|
|
index = 0;
|
|
|
|
end = writeback_index - 1;
|
|
|
|
goto retry;
|
|
|
|
}
|
|
|
|
if (wbc->range_cyclic || (range_whole && wbc->nr_to_write > 0))
|
|
|
|
mapping->writeback_index = done_index;
|
|
|
|
|
2018-09-27 23:41:16 +08:00
|
|
|
if (nwritten)
|
2017-05-11 02:28:38 +08:00
|
|
|
f2fs_submit_merged_write_cond(F2FS_M_SB(mapping), mapping->host,
|
2018-09-27 23:41:16 +08:00
|
|
|
NULL, 0, DATA);
|
f2fs: add bio cache for IPU
SQLite in Wal mode may trigger sequential IPU write in db-wal file, after
commit d1b3e72d5490 ("f2fs: submit bio of in-place-update pages"), we
lost the chance of merging page in inner managed bio cache, result in
submitting more small-sized IO.
So let's add temporary bio in writepages() to cache mergeable write IO as
much as possible.
Test case:
1. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
2. xfs_io -f /mnt/f2fs/file -c "pwrite 0 65536" -c "fsync"
Before:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65552, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65560, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65568, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65576, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65584, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65592, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65600, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65608, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65616, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65624, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65632, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65640, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65648, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65656, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65664, size = 4096
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57352, size = 4096
After:
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), DATA, sector = 65544, size = 65536
f2fs_submit_write_bio: dev = (251,0)/(251,0), rw = WRITE(S), NODE, sector = 57368, size = 4096
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-02-19 16:15:29 +08:00
|
|
|
/* submit cached bio of IPU write */
|
|
|
|
if (bio)
|
|
|
|
__submit_bio(sbi, bio, DATA);
|
2016-09-29 18:50:11 +08:00
|
|
|
|
f2fs: expose f2fs_write_cache_pages
If there are gced dirty pages and normal dirty pages in the mapping
of one inode, we might writeback them alternately with discontinuous
block address, resulting in low performance.
This patch introduces f2fs_write_cache_pages with codes copied from
write_cache_pages in mm/page-writeback.c.
In this function, we refactor flow with two steps:
1) writeback all cold type pages.
2) writeback all non-cold type pages.
By using this method, f2fs will writeback dirty pages with the same
temperature in bunch mode, it makes writeouted block being with
more continuous address, so they can be merged as much as possible
in f2fs bio cache, and also it will reduce the chance of submiting
small IO from block layer.
Test environment: 8g nokia sd card (very old sd card, but it shows
better effect when testing with this patch, and with a 32g kingston
sd card, I didn't see much more improvement).
Test step:
1. touch testfile;
2. truncate -s 512K testfile;
3. write all pages with odd index;
4. trigger gc by ioctl;
5. write all pages with even index;
6. time fsync testfile.
before:
real 0m0.402s
user 0m0.000s
sys 0m0.000s
after:
real 0m0.143s
user 0m0.004s
sys 0m0.004s
Signed-off-by: Chao Yu <chao2.yu@samsung.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2015-07-14 18:56:10 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2018-08-10 08:53:34 +08:00
|
|
|
static inline bool __should_serialize_io(struct inode *inode,
|
|
|
|
struct writeback_control *wbc)
|
|
|
|
{
|
|
|
|
if (!S_ISREG(inode->i_mode))
|
|
|
|
return false;
|
f2fs: guarantee journalled quota data by checkpoint
For journalled quota mode, let checkpoint to flush dquot dirty data
and quota file data to guarntee persistence of all quota sysfile in
last checkpoint, by this way, we can avoid corrupting quota sysfile
when encountering SPO.
The implementation is as below:
1. add a global state SBI_QUOTA_NEED_FLUSH to indicate that there is
cached dquot metadata changes in quota subsystem, and later checkpoint
should:
a) flush dquot metadata into quota file.
b) flush quota file to storage to keep file usage be consistent.
2. add a global state SBI_QUOTA_NEED_REPAIR to indicate that quota
operation failed due to -EIO or -ENOSPC, so later,
a) checkpoint will skip syncing dquot metadata.
b) CP_QUOTA_NEED_FSCK_FLAG will be set in last cp pack to give a
hint for fsck repairing.
3. add a global state SBI_QUOTA_SKIP_FLUSH, in checkpoint, if quota
data updating is very heavy, it may cause hungtask in block_operation().
To avoid this, if our retry time exceed threshold, let's just skip
flushing and retry in next checkpoint().
Signed-off-by: Weichao Guo <guoweichao@huawei.com>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
[Jaegeuk Kim: avoid warnings and set fsck flag]
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-09-20 20:05:00 +08:00
|
|
|
if (IS_NOQUOTA(inode))
|
|
|
|
return false;
|
2019-05-20 17:36:59 +08:00
|
|
|
/* to avoid deadlock in path of data flush */
|
|
|
|
if (F2FS_I(inode)->cp_task)
|
|
|
|
return false;
|
2018-08-10 08:53:34 +08:00
|
|
|
if (wbc->sync_mode != WB_SYNC_ALL)
|
|
|
|
return true;
|
|
|
|
if (get_dirty_pages(inode) >= SM_I(F2FS_I_SB(inode))->min_seq_blocks)
|
|
|
|
return true;
|
|
|
|
return false;
|
|
|
|
}
|
|
|
|
|
2018-05-30 00:20:39 +08:00
|
|
|
static int __f2fs_write_data_pages(struct address_space *mapping,
|
2017-08-02 23:21:48 +08:00
|
|
|
struct writeback_control *wbc,
|
|
|
|
enum iostat_type io_type)
|
2012-11-02 16:10:12 +08:00
|
|
|
{
|
|
|
|
struct inode *inode = mapping->host;
|
2014-09-03 06:31:18 +08:00
|
|
|
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
|
2016-07-14 10:33:19 +08:00
|
|
|
struct blk_plug plug;
|
2012-11-02 16:10:12 +08:00
|
|
|
int ret;
|
2018-08-10 08:53:34 +08:00
|
|
|
bool locked = false;
|
2012-11-02 16:10:12 +08:00
|
|
|
|
2013-04-03 10:38:00 +08:00
|
|
|
/* deal with chardevs and other special file */
|
|
|
|
if (!mapping->a_ops->writepage)
|
|
|
|
return 0;
|
|
|
|
|
2015-07-17 18:02:39 +08:00
|
|
|
/* skip writing if there is no dirty page in this inode */
|
|
|
|
if (!get_dirty_pages(inode) && wbc->sync_mode == WB_SYNC_NONE)
|
|
|
|
return 0;
|
|
|
|
|
2017-06-29 23:20:45 +08:00
|
|
|
/* during POR, we don't need to trigger writepage at all. */
|
|
|
|
if (unlikely(is_sbi_flag_set(sbi, SBI_POR_DOING)))
|
|
|
|
goto skip_write;
|
|
|
|
|
f2fs: guarantee journalled quota data by checkpoint
For journalled quota mode, let checkpoint to flush dquot dirty data
and quota file data to guarntee persistence of all quota sysfile in
last checkpoint, by this way, we can avoid corrupting quota sysfile
when encountering SPO.
The implementation is as below:
1. add a global state SBI_QUOTA_NEED_FLUSH to indicate that there is
cached dquot metadata changes in quota subsystem, and later checkpoint
should:
a) flush dquot metadata into quota file.
b) flush quota file to storage to keep file usage be consistent.
2. add a global state SBI_QUOTA_NEED_REPAIR to indicate that quota
operation failed due to -EIO or -ENOSPC, so later,
a) checkpoint will skip syncing dquot metadata.
b) CP_QUOTA_NEED_FSCK_FLAG will be set in last cp pack to give a
hint for fsck repairing.
3. add a global state SBI_QUOTA_SKIP_FLUSH, in checkpoint, if quota
data updating is very heavy, it may cause hungtask in block_operation().
To avoid this, if our retry time exceed threshold, let's just skip
flushing and retry in next checkpoint().
Signed-off-by: Weichao Guo <guoweichao@huawei.com>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
[Jaegeuk Kim: avoid warnings and set fsck flag]
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-09-20 20:05:00 +08:00
|
|
|
if ((S_ISDIR(inode->i_mode) || IS_NOQUOTA(inode)) &&
|
|
|
|
wbc->sync_mode == WB_SYNC_NONE &&
|
2015-10-09 01:40:07 +08:00
|
|
|
get_dirty_pages(inode) < nr_pages_to_skip(sbi, DATA) &&
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
f2fs_available_free_memory(sbi, DIRTY_DENTS))
|
2015-10-09 01:40:07 +08:00
|
|
|
goto skip_write;
|
|
|
|
|
2015-10-27 09:53:45 +08:00
|
|
|
/* skip writing during file defragment */
|
2016-05-21 01:13:22 +08:00
|
|
|
if (is_inode_flag_set(inode, FI_DO_DEFRAG))
|
2015-10-27 09:53:45 +08:00
|
|
|
goto skip_write;
|
|
|
|
|
2016-02-04 16:14:00 +08:00
|
|
|
trace_f2fs_writepages(mapping->host, wbc, DATA);
|
|
|
|
|
2017-03-29 09:07:38 +08:00
|
|
|
/* to avoid spliting IOs due to mixed WB_SYNC_ALL and WB_SYNC_NONE */
|
|
|
|
if (wbc->sync_mode == WB_SYNC_ALL)
|
2018-06-04 23:20:36 +08:00
|
|
|
atomic_inc(&sbi->wb_sync_req[DATA]);
|
2024-06-12 13:13:20 +08:00
|
|
|
else if (atomic_read(&sbi->wb_sync_req[DATA])) {
|
|
|
|
/* to avoid potential deadlock */
|
|
|
|
if (current->plug)
|
|
|
|
blk_finish_plug(current->plug);
|
2017-03-29 09:07:38 +08:00
|
|
|
goto skip_write;
|
2024-06-12 13:13:20 +08:00
|
|
|
}
|
2017-03-29 09:07:38 +08:00
|
|
|
|
2018-08-10 08:53:34 +08:00
|
|
|
if (__should_serialize_io(inode, wbc)) {
|
|
|
|
mutex_lock(&sbi->writepages);
|
|
|
|
locked = true;
|
|
|
|
}
|
|
|
|
|
2016-07-14 10:33:19 +08:00
|
|
|
blk_start_plug(&plug);
|
2017-08-02 23:21:48 +08:00
|
|
|
ret = f2fs_write_cache_pages(mapping, wbc, io_type);
|
2016-07-14 10:33:19 +08:00
|
|
|
blk_finish_plug(&plug);
|
2017-03-29 09:07:38 +08:00
|
|
|
|
2018-08-10 08:53:34 +08:00
|
|
|
if (locked)
|
|
|
|
mutex_unlock(&sbi->writepages);
|
|
|
|
|
2017-03-29 09:07:38 +08:00
|
|
|
if (wbc->sync_mode == WB_SYNC_ALL)
|
2018-06-04 23:20:36 +08:00
|
|
|
atomic_dec(&sbi->wb_sync_req[DATA]);
|
2016-05-26 08:17:56 +08:00
|
|
|
/*
|
|
|
|
* if some pages were truncated, we cannot guarantee its mapping->host
|
|
|
|
* to detect pending bios.
|
|
|
|
*/
|
2013-12-11 12:54:01 +08:00
|
|
|
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
f2fs_remove_dirty_inode(inode);
|
2012-11-02 16:10:12 +08:00
|
|
|
return ret;
|
2014-03-18 12:43:05 +08:00
|
|
|
|
|
|
|
skip_write:
|
2014-09-13 06:53:45 +08:00
|
|
|
wbc->pages_skipped += get_dirty_pages(inode);
|
2016-02-04 16:14:00 +08:00
|
|
|
trace_f2fs_writepages(mapping->host, wbc, DATA);
|
2014-03-18 12:43:05 +08:00
|
|
|
return 0;
|
2012-11-02 16:10:12 +08:00
|
|
|
}
|
|
|
|
|
2017-08-02 23:21:48 +08:00
|
|
|
static int f2fs_write_data_pages(struct address_space *mapping,
|
|
|
|
struct writeback_control *wbc)
|
|
|
|
{
|
|
|
|
struct inode *inode = mapping->host;
|
|
|
|
|
|
|
|
return __f2fs_write_data_pages(mapping, wbc,
|
|
|
|
F2FS_I(inode)->cp_task == current ?
|
|
|
|
FS_CP_DATA_IO : FS_DATA_IO);
|
|
|
|
}
|
|
|
|
|
2014-07-02 13:25:04 +08:00
|
|
|
static void f2fs_write_failed(struct address_space *mapping, loff_t to)
|
|
|
|
{
|
|
|
|
struct inode *inode = mapping->host;
|
2015-12-29 05:48:11 +08:00
|
|
|
loff_t i_size = i_size_read(inode);
|
2014-07-02 13:25:04 +08:00
|
|
|
|
f2fs: add fs-verity support
Add fs-verity support to f2fs. fs-verity is a filesystem feature that
enables transparent integrity protection and authentication of read-only
files. It uses a dm-verity like mechanism at the file level: a Merkle
tree is used to verify any block in the file in log(filesize) time. It
is implemented mainly by helper functions in fs/verity/. See
Documentation/filesystems/fsverity.rst for the full documentation.
The f2fs support for fs-verity consists of:
- Adding a filesystem feature flag and an inode flag for fs-verity.
- Implementing the fsverity_operations to support enabling verity on an
inode and reading/writing the verity metadata.
- Updating ->readpages() to verify data as it's read from verity files
and to support reading verity metadata pages.
- Updating ->write_begin(), ->write_end(), and ->writepages() to support
writing verity metadata pages.
- Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl().
Like ext4, f2fs stores the verity metadata (Merkle tree and
fsverity_descriptor) past the end of the file, starting at the first 64K
boundary beyond i_size. This approach works because (a) verity files
are readonly, and (b) pages fully beyond i_size aren't visible to
userspace but can be read/written internally by f2fs with only some
relatively small changes to f2fs. Extended attributes cannot be used
because (a) f2fs limits the total size of an inode's xattr entries to
4096 bytes, which wouldn't be enough for even a single Merkle tree
block, and (b) f2fs encryption doesn't encrypt xattrs, yet the verity
metadata *must* be encrypted when the file is because it contains hashes
of the plaintext data.
Acked-by: Jaegeuk Kim <jaegeuk@kernel.org>
Acked-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
2019-07-23 00:26:24 +08:00
|
|
|
/* In the fs-verity case, f2fs_end_enable_verity() does the truncate */
|
|
|
|
if (to > i_size && !f2fs_verity_in_progress(inode)) {
|
2018-08-05 23:04:25 +08:00
|
|
|
down_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
|
2018-07-25 11:11:56 +08:00
|
|
|
down_write(&F2FS_I(inode)->i_mmap_sem);
|
2018-08-05 23:04:25 +08:00
|
|
|
|
2015-12-29 05:48:11 +08:00
|
|
|
truncate_pagecache(inode, i_size);
|
2019-02-02 17:33:01 +08:00
|
|
|
if (!IS_NOQUOTA(inode))
|
|
|
|
f2fs_truncate_blocks(inode, i_size, true);
|
2018-08-05 23:04:25 +08:00
|
|
|
|
2017-05-18 11:06:45 +08:00
|
|
|
up_write(&F2FS_I(inode)->i_mmap_sem);
|
2018-07-25 11:11:56 +08:00
|
|
|
up_write(&F2FS_I(inode)->i_gc_rwsem[WRITE]);
|
2014-07-02 13:25:04 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2015-12-24 03:55:18 +08:00
|
|
|
static int prepare_write_begin(struct f2fs_sb_info *sbi,
|
|
|
|
struct page *page, loff_t pos, unsigned len,
|
|
|
|
block_t *blk_addr, bool *node_changed)
|
|
|
|
{
|
|
|
|
struct inode *inode = page->mapping->host;
|
|
|
|
pgoff_t index = page->index;
|
|
|
|
struct dnode_of_data dn;
|
|
|
|
struct page *ipage;
|
2015-12-24 05:48:58 +08:00
|
|
|
bool locked = false;
|
2017-02-23 17:18:05 +08:00
|
|
|
struct extent_info ei = {0,0,0};
|
2015-12-24 03:55:18 +08:00
|
|
|
int err = 0;
|
2018-11-14 19:34:28 +08:00
|
|
|
int flag;
|
2015-12-24 03:55:18 +08:00
|
|
|
|
2016-02-04 05:49:44 +08:00
|
|
|
/*
|
|
|
|
* we already allocated all the blocks, so we don't need to get
|
|
|
|
* the block addresses when there is no need to fill the page.
|
|
|
|
*/
|
2017-01-14 05:12:29 +08:00
|
|
|
if (!f2fs_has_inline_data(inode) && len == PAGE_SIZE &&
|
f2fs: add fs-verity support
Add fs-verity support to f2fs. fs-verity is a filesystem feature that
enables transparent integrity protection and authentication of read-only
files. It uses a dm-verity like mechanism at the file level: a Merkle
tree is used to verify any block in the file in log(filesize) time. It
is implemented mainly by helper functions in fs/verity/. See
Documentation/filesystems/fsverity.rst for the full documentation.
The f2fs support for fs-verity consists of:
- Adding a filesystem feature flag and an inode flag for fs-verity.
- Implementing the fsverity_operations to support enabling verity on an
inode and reading/writing the verity metadata.
- Updating ->readpages() to verify data as it's read from verity files
and to support reading verity metadata pages.
- Updating ->write_begin(), ->write_end(), and ->writepages() to support
writing verity metadata pages.
- Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl().
Like ext4, f2fs stores the verity metadata (Merkle tree and
fsverity_descriptor) past the end of the file, starting at the first 64K
boundary beyond i_size. This approach works because (a) verity files
are readonly, and (b) pages fully beyond i_size aren't visible to
userspace but can be read/written internally by f2fs with only some
relatively small changes to f2fs. Extended attributes cannot be used
because (a) f2fs limits the total size of an inode's xattr entries to
4096 bytes, which wouldn't be enough for even a single Merkle tree
block, and (b) f2fs encryption doesn't encrypt xattrs, yet the verity
metadata *must* be encrypted when the file is because it contains hashes
of the plaintext data.
Acked-by: Jaegeuk Kim <jaegeuk@kernel.org>
Acked-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
2019-07-23 00:26:24 +08:00
|
|
|
!is_inode_flag_set(inode, FI_NO_PREALLOC) &&
|
|
|
|
!f2fs_verity_in_progress(inode))
|
2016-02-04 05:49:44 +08:00
|
|
|
return 0;
|
|
|
|
|
2018-11-14 19:34:28 +08:00
|
|
|
/* f2fs_lock_op avoids race between write CP and convert_inline_page */
|
|
|
|
if (f2fs_has_inline_data(inode) && pos + len > MAX_INLINE_DATA(inode))
|
|
|
|
flag = F2FS_GET_BLOCK_DEFAULT;
|
|
|
|
else
|
|
|
|
flag = F2FS_GET_BLOCK_PRE_AIO;
|
|
|
|
|
2015-12-24 05:48:58 +08:00
|
|
|
if (f2fs_has_inline_data(inode) ||
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
(pos & PAGE_MASK) >= i_size_read(inode)) {
|
2018-11-14 19:34:28 +08:00
|
|
|
__do_map_lock(sbi, flag, true);
|
2015-12-24 05:48:58 +08:00
|
|
|
locked = true;
|
|
|
|
}
|
|
|
|
restart:
|
2015-12-24 03:55:18 +08:00
|
|
|
/* check inline_data */
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
ipage = f2fs_get_node_page(sbi, inode->i_ino);
|
2015-12-24 03:55:18 +08:00
|
|
|
if (IS_ERR(ipage)) {
|
|
|
|
err = PTR_ERR(ipage);
|
|
|
|
goto unlock_out;
|
|
|
|
}
|
|
|
|
|
|
|
|
set_new_dnode(&dn, inode, ipage, ipage, 0);
|
|
|
|
|
|
|
|
if (f2fs_has_inline_data(inode)) {
|
2017-07-19 00:19:05 +08:00
|
|
|
if (pos + len <= MAX_INLINE_DATA(inode)) {
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
f2fs_do_read_inline_data(page, ipage);
|
2016-05-21 01:13:22 +08:00
|
|
|
set_inode_flag(inode, FI_DATA_EXIST);
|
2016-05-11 19:48:44 +08:00
|
|
|
if (inode->i_nlink)
|
|
|
|
set_inline_node(ipage);
|
2015-12-24 03:55:18 +08:00
|
|
|
} else {
|
|
|
|
err = f2fs_convert_inline_page(&dn, page);
|
|
|
|
if (err)
|
2015-12-24 05:48:58 +08:00
|
|
|
goto out;
|
|
|
|
if (dn.data_blkaddr == NULL_ADDR)
|
|
|
|
err = f2fs_get_block(&dn, index);
|
|
|
|
}
|
|
|
|
} else if (locked) {
|
|
|
|
err = f2fs_get_block(&dn, index);
|
|
|
|
} else {
|
|
|
|
if (f2fs_lookup_extent_cache(inode, index, &ei)) {
|
|
|
|
dn.data_blkaddr = ei.blk + index - ei.fofs;
|
|
|
|
} else {
|
|
|
|
/* hole case */
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
err = f2fs_get_dnode_of_data(&dn, index, LOOKUP_NODE);
|
2016-04-07 02:27:03 +08:00
|
|
|
if (err || dn.data_blkaddr == NULL_ADDR) {
|
2015-12-24 05:48:58 +08:00
|
|
|
f2fs_put_dnode(&dn);
|
2017-03-13 20:22:18 +08:00
|
|
|
__do_map_lock(sbi, F2FS_GET_BLOCK_PRE_AIO,
|
|
|
|
true);
|
2018-11-14 19:34:28 +08:00
|
|
|
WARN_ON(flag != F2FS_GET_BLOCK_PRE_AIO);
|
2015-12-24 05:48:58 +08:00
|
|
|
locked = true;
|
|
|
|
goto restart;
|
|
|
|
}
|
2015-12-24 03:55:18 +08:00
|
|
|
}
|
|
|
|
}
|
2015-12-24 05:48:58 +08:00
|
|
|
|
2015-12-24 03:55:18 +08:00
|
|
|
/* convert_inline_page can make node_changed */
|
|
|
|
*blk_addr = dn.data_blkaddr;
|
|
|
|
*node_changed = dn.node_changed;
|
2015-12-24 05:48:58 +08:00
|
|
|
out:
|
2015-12-24 03:55:18 +08:00
|
|
|
f2fs_put_dnode(&dn);
|
|
|
|
unlock_out:
|
2015-12-24 05:48:58 +08:00
|
|
|
if (locked)
|
2018-11-14 19:34:28 +08:00
|
|
|
__do_map_lock(sbi, flag, false);
|
2015-12-24 03:55:18 +08:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2012-11-02 16:10:12 +08:00
|
|
|
static int f2fs_write_begin(struct file *file, struct address_space *mapping,
|
|
|
|
loff_t pos, unsigned len, unsigned flags,
|
|
|
|
struct page **pagep, void **fsdata)
|
|
|
|
{
|
|
|
|
struct inode *inode = mapping->host;
|
2014-09-03 06:31:18 +08:00
|
|
|
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
|
2015-07-16 04:08:21 +08:00
|
|
|
struct page *page = NULL;
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
pgoff_t index = ((unsigned long long) pos) >> PAGE_SHIFT;
|
2018-01-15 17:16:46 +08:00
|
|
|
bool need_balance = false, drop_atomic = false;
|
2015-12-24 03:55:18 +08:00
|
|
|
block_t blkaddr = NULL_ADDR;
|
2012-11-02 16:10:12 +08:00
|
|
|
int err = 0;
|
|
|
|
|
2014-05-06 16:46:04 +08:00
|
|
|
trace_f2fs_write_begin(inode, pos, len, flags);
|
|
|
|
|
2019-08-23 17:58:36 +08:00
|
|
|
if (!f2fs_is_checkpoint_ready(sbi)) {
|
|
|
|
err = -ENOSPC;
|
2018-08-21 10:21:43 +08:00
|
|
|
goto fail;
|
2019-08-23 17:58:36 +08:00
|
|
|
}
|
2018-08-21 10:21:43 +08:00
|
|
|
|
2018-07-27 17:15:11 +08:00
|
|
|
if ((f2fs_is_atomic_file(inode) &&
|
|
|
|
!f2fs_available_free_memory(sbi, INMEM_PAGES)) ||
|
|
|
|
is_inode_flag_set(inode, FI_ATOMIC_REVOKE_REQUEST)) {
|
2017-10-19 10:05:57 +08:00
|
|
|
err = -ENOMEM;
|
2018-01-15 17:16:46 +08:00
|
|
|
drop_atomic = true;
|
2017-10-19 10:05:57 +08:00
|
|
|
goto fail;
|
|
|
|
}
|
|
|
|
|
2014-11-26 02:59:45 +08:00
|
|
|
/*
|
|
|
|
* We should check this at this moment to avoid deadlock on inode page
|
|
|
|
* and #0 page. The locking rule for inline_data conversion should be:
|
|
|
|
* lock_page(page #0) -> lock_page(inode_page)
|
|
|
|
*/
|
|
|
|
if (index != 0) {
|
|
|
|
err = f2fs_convert_inline_inode(inode);
|
|
|
|
if (err)
|
|
|
|
goto fail;
|
|
|
|
}
|
2013-04-26 10:55:17 +08:00
|
|
|
repeat:
|
2017-02-18 01:55:55 +08:00
|
|
|
/*
|
|
|
|
* Do not use grab_cache_page_write_begin() to avoid deadlock due to
|
|
|
|
* wait_for_stable_page. Will wait that below with our IO control.
|
|
|
|
*/
|
2017-10-28 16:52:30 +08:00
|
|
|
page = f2fs_pagecache_get_page(mapping, index,
|
2017-02-18 01:55:55 +08:00
|
|
|
FGP_LOCK | FGP_WRITE | FGP_CREAT, GFP_NOFS);
|
2014-07-02 13:25:04 +08:00
|
|
|
if (!page) {
|
|
|
|
err = -ENOMEM;
|
|
|
|
goto fail;
|
|
|
|
}
|
2014-04-30 08:22:45 +08:00
|
|
|
|
2012-11-02 16:10:12 +08:00
|
|
|
*pagep = page;
|
|
|
|
|
2015-12-24 03:55:18 +08:00
|
|
|
err = prepare_write_begin(sbi, page, pos, len,
|
|
|
|
&blkaddr, &need_balance);
|
2014-10-18 11:33:55 +08:00
|
|
|
if (err)
|
2015-12-24 03:55:18 +08:00
|
|
|
goto fail;
|
2014-10-18 11:33:55 +08:00
|
|
|
|
f2fs: guarantee journalled quota data by checkpoint
For journalled quota mode, let checkpoint to flush dquot dirty data
and quota file data to guarntee persistence of all quota sysfile in
last checkpoint, by this way, we can avoid corrupting quota sysfile
when encountering SPO.
The implementation is as below:
1. add a global state SBI_QUOTA_NEED_FLUSH to indicate that there is
cached dquot metadata changes in quota subsystem, and later checkpoint
should:
a) flush dquot metadata into quota file.
b) flush quota file to storage to keep file usage be consistent.
2. add a global state SBI_QUOTA_NEED_REPAIR to indicate that quota
operation failed due to -EIO or -ENOSPC, so later,
a) checkpoint will skip syncing dquot metadata.
b) CP_QUOTA_NEED_FSCK_FLAG will be set in last cp pack to give a
hint for fsck repairing.
3. add a global state SBI_QUOTA_SKIP_FLUSH, in checkpoint, if quota
data updating is very heavy, it may cause hungtask in block_operation().
To avoid this, if our retry time exceed threshold, let's just skip
flushing and retry in next checkpoint().
Signed-off-by: Weichao Guo <guoweichao@huawei.com>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
[Jaegeuk Kim: avoid warnings and set fsck flag]
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-09-20 20:05:00 +08:00
|
|
|
if (need_balance && !IS_NOQUOTA(inode) &&
|
|
|
|
has_not_enough_free_secs(sbi, 0, 0)) {
|
2015-12-23 05:23:35 +08:00
|
|
|
unlock_page(page);
|
2016-01-08 06:15:04 +08:00
|
|
|
f2fs_balance_fs(sbi, true);
|
2015-12-23 05:23:35 +08:00
|
|
|
lock_page(page);
|
|
|
|
if (page->mapping != mapping) {
|
|
|
|
/* The page got truncated from under us */
|
|
|
|
f2fs_put_page(page, 1);
|
|
|
|
goto repeat;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2018-12-25 17:43:42 +08:00
|
|
|
f2fs_wait_on_page_writeback(page, DATA, false, true);
|
2014-10-24 10:48:09 +08:00
|
|
|
|
2016-09-07 02:02:03 +08:00
|
|
|
if (len == PAGE_SIZE || PageUptodate(page))
|
|
|
|
return 0;
|
2012-11-02 16:10:12 +08:00
|
|
|
|
f2fs: add fs-verity support
Add fs-verity support to f2fs. fs-verity is a filesystem feature that
enables transparent integrity protection and authentication of read-only
files. It uses a dm-verity like mechanism at the file level: a Merkle
tree is used to verify any block in the file in log(filesize) time. It
is implemented mainly by helper functions in fs/verity/. See
Documentation/filesystems/fsverity.rst for the full documentation.
The f2fs support for fs-verity consists of:
- Adding a filesystem feature flag and an inode flag for fs-verity.
- Implementing the fsverity_operations to support enabling verity on an
inode and reading/writing the verity metadata.
- Updating ->readpages() to verify data as it's read from verity files
and to support reading verity metadata pages.
- Updating ->write_begin(), ->write_end(), and ->writepages() to support
writing verity metadata pages.
- Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl().
Like ext4, f2fs stores the verity metadata (Merkle tree and
fsverity_descriptor) past the end of the file, starting at the first 64K
boundary beyond i_size. This approach works because (a) verity files
are readonly, and (b) pages fully beyond i_size aren't visible to
userspace but can be read/written internally by f2fs with only some
relatively small changes to f2fs. Extended attributes cannot be used
because (a) f2fs limits the total size of an inode's xattr entries to
4096 bytes, which wouldn't be enough for even a single Merkle tree
block, and (b) f2fs encryption doesn't encrypt xattrs, yet the verity
metadata *must* be encrypted when the file is because it contains hashes
of the plaintext data.
Acked-by: Jaegeuk Kim <jaegeuk@kernel.org>
Acked-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
2019-07-23 00:26:24 +08:00
|
|
|
if (!(pos & (PAGE_SIZE - 1)) && (pos + len) >= i_size_read(inode) &&
|
|
|
|
!f2fs_verity_in_progress(inode)) {
|
2016-12-20 11:11:35 +08:00
|
|
|
zero_user_segment(page, len, PAGE_SIZE);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2015-12-24 03:55:18 +08:00
|
|
|
if (blkaddr == NEW_ADDR) {
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
zero_user_segment(page, 0, PAGE_SIZE);
|
2016-09-07 02:02:03 +08:00
|
|
|
SetPageUptodate(page);
|
2012-11-02 16:10:12 +08:00
|
|
|
} else {
|
f2fs: introduce DATA_GENERIC_ENHANCE
Previously, f2fs_is_valid_blkaddr(, blkaddr, DATA_GENERIC) will check
whether @blkaddr locates in main area or not.
That check is weak, since the block address in range of main area can
point to the address which is not valid in segment info table, and we
can not detect such condition, we may suffer worse corruption as system
continues running.
So this patch introduce DATA_GENERIC_ENHANCE to enhance the sanity check
which trigger SIT bitmap check rather than only range check.
This patch did below changes as wel:
- set SBI_NEED_FSCK in f2fs_is_valid_blkaddr().
- get rid of is_valid_data_blkaddr() to avoid panic if blkaddr is invalid.
- introduce verify_fio_blkaddr() to wrap fio {new,old}_blkaddr validation check.
- spread blkaddr check in:
* f2fs_get_node_info()
* __read_out_blkaddrs()
* f2fs_submit_page_read()
* ra_data_block()
* do_recover_data()
This patch can fix bug reported from bugzilla below:
https://bugzilla.kernel.org/show_bug.cgi?id=203215
https://bugzilla.kernel.org/show_bug.cgi?id=203223
https://bugzilla.kernel.org/show_bug.cgi?id=203231
https://bugzilla.kernel.org/show_bug.cgi?id=203235
https://bugzilla.kernel.org/show_bug.cgi?id=203241
= Update by Jaegeuk Kim =
DATA_GENERIC_ENHANCE enhanced to validate block addresses on read/write paths.
But, xfstest/generic/446 compalins some generated kernel messages saying invalid
bitmap was detected when reading a block. The reaons is, when we get the
block addresses from extent_cache, there is no lock to synchronize it from
truncating the blocks in parallel.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-04-15 15:26:32 +08:00
|
|
|
if (!f2fs_is_valid_blkaddr(sbi, blkaddr,
|
|
|
|
DATA_GENERIC_ENHANCE_READ)) {
|
2019-06-20 11:36:14 +08:00
|
|
|
err = -EFSCORRUPTED;
|
f2fs: introduce DATA_GENERIC_ENHANCE
Previously, f2fs_is_valid_blkaddr(, blkaddr, DATA_GENERIC) will check
whether @blkaddr locates in main area or not.
That check is weak, since the block address in range of main area can
point to the address which is not valid in segment info table, and we
can not detect such condition, we may suffer worse corruption as system
continues running.
So this patch introduce DATA_GENERIC_ENHANCE to enhance the sanity check
which trigger SIT bitmap check rather than only range check.
This patch did below changes as wel:
- set SBI_NEED_FSCK in f2fs_is_valid_blkaddr().
- get rid of is_valid_data_blkaddr() to avoid panic if blkaddr is invalid.
- introduce verify_fio_blkaddr() to wrap fio {new,old}_blkaddr validation check.
- spread blkaddr check in:
* f2fs_get_node_info()
* __read_out_blkaddrs()
* f2fs_submit_page_read()
* ra_data_block()
* do_recover_data()
This patch can fix bug reported from bugzilla below:
https://bugzilla.kernel.org/show_bug.cgi?id=203215
https://bugzilla.kernel.org/show_bug.cgi?id=203223
https://bugzilla.kernel.org/show_bug.cgi?id=203231
https://bugzilla.kernel.org/show_bug.cgi?id=203235
https://bugzilla.kernel.org/show_bug.cgi?id=203241
= Update by Jaegeuk Kim =
DATA_GENERIC_ENHANCE enhanced to validate block addresses on read/write paths.
But, xfstest/generic/446 compalins some generated kernel messages saying invalid
bitmap was detected when reading a block. The reaons is, when we get the
block addresses from extent_cache, there is no lock to synchronize it from
truncating the blocks in parallel.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2019-04-15 15:26:32 +08:00
|
|
|
goto fail;
|
|
|
|
}
|
2017-09-07 12:04:44 +08:00
|
|
|
err = f2fs_submit_page_read(inode, page, blkaddr);
|
|
|
|
if (err)
|
2016-07-03 22:05:11 +08:00
|
|
|
goto fail;
|
2014-03-29 15:30:40 +08:00
|
|
|
|
2013-03-08 20:29:23 +08:00
|
|
|
lock_page(page);
|
2013-12-06 14:00:58 +08:00
|
|
|
if (unlikely(page->mapping != mapping)) {
|
2013-04-26 10:55:17 +08:00
|
|
|
f2fs_put_page(page, 1);
|
|
|
|
goto repeat;
|
2012-11-02 16:10:12 +08:00
|
|
|
}
|
2016-07-03 22:05:12 +08:00
|
|
|
if (unlikely(!PageUptodate(page))) {
|
|
|
|
err = -EIO;
|
|
|
|
goto fail;
|
2015-04-24 03:04:33 +08:00
|
|
|
}
|
2012-11-02 16:10:12 +08:00
|
|
|
}
|
|
|
|
return 0;
|
2014-10-18 11:33:55 +08:00
|
|
|
|
2014-07-02 13:25:04 +08:00
|
|
|
fail:
|
2015-07-16 04:08:21 +08:00
|
|
|
f2fs_put_page(page, 1);
|
2014-07-02 13:25:04 +08:00
|
|
|
f2fs_write_failed(mapping, pos + len);
|
2018-01-15 17:16:46 +08:00
|
|
|
if (drop_atomic)
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
f2fs_drop_inmem_pages_all(sbi, false);
|
2014-07-02 13:25:04 +08:00
|
|
|
return err;
|
2012-11-02 16:10:12 +08:00
|
|
|
}
|
|
|
|
|
2013-06-27 12:04:08 +08:00
|
|
|
static int f2fs_write_end(struct file *file,
|
|
|
|
struct address_space *mapping,
|
|
|
|
loff_t pos, unsigned len, unsigned copied,
|
|
|
|
struct page *page, void *fsdata)
|
|
|
|
{
|
|
|
|
struct inode *inode = page->mapping->host;
|
|
|
|
|
2014-05-06 16:47:23 +08:00
|
|
|
trace_f2fs_write_end(inode, pos, len, copied);
|
|
|
|
|
2016-09-07 02:02:03 +08:00
|
|
|
/*
|
|
|
|
* This should be come from len == PAGE_SIZE, and we expect copied
|
|
|
|
* should be PAGE_SIZE. Otherwise, we treat it with zero copied and
|
|
|
|
* let generic_perform_write() try to copy data again through copied=0.
|
|
|
|
*/
|
|
|
|
if (!PageUptodate(page)) {
|
2016-12-20 11:11:35 +08:00
|
|
|
if (unlikely(copied != len))
|
2016-09-07 02:02:03 +08:00
|
|
|
copied = 0;
|
|
|
|
else
|
|
|
|
SetPageUptodate(page);
|
|
|
|
}
|
|
|
|
if (!copied)
|
|
|
|
goto unlock_out;
|
|
|
|
|
2014-10-10 04:19:53 +08:00
|
|
|
set_page_dirty(page);
|
2013-06-27 12:04:08 +08:00
|
|
|
|
f2fs: add fs-verity support
Add fs-verity support to f2fs. fs-verity is a filesystem feature that
enables transparent integrity protection and authentication of read-only
files. It uses a dm-verity like mechanism at the file level: a Merkle
tree is used to verify any block in the file in log(filesize) time. It
is implemented mainly by helper functions in fs/verity/. See
Documentation/filesystems/fsverity.rst for the full documentation.
The f2fs support for fs-verity consists of:
- Adding a filesystem feature flag and an inode flag for fs-verity.
- Implementing the fsverity_operations to support enabling verity on an
inode and reading/writing the verity metadata.
- Updating ->readpages() to verify data as it's read from verity files
and to support reading verity metadata pages.
- Updating ->write_begin(), ->write_end(), and ->writepages() to support
writing verity metadata pages.
- Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl().
Like ext4, f2fs stores the verity metadata (Merkle tree and
fsverity_descriptor) past the end of the file, starting at the first 64K
boundary beyond i_size. This approach works because (a) verity files
are readonly, and (b) pages fully beyond i_size aren't visible to
userspace but can be read/written internally by f2fs with only some
relatively small changes to f2fs. Extended attributes cannot be used
because (a) f2fs limits the total size of an inode's xattr entries to
4096 bytes, which wouldn't be enough for even a single Merkle tree
block, and (b) f2fs encryption doesn't encrypt xattrs, yet the verity
metadata *must* be encrypted when the file is because it contains hashes
of the plaintext data.
Acked-by: Jaegeuk Kim <jaegeuk@kernel.org>
Acked-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
2019-07-23 00:26:24 +08:00
|
|
|
if (pos + copied > i_size_read(inode) &&
|
|
|
|
!f2fs_verity_in_progress(inode))
|
2016-05-21 00:22:03 +08:00
|
|
|
f2fs_i_size_write(inode, pos + copied);
|
2016-09-07 02:02:03 +08:00
|
|
|
unlock_out:
|
2016-08-06 21:09:41 +08:00
|
|
|
f2fs_put_page(page, 1);
|
2016-01-09 08:57:48 +08:00
|
|
|
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
|
2013-06-27 12:04:08 +08:00
|
|
|
return copied;
|
|
|
|
}
|
|
|
|
|
2015-03-16 19:33:52 +08:00
|
|
|
static int check_direct_IO(struct inode *inode, struct iov_iter *iter,
|
|
|
|
loff_t offset)
|
2013-12-26 19:15:09 +08:00
|
|
|
{
|
2018-06-30 09:55:12 +08:00
|
|
|
unsigned i_blkbits = READ_ONCE(inode->i_blkbits);
|
|
|
|
unsigned blkbits = i_blkbits;
|
|
|
|
unsigned blocksize_mask = (1 << blkbits) - 1;
|
|
|
|
unsigned long align = offset | iov_iter_alignment(iter);
|
|
|
|
struct block_device *bdev = inode->i_sb->s_bdev;
|
|
|
|
|
2024-06-11 20:26:44 +08:00
|
|
|
if (iov_iter_rw(iter) == READ && offset >= i_size_read(inode))
|
|
|
|
return 1;
|
|
|
|
|
2018-06-30 09:55:12 +08:00
|
|
|
if (align & blocksize_mask) {
|
|
|
|
if (bdev)
|
|
|
|
blkbits = blksize_bits(bdev_logical_block_size(bdev));
|
|
|
|
blocksize_mask = (1 << blkbits) - 1;
|
|
|
|
if (align & blocksize_mask)
|
|
|
|
return -EINVAL;
|
|
|
|
return 1;
|
|
|
|
}
|
2013-12-26 19:15:09 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2018-11-12 00:46:46 +08:00
|
|
|
static void f2fs_dio_end_io(struct bio *bio)
|
|
|
|
{
|
|
|
|
struct f2fs_private_dio *dio = bio->bi_private;
|
|
|
|
|
|
|
|
dec_page_count(F2FS_I_SB(dio->inode),
|
|
|
|
dio->write ? F2FS_DIO_WRITE : F2FS_DIO_READ);
|
|
|
|
|
|
|
|
bio->bi_private = dio->orig_private;
|
|
|
|
bio->bi_end_io = dio->orig_end_io;
|
|
|
|
|
2018-12-14 10:38:33 +08:00
|
|
|
kvfree(dio);
|
2018-11-12 00:46:46 +08:00
|
|
|
|
|
|
|
bio_endio(bio);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void f2fs_dio_submit_bio(struct bio *bio, struct inode *inode,
|
|
|
|
loff_t file_offset)
|
|
|
|
{
|
|
|
|
struct f2fs_private_dio *dio;
|
|
|
|
bool write = (bio_op(bio) == REQ_OP_WRITE);
|
|
|
|
|
|
|
|
dio = f2fs_kzalloc(F2FS_I_SB(inode),
|
|
|
|
sizeof(struct f2fs_private_dio), GFP_NOFS);
|
2019-01-04 09:38:29 +08:00
|
|
|
if (!dio)
|
2018-11-12 00:46:46 +08:00
|
|
|
goto out;
|
|
|
|
|
|
|
|
dio->inode = inode;
|
|
|
|
dio->orig_end_io = bio->bi_end_io;
|
|
|
|
dio->orig_private = bio->bi_private;
|
|
|
|
dio->write = write;
|
|
|
|
|
|
|
|
bio->bi_end_io = f2fs_dio_end_io;
|
|
|
|
bio->bi_private = dio;
|
|
|
|
|
|
|
|
inc_page_count(F2FS_I_SB(inode),
|
|
|
|
write ? F2FS_DIO_WRITE : F2FS_DIO_READ);
|
|
|
|
|
|
|
|
submit_bio(bio);
|
|
|
|
return;
|
|
|
|
out:
|
|
|
|
bio->bi_status = BLK_STS_IOERR;
|
|
|
|
bio_endio(bio);
|
|
|
|
}
|
|
|
|
|
2016-04-07 23:51:58 +08:00
|
|
|
static ssize_t f2fs_direct_IO(struct kiocb *iocb, struct iov_iter *iter)
|
2012-11-02 16:10:12 +08:00
|
|
|
{
|
2016-02-04 05:09:09 +08:00
|
|
|
struct address_space *mapping = iocb->ki_filp->f_mapping;
|
2014-07-02 13:25:04 +08:00
|
|
|
struct inode *inode = mapping->host;
|
2018-01-31 10:36:57 +08:00
|
|
|
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
|
2018-09-27 18:34:52 +08:00
|
|
|
struct f2fs_inode_info *fi = F2FS_I(inode);
|
2014-07-02 13:25:04 +08:00
|
|
|
size_t count = iov_iter_count(iter);
|
2016-04-07 23:51:58 +08:00
|
|
|
loff_t offset = iocb->ki_pos;
|
2016-07-13 09:18:29 +08:00
|
|
|
int rw = iov_iter_rw(iter);
|
2014-07-02 13:25:04 +08:00
|
|
|
int err;
|
2018-01-31 10:36:57 +08:00
|
|
|
enum rw_hint hint = iocb->ki_hint;
|
2018-03-08 14:22:56 +08:00
|
|
|
int whint_mode = F2FS_OPTION(sbi).whint_mode;
|
2018-09-27 18:34:52 +08:00
|
|
|
bool do_opu;
|
2013-12-26 19:15:09 +08:00
|
|
|
|
2016-02-04 05:09:09 +08:00
|
|
|
err = check_direct_IO(inode, iter, offset);
|
2015-12-23 03:09:35 +08:00
|
|
|
if (err)
|
2018-06-30 09:55:12 +08:00
|
|
|
return err < 0 ? err : 0;
|
f2fs: handle inline data operations
Hook inline data read/write, truncate, fallocate, setattr, etc.
Files need meet following 2 requirement to inline:
1) file size is not greater than MAX_INLINE_DATA;
2) file doesn't pre-allocate data blocks by fallocate().
FI_INLINE_DATA will not be set while creating a new regular inode because
most of the files are bigger than ~3.4K. Set FI_INLINE_DATA only when
data is submitted to block layer, ranther than set it while creating a new
inode, this also avoids converting data from inline to normal data block
and vice versa.
While writting inline data to inode block, the first data block should be
released if the file has a block indexed by i_addr[0].
On the other hand, when a file operation is appied to a file with inline
data, we need to test if this file can remain inline by doing this
operation, otherwise it should be convert into normal file by reserving
a new data block, copying inline data to this new block and clear
FI_INLINE_DATA flag. Because reserve a new data block here will make use
of i_addr[0], if we save inline data in i_addr[0..872], then the first
4 bytes would be overwriten. This problem can be avoided simply by
not using i_addr[0] for inline data.
Signed-off-by: Huajun Li <huajun.li@intel.com>
Signed-off-by: Haicheng Li <haicheng.li@linux.intel.com>
Signed-off-by: Weihong Xu <weihong.xu@intel.com>
Signed-off-by: Jaegeuk Kim <jaegeuk.kim@samsung.com>
2013-11-10 23:13:20 +08:00
|
|
|
|
2018-09-27 18:34:52 +08:00
|
|
|
if (f2fs_force_buffered_io(inode, iocb, iter))
|
2016-06-04 10:29:38 +08:00
|
|
|
return 0;
|
2015-04-22 11:39:58 +08:00
|
|
|
|
2018-09-27 18:34:52 +08:00
|
|
|
do_opu = allow_outplace_dio(inode, iocb, iter);
|
|
|
|
|
2016-07-23 06:25:47 +08:00
|
|
|
trace_f2fs_direct_IO_enter(inode, offset, count, rw);
|
2014-07-31 21:11:22 +08:00
|
|
|
|
2018-01-31 10:36:57 +08:00
|
|
|
if (rw == WRITE && whint_mode == WHINT_MODE_OFF)
|
|
|
|
iocb->ki_hint = WRITE_LIFE_NOT_SET;
|
|
|
|
|
2018-09-27 18:34:52 +08:00
|
|
|
if (iocb->ki_flags & IOCB_NOWAIT) {
|
|
|
|
if (!down_read_trylock(&fi->i_gc_rwsem[rw])) {
|
|
|
|
iocb->ki_hint = hint;
|
|
|
|
err = -EAGAIN;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
if (do_opu && !down_read_trylock(&fi->i_gc_rwsem[READ])) {
|
|
|
|
up_read(&fi->i_gc_rwsem[rw]);
|
2018-03-08 18:34:38 +08:00
|
|
|
iocb->ki_hint = hint;
|
|
|
|
err = -EAGAIN;
|
|
|
|
goto out;
|
|
|
|
}
|
2018-09-27 18:34:52 +08:00
|
|
|
} else {
|
|
|
|
down_read(&fi->i_gc_rwsem[rw]);
|
|
|
|
if (do_opu)
|
|
|
|
down_read(&fi->i_gc_rwsem[READ]);
|
2018-03-08 18:34:38 +08:00
|
|
|
}
|
|
|
|
|
2018-11-12 00:46:46 +08:00
|
|
|
err = __blockdev_direct_IO(iocb, inode, inode->i_sb->s_bdev,
|
2018-11-13 14:33:45 +08:00
|
|
|
iter, rw == WRITE ? get_data_block_dio_write :
|
|
|
|
get_data_block_dio, NULL, f2fs_dio_submit_bio,
|
2018-11-12 00:46:46 +08:00
|
|
|
DIO_LOCKING | DIO_SKIP_HOLES);
|
2018-09-27 18:34:52 +08:00
|
|
|
|
|
|
|
if (do_opu)
|
|
|
|
up_read(&fi->i_gc_rwsem[READ]);
|
|
|
|
|
|
|
|
up_read(&fi->i_gc_rwsem[rw]);
|
2016-07-13 09:18:29 +08:00
|
|
|
|
|
|
|
if (rw == WRITE) {
|
2018-01-31 10:36:57 +08:00
|
|
|
if (whint_mode == WHINT_MODE_OFF)
|
|
|
|
iocb->ki_hint = hint;
|
2017-08-02 23:21:48 +08:00
|
|
|
if (err > 0) {
|
|
|
|
f2fs_update_iostat(F2FS_I_SB(inode), APP_DIRECT_IO,
|
|
|
|
err);
|
2018-09-27 18:34:52 +08:00
|
|
|
if (!do_opu)
|
|
|
|
set_inode_flag(inode, FI_UPDATE_WRITE);
|
2017-08-02 23:21:48 +08:00
|
|
|
} else if (err < 0) {
|
2016-04-19 05:07:44 +08:00
|
|
|
f2fs_write_failed(mapping, offset + count);
|
2017-08-02 23:21:48 +08:00
|
|
|
}
|
2016-04-19 05:07:44 +08:00
|
|
|
}
|
2014-07-31 21:11:22 +08:00
|
|
|
|
2018-03-08 18:34:38 +08:00
|
|
|
out:
|
2016-07-23 06:25:47 +08:00
|
|
|
trace_f2fs_direct_IO_exit(inode, offset, count, rw, err);
|
2014-07-31 21:11:22 +08:00
|
|
|
|
2014-07-02 13:25:04 +08:00
|
|
|
return err;
|
2012-11-02 16:10:12 +08:00
|
|
|
}
|
|
|
|
|
2015-02-05 17:44:29 +08:00
|
|
|
void f2fs_invalidate_page(struct page *page, unsigned int offset,
|
|
|
|
unsigned int length)
|
2012-11-02 16:10:12 +08:00
|
|
|
{
|
|
|
|
struct inode *inode = page->mapping->host;
|
2015-02-05 17:44:29 +08:00
|
|
|
struct f2fs_sb_info *sbi = F2FS_I_SB(inode);
|
2014-09-13 06:53:45 +08:00
|
|
|
|
2015-02-05 17:44:29 +08:00
|
|
|
if (inode->i_ino >= F2FS_ROOT_INO(sbi) &&
|
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros
PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time
ago with promise that one day it will be possible to implement page
cache with bigger chunks than PAGE_SIZE.
This promise never materialized. And unlikely will.
We have many places where PAGE_CACHE_SIZE assumed to be equal to
PAGE_SIZE. And it's constant source of confusion on whether
PAGE_CACHE_* or PAGE_* constant should be used in a particular case,
especially on the border between fs and mm.
Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much
breakage to be doable.
Let's stop pretending that pages in page cache are special. They are
not.
The changes are pretty straight-forward:
- <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>;
- PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN};
- page_cache_get() -> get_page();
- page_cache_release() -> put_page();
This patch contains automated changes generated with coccinelle using
script below. For some reason, coccinelle doesn't patch header files.
I've called spatch for them manually.
The only adjustment after coccinelle is revert of changes to
PAGE_CAHCE_ALIGN definition: we are going to drop it later.
There are few places in the code where coccinelle didn't reach. I'll
fix them manually in a separate patch. Comments and documentation also
will be addressed with the separate patch.
virtual patch
@@
expression E;
@@
- E << (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
expression E;
@@
- E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT)
+ E
@@
@@
- PAGE_CACHE_SHIFT
+ PAGE_SHIFT
@@
@@
- PAGE_CACHE_SIZE
+ PAGE_SIZE
@@
@@
- PAGE_CACHE_MASK
+ PAGE_MASK
@@
expression E;
@@
- PAGE_CACHE_ALIGN(E)
+ PAGE_ALIGN(E)
@@
expression E;
@@
- page_cache_get(E)
+ get_page(E)
@@
expression E;
@@
- page_cache_release(E)
+ put_page(E)
Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com>
Acked-by: Michal Hocko <mhocko@suse.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
|
|
|
(offset % PAGE_SIZE || length != PAGE_SIZE))
|
2014-09-13 06:53:45 +08:00
|
|
|
return;
|
|
|
|
|
2015-02-05 17:44:29 +08:00
|
|
|
if (PageDirty(page)) {
|
2016-10-11 22:57:01 +08:00
|
|
|
if (inode->i_ino == F2FS_META_INO(sbi)) {
|
2015-02-05 17:44:29 +08:00
|
|
|
dec_page_count(sbi, F2FS_DIRTY_META);
|
2016-10-11 22:57:01 +08:00
|
|
|
} else if (inode->i_ino == F2FS_NODE_INO(sbi)) {
|
2015-02-05 17:44:29 +08:00
|
|
|
dec_page_count(sbi, F2FS_DIRTY_NODES);
|
2016-10-11 22:57:01 +08:00
|
|
|
} else {
|
2015-02-05 17:44:29 +08:00
|
|
|
inode_dec_dirty_pages(inode);
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
f2fs_remove_dirty_inode(inode);
|
2016-10-11 22:57:01 +08:00
|
|
|
}
|
2015-02-05 17:44:29 +08:00
|
|
|
}
|
2015-08-07 18:42:09 +08:00
|
|
|
|
2018-07-27 18:15:16 +08:00
|
|
|
clear_cold_data(page);
|
|
|
|
|
2015-08-07 18:42:09 +08:00
|
|
|
if (IS_ATOMIC_WRITTEN_PAGE(page))
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
return f2fs_drop_inmem_page(inode, page);
|
2015-08-07 18:42:09 +08:00
|
|
|
|
2019-03-06 17:30:59 +08:00
|
|
|
f2fs_clear_page_private(page);
|
2012-11-02 16:10:12 +08:00
|
|
|
}
|
|
|
|
|
2015-02-05 17:44:29 +08:00
|
|
|
int f2fs_release_page(struct page *page, gfp_t wait)
|
2012-11-02 16:10:12 +08:00
|
|
|
{
|
2015-01-31 03:39:08 +08:00
|
|
|
/* If this is dirty page, keep PagePrivate */
|
|
|
|
if (PageDirty(page))
|
|
|
|
return 0;
|
|
|
|
|
2015-08-07 18:42:09 +08:00
|
|
|
/* This is atomic written page, keep Private */
|
|
|
|
if (IS_ATOMIC_WRITTEN_PAGE(page))
|
|
|
|
return 0;
|
|
|
|
|
2018-07-27 18:15:16 +08:00
|
|
|
clear_cold_data(page);
|
2019-03-06 17:30:59 +08:00
|
|
|
f2fs_clear_page_private(page);
|
2013-03-14 08:24:32 +08:00
|
|
|
return 1;
|
2012-11-02 16:10:12 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static int f2fs_set_data_page_dirty(struct page *page)
|
|
|
|
{
|
2019-07-02 10:15:29 +08:00
|
|
|
struct inode *inode = page_file_mapping(page)->host;
|
2012-11-02 16:10:12 +08:00
|
|
|
|
2013-10-24 16:53:29 +08:00
|
|
|
trace_f2fs_set_page_dirty(page, DATA);
|
|
|
|
|
2016-07-01 09:49:15 +08:00
|
|
|
if (!PageUptodate(page))
|
|
|
|
SetPageUptodate(page);
|
2019-07-02 10:15:29 +08:00
|
|
|
if (PageSwapCache(page))
|
|
|
|
return __set_page_dirty_nobuffers(page);
|
2014-10-10 04:19:53 +08:00
|
|
|
|
2017-01-07 18:50:26 +08:00
|
|
|
if (f2fs_is_atomic_file(inode) && !f2fs_is_commit_atomic_write(inode)) {
|
2015-08-07 18:42:09 +08:00
|
|
|
if (!IS_ATOMIC_WRITTEN_PAGE(page)) {
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
f2fs_register_inmem_page(inode, page);
|
2015-08-07 18:42:09 +08:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Previously, this page has been registered, we just
|
|
|
|
* return here.
|
|
|
|
*/
|
|
|
|
return 0;
|
2014-10-10 04:19:53 +08:00
|
|
|
}
|
|
|
|
|
2012-11-02 16:10:12 +08:00
|
|
|
if (!PageDirty(page)) {
|
2018-04-21 10:29:52 +08:00
|
|
|
__set_page_dirty_nobuffers(page);
|
f2fs: clean up symbol namespace
As Ted reported:
"Hi, I was looking at f2fs's sources recently, and I noticed that there
is a very large number of non-static symbols which don't have a f2fs
prefix. There's well over a hundred (see attached below).
As one example, in fs/f2fs/dir.c there is:
unsigned char get_de_type(struct f2fs_dir_entry *de)
This function is clearly only useful for f2fs, but it has a generic
name. This means that if any other file system tries to have the same
symbol name, there will be a symbol conflict and the kernel would not
successfully build. It also means that when someone is looking f2fs
sources, it's not at all obvious whether a function such as
read_data_page(), invalidate_blocks(), is a generic kernel function
found in the fs, mm, or block layers, or a f2fs specific function.
You might want to fix this at some point. Hopefully Kent's bcachefs
isn't similarly using genericly named functions, since that might
cause conflicts with f2fs's functions --- but just as this would be a
problem that we would rightly insist that Kent fix, this is something
that we should have rightly insisted that f2fs should have fixed
before it was integrated into the mainline kernel.
acquire_orphan_inode
add_ino_entry
add_orphan_inode
allocate_data_block
allocate_new_segments
alloc_nid
alloc_nid_done
alloc_nid_failed
available_free_memory
...."
This patch adds "f2fs_" prefix for all non-static symbols in order to:
a) avoid conflict with other kernel generic symbols;
b) to indicate the function is f2fs specific one instead of generic
one;
Reported-by: Theodore Ts'o <tytso@mit.edu>
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-05-30 00:20:41 +08:00
|
|
|
f2fs_update_dirty_page(inode, page);
|
2012-11-02 16:10:12 +08:00
|
|
|
return 1;
|
|
|
|
}
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2013-01-17 19:30:23 +08:00
|
|
|
static sector_t f2fs_bmap(struct address_space *mapping, sector_t block)
|
|
|
|
{
|
2014-04-22 13:34:01 +08:00
|
|
|
struct inode *inode = mapping->host;
|
|
|
|
|
2015-10-20 01:29:51 +08:00
|
|
|
if (f2fs_has_inline_data(inode))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/* make sure allocating whole blocks */
|
|
|
|
if (mapping_tagged(mapping, PAGECACHE_TAG_DIRTY))
|
|
|
|
filemap_write_and_wait(mapping);
|
|
|
|
|
2015-08-19 19:11:19 +08:00
|
|
|
return generic_block_bmap(mapping, block, get_data_block_bmap);
|
2015-02-05 17:54:31 +08:00
|
|
|
}
|
|
|
|
|
2016-09-20 05:03:27 +08:00
|
|
|
#ifdef CONFIG_MIGRATION
|
|
|
|
#include <linux/migrate.h>
|
|
|
|
|
|
|
|
int f2fs_migrate_page(struct address_space *mapping,
|
|
|
|
struct page *newpage, struct page *page, enum migrate_mode mode)
|
|
|
|
{
|
|
|
|
int rc, extra_count;
|
|
|
|
struct f2fs_inode_info *fi = F2FS_I(mapping->host);
|
|
|
|
bool atomic_written = IS_ATOMIC_WRITTEN_PAGE(page);
|
|
|
|
|
|
|
|
BUG_ON(PageWriteback(page));
|
|
|
|
|
|
|
|
/* migrating an atomic written page is safe with the inmem_lock hold */
|
2017-07-07 05:46:01 +08:00
|
|
|
if (atomic_written) {
|
|
|
|
if (mode != MIGRATE_SYNC)
|
|
|
|
return -EBUSY;
|
|
|
|
if (!mutex_trylock(&fi->inmem_lock))
|
|
|
|
return -EAGAIN;
|
|
|
|
}
|
2016-09-20 05:03:27 +08:00
|
|
|
|
2019-03-06 17:30:59 +08:00
|
|
|
/* one extra reference was held for atomic_write page */
|
|
|
|
extra_count = atomic_written ? 1 : 0;
|
2016-09-20 05:03:27 +08:00
|
|
|
rc = migrate_page_move_mapping(mapping, newpage,
|
2019-07-19 06:58:46 +08:00
|
|
|
page, extra_count);
|
2016-09-20 05:03:27 +08:00
|
|
|
if (rc != MIGRATEPAGE_SUCCESS) {
|
|
|
|
if (atomic_written)
|
|
|
|
mutex_unlock(&fi->inmem_lock);
|
|
|
|
return rc;
|
|
|
|
}
|
|
|
|
|
|
|
|
if (atomic_written) {
|
|
|
|
struct inmem_pages *cur;
|
|
|
|
list_for_each_entry(cur, &fi->inmem_pages, list)
|
|
|
|
if (cur->page == page) {
|
|
|
|
cur->page = newpage;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
mutex_unlock(&fi->inmem_lock);
|
|
|
|
put_page(page);
|
|
|
|
get_page(newpage);
|
|
|
|
}
|
|
|
|
|
2019-03-06 17:30:59 +08:00
|
|
|
if (PagePrivate(page)) {
|
|
|
|
f2fs_set_page_private(newpage, page_private(page));
|
|
|
|
f2fs_clear_page_private(page);
|
|
|
|
}
|
2016-09-20 05:03:27 +08:00
|
|
|
|
2017-09-09 07:12:06 +08:00
|
|
|
if (mode != MIGRATE_SYNC_NO_COPY)
|
|
|
|
migrate_page_copy(newpage, page);
|
|
|
|
else
|
|
|
|
migrate_page_states(newpage, page);
|
2016-09-20 05:03:27 +08:00
|
|
|
|
|
|
|
return MIGRATEPAGE_SUCCESS;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2019-07-02 10:15:29 +08:00
|
|
|
#ifdef CONFIG_SWAP
|
|
|
|
/* Copied from generic_swapfile_activate() to check any holes */
|
2024-06-11 20:08:33 +08:00
|
|
|
static int check_swap_activate(struct swap_info_struct *sis,
|
|
|
|
struct file *swap_file, sector_t *span)
|
2019-07-02 10:15:29 +08:00
|
|
|
{
|
|
|
|
struct address_space *mapping = swap_file->f_mapping;
|
|
|
|
struct inode *inode = mapping->host;
|
|
|
|
unsigned blocks_per_page;
|
|
|
|
unsigned long page_no;
|
|
|
|
unsigned blkbits;
|
|
|
|
sector_t probe_block;
|
|
|
|
sector_t last_block;
|
|
|
|
sector_t lowest_block = -1;
|
|
|
|
sector_t highest_block = 0;
|
2024-06-11 20:08:33 +08:00
|
|
|
int nr_extents = 0;
|
|
|
|
int ret;
|
2019-07-02 10:15:29 +08:00
|
|
|
|
|
|
|
blkbits = inode->i_blkbits;
|
|
|
|
blocks_per_page = PAGE_SIZE >> blkbits;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Map all the blocks into the extent list. This code doesn't try
|
|
|
|
* to be very smart.
|
|
|
|
*/
|
|
|
|
probe_block = 0;
|
|
|
|
page_no = 0;
|
|
|
|
last_block = i_size_read(inode) >> blkbits;
|
2024-06-11 20:08:33 +08:00
|
|
|
while ((probe_block + blocks_per_page) <= last_block &&
|
|
|
|
page_no < sis->max) {
|
2019-07-02 10:15:29 +08:00
|
|
|
unsigned block_in_page;
|
|
|
|
sector_t first_block;
|
|
|
|
|
|
|
|
cond_resched();
|
|
|
|
|
|
|
|
first_block = bmap(inode, probe_block);
|
|
|
|
if (first_block == 0)
|
|
|
|
goto bad_bmap;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* It must be PAGE_SIZE aligned on-disk
|
|
|
|
*/
|
|
|
|
if (first_block & (blocks_per_page - 1)) {
|
|
|
|
probe_block++;
|
|
|
|
goto reprobe;
|
|
|
|
}
|
|
|
|
|
|
|
|
for (block_in_page = 1; block_in_page < blocks_per_page;
|
|
|
|
block_in_page++) {
|
|
|
|
sector_t block;
|
|
|
|
|
|
|
|
block = bmap(inode, probe_block + block_in_page);
|
|
|
|
if (block == 0)
|
|
|
|
goto bad_bmap;
|
|
|
|
if (block != first_block + block_in_page) {
|
|
|
|
/* Discontiguity */
|
|
|
|
probe_block++;
|
|
|
|
goto reprobe;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
first_block >>= (PAGE_SHIFT - blkbits);
|
|
|
|
if (page_no) { /* exclude the header page */
|
|
|
|
if (first_block < lowest_block)
|
|
|
|
lowest_block = first_block;
|
|
|
|
if (first_block > highest_block)
|
|
|
|
highest_block = first_block;
|
|
|
|
}
|
|
|
|
|
2024-06-11 20:08:33 +08:00
|
|
|
/*
|
|
|
|
* We found a PAGE_SIZE-length, PAGE_SIZE-aligned run of blocks
|
|
|
|
*/
|
|
|
|
ret = add_swap_extent(sis, page_no, 1, first_block);
|
|
|
|
if (ret < 0)
|
|
|
|
goto out;
|
|
|
|
nr_extents += ret;
|
2019-07-02 10:15:29 +08:00
|
|
|
page_no++;
|
|
|
|
probe_block += blocks_per_page;
|
|
|
|
reprobe:
|
|
|
|
continue;
|
|
|
|
}
|
2024-06-11 20:08:33 +08:00
|
|
|
ret = nr_extents;
|
|
|
|
*span = 1 + highest_block - lowest_block;
|
|
|
|
if (page_no == 0)
|
|
|
|
page_no = 1; /* force Empty message */
|
|
|
|
sis->max = page_no;
|
|
|
|
sis->pages = page_no - 1;
|
|
|
|
sis->highest_bit = page_no - 1;
|
|
|
|
out:
|
|
|
|
return ret;
|
2019-07-02 10:15:29 +08:00
|
|
|
bad_bmap:
|
|
|
|
pr_err("swapon: swapfile has holes\n");
|
|
|
|
return -EINVAL;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file,
|
|
|
|
sector_t *span)
|
|
|
|
{
|
|
|
|
struct inode *inode = file_inode(file);
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
if (!S_ISREG(inode->i_mode))
|
|
|
|
return -EINVAL;
|
|
|
|
|
|
|
|
if (f2fs_readonly(F2FS_I_SB(inode)->sb))
|
|
|
|
return -EROFS;
|
|
|
|
|
|
|
|
ret = f2fs_convert_inline_inode(inode);
|
|
|
|
if (ret)
|
|
|
|
return ret;
|
|
|
|
|
2024-06-11 20:08:33 +08:00
|
|
|
ret = check_swap_activate(sis, file, span);
|
|
|
|
if (ret < 0)
|
2019-07-02 10:15:29 +08:00
|
|
|
return ret;
|
|
|
|
|
|
|
|
set_inode_flag(inode, FI_PIN_FILE);
|
|
|
|
f2fs_precache_extents(inode);
|
|
|
|
f2fs_update_time(F2FS_I_SB(inode), REQ_TIME);
|
2024-06-11 20:08:33 +08:00
|
|
|
return ret;
|
2019-07-02 10:15:29 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static void f2fs_swap_deactivate(struct file *file)
|
|
|
|
{
|
|
|
|
struct inode *inode = file_inode(file);
|
|
|
|
|
|
|
|
clear_inode_flag(inode, FI_PIN_FILE);
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
static int f2fs_swap_activate(struct swap_info_struct *sis, struct file *file,
|
|
|
|
sector_t *span)
|
|
|
|
{
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void f2fs_swap_deactivate(struct file *file)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2012-11-02 16:10:12 +08:00
|
|
|
const struct address_space_operations f2fs_dblock_aops = {
|
|
|
|
.readpage = f2fs_read_data_page,
|
|
|
|
.readpages = f2fs_read_data_pages,
|
|
|
|
.writepage = f2fs_write_data_page,
|
|
|
|
.writepages = f2fs_write_data_pages,
|
|
|
|
.write_begin = f2fs_write_begin,
|
2013-06-27 12:04:08 +08:00
|
|
|
.write_end = f2fs_write_end,
|
2012-11-02 16:10:12 +08:00
|
|
|
.set_page_dirty = f2fs_set_data_page_dirty,
|
2015-02-05 17:44:29 +08:00
|
|
|
.invalidatepage = f2fs_invalidate_page,
|
|
|
|
.releasepage = f2fs_release_page,
|
2012-11-02 16:10:12 +08:00
|
|
|
.direct_IO = f2fs_direct_IO,
|
2013-01-17 19:30:23 +08:00
|
|
|
.bmap = f2fs_bmap,
|
2019-07-02 10:15:29 +08:00
|
|
|
.swap_activate = f2fs_swap_activate,
|
|
|
|
.swap_deactivate = f2fs_swap_deactivate,
|
2016-09-20 05:03:27 +08:00
|
|
|
#ifdef CONFIG_MIGRATION
|
|
|
|
.migratepage = f2fs_migrate_page,
|
|
|
|
#endif
|
2012-11-02 16:10:12 +08:00
|
|
|
};
|
f2fs: refactor read path to allow multiple postprocessing steps
Currently f2fs's ->readpage() and ->readpages() assume that either the
data undergoes no postprocessing, or decryption only. But with
fs-verity, there will be an additional authenticity verification step,
and it may be needed either by itself, or combined with decryption.
To support this, store a 'struct bio_post_read_ctx' in ->bi_private
which contains a work struct, a bitmask of postprocessing steps that are
enabled, and an indicator of the current step. The bio completion
routine, if there was no I/O error, enqueues the first postprocessing
step. When that completes, it continues to the next step. Pages that
fail any postprocessing step have PageError set. Once all steps have
completed, pages without PageError set are set Uptodate, and all pages
are unlocked.
Also replace f2fs_encrypted_file() with a new function
f2fs_post_read_required() in places like direct I/O and garbage
collection that really should be testing whether the file needs special
I/O processing, not whether it is encrypted specifically.
This may also be useful for other future f2fs features such as
compression.
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-04-19 02:09:48 +08:00
|
|
|
|
2017-12-05 09:25:25 +08:00
|
|
|
void f2fs_clear_page_cache_dirty_tag(struct page *page)
|
2018-05-26 18:03:35 +08:00
|
|
|
{
|
|
|
|
struct address_space *mapping = page_mapping(page);
|
|
|
|
unsigned long flags;
|
|
|
|
|
|
|
|
xa_lock_irqsave(&mapping->i_pages, flags);
|
2017-12-05 09:25:25 +08:00
|
|
|
__xa_clear_mark(&mapping->i_pages, page_index(page),
|
2018-05-26 18:03:35 +08:00
|
|
|
PAGECACHE_TAG_DIRTY);
|
|
|
|
xa_unlock_irqrestore(&mapping->i_pages, flags);
|
|
|
|
}
|
|
|
|
|
f2fs: refactor read path to allow multiple postprocessing steps
Currently f2fs's ->readpage() and ->readpages() assume that either the
data undergoes no postprocessing, or decryption only. But with
fs-verity, there will be an additional authenticity verification step,
and it may be needed either by itself, or combined with decryption.
To support this, store a 'struct bio_post_read_ctx' in ->bi_private
which contains a work struct, a bitmask of postprocessing steps that are
enabled, and an indicator of the current step. The bio completion
routine, if there was no I/O error, enqueues the first postprocessing
step. When that completes, it continues to the next step. Pages that
fail any postprocessing step have PageError set. Once all steps have
completed, pages without PageError set are set Uptodate, and all pages
are unlocked.
Also replace f2fs_encrypted_file() with a new function
f2fs_post_read_required() in places like direct I/O and garbage
collection that really should be testing whether the file needs special
I/O processing, not whether it is encrypted specifically.
This may also be useful for other future f2fs features such as
compression.
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-04-19 02:09:48 +08:00
|
|
|
int __init f2fs_init_post_read_processing(void)
|
|
|
|
{
|
f2fs: add fs-verity support
Add fs-verity support to f2fs. fs-verity is a filesystem feature that
enables transparent integrity protection and authentication of read-only
files. It uses a dm-verity like mechanism at the file level: a Merkle
tree is used to verify any block in the file in log(filesize) time. It
is implemented mainly by helper functions in fs/verity/. See
Documentation/filesystems/fsverity.rst for the full documentation.
The f2fs support for fs-verity consists of:
- Adding a filesystem feature flag and an inode flag for fs-verity.
- Implementing the fsverity_operations to support enabling verity on an
inode and reading/writing the verity metadata.
- Updating ->readpages() to verify data as it's read from verity files
and to support reading verity metadata pages.
- Updating ->write_begin(), ->write_end(), and ->writepages() to support
writing verity metadata pages.
- Calling the fs-verity hooks for ->open(), ->setattr(), and ->ioctl().
Like ext4, f2fs stores the verity metadata (Merkle tree and
fsverity_descriptor) past the end of the file, starting at the first 64K
boundary beyond i_size. This approach works because (a) verity files
are readonly, and (b) pages fully beyond i_size aren't visible to
userspace but can be read/written internally by f2fs with only some
relatively small changes to f2fs. Extended attributes cannot be used
because (a) f2fs limits the total size of an inode's xattr entries to
4096 bytes, which wouldn't be enough for even a single Merkle tree
block, and (b) f2fs encryption doesn't encrypt xattrs, yet the verity
metadata *must* be encrypted when the file is because it contains hashes
of the plaintext data.
Acked-by: Jaegeuk Kim <jaegeuk@kernel.org>
Acked-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Eric Biggers <ebiggers@google.com>
2019-07-23 00:26:24 +08:00
|
|
|
bio_post_read_ctx_cache =
|
|
|
|
kmem_cache_create("f2fs_bio_post_read_ctx",
|
|
|
|
sizeof(struct bio_post_read_ctx), 0, 0, NULL);
|
f2fs: refactor read path to allow multiple postprocessing steps
Currently f2fs's ->readpage() and ->readpages() assume that either the
data undergoes no postprocessing, or decryption only. But with
fs-verity, there will be an additional authenticity verification step,
and it may be needed either by itself, or combined with decryption.
To support this, store a 'struct bio_post_read_ctx' in ->bi_private
which contains a work struct, a bitmask of postprocessing steps that are
enabled, and an indicator of the current step. The bio completion
routine, if there was no I/O error, enqueues the first postprocessing
step. When that completes, it continues to the next step. Pages that
fail any postprocessing step have PageError set. Once all steps have
completed, pages without PageError set are set Uptodate, and all pages
are unlocked.
Also replace f2fs_encrypted_file() with a new function
f2fs_post_read_required() in places like direct I/O and garbage
collection that really should be testing whether the file needs special
I/O processing, not whether it is encrypted specifically.
This may also be useful for other future f2fs features such as
compression.
Signed-off-by: Eric Biggers <ebiggers@google.com>
Reviewed-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2018-04-19 02:09:48 +08:00
|
|
|
if (!bio_post_read_ctx_cache)
|
|
|
|
goto fail;
|
|
|
|
bio_post_read_ctx_pool =
|
|
|
|
mempool_create_slab_pool(NUM_PREALLOC_POST_READ_CTXS,
|
|
|
|
bio_post_read_ctx_cache);
|
|
|
|
if (!bio_post_read_ctx_pool)
|
|
|
|
goto fail_free_cache;
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
fail_free_cache:
|
|
|
|
kmem_cache_destroy(bio_post_read_ctx_cache);
|
|
|
|
fail:
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
|
|
|
void __exit f2fs_destroy_post_read_processing(void)
|
|
|
|
{
|
|
|
|
mempool_destroy(bio_post_read_ctx_pool);
|
|
|
|
kmem_cache_destroy(bio_post_read_ctx_cache);
|
|
|
|
}
|