2012-11-29 12:28:09 +08:00
|
|
|
/*
|
2012-11-02 16:13:32 +08:00
|
|
|
* fs/f2fs/recovery.c
|
|
|
|
*
|
|
|
|
* Copyright (c) 2012 Samsung Electronics Co., Ltd.
|
|
|
|
* http://www.samsung.com/
|
|
|
|
*
|
|
|
|
* This program is free software; you can redistribute it and/or modify
|
|
|
|
* it under the terms of the GNU General Public License version 2 as
|
|
|
|
* published by the Free Software Foundation.
|
|
|
|
*/
|
|
|
|
#include <linux/fs.h>
|
|
|
|
#include <linux/f2fs_fs.h>
|
|
|
|
#include "f2fs.h"
|
|
|
|
#include "node.h"
|
|
|
|
#include "segment.h"
|
|
|
|
|
2014-09-16 07:46:08 +08:00
|
|
|
/*
|
|
|
|
* Roll forward recovery scenarios.
|
|
|
|
*
|
|
|
|
* [Term] F: fsync_mark, D: dentry_mark
|
|
|
|
*
|
|
|
|
* 1. inode(x) | CP | inode(x) | dnode(F)
|
|
|
|
* -> Update the latest inode(x).
|
|
|
|
*
|
|
|
|
* 2. inode(x) | CP | inode(F) | dnode(F)
|
|
|
|
* -> No problem.
|
|
|
|
*
|
|
|
|
* 3. inode(x) | CP | dnode(F) | inode(x)
|
|
|
|
* -> Recover to the latest dnode(F), and drop the last inode(x)
|
|
|
|
*
|
|
|
|
* 4. inode(x) | CP | dnode(F) | inode(F)
|
|
|
|
* -> No problem.
|
|
|
|
*
|
|
|
|
* 5. CP | inode(x) | dnode(F)
|
|
|
|
* -> The inode(DF) was missing. Should drop this dnode(F).
|
|
|
|
*
|
|
|
|
* 6. CP | inode(DF) | dnode(F)
|
|
|
|
* -> No problem.
|
|
|
|
*
|
|
|
|
* 7. CP | dnode(F) | inode(DF)
|
|
|
|
* -> If f2fs_iget fails, then goto next to find inode(DF).
|
|
|
|
*
|
|
|
|
* 8. CP | dnode(F) | inode(x)
|
|
|
|
* -> If f2fs_iget fails, then goto next to find inode(DF).
|
|
|
|
* But it will fail due to no inode(DF).
|
|
|
|
*/
|
|
|
|
|
2012-11-02 16:13:32 +08:00
|
|
|
static struct kmem_cache *fsync_entry_slab;
|
|
|
|
|
|
|
|
bool space_for_roll_forward(struct f2fs_sb_info *sbi)
|
|
|
|
{
|
2016-05-17 02:06:50 +08:00
|
|
|
s64 nalloc = percpu_counter_sum_positive(&sbi->alloc_valid_block_count);
|
|
|
|
|
|
|
|
if (sbi->last_valid_block_count + nalloc > sbi->user_block_count)
|
2012-11-02 16:13:32 +08:00
|
|
|
return false;
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
static struct fsync_inode_entry *get_fsync_inode(struct list_head *head,
|
|
|
|
nid_t ino)
|
|
|
|
{
|
|
|
|
struct fsync_inode_entry *entry;
|
|
|
|
|
2014-03-29 11:33:17 +08:00
|
|
|
list_for_each_entry(entry, head, list)
|
2012-11-02 16:13:32 +08:00
|
|
|
if (entry->inode->i_ino == ino)
|
|
|
|
return entry;
|
2014-03-29 11:33:17 +08:00
|
|
|
|
2012-11-02 16:13:32 +08:00
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2016-09-10 07:48:15 +08:00
|
|
|
/*
 * Fetch inode @ino and append a new fsync entry for it to the tail of @head.
 *
 * dquot_initialize() is always called on the inode; dquot_alloc_inode() is
 * called in addition when @quota_inode is true.
 *
 * Returns the new entry, which keeps the inode reference obtained here
 * (it is dropped by del_fsync_inode()).  On failure the inode reference is
 * released and an ERR_PTR carrying the error is returned.
 */
static struct fsync_inode_entry *add_fsync_inode(struct f2fs_sb_info *sbi,
			struct list_head *head, nid_t ino, bool quota_inode)
{
	struct fsync_inode_entry *new_entry;
	struct inode *inode = f2fs_iget_retry(sbi->sb, ino);
	int err;

	if (IS_ERR(inode))
		return ERR_CAST(inode);

	/* Quota setup; the inode allocation is only done on request. */
	err = dquot_initialize(inode);
	if (!err && quota_inode)
		err = dquot_alloc_inode(inode);

	if (err) {
		iput(inode);
		return ERR_PTR(err);
	}

	new_entry = f2fs_kmem_cache_alloc(fsync_entry_slab, GFP_F2FS_ZERO);
	new_entry->inode = inode;
	list_add_tail(&new_entry->list, head);

	return new_entry;
}
|
|
|
|
|
|
|
|
static void del_fsync_inode(struct fsync_inode_entry *entry)
|
|
|
|
{
|
|
|
|
iput(entry->inode);
|
|
|
|
list_del(&entry->list);
|
|
|
|
kmem_cache_free(fsync_entry_slab, entry);
|
|
|
|
}
|
|
|
|
|
f2fs: fix inode cache leak
When testing f2fs with inline_dentry option, generic/342 reports:
VFS: Busy inodes after unmount of dm-0. Self-destruct in 5 seconds. Have a nice day...
After rmmod of the f2fs module, the kernel shows the following dmesg:
=============================================================================
BUG f2fs_inode_cache (Tainted: G O ): Objects remaining in f2fs_inode_cache on __kmem_cache_shutdown()
-----------------------------------------------------------------------------
Disabling lock debugging due to kernel taint
INFO: Slab 0xf51ca0e0 objects=22 used=1 fp=0xd1e6fc60 flags=0x40004080
CPU: 3 PID: 7455 Comm: rmmod Tainted: G B O 4.6.0-rc4+ #16
Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
00000086 00000086 d062fe18 c13a83a0 f51ca0e0 d062fe38 d062fea4 c11c7276
c1981040 f51ca0e0 00000016 00000001 d1e6fc60 40004080 656a624f 20737463
616d6572 6e696e69 6e692067 66326620 6e695f73 5f65646f 68636163 6e6f2065
Call Trace:
[<c13a83a0>] dump_stack+0x5f/0x8f
[<c11c7276>] slab_err+0x76/0x80
[<c11cbfc0>] ? __kmem_cache_shutdown+0x100/0x2f0
[<c11cbfc0>] ? __kmem_cache_shutdown+0x100/0x2f0
[<c11cbfe5>] __kmem_cache_shutdown+0x125/0x2f0
[<c1198a38>] kmem_cache_destroy+0x158/0x1f0
[<c176b43d>] ? mutex_unlock+0xd/0x10
[<f8f15aa3>] exit_f2fs_fs+0x4b/0x5a8 [f2fs]
[<c10f596c>] SyS_delete_module+0x16c/0x1d0
[<c1001b10>] ? do_fast_syscall_32+0x30/0x1c0
[<c13c59bf>] ? __this_cpu_preempt_check+0xf/0x20
[<c10afa7d>] ? trace_hardirqs_on_caller+0xdd/0x210
[<c10ad50b>] ? trace_hardirqs_off+0xb/0x10
[<c1001b81>] do_fast_syscall_32+0xa1/0x1c0
[<c176d888>] sysenter_past_esp+0x45/0x74
INFO: Object 0xd1e6d9e0 @offset=6624
kmem_cache_destroy f2fs_inode_cache: Slab cache still has objects
CPU: 3 PID: 7455 Comm: rmmod Tainted: G B O 4.6.0-rc4+ #16
Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
00000286 00000286 d062fef4 c13a83a0 f174b000 d062ff14 d062ff28 c1198ac7
c197fe18 f3c5b980 d062ff20 000d04f2 d062ff0c d062ff0c d062ff14 d062ff14
f8f20dc0 fffffff5 d062e000 d062ff30 f8f15aa3 d062ff7c c10f596c 73663266
Call Trace:
[<c13a83a0>] dump_stack+0x5f/0x8f
[<c1198ac7>] kmem_cache_destroy+0x1e7/0x1f0
[<f8f15aa3>] exit_f2fs_fs+0x4b/0x5a8 [f2fs]
[<c10f596c>] SyS_delete_module+0x16c/0x1d0
[<c1001b10>] ? do_fast_syscall_32+0x30/0x1c0
[<c13c59bf>] ? __this_cpu_preempt_check+0xf/0x20
[<c10afa7d>] ? trace_hardirqs_on_caller+0xdd/0x210
[<c10ad50b>] ? trace_hardirqs_off+0xb/0x10
[<c1001b81>] do_fast_syscall_32+0xa1/0x1c0
[<c176d888>] sysenter_past_esp+0x45/0x74
The reason is: in recovery flow, we use delayed iput mechanism for directory
which has recovered dentry block. It means the reference of inode will be
held until last dirty dentry page being writebacked.
But when we mount f2fs with inline_dentry option, during recovery, dirent
may only be recovered into dir inode page rather than dentry page, so there
are no chance for us to release inode reference in ->writepage when
writebacking last dentry page.
We can call paired iget/iput explicitly for the inline_dentry case, but for
non-inline_dentry case, iput will call writeback_single_inode to write all
data pages synchronously, but during recovery, ->writepages of f2fs skips
writing all pages, result in losing dirent.
This patch fixes this issue by obsoleting old mechanism, and introduce a
new dir_list to hold all directory inodes which has recovered datas until
finishing recovery.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-05-07 16:15:05 +08:00
|
|
|
static int recover_dentry(struct inode *inode, struct page *ipage,
|
|
|
|
struct list_head *dir_list)
|
2012-11-02 16:13:32 +08:00
|
|
|
{
|
2013-12-26 15:30:41 +08:00
|
|
|
struct f2fs_inode *raw_inode = F2FS_INODE(ipage);
|
2013-05-15 15:40:02 +08:00
|
|
|
nid_t pino = le32_to_cpu(raw_inode->i_pino);
|
2013-05-28 08:19:22 +08:00
|
|
|
struct f2fs_dir_entry *de;
|
2016-08-29 11:27:56 +08:00
|
|
|
struct fscrypt_name fname;
|
2012-11-02 16:13:32 +08:00
|
|
|
struct page *page;
|
2013-05-28 08:19:22 +08:00
|
|
|
struct inode *dir, *einode;
|
f2fs: fix inode cache leak
When testing f2fs with inline_dentry option, generic/342 reports:
VFS: Busy inodes after unmount of dm-0. Self-destruct in 5 seconds. Have a nice day...
After rmmod f2fs module, kenrel shows following dmesg:
=============================================================================
BUG f2fs_inode_cache (Tainted: G O ): Objects remaining in f2fs_inode_cache on __kmem_cache_shutdown()
-----------------------------------------------------------------------------
Disabling lock debugging due to kernel taint
INFO: Slab 0xf51ca0e0 objects=22 used=1 fp=0xd1e6fc60 flags=0x40004080
CPU: 3 PID: 7455 Comm: rmmod Tainted: G B O 4.6.0-rc4+ #16
Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
00000086 00000086 d062fe18 c13a83a0 f51ca0e0 d062fe38 d062fea4 c11c7276
c1981040 f51ca0e0 00000016 00000001 d1e6fc60 40004080 656a624f 20737463
616d6572 6e696e69 6e692067 66326620 6e695f73 5f65646f 68636163 6e6f2065
Call Trace:
[<c13a83a0>] dump_stack+0x5f/0x8f
[<c11c7276>] slab_err+0x76/0x80
[<c11cbfc0>] ? __kmem_cache_shutdown+0x100/0x2f0
[<c11cbfc0>] ? __kmem_cache_shutdown+0x100/0x2f0
[<c11cbfe5>] __kmem_cache_shutdown+0x125/0x2f0
[<c1198a38>] kmem_cache_destroy+0x158/0x1f0
[<c176b43d>] ? mutex_unlock+0xd/0x10
[<f8f15aa3>] exit_f2fs_fs+0x4b/0x5a8 [f2fs]
[<c10f596c>] SyS_delete_module+0x16c/0x1d0
[<c1001b10>] ? do_fast_syscall_32+0x30/0x1c0
[<c13c59bf>] ? __this_cpu_preempt_check+0xf/0x20
[<c10afa7d>] ? trace_hardirqs_on_caller+0xdd/0x210
[<c10ad50b>] ? trace_hardirqs_off+0xb/0x10
[<c1001b81>] do_fast_syscall_32+0xa1/0x1c0
[<c176d888>] sysenter_past_esp+0x45/0x74
INFO: Object 0xd1e6d9e0 @offset=6624
kmem_cache_destroy f2fs_inode_cache: Slab cache still has objects
CPU: 3 PID: 7455 Comm: rmmod Tainted: G B O 4.6.0-rc4+ #16
Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
00000286 00000286 d062fef4 c13a83a0 f174b000 d062ff14 d062ff28 c1198ac7
c197fe18 f3c5b980 d062ff20 000d04f2 d062ff0c d062ff0c d062ff14 d062ff14
f8f20dc0 fffffff5 d062e000 d062ff30 f8f15aa3 d062ff7c c10f596c 73663266
Call Trace:
[<c13a83a0>] dump_stack+0x5f/0x8f
[<c1198ac7>] kmem_cache_destroy+0x1e7/0x1f0
[<f8f15aa3>] exit_f2fs_fs+0x4b/0x5a8 [f2fs]
[<c10f596c>] SyS_delete_module+0x16c/0x1d0
[<c1001b10>] ? do_fast_syscall_32+0x30/0x1c0
[<c13c59bf>] ? __this_cpu_preempt_check+0xf/0x20
[<c10afa7d>] ? trace_hardirqs_on_caller+0xdd/0x210
[<c10ad50b>] ? trace_hardirqs_off+0xb/0x10
[<c1001b81>] do_fast_syscall_32+0xa1/0x1c0
[<c176d888>] sysenter_past_esp+0x45/0x74
The reason is: in recovery flow, we use delayed iput mechanism for directory
which has recovered dentry block. It means the reference of inode will be
held until last dirty dentry page being writebacked.
But when we mount f2fs with inline_dentry option, during recovery, dirent
may only be recovered into dir inode page rather than dentry page, so there
are no chance for us to release inode reference in ->writepage when
writebacking last dentry page.
We can call paired iget/iput explicityly for inline_dentry case, but for
non-inline_dentry case, iput will call writeback_single_inode to write all
data pages synchronously, but during recovery, ->writepages of f2fs skips
writing all pages, result in losing dirent.
This patch fixes this issue by obsoleting old mechanism, and introduce a
new dir_list to hold all directory inodes which has recovered datas until
finishing recovery.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-05-07 16:15:05 +08:00
|
|
|
struct fsync_inode_entry *entry;
|
2012-11-02 16:13:32 +08:00
|
|
|
int err = 0;
|
2016-08-29 11:27:56 +08:00
|
|
|
char *name;
|
2012-11-02 16:13:32 +08:00
|
|
|
|
f2fs: fix inode cache leak
When testing f2fs with inline_dentry option, generic/342 reports:
VFS: Busy inodes after unmount of dm-0. Self-destruct in 5 seconds. Have a nice day...
After rmmod f2fs module, kenrel shows following dmesg:
=============================================================================
BUG f2fs_inode_cache (Tainted: G O ): Objects remaining in f2fs_inode_cache on __kmem_cache_shutdown()
-----------------------------------------------------------------------------
Disabling lock debugging due to kernel taint
INFO: Slab 0xf51ca0e0 objects=22 used=1 fp=0xd1e6fc60 flags=0x40004080
CPU: 3 PID: 7455 Comm: rmmod Tainted: G B O 4.6.0-rc4+ #16
Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
00000086 00000086 d062fe18 c13a83a0 f51ca0e0 d062fe38 d062fea4 c11c7276
c1981040 f51ca0e0 00000016 00000001 d1e6fc60 40004080 656a624f 20737463
616d6572 6e696e69 6e692067 66326620 6e695f73 5f65646f 68636163 6e6f2065
Call Trace:
[<c13a83a0>] dump_stack+0x5f/0x8f
[<c11c7276>] slab_err+0x76/0x80
[<c11cbfc0>] ? __kmem_cache_shutdown+0x100/0x2f0
[<c11cbfc0>] ? __kmem_cache_shutdown+0x100/0x2f0
[<c11cbfe5>] __kmem_cache_shutdown+0x125/0x2f0
[<c1198a38>] kmem_cache_destroy+0x158/0x1f0
[<c176b43d>] ? mutex_unlock+0xd/0x10
[<f8f15aa3>] exit_f2fs_fs+0x4b/0x5a8 [f2fs]
[<c10f596c>] SyS_delete_module+0x16c/0x1d0
[<c1001b10>] ? do_fast_syscall_32+0x30/0x1c0
[<c13c59bf>] ? __this_cpu_preempt_check+0xf/0x20
[<c10afa7d>] ? trace_hardirqs_on_caller+0xdd/0x210
[<c10ad50b>] ? trace_hardirqs_off+0xb/0x10
[<c1001b81>] do_fast_syscall_32+0xa1/0x1c0
[<c176d888>] sysenter_past_esp+0x45/0x74
INFO: Object 0xd1e6d9e0 @offset=6624
kmem_cache_destroy f2fs_inode_cache: Slab cache still has objects
CPU: 3 PID: 7455 Comm: rmmod Tainted: G B O 4.6.0-rc4+ #16
Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
00000286 00000286 d062fef4 c13a83a0 f174b000 d062ff14 d062ff28 c1198ac7
c197fe18 f3c5b980 d062ff20 000d04f2 d062ff0c d062ff0c d062ff14 d062ff14
f8f20dc0 fffffff5 d062e000 d062ff30 f8f15aa3 d062ff7c c10f596c 73663266
Call Trace:
[<c13a83a0>] dump_stack+0x5f/0x8f
[<c1198ac7>] kmem_cache_destroy+0x1e7/0x1f0
[<f8f15aa3>] exit_f2fs_fs+0x4b/0x5a8 [f2fs]
[<c10f596c>] SyS_delete_module+0x16c/0x1d0
[<c1001b10>] ? do_fast_syscall_32+0x30/0x1c0
[<c13c59bf>] ? __this_cpu_preempt_check+0xf/0x20
[<c10afa7d>] ? trace_hardirqs_on_caller+0xdd/0x210
[<c10ad50b>] ? trace_hardirqs_off+0xb/0x10
[<c1001b81>] do_fast_syscall_32+0xa1/0x1c0
[<c176d888>] sysenter_past_esp+0x45/0x74
The reason is: in recovery flow, we use delayed iput mechanism for directory
which has recovered dentry block. It means the reference of inode will be
held until last dirty dentry page being writebacked.
But when we mount f2fs with inline_dentry option, during recovery, dirent
may only be recovered into dir inode page rather than dentry page, so there
are no chance for us to release inode reference in ->writepage when
writebacking last dentry page.
We can call paired iget/iput explicityly for inline_dentry case, but for
non-inline_dentry case, iput will call writeback_single_inode to write all
data pages synchronously, but during recovery, ->writepages of f2fs skips
writing all pages, result in losing dirent.
This patch fixes this issue by obsoleting old mechanism, and introduce a
new dir_list to hold all directory inodes which has recovered datas until
finishing recovery.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-05-07 16:15:05 +08:00
|
|
|
entry = get_fsync_inode(dir_list, pino);
|
|
|
|
if (!entry) {
|
2017-08-08 10:54:31 +08:00
|
|
|
entry = add_fsync_inode(F2FS_I_SB(inode), dir_list,
|
|
|
|
pino, false);
|
2016-09-10 07:48:15 +08:00
|
|
|
if (IS_ERR(entry)) {
|
|
|
|
dir = ERR_CAST(entry);
|
|
|
|
err = PTR_ERR(entry);
|
f2fs: fix inode cache leak
When testing f2fs with inline_dentry option, generic/342 reports:
VFS: Busy inodes after unmount of dm-0. Self-destruct in 5 seconds. Have a nice day...
After rmmod f2fs module, kenrel shows following dmesg:
=============================================================================
BUG f2fs_inode_cache (Tainted: G O ): Objects remaining in f2fs_inode_cache on __kmem_cache_shutdown()
-----------------------------------------------------------------------------
Disabling lock debugging due to kernel taint
INFO: Slab 0xf51ca0e0 objects=22 used=1 fp=0xd1e6fc60 flags=0x40004080
CPU: 3 PID: 7455 Comm: rmmod Tainted: G B O 4.6.0-rc4+ #16
Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
00000086 00000086 d062fe18 c13a83a0 f51ca0e0 d062fe38 d062fea4 c11c7276
c1981040 f51ca0e0 00000016 00000001 d1e6fc60 40004080 656a624f 20737463
616d6572 6e696e69 6e692067 66326620 6e695f73 5f65646f 68636163 6e6f2065
Call Trace:
[<c13a83a0>] dump_stack+0x5f/0x8f
[<c11c7276>] slab_err+0x76/0x80
[<c11cbfc0>] ? __kmem_cache_shutdown+0x100/0x2f0
[<c11cbfc0>] ? __kmem_cache_shutdown+0x100/0x2f0
[<c11cbfe5>] __kmem_cache_shutdown+0x125/0x2f0
[<c1198a38>] kmem_cache_destroy+0x158/0x1f0
[<c176b43d>] ? mutex_unlock+0xd/0x10
[<f8f15aa3>] exit_f2fs_fs+0x4b/0x5a8 [f2fs]
[<c10f596c>] SyS_delete_module+0x16c/0x1d0
[<c1001b10>] ? do_fast_syscall_32+0x30/0x1c0
[<c13c59bf>] ? __this_cpu_preempt_check+0xf/0x20
[<c10afa7d>] ? trace_hardirqs_on_caller+0xdd/0x210
[<c10ad50b>] ? trace_hardirqs_off+0xb/0x10
[<c1001b81>] do_fast_syscall_32+0xa1/0x1c0
[<c176d888>] sysenter_past_esp+0x45/0x74
INFO: Object 0xd1e6d9e0 @offset=6624
kmem_cache_destroy f2fs_inode_cache: Slab cache still has objects
CPU: 3 PID: 7455 Comm: rmmod Tainted: G B O 4.6.0-rc4+ #16
Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
00000286 00000286 d062fef4 c13a83a0 f174b000 d062ff14 d062ff28 c1198ac7
c197fe18 f3c5b980 d062ff20 000d04f2 d062ff0c d062ff0c d062ff14 d062ff14
f8f20dc0 fffffff5 d062e000 d062ff30 f8f15aa3 d062ff7c c10f596c 73663266
Call Trace:
[<c13a83a0>] dump_stack+0x5f/0x8f
[<c1198ac7>] kmem_cache_destroy+0x1e7/0x1f0
[<f8f15aa3>] exit_f2fs_fs+0x4b/0x5a8 [f2fs]
[<c10f596c>] SyS_delete_module+0x16c/0x1d0
[<c1001b10>] ? do_fast_syscall_32+0x30/0x1c0
[<c13c59bf>] ? __this_cpu_preempt_check+0xf/0x20
[<c10afa7d>] ? trace_hardirqs_on_caller+0xdd/0x210
[<c10ad50b>] ? trace_hardirqs_off+0xb/0x10
[<c1001b81>] do_fast_syscall_32+0xa1/0x1c0
[<c176d888>] sysenter_past_esp+0x45/0x74
The reason is: in recovery flow, we use delayed iput mechanism for directory
which has recovered dentry block. It means the reference of inode will be
held until last dirty dentry page being writebacked.
But when we mount f2fs with inline_dentry option, during recovery, dirent
may only be recovered into dir inode page rather than dentry page, so there
are no chance for us to release inode reference in ->writepage when
writebacking last dentry page.
We can call paired iget/iput explicityly for inline_dentry case, but for
non-inline_dentry case, iput will call writeback_single_inode to write all
data pages synchronously, but during recovery, ->writepages of f2fs skips
writing all pages, result in losing dirent.
This patch fixes this issue by obsoleting old mechanism, and introduce a
new dir_list to hold all directory inodes which has recovered datas until
finishing recovery.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-05-07 16:15:05 +08:00
|
|
|
goto out;
|
|
|
|
}
|
2014-04-15 10:19:28 +08:00
|
|
|
}
|
|
|
|
|
f2fs: fix inode cache leak
When testing f2fs with inline_dentry option, generic/342 reports:
VFS: Busy inodes after unmount of dm-0. Self-destruct in 5 seconds. Have a nice day...
After rmmod f2fs module, kenrel shows following dmesg:
=============================================================================
BUG f2fs_inode_cache (Tainted: G O ): Objects remaining in f2fs_inode_cache on __kmem_cache_shutdown()
-----------------------------------------------------------------------------
Disabling lock debugging due to kernel taint
INFO: Slab 0xf51ca0e0 objects=22 used=1 fp=0xd1e6fc60 flags=0x40004080
CPU: 3 PID: 7455 Comm: rmmod Tainted: G B O 4.6.0-rc4+ #16
Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
00000086 00000086 d062fe18 c13a83a0 f51ca0e0 d062fe38 d062fea4 c11c7276
c1981040 f51ca0e0 00000016 00000001 d1e6fc60 40004080 656a624f 20737463
616d6572 6e696e69 6e692067 66326620 6e695f73 5f65646f 68636163 6e6f2065
Call Trace:
[<c13a83a0>] dump_stack+0x5f/0x8f
[<c11c7276>] slab_err+0x76/0x80
[<c11cbfc0>] ? __kmem_cache_shutdown+0x100/0x2f0
[<c11cbfc0>] ? __kmem_cache_shutdown+0x100/0x2f0
[<c11cbfe5>] __kmem_cache_shutdown+0x125/0x2f0
[<c1198a38>] kmem_cache_destroy+0x158/0x1f0
[<c176b43d>] ? mutex_unlock+0xd/0x10
[<f8f15aa3>] exit_f2fs_fs+0x4b/0x5a8 [f2fs]
[<c10f596c>] SyS_delete_module+0x16c/0x1d0
[<c1001b10>] ? do_fast_syscall_32+0x30/0x1c0
[<c13c59bf>] ? __this_cpu_preempt_check+0xf/0x20
[<c10afa7d>] ? trace_hardirqs_on_caller+0xdd/0x210
[<c10ad50b>] ? trace_hardirqs_off+0xb/0x10
[<c1001b81>] do_fast_syscall_32+0xa1/0x1c0
[<c176d888>] sysenter_past_esp+0x45/0x74
INFO: Object 0xd1e6d9e0 @offset=6624
kmem_cache_destroy f2fs_inode_cache: Slab cache still has objects
CPU: 3 PID: 7455 Comm: rmmod Tainted: G B O 4.6.0-rc4+ #16
Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
00000286 00000286 d062fef4 c13a83a0 f174b000 d062ff14 d062ff28 c1198ac7
c197fe18 f3c5b980 d062ff20 000d04f2 d062ff0c d062ff0c d062ff14 d062ff14
f8f20dc0 fffffff5 d062e000 d062ff30 f8f15aa3 d062ff7c c10f596c 73663266
Call Trace:
[<c13a83a0>] dump_stack+0x5f/0x8f
[<c1198ac7>] kmem_cache_destroy+0x1e7/0x1f0
[<f8f15aa3>] exit_f2fs_fs+0x4b/0x5a8 [f2fs]
[<c10f596c>] SyS_delete_module+0x16c/0x1d0
[<c1001b10>] ? do_fast_syscall_32+0x30/0x1c0
[<c13c59bf>] ? __this_cpu_preempt_check+0xf/0x20
[<c10afa7d>] ? trace_hardirqs_on_caller+0xdd/0x210
[<c10ad50b>] ? trace_hardirqs_off+0xb/0x10
[<c1001b81>] do_fast_syscall_32+0xa1/0x1c0
[<c176d888>] sysenter_past_esp+0x45/0x74
The reason is: in recovery flow, we use delayed iput mechanism for directory
which has recovered dentry block. It means the reference of inode will be
held until last dirty dentry page being writebacked.
But when we mount f2fs with inline_dentry option, during recovery, dirent
may only be recovered into dir inode page rather than dentry page, so there
are no chance for us to release inode reference in ->writepage when
writebacking last dentry page.
We can call paired iget/iput explicityly for inline_dentry case, but for
non-inline_dentry case, iput will call writeback_single_inode to write all
data pages synchronously, but during recovery, ->writepages of f2fs skips
writing all pages, result in losing dirent.
This patch fixes this issue by obsoleting old mechanism, and introduce a
new dir_list to hold all directory inodes which has recovered datas until
finishing recovery.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-05-07 16:15:05 +08:00
|
|
|
dir = entry->inode;
|
|
|
|
|
2016-08-29 11:27:56 +08:00
|
|
|
memset(&fname, 0, sizeof(struct fscrypt_name));
|
|
|
|
fname.disk_name.len = le32_to_cpu(raw_inode->i_namelen);
|
|
|
|
fname.disk_name.name = raw_inode->i_name;
|
2013-12-23 11:12:21 +08:00
|
|
|
|
2016-08-29 11:27:56 +08:00
|
|
|
if (unlikely(fname.disk_name.len > F2FS_NAME_LEN)) {
|
2013-12-23 11:12:21 +08:00
|
|
|
WARN_ON(1);
|
|
|
|
err = -ENAMETOOLONG;
|
f2fs: fix inode cache leak
When testing f2fs with inline_dentry option, generic/342 reports:
VFS: Busy inodes after unmount of dm-0. Self-destruct in 5 seconds. Have a nice day...
After rmmod f2fs module, kenrel shows following dmesg:
=============================================================================
BUG f2fs_inode_cache (Tainted: G O ): Objects remaining in f2fs_inode_cache on __kmem_cache_shutdown()
-----------------------------------------------------------------------------
Disabling lock debugging due to kernel taint
INFO: Slab 0xf51ca0e0 objects=22 used=1 fp=0xd1e6fc60 flags=0x40004080
CPU: 3 PID: 7455 Comm: rmmod Tainted: G B O 4.6.0-rc4+ #16
Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
00000086 00000086 d062fe18 c13a83a0 f51ca0e0 d062fe38 d062fea4 c11c7276
c1981040 f51ca0e0 00000016 00000001 d1e6fc60 40004080 656a624f 20737463
616d6572 6e696e69 6e692067 66326620 6e695f73 5f65646f 68636163 6e6f2065
Call Trace:
[<c13a83a0>] dump_stack+0x5f/0x8f
[<c11c7276>] slab_err+0x76/0x80
[<c11cbfc0>] ? __kmem_cache_shutdown+0x100/0x2f0
[<c11cbfc0>] ? __kmem_cache_shutdown+0x100/0x2f0
[<c11cbfe5>] __kmem_cache_shutdown+0x125/0x2f0
[<c1198a38>] kmem_cache_destroy+0x158/0x1f0
[<c176b43d>] ? mutex_unlock+0xd/0x10
[<f8f15aa3>] exit_f2fs_fs+0x4b/0x5a8 [f2fs]
[<c10f596c>] SyS_delete_module+0x16c/0x1d0
[<c1001b10>] ? do_fast_syscall_32+0x30/0x1c0
[<c13c59bf>] ? __this_cpu_preempt_check+0xf/0x20
[<c10afa7d>] ? trace_hardirqs_on_caller+0xdd/0x210
[<c10ad50b>] ? trace_hardirqs_off+0xb/0x10
[<c1001b81>] do_fast_syscall_32+0xa1/0x1c0
[<c176d888>] sysenter_past_esp+0x45/0x74
INFO: Object 0xd1e6d9e0 @offset=6624
kmem_cache_destroy f2fs_inode_cache: Slab cache still has objects
CPU: 3 PID: 7455 Comm: rmmod Tainted: G B O 4.6.0-rc4+ #16
Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
00000286 00000286 d062fef4 c13a83a0 f174b000 d062ff14 d062ff28 c1198ac7
c197fe18 f3c5b980 d062ff20 000d04f2 d062ff0c d062ff0c d062ff14 d062ff14
f8f20dc0 fffffff5 d062e000 d062ff30 f8f15aa3 d062ff7c c10f596c 73663266
Call Trace:
[<c13a83a0>] dump_stack+0x5f/0x8f
[<c1198ac7>] kmem_cache_destroy+0x1e7/0x1f0
[<f8f15aa3>] exit_f2fs_fs+0x4b/0x5a8 [f2fs]
[<c10f596c>] SyS_delete_module+0x16c/0x1d0
[<c1001b10>] ? do_fast_syscall_32+0x30/0x1c0
[<c13c59bf>] ? __this_cpu_preempt_check+0xf/0x20
[<c10afa7d>] ? trace_hardirqs_on_caller+0xdd/0x210
[<c10ad50b>] ? trace_hardirqs_off+0xb/0x10
[<c1001b81>] do_fast_syscall_32+0xa1/0x1c0
[<c176d888>] sysenter_past_esp+0x45/0x74
The reason is: in recovery flow, we use delayed iput mechanism for directory
which has recovered dentry block. It means the reference of inode will be
held until last dirty dentry page being writebacked.
But when we mount f2fs with inline_dentry option, during recovery, dirent
may only be recovered into dir inode page rather than dentry page, so there
are no chance for us to release inode reference in ->writepage when
writebacking last dentry page.
We can call paired iget/iput explicityly for inline_dentry case, but for
non-inline_dentry case, iput will call writeback_single_inode to write all
data pages synchronously, but during recovery, ->writepages of f2fs skips
writing all pages, result in losing dirent.
This patch fixes this issue by obsoleting old mechanism, and introduce a
new dir_list to hold all directory inodes which has recovered datas until
finishing recovery.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-05-07 16:15:05 +08:00
|
|
|
goto out;
|
2013-12-23 11:12:21 +08:00
|
|
|
}
|
2013-05-28 08:19:22 +08:00
|
|
|
retry:
|
2016-08-29 11:27:56 +08:00
|
|
|
de = __f2fs_find_entry(dir, &fname, &page);
|
2015-04-01 09:03:29 +08:00
|
|
|
if (de && inode->i_ino == le32_to_cpu(de->ino))
|
2013-09-24 22:40:57 +08:00
|
|
|
goto out_unmap_put;
|
2015-04-01 09:03:29 +08:00
|
|
|
|
2013-05-28 08:19:22 +08:00
|
|
|
if (de) {
|
2016-09-10 07:59:39 +08:00
|
|
|
einode = f2fs_iget_retry(inode->i_sb, le32_to_cpu(de->ino));
|
2013-05-28 08:19:22 +08:00
|
|
|
if (IS_ERR(einode)) {
|
|
|
|
WARN_ON(1);
|
2014-04-28 17:58:34 +08:00
|
|
|
err = PTR_ERR(einode);
|
|
|
|
if (err == -ENOENT)
|
2013-05-28 08:19:22 +08:00
|
|
|
err = -EEXIST;
|
2013-09-24 22:40:57 +08:00
|
|
|
goto out_unmap_put;
|
|
|
|
}
|
2017-08-08 10:54:31 +08:00
|
|
|
|
|
|
|
err = dquot_initialize(einode);
|
|
|
|
if (err) {
|
|
|
|
iput(einode);
|
|
|
|
goto out_unmap_put;
|
|
|
|
}
|
|
|
|
|
2014-09-03 06:31:18 +08:00
|
|
|
err = acquire_orphan_inode(F2FS_I_SB(inode));
|
2013-09-24 22:40:57 +08:00
|
|
|
if (err) {
|
|
|
|
iput(einode);
|
|
|
|
goto out_unmap_put;
|
2013-05-28 08:19:22 +08:00
|
|
|
}
|
2014-09-24 18:17:04 +08:00
|
|
|
f2fs_delete_entry(de, page, dir, einode);
|
2013-05-28 08:19:22 +08:00
|
|
|
iput(einode);
|
|
|
|
goto retry;
|
2016-07-19 08:27:47 +08:00
|
|
|
} else if (IS_ERR(page)) {
|
|
|
|
err = PTR_ERR(page);
|
|
|
|
} else {
|
2016-08-29 11:27:56 +08:00
|
|
|
err = __f2fs_do_add_link(dir, &fname, inode,
|
2016-07-19 08:27:47 +08:00
|
|
|
inode->i_ino, inode->i_mode);
|
2012-11-02 16:13:32 +08:00
|
|
|
}
|
2016-09-10 07:59:39 +08:00
|
|
|
if (err == -ENOMEM)
|
|
|
|
goto retry;
|
2013-09-24 22:40:57 +08:00
|
|
|
goto out;
|
|
|
|
|
|
|
|
out_unmap_put:
|
2014-11-22 08:36:28 +08:00
|
|
|
f2fs_dentry_kunmap(dir, page);
|
2013-09-24 22:40:57 +08:00
|
|
|
f2fs_put_page(page, 0);
|
2012-11-02 16:13:32 +08:00
|
|
|
out:
|
2016-08-29 11:27:56 +08:00
|
|
|
if (file_enc_name(inode))
|
|
|
|
name = "<encrypted>";
|
|
|
|
else
|
|
|
|
name = raw_inode->i_name;
|
2014-01-18 04:44:39 +08:00
|
|
|
f2fs_msg(inode->i_sb, KERN_NOTICE,
|
|
|
|
"%s: ino = %x, name = %s, dir = %lx, err = %d",
|
2016-08-29 11:27:56 +08:00
|
|
|
__func__, ino_of_node(ipage), name,
|
2013-05-23 18:02:13 +08:00
|
|
|
IS_ERR(dir) ? 0 : dir->i_ino, err);
|
2012-11-02 16:13:32 +08:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2014-09-12 05:29:06 +08:00
|
|
|
static void recover_inode(struct inode *inode, struct page *page)
|
2012-11-02 16:13:32 +08:00
|
|
|
{
|
2014-09-16 07:46:08 +08:00
|
|
|
struct f2fs_inode *raw = F2FS_INODE(page);
|
2015-04-30 08:02:18 +08:00
|
|
|
char *name;
|
2014-09-16 07:46:08 +08:00
|
|
|
|
|
|
|
inode->i_mode = le16_to_cpu(raw->i_mode);
|
2016-05-21 00:22:03 +08:00
|
|
|
f2fs_i_size_write(inode, le64_to_cpu(raw->i_size));
|
2016-11-04 00:26:55 +08:00
|
|
|
inode->i_atime.tv_sec = le64_to_cpu(raw->i_atime);
|
2014-09-16 07:46:08 +08:00
|
|
|
inode->i_ctime.tv_sec = le64_to_cpu(raw->i_ctime);
|
|
|
|
inode->i_mtime.tv_sec = le64_to_cpu(raw->i_mtime);
|
2016-11-04 00:26:55 +08:00
|
|
|
inode->i_atime.tv_nsec = le32_to_cpu(raw->i_atime_nsec);
|
2014-09-16 07:46:08 +08:00
|
|
|
inode->i_ctime.tv_nsec = le32_to_cpu(raw->i_ctime_nsec);
|
|
|
|
inode->i_mtime.tv_nsec = le32_to_cpu(raw->i_mtime_nsec);
|
2013-05-16 14:04:49 +08:00
|
|
|
|
2016-11-29 07:33:38 +08:00
|
|
|
F2FS_I(inode)->i_advise = raw->i_advise;
|
|
|
|
|
2015-04-30 08:02:18 +08:00
|
|
|
if (file_enc_name(inode))
|
|
|
|
name = "<encrypted>";
|
|
|
|
else
|
|
|
|
name = F2FS_INODE(page)->i_name;
|
|
|
|
|
2013-05-16 14:04:49 +08:00
|
|
|
f2fs_msg(inode->i_sb, KERN_NOTICE, "recover_inode: ino = %x, name = %s",
|
2015-04-30 08:02:18 +08:00
|
|
|
ino_of_node(page), name);
|
2012-11-02 16:13:32 +08:00
|
|
|
}
|
|
|
|
|
2017-04-15 06:46:23 +08:00
|
|
|
static int find_fsync_dnodes(struct f2fs_sb_info *sbi, struct list_head *head,
|
|
|
|
bool check_only)
|
2012-11-02 16:13:32 +08:00
|
|
|
{
|
|
|
|
struct curseg_info *curseg;
|
2014-09-12 04:49:55 +08:00
|
|
|
struct page *page = NULL;
|
2012-11-02 16:13:32 +08:00
|
|
|
block_t blkaddr;
|
|
|
|
int err = 0;
|
|
|
|
|
|
|
|
/* get node pages in the current segment */
|
|
|
|
curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
|
2014-02-27 19:52:21 +08:00
|
|
|
blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
|
2012-11-02 16:13:32 +08:00
|
|
|
|
|
|
|
while (1) {
|
|
|
|
struct fsync_inode_entry *entry;
|
|
|
|
|
2015-04-18 18:05:36 +08:00
|
|
|
if (!is_valid_blkaddr(sbi, blkaddr, META_POR))
|
2014-09-12 04:49:55 +08:00
|
|
|
return 0;
|
2012-11-02 16:13:32 +08:00
|
|
|
|
2015-10-12 17:04:21 +08:00
|
|
|
page = get_tmp_page(sbi, blkaddr);
|
2013-03-08 20:29:23 +08:00
|
|
|
|
2016-09-20 08:55:10 +08:00
|
|
|
if (!is_recoverable_dnode(page))
|
2013-05-16 14:04:49 +08:00
|
|
|
break;
|
2012-11-02 16:13:32 +08:00
|
|
|
|
|
|
|
if (!is_fsync_dnode(page))
|
|
|
|
goto next;
|
|
|
|
|
|
|
|
entry = get_fsync_inode(head, ino_of_node(page));
|
2016-11-05 11:12:40 +08:00
|
|
|
if (!entry) {
|
2017-08-08 10:54:31 +08:00
|
|
|
bool quota_inode = false;
|
|
|
|
|
2017-04-15 06:46:23 +08:00
|
|
|
if (!check_only &&
|
|
|
|
IS_INODE(page) && is_dent_dnode(page)) {
|
2013-03-20 18:01:06 +08:00
|
|
|
err = recover_inode_page(sbi, page);
|
|
|
|
if (err)
|
2013-05-16 14:04:49 +08:00
|
|
|
break;
|
2017-08-08 10:54:31 +08:00
|
|
|
quota_inode = true;
|
2012-11-02 16:13:32 +08:00
|
|
|
}
|
|
|
|
|
2014-09-16 07:46:08 +08:00
|
|
|
/*
|
|
|
|
* CP | dnode(F) | inode(DF)
|
|
|
|
* For this case, we should not give up now.
|
|
|
|
*/
|
2017-08-08 10:54:31 +08:00
|
|
|
entry = add_fsync_inode(sbi, head, ino_of_node(page),
|
|
|
|
quota_inode);
|
2016-09-10 07:48:15 +08:00
|
|
|
if (IS_ERR(entry)) {
|
|
|
|
err = PTR_ERR(entry);
|
2015-02-25 10:01:46 +08:00
|
|
|
if (err == -ENOENT) {
|
|
|
|
err = 0;
|
2014-09-16 07:46:08 +08:00
|
|
|
goto next;
|
2015-02-25 10:01:46 +08:00
|
|
|
}
|
2013-05-16 14:04:49 +08:00
|
|
|
break;
|
2012-11-02 16:13:32 +08:00
|
|
|
}
|
|
|
|
}
|
2013-05-15 09:49:13 +08:00
|
|
|
entry->blkaddr = blkaddr;
|
|
|
|
|
2016-04-16 00:43:17 +08:00
|
|
|
if (IS_INODE(page) && is_dent_dnode(page))
|
|
|
|
entry->last_dentry = blkaddr;
|
2012-11-02 16:13:32 +08:00
|
|
|
next:
|
|
|
|
/* check next segment */
|
|
|
|
blkaddr = next_blkaddr_of_node(page);
|
2014-09-12 04:49:55 +08:00
|
|
|
f2fs_put_page(page, 1);
|
2014-12-08 15:02:52 +08:00
|
|
|
|
|
|
|
ra_meta_pages_cond(sbi, blkaddr);
|
2012-11-02 16:13:32 +08:00
|
|
|
}
|
2014-09-12 04:49:55 +08:00
|
|
|
f2fs_put_page(page, 1);
|
2012-11-02 16:13:32 +08:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2013-06-27 09:28:54 +08:00
|
|
|
static void destroy_fsync_dnodes(struct list_head *head)
|
2012-11-02 16:13:32 +08:00
|
|
|
{
|
2013-01-20 23:02:58 +08:00
|
|
|
struct fsync_inode_entry *entry, *tmp;
|
|
|
|
|
2016-04-29 20:13:37 +08:00
|
|
|
list_for_each_entry_safe(entry, tmp, head, list)
|
|
|
|
del_fsync_inode(entry);
|
2012-11-02 16:13:32 +08:00
|
|
|
}
|
|
|
|
|
2013-05-22 07:20:01 +08:00
|
|
|
static int check_index_in_prev_nodes(struct f2fs_sb_info *sbi,
|
2013-05-22 07:02:02 +08:00
|
|
|
block_t blkaddr, struct dnode_of_data *dn)
|
2012-11-02 16:13:32 +08:00
|
|
|
{
|
|
|
|
struct seg_entry *sentry;
|
|
|
|
unsigned int segno = GET_SEGNO(sbi, blkaddr);
|
2014-02-04 12:01:10 +08:00
|
|
|
unsigned short blkoff = GET_BLKOFF_FROM_SEG0(sbi, blkaddr);
|
2014-01-28 13:54:07 +08:00
|
|
|
struct f2fs_summary_block *sum_node;
|
2012-11-02 16:13:32 +08:00
|
|
|
struct f2fs_summary sum;
|
2014-01-28 13:54:07 +08:00
|
|
|
struct page *sum_page, *node_page;
|
2015-03-27 09:46:38 +08:00
|
|
|
struct dnode_of_data tdn = *dn;
|
2013-05-22 07:02:02 +08:00
|
|
|
nid_t ino, nid;
|
2012-11-02 16:13:32 +08:00
|
|
|
struct inode *inode;
|
2013-08-12 20:08:03 +08:00
|
|
|
unsigned int offset;
|
2012-11-02 16:13:32 +08:00
|
|
|
block_t bidx;
|
|
|
|
int i;
|
|
|
|
|
|
|
|
sentry = get_seg_entry(sbi, segno);
|
|
|
|
if (!f2fs_test_bit(blkoff, sentry->cur_valid_map))
|
2013-05-22 07:20:01 +08:00
|
|
|
return 0;
|
2012-11-02 16:13:32 +08:00
|
|
|
|
|
|
|
/* Get the previous summary */
|
2017-08-13 12:33:23 +08:00
|
|
|
for (i = CURSEG_HOT_DATA; i <= CURSEG_COLD_DATA; i++) {
|
2012-11-02 16:13:32 +08:00
|
|
|
struct curseg_info *curseg = CURSEG_I(sbi, i);
|
|
|
|
if (curseg->segno == segno) {
|
|
|
|
sum = curseg->sum_blk->entries[blkoff];
|
2014-01-28 13:54:07 +08:00
|
|
|
goto got_it;
|
2012-11-02 16:13:32 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2014-01-28 13:54:07 +08:00
|
|
|
sum_page = get_sum_page(sbi, segno);
|
|
|
|
sum_node = (struct f2fs_summary_block *)page_address(sum_page);
|
|
|
|
sum = sum_node->entries[blkoff];
|
|
|
|
f2fs_put_page(sum_page, 1);
|
|
|
|
got_it:
|
2013-05-22 07:02:02 +08:00
|
|
|
/* Use the locked dnode page and inode */
|
|
|
|
nid = le32_to_cpu(sum.nid);
|
|
|
|
if (dn->inode->i_ino == nid) {
|
|
|
|
tdn.nid = nid;
|
2015-03-27 09:46:38 +08:00
|
|
|
if (!dn->inode_page_locked)
|
|
|
|
lock_page(dn->inode_page);
|
2013-05-22 07:02:02 +08:00
|
|
|
tdn.node_page = dn->inode_page;
|
2013-06-24 06:47:23 +08:00
|
|
|
tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
|
2015-03-27 09:46:38 +08:00
|
|
|
goto truncate_out;
|
2013-05-22 07:02:02 +08:00
|
|
|
} else if (dn->nid == nid) {
|
2013-06-24 06:47:23 +08:00
|
|
|
tdn.ofs_in_node = le16_to_cpu(sum.ofs_in_node);
|
2015-03-27 09:46:38 +08:00
|
|
|
goto truncate_out;
|
2013-05-22 07:02:02 +08:00
|
|
|
}
|
|
|
|
|
2012-11-02 16:13:32 +08:00
|
|
|
/* Get the node page */
|
2013-05-22 07:02:02 +08:00
|
|
|
node_page = get_node_page(sbi, nid);
|
2013-05-22 07:20:01 +08:00
|
|
|
if (IS_ERR(node_page))
|
|
|
|
return PTR_ERR(node_page);
|
2013-08-12 20:08:03 +08:00
|
|
|
|
|
|
|
offset = ofs_of_node(node_page);
|
2012-11-02 16:13:32 +08:00
|
|
|
ino = ino_of_node(node_page);
|
|
|
|
f2fs_put_page(node_page, 1);
|
|
|
|
|
f2fs: fix double lock for inode page during roll-foward recovery
If the inode is same and its data index are needed to truncate, we can fall into
double lock for its inode page via get_dnode_of_data.
Error case is like this.
1. write data 1, 2, 3, 4, 5 in inode #4.
2. write data 100, 102, 103, 104, 105 in dnode #6 of inode #4.
3. sync
4. update data 100->106 in dnode #6.
5. fsync inode #4.
6. power-cut
-> Then,
1. go back to #3's checkpoint
2. in do_recover_data, get_dnode_of_data() gets inode #4.
3. detect 100->106 in dnode #6.
4. check_index_in_prev_nodes tries to truncate 100 in dnode #6.
5. to trigger truncate_hole, get_dnode_of_data should grab inode #4.
6. detect *kernel hang*
This patch should resolve that bug.
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2014-09-12 23:35:58 +08:00
|
|
|
if (ino != dn->inode->i_ino) {
|
2017-08-08 10:54:31 +08:00
|
|
|
int ret;
|
|
|
|
|
f2fs: fix double lock for inode page during roll-foward recovery
If the inode is same and its data index are needed to truncate, we can fall into
double lock for its inode page via get_dnode_of_data.
Error case is like this.
1. write data 1, 2, 3, 4, 5 in inode #4.
2. write data 100, 102, 103, 104, 105 in dnode #6 of inode #4.
3. sync
4. update data 100->106 in dnode #6.
5. fsync inode #4.
6. power-cut
-> Then,
1. go back to #3's checkpoint
2. in do_recover_data, get_dnode_of_data() gets inode #4.
3. detect 100->106 in dnode #6.
4. check_index_in_prev_nodes tries to truncate 100 in dnode #6.
5. to trigger truncate_hole, get_dnode_of_data should grab inode #4.
6. detect *kernel hang*
This patch should resolve that bug.
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2014-09-12 23:35:58 +08:00
|
|
|
/* Deallocate previous index in the node page */
|
2016-09-10 07:59:39 +08:00
|
|
|
inode = f2fs_iget_retry(sbi->sb, ino);
|
f2fs: fix double lock for inode page during roll-foward recovery
If the inode is same and its data index are needed to truncate, we can fall into
double lock for its inode page via get_dnode_of_data.
Error case is like this.
1. write data 1, 2, 3, 4, 5 in inode #4.
2. write data 100, 102, 103, 104, 105 in dnode #6 of inode #4.
3. sync
4. update data 100->106 in dnode #6.
5. fsync inode #4.
6. power-cut
-> Then,
1. go back to #3's checkpoint
2. in do_recover_data, get_dnode_of_data() gets inode #4.
3. detect 100->106 in dnode #6.
4. check_index_in_prev_nodes tries to truncate 100 in dnode #6.
5. to trigger truncate_hole, get_dnode_of_data should grab inode #4.
6. detect *kernel hang*
This patch should resolve that bug.
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2014-09-12 23:35:58 +08:00
|
|
|
if (IS_ERR(inode))
|
|
|
|
return PTR_ERR(inode);
|
2017-08-08 10:54:31 +08:00
|
|
|
|
|
|
|
ret = dquot_initialize(inode);
|
|
|
|
if (ret) {
|
|
|
|
iput(inode);
|
|
|
|
return ret;
|
|
|
|
}
|
f2fs: fix double lock for inode page during roll-foward recovery
If the inode is same and its data index are needed to truncate, we can fall into
double lock for its inode page via get_dnode_of_data.
Error case is like this.
1. write data 1, 2, 3, 4, 5 in inode #4.
2. write data 100, 102, 103, 104, 105 in dnode #6 of inode #4.
3. sync
4. update data 100->106 in dnode #6.
5. fsync inode #4.
6. power-cut
-> Then,
1. go back to #3's checkpoint
2. in do_recover_data, get_dnode_of_data() gets inode #4.
3. detect 100->106 in dnode #6.
4. check_index_in_prev_nodes tries to truncate 100 in dnode #6.
5. to trigger truncate_hole, get_dnode_of_data should grab inode #4.
6. detect *kernel hang*
This patch should resolve that bug.
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2014-09-12 23:35:58 +08:00
|
|
|
} else {
|
|
|
|
inode = dn->inode;
|
|
|
|
}
|
2012-12-22 11:09:43 +08:00
|
|
|
|
2016-01-26 15:39:35 +08:00
|
|
|
bidx = start_bidx_of_node(offset, inode) + le16_to_cpu(sum.ofs_in_node);
|
2013-08-12 20:08:03 +08:00
|
|
|
|
2015-03-27 09:46:38 +08:00
|
|
|
/*
|
|
|
|
* if inode page is locked, unlock temporarily, but its reference
|
|
|
|
* count keeps alive.
|
|
|
|
*/
|
|
|
|
if (ino == dn->inode->i_ino && dn->inode_page_locked)
|
|
|
|
unlock_page(dn->inode_page);
|
|
|
|
|
|
|
|
set_new_dnode(&tdn, inode, NULL, NULL, 0);
|
|
|
|
if (get_dnode_of_data(&tdn, bidx, LOOKUP_NODE))
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
if (tdn.data_blkaddr == blkaddr)
|
|
|
|
truncate_data_blocks_range(&tdn, 1);
|
|
|
|
|
|
|
|
f2fs_put_dnode(&tdn);
|
|
|
|
out:
|
|
|
|
if (ino != dn->inode->i_ino)
|
f2fs: fix double lock for inode page during roll-foward recovery
If the inode is same and its data index are needed to truncate, we can fall into
double lock for its inode page via get_dnode_of_data.
Error case is like this.
1. write data 1, 2, 3, 4, 5 in inode #4.
2. write data 100, 102, 103, 104, 105 in dnode #6 of inode #4.
3. sync
4. update data 100->106 in dnode #6.
5. fsync inode #4.
6. power-cut
-> Then,
1. go back to #3's checkpoint
2. in do_recover_data, get_dnode_of_data() gets inode #4.
3. detect 100->106 in dnode #6.
4. check_index_in_prev_nodes tries to truncate 100 in dnode #6.
5. to trigger truncate_hole, get_dnode_of_data should grab inode #4.
6. detect *kernel hang*
This patch should resolve that bug.
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2014-09-12 23:35:58 +08:00
|
|
|
iput(inode);
|
2015-03-27 09:46:38 +08:00
|
|
|
else if (dn->inode_page_locked)
|
|
|
|
lock_page(dn->inode_page);
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
truncate_out:
|
2017-07-19 00:19:06 +08:00
|
|
|
if (datablock_addr(tdn.inode, tdn.node_page,
|
|
|
|
tdn.ofs_in_node) == blkaddr)
|
2015-03-27 09:46:38 +08:00
|
|
|
truncate_data_blocks_range(&tdn, 1);
|
|
|
|
if (dn->inode->i_ino == nid && !dn->inode_page_locked)
|
|
|
|
unlock_page(dn->inode_page);
|
2013-05-22 07:20:01 +08:00
|
|
|
return 0;
|
2012-11-02 16:13:32 +08:00
|
|
|
}
|
|
|
|
|
2013-03-20 18:01:06 +08:00
|
|
|
/*
 * Reserve a new block at the current position of @dn on behalf of recovery.
 *
 * With fault injection compiled in, the reservation is retried until it
 * succeeds.  Any remaining error trips f2fs_bug_on() and is returned so the
 * caller can abort.
 */
static int recover_reserve_new_block(struct f2fs_sb_info *sbi,
					struct dnode_of_data *dn)
{
	int err = reserve_new_block(dn);

#ifdef CONFIG_F2FS_FAULT_INJECTION
	while (err)
		err = reserve_new_block(dn);
#endif
	/* We should not get -ENOSPC */
	f2fs_bug_on(sbi, err);
	return err;
}

/*
 * Replay one fsynced node page onto the current on-disk state of @inode.
 *
 * @sbi:     filesystem instance being recovered
 * @inode:   inode that owns @page
 * @page:    fsynced node page read from the warm node chain
 * @blkaddr: block address @page was read from (passed on to
 *           recover_xattr_data())
 *
 * Step 1 recovers xattrs (inline for an inode page, or the xattr node block,
 * in which case nothing else is done).  Step 2 recovers inline data.  Step 3
 * walks every block index of the node and makes the live dnode match @page:
 * indices that became NULL_ADDR are truncated, NEW_ADDR indices are
 * re-reserved, and valid addresses are installed with f2fs_replace_block()
 * after any previous owner of that address has been fixed up.
 *
 * Returns 0 on success or a negative errno; the outcome is always logged.
 */
static int do_recover_data(struct f2fs_sb_info *sbi, struct inode *inode,
					struct page *page, block_t blkaddr)
{
	struct dnode_of_data dn;
	struct node_info ni;
	unsigned int start, end;
	int err = 0, recovered = 0;

	/* step 1: recover xattr */
	if (IS_INODE(page)) {
		recover_inline_xattr(inode, page);
	} else if (f2fs_has_xattr_block(ofs_of_node(page))) {
		err = recover_xattr_data(inode, page, blkaddr);
		if (!err)
			recovered++;
		goto out;
	}

	/* step 2: recover inline data */
	if (recover_inline_data(inode, page))
		goto out;

	/* step 3: recover data indices */
	start = start_bidx_of_node(ofs_of_node(page), inode);
	end = start + ADDRS_PER_PAGE(page, inode);

	set_new_dnode(&dn, inode, NULL, NULL, 0);
retry_dn:
	err = get_dnode_of_data(&dn, start, ALLOC_NODE);
	if (err) {
		/* Memory pressure is treated as transient: wait and retry. */
		if (err == -ENOMEM) {
			congestion_wait(BLK_RW_ASYNC, HZ/50);
			goto retry_dn;
		}
		goto out;
	}

	f2fs_wait_on_page_writeback(dn.node_page, NODE, true);

	get_node_info(sbi, dn.nid, &ni);
	f2fs_bug_on(sbi, ni.ino != ino_of_node(page));
	f2fs_bug_on(sbi, ofs_of_node(dn.node_page) != ofs_of_node(page));

	for (; start < end; start++, dn.ofs_in_node++) {
		block_t src, dest;

		/* src: address in the live dnode, dest: address in @page */
		src = datablock_addr(dn.inode, dn.node_page, dn.ofs_in_node);
		dest = datablock_addr(dn.inode, page, dn.ofs_in_node);

		/* skip recovering if dest is the same as src */
		if (src == dest)
			continue;

		/* dest is invalid, just invalidate src block */
		if (dest == NULL_ADDR) {
			truncate_data_blocks_range(&dn, 1);
			continue;
		}

		/* Grow i_size to cover this index unless it must be kept. */
		if (!file_keep_isize(inode) &&
			(i_size_read(inode) <= ((loff_t)start << PAGE_SHIFT)))
			f2fs_i_size_write(inode,
				(loff_t)(start + 1) << PAGE_SHIFT);

		/*
		 * dest is reserved block, invalidate src block
		 * and then reserve one new block in dnode page.
		 */
		if (dest == NEW_ADDR) {
			truncate_data_blocks_range(&dn, 1);
			/*
			 * The old block is already gone, so a failed
			 * reservation must not be reported as a successful
			 * recovery of this node.
			 */
			err = recover_reserve_new_block(sbi, &dn);
			if (err)
				goto err;
			continue;
		}

		/* dest is valid block, try to recover from src to dest */
		if (is_valid_blkaddr(sbi, dest, META_POR)) {

			if (src == NULL_ADDR) {
				err = recover_reserve_new_block(sbi, &dn);
				if (err)
					goto err;
			}
retry_prev:
			/* Check the previous node page having this index */
			err = check_index_in_prev_nodes(sbi, dest, &dn);
			if (err) {
				if (err == -ENOMEM) {
					congestion_wait(BLK_RW_ASYNC, HZ/50);
					goto retry_prev;
				}
				goto err;
			}

			/* write dummy data page */
			f2fs_replace_block(sbi, &dn, src, dest,
						ni.version, false, false);
			recovered++;
		}
	}

	/* Stamp the live dnode with the footer of the fsynced page. */
	copy_node_footer(dn.node_page, page);
	fill_node_footer(dn.node_page, dn.nid, ni.ino,
					ofs_of_node(page), false);
	set_page_dirty(dn.node_page);
err:
	f2fs_put_dnode(&dn);
out:
	f2fs_msg(sbi->sb, KERN_NOTICE,
		"recover_data: ino = %lx (i_size: %s) recovered = %d, err = %d",
		inode->i_ino,
		file_keep_isize(inode) ? "keep" : "recover",
		recovered, err);
	return err;
}
|
|
|
|
|
f2fs: fix inode cache leak
When testing f2fs with inline_dentry option, generic/342 reports:
VFS: Busy inodes after unmount of dm-0. Self-destruct in 5 seconds. Have a nice day...
After rmmod of the f2fs module, the kernel shows the following dmesg:
=============================================================================
BUG f2fs_inode_cache (Tainted: G O ): Objects remaining in f2fs_inode_cache on __kmem_cache_shutdown()
-----------------------------------------------------------------------------
Disabling lock debugging due to kernel taint
INFO: Slab 0xf51ca0e0 objects=22 used=1 fp=0xd1e6fc60 flags=0x40004080
CPU: 3 PID: 7455 Comm: rmmod Tainted: G B O 4.6.0-rc4+ #16
Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
00000086 00000086 d062fe18 c13a83a0 f51ca0e0 d062fe38 d062fea4 c11c7276
c1981040 f51ca0e0 00000016 00000001 d1e6fc60 40004080 656a624f 20737463
616d6572 6e696e69 6e692067 66326620 6e695f73 5f65646f 68636163 6e6f2065
Call Trace:
[<c13a83a0>] dump_stack+0x5f/0x8f
[<c11c7276>] slab_err+0x76/0x80
[<c11cbfc0>] ? __kmem_cache_shutdown+0x100/0x2f0
[<c11cbfc0>] ? __kmem_cache_shutdown+0x100/0x2f0
[<c11cbfe5>] __kmem_cache_shutdown+0x125/0x2f0
[<c1198a38>] kmem_cache_destroy+0x158/0x1f0
[<c176b43d>] ? mutex_unlock+0xd/0x10
[<f8f15aa3>] exit_f2fs_fs+0x4b/0x5a8 [f2fs]
[<c10f596c>] SyS_delete_module+0x16c/0x1d0
[<c1001b10>] ? do_fast_syscall_32+0x30/0x1c0
[<c13c59bf>] ? __this_cpu_preempt_check+0xf/0x20
[<c10afa7d>] ? trace_hardirqs_on_caller+0xdd/0x210
[<c10ad50b>] ? trace_hardirqs_off+0xb/0x10
[<c1001b81>] do_fast_syscall_32+0xa1/0x1c0
[<c176d888>] sysenter_past_esp+0x45/0x74
INFO: Object 0xd1e6d9e0 @offset=6624
kmem_cache_destroy f2fs_inode_cache: Slab cache still has objects
CPU: 3 PID: 7455 Comm: rmmod Tainted: G B O 4.6.0-rc4+ #16
Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
00000286 00000286 d062fef4 c13a83a0 f174b000 d062ff14 d062ff28 c1198ac7
c197fe18 f3c5b980 d062ff20 000d04f2 d062ff0c d062ff0c d062ff14 d062ff14
f8f20dc0 fffffff5 d062e000 d062ff30 f8f15aa3 d062ff7c c10f596c 73663266
Call Trace:
[<c13a83a0>] dump_stack+0x5f/0x8f
[<c1198ac7>] kmem_cache_destroy+0x1e7/0x1f0
[<f8f15aa3>] exit_f2fs_fs+0x4b/0x5a8 [f2fs]
[<c10f596c>] SyS_delete_module+0x16c/0x1d0
[<c1001b10>] ? do_fast_syscall_32+0x30/0x1c0
[<c13c59bf>] ? __this_cpu_preempt_check+0xf/0x20
[<c10afa7d>] ? trace_hardirqs_on_caller+0xdd/0x210
[<c10ad50b>] ? trace_hardirqs_off+0xb/0x10
[<c1001b81>] do_fast_syscall_32+0xa1/0x1c0
[<c176d888>] sysenter_past_esp+0x45/0x74
The reason is: in recovery flow, we use delayed iput mechanism for directory
which has recovered dentry block. It means the reference of inode will be
held until last dirty dentry page being writebacked.
But when we mount f2fs with inline_dentry option, during recovery, dirent
may only be recovered into dir inode page rather than dentry page, so there
are no chance for us to release inode reference in ->writepage when
writebacking last dentry page.
We can call paired iget/iput explicitly for the inline_dentry case, but for the
non-inline_dentry case, iput will call writeback_single_inode to write all
data pages synchronously, and during recovery ->writepages of f2fs skips
writing all pages, resulting in lost dirents.
This patch fixes this issue by obsoleting the old mechanism and introducing a
new dir_list to hold all directory inodes which have recovered data until
recovery finishes.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-05-07 16:15:05 +08:00
|
|
|
static int recover_data(struct f2fs_sb_info *sbi, struct list_head *inode_list,
|
|
|
|
struct list_head *dir_list)
|
2012-11-02 16:13:32 +08:00
|
|
|
{
|
|
|
|
struct curseg_info *curseg;
|
2014-09-12 04:49:55 +08:00
|
|
|
struct page *page = NULL;
|
2013-03-20 18:01:06 +08:00
|
|
|
int err = 0;
|
2012-11-02 16:13:32 +08:00
|
|
|
block_t blkaddr;
|
|
|
|
|
|
|
|
/* get node pages in the current segment */
|
2015-12-01 11:43:59 +08:00
|
|
|
curseg = CURSEG_I(sbi, CURSEG_WARM_NODE);
|
2012-11-02 16:13:32 +08:00
|
|
|
blkaddr = NEXT_FREE_BLKADDR(sbi, curseg);
|
|
|
|
|
|
|
|
while (1) {
|
|
|
|
struct fsync_inode_entry *entry;
|
|
|
|
|
2015-04-18 18:05:36 +08:00
|
|
|
if (!is_valid_blkaddr(sbi, blkaddr, META_POR))
|
2014-09-12 04:49:55 +08:00
|
|
|
break;
|
2012-11-02 16:13:32 +08:00
|
|
|
|
2014-12-08 15:02:52 +08:00
|
|
|
ra_meta_pages_cond(sbi, blkaddr);
|
|
|
|
|
2015-10-12 17:04:21 +08:00
|
|
|
page = get_tmp_page(sbi, blkaddr);
|
2013-03-08 20:29:23 +08:00
|
|
|
|
2016-09-20 08:55:10 +08:00
|
|
|
if (!is_recoverable_dnode(page)) {
|
2014-09-12 04:49:55 +08:00
|
|
|
f2fs_put_page(page, 1);
|
2013-05-20 09:26:09 +08:00
|
|
|
break;
|
2014-09-12 04:49:55 +08:00
|
|
|
}
|
2012-11-02 16:13:32 +08:00
|
|
|
|
f2fs: fix inode cache leak
When testing f2fs with inline_dentry option, generic/342 reports:
VFS: Busy inodes after unmount of dm-0. Self-destruct in 5 seconds. Have a nice day...
After rmmod f2fs module, kenrel shows following dmesg:
=============================================================================
BUG f2fs_inode_cache (Tainted: G O ): Objects remaining in f2fs_inode_cache on __kmem_cache_shutdown()
-----------------------------------------------------------------------------
Disabling lock debugging due to kernel taint
INFO: Slab 0xf51ca0e0 objects=22 used=1 fp=0xd1e6fc60 flags=0x40004080
CPU: 3 PID: 7455 Comm: rmmod Tainted: G B O 4.6.0-rc4+ #16
Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
00000086 00000086 d062fe18 c13a83a0 f51ca0e0 d062fe38 d062fea4 c11c7276
c1981040 f51ca0e0 00000016 00000001 d1e6fc60 40004080 656a624f 20737463
616d6572 6e696e69 6e692067 66326620 6e695f73 5f65646f 68636163 6e6f2065
Call Trace:
[<c13a83a0>] dump_stack+0x5f/0x8f
[<c11c7276>] slab_err+0x76/0x80
[<c11cbfc0>] ? __kmem_cache_shutdown+0x100/0x2f0
[<c11cbfc0>] ? __kmem_cache_shutdown+0x100/0x2f0
[<c11cbfe5>] __kmem_cache_shutdown+0x125/0x2f0
[<c1198a38>] kmem_cache_destroy+0x158/0x1f0
[<c176b43d>] ? mutex_unlock+0xd/0x10
[<f8f15aa3>] exit_f2fs_fs+0x4b/0x5a8 [f2fs]
[<c10f596c>] SyS_delete_module+0x16c/0x1d0
[<c1001b10>] ? do_fast_syscall_32+0x30/0x1c0
[<c13c59bf>] ? __this_cpu_preempt_check+0xf/0x20
[<c10afa7d>] ? trace_hardirqs_on_caller+0xdd/0x210
[<c10ad50b>] ? trace_hardirqs_off+0xb/0x10
[<c1001b81>] do_fast_syscall_32+0xa1/0x1c0
[<c176d888>] sysenter_past_esp+0x45/0x74
INFO: Object 0xd1e6d9e0 @offset=6624
kmem_cache_destroy f2fs_inode_cache: Slab cache still has objects
CPU: 3 PID: 7455 Comm: rmmod Tainted: G B O 4.6.0-rc4+ #16
Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
00000286 00000286 d062fef4 c13a83a0 f174b000 d062ff14 d062ff28 c1198ac7
c197fe18 f3c5b980 d062ff20 000d04f2 d062ff0c d062ff0c d062ff14 d062ff14
f8f20dc0 fffffff5 d062e000 d062ff30 f8f15aa3 d062ff7c c10f596c 73663266
Call Trace:
[<c13a83a0>] dump_stack+0x5f/0x8f
[<c1198ac7>] kmem_cache_destroy+0x1e7/0x1f0
[<f8f15aa3>] exit_f2fs_fs+0x4b/0x5a8 [f2fs]
[<c10f596c>] SyS_delete_module+0x16c/0x1d0
[<c1001b10>] ? do_fast_syscall_32+0x30/0x1c0
[<c13c59bf>] ? __this_cpu_preempt_check+0xf/0x20
[<c10afa7d>] ? trace_hardirqs_on_caller+0xdd/0x210
[<c10ad50b>] ? trace_hardirqs_off+0xb/0x10
[<c1001b81>] do_fast_syscall_32+0xa1/0x1c0
[<c176d888>] sysenter_past_esp+0x45/0x74
The reason is: in recovery flow, we use delayed iput mechanism for directory
which has recovered dentry block. It means the reference of inode will be
held until last dirty dentry page being writebacked.
But when we mount f2fs with inline_dentry option, during recovery, dirent
may only be recovered into dir inode page rather than dentry page, so there
are no chance for us to release inode reference in ->writepage when
writebacking last dentry page.
We can call paired iget/iput explicityly for inline_dentry case, but for
non-inline_dentry case, iput will call writeback_single_inode to write all
data pages synchronously, but during recovery, ->writepages of f2fs skips
writing all pages, result in losing dirent.
This patch fixes this issue by obsoleting old mechanism, and introduce a
new dir_list to hold all directory inodes which has recovered datas until
finishing recovery.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-05-07 16:15:05 +08:00
|
|
|
entry = get_fsync_inode(inode_list, ino_of_node(page));
|
2012-11-02 16:13:32 +08:00
|
|
|
if (!entry)
|
|
|
|
goto next;
|
2014-09-16 07:46:08 +08:00
|
|
|
/*
|
|
|
|
* inode(x) | CP | inode(x) | dnode(F)
|
|
|
|
* In this case, we can lose the latest inode(x).
|
2014-09-12 05:29:06 +08:00
|
|
|
* So, call recover_inode for the inode update.
|
2014-09-16 07:46:08 +08:00
|
|
|
*/
|
2016-04-16 00:43:17 +08:00
|
|
|
if (IS_INODE(page))
|
2014-09-12 05:29:06 +08:00
|
|
|
recover_inode(entry->inode, page);
|
|
|
|
if (entry->last_dentry == blkaddr) {
|
f2fs: fix inode cache leak
When testing f2fs with inline_dentry option, generic/342 reports:
VFS: Busy inodes after unmount of dm-0. Self-destruct in 5 seconds. Have a nice day...
After rmmod f2fs module, kenrel shows following dmesg:
=============================================================================
BUG f2fs_inode_cache (Tainted: G O ): Objects remaining in f2fs_inode_cache on __kmem_cache_shutdown()
-----------------------------------------------------------------------------
Disabling lock debugging due to kernel taint
INFO: Slab 0xf51ca0e0 objects=22 used=1 fp=0xd1e6fc60 flags=0x40004080
CPU: 3 PID: 7455 Comm: rmmod Tainted: G B O 4.6.0-rc4+ #16
Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
00000086 00000086 d062fe18 c13a83a0 f51ca0e0 d062fe38 d062fea4 c11c7276
c1981040 f51ca0e0 00000016 00000001 d1e6fc60 40004080 656a624f 20737463
616d6572 6e696e69 6e692067 66326620 6e695f73 5f65646f 68636163 6e6f2065
Call Trace:
[<c13a83a0>] dump_stack+0x5f/0x8f
[<c11c7276>] slab_err+0x76/0x80
[<c11cbfc0>] ? __kmem_cache_shutdown+0x100/0x2f0
[<c11cbfc0>] ? __kmem_cache_shutdown+0x100/0x2f0
[<c11cbfe5>] __kmem_cache_shutdown+0x125/0x2f0
[<c1198a38>] kmem_cache_destroy+0x158/0x1f0
[<c176b43d>] ? mutex_unlock+0xd/0x10
[<f8f15aa3>] exit_f2fs_fs+0x4b/0x5a8 [f2fs]
[<c10f596c>] SyS_delete_module+0x16c/0x1d0
[<c1001b10>] ? do_fast_syscall_32+0x30/0x1c0
[<c13c59bf>] ? __this_cpu_preempt_check+0xf/0x20
[<c10afa7d>] ? trace_hardirqs_on_caller+0xdd/0x210
[<c10ad50b>] ? trace_hardirqs_off+0xb/0x10
[<c1001b81>] do_fast_syscall_32+0xa1/0x1c0
[<c176d888>] sysenter_past_esp+0x45/0x74
INFO: Object 0xd1e6d9e0 @offset=6624
kmem_cache_destroy f2fs_inode_cache: Slab cache still has objects
CPU: 3 PID: 7455 Comm: rmmod Tainted: G B O 4.6.0-rc4+ #16
Hardware name: innotek GmbH VirtualBox/VirtualBox, BIOS VirtualBox 12/01/2006
00000286 00000286 d062fef4 c13a83a0 f174b000 d062ff14 d062ff28 c1198ac7
c197fe18 f3c5b980 d062ff20 000d04f2 d062ff0c d062ff0c d062ff14 d062ff14
f8f20dc0 fffffff5 d062e000 d062ff30 f8f15aa3 d062ff7c c10f596c 73663266
Call Trace:
[<c13a83a0>] dump_stack+0x5f/0x8f
[<c1198ac7>] kmem_cache_destroy+0x1e7/0x1f0
[<f8f15aa3>] exit_f2fs_fs+0x4b/0x5a8 [f2fs]
[<c10f596c>] SyS_delete_module+0x16c/0x1d0
[<c1001b10>] ? do_fast_syscall_32+0x30/0x1c0
[<c13c59bf>] ? __this_cpu_preempt_check+0xf/0x20
[<c10afa7d>] ? trace_hardirqs_on_caller+0xdd/0x210
[<c10ad50b>] ? trace_hardirqs_off+0xb/0x10
[<c1001b81>] do_fast_syscall_32+0xa1/0x1c0
[<c176d888>] sysenter_past_esp+0x45/0x74
The reason is: in recovery flow, we use delayed iput mechanism for directory
which has recovered dentry block. It means the reference of inode will be
held until last dirty dentry page being writebacked.
But when we mount f2fs with inline_dentry option, during recovery, dirent
may only be recovered into dir inode page rather than dentry page, so there
are no chance for us to release inode reference in ->writepage when
writebacking last dentry page.
We can call paired iget/iput explicityly for inline_dentry case, but for
non-inline_dentry case, iput will call writeback_single_inode to write all
data pages synchronously, but during recovery, ->writepages of f2fs skips
writing all pages, result in losing dirent.
This patch fixes this issue by obsoleting old mechanism, and introduce a
new dir_list to hold all directory inodes which has recovered datas until
finishing recovery.
Signed-off-by: Chao Yu <yuchao0@huawei.com>
Signed-off-by: Jaegeuk Kim <jaegeuk@kernel.org>
2016-05-07 16:15:05 +08:00
|
|
|
err = recover_dentry(entry->inode, page, dir_list);
|
2014-09-12 05:29:06 +08:00
|
|
|
if (err) {
|
|
|
|
f2fs_put_page(page, 1);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
2013-03-20 18:01:06 +08:00
|
|
|
err = do_recover_data(sbi, entry->inode, page, blkaddr);
|
2014-09-12 04:49:55 +08:00
|
|
|
if (err) {
|
|
|
|
f2fs_put_page(page, 1);
|
2013-05-20 09:26:09 +08:00
|
|
|
break;
|
2014-09-12 04:49:55 +08:00
|
|
|
}
|
2012-11-02 16:13:32 +08:00
|
|
|
|
2016-04-29 20:13:37 +08:00
|
|
|
if (entry->blkaddr == blkaddr)
|
|
|
|
del_fsync_inode(entry);
|
2012-11-02 16:13:32 +08:00
|
|
|
next:
|
|
|
|
/* check next segment */
|
|
|
|
blkaddr = next_blkaddr_of_node(page);
|
2014-09-12 04:49:55 +08:00
|
|
|
f2fs_put_page(page, 1);
|
2012-11-02 16:13:32 +08:00
|
|
|
}
|
2013-03-20 18:01:06 +08:00
|
|
|
if (!err)
|
|
|
|
allocate_new_segments(sbi);
|
|
|
|
return err;
|
2012-11-02 16:13:32 +08:00
|
|
|
}
|
|
|
|
|
2016-03-24 07:12:58 +08:00
|
|
|
/*
 * Entry point of roll-forward recovery.
 *
 * @sbi:        filesystem instance to recover
 * @check_only: when true, only report whether there is anything to recover;
 *              recover_data() and the recovery checkpoint are skipped
 *
 * The superblock flags are saved on entry and restored before returning, so
 * a read-only mount is made writable only for the duration of this call.
 *
 * Returns 1 if @check_only is set and find_fsync_dnodes() produced a
 * non-empty inode list.  Otherwise returns 0 on success, -ENOMEM if the
 * fsync entry slab cannot be created, or the error reported by
 * find_fsync_dnodes(), recover_data() or write_checkpoint().
 */
int recover_fsync_data(struct f2fs_sb_info *sbi, bool check_only)
{
	struct list_head inode_list;
	struct list_head dir_list;
	int err;
	int ret = 0;
	unsigned long s_flags = sbi->sb->s_flags;
	bool need_writecp = false;
#ifdef CONFIG_QUOTA
	int quota_enabled;
#endif

	if (s_flags & MS_RDONLY) {
		/*
		 * The notice used to read "orphan cleanup on readonly fs",
		 * which names a different code path; say what is really
		 * happening here.
		 */
		f2fs_msg(sbi->sb, KERN_INFO,
				"recover fsync data on readonly fs");
		sbi->sb->s_flags &= ~MS_RDONLY;
	}

#ifdef CONFIG_QUOTA
	/* Needed for iput() to work correctly and not trash data */
	sbi->sb->s_flags |= MS_ACTIVE;
	/* Turn on quotas so that they are updated correctly */
	quota_enabled = f2fs_enable_quota_files(sbi, s_flags & MS_RDONLY);
#endif

	fsync_entry_slab = f2fs_kmem_cache_create("f2fs_fsync_inode_entry",
			sizeof(struct fsync_inode_entry));
	if (!fsync_entry_slab) {
		err = -ENOMEM;
		goto out;
	}

	INIT_LIST_HEAD(&inode_list);
	INIT_LIST_HEAD(&dir_list);

	/* prevent checkpoint */
	mutex_lock(&sbi->cp_mutex);

	/* step #1: find fsynced inode numbers */
	err = find_fsync_dnodes(sbi, &inode_list, check_only);
	if (err || list_empty(&inode_list))
		goto skip;

	if (check_only) {
		/* something is recoverable; report it without replaying */
		ret = 1;
		goto skip;
	}

	need_writecp = true;

	/* step #2: recover data */
	err = recover_data(sbi, &inode_list, &dir_list);
	if (!err)
		f2fs_bug_on(sbi, !list_empty(&inode_list));
skip:
	destroy_fsync_dnodes(&inode_list);

	/* truncate meta pages to be used by the recovery */
	truncate_inode_pages_range(META_MAPPING(sbi),
			(loff_t)MAIN_BLKADDR(sbi) << PAGE_SHIFT, -1);

	/* on failure, also truncate the node and meta mappings entirely */
	if (err) {
		truncate_inode_pages_final(NODE_MAPPING(sbi));
		truncate_inode_pages_final(META_MAPPING(sbi));
	}

	clear_sbi_flag(sbi, SBI_POR_DOING);
	mutex_unlock(&sbi->cp_mutex);

	/* let's drop all the directory inodes for clean checkpoint */
	destroy_fsync_dnodes(&dir_list);

	/* a recovery checkpoint is written only if recover_data() succeeded */
	if (!err && need_writecp) {
		struct cp_control cpc = {
			.reason = CP_RECOVERY,
		};
		err = write_checkpoint(sbi, &cpc);
	}

	kmem_cache_destroy(fsync_entry_slab);
out:
#ifdef CONFIG_QUOTA
	/* Turn quotas off */
	if (quota_enabled)
		f2fs_quota_off_umount(sbi->sb);
#endif
	sbi->sb->s_flags = s_flags; /* Restore MS_RDONLY status */

	return ret ? ret : err;
}
|