ext4: make the zero-out chunk size tunable
Currently in ext4 the length of the zero-out chunk is set to 7 file system blocks. But if an inode has uninitialized extents from using fallocate to preallocate space, and the workload issues many random writes, this can fragment the extent tree and make it grow unnecessarily. So create a new sysfs tunable, extent_max_zeroout_kb, which controls the maximum size where blocks will be zeroed out instead of creating a new uninitialized extent. The default of this has been set to 32kb. CC: Zach Brown <zab@zabbo.net> CC: Andreas Dilger <adilger@dilger.ca> Signed-off-by: Zheng Liu <wenqing.lz@taobao.com> Signed-off-by: "Theodore Ts'o" <tytso@mit.edu>
This commit is contained in:
parent
8137029172
commit
67a5da564f
|
@ -96,3 +96,16 @@ Contact: "Theodore Ts'o" <tytso@mit.edu>
|
|||
Description:
|
||||
The maximum number of megabytes the writeback code will
|
||||
try to write out before moving on to another inode.
|
||||
|
||||
What: /sys/fs/ext4/<disk>/extent_max_zeroout_kb
|
||||
Date: August 2012
|
||||
Contact: "Theodore Ts'o" <tytso@mit.edu>
|
||||
Description:
|
||||
The maximum number of kilobytes which will be zeroed
|
||||
out in preference to creating a new uninitialized
|
||||
extent when manipulating an inode's extent tree. Note
|
||||
that using a larger value will increase the
|
||||
variability of time necessary to complete a random
|
||||
write operation (since a 4k random write might turn
|
||||
into a much larger write due to the zeroout
|
||||
operation).
|
||||
|
|
|
@ -1271,6 +1271,9 @@ struct ext4_sb_info {
|
|||
unsigned long s_sectors_written_start;
|
||||
u64 s_kbytes_written;
|
||||
|
||||
/* the size of zero-out chunk */
|
||||
unsigned int s_extent_max_zeroout_kb;
|
||||
|
||||
unsigned int s_log_groups_per_flex;
|
||||
struct flex_groups *s_flex_groups;
|
||||
|
||||
|
|
|
@ -3085,7 +3085,6 @@ out:
|
|||
return err ? err : map->m_len;
|
||||
}
|
||||
|
||||
#define EXT4_EXT_ZERO_LEN 7
|
||||
/*
|
||||
* This function is called by ext4_ext_map_blocks() if someone tries to write
|
||||
* to an uninitialized extent. It may result in splitting the uninitialized
|
||||
|
@ -3111,13 +3110,14 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
|
|||
struct ext4_map_blocks *map,
|
||||
struct ext4_ext_path *path)
|
||||
{
|
||||
struct ext4_sb_info *sbi;
|
||||
struct ext4_extent_header *eh;
|
||||
struct ext4_map_blocks split_map;
|
||||
struct ext4_extent zero_ex;
|
||||
struct ext4_extent *ex;
|
||||
ext4_lblk_t ee_block, eof_block;
|
||||
unsigned int ee_len, depth;
|
||||
int allocated;
|
||||
int allocated, max_zeroout = 0;
|
||||
int err = 0;
|
||||
int split_flag = 0;
|
||||
|
||||
|
@ -3125,6 +3125,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
|
|||
"block %llu, max_blocks %u\n", inode->i_ino,
|
||||
(unsigned long long)map->m_lblk, map->m_len);
|
||||
|
||||
sbi = EXT4_SB(inode->i_sb);
|
||||
eof_block = (inode->i_size + inode->i_sb->s_blocksize - 1) >>
|
||||
inode->i_sb->s_blocksize_bits;
|
||||
if (eof_block < map->m_lblk + map->m_len)
|
||||
|
@ -3224,9 +3225,12 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
|
|||
*/
|
||||
split_flag |= ee_block + ee_len <= eof_block ? EXT4_EXT_MAY_ZEROOUT : 0;
|
||||
|
||||
/* If extent has less than 2*EXT4_EXT_ZERO_LEN zerout directly */
|
||||
if (ee_len <= 2*EXT4_EXT_ZERO_LEN &&
|
||||
(EXT4_EXT_MAY_ZEROOUT & split_flag)) {
|
||||
if (EXT4_EXT_MAY_ZEROOUT & split_flag)
|
||||
max_zeroout = sbi->s_extent_max_zeroout_kb >>
|
||||
inode->i_sb->s_blocksize_bits;
|
||||
|
||||
/* If extent is no larger than the s_extent_max_zeroout_kb limit, zeroout directly */
|
||||
if (max_zeroout && (ee_len <= max_zeroout)) {
|
||||
err = ext4_ext_zeroout(inode, ex);
|
||||
if (err)
|
||||
goto out;
|
||||
|
@ -3250,9 +3254,8 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
|
|||
split_map.m_lblk = map->m_lblk;
|
||||
split_map.m_len = map->m_len;
|
||||
|
||||
if (allocated > map->m_len) {
|
||||
if (allocated <= EXT4_EXT_ZERO_LEN &&
|
||||
(EXT4_EXT_MAY_ZEROOUT & split_flag)) {
|
||||
if (max_zeroout && (allocated > map->m_len)) {
|
||||
if (allocated <= max_zeroout) {
|
||||
/* case 3 */
|
||||
zero_ex.ee_block =
|
||||
cpu_to_le32(map->m_lblk);
|
||||
|
@ -3264,9 +3267,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
|
|||
goto out;
|
||||
split_map.m_lblk = map->m_lblk;
|
||||
split_map.m_len = allocated;
|
||||
} else if ((map->m_lblk - ee_block + map->m_len <
|
||||
EXT4_EXT_ZERO_LEN) &&
|
||||
(EXT4_EXT_MAY_ZEROOUT & split_flag)) {
|
||||
} else if (map->m_lblk - ee_block + map->m_len < max_zeroout) {
|
||||
/* case 2 */
|
||||
if (map->m_lblk != ee_block) {
|
||||
zero_ex.ee_block = ex->ee_block;
|
||||
|
@ -3286,7 +3287,7 @@ static int ext4_ext_convert_to_initialized(handle_t *handle,
|
|||
}
|
||||
|
||||
allocated = ext4_split_extent(handle, inode, path,
|
||||
&split_map, split_flag, 0);
|
||||
&split_map, split_flag, 0);
|
||||
if (allocated < 0)
|
||||
err = allocated;
|
||||
|
||||
|
|
|
@ -2541,6 +2541,7 @@ EXT4_RW_ATTR_SBI_UI(mb_order2_req, s_mb_order2_reqs);
|
|||
EXT4_RW_ATTR_SBI_UI(mb_stream_req, s_mb_stream_request);
|
||||
EXT4_RW_ATTR_SBI_UI(mb_group_prealloc, s_mb_group_prealloc);
|
||||
EXT4_RW_ATTR_SBI_UI(max_writeback_mb_bump, s_max_writeback_mb_bump);
|
||||
EXT4_RW_ATTR_SBI_UI(extent_max_zeroout_kb, s_extent_max_zeroout_kb);
|
||||
EXT4_ATTR(trigger_fs_error, 0200, NULL, trigger_test_error);
|
||||
|
||||
static struct attribute *ext4_attrs[] = {
|
||||
|
@ -2556,6 +2557,7 @@ static struct attribute *ext4_attrs[] = {
|
|||
ATTR_LIST(mb_stream_req),
|
||||
ATTR_LIST(mb_group_prealloc),
|
||||
ATTR_LIST(max_writeback_mb_bump),
|
||||
ATTR_LIST(extent_max_zeroout_kb),
|
||||
ATTR_LIST(trigger_fs_error),
|
||||
NULL,
|
||||
};
|
||||
|
@ -3756,6 +3758,7 @@ static int ext4_fill_super(struct super_block *sb, void *data, int silent)
|
|||
|
||||
sbi->s_stripe = ext4_get_stripe_size(sbi);
|
||||
sbi->s_max_writeback_mb_bump = 128;
|
||||
sbi->s_extent_max_zeroout_kb = 32;
|
||||
|
||||
/*
|
||||
* set up enough so that it can read an inode
|
||||
|
|
Loading…
Reference in New Issue