ext4: add largedir feature

This INCOMPAT_LARGEDIR feature allows larger directories to be created
in ldiskfs, both with directory sizes over 2GB and a maximum htree
depth of 3 instead of the current limit of 2. These features are needed
in order to exceed the current limit of approximately 10M entries in a
single directory.

This patch was originally written by Yang Sheng to support the Lustre server.

[ Bumped the credits needed to update an indexed directory -- tytso ]

Signed-off-by: Liang Zhen <liang.zhen@intel.com>
Signed-off-by: Yang Sheng <yang.sheng@intel.com>
Signed-off-by: Artem Blagodarenko <artem.blagodarenko@seagate.com>
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Andreas Dilger <andreas.dilger@intel.com>
This commit is contained in:
Artem Blagodarenko 2017-06-21 21:09:57 -04:00 committed by Theodore Ts'o
parent 67a7d5f561
commit e08ac99fa2
4 changed files with 113 additions and 47 deletions

View File

@ -1800,7 +1800,8 @@ EXT4_FEATURE_INCOMPAT_FUNCS(encrypt, ENCRYPT)
EXT4_FEATURE_INCOMPAT_MMP | \ EXT4_FEATURE_INCOMPAT_MMP | \
EXT4_FEATURE_INCOMPAT_INLINE_DATA | \ EXT4_FEATURE_INCOMPAT_INLINE_DATA | \
EXT4_FEATURE_INCOMPAT_ENCRYPT | \ EXT4_FEATURE_INCOMPAT_ENCRYPT | \
EXT4_FEATURE_INCOMPAT_CSUM_SEED) EXT4_FEATURE_INCOMPAT_CSUM_SEED | \
EXT4_FEATURE_INCOMPAT_LARGEDIR)
#define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \ #define EXT4_FEATURE_RO_COMPAT_SUPP (EXT4_FEATURE_RO_COMPAT_SPARSE_SUPER| \
EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \ EXT4_FEATURE_RO_COMPAT_LARGE_FILE| \
EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \ EXT4_FEATURE_RO_COMPAT_GDT_CSUM| \
@ -2126,6 +2127,16 @@ ext4_group_first_block_no(struct super_block *sb, ext4_group_t group_no)
*/ */
#define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1)) #define ERR_BAD_DX_DIR (-(MAX_ERRNO - 1))
/* htree levels for ext4 */
#define EXT4_HTREE_LEVEL_COMPAT 2
#define EXT4_HTREE_LEVEL 3
/*
 * Return the maximum htree depth for this filesystem:
 * EXT4_HTREE_LEVEL when the largedir feature is set on @sb,
 * otherwise EXT4_HTREE_LEVEL_COMPAT.
 */
static inline int ext4_dir_htree_level(struct super_block *sb)
{
	if (ext4_has_feature_largedir(sb))
		return EXT4_HTREE_LEVEL;

	return EXT4_HTREE_LEVEL_COMPAT;
}
/* /*
* Timeout and state flag for lazy initialization inode thread. * Timeout and state flag for lazy initialization inode thread.
*/ */
@ -2756,13 +2767,15 @@ static inline void ext4_r_blocks_count_set(struct ext4_super_block *es,
es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32); es->s_r_blocks_count_hi = cpu_to_le32(blk >> 32);
} }
static inline loff_t ext4_isize(struct ext4_inode *raw_inode) static inline loff_t ext4_isize(struct super_block *sb,
struct ext4_inode *raw_inode)
{ {
if (S_ISREG(le16_to_cpu(raw_inode->i_mode))) if (ext4_has_feature_largedir(sb) ||
S_ISREG(le16_to_cpu(raw_inode->i_mode)))
return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) | return ((loff_t)le32_to_cpu(raw_inode->i_size_high) << 32) |
le32_to_cpu(raw_inode->i_size_lo); le32_to_cpu(raw_inode->i_size_lo);
else
return (loff_t) le32_to_cpu(raw_inode->i_size_lo); return (loff_t) le32_to_cpu(raw_inode->i_size_lo);
} }
static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size) static inline void ext4_isize_set(struct ext4_inode *raw_inode, loff_t i_size)

View File

@ -77,7 +77,14 @@
#define EXT4_RESERVE_TRANS_BLOCKS 12U #define EXT4_RESERVE_TRANS_BLOCKS 12U
#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 8 /*
* Number of credits needed if we need to insert an entry into a
* directory. For each new index block, we need 4 blocks (old index
* block, new index block, bitmap block, bg summary). For normal
* htree directories there are 2 levels; if the largedir feature is
* enabled there are 3 levels.
*/
#define EXT4_INDEX_EXTRA_TRANS_BLOCKS 12U
#ifdef CONFIG_QUOTA #ifdef CONFIG_QUOTA
/* Amount of blocks needed for quota update - we know that the structure was /* Amount of blocks needed for quota update - we know that the structure was

View File

@ -4712,7 +4712,7 @@ struct inode *ext4_iget(struct super_block *sb, unsigned long ino)
if (ext4_has_feature_64bit(sb)) if (ext4_has_feature_64bit(sb))
ei->i_file_acl |= ei->i_file_acl |=
((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32; ((__u64)le16_to_cpu(raw_inode->i_file_acl_high)) << 32;
inode->i_size = ext4_isize(raw_inode); inode->i_size = ext4_isize(sb, raw_inode);
if ((size = i_size_read(inode)) < 0) { if ((size = i_size_read(inode)) < 0) {
EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size); EXT4_ERROR_INODE(inode, "bad i_size value: %lld", size);
ret = -EFSCORRUPTED; ret = -EFSCORRUPTED;
@ -5037,7 +5037,7 @@ static int ext4_do_update_inode(handle_t *handle,
raw_inode->i_file_acl_high = raw_inode->i_file_acl_high =
cpu_to_le16(ei->i_file_acl >> 32); cpu_to_le16(ei->i_file_acl >> 32);
raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl); raw_inode->i_file_acl_lo = cpu_to_le32(ei->i_file_acl);
if (ei->i_disksize != ext4_isize(raw_inode)) { if (ei->i_disksize != ext4_isize(inode->i_sb, raw_inode)) {
ext4_isize_set(raw_inode, ei->i_disksize); ext4_isize_set(raw_inode, ei->i_disksize);
need_datasync = 1; need_datasync = 1;
} }

View File

@ -513,7 +513,7 @@ ext4_next_entry(struct ext4_dir_entry_2 *p, unsigned long blocksize)
static inline ext4_lblk_t dx_get_block(struct dx_entry *entry) static inline ext4_lblk_t dx_get_block(struct dx_entry *entry)
{ {
return le32_to_cpu(entry->block) & 0x00ffffff; return le32_to_cpu(entry->block) & 0x0fffffff;
} }
static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value) static inline void dx_set_block(struct dx_entry *entry, ext4_lblk_t value)
@ -739,6 +739,7 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR); struct dx_frame *ret_err = ERR_PTR(ERR_BAD_DX_DIR);
u32 hash; u32 hash;
memset(frame_in, 0, EXT4_HTREE_LEVEL * sizeof(frame_in[0]));
frame->bh = ext4_read_dirblock(dir, 0, INDEX); frame->bh = ext4_read_dirblock(dir, 0, INDEX);
if (IS_ERR(frame->bh)) if (IS_ERR(frame->bh))
return (struct dx_frame *) frame->bh; return (struct dx_frame *) frame->bh;
@ -768,9 +769,15 @@ dx_probe(struct ext4_filename *fname, struct inode *dir,
} }
indirect = root->info.indirect_levels; indirect = root->info.indirect_levels;
if (indirect > 1) { if (indirect >= ext4_dir_htree_level(dir->i_sb)) {
ext4_warning_inode(dir, "Unimplemented hash depth: %#06x", ext4_warning(dir->i_sb,
root->info.indirect_levels); "Directory (ino: %lu) htree depth %#06x exceed"
"supported value", dir->i_ino,
ext4_dir_htree_level(dir->i_sb));
if (ext4_dir_htree_level(dir->i_sb) < EXT4_HTREE_LEVEL) {
ext4_warning(dir->i_sb, "Enable large directory "
"feature to access it");
}
goto fail; goto fail;
} }
@ -859,12 +866,19 @@ fail:
static void dx_release(struct dx_frame *frames) static void dx_release(struct dx_frame *frames)
{ {
struct dx_root_info *info;
int i;
if (frames[0].bh == NULL) if (frames[0].bh == NULL)
return; return;
if (((struct dx_root *)frames[0].bh->b_data)->info.indirect_levels) info = &((struct dx_root *)frames[0].bh->b_data)->info;
brelse(frames[1].bh); for (i = 0; i <= info->indirect_levels; i++) {
brelse(frames[0].bh); if (frames[i].bh == NULL)
break;
brelse(frames[i].bh);
frames[i].bh = NULL;
}
} }
/* /*
@ -1050,7 +1064,7 @@ int ext4_htree_fill_tree(struct file *dir_file, __u32 start_hash,
{ {
struct dx_hash_info hinfo; struct dx_hash_info hinfo;
struct ext4_dir_entry_2 *de; struct ext4_dir_entry_2 *de;
struct dx_frame frames[2], *frame; struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
struct inode *dir; struct inode *dir;
ext4_lblk_t block; ext4_lblk_t block;
int count = 0; int count = 0;
@ -1485,7 +1499,7 @@ static struct buffer_head * ext4_dx_find_entry(struct inode *dir,
struct ext4_dir_entry_2 **res_dir) struct ext4_dir_entry_2 **res_dir)
{ {
struct super_block * sb = dir->i_sb; struct super_block * sb = dir->i_sb;
struct dx_frame frames[2], *frame; struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
struct buffer_head *bh; struct buffer_head *bh;
ext4_lblk_t block; ext4_lblk_t block;
int retval; int retval;
@ -1889,7 +1903,7 @@ static int add_dirent_to_buf(handle_t *handle, struct ext4_filename *fname,
*/ */
dir->i_mtime = dir->i_ctime = current_time(dir); dir->i_mtime = dir->i_ctime = current_time(dir);
ext4_update_dx_flag(dir); ext4_update_dx_flag(dir);
dir->i_version++; inode_inc_iversion(dir);
ext4_mark_inode_dirty(handle, dir); ext4_mark_inode_dirty(handle, dir);
BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata"); BUFFER_TRACE(bh, "call ext4_handle_dirty_metadata");
err = ext4_handle_dirty_dirent_node(handle, dir, bh); err = ext4_handle_dirty_dirent_node(handle, dir, bh);
@ -1908,7 +1922,7 @@ static int make_indexed_dir(handle_t *handle, struct ext4_filename *fname,
{ {
struct buffer_head *bh2; struct buffer_head *bh2;
struct dx_root *root; struct dx_root *root;
struct dx_frame frames[2], *frame; struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
struct dx_entry *entries; struct dx_entry *entries;
struct ext4_dir_entry_2 *de, *de2; struct ext4_dir_entry_2 *de, *de2;
struct ext4_dir_entry_tail *t; struct ext4_dir_entry_tail *t;
@ -2127,13 +2141,16 @@ out:
static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname, static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
struct inode *dir, struct inode *inode) struct inode *dir, struct inode *inode)
{ {
struct dx_frame frames[2], *frame; struct dx_frame frames[EXT4_HTREE_LEVEL], *frame;
struct dx_entry *entries, *at; struct dx_entry *entries, *at;
struct buffer_head *bh; struct buffer_head *bh;
struct super_block *sb = dir->i_sb; struct super_block *sb = dir->i_sb;
struct ext4_dir_entry_2 *de; struct ext4_dir_entry_2 *de;
int restart;
int err; int err;
again:
restart = 0;
frame = dx_probe(fname, dir, NULL, frames); frame = dx_probe(fname, dir, NULL, frames);
if (IS_ERR(frame)) if (IS_ERR(frame))
return PTR_ERR(frame); return PTR_ERR(frame);
@ -2155,24 +2172,44 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
if (err != -ENOSPC) if (err != -ENOSPC)
goto cleanup; goto cleanup;
err = 0;
/* Block full, should compress but for now just split */ /* Block full, should compress but for now just split */
dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n", dxtrace(printk(KERN_DEBUG "using %u of %u node entries\n",
dx_get_count(entries), dx_get_limit(entries))); dx_get_count(entries), dx_get_limit(entries)));
/* Need to split index? */ /* Need to split index? */
if (dx_get_count(entries) == dx_get_limit(entries)) { if (dx_get_count(entries) == dx_get_limit(entries)) {
ext4_lblk_t newblock; ext4_lblk_t newblock;
unsigned icount = dx_get_count(entries); int levels = frame - frames + 1;
int levels = frame - frames; unsigned int icount;
int add_level = 1;
struct dx_entry *entries2; struct dx_entry *entries2;
struct dx_node *node2; struct dx_node *node2;
struct buffer_head *bh2; struct buffer_head *bh2;
if (levels && (dx_get_count(frames->entries) == while (frame > frames) {
dx_get_limit(frames->entries))) { if (dx_get_count((frame - 1)->entries) <
ext4_warning_inode(dir, "Directory index full!"); dx_get_limit((frame - 1)->entries)) {
add_level = 0;
break;
}
frame--; /* split higher index block */
at = frame->at;
entries = frame->entries;
restart = 1;
}
if (add_level && levels == ext4_dir_htree_level(sb)) {
ext4_warning(sb, "Directory (ino: %lu) index full, "
"reach max htree level :%d",
dir->i_ino, levels);
if (ext4_dir_htree_level(sb) < EXT4_HTREE_LEVEL) {
ext4_warning(sb, "Large directory feature is "
"not enabled on this "
"filesystem");
}
err = -ENOSPC; err = -ENOSPC;
goto cleanup; goto cleanup;
} }
icount = dx_get_count(entries);
bh2 = ext4_append(handle, dir, &newblock); bh2 = ext4_append(handle, dir, &newblock);
if (IS_ERR(bh2)) { if (IS_ERR(bh2)) {
err = PTR_ERR(bh2); err = PTR_ERR(bh2);
@ -2187,7 +2224,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
err = ext4_journal_get_write_access(handle, frame->bh); err = ext4_journal_get_write_access(handle, frame->bh);
if (err) if (err)
goto journal_error; goto journal_error;
if (levels) { if (!add_level) {
unsigned icount1 = icount/2, icount2 = icount - icount1; unsigned icount1 = icount/2, icount2 = icount - icount1;
unsigned hash2 = dx_get_hash(entries + icount1); unsigned hash2 = dx_get_hash(entries + icount1);
dxtrace(printk(KERN_DEBUG "Split index %i/%i\n", dxtrace(printk(KERN_DEBUG "Split index %i/%i\n",
@ -2195,7 +2232,7 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */ BUFFER_TRACE(frame->bh, "get_write_access"); /* index root */
err = ext4_journal_get_write_access(handle, err = ext4_journal_get_write_access(handle,
frames[0].bh); (frame - 1)->bh);
if (err) if (err)
goto journal_error; goto journal_error;
@ -2211,17 +2248,25 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
frame->entries = entries = entries2; frame->entries = entries = entries2;
swap(frame->bh, bh2); swap(frame->bh, bh2);
} }
dx_insert_block(frames + 0, hash2, newblock); dx_insert_block((frame - 1), hash2, newblock);
dxtrace(dx_show_index("node", frames[1].entries)); dxtrace(dx_show_index("node", frame->entries));
dxtrace(dx_show_index("node", dxtrace(dx_show_index("node",
((struct dx_node *) bh2->b_data)->entries)); ((struct dx_node *) bh2->b_data)->entries));
err = ext4_handle_dirty_dx_node(handle, dir, bh2); err = ext4_handle_dirty_dx_node(handle, dir, bh2);
if (err) if (err)
goto journal_error; goto journal_error;
brelse (bh2); brelse (bh2);
err = ext4_handle_dirty_dx_node(handle, dir,
(frame - 1)->bh);
if (err)
goto journal_error;
if (restart) {
err = ext4_handle_dirty_dx_node(handle, dir,
frame->bh);
goto journal_error;
}
} else { } else {
dxtrace(printk(KERN_DEBUG struct dx_root *dxroot;
"Creating second level index...\n"));
memcpy((char *) entries2, (char *) entries, memcpy((char *) entries2, (char *) entries,
icount * sizeof(struct dx_entry)); icount * sizeof(struct dx_entry));
dx_set_limit(entries2, dx_node_limit(dir)); dx_set_limit(entries2, dx_node_limit(dir));
@ -2229,22 +2274,18 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
/* Set up root */ /* Set up root */
dx_set_count(entries, 1); dx_set_count(entries, 1);
dx_set_block(entries + 0, newblock); dx_set_block(entries + 0, newblock);
((struct dx_root *) frames[0].bh->b_data)->info.indirect_levels = 1; dxroot = (struct dx_root *)frames[0].bh->b_data;
dxroot->info.indirect_levels += 1;
/* Add new access path frame */ dxtrace(printk(KERN_DEBUG
frame = frames + 1; "Creating %d level index...\n",
frame->at = at = at - entries + entries2; info->indirect_levels));
frame->entries = entries = entries2; err = ext4_handle_dirty_dx_node(handle, dir, frame->bh);
frame->bh = bh2;
err = ext4_journal_get_write_access(handle,
frame->bh);
if (err) if (err)
goto journal_error; goto journal_error;
} err = ext4_handle_dirty_dx_node(handle, dir, bh2);
err = ext4_handle_dirty_dx_node(handle, dir, frames[0].bh); brelse(bh2);
if (err) { restart = 1;
ext4_std_error(inode->i_sb, err); goto journal_error;
goto cleanup;
} }
} }
de = do_split(handle, dir, &bh, frame, &fname->hinfo); de = do_split(handle, dir, &bh, frame, &fname->hinfo);
@ -2256,10 +2297,15 @@ static int ext4_dx_add_entry(handle_t *handle, struct ext4_filename *fname,
goto cleanup; goto cleanup;
journal_error: journal_error:
ext4_std_error(dir->i_sb, err); ext4_std_error(dir->i_sb, err); /* this is a no-op if err == 0 */
cleanup: cleanup:
brelse(bh); brelse(bh);
dx_release(frames); dx_release(frames);
/* If @restart is set, the htree path has been changed, so we need to
* repeat dx_probe() to find a valid htree path.
*/
if (restart && err == 0)
goto again;
return err; return err;
} }
@ -2296,7 +2342,7 @@ int ext4_generic_delete_entry(handle_t *handle,
blocksize); blocksize);
else else
de->inode = 0; de->inode = 0;
dir->i_version++; inode_inc_iversion(dir);
return 0; return 0;
} }
i += ext4_rec_len_from_disk(de->rec_len, blocksize); i += ext4_rec_len_from_disk(de->rec_len, blocksize);