OpenCloudOS-Kernel/fs/ocfs2/refcounttree.c

4794 lines
120 KiB
C
Raw Normal View History

// SPDX-License-Identifier: GPL-2.0-only
/*
* refcounttree.c
*
* Copyright (C) 2009 Oracle. All rights reserved.
*/
#include <linux/sort.h>
#include <cluster/masklog.h>
#include "ocfs2.h"
#include "inode.h"
#include "alloc.h"
#include "suballoc.h"
#include "journal.h"
#include "uptodate.h"
#include "super.h"
#include "buffer_head_io.h"
#include "blockcheck.h"
#include "refcounttree.h"
#include "sysfile.h"
#include "dlmglue.h"
#include "extent_map.h"
#include "aops.h"
#include "xattr.h"
#include "namei.h"
#include "ocfs2_trace.h"
#include "file.h"
#include <linux/bio.h>
#include <linux/blkdev.h>
#include <linux/slab.h>
#include <linux/writeback.h>
#include <linux/pagevec.h>
#include <linux/swap.h>
#include <linux/security.h>
#include <linux/fsnotify.h>
#include <linux/quotaops.h>
#include <linux/namei.h>
#include <linux/mount.h>
#include <linux/posix_acl.h>
struct ocfs2_cow_context {
struct inode *inode;
u32 cow_start;
u32 cow_len;
struct ocfs2_extent_tree data_et;
struct ocfs2_refcount_tree *ref_tree;
struct buffer_head *ref_root_bh;
struct ocfs2_alloc_context *meta_ac;
struct ocfs2_alloc_context *data_ac;
struct ocfs2_cached_dealloc_ctxt dealloc;
void *cow_object;
struct ocfs2_post_refcount *post_refcount;
int extra_credits;
int (*get_clusters)(struct ocfs2_cow_context *context,
u32 v_cluster, u32 *p_cluster,
u32 *num_clusters,
unsigned int *extent_flags);
int (*cow_duplicate_clusters)(handle_t *handle,
struct inode *inode,
u32 cpos, u32 old_cluster,
u32 new_cluster, u32 new_len);
};
static inline struct ocfs2_refcount_tree *
cache_info_to_refcount(struct ocfs2_caching_info *ci)
{
return container_of(ci, struct ocfs2_refcount_tree, rf_ci);
}
static int ocfs2_validate_refcount_block(struct super_block *sb,
struct buffer_head *bh)
{
int rc;
struct ocfs2_refcount_block *rb =
(struct ocfs2_refcount_block *)bh->b_data;
trace_ocfs2_validate_refcount_block((unsigned long long)bh->b_blocknr);
BUG_ON(!buffer_uptodate(bh));
/*
* If the ecc fails, we return the error but otherwise
* leave the filesystem running. We know any error is
* local to this block.
*/
rc = ocfs2_validate_meta_ecc(sb, bh->b_data, &rb->rf_check);
if (rc) {
mlog(ML_ERROR, "Checksum failed for refcount block %llu\n",
(unsigned long long)bh->b_blocknr);
return rc;
}
if (!OCFS2_IS_VALID_REFCOUNT_BLOCK(rb)) {
rc = ocfs2_error(sb,
"Refcount block #%llu has bad signature %.*s\n",
(unsigned long long)bh->b_blocknr, 7,
rb->rf_signature);
goto out;
}
if (le64_to_cpu(rb->rf_blkno) != bh->b_blocknr) {
rc = ocfs2_error(sb,
"Refcount block #%llu has an invalid rf_blkno of %llu\n",
(unsigned long long)bh->b_blocknr,
(unsigned long long)le64_to_cpu(rb->rf_blkno));
goto out;
}
if (le32_to_cpu(rb->rf_fs_generation) != OCFS2_SB(sb)->fs_generation) {
rc = ocfs2_error(sb,
"Refcount block #%llu has an invalid rf_fs_generation of #%u\n",
(unsigned long long)bh->b_blocknr,
le32_to_cpu(rb->rf_fs_generation));
goto out;
}
out:
return rc;
}
static int ocfs2_read_refcount_block(struct ocfs2_caching_info *ci,
u64 rb_blkno,
struct buffer_head **bh)
{
int rc;
struct buffer_head *tmp = *bh;
rc = ocfs2_read_block(ci, rb_blkno, &tmp,
ocfs2_validate_refcount_block);
/* If ocfs2_read_block() got us a new bh, pass it up. */
if (!rc && !*bh)
*bh = tmp;
return rc;
}
static u64 ocfs2_refcount_cache_owner(struct ocfs2_caching_info *ci)
{
struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
return rf->rf_blkno;
}
static struct super_block *
ocfs2_refcount_cache_get_super(struct ocfs2_caching_info *ci)
{
struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
return rf->rf_sb;
}
static void ocfs2_refcount_cache_lock(struct ocfs2_caching_info *ci)
__acquires(&rf->rf_lock)
{
struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
spin_lock(&rf->rf_lock);
}
static void ocfs2_refcount_cache_unlock(struct ocfs2_caching_info *ci)
__releases(&rf->rf_lock)
{
struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
spin_unlock(&rf->rf_lock);
}
static void ocfs2_refcount_cache_io_lock(struct ocfs2_caching_info *ci)
{
struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
mutex_lock(&rf->rf_io_mutex);
}
static void ocfs2_refcount_cache_io_unlock(struct ocfs2_caching_info *ci)
{
struct ocfs2_refcount_tree *rf = cache_info_to_refcount(ci);
mutex_unlock(&rf->rf_io_mutex);
}
static const struct ocfs2_caching_operations ocfs2_refcount_caching_ops = {
.co_owner = ocfs2_refcount_cache_owner,
.co_get_super = ocfs2_refcount_cache_get_super,
.co_cache_lock = ocfs2_refcount_cache_lock,
.co_cache_unlock = ocfs2_refcount_cache_unlock,
.co_io_lock = ocfs2_refcount_cache_io_lock,
.co_io_unlock = ocfs2_refcount_cache_io_unlock,
};
static struct ocfs2_refcount_tree *
ocfs2_find_refcount_tree(struct ocfs2_super *osb, u64 blkno)
{
struct rb_node *n = osb->osb_rf_lock_tree.rb_node;
struct ocfs2_refcount_tree *tree = NULL;
while (n) {
tree = rb_entry(n, struct ocfs2_refcount_tree, rf_node);
if (blkno < tree->rf_blkno)
n = n->rb_left;
else if (blkno > tree->rf_blkno)
n = n->rb_right;
else
return tree;
}
return NULL;
}
/* osb_lock is already locked. */
static void ocfs2_insert_refcount_tree(struct ocfs2_super *osb,
struct ocfs2_refcount_tree *new)
{
u64 rf_blkno = new->rf_blkno;
struct rb_node *parent = NULL;
struct rb_node **p = &osb->osb_rf_lock_tree.rb_node;
struct ocfs2_refcount_tree *tmp;
while (*p) {
parent = *p;
tmp = rb_entry(parent, struct ocfs2_refcount_tree,
rf_node);
if (rf_blkno < tmp->rf_blkno)
p = &(*p)->rb_left;
else if (rf_blkno > tmp->rf_blkno)
p = &(*p)->rb_right;
else {
/* This should never happen! */
mlog(ML_ERROR, "Duplicate refcount block %llu found!\n",
(unsigned long long)rf_blkno);
BUG();
}
}
rb_link_node(&new->rf_node, parent, p);
rb_insert_color(&new->rf_node, &osb->osb_rf_lock_tree);
}
static void ocfs2_free_refcount_tree(struct ocfs2_refcount_tree *tree)
{
ocfs2_metadata_cache_exit(&tree->rf_ci);
ocfs2_simple_drop_lockres(OCFS2_SB(tree->rf_sb), &tree->rf_lockres);
ocfs2_lock_res_free(&tree->rf_lockres);
kfree(tree);
}
static inline void
ocfs2_erase_refcount_tree_from_list_no_lock(struct ocfs2_super *osb,
struct ocfs2_refcount_tree *tree)
{
rb_erase(&tree->rf_node, &osb->osb_rf_lock_tree);
if (osb->osb_ref_tree_lru && osb->osb_ref_tree_lru == tree)
osb->osb_ref_tree_lru = NULL;
}
static void ocfs2_erase_refcount_tree_from_list(struct ocfs2_super *osb,
struct ocfs2_refcount_tree *tree)
{
spin_lock(&osb->osb_lock);
ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
spin_unlock(&osb->osb_lock);
}
static void ocfs2_kref_remove_refcount_tree(struct kref *kref)
{
struct ocfs2_refcount_tree *tree =
container_of(kref, struct ocfs2_refcount_tree, rf_getcnt);
ocfs2_free_refcount_tree(tree);
}
static inline void
ocfs2_refcount_tree_get(struct ocfs2_refcount_tree *tree)
{
kref_get(&tree->rf_getcnt);
}
static inline void
ocfs2_refcount_tree_put(struct ocfs2_refcount_tree *tree)
{
kref_put(&tree->rf_getcnt, ocfs2_kref_remove_refcount_tree);
}
static inline void ocfs2_init_refcount_tree_ci(struct ocfs2_refcount_tree *new,
struct super_block *sb)
{
ocfs2_metadata_cache_init(&new->rf_ci, &ocfs2_refcount_caching_ops);
mutex_init(&new->rf_io_mutex);
new->rf_sb = sb;
spin_lock_init(&new->rf_lock);
}
static inline void ocfs2_init_refcount_tree_lock(struct ocfs2_super *osb,
struct ocfs2_refcount_tree *new,
u64 rf_blkno, u32 generation)
{
init_rwsem(&new->rf_sem);
ocfs2_refcount_lock_res_init(&new->rf_lockres, osb,
rf_blkno, generation);
}
static struct ocfs2_refcount_tree*
ocfs2_allocate_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno)
{
struct ocfs2_refcount_tree *new;
new = kzalloc(sizeof(struct ocfs2_refcount_tree), GFP_NOFS);
if (!new)
return NULL;
new->rf_blkno = rf_blkno;
kref_init(&new->rf_getcnt);
ocfs2_init_refcount_tree_ci(new, osb->sb);
return new;
}
static int ocfs2_get_refcount_tree(struct ocfs2_super *osb, u64 rf_blkno,
struct ocfs2_refcount_tree **ret_tree)
{
int ret = 0;
struct ocfs2_refcount_tree *tree, *new = NULL;
struct buffer_head *ref_root_bh = NULL;
struct ocfs2_refcount_block *ref_rb;
spin_lock(&osb->osb_lock);
if (osb->osb_ref_tree_lru &&
osb->osb_ref_tree_lru->rf_blkno == rf_blkno)
tree = osb->osb_ref_tree_lru;
else
tree = ocfs2_find_refcount_tree(osb, rf_blkno);
if (tree)
goto out;
spin_unlock(&osb->osb_lock);
new = ocfs2_allocate_refcount_tree(osb, rf_blkno);
if (!new) {
ret = -ENOMEM;
mlog_errno(ret);
return ret;
}
/*
* We need the generation to create the refcount tree lock and since
* it isn't changed during the tree modification, we are safe here to
* read without protection.
* We also have to purge the cache after we create the lock since the
* refcount block may have the stale data. It can only be trusted when
* we hold the refcount lock.
*/
ret = ocfs2_read_refcount_block(&new->rf_ci, rf_blkno, &ref_root_bh);
if (ret) {
mlog_errno(ret);
ocfs2_metadata_cache_exit(&new->rf_ci);
kfree(new);
return ret;
}
ref_rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
new->rf_generation = le32_to_cpu(ref_rb->rf_generation);
ocfs2_init_refcount_tree_lock(osb, new, rf_blkno,
new->rf_generation);
ocfs2_metadata_cache_purge(&new->rf_ci);
spin_lock(&osb->osb_lock);
tree = ocfs2_find_refcount_tree(osb, rf_blkno);
if (tree)
goto out;
ocfs2_insert_refcount_tree(osb, new);
tree = new;
new = NULL;
out:
*ret_tree = tree;
osb->osb_ref_tree_lru = tree;
spin_unlock(&osb->osb_lock);
if (new)
ocfs2_free_refcount_tree(new);
brelse(ref_root_bh);
return ret;
}
static int ocfs2_get_refcount_block(struct inode *inode, u64 *ref_blkno)
{
int ret;
struct buffer_head *di_bh = NULL;
struct ocfs2_dinode *di;
ret = ocfs2_read_inode_block(inode, &di_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
BUG_ON(!ocfs2_is_refcount_inode(inode));
di = (struct ocfs2_dinode *)di_bh->b_data;
*ref_blkno = le64_to_cpu(di->i_refcount_loc);
brelse(di_bh);
out:
return ret;
}
static int __ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
struct ocfs2_refcount_tree *tree, int rw)
{
int ret;
ret = ocfs2_refcount_lock(tree, rw);
if (ret) {
mlog_errno(ret);
goto out;
}
if (rw)
down_write(&tree->rf_sem);
else
down_read(&tree->rf_sem);
out:
return ret;
}
/*
* Lock the refcount tree pointed by ref_blkno and return the tree.
* In most case, we lock the tree and read the refcount block.
* So read it here if the caller really needs it.
*
* If the tree has been re-created by other node, it will free the
* old one and re-create it.
*/
int ocfs2_lock_refcount_tree(struct ocfs2_super *osb,
u64 ref_blkno, int rw,
struct ocfs2_refcount_tree **ret_tree,
struct buffer_head **ref_bh)
{
int ret, delete_tree = 0;
struct ocfs2_refcount_tree *tree = NULL;
struct buffer_head *ref_root_bh = NULL;
struct ocfs2_refcount_block *rb;
again:
ret = ocfs2_get_refcount_tree(osb, ref_blkno, &tree);
if (ret) {
mlog_errno(ret);
return ret;
}
ocfs2_refcount_tree_get(tree);
ret = __ocfs2_lock_refcount_tree(osb, tree, rw);
if (ret) {
mlog_errno(ret);
ocfs2_refcount_tree_put(tree);
goto out;
}
ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
&ref_root_bh);
if (ret) {
mlog_errno(ret);
ocfs2_unlock_refcount_tree(osb, tree, rw);
goto out;
}
rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
/*
* If the refcount block has been freed and re-created, we may need
* to recreate the refcount tree also.
*
* Here we just remove the tree from the rb-tree, and the last
* kref holder will unlock and delete this refcount_tree.
* Then we goto "again" and ocfs2_get_refcount_tree will create
* the new refcount tree for us.
*/
if (tree->rf_generation != le32_to_cpu(rb->rf_generation)) {
if (!tree->rf_removed) {
ocfs2_erase_refcount_tree_from_list(osb, tree);
tree->rf_removed = 1;
delete_tree = 1;
}
ocfs2_unlock_refcount_tree(osb, tree, rw);
/*
* We get an extra reference when we create the refcount
* tree, so another put will destroy it.
*/
if (delete_tree)
ocfs2_refcount_tree_put(tree);
brelse(ref_root_bh);
ref_root_bh = NULL;
goto again;
}
*ret_tree = tree;
if (ref_bh) {
*ref_bh = ref_root_bh;
ref_root_bh = NULL;
}
out:
brelse(ref_root_bh);
return ret;
}
void ocfs2_unlock_refcount_tree(struct ocfs2_super *osb,
struct ocfs2_refcount_tree *tree, int rw)
{
if (rw)
up_write(&tree->rf_sem);
else
up_read(&tree->rf_sem);
ocfs2_refcount_unlock(tree, rw);
ocfs2_refcount_tree_put(tree);
}
void ocfs2_purge_refcount_trees(struct ocfs2_super *osb)
{
struct rb_node *node;
struct ocfs2_refcount_tree *tree;
struct rb_root *root = &osb->osb_rf_lock_tree;
while ((node = rb_last(root)) != NULL) {
tree = rb_entry(node, struct ocfs2_refcount_tree, rf_node);
trace_ocfs2_purge_refcount_trees(
(unsigned long long) tree->rf_blkno);
rb_erase(&tree->rf_node, root);
ocfs2_free_refcount_tree(tree);
}
}
/*
* Create a refcount tree for an inode.
* We take for granted that the inode is already locked.
*/
static int ocfs2_create_refcount_tree(struct inode *inode,
struct buffer_head *di_bh)
{
int ret;
handle_t *handle = NULL;
struct ocfs2_alloc_context *meta_ac = NULL;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct buffer_head *new_bh = NULL;
struct ocfs2_refcount_block *rb;
struct ocfs2_refcount_tree *new_tree = NULL, *tree = NULL;
u16 suballoc_bit_start;
u32 num_got;
u64 suballoc_loc, first_blkno;
BUG_ON(ocfs2_is_refcount_inode(inode));
trace_ocfs2_create_refcount_tree(
(unsigned long long)oi->ip_blkno);
ret = ocfs2_reserve_new_metadata_blocks(osb, 1, &meta_ac);
if (ret) {
mlog_errno(ret);
goto out;
}
handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_CREATE_CREDITS);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
mlog_errno(ret);
goto out;
}
ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
&suballoc_bit_start, &num_got,
&first_blkno);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
new_tree = ocfs2_allocate_refcount_tree(osb, first_blkno);
if (!new_tree) {
ret = -ENOMEM;
mlog_errno(ret);
goto out_commit;
}
new_bh = sb_getblk(inode->i_sb, first_blkno);
if (!new_bh) {
ret = -ENOMEM;
mlog_errno(ret);
goto out_commit;
}
ocfs2_set_new_buffer_uptodate(&new_tree->rf_ci, new_bh);
ret = ocfs2_journal_access_rb(handle, &new_tree->rf_ci, new_bh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
/* Initialize ocfs2_refcount_block. */
rb = (struct ocfs2_refcount_block *)new_bh->b_data;
memset(rb, 0, inode->i_sb->s_blocksize);
strcpy((void *)rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
rb->rf_fs_generation = cpu_to_le32(osb->fs_generation);
rb->rf_blkno = cpu_to_le64(first_blkno);
rb->rf_count = cpu_to_le32(1);
rb->rf_records.rl_count =
cpu_to_le16(ocfs2_refcount_recs_per_rb(osb->sb));
spin_lock(&osb->osb_lock);
rb->rf_generation = osb->s_next_generation++;
spin_unlock(&osb->osb_lock);
ocfs2_journal_dirty(handle, new_bh);
spin_lock(&oi->ip_lock);
oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
di->i_refcount_loc = cpu_to_le64(first_blkno);
spin_unlock(&oi->ip_lock);
trace_ocfs2_create_refcount_tree_blkno((unsigned long long)first_blkno);
ocfs2_journal_dirty(handle, di_bh);
/*
* We have to init the tree lock here since it will use
* the generation number to create it.
*/
new_tree->rf_generation = le32_to_cpu(rb->rf_generation);
ocfs2_init_refcount_tree_lock(osb, new_tree, first_blkno,
new_tree->rf_generation);
spin_lock(&osb->osb_lock);
tree = ocfs2_find_refcount_tree(osb, first_blkno);
/*
* We've just created a new refcount tree in this block. If
* we found a refcount tree on the ocfs2_super, it must be
* one we just deleted. We free the old tree before
* inserting the new tree.
*/
BUG_ON(tree && tree->rf_generation == new_tree->rf_generation);
if (tree)
ocfs2_erase_refcount_tree_from_list_no_lock(osb, tree);
ocfs2_insert_refcount_tree(osb, new_tree);
spin_unlock(&osb->osb_lock);
new_tree = NULL;
if (tree)
ocfs2_refcount_tree_put(tree);
out_commit:
ocfs2_commit_trans(osb, handle);
out:
if (new_tree) {
ocfs2_metadata_cache_exit(&new_tree->rf_ci);
kfree(new_tree);
}
brelse(new_bh);
if (meta_ac)
ocfs2_free_alloc_context(meta_ac);
return ret;
}
static int ocfs2_set_refcount_tree(struct inode *inode,
struct buffer_head *di_bh,
u64 refcount_loc)
{
int ret;
handle_t *handle = NULL;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct buffer_head *ref_root_bh = NULL;
struct ocfs2_refcount_block *rb;
struct ocfs2_refcount_tree *ref_tree;
BUG_ON(ocfs2_is_refcount_inode(inode));
ret = ocfs2_lock_refcount_tree(osb, refcount_loc, 1,
&ref_tree, &ref_root_bh);
if (ret) {
mlog_errno(ret);
return ret;
}
handle = ocfs2_start_trans(osb, OCFS2_REFCOUNT_TREE_SET_CREDITS);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
mlog_errno(ret);
goto out;
}
ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, ref_root_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
le32_add_cpu(&rb->rf_count, 1);
ocfs2_journal_dirty(handle, ref_root_bh);
spin_lock(&oi->ip_lock);
oi->ip_dyn_features |= OCFS2_HAS_REFCOUNT_FL;
di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
di->i_refcount_loc = cpu_to_le64(refcount_loc);
spin_unlock(&oi->ip_lock);
ocfs2_journal_dirty(handle, di_bh);
out_commit:
ocfs2_commit_trans(osb, handle);
out:
ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
brelse(ref_root_bh);
return ret;
}
int ocfs2_remove_refcount_tree(struct inode *inode, struct buffer_head *di_bh)
{
int ret, delete_tree = 0;
handle_t *handle = NULL;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_refcount_block *rb;
struct inode *alloc_inode = NULL;
struct buffer_head *alloc_bh = NULL;
struct buffer_head *blk_bh = NULL;
struct ocfs2_refcount_tree *ref_tree;
int credits = OCFS2_REFCOUNT_TREE_REMOVE_CREDITS;
u64 blk = 0, bg_blkno = 0, ref_blkno = le64_to_cpu(di->i_refcount_loc);
u16 bit = 0;
if (!ocfs2_is_refcount_inode(inode))
return 0;
BUG_ON(!ref_blkno);
ret = ocfs2_lock_refcount_tree(osb, ref_blkno, 1, &ref_tree, &blk_bh);
if (ret) {
mlog_errno(ret);
return ret;
}
rb = (struct ocfs2_refcount_block *)blk_bh->b_data;
/*
* If we are the last user, we need to free the block.
* So lock the allocator ahead.
*/
if (le32_to_cpu(rb->rf_count) == 1) {
blk = le64_to_cpu(rb->rf_blkno);
bit = le16_to_cpu(rb->rf_suballoc_bit);
if (rb->rf_suballoc_loc)
bg_blkno = le64_to_cpu(rb->rf_suballoc_loc);
else
bg_blkno = ocfs2_which_suballoc_group(blk, bit);
alloc_inode = ocfs2_get_system_file_inode(osb,
EXTENT_ALLOC_SYSTEM_INODE,
le16_to_cpu(rb->rf_suballoc_slot));
if (!alloc_inode) {
ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
inode_lock(alloc_inode);
ret = ocfs2_inode_lock(alloc_inode, &alloc_bh, 1);
if (ret) {
mlog_errno(ret);
goto out_mutex;
}
credits += OCFS2_SUBALLOC_FREE;
}
handle = ocfs2_start_trans(osb, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
mlog_errno(ret);
goto out_unlock;
}
ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
ret = ocfs2_journal_access_rb(handle, &ref_tree->rf_ci, blk_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
spin_lock(&oi->ip_lock);
oi->ip_dyn_features &= ~OCFS2_HAS_REFCOUNT_FL;
di->i_dyn_features = cpu_to_le16(oi->ip_dyn_features);
di->i_refcount_loc = 0;
spin_unlock(&oi->ip_lock);
ocfs2_journal_dirty(handle, di_bh);
le32_add_cpu(&rb->rf_count , -1);
ocfs2_journal_dirty(handle, blk_bh);
if (!rb->rf_count) {
delete_tree = 1;
ocfs2_erase_refcount_tree_from_list(osb, ref_tree);
ret = ocfs2_free_suballoc_bits(handle, alloc_inode,
alloc_bh, bit, bg_blkno, 1);
if (ret)
mlog_errno(ret);
}
out_commit:
ocfs2_commit_trans(osb, handle);
out_unlock:
if (alloc_inode) {
ocfs2_inode_unlock(alloc_inode, 1);
brelse(alloc_bh);
}
out_mutex:
if (alloc_inode) {
inode_unlock(alloc_inode);
iput(alloc_inode);
}
out:
ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
if (delete_tree)
ocfs2_refcount_tree_put(ref_tree);
brelse(blk_bh);
return ret;
}
static void ocfs2_find_refcount_rec_in_rl(struct ocfs2_caching_info *ci,
struct buffer_head *ref_leaf_bh,
u64 cpos, unsigned int len,
struct ocfs2_refcount_rec *ret_rec,
int *index)
{
int i = 0;
struct ocfs2_refcount_block *rb =
(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
struct ocfs2_refcount_rec *rec = NULL;
for (; i < le16_to_cpu(rb->rf_records.rl_used); i++) {
rec = &rb->rf_records.rl_recs[i];
if (le64_to_cpu(rec->r_cpos) +
le32_to_cpu(rec->r_clusters) <= cpos)
continue;
else if (le64_to_cpu(rec->r_cpos) > cpos)
break;
/* ok, cpos fail in this rec. Just return. */
if (ret_rec)
*ret_rec = *rec;
goto out;
}
if (ret_rec) {
/* We meet with a hole here, so fake the rec. */
ret_rec->r_cpos = cpu_to_le64(cpos);
ret_rec->r_refcount = 0;
if (i < le16_to_cpu(rb->rf_records.rl_used) &&
le64_to_cpu(rec->r_cpos) < cpos + len)
ret_rec->r_clusters =
cpu_to_le32(le64_to_cpu(rec->r_cpos) - cpos);
else
ret_rec->r_clusters = cpu_to_le32(len);
}
out:
*index = i;
}
/*
* Try to remove refcount tree. The mechanism is:
* 1) Check whether i_clusters == 0, if no, exit.
* 2) check whether we have i_xattr_loc in dinode. if yes, exit.
* 3) Check whether we have inline xattr stored outside, if yes, exit.
* 4) Remove the tree.
*/
int ocfs2_try_remove_refcount_tree(struct inode *inode,
struct buffer_head *di_bh)
{
int ret;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
down_write(&oi->ip_xattr_sem);
down_write(&oi->ip_alloc_sem);
if (oi->ip_clusters)
goto out;
if ((oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) && di->i_xattr_loc)
goto out;
if (oi->ip_dyn_features & OCFS2_INLINE_XATTR_FL &&
ocfs2_has_inline_xattr_value_outside(inode, di))
goto out;
ret = ocfs2_remove_refcount_tree(inode, di_bh);
if (ret)
mlog_errno(ret);
out:
up_write(&oi->ip_alloc_sem);
up_write(&oi->ip_xattr_sem);
return 0;
}
ocfs2: Find proper end cpos for a leaf refcount block. ocfs2 refcount tree is stored as an extent tree while the leaf ocfs2_refcount_rec points to a refcount block. The following step can trip a kernel panic. mkfs.ocfs2 -b 512 -C 1M --fs-features=refcount $DEVICE mount -t ocfs2 $DEVICE $MNT_DIR FILE_NAME=$RANDOM FILE_NAME_1=$RANDOM FILE_REF="${FILE_NAME}_ref" FILE_REF_1="${FILE_NAME}_ref_1" for((i=0;i<305;i++)) do # /mnt/1048576 is a file with 1048576 sizes. cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME_1 done for((i=0;i<3;i++)) do cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME done for((i=0;i<2;i++)) do cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME_1 done cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME for((i=0;i<11;i++)) do cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME_1 done reflink $MNT_DIR/$FILE_NAME $MNT_DIR/$FILE_REF # write_f is a program which will write some bytes to a file at offset. # write_f -f file_name -l offset -w write_bytes. ./write_f -f $MNT_DIR/$FILE_REF -l $[310*1048576] -w 4096 ./write_f -f $MNT_DIR/$FILE_REF -l $[306*1048576] -w 4096 ./write_f -f $MNT_DIR/$FILE_REF -l $[311*1048576] -w 4096 ./write_f -f $MNT_DIR/$FILE_NAME -l $[310*1048576] -w 4096 ./write_f -f $MNT_DIR/$FILE_NAME -l $[311*1048576] -w 4096 reflink $MNT_DIR/$FILE_NAME $MNT_DIR/$FILE_REF_1 ./write_f -f $MNT_DIR/$FILE_NAME -l $[311*1048576] -w 4096 #kernel panic here. The reason is that if the ocfs2_extent_rec is the last record in a leaf extent block, the old solution fails to find the suitable end cpos. So this patch try to walk through the b-tree, find the next sub root and get the c_pos the next sub-tree starts from. btw, I have runned tristan's test case against the patched kernel for several days and this type of kernel panic never happens again. Signed-off-by: Tao Ma <tao.ma@oracle.com> Signed-off-by: Joel Becker <joel.becker@oracle.com>
2009-11-30 14:32:19 +08:00
/*
* Find the end range for a leaf refcount block indicated by
* el->l_recs[index].e_blkno.
*/
static int ocfs2_get_refcount_cpos_end(struct ocfs2_caching_info *ci,
struct buffer_head *ref_root_bh,
struct ocfs2_extent_block *eb,
struct ocfs2_extent_list *el,
int index, u32 *cpos_end)
{
int ret, i, subtree_root;
u32 cpos;
u64 blkno;
struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
struct ocfs2_path *left_path = NULL, *right_path = NULL;
struct ocfs2_extent_tree et;
struct ocfs2_extent_list *tmp_el;
if (index < le16_to_cpu(el->l_next_free_rec) - 1) {
/*
* We have a extent rec after index, so just use the e_cpos
* of the next extent rec.
*/
*cpos_end = le32_to_cpu(el->l_recs[index+1].e_cpos);
return 0;
}
if (!eb || !eb->h_next_leaf_blk) {
ocfs2: Find proper end cpos for a leaf refcount block. ocfs2 refcount tree is stored as an extent tree while the leaf ocfs2_refcount_rec points to a refcount block. The following step can trip a kernel panic. mkfs.ocfs2 -b 512 -C 1M --fs-features=refcount $DEVICE mount -t ocfs2 $DEVICE $MNT_DIR FILE_NAME=$RANDOM FILE_NAME_1=$RANDOM FILE_REF="${FILE_NAME}_ref" FILE_REF_1="${FILE_NAME}_ref_1" for((i=0;i<305;i++)) do # /mnt/1048576 is a file with 1048576 sizes. cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME_1 done for((i=0;i<3;i++)) do cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME done for((i=0;i<2;i++)) do cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME_1 done cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME for((i=0;i<11;i++)) do cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME_1 done reflink $MNT_DIR/$FILE_NAME $MNT_DIR/$FILE_REF # write_f is a program which will write some bytes to a file at offset. # write_f -f file_name -l offset -w write_bytes. ./write_f -f $MNT_DIR/$FILE_REF -l $[310*1048576] -w 4096 ./write_f -f $MNT_DIR/$FILE_REF -l $[306*1048576] -w 4096 ./write_f -f $MNT_DIR/$FILE_REF -l $[311*1048576] -w 4096 ./write_f -f $MNT_DIR/$FILE_NAME -l $[310*1048576] -w 4096 ./write_f -f $MNT_DIR/$FILE_NAME -l $[311*1048576] -w 4096 reflink $MNT_DIR/$FILE_NAME $MNT_DIR/$FILE_REF_1 ./write_f -f $MNT_DIR/$FILE_NAME -l $[311*1048576] -w 4096 #kernel panic here. The reason is that if the ocfs2_extent_rec is the last record in a leaf extent block, the old solution fails to find the suitable end cpos. So this patch try to walk through the b-tree, find the next sub root and get the c_pos the next sub-tree starts from. btw, I have runned tristan's test case against the patched kernel for several days and this type of kernel panic never happens again. Signed-off-by: Tao Ma <tao.ma@oracle.com> Signed-off-by: Joel Becker <joel.becker@oracle.com>
2009-11-30 14:32:19 +08:00
/*
* We are the last extent rec, so any high cpos should
* be stored in this leaf refcount block.
*/
*cpos_end = UINT_MAX;
return 0;
}
/*
* If the extent block isn't the last one, we have to find
* the subtree root between this extent block and the next
* leaf extent block and get the corresponding e_cpos from
* the subroot. Otherwise we may corrupt the b-tree.
*/
ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
left_path = ocfs2_new_path_from_et(&et);
if (!left_path) {
ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
cpos = le32_to_cpu(eb->h_list.l_recs[index].e_cpos);
ret = ocfs2_find_path(ci, left_path, cpos);
if (ret) {
mlog_errno(ret);
goto out;
}
right_path = ocfs2_new_path_from_path(left_path);
if (!right_path) {
ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
ret = ocfs2_find_cpos_for_right_leaf(sb, left_path, &cpos);
if (ret) {
mlog_errno(ret);
goto out;
}
ret = ocfs2_find_path(ci, right_path, cpos);
if (ret) {
mlog_errno(ret);
goto out;
}
subtree_root = ocfs2_find_subtree_root(&et, left_path,
right_path);
tmp_el = left_path->p_node[subtree_root].el;
blkno = left_path->p_node[subtree_root+1].bh->b_blocknr;
for (i = 0; i < le16_to_cpu(tmp_el->l_next_free_rec); i++) {
ocfs2: Find proper end cpos for a leaf refcount block. ocfs2 refcount tree is stored as an extent tree while the leaf ocfs2_refcount_rec points to a refcount block. The following step can trip a kernel panic. mkfs.ocfs2 -b 512 -C 1M --fs-features=refcount $DEVICE mount -t ocfs2 $DEVICE $MNT_DIR FILE_NAME=$RANDOM FILE_NAME_1=$RANDOM FILE_REF="${FILE_NAME}_ref" FILE_REF_1="${FILE_NAME}_ref_1" for((i=0;i<305;i++)) do # /mnt/1048576 is a file with 1048576 sizes. cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME_1 done for((i=0;i<3;i++)) do cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME done for((i=0;i<2;i++)) do cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME_1 done cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME for((i=0;i<11;i++)) do cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME_1 done reflink $MNT_DIR/$FILE_NAME $MNT_DIR/$FILE_REF # write_f is a program which will write some bytes to a file at offset. # write_f -f file_name -l offset -w write_bytes. ./write_f -f $MNT_DIR/$FILE_REF -l $[310*1048576] -w 4096 ./write_f -f $MNT_DIR/$FILE_REF -l $[306*1048576] -w 4096 ./write_f -f $MNT_DIR/$FILE_REF -l $[311*1048576] -w 4096 ./write_f -f $MNT_DIR/$FILE_NAME -l $[310*1048576] -w 4096 ./write_f -f $MNT_DIR/$FILE_NAME -l $[311*1048576] -w 4096 reflink $MNT_DIR/$FILE_NAME $MNT_DIR/$FILE_REF_1 ./write_f -f $MNT_DIR/$FILE_NAME -l $[311*1048576] -w 4096 #kernel panic here. The reason is that if the ocfs2_extent_rec is the last record in a leaf extent block, the old solution fails to find the suitable end cpos. So this patch try to walk through the b-tree, find the next sub root and get the c_pos the next sub-tree starts from. btw, I have runned tristan's test case against the patched kernel for several days and this type of kernel panic never happens again. Signed-off-by: Tao Ma <tao.ma@oracle.com> Signed-off-by: Joel Becker <joel.becker@oracle.com>
2009-11-30 14:32:19 +08:00
if (le64_to_cpu(tmp_el->l_recs[i].e_blkno) == blkno) {
*cpos_end = le32_to_cpu(tmp_el->l_recs[i+1].e_cpos);
break;
}
}
BUG_ON(i == le16_to_cpu(tmp_el->l_next_free_rec));
ocfs2: Find proper end cpos for a leaf refcount block. ocfs2 refcount tree is stored as an extent tree while the leaf ocfs2_refcount_rec points to a refcount block. The following step can trip a kernel panic. mkfs.ocfs2 -b 512 -C 1M --fs-features=refcount $DEVICE mount -t ocfs2 $DEVICE $MNT_DIR FILE_NAME=$RANDOM FILE_NAME_1=$RANDOM FILE_REF="${FILE_NAME}_ref" FILE_REF_1="${FILE_NAME}_ref_1" for((i=0;i<305;i++)) do # /mnt/1048576 is a file with 1048576 sizes. cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME_1 done for((i=0;i<3;i++)) do cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME done for((i=0;i<2;i++)) do cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME_1 done cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME for((i=0;i<11;i++)) do cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME_1 done reflink $MNT_DIR/$FILE_NAME $MNT_DIR/$FILE_REF # write_f is a program which will write some bytes to a file at offset. # write_f -f file_name -l offset -w write_bytes. ./write_f -f $MNT_DIR/$FILE_REF -l $[310*1048576] -w 4096 ./write_f -f $MNT_DIR/$FILE_REF -l $[306*1048576] -w 4096 ./write_f -f $MNT_DIR/$FILE_REF -l $[311*1048576] -w 4096 ./write_f -f $MNT_DIR/$FILE_NAME -l $[310*1048576] -w 4096 ./write_f -f $MNT_DIR/$FILE_NAME -l $[311*1048576] -w 4096 reflink $MNT_DIR/$FILE_NAME $MNT_DIR/$FILE_REF_1 ./write_f -f $MNT_DIR/$FILE_NAME -l $[311*1048576] -w 4096 #kernel panic here. The reason is that if the ocfs2_extent_rec is the last record in a leaf extent block, the old solution fails to find the suitable end cpos. So this patch try to walk through the b-tree, find the next sub root and get the c_pos the next sub-tree starts from. btw, I have runned tristan's test case against the patched kernel for several days and this type of kernel panic never happens again. Signed-off-by: Tao Ma <tao.ma@oracle.com> Signed-off-by: Joel Becker <joel.becker@oracle.com>
2009-11-30 14:32:19 +08:00
out:
ocfs2_free_path(left_path);
ocfs2_free_path(right_path);
return ret;
}
/*
* Given a cpos and len, try to find the refcount record which contains cpos.
* 1. If cpos can be found in one refcount record, return the record.
* 2. If cpos can't be found, return a fake record which start from cpos
* and end at a small value between cpos+len and start of the next record.
* This fake record has r_refcount = 0.
*/
static int ocfs2_get_refcount_rec(struct ocfs2_caching_info *ci,
struct buffer_head *ref_root_bh,
u64 cpos, unsigned int len,
struct ocfs2_refcount_rec *ret_rec,
int *index,
struct buffer_head **ret_bh)
{
int ret = 0, i, found;
treewide: Remove uninitialized_var() usage Using uninitialized_var() is dangerous as it papers over real bugs[1] (or can in the future), and suppresses unrelated compiler warnings (e.g. "unused variable"). If the compiler thinks it is uninitialized, either simply initialize the variable or make compiler changes. In preparation for removing[2] the[3] macro[4], remove all remaining needless uses with the following script: git grep '\buninitialized_var\b' | cut -d: -f1 | sort -u | \ xargs perl -pi -e \ 's/\buninitialized_var\(([^\)]+)\)/\1/g; s:\s*/\* (GCC be quiet|to make compiler happy) \*/$::g;' drivers/video/fbdev/riva/riva_hw.c was manually tweaked to avoid pathological white-space. No outstanding warnings were found building allmodconfig with GCC 9.3.0 for x86_64, i386, arm64, arm, powerpc, powerpc64le, s390x, mips, sparc64, alpha, and m68k. [1] https://lore.kernel.org/lkml/20200603174714.192027-1-glider@google.com/ [2] https://lore.kernel.org/lkml/CA+55aFw+Vbj0i=1TGqCR5vQkCzWJ0QxK6CernOU6eedsudAixw@mail.gmail.com/ [3] https://lore.kernel.org/lkml/CA+55aFwgbgqhbp1fkxvRKEpzyR5J8n1vKT1VZdz9knmPuXhOeg@mail.gmail.com/ [4] https://lore.kernel.org/lkml/CA+55aFz2500WfbKXAx8s67wrm9=yVJu65TpLgN_ybYNv0VEOKA@mail.gmail.com/ Reviewed-by: Leon Romanovsky <leonro@mellanox.com> # drivers/infiniband and mlx4/mlx5 Acked-by: Jason Gunthorpe <jgg@mellanox.com> # IB Acked-by: Kalle Valo <kvalo@codeaurora.org> # wireless drivers Reviewed-by: Chao Yu <yuchao0@huawei.com> # erofs Signed-off-by: Kees Cook <keescook@chromium.org>
2020-06-04 04:09:38 +08:00
u32 low_cpos, cpos_end;
struct ocfs2_extent_list *el;
ocfs2: Find proper end cpos for a leaf refcount block. ocfs2 refcount tree is stored as an extent tree while the leaf ocfs2_refcount_rec points to a refcount block. The following step can trip a kernel panic. mkfs.ocfs2 -b 512 -C 1M --fs-features=refcount $DEVICE mount -t ocfs2 $DEVICE $MNT_DIR FILE_NAME=$RANDOM FILE_NAME_1=$RANDOM FILE_REF="${FILE_NAME}_ref" FILE_REF_1="${FILE_NAME}_ref_1" for((i=0;i<305;i++)) do # /mnt/1048576 is a file with 1048576 sizes. cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME_1 done for((i=0;i<3;i++)) do cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME done for((i=0;i<2;i++)) do cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME_1 done cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME for((i=0;i<11;i++)) do cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME_1 done reflink $MNT_DIR/$FILE_NAME $MNT_DIR/$FILE_REF # write_f is a program which will write some bytes to a file at offset. # write_f -f file_name -l offset -w write_bytes. ./write_f -f $MNT_DIR/$FILE_REF -l $[310*1048576] -w 4096 ./write_f -f $MNT_DIR/$FILE_REF -l $[306*1048576] -w 4096 ./write_f -f $MNT_DIR/$FILE_REF -l $[311*1048576] -w 4096 ./write_f -f $MNT_DIR/$FILE_NAME -l $[310*1048576] -w 4096 ./write_f -f $MNT_DIR/$FILE_NAME -l $[311*1048576] -w 4096 reflink $MNT_DIR/$FILE_NAME $MNT_DIR/$FILE_REF_1 ./write_f -f $MNT_DIR/$FILE_NAME -l $[311*1048576] -w 4096 #kernel panic here. The reason is that if the ocfs2_extent_rec is the last record in a leaf extent block, the old solution fails to find the suitable end cpos. So this patch try to walk through the b-tree, find the next sub root and get the c_pos the next sub-tree starts from. btw, I have runned tristan's test case against the patched kernel for several days and this type of kernel panic never happens again. Signed-off-by: Tao Ma <tao.ma@oracle.com> Signed-off-by: Joel Becker <joel.becker@oracle.com>
2009-11-30 14:32:19 +08:00
struct ocfs2_extent_rec *rec = NULL;
struct ocfs2_extent_block *eb = NULL;
struct buffer_head *eb_bh = NULL, *ref_leaf_bh = NULL;
struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
struct ocfs2_refcount_block *rb =
(struct ocfs2_refcount_block *)ref_root_bh->b_data;
if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)) {
ocfs2_find_refcount_rec_in_rl(ci, ref_root_bh, cpos, len,
ret_rec, index);
*ret_bh = ref_root_bh;
get_bh(ref_root_bh);
return 0;
}
el = &rb->rf_list;
low_cpos = cpos & OCFS2_32BIT_POS_MASK;
if (el->l_tree_depth) {
ret = ocfs2_find_leaf(ci, el, low_cpos, &eb_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
eb = (struct ocfs2_extent_block *) eb_bh->b_data;
el = &eb->h_list;
if (el->l_tree_depth) {
ret = ocfs2_error(sb,
"refcount tree %llu has non zero tree depth in leaf btree tree block %llu\n",
(unsigned long long)ocfs2_metadata_cache_owner(ci),
(unsigned long long)eb_bh->b_blocknr);
goto out;
}
}
found = 0;
for (i = le16_to_cpu(el->l_next_free_rec) - 1; i >= 0; i--) {
rec = &el->l_recs[i];
if (le32_to_cpu(rec->e_cpos) <= low_cpos) {
found = 1;
break;
}
}
ocfs2: Find proper end cpos for a leaf refcount block. ocfs2 refcount tree is stored as an extent tree while the leaf ocfs2_refcount_rec points to a refcount block. The following step can trip a kernel panic. mkfs.ocfs2 -b 512 -C 1M --fs-features=refcount $DEVICE mount -t ocfs2 $DEVICE $MNT_DIR FILE_NAME=$RANDOM FILE_NAME_1=$RANDOM FILE_REF="${FILE_NAME}_ref" FILE_REF_1="${FILE_NAME}_ref_1" for((i=0;i<305;i++)) do # /mnt/1048576 is a file with 1048576 sizes. cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME_1 done for((i=0;i<3;i++)) do cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME done for((i=0;i<2;i++)) do cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME_1 done cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME for((i=0;i<11;i++)) do cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME_1 done reflink $MNT_DIR/$FILE_NAME $MNT_DIR/$FILE_REF # write_f is a program which will write some bytes to a file at offset. # write_f -f file_name -l offset -w write_bytes. ./write_f -f $MNT_DIR/$FILE_REF -l $[310*1048576] -w 4096 ./write_f -f $MNT_DIR/$FILE_REF -l $[306*1048576] -w 4096 ./write_f -f $MNT_DIR/$FILE_REF -l $[311*1048576] -w 4096 ./write_f -f $MNT_DIR/$FILE_NAME -l $[310*1048576] -w 4096 ./write_f -f $MNT_DIR/$FILE_NAME -l $[311*1048576] -w 4096 reflink $MNT_DIR/$FILE_NAME $MNT_DIR/$FILE_REF_1 ./write_f -f $MNT_DIR/$FILE_NAME -l $[311*1048576] -w 4096 #kernel panic here. The reason is that if the ocfs2_extent_rec is the last record in a leaf extent block, the old solution fails to find the suitable end cpos. So this patch try to walk through the b-tree, find the next sub root and get the c_pos the next sub-tree starts from. btw, I have runned tristan's test case against the patched kernel for several days and this type of kernel panic never happens again. Signed-off-by: Tao Ma <tao.ma@oracle.com> Signed-off-by: Joel Becker <joel.becker@oracle.com>
2009-11-30 14:32:19 +08:00
if (found) {
ret = ocfs2_get_refcount_cpos_end(ci, ref_root_bh,
eb, el, i, &cpos_end);
if (ret) {
mlog_errno(ret);
goto out;
}
ocfs2: Find proper end cpos for a leaf refcount block. ocfs2 refcount tree is stored as an extent tree while the leaf ocfs2_refcount_rec points to a refcount block. The following step can trip a kernel panic. mkfs.ocfs2 -b 512 -C 1M --fs-features=refcount $DEVICE mount -t ocfs2 $DEVICE $MNT_DIR FILE_NAME=$RANDOM FILE_NAME_1=$RANDOM FILE_REF="${FILE_NAME}_ref" FILE_REF_1="${FILE_NAME}_ref_1" for((i=0;i<305;i++)) do # /mnt/1048576 is a file with 1048576 sizes. cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME_1 done for((i=0;i<3;i++)) do cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME done for((i=0;i<2;i++)) do cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME_1 done cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME for((i=0;i<11;i++)) do cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME cat /mnt/1048576 >> $MNT_DIR/$FILE_NAME_1 done reflink $MNT_DIR/$FILE_NAME $MNT_DIR/$FILE_REF # write_f is a program which will write some bytes to a file at offset. # write_f -f file_name -l offset -w write_bytes. ./write_f -f $MNT_DIR/$FILE_REF -l $[310*1048576] -w 4096 ./write_f -f $MNT_DIR/$FILE_REF -l $[306*1048576] -w 4096 ./write_f -f $MNT_DIR/$FILE_REF -l $[311*1048576] -w 4096 ./write_f -f $MNT_DIR/$FILE_NAME -l $[310*1048576] -w 4096 ./write_f -f $MNT_DIR/$FILE_NAME -l $[311*1048576] -w 4096 reflink $MNT_DIR/$FILE_NAME $MNT_DIR/$FILE_REF_1 ./write_f -f $MNT_DIR/$FILE_NAME -l $[311*1048576] -w 4096 #kernel panic here. The reason is that if the ocfs2_extent_rec is the last record in a leaf extent block, the old solution fails to find the suitable end cpos. So this patch try to walk through the b-tree, find the next sub root and get the c_pos the next sub-tree starts from. btw, I have runned tristan's test case against the patched kernel for several days and this type of kernel panic never happens again. Signed-off-by: Tao Ma <tao.ma@oracle.com> Signed-off-by: Joel Becker <joel.becker@oracle.com>
2009-11-30 14:32:19 +08:00
if (cpos_end < low_cpos + len)
len = cpos_end - low_cpos;
}
ret = ocfs2_read_refcount_block(ci, le64_to_cpu(rec->e_blkno),
&ref_leaf_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
ocfs2_find_refcount_rec_in_rl(ci, ref_leaf_bh, cpos, len,
ret_rec, index);
*ret_bh = ref_leaf_bh;
out:
brelse(eb_bh);
return ret;
}
enum ocfs2_ref_rec_contig {
REF_CONTIG_NONE = 0,
REF_CONTIG_LEFT,
REF_CONTIG_RIGHT,
REF_CONTIG_LEFTRIGHT,
};
static enum ocfs2_ref_rec_contig
ocfs2_refcount_rec_adjacent(struct ocfs2_refcount_block *rb,
int index)
{
if ((rb->rf_records.rl_recs[index].r_refcount ==
rb->rf_records.rl_recs[index + 1].r_refcount) &&
(le64_to_cpu(rb->rf_records.rl_recs[index].r_cpos) +
le32_to_cpu(rb->rf_records.rl_recs[index].r_clusters) ==
le64_to_cpu(rb->rf_records.rl_recs[index + 1].r_cpos)))
return REF_CONTIG_RIGHT;
return REF_CONTIG_NONE;
}
static enum ocfs2_ref_rec_contig
ocfs2_refcount_rec_contig(struct ocfs2_refcount_block *rb,
int index)
{
enum ocfs2_ref_rec_contig ret = REF_CONTIG_NONE;
if (index < le16_to_cpu(rb->rf_records.rl_used) - 1)
ret = ocfs2_refcount_rec_adjacent(rb, index);
if (index > 0) {
enum ocfs2_ref_rec_contig tmp;
tmp = ocfs2_refcount_rec_adjacent(rb, index - 1);
if (tmp == REF_CONTIG_RIGHT) {
if (ret == REF_CONTIG_RIGHT)
ret = REF_CONTIG_LEFTRIGHT;
else
ret = REF_CONTIG_LEFT;
}
}
return ret;
}
static void ocfs2_rotate_refcount_rec_left(struct ocfs2_refcount_block *rb,
int index)
{
BUG_ON(rb->rf_records.rl_recs[index].r_refcount !=
rb->rf_records.rl_recs[index+1].r_refcount);
le32_add_cpu(&rb->rf_records.rl_recs[index].r_clusters,
le32_to_cpu(rb->rf_records.rl_recs[index+1].r_clusters));
if (index < le16_to_cpu(rb->rf_records.rl_used) - 2)
memmove(&rb->rf_records.rl_recs[index + 1],
&rb->rf_records.rl_recs[index + 2],
sizeof(struct ocfs2_refcount_rec) *
(le16_to_cpu(rb->rf_records.rl_used) - index - 2));
memset(&rb->rf_records.rl_recs[le16_to_cpu(rb->rf_records.rl_used) - 1],
0, sizeof(struct ocfs2_refcount_rec));
le16_add_cpu(&rb->rf_records.rl_used, -1);
}
/*
* Merge the refcount rec if we are contiguous with the adjacent recs.
*/
static void ocfs2_refcount_rec_merge(struct ocfs2_refcount_block *rb,
int index)
{
enum ocfs2_ref_rec_contig contig =
ocfs2_refcount_rec_contig(rb, index);
if (contig == REF_CONTIG_NONE)
return;
if (contig == REF_CONTIG_LEFT || contig == REF_CONTIG_LEFTRIGHT) {
BUG_ON(index == 0);
index--;
}
ocfs2_rotate_refcount_rec_left(rb, index);
if (contig == REF_CONTIG_LEFTRIGHT)
ocfs2_rotate_refcount_rec_left(rb, index);
}
/*
* Change the refcount indexed by "index" in ref_bh.
* If refcount reaches 0, remove it.
*/
static int ocfs2_change_refcount_rec(handle_t *handle,
struct ocfs2_caching_info *ci,
struct buffer_head *ref_leaf_bh,
int index, int merge, int change)
{
int ret;
struct ocfs2_refcount_block *rb =
(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
struct ocfs2_refcount_list *rl = &rb->rf_records;
struct ocfs2_refcount_rec *rec = &rl->rl_recs[index];
ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
}
trace_ocfs2_change_refcount_rec(
(unsigned long long)ocfs2_metadata_cache_owner(ci),
index, le32_to_cpu(rec->r_refcount), change);
le32_add_cpu(&rec->r_refcount, change);
if (!rec->r_refcount) {
if (index != le16_to_cpu(rl->rl_used) - 1) {
memmove(rec, rec + 1,
(le16_to_cpu(rl->rl_used) - index - 1) *
sizeof(struct ocfs2_refcount_rec));
memset(&rl->rl_recs[le16_to_cpu(rl->rl_used) - 1],
0, sizeof(struct ocfs2_refcount_rec));
}
le16_add_cpu(&rl->rl_used, -1);
} else if (merge)
ocfs2_refcount_rec_merge(rb, index);
ocfs2_journal_dirty(handle, ref_leaf_bh);
out:
return ret;
}
static int ocfs2_expand_inline_ref_root(handle_t *handle,
struct ocfs2_caching_info *ci,
struct buffer_head *ref_root_bh,
struct buffer_head **ref_leaf_bh,
struct ocfs2_alloc_context *meta_ac)
{
int ret;
u16 suballoc_bit_start;
u32 num_got;
u64 suballoc_loc, blkno;
struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
struct buffer_head *new_bh = NULL;
struct ocfs2_refcount_block *new_rb;
struct ocfs2_refcount_block *root_rb =
(struct ocfs2_refcount_block *)ref_root_bh->b_data;
ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
}
ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
&suballoc_bit_start, &num_got,
&blkno);
if (ret) {
mlog_errno(ret);
goto out;
}
new_bh = sb_getblk(sb, blkno);
if (new_bh == NULL) {
ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
ocfs2_set_new_buffer_uptodate(ci, new_bh);
ret = ocfs2_journal_access_rb(handle, ci, new_bh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (ret) {
mlog_errno(ret);
goto out;
}
/*
* Initialize ocfs2_refcount_block.
* It should contain the same information as the old root.
* so just memcpy it and change the corresponding field.
*/
memcpy(new_bh->b_data, ref_root_bh->b_data, sb->s_blocksize);
new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
new_rb->rf_blkno = cpu_to_le64(blkno);
new_rb->rf_cpos = cpu_to_le32(0);
new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
ocfs2_journal_dirty(handle, new_bh);
/* Now change the root. */
memset(&root_rb->rf_list, 0, sb->s_blocksize -
offsetof(struct ocfs2_refcount_block, rf_list));
root_rb->rf_list.l_count = cpu_to_le16(ocfs2_extent_recs_per_rb(sb));
root_rb->rf_clusters = cpu_to_le32(1);
root_rb->rf_list.l_next_free_rec = cpu_to_le16(1);
root_rb->rf_list.l_recs[0].e_blkno = cpu_to_le64(blkno);
root_rb->rf_list.l_recs[0].e_leaf_clusters = cpu_to_le16(1);
root_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_TREE_FL);
ocfs2_journal_dirty(handle, ref_root_bh);
trace_ocfs2_expand_inline_ref_root((unsigned long long)blkno,
le16_to_cpu(new_rb->rf_records.rl_used));
*ref_leaf_bh = new_bh;
new_bh = NULL;
out:
brelse(new_bh);
return ret;
}
static int ocfs2_refcount_rec_no_intersect(struct ocfs2_refcount_rec *prev,
struct ocfs2_refcount_rec *next)
{
if (ocfs2_get_ref_rec_low_cpos(prev) + le32_to_cpu(prev->r_clusters) <=
ocfs2_get_ref_rec_low_cpos(next))
return 1;
return 0;
}
static int cmp_refcount_rec_by_low_cpos(const void *a, const void *b)
{
const struct ocfs2_refcount_rec *l = a, *r = b;
u32 l_cpos = ocfs2_get_ref_rec_low_cpos(l);
u32 r_cpos = ocfs2_get_ref_rec_low_cpos(r);
if (l_cpos > r_cpos)
return 1;
if (l_cpos < r_cpos)
return -1;
return 0;
}
static int cmp_refcount_rec_by_cpos(const void *a, const void *b)
{
const struct ocfs2_refcount_rec *l = a, *r = b;
u64 l_cpos = le64_to_cpu(l->r_cpos);
u64 r_cpos = le64_to_cpu(r->r_cpos);
if (l_cpos > r_cpos)
return 1;
if (l_cpos < r_cpos)
return -1;
return 0;
}
static void swap_refcount_rec(void *a, void *b, int size)
{
struct ocfs2_refcount_rec *l = a, *r = b;
swap(*l, *r);
}
/*
* The refcount cpos are ordered by their 64bit cpos,
* But we will use the low 32 bit to be the e_cpos in the b-tree.
* So we need to make sure that this pos isn't intersected with others.
*
* Note: The refcount block is already sorted by their low 32 bit cpos,
* So just try the middle pos first, and we will exit when we find
* the good position.
*/
static int ocfs2_find_refcount_split_pos(struct ocfs2_refcount_list *rl,
u32 *split_pos, int *split_index)
{
int num_used = le16_to_cpu(rl->rl_used);
int delta, middle = num_used / 2;
for (delta = 0; delta < middle; delta++) {
/* Let's check delta earlier than middle */
if (ocfs2_refcount_rec_no_intersect(
&rl->rl_recs[middle - delta - 1],
&rl->rl_recs[middle - delta])) {
*split_index = middle - delta;
break;
}
/* For even counts, don't walk off the end */
if ((middle + delta + 1) == num_used)
continue;
/* Now try delta past middle */
if (ocfs2_refcount_rec_no_intersect(
&rl->rl_recs[middle + delta],
&rl->rl_recs[middle + delta + 1])) {
*split_index = middle + delta + 1;
break;
}
}
if (delta >= middle)
return -ENOSPC;
*split_pos = ocfs2_get_ref_rec_low_cpos(&rl->rl_recs[*split_index]);
return 0;
}
static int ocfs2_divide_leaf_refcount_block(struct buffer_head *ref_leaf_bh,
struct buffer_head *new_bh,
u32 *split_cpos)
{
int split_index = 0, num_moved, ret;
u32 cpos = 0;
struct ocfs2_refcount_block *rb =
(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
struct ocfs2_refcount_list *rl = &rb->rf_records;
struct ocfs2_refcount_block *new_rb =
(struct ocfs2_refcount_block *)new_bh->b_data;
struct ocfs2_refcount_list *new_rl = &new_rb->rf_records;
trace_ocfs2_divide_leaf_refcount_block(
(unsigned long long)ref_leaf_bh->b_blocknr,
le16_to_cpu(rl->rl_count), le16_to_cpu(rl->rl_used));
/*
* XXX: Improvement later.
* If we know all the high 32 bit cpos is the same, no need to sort.
*
* In order to make the whole process safe, we do:
* 1. sort the entries by their low 32 bit cpos first so that we can
* find the split cpos easily.
* 2. call ocfs2_insert_extent to insert the new refcount block.
* 3. move the refcount rec to the new block.
* 4. sort the entries by their 64 bit cpos.
* 5. dirty the new_rb and rb.
*/
sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
sizeof(struct ocfs2_refcount_rec),
cmp_refcount_rec_by_low_cpos, swap_refcount_rec);
ret = ocfs2_find_refcount_split_pos(rl, &cpos, &split_index);
if (ret) {
mlog_errno(ret);
return ret;
}
new_rb->rf_cpos = cpu_to_le32(cpos);
/* move refcount records starting from split_index to the new block. */
num_moved = le16_to_cpu(rl->rl_used) - split_index;
memcpy(new_rl->rl_recs, &rl->rl_recs[split_index],
num_moved * sizeof(struct ocfs2_refcount_rec));
/*ok, remove the entries we just moved over to the other block. */
memset(&rl->rl_recs[split_index], 0,
num_moved * sizeof(struct ocfs2_refcount_rec));
/* change old and new rl_used accordingly. */
le16_add_cpu(&rl->rl_used, -num_moved);
new_rl->rl_used = cpu_to_le16(num_moved);
sort(&rl->rl_recs, le16_to_cpu(rl->rl_used),
sizeof(struct ocfs2_refcount_rec),
cmp_refcount_rec_by_cpos, swap_refcount_rec);
sort(&new_rl->rl_recs, le16_to_cpu(new_rl->rl_used),
sizeof(struct ocfs2_refcount_rec),
cmp_refcount_rec_by_cpos, swap_refcount_rec);
*split_cpos = cpos;
return 0;
}
static int ocfs2_new_leaf_refcount_block(handle_t *handle,
struct ocfs2_caching_info *ci,
struct buffer_head *ref_root_bh,
struct buffer_head *ref_leaf_bh,
struct ocfs2_alloc_context *meta_ac)
{
int ret;
u16 suballoc_bit_start;
u32 num_got, new_cpos;
u64 suballoc_loc, blkno;
struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
struct ocfs2_refcount_block *root_rb =
(struct ocfs2_refcount_block *)ref_root_bh->b_data;
struct buffer_head *new_bh = NULL;
struct ocfs2_refcount_block *new_rb;
struct ocfs2_extent_tree ref_et;
BUG_ON(!(le32_to_cpu(root_rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL));
ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
}
ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
}
ret = ocfs2_claim_metadata(handle, meta_ac, 1, &suballoc_loc,
&suballoc_bit_start, &num_got,
&blkno);
if (ret) {
mlog_errno(ret);
goto out;
}
new_bh = sb_getblk(sb, blkno);
if (new_bh == NULL) {
ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
ocfs2_set_new_buffer_uptodate(ci, new_bh);
ret = ocfs2_journal_access_rb(handle, ci, new_bh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (ret) {
mlog_errno(ret);
goto out;
}
/* Initialize ocfs2_refcount_block. */
new_rb = (struct ocfs2_refcount_block *)new_bh->b_data;
memset(new_rb, 0, sb->s_blocksize);
strcpy((void *)new_rb, OCFS2_REFCOUNT_BLOCK_SIGNATURE);
new_rb->rf_suballoc_slot = cpu_to_le16(meta_ac->ac_alloc_slot);
new_rb->rf_suballoc_loc = cpu_to_le64(suballoc_loc);
new_rb->rf_suballoc_bit = cpu_to_le16(suballoc_bit_start);
new_rb->rf_fs_generation = cpu_to_le32(OCFS2_SB(sb)->fs_generation);
new_rb->rf_blkno = cpu_to_le64(blkno);
new_rb->rf_parent = cpu_to_le64(ref_root_bh->b_blocknr);
new_rb->rf_flags = cpu_to_le32(OCFS2_REFCOUNT_LEAF_FL);
new_rb->rf_records.rl_count =
cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
new_rb->rf_generation = root_rb->rf_generation;
ret = ocfs2_divide_leaf_refcount_block(ref_leaf_bh, new_bh, &new_cpos);
if (ret) {
mlog_errno(ret);
goto out;
}
ocfs2_journal_dirty(handle, ref_leaf_bh);
ocfs2_journal_dirty(handle, new_bh);
ocfs2_init_refcount_extent_tree(&ref_et, ci, ref_root_bh);
trace_ocfs2_new_leaf_refcount_block(
(unsigned long long)new_bh->b_blocknr, new_cpos);
/* Insert the new leaf block with the specific offset cpos. */
ret = ocfs2_insert_extent(handle, &ref_et, new_cpos, new_bh->b_blocknr,
1, 0, meta_ac);
if (ret)
mlog_errno(ret);
out:
brelse(new_bh);
return ret;
}
static int ocfs2_expand_refcount_tree(handle_t *handle,
struct ocfs2_caching_info *ci,
struct buffer_head *ref_root_bh,
struct buffer_head *ref_leaf_bh,
struct ocfs2_alloc_context *meta_ac)
{
int ret;
struct buffer_head *expand_bh = NULL;
if (ref_root_bh == ref_leaf_bh) {
/*
* the old root bh hasn't been expanded to a b-tree,
* so expand it first.
*/
ret = ocfs2_expand_inline_ref_root(handle, ci, ref_root_bh,
&expand_bh, meta_ac);
if (ret) {
mlog_errno(ret);
goto out;
}
} else {
expand_bh = ref_leaf_bh;
get_bh(expand_bh);
}
/* Now add a new refcount block into the tree.*/
ret = ocfs2_new_leaf_refcount_block(handle, ci, ref_root_bh,
expand_bh, meta_ac);
if (ret)
mlog_errno(ret);
out:
brelse(expand_bh);
return ret;
}
/*
* Adjust the extent rec in b-tree representing ref_leaf_bh.
*
* Only called when we have inserted a new refcount rec at index 0
* which means ocfs2_extent_rec.e_cpos may need some change.
*/
static int ocfs2_adjust_refcount_rec(handle_t *handle,
struct ocfs2_caching_info *ci,
struct buffer_head *ref_root_bh,
struct buffer_head *ref_leaf_bh,
struct ocfs2_refcount_rec *rec)
{
int ret = 0, i;
u32 new_cpos, old_cpos;
struct ocfs2_path *path = NULL;
struct ocfs2_extent_tree et;
struct ocfs2_refcount_block *rb =
(struct ocfs2_refcount_block *)ref_root_bh->b_data;
struct ocfs2_extent_list *el;
if (!(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL))
goto out;
rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
old_cpos = le32_to_cpu(rb->rf_cpos);
new_cpos = le64_to_cpu(rec->r_cpos) & OCFS2_32BIT_POS_MASK;
if (old_cpos <= new_cpos)
goto out;
ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
path = ocfs2_new_path_from_et(&et);
if (!path) {
ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
ret = ocfs2_find_path(ci, path, old_cpos);
if (ret) {
mlog_errno(ret);
goto out;
}
/*
* 2 more credits, one for the leaf refcount block, one for
* the extent block contains the extent rec.
*/
ret = ocfs2_extend_trans(handle, 2);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
ret = ocfs2_journal_access_eb(handle, ci, path_leaf_bh(path),
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
/* change the leaf extent block first. */
el = path_leaf_el(path);
for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++)
if (le32_to_cpu(el->l_recs[i].e_cpos) == old_cpos)
break;
BUG_ON(i == le16_to_cpu(el->l_next_free_rec));
el->l_recs[i].e_cpos = cpu_to_le32(new_cpos);
/* change the r_cpos in the leaf block. */
rb->rf_cpos = cpu_to_le32(new_cpos);
ocfs2_journal_dirty(handle, path_leaf_bh(path));
ocfs2_journal_dirty(handle, ref_leaf_bh);
out:
ocfs2_free_path(path);
return ret;
}
static int ocfs2_insert_refcount_rec(handle_t *handle,
struct ocfs2_caching_info *ci,
struct buffer_head *ref_root_bh,
struct buffer_head *ref_leaf_bh,
struct ocfs2_refcount_rec *rec,
int index, int merge,
struct ocfs2_alloc_context *meta_ac)
{
int ret;
struct ocfs2_refcount_block *rb =
(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
struct ocfs2_refcount_list *rf_list = &rb->rf_records;
struct buffer_head *new_bh = NULL;
BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
if (rf_list->rl_used == rf_list->rl_count) {
u64 cpos = le64_to_cpu(rec->r_cpos);
u32 len = le32_to_cpu(rec->r_clusters);
ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
ref_leaf_bh, meta_ac);
if (ret) {
mlog_errno(ret);
goto out;
}
ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
cpos, len, NULL, &index,
&new_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
ref_leaf_bh = new_bh;
rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
rf_list = &rb->rf_records;
}
ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
}
if (index < le16_to_cpu(rf_list->rl_used))
memmove(&rf_list->rl_recs[index + 1],
&rf_list->rl_recs[index],
(le16_to_cpu(rf_list->rl_used) - index) *
sizeof(struct ocfs2_refcount_rec));
trace_ocfs2_insert_refcount_rec(
(unsigned long long)ref_leaf_bh->b_blocknr, index,
(unsigned long long)le64_to_cpu(rec->r_cpos),
le32_to_cpu(rec->r_clusters), le32_to_cpu(rec->r_refcount));
rf_list->rl_recs[index] = *rec;
le16_add_cpu(&rf_list->rl_used, 1);
if (merge)
ocfs2_refcount_rec_merge(rb, index);
ocfs2_journal_dirty(handle, ref_leaf_bh);
if (index == 0) {
ret = ocfs2_adjust_refcount_rec(handle, ci,
ref_root_bh,
ref_leaf_bh, rec);
if (ret)
mlog_errno(ret);
}
out:
brelse(new_bh);
return ret;
}
/*
* Split the refcount_rec indexed by "index" in ref_leaf_bh.
* This is much simple than our b-tree code.
* split_rec is the new refcount rec we want to insert.
* If split_rec->r_refcount > 0, we are changing the refcount(in case we
* increase refcount or decrease a refcount to non-zero).
* If split_rec->r_refcount == 0, we are punching a hole in current refcount
* rec( in case we decrease a refcount to zero).
*/
static int ocfs2_split_refcount_rec(handle_t *handle,
struct ocfs2_caching_info *ci,
struct buffer_head *ref_root_bh,
struct buffer_head *ref_leaf_bh,
struct ocfs2_refcount_rec *split_rec,
int index, int merge,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_cached_dealloc_ctxt *dealloc)
{
int ret, recs_need;
u32 len;
struct ocfs2_refcount_block *rb =
(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
struct ocfs2_refcount_list *rf_list = &rb->rf_records;
struct ocfs2_refcount_rec *orig_rec = &rf_list->rl_recs[index];
struct ocfs2_refcount_rec *tail_rec = NULL;
struct buffer_head *new_bh = NULL;
BUG_ON(le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL);
trace_ocfs2_split_refcount_rec(le64_to_cpu(orig_rec->r_cpos),
le32_to_cpu(orig_rec->r_clusters),
le32_to_cpu(orig_rec->r_refcount),
le64_to_cpu(split_rec->r_cpos),
le32_to_cpu(split_rec->r_clusters),
le32_to_cpu(split_rec->r_refcount));
/*
* If we just need to split the header or tail clusters,
* no more recs are needed, just split is OK.
* Otherwise we at least need one new recs.
*/
if (!split_rec->r_refcount &&
(split_rec->r_cpos == orig_rec->r_cpos ||
le64_to_cpu(split_rec->r_cpos) +
le32_to_cpu(split_rec->r_clusters) ==
le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
recs_need = 0;
else
recs_need = 1;
/*
* We need one more rec if we split in the middle and the new rec have
* some refcount in it.
*/
if (split_rec->r_refcount &&
(split_rec->r_cpos != orig_rec->r_cpos &&
le64_to_cpu(split_rec->r_cpos) +
le32_to_cpu(split_rec->r_clusters) !=
le64_to_cpu(orig_rec->r_cpos) + le32_to_cpu(orig_rec->r_clusters)))
recs_need++;
/* If the leaf block don't have enough record, expand it. */
if (le16_to_cpu(rf_list->rl_used) + recs_need >
le16_to_cpu(rf_list->rl_count)) {
struct ocfs2_refcount_rec tmp_rec;
u64 cpos = le64_to_cpu(orig_rec->r_cpos);
len = le32_to_cpu(orig_rec->r_clusters);
ret = ocfs2_expand_refcount_tree(handle, ci, ref_root_bh,
ref_leaf_bh, meta_ac);
if (ret) {
mlog_errno(ret);
goto out;
}
/*
* We have to re-get it since now cpos may be moved to
* another leaf block.
*/
ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
cpos, len, &tmp_rec, &index,
&new_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
ref_leaf_bh = new_bh;
rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
rf_list = &rb->rf_records;
orig_rec = &rf_list->rl_recs[index];
}
ret = ocfs2_journal_access_rb(handle, ci, ref_leaf_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
}
/*
* We have calculated out how many new records we need and store
* in recs_need, so spare enough space first by moving the records
* after "index" to the end.
*/
if (index != le16_to_cpu(rf_list->rl_used) - 1)
memmove(&rf_list->rl_recs[index + 1 + recs_need],
&rf_list->rl_recs[index + 1],
(le16_to_cpu(rf_list->rl_used) - index - 1) *
sizeof(struct ocfs2_refcount_rec));
len = (le64_to_cpu(orig_rec->r_cpos) +
le32_to_cpu(orig_rec->r_clusters)) -
(le64_to_cpu(split_rec->r_cpos) +
le32_to_cpu(split_rec->r_clusters));
/*
* If we have "len", the we will split in the tail and move it
* to the end of the space we have just spared.
*/
if (len) {
tail_rec = &rf_list->rl_recs[index + recs_need];
memcpy(tail_rec, orig_rec, sizeof(struct ocfs2_refcount_rec));
le64_add_cpu(&tail_rec->r_cpos,
le32_to_cpu(tail_rec->r_clusters) - len);
tail_rec->r_clusters = cpu_to_le32(len);
}
/*
* If the split pos isn't the same as the original one, we need to
* split in the head.
*
* Note: We have the chance that split_rec.r_refcount = 0,
* recs_need = 0 and len > 0, which means we just cut the head from
* the orig_rec and in that case we have done some modification in
* orig_rec above, so the check for r_cpos is faked.
*/
if (split_rec->r_cpos != orig_rec->r_cpos && tail_rec != orig_rec) {
len = le64_to_cpu(split_rec->r_cpos) -
le64_to_cpu(orig_rec->r_cpos);
orig_rec->r_clusters = cpu_to_le32(len);
index++;
}
le16_add_cpu(&rf_list->rl_used, recs_need);
if (split_rec->r_refcount) {
rf_list->rl_recs[index] = *split_rec;
trace_ocfs2_split_refcount_rec_insert(
(unsigned long long)ref_leaf_bh->b_blocknr, index,
(unsigned long long)le64_to_cpu(split_rec->r_cpos),
le32_to_cpu(split_rec->r_clusters),
le32_to_cpu(split_rec->r_refcount));
if (merge)
ocfs2_refcount_rec_merge(rb, index);
}
ocfs2_journal_dirty(handle, ref_leaf_bh);
out:
brelse(new_bh);
return ret;
}
static int __ocfs2_increase_refcount(handle_t *handle,
struct ocfs2_caching_info *ci,
struct buffer_head *ref_root_bh,
u64 cpos, u32 len, int merge,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_cached_dealloc_ctxt *dealloc)
{
int ret = 0, index;
struct buffer_head *ref_leaf_bh = NULL;
struct ocfs2_refcount_rec rec;
unsigned int set_len = 0;
trace_ocfs2_increase_refcount_begin(
(unsigned long long)ocfs2_metadata_cache_owner(ci),
(unsigned long long)cpos, len);
while (len) {
ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
cpos, len, &rec, &index,
&ref_leaf_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
set_len = le32_to_cpu(rec.r_clusters);
/*
* Here we may meet with 3 situations:
*
* 1. If we find an already existing record, and the length
* is the same, cool, we just need to increase the r_refcount
* and it is OK.
* 2. If we find a hole, just insert it with r_refcount = 1.
* 3. If we are in the middle of one extent record, split
* it.
*/
if (rec.r_refcount && le64_to_cpu(rec.r_cpos) == cpos &&
set_len <= len) {
trace_ocfs2_increase_refcount_change(
(unsigned long long)cpos, set_len,
le32_to_cpu(rec.r_refcount));
ret = ocfs2_change_refcount_rec(handle, ci,
ref_leaf_bh, index,
merge, 1);
if (ret) {
mlog_errno(ret);
goto out;
}
} else if (!rec.r_refcount) {
rec.r_refcount = cpu_to_le32(1);
trace_ocfs2_increase_refcount_insert(
(unsigned long long)le64_to_cpu(rec.r_cpos),
set_len);
ret = ocfs2_insert_refcount_rec(handle, ci, ref_root_bh,
ref_leaf_bh,
&rec, index,
merge, meta_ac);
if (ret) {
mlog_errno(ret);
goto out;
}
} else {
set_len = min((u64)(cpos + len),
le64_to_cpu(rec.r_cpos) + set_len) - cpos;
rec.r_cpos = cpu_to_le64(cpos);
rec.r_clusters = cpu_to_le32(set_len);
le32_add_cpu(&rec.r_refcount, 1);
trace_ocfs2_increase_refcount_split(
(unsigned long long)le64_to_cpu(rec.r_cpos),
set_len, le32_to_cpu(rec.r_refcount));
ret = ocfs2_split_refcount_rec(handle, ci,
ref_root_bh, ref_leaf_bh,
&rec, index, merge,
meta_ac, dealloc);
if (ret) {
mlog_errno(ret);
goto out;
}
}
cpos += set_len;
len -= set_len;
brelse(ref_leaf_bh);
ref_leaf_bh = NULL;
}
out:
brelse(ref_leaf_bh);
return ret;
}
static int ocfs2_remove_refcount_extent(handle_t *handle,
struct ocfs2_caching_info *ci,
struct buffer_head *ref_root_bh,
struct buffer_head *ref_leaf_bh,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_cached_dealloc_ctxt *dealloc)
{
int ret;
struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
struct ocfs2_refcount_block *rb =
(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
struct ocfs2_extent_tree et;
BUG_ON(rb->rf_records.rl_used);
trace_ocfs2_remove_refcount_extent(
(unsigned long long)ocfs2_metadata_cache_owner(ci),
(unsigned long long)ref_leaf_bh->b_blocknr,
le32_to_cpu(rb->rf_cpos));
ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
ret = ocfs2_remove_extent(handle, &et, le32_to_cpu(rb->rf_cpos),
1, meta_ac, dealloc);
if (ret) {
mlog_errno(ret);
goto out;
}
ocfs2_remove_from_cache(ci, ref_leaf_bh);
/*
* add the freed block to the dealloc so that it will be freed
* when we run dealloc.
*/
ret = ocfs2_cache_block_dealloc(dealloc, EXTENT_ALLOC_SYSTEM_INODE,
le16_to_cpu(rb->rf_suballoc_slot),
le64_to_cpu(rb->rf_suballoc_loc),
le64_to_cpu(rb->rf_blkno),
le16_to_cpu(rb->rf_suballoc_bit));
if (ret) {
mlog_errno(ret);
goto out;
}
ret = ocfs2_journal_access_rb(handle, ci, ref_root_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out;
}
rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
le32_add_cpu(&rb->rf_clusters, -1);
/*
* check whether we need to restore the root refcount block if
* there is no leaf extent block at atll.
*/
if (!rb->rf_list.l_next_free_rec) {
BUG_ON(rb->rf_clusters);
trace_ocfs2_restore_refcount_block(
(unsigned long long)ref_root_bh->b_blocknr);
rb->rf_flags = 0;
rb->rf_parent = 0;
rb->rf_cpos = 0;
memset(&rb->rf_records, 0, sb->s_blocksize -
offsetof(struct ocfs2_refcount_block, rf_records));
rb->rf_records.rl_count =
cpu_to_le16(ocfs2_refcount_recs_per_rb(sb));
}
ocfs2_journal_dirty(handle, ref_root_bh);
out:
return ret;
}
int ocfs2_increase_refcount(handle_t *handle,
struct ocfs2_caching_info *ci,
struct buffer_head *ref_root_bh,
u64 cpos, u32 len,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_cached_dealloc_ctxt *dealloc)
{
return __ocfs2_increase_refcount(handle, ci, ref_root_bh,
cpos, len, 1,
meta_ac, dealloc);
}
static int ocfs2_decrease_refcount_rec(handle_t *handle,
struct ocfs2_caching_info *ci,
struct buffer_head *ref_root_bh,
struct buffer_head *ref_leaf_bh,
int index, u64 cpos, unsigned int len,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_cached_dealloc_ctxt *dealloc)
{
int ret;
struct ocfs2_refcount_block *rb =
(struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
struct ocfs2_refcount_rec *rec = &rb->rf_records.rl_recs[index];
BUG_ON(cpos < le64_to_cpu(rec->r_cpos));
BUG_ON(cpos + len >
le64_to_cpu(rec->r_cpos) + le32_to_cpu(rec->r_clusters));
trace_ocfs2_decrease_refcount_rec(
(unsigned long long)ocfs2_metadata_cache_owner(ci),
(unsigned long long)cpos, len);
if (cpos == le64_to_cpu(rec->r_cpos) &&
len == le32_to_cpu(rec->r_clusters))
ret = ocfs2_change_refcount_rec(handle, ci,
ref_leaf_bh, index, 1, -1);
else {
struct ocfs2_refcount_rec split = *rec;
split.r_cpos = cpu_to_le64(cpos);
split.r_clusters = cpu_to_le32(len);
le32_add_cpu(&split.r_refcount, -1);
ret = ocfs2_split_refcount_rec(handle, ci,
ref_root_bh, ref_leaf_bh,
&split, index, 1,
meta_ac, dealloc);
}
if (ret) {
mlog_errno(ret);
goto out;
}
/* Remove the leaf refcount block if it contains no refcount record. */
if (!rb->rf_records.rl_used && ref_leaf_bh != ref_root_bh) {
ret = ocfs2_remove_refcount_extent(handle, ci, ref_root_bh,
ref_leaf_bh, meta_ac,
dealloc);
if (ret)
mlog_errno(ret);
}
out:
return ret;
}
static int __ocfs2_decrease_refcount(handle_t *handle,
struct ocfs2_caching_info *ci,
struct buffer_head *ref_root_bh,
u64 cpos, u32 len,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_cached_dealloc_ctxt *dealloc,
int delete)
{
int ret = 0, index = 0;
struct ocfs2_refcount_rec rec;
unsigned int r_count = 0, r_len;
struct super_block *sb = ocfs2_metadata_cache_get_super(ci);
struct buffer_head *ref_leaf_bh = NULL;
trace_ocfs2_decrease_refcount(
(unsigned long long)ocfs2_metadata_cache_owner(ci),
(unsigned long long)cpos, len, delete);
while (len) {
ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
cpos, len, &rec, &index,
&ref_leaf_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
r_count = le32_to_cpu(rec.r_refcount);
BUG_ON(r_count == 0);
if (!delete)
BUG_ON(r_count > 1);
r_len = min((u64)(cpos + len), le64_to_cpu(rec.r_cpos) +
le32_to_cpu(rec.r_clusters)) - cpos;
ret = ocfs2_decrease_refcount_rec(handle, ci, ref_root_bh,
ref_leaf_bh, index,
cpos, r_len,
meta_ac, dealloc);
if (ret) {
mlog_errno(ret);
goto out;
}
if (le32_to_cpu(rec.r_refcount) == 1 && delete) {
ret = ocfs2_cache_cluster_dealloc(dealloc,
ocfs2_clusters_to_blocks(sb, cpos),
r_len);
if (ret) {
mlog_errno(ret);
goto out;
}
}
cpos += r_len;
len -= r_len;
brelse(ref_leaf_bh);
ref_leaf_bh = NULL;
}
out:
brelse(ref_leaf_bh);
return ret;
}
/* Caller must hold refcount tree lock. */
int ocfs2_decrease_refcount(struct inode *inode,
handle_t *handle, u32 cpos, u32 len,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_cached_dealloc_ctxt *dealloc,
int delete)
{
int ret;
u64 ref_blkno;
struct buffer_head *ref_root_bh = NULL;
struct ocfs2_refcount_tree *tree;
BUG_ON(!ocfs2_is_refcount_inode(inode));
ret = ocfs2_get_refcount_block(inode, &ref_blkno);
if (ret) {
mlog_errno(ret);
goto out;
}
ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb), ref_blkno, &tree);
if (ret) {
mlog_errno(ret);
goto out;
}
ret = ocfs2_read_refcount_block(&tree->rf_ci, tree->rf_blkno,
&ref_root_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
ret = __ocfs2_decrease_refcount(handle, &tree->rf_ci, ref_root_bh,
cpos, len, meta_ac, dealloc, delete);
if (ret)
mlog_errno(ret);
out:
brelse(ref_root_bh);
return ret;
}
/*
* Mark the already-existing extent at cpos as refcounted for len clusters.
* This adds the refcount extent flag.
*
* If the existing extent is larger than the request, initiate a
* split. An attempt will be made at merging with adjacent extents.
*
* The caller is responsible for passing down meta_ac if we'll need it.
*/
static int ocfs2_mark_extent_refcounted(struct inode *inode,
struct ocfs2_extent_tree *et,
handle_t *handle, u32 cpos,
u32 len, u32 phys,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_cached_dealloc_ctxt *dealloc)
{
int ret;
trace_ocfs2_mark_extent_refcounted(OCFS2_I(inode)->ip_blkno,
cpos, len, phys);
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
inode->i_ino);
goto out;
}
ret = ocfs2_change_extent_flag(handle, et, cpos,
len, phys, meta_ac, dealloc,
OCFS2_EXT_REFCOUNTED, 0);
if (ret)
mlog_errno(ret);
out:
return ret;
}
/*
* Given some contiguous physical clusters, calculate what we need
* for modifying their refcount.
*/
static int ocfs2_calc_refcount_meta_credits(struct super_block *sb,
struct ocfs2_caching_info *ci,
struct buffer_head *ref_root_bh,
u64 start_cpos,
u32 clusters,
int *meta_add,
int *credits)
{
int ret = 0, index, ref_blocks = 0, recs_add = 0;
u64 cpos = start_cpos;
struct ocfs2_refcount_block *rb;
struct ocfs2_refcount_rec rec;
struct buffer_head *ref_leaf_bh = NULL, *prev_bh = NULL;
u32 len;
while (clusters) {
ret = ocfs2_get_refcount_rec(ci, ref_root_bh,
cpos, clusters, &rec,
&index, &ref_leaf_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
if (ref_leaf_bh != prev_bh) {
/*
* Now we encounter a new leaf block, so calculate
* whether we need to extend the old leaf.
*/
if (prev_bh) {
rb = (struct ocfs2_refcount_block *)
prev_bh->b_data;
if (le16_to_cpu(rb->rf_records.rl_used) +
recs_add >
le16_to_cpu(rb->rf_records.rl_count))
ref_blocks++;
}
recs_add = 0;
*credits += 1;
brelse(prev_bh);
prev_bh = ref_leaf_bh;
get_bh(prev_bh);
}
trace_ocfs2_calc_refcount_meta_credits_iterate(
recs_add, (unsigned long long)cpos, clusters,
(unsigned long long)le64_to_cpu(rec.r_cpos),
le32_to_cpu(rec.r_clusters),
le32_to_cpu(rec.r_refcount), index);
len = min((u64)cpos + clusters, le64_to_cpu(rec.r_cpos) +
le32_to_cpu(rec.r_clusters)) - cpos;
/*
* We record all the records which will be inserted to the
* same refcount block, so that we can tell exactly whether
* we need a new refcount block or not.
*
* If we will insert a new one, this is easy and only happens
* during adding refcounted flag to the extent, so we don't
* have a chance of spliting. We just need one record.
*
* If the refcount rec already exists, that would be a little
* complicated. we may have to:
* 1) split at the beginning if the start pos isn't aligned.
* we need 1 more record in this case.
* 2) split int the end if the end pos isn't aligned.
* we need 1 more record in this case.
* 3) split in the middle because of file system fragmentation.
* we need 2 more records in this case(we can't detect this
* beforehand, so always think of the worst case).
*/
if (rec.r_refcount) {
recs_add += 2;
/* Check whether we need a split at the beginning. */
if (cpos == start_cpos &&
cpos != le64_to_cpu(rec.r_cpos))
recs_add++;
/* Check whether we need a split in the end. */
if (cpos + clusters < le64_to_cpu(rec.r_cpos) +
le32_to_cpu(rec.r_clusters))
recs_add++;
} else
recs_add++;
brelse(ref_leaf_bh);
ref_leaf_bh = NULL;
clusters -= len;
cpos += len;
}
if (prev_bh) {
rb = (struct ocfs2_refcount_block *)prev_bh->b_data;
if (le16_to_cpu(rb->rf_records.rl_used) + recs_add >
le16_to_cpu(rb->rf_records.rl_count))
ref_blocks++;
*credits += 1;
}
if (!ref_blocks)
goto out;
*meta_add += ref_blocks;
*credits += ref_blocks;
/*
* So we may need ref_blocks to insert into the tree.
* That also means we need to change the b-tree and add that number
* of records since we never merge them.
* We need one more block for expansion since the new created leaf
* block is also full and needs split.
*/
rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL) {
struct ocfs2_extent_tree et;
ocfs2_init_refcount_extent_tree(&et, ci, ref_root_bh);
*meta_add += ocfs2_extend_meta_needed(et.et_root_el);
*credits += ocfs2_calc_extend_credits(sb,
et.et_root_el);
} else {
*credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
*meta_add += 1;
}
out:
trace_ocfs2_calc_refcount_meta_credits(
(unsigned long long)start_cpos, clusters,
*meta_add, *credits);
brelse(ref_leaf_bh);
brelse(prev_bh);
return ret;
}
/*
* For refcount tree, we will decrease some contiguous clusters
* refcount count, so just go through it to see how many blocks
* we gonna touch and whether we need to create new blocks.
*
* Normally the refcount blocks store these refcount should be
* contiguous also, so that we can get the number easily.
Ocfs2: Optimize ocfs2 truncate to use ocfs2_remove_btree_range() instead. Truncate is just a special case of punching holes(from new i_size to end), we therefore could take advantage of the existing ocfs2_remove_btree_range() to reduce the comlexity and redundancy in alloc.c. The goal here is to make truncate more generic and straightforward. Several functions only used by ocfs2_commit_truncate() will smiply be removed. ocfs2_remove_btree_range() was originally used by the hole punching code, which didn't take refcount trees into account (definitely a bug). We therefore need to change that func a bit to handle refcount trees. It must take the refcount lock, calculate and reserve blocks for refcount tree changes, and decrease refcounts at the end. We replace ocfs2_lock_allocators() here by adding a new func ocfs2_reserve_blocks_for_rec_trunc() which accepts some extra blocks to reserve. This will not hurt any other code using ocfs2_remove_btree_range() (such as dir truncate and hole punching). I merged the following steps into one patch since they may be logically doing one thing, though I know it looks a little bit fat to review. 1). Remove redundant code used by ocfs2_commit_truncate(), since we're moving to ocfs2_remove_btree_range anyway. 2). Add a new func ocfs2_reserve_blocks_for_rec_trunc() for purpose of accepting some extra blocks to reserve. 3). Change ocfs2_prepare_refcount_change_for_del() a bit to fit our needs. It's safe to do this since it's only being called by truncate. 4). Change ocfs2_remove_btree_range() a bit to take refcount case into account. 5). Finally, we change ocfs2_commit_truncate() to call ocfs2_remove_btree_range() in a proper way. The patch has been tested normally for sanity check, stress tests with heavier workload will be expected. Based on this patch, fixing the punching holes bug will be fairly easy. Signed-off-by: Tristan Ye <tristan.ye@oracle.com> Acked-by: Mark Fasheh <mfasheh@suse.com> Signed-off-by: Joel Becker <joel.becker@oracle.com>
2010-05-11 17:54:42 +08:00
* We will at most add split 2 refcount records and 2 more
* refcount blocks, so just check it in a rough way.
*
* Caller must hold refcount tree lock.
*/
int ocfs2_prepare_refcount_change_for_del(struct inode *inode,
Ocfs2: Optimize ocfs2 truncate to use ocfs2_remove_btree_range() instead. Truncate is just a special case of punching holes(from new i_size to end), we therefore could take advantage of the existing ocfs2_remove_btree_range() to reduce the comlexity and redundancy in alloc.c. The goal here is to make truncate more generic and straightforward. Several functions only used by ocfs2_commit_truncate() will smiply be removed. ocfs2_remove_btree_range() was originally used by the hole punching code, which didn't take refcount trees into account (definitely a bug). We therefore need to change that func a bit to handle refcount trees. It must take the refcount lock, calculate and reserve blocks for refcount tree changes, and decrease refcounts at the end. We replace ocfs2_lock_allocators() here by adding a new func ocfs2_reserve_blocks_for_rec_trunc() which accepts some extra blocks to reserve. This will not hurt any other code using ocfs2_remove_btree_range() (such as dir truncate and hole punching). I merged the following steps into one patch since they may be logically doing one thing, though I know it looks a little bit fat to review. 1). Remove redundant code used by ocfs2_commit_truncate(), since we're moving to ocfs2_remove_btree_range anyway. 2). Add a new func ocfs2_reserve_blocks_for_rec_trunc() for purpose of accepting some extra blocks to reserve. 3). Change ocfs2_prepare_refcount_change_for_del() a bit to fit our needs. It's safe to do this since it's only being called by truncate. 4). Change ocfs2_remove_btree_range() a bit to take refcount case into account. 5). Finally, we change ocfs2_commit_truncate() to call ocfs2_remove_btree_range() in a proper way. The patch has been tested normally for sanity check, stress tests with heavier workload will be expected. Based on this patch, fixing the punching holes bug will be fairly easy. Signed-off-by: Tristan Ye <tristan.ye@oracle.com> Acked-by: Mark Fasheh <mfasheh@suse.com> Signed-off-by: Joel Becker <joel.becker@oracle.com>
2010-05-11 17:54:42 +08:00
u64 refcount_loc,
u64 phys_blkno,
u32 clusters,
int *credits,
Ocfs2: Optimize ocfs2 truncate to use ocfs2_remove_btree_range() instead. Truncate is just a special case of punching holes(from new i_size to end), we therefore could take advantage of the existing ocfs2_remove_btree_range() to reduce the comlexity and redundancy in alloc.c. The goal here is to make truncate more generic and straightforward. Several functions only used by ocfs2_commit_truncate() will smiply be removed. ocfs2_remove_btree_range() was originally used by the hole punching code, which didn't take refcount trees into account (definitely a bug). We therefore need to change that func a bit to handle refcount trees. It must take the refcount lock, calculate and reserve blocks for refcount tree changes, and decrease refcounts at the end. We replace ocfs2_lock_allocators() here by adding a new func ocfs2_reserve_blocks_for_rec_trunc() which accepts some extra blocks to reserve. This will not hurt any other code using ocfs2_remove_btree_range() (such as dir truncate and hole punching). I merged the following steps into one patch since they may be logically doing one thing, though I know it looks a little bit fat to review. 1). Remove redundant code used by ocfs2_commit_truncate(), since we're moving to ocfs2_remove_btree_range anyway. 2). Add a new func ocfs2_reserve_blocks_for_rec_trunc() for purpose of accepting some extra blocks to reserve. 3). Change ocfs2_prepare_refcount_change_for_del() a bit to fit our needs. It's safe to do this since it's only being called by truncate. 4). Change ocfs2_remove_btree_range() a bit to take refcount case into account. 5). Finally, we change ocfs2_commit_truncate() to call ocfs2_remove_btree_range() in a proper way. The patch has been tested normally for sanity check, stress tests with heavier workload will be expected. Based on this patch, fixing the punching holes bug will be fairly easy. Signed-off-by: Tristan Ye <tristan.ye@oracle.com> Acked-by: Mark Fasheh <mfasheh@suse.com> Signed-off-by: Joel Becker <joel.becker@oracle.com>
2010-05-11 17:54:42 +08:00
int *ref_blocks)
{
Ocfs2: Optimize ocfs2 truncate to use ocfs2_remove_btree_range() instead. Truncate is just a special case of punching holes(from new i_size to end), we therefore could take advantage of the existing ocfs2_remove_btree_range() to reduce the comlexity and redundancy in alloc.c. The goal here is to make truncate more generic and straightforward. Several functions only used by ocfs2_commit_truncate() will smiply be removed. ocfs2_remove_btree_range() was originally used by the hole punching code, which didn't take refcount trees into account (definitely a bug). We therefore need to change that func a bit to handle refcount trees. It must take the refcount lock, calculate and reserve blocks for refcount tree changes, and decrease refcounts at the end. We replace ocfs2_lock_allocators() here by adding a new func ocfs2_reserve_blocks_for_rec_trunc() which accepts some extra blocks to reserve. This will not hurt any other code using ocfs2_remove_btree_range() (such as dir truncate and hole punching). I merged the following steps into one patch since they may be logically doing one thing, though I know it looks a little bit fat to review. 1). Remove redundant code used by ocfs2_commit_truncate(), since we're moving to ocfs2_remove_btree_range anyway. 2). Add a new func ocfs2_reserve_blocks_for_rec_trunc() for purpose of accepting some extra blocks to reserve. 3). Change ocfs2_prepare_refcount_change_for_del() a bit to fit our needs. It's safe to do this since it's only being called by truncate. 4). Change ocfs2_remove_btree_range() a bit to take refcount case into account. 5). Finally, we change ocfs2_commit_truncate() to call ocfs2_remove_btree_range() in a proper way. The patch has been tested normally for sanity check, stress tests with heavier workload will be expected. Based on this patch, fixing the punching holes bug will be fairly easy. Signed-off-by: Tristan Ye <tristan.ye@oracle.com> Acked-by: Mark Fasheh <mfasheh@suse.com> Signed-off-by: Joel Becker <joel.becker@oracle.com>
2010-05-11 17:54:42 +08:00
int ret;
struct buffer_head *ref_root_bh = NULL;
struct ocfs2_refcount_tree *tree;
u64 start_cpos = ocfs2_blocks_to_clusters(inode->i_sb, phys_blkno);
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb))) {
ret = ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
inode->i_ino);
goto out;
}
BUG_ON(!ocfs2_is_refcount_inode(inode));
ret = ocfs2_get_refcount_tree(OCFS2_SB(inode->i_sb),
Ocfs2: Optimize ocfs2 truncate to use ocfs2_remove_btree_range() instead. Truncate is just a special case of punching holes(from new i_size to end), we therefore could take advantage of the existing ocfs2_remove_btree_range() to reduce the comlexity and redundancy in alloc.c. The goal here is to make truncate more generic and straightforward. Several functions only used by ocfs2_commit_truncate() will smiply be removed. ocfs2_remove_btree_range() was originally used by the hole punching code, which didn't take refcount trees into account (definitely a bug). We therefore need to change that func a bit to handle refcount trees. It must take the refcount lock, calculate and reserve blocks for refcount tree changes, and decrease refcounts at the end. We replace ocfs2_lock_allocators() here by adding a new func ocfs2_reserve_blocks_for_rec_trunc() which accepts some extra blocks to reserve. This will not hurt any other code using ocfs2_remove_btree_range() (such as dir truncate and hole punching). I merged the following steps into one patch since they may be logically doing one thing, though I know it looks a little bit fat to review. 1). Remove redundant code used by ocfs2_commit_truncate(), since we're moving to ocfs2_remove_btree_range anyway. 2). Add a new func ocfs2_reserve_blocks_for_rec_trunc() for purpose of accepting some extra blocks to reserve. 3). Change ocfs2_prepare_refcount_change_for_del() a bit to fit our needs. It's safe to do this since it's only being called by truncate. 4). Change ocfs2_remove_btree_range() a bit to take refcount case into account. 5). Finally, we change ocfs2_commit_truncate() to call ocfs2_remove_btree_range() in a proper way. The patch has been tested normally for sanity check, stress tests with heavier workload will be expected. Based on this patch, fixing the punching holes bug will be fairly easy. Signed-off-by: Tristan Ye <tristan.ye@oracle.com> Acked-by: Mark Fasheh <mfasheh@suse.com> Signed-off-by: Joel Becker <joel.becker@oracle.com>
2010-05-11 17:54:42 +08:00
refcount_loc, &tree);
if (ret) {
mlog_errno(ret);
goto out;
}
Ocfs2: Optimize ocfs2 truncate to use ocfs2_remove_btree_range() instead. Truncate is just a special case of punching holes(from new i_size to end), we therefore could take advantage of the existing ocfs2_remove_btree_range() to reduce the comlexity and redundancy in alloc.c. The goal here is to make truncate more generic and straightforward. Several functions only used by ocfs2_commit_truncate() will smiply be removed. ocfs2_remove_btree_range() was originally used by the hole punching code, which didn't take refcount trees into account (definitely a bug). We therefore need to change that func a bit to handle refcount trees. It must take the refcount lock, calculate and reserve blocks for refcount tree changes, and decrease refcounts at the end. We replace ocfs2_lock_allocators() here by adding a new func ocfs2_reserve_blocks_for_rec_trunc() which accepts some extra blocks to reserve. This will not hurt any other code using ocfs2_remove_btree_range() (such as dir truncate and hole punching). I merged the following steps into one patch since they may be logically doing one thing, though I know it looks a little bit fat to review. 1). Remove redundant code used by ocfs2_commit_truncate(), since we're moving to ocfs2_remove_btree_range anyway. 2). Add a new func ocfs2_reserve_blocks_for_rec_trunc() for purpose of accepting some extra blocks to reserve. 3). Change ocfs2_prepare_refcount_change_for_del() a bit to fit our needs. It's safe to do this since it's only being called by truncate. 4). Change ocfs2_remove_btree_range() a bit to take refcount case into account. 5). Finally, we change ocfs2_commit_truncate() to call ocfs2_remove_btree_range() in a proper way. The patch has been tested normally for sanity check, stress tests with heavier workload will be expected. Based on this patch, fixing the punching holes bug will be fairly easy. Signed-off-by: Tristan Ye <tristan.ye@oracle.com> Acked-by: Mark Fasheh <mfasheh@suse.com> Signed-off-by: Joel Becker <joel.becker@oracle.com>
2010-05-11 17:54:42 +08:00
ret = ocfs2_read_refcount_block(&tree->rf_ci, refcount_loc,
&ref_root_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
&tree->rf_ci,
ref_root_bh,
start_cpos, clusters,
Ocfs2: Optimize ocfs2 truncate to use ocfs2_remove_btree_range() instead. Truncate is just a special case of punching holes(from new i_size to end), we therefore could take advantage of the existing ocfs2_remove_btree_range() to reduce the comlexity and redundancy in alloc.c. The goal here is to make truncate more generic and straightforward. Several functions only used by ocfs2_commit_truncate() will smiply be removed. ocfs2_remove_btree_range() was originally used by the hole punching code, which didn't take refcount trees into account (definitely a bug). We therefore need to change that func a bit to handle refcount trees. It must take the refcount lock, calculate and reserve blocks for refcount tree changes, and decrease refcounts at the end. We replace ocfs2_lock_allocators() here by adding a new func ocfs2_reserve_blocks_for_rec_trunc() which accepts some extra blocks to reserve. This will not hurt any other code using ocfs2_remove_btree_range() (such as dir truncate and hole punching). I merged the following steps into one patch since they may be logically doing one thing, though I know it looks a little bit fat to review. 1). Remove redundant code used by ocfs2_commit_truncate(), since we're moving to ocfs2_remove_btree_range anyway. 2). Add a new func ocfs2_reserve_blocks_for_rec_trunc() for purpose of accepting some extra blocks to reserve. 3). Change ocfs2_prepare_refcount_change_for_del() a bit to fit our needs. It's safe to do this since it's only being called by truncate. 4). Change ocfs2_remove_btree_range() a bit to take refcount case into account. 5). Finally, we change ocfs2_commit_truncate() to call ocfs2_remove_btree_range() in a proper way. The patch has been tested normally for sanity check, stress tests with heavier workload will be expected. Based on this patch, fixing the punching holes bug will be fairly easy. Signed-off-by: Tristan Ye <tristan.ye@oracle.com> Acked-by: Mark Fasheh <mfasheh@suse.com> Signed-off-by: Joel Becker <joel.becker@oracle.com>
2010-05-11 17:54:42 +08:00
ref_blocks, credits);
if (ret) {
mlog_errno(ret);
goto out;
}
trace_ocfs2_prepare_refcount_change_for_del(*ref_blocks, *credits);
out:
brelse(ref_root_bh);
return ret;
}
#define MAX_CONTIG_BYTES 1048576
static inline unsigned int ocfs2_cow_contig_clusters(struct super_block *sb)
{
return ocfs2_clusters_for_bytes(sb, MAX_CONTIG_BYTES);
}
static inline unsigned int ocfs2_cow_contig_mask(struct super_block *sb)
{
return ~(ocfs2_cow_contig_clusters(sb) - 1);
}
/*
* Given an extent that starts at 'start' and an I/O that starts at 'cpos',
* find an offset (start + (n * contig_clusters)) that is closest to cpos
* while still being less than or equal to it.
*
* The goal is to break the extent at a multiple of contig_clusters.
*/
static inline unsigned int ocfs2_cow_align_start(struct super_block *sb,
unsigned int start,
unsigned int cpos)
{
BUG_ON(start > cpos);
return start + ((cpos - start) & ocfs2_cow_contig_mask(sb));
}
/*
* Given a cluster count of len, pad it out so that it is a multiple
* of contig_clusters.
*/
static inline unsigned int ocfs2_cow_align_length(struct super_block *sb,
unsigned int len)
{
unsigned int padded =
(len + (ocfs2_cow_contig_clusters(sb) - 1)) &
ocfs2_cow_contig_mask(sb);
/* Did we wrap? */
if (padded < len)
padded = UINT_MAX;
return padded;
}
/*
* Calculate out the start and number of virtual clusters we need to CoW.
*
* cpos is vitual start cluster position we want to do CoW in a
* file and write_len is the cluster length.
* max_cpos is the place where we want to stop CoW intentionally.
*
* Normal we will start CoW from the beginning of extent record cotaining cpos.
* We try to break up extents on boundaries of MAX_CONTIG_BYTES so that we
* get good I/O from the resulting extent tree.
*/
static int ocfs2_refcount_cal_cow_clusters(struct inode *inode,
struct ocfs2_extent_list *el,
u32 cpos,
u32 write_len,
u32 max_cpos,
u32 *cow_start,
u32 *cow_len)
{
int ret = 0;
int tree_height = le16_to_cpu(el->l_tree_depth), i;
struct buffer_head *eb_bh = NULL;
struct ocfs2_extent_block *eb = NULL;
struct ocfs2_extent_rec *rec;
unsigned int want_clusters, rec_end = 0;
int contig_clusters = ocfs2_cow_contig_clusters(inode->i_sb);
int leaf_clusters;
BUG_ON(cpos + write_len > max_cpos);
if (tree_height > 0) {
ret = ocfs2_find_leaf(INODE_CACHE(inode), el, cpos, &eb_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
eb = (struct ocfs2_extent_block *) eb_bh->b_data;
el = &eb->h_list;
if (el->l_tree_depth) {
ret = ocfs2_error(inode->i_sb,
"Inode %lu has non zero tree depth in leaf block %llu\n",
inode->i_ino,
(unsigned long long)eb_bh->b_blocknr);
goto out;
}
}
*cow_len = 0;
for (i = 0; i < le16_to_cpu(el->l_next_free_rec); i++) {
rec = &el->l_recs[i];
if (ocfs2_is_empty_extent(rec)) {
mlog_bug_on_msg(i != 0, "Inode %lu has empty record in "
"index %d\n", inode->i_ino, i);
continue;
}
if (le32_to_cpu(rec->e_cpos) +
le16_to_cpu(rec->e_leaf_clusters) <= cpos)
continue;
if (*cow_len == 0) {
/*
* We should find a refcounted record in the
* first pass.
*/
BUG_ON(!(rec->e_flags & OCFS2_EXT_REFCOUNTED));
*cow_start = le32_to_cpu(rec->e_cpos);
}
/*
* If we encounter a hole, a non-refcounted record or
* pass the max_cpos, stop the search.
*/
if ((!(rec->e_flags & OCFS2_EXT_REFCOUNTED)) ||
(*cow_len && rec_end != le32_to_cpu(rec->e_cpos)) ||
(max_cpos <= le32_to_cpu(rec->e_cpos)))
break;
leaf_clusters = le16_to_cpu(rec->e_leaf_clusters);
rec_end = le32_to_cpu(rec->e_cpos) + leaf_clusters;
if (rec_end > max_cpos) {
rec_end = max_cpos;
leaf_clusters = rec_end - le32_to_cpu(rec->e_cpos);
}
/*
* How many clusters do we actually need from
* this extent? First we see how many we actually
* need to complete the write. If that's smaller
* than contig_clusters, we try for contig_clusters.
*/
if (!*cow_len)
want_clusters = write_len;
else
want_clusters = (cpos + write_len) -
(*cow_start + *cow_len);
if (want_clusters < contig_clusters)
want_clusters = contig_clusters;
/*
* If the write does not cover the whole extent, we
* need to calculate how we're going to split the extent.
* We try to do it on contig_clusters boundaries.
*
* Any extent smaller than contig_clusters will be
* CoWed in its entirety.
*/
if (leaf_clusters <= contig_clusters)
*cow_len += leaf_clusters;
else if (*cow_len || (*cow_start == cpos)) {
/*
* This extent needs to be CoW'd from its
* beginning, so all we have to do is compute
* how many clusters to grab. We align
* want_clusters to the edge of contig_clusters
* to get better I/O.
*/
want_clusters = ocfs2_cow_align_length(inode->i_sb,
want_clusters);
if (leaf_clusters < want_clusters)
*cow_len += leaf_clusters;
else
*cow_len += want_clusters;
} else if ((*cow_start + contig_clusters) >=
(cpos + write_len)) {
/*
* Breaking off contig_clusters at the front
* of the extent will cover our write. That's
* easy.
*/
*cow_len = contig_clusters;
} else if ((rec_end - cpos) <= contig_clusters) {
/*
* Breaking off contig_clusters at the tail of
* this extent will cover cpos.
*/
*cow_start = rec_end - contig_clusters;
*cow_len = contig_clusters;
} else if ((rec_end - cpos) <= want_clusters) {
/*
* While we can't fit the entire write in this
* extent, we know that the write goes from cpos
* to the end of the extent. Break that off.
* We try to break it at some multiple of
* contig_clusters from the front of the extent.
* Failing that (ie, cpos is within
* contig_clusters of the front), we'll CoW the
* entire extent.
*/
*cow_start = ocfs2_cow_align_start(inode->i_sb,
*cow_start, cpos);
*cow_len = rec_end - *cow_start;
} else {
/*
* Ok, the entire write lives in the middle of
* this extent. Let's try to slice the extent up
* nicely. Optimally, our CoW region starts at
* m*contig_clusters from the beginning of the
* extent and goes for n*contig_clusters,
* covering the entire write.
*/
*cow_start = ocfs2_cow_align_start(inode->i_sb,
*cow_start, cpos);
want_clusters = (cpos + write_len) - *cow_start;
want_clusters = ocfs2_cow_align_length(inode->i_sb,
want_clusters);
if (*cow_start + want_clusters <= rec_end)
*cow_len = want_clusters;
else
*cow_len = rec_end - *cow_start;
}
/* Have we covered our entire write yet? */
if ((*cow_start + *cow_len) >= (cpos + write_len))
break;
/*
* If we reach the end of the extent block and don't get enough
* clusters, continue with the next extent block if possible.
*/
if (i + 1 == le16_to_cpu(el->l_next_free_rec) &&
eb && eb->h_next_leaf_blk) {
brelse(eb_bh);
eb_bh = NULL;
ret = ocfs2_read_extent_block(INODE_CACHE(inode),
le64_to_cpu(eb->h_next_leaf_blk),
&eb_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
eb = (struct ocfs2_extent_block *) eb_bh->b_data;
el = &eb->h_list;
i = -1;
}
}
out:
brelse(eb_bh);
return ret;
}
/*
* Prepare meta_ac, data_ac and calculate credits when we want to add some
* num_clusters in data_tree "et" and change the refcount for the old
* clusters(starting form p_cluster) in the refcount tree.
*
* Note:
* 1. since we may split the old tree, so we at most will need num_clusters + 2
* more new leaf records.
* 2. In some case, we may not need to reserve new clusters(e.g, reflink), so
* just give data_ac = NULL.
*/
static int ocfs2_lock_refcount_allocators(struct super_block *sb,
u32 p_cluster, u32 num_clusters,
struct ocfs2_extent_tree *et,
struct ocfs2_caching_info *ref_ci,
struct buffer_head *ref_root_bh,
struct ocfs2_alloc_context **meta_ac,
struct ocfs2_alloc_context **data_ac,
int *credits)
{
int ret = 0, meta_add = 0;
int num_free_extents = ocfs2_num_free_extents(et);
if (num_free_extents < 0) {
ret = num_free_extents;
mlog_errno(ret);
goto out;
}
if (num_free_extents < num_clusters + 2)
meta_add =
ocfs2_extend_meta_needed(et->et_root_el);
*credits += ocfs2_calc_extend_credits(sb, et->et_root_el);
ret = ocfs2_calc_refcount_meta_credits(sb, ref_ci, ref_root_bh,
p_cluster, num_clusters,
&meta_add, credits);
if (ret) {
mlog_errno(ret);
goto out;
}
trace_ocfs2_lock_refcount_allocators(meta_add, *credits);
ret = ocfs2_reserve_new_metadata_blocks(OCFS2_SB(sb), meta_add,
meta_ac);
if (ret) {
mlog_errno(ret);
goto out;
}
if (data_ac) {
ret = ocfs2_reserve_clusters(OCFS2_SB(sb), num_clusters,
data_ac);
if (ret)
mlog_errno(ret);
}
out:
if (ret) {
if (*meta_ac) {
ocfs2_free_alloc_context(*meta_ac);
*meta_ac = NULL;
}
}
return ret;
}
static int ocfs2_clear_cow_buffer(handle_t *handle, struct buffer_head *bh)
{
BUG_ON(buffer_dirty(bh));
clear_buffer_mapped(bh);
return 0;
}
int ocfs2_duplicate_clusters_by_page(handle_t *handle,
struct inode *inode,
u32 cpos, u32 old_cluster,
u32 new_cluster, u32 new_len)
{
int ret = 0, partial;
struct super_block *sb = inode->i_sb;
u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
struct page *page;
pgoff_t page_index;
unsigned int from, to;
loff_t offset, end, map_end;
struct address_space *mapping = inode->i_mapping;
trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
new_cluster, new_len);
offset = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
end = offset + (new_len << OCFS2_SB(sb)->s_clustersize_bits);
/*
* We only duplicate pages until we reach the page contains i_size - 1.
* So trim 'end' to i_size.
*/
if (end > i_size_read(inode))
end = i_size_read(inode);
while (offset < end) {
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
page_index = offset >> PAGE_SHIFT;
map_end = ((loff_t)page_index + 1) << PAGE_SHIFT;
if (map_end > end)
map_end = end;
/* from, to is the offset within the page. */
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
from = offset & (PAGE_SIZE - 1);
to = PAGE_SIZE;
if (map_end & (PAGE_SIZE - 1))
to = map_end & (PAGE_SIZE - 1);
ocfs2: fix crash in ocfs2_duplicate_clusters_by_page() ocfs2_duplicate_clusters_by_page() may crash if one of the extent's pages is dirty. When a page has not been written back, it is still in dirty state. If ocfs2_duplicate_clusters_by_page() is called against the dirty page, the crash happens. To fix this bug, we can just unlock the page and wait until the page until its not dirty. The following is the backtrace: kernel BUG at /root/code/ocfs2/refcounttree.c:2961! [exception RIP: ocfs2_duplicate_clusters_by_page+822] __ocfs2_move_extent+0x80/0x450 [ocfs2] ? __ocfs2_claim_clusters+0x130/0x250 [ocfs2] ocfs2_defrag_extent+0x5b8/0x5e0 [ocfs2] __ocfs2_move_extents_range+0x2a4/0x470 [ocfs2] ocfs2_move_extents+0x180/0x3b0 [ocfs2] ? ocfs2_wait_for_recovery+0x13/0x70 [ocfs2] ocfs2_ioctl_move_extents+0x133/0x2d0 [ocfs2] ocfs2_ioctl+0x253/0x640 [ocfs2] do_vfs_ioctl+0x90/0x5f0 SyS_ioctl+0x74/0x80 do_syscall_64+0x74/0x140 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 Once we find the page is dirty, we do not wait until it's clean, rather we use write_one_page() to write it back Link: http://lkml.kernel.org/r/20180829074740.9438-1-lchen@suse.com [lchen@suse.com: update comments] Link: http://lkml.kernel.org/r/20180830075041.14879-1-lchen@suse.com [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Larry Chen <lchen@suse.com> Acked-by: Changwei Ge <ge.changwei@h3c.com> Cc: Mark Fasheh <mark@fasheh.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Junxiao Bi <junxiao.bi@oracle.com> Cc: Joseph Qi <jiangqi903@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2018-10-06 06:51:37 +08:00
retry:
page = find_or_create_page(mapping, page_index, GFP_NOFS);
if (!page) {
ret = -ENOMEM;
mlog_errno(ret);
break;
}
/*
ocfs2: fix crash in ocfs2_duplicate_clusters_by_page() ocfs2_duplicate_clusters_by_page() may crash if one of the extent's pages is dirty. When a page has not been written back, it is still in dirty state. If ocfs2_duplicate_clusters_by_page() is called against the dirty page, the crash happens. To fix this bug, we can just unlock the page and wait until the page until its not dirty. The following is the backtrace: kernel BUG at /root/code/ocfs2/refcounttree.c:2961! [exception RIP: ocfs2_duplicate_clusters_by_page+822] __ocfs2_move_extent+0x80/0x450 [ocfs2] ? __ocfs2_claim_clusters+0x130/0x250 [ocfs2] ocfs2_defrag_extent+0x5b8/0x5e0 [ocfs2] __ocfs2_move_extents_range+0x2a4/0x470 [ocfs2] ocfs2_move_extents+0x180/0x3b0 [ocfs2] ? ocfs2_wait_for_recovery+0x13/0x70 [ocfs2] ocfs2_ioctl_move_extents+0x133/0x2d0 [ocfs2] ocfs2_ioctl+0x253/0x640 [ocfs2] do_vfs_ioctl+0x90/0x5f0 SyS_ioctl+0x74/0x80 do_syscall_64+0x74/0x140 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 Once we find the page is dirty, we do not wait until it's clean, rather we use write_one_page() to write it back Link: http://lkml.kernel.org/r/20180829074740.9438-1-lchen@suse.com [lchen@suse.com: update comments] Link: http://lkml.kernel.org/r/20180830075041.14879-1-lchen@suse.com [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Larry Chen <lchen@suse.com> Acked-by: Changwei Ge <ge.changwei@h3c.com> Cc: Mark Fasheh <mark@fasheh.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Junxiao Bi <junxiao.bi@oracle.com> Cc: Joseph Qi <jiangqi903@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2018-10-06 06:51:37 +08:00
* In case PAGE_SIZE <= CLUSTER_SIZE, we do not expect a dirty
* page, so write it back.
*/
ocfs2: fix crash in ocfs2_duplicate_clusters_by_page() ocfs2_duplicate_clusters_by_page() may crash if one of the extent's pages is dirty. When a page has not been written back, it is still in dirty state. If ocfs2_duplicate_clusters_by_page() is called against the dirty page, the crash happens. To fix this bug, we can just unlock the page and wait until the page until its not dirty. The following is the backtrace: kernel BUG at /root/code/ocfs2/refcounttree.c:2961! [exception RIP: ocfs2_duplicate_clusters_by_page+822] __ocfs2_move_extent+0x80/0x450 [ocfs2] ? __ocfs2_claim_clusters+0x130/0x250 [ocfs2] ocfs2_defrag_extent+0x5b8/0x5e0 [ocfs2] __ocfs2_move_extents_range+0x2a4/0x470 [ocfs2] ocfs2_move_extents+0x180/0x3b0 [ocfs2] ? ocfs2_wait_for_recovery+0x13/0x70 [ocfs2] ocfs2_ioctl_move_extents+0x133/0x2d0 [ocfs2] ocfs2_ioctl+0x253/0x640 [ocfs2] do_vfs_ioctl+0x90/0x5f0 SyS_ioctl+0x74/0x80 do_syscall_64+0x74/0x140 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 Once we find the page is dirty, we do not wait until it's clean, rather we use write_one_page() to write it back Link: http://lkml.kernel.org/r/20180829074740.9438-1-lchen@suse.com [lchen@suse.com: update comments] Link: http://lkml.kernel.org/r/20180830075041.14879-1-lchen@suse.com [akpm@linux-foundation.org: coding-style fixes] Signed-off-by: Larry Chen <lchen@suse.com> Acked-by: Changwei Ge <ge.changwei@h3c.com> Cc: Mark Fasheh <mark@fasheh.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Junxiao Bi <junxiao.bi@oracle.com> Cc: Joseph Qi <jiangqi903@gmail.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Greg Kroah-Hartman <gregkh@linuxfoundation.org>
2018-10-06 06:51:37 +08:00
if (PAGE_SIZE <= OCFS2_SB(sb)->s_clustersize) {
if (PageDirty(page)) {
/*
* write_on_page will unlock the page on return
*/
ret = write_one_page(page);
goto retry;
}
}
if (!PageUptodate(page)) {
struct folio *folio = page_folio(page);
ret = block_read_full_folio(folio, ocfs2_get_block);
if (ret) {
mlog_errno(ret);
goto unlock;
}
folio_lock(folio);
}
if (page_has_buffers(page)) {
ret = walk_page_buffers(handle, page_buffers(page),
from, to, &partial,
ocfs2_clear_cow_buffer);
if (ret) {
mlog_errno(ret);
goto unlock;
}
}
ocfs2_map_and_dirty_page(inode,
handle, from, to,
page, 0, &new_block);
mark_page_accessed(page);
unlock:
unlock_page(page);
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
put_page(page);
page = NULL;
offset = map_end;
if (ret)
break;
}
return ret;
}
int ocfs2_duplicate_clusters_by_jbd(handle_t *handle,
struct inode *inode,
u32 cpos, u32 old_cluster,
u32 new_cluster, u32 new_len)
{
int ret = 0;
struct super_block *sb = inode->i_sb;
struct ocfs2_caching_info *ci = INODE_CACHE(inode);
int i, blocks = ocfs2_clusters_to_blocks(sb, new_len);
u64 old_block = ocfs2_clusters_to_blocks(sb, old_cluster);
u64 new_block = ocfs2_clusters_to_blocks(sb, new_cluster);
struct ocfs2_super *osb = OCFS2_SB(sb);
struct buffer_head *old_bh = NULL;
struct buffer_head *new_bh = NULL;
trace_ocfs2_duplicate_clusters_by_page(cpos, old_cluster,
new_cluster, new_len);
for (i = 0; i < blocks; i++, old_block++, new_block++) {
new_bh = sb_getblk(osb->sb, new_block);
if (new_bh == NULL) {
ret = -ENOMEM;
mlog_errno(ret);
break;
}
ocfs2_set_new_buffer_uptodate(ci, new_bh);
ret = ocfs2_read_block(ci, old_block, &old_bh, NULL);
if (ret) {
mlog_errno(ret);
break;
}
ret = ocfs2_journal_access(handle, ci, new_bh,
OCFS2_JOURNAL_ACCESS_CREATE);
if (ret) {
mlog_errno(ret);
break;
}
memcpy(new_bh->b_data, old_bh->b_data, sb->s_blocksize);
ocfs2_journal_dirty(handle, new_bh);
brelse(new_bh);
brelse(old_bh);
new_bh = NULL;
old_bh = NULL;
}
brelse(new_bh);
brelse(old_bh);
return ret;
}
static int ocfs2_clear_ext_refcount(handle_t *handle,
struct ocfs2_extent_tree *et,
u32 cpos, u32 p_cluster, u32 len,
unsigned int ext_flags,
struct ocfs2_alloc_context *meta_ac,
struct ocfs2_cached_dealloc_ctxt *dealloc)
{
int ret, index;
struct ocfs2_extent_rec replace_rec;
struct ocfs2_path *path = NULL;
struct ocfs2_extent_list *el;
struct super_block *sb = ocfs2_metadata_cache_get_super(et->et_ci);
u64 ino = ocfs2_metadata_cache_owner(et->et_ci);
trace_ocfs2_clear_ext_refcount((unsigned long long)ino,
cpos, len, p_cluster, ext_flags);
memset(&replace_rec, 0, sizeof(replace_rec));
replace_rec.e_cpos = cpu_to_le32(cpos);
replace_rec.e_leaf_clusters = cpu_to_le16(len);
replace_rec.e_blkno = cpu_to_le64(ocfs2_clusters_to_blocks(sb,
p_cluster));
replace_rec.e_flags = ext_flags;
replace_rec.e_flags &= ~OCFS2_EXT_REFCOUNTED;
path = ocfs2_new_path_from_et(et);
if (!path) {
ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
ret = ocfs2_find_path(et->et_ci, path, cpos);
if (ret) {
mlog_errno(ret);
goto out;
}
el = path_leaf_el(path);
index = ocfs2_search_extent_list(el, cpos);
if (index == -1) {
ret = ocfs2_error(sb,
"Inode %llu has an extent at cpos %u which can no longer be found\n",
(unsigned long long)ino, cpos);
goto out;
}
ret = ocfs2_split_extent(handle, et, path, index,
&replace_rec, meta_ac, dealloc);
if (ret)
mlog_errno(ret);
out:
ocfs2_free_path(path);
return ret;
}
static int ocfs2_replace_clusters(handle_t *handle,
struct ocfs2_cow_context *context,
u32 cpos, u32 old,
u32 new, u32 len,
unsigned int ext_flags)
{
int ret;
struct ocfs2_caching_info *ci = context->data_et.et_ci;
u64 ino = ocfs2_metadata_cache_owner(ci);
trace_ocfs2_replace_clusters((unsigned long long)ino,
cpos, old, new, len, ext_flags);
/*If the old clusters is unwritten, no need to duplicate. */
if (!(ext_flags & OCFS2_EXT_UNWRITTEN)) {
ret = context->cow_duplicate_clusters(handle, context->inode,
cpos, old, new, len);
if (ret) {
mlog_errno(ret);
goto out;
}
}
ret = ocfs2_clear_ext_refcount(handle, &context->data_et,
cpos, new, len, ext_flags,
context->meta_ac, &context->dealloc);
if (ret)
mlog_errno(ret);
out:
return ret;
}
int ocfs2_cow_sync_writeback(struct super_block *sb,
struct inode *inode,
u32 cpos, u32 num_clusters)
{
int ret;
loff_t start, end;
if (ocfs2_should_order_data(inode))
return 0;
start = ((loff_t)cpos) << OCFS2_SB(sb)->s_clustersize_bits;
end = start + (num_clusters << OCFS2_SB(sb)->s_clustersize_bits) - 1;
ret = filemap_write_and_wait_range(inode->i_mapping, start, end);
if (ret < 0)
mlog_errno(ret);
return ret;
}
static int ocfs2_di_get_clusters(struct ocfs2_cow_context *context,
u32 v_cluster, u32 *p_cluster,
u32 *num_clusters,
unsigned int *extent_flags)
{
return ocfs2_get_clusters(context->inode, v_cluster, p_cluster,
num_clusters, extent_flags);
}
static int ocfs2_make_clusters_writable(struct super_block *sb,
struct ocfs2_cow_context *context,
u32 cpos, u32 p_cluster,
u32 num_clusters, unsigned int e_flags)
{
int ret, delete, index, credits = 0;
u32 new_bit, new_len, orig_num_clusters;
unsigned int set_len;
struct ocfs2_super *osb = OCFS2_SB(sb);
handle_t *handle;
struct buffer_head *ref_leaf_bh = NULL;
struct ocfs2_caching_info *ref_ci = &context->ref_tree->rf_ci;
struct ocfs2_refcount_rec rec;
trace_ocfs2_make_clusters_writable(cpos, p_cluster,
num_clusters, e_flags);
ret = ocfs2_lock_refcount_allocators(sb, p_cluster, num_clusters,
&context->data_et,
ref_ci,
context->ref_root_bh,
&context->meta_ac,
&context->data_ac, &credits);
if (ret) {
mlog_errno(ret);
return ret;
}
if (context->post_refcount)
credits += context->post_refcount->credits;
credits += context->extra_credits;
handle = ocfs2_start_trans(osb, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
mlog_errno(ret);
goto out;
}
orig_num_clusters = num_clusters;
while (num_clusters) {
ret = ocfs2_get_refcount_rec(ref_ci, context->ref_root_bh,
p_cluster, num_clusters,
&rec, &index, &ref_leaf_bh);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
BUG_ON(!rec.r_refcount);
set_len = min((u64)p_cluster + num_clusters,
le64_to_cpu(rec.r_cpos) +
le32_to_cpu(rec.r_clusters)) - p_cluster;
/*
* There are many different situation here.
* 1. If refcount == 1, remove the flag and don't COW.
* 2. If refcount > 1, allocate clusters.
* Here we may not allocate r_len once at a time, so continue
* until we reach num_clusters.
*/
if (le32_to_cpu(rec.r_refcount) == 1) {
delete = 0;
ret = ocfs2_clear_ext_refcount(handle,
&context->data_et,
cpos, p_cluster,
set_len, e_flags,
context->meta_ac,
&context->dealloc);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
} else {
delete = 1;
ret = __ocfs2_claim_clusters(handle,
context->data_ac,
1, set_len,
&new_bit, &new_len);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
ret = ocfs2_replace_clusters(handle, context,
cpos, p_cluster, new_bit,
new_len, e_flags);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
set_len = new_len;
}
ret = __ocfs2_decrease_refcount(handle, ref_ci,
context->ref_root_bh,
p_cluster, set_len,
context->meta_ac,
&context->dealloc, delete);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
cpos += set_len;
p_cluster += set_len;
num_clusters -= set_len;
brelse(ref_leaf_bh);
ref_leaf_bh = NULL;
}
/* handle any post_cow action. */
if (context->post_refcount && context->post_refcount->func) {
ret = context->post_refcount->func(context->inode, handle,
context->post_refcount->para);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
}
/*
* Here we should write the new page out first if we are
* in write-back mode.
*/
if (context->get_clusters == ocfs2_di_get_clusters) {
ret = ocfs2_cow_sync_writeback(sb, context->inode, cpos,
orig_num_clusters);
if (ret)
mlog_errno(ret);
}
out_commit:
ocfs2_commit_trans(osb, handle);
out:
if (context->data_ac) {
ocfs2_free_alloc_context(context->data_ac);
context->data_ac = NULL;
}
if (context->meta_ac) {
ocfs2_free_alloc_context(context->meta_ac);
context->meta_ac = NULL;
}
brelse(ref_leaf_bh);
return ret;
}
static int ocfs2_replace_cow(struct ocfs2_cow_context *context)
{
int ret = 0;
struct inode *inode = context->inode;
u32 cow_start = context->cow_start, cow_len = context->cow_len;
u32 p_cluster, num_clusters;
unsigned int ext_flags;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
if (!ocfs2_refcount_tree(osb)) {
return ocfs2_error(inode->i_sb, "Inode %lu want to use refcount tree, but the feature bit is not set in the super block\n",
inode->i_ino);
}
ocfs2_init_dealloc_ctxt(&context->dealloc);
while (cow_len) {
ret = context->get_clusters(context, cow_start, &p_cluster,
&num_clusters, &ext_flags);
if (ret) {
mlog_errno(ret);
break;
}
BUG_ON(!(ext_flags & OCFS2_EXT_REFCOUNTED));
if (cow_len < num_clusters)
num_clusters = cow_len;
ret = ocfs2_make_clusters_writable(inode->i_sb, context,
cow_start, p_cluster,
num_clusters, ext_flags);
if (ret) {
mlog_errno(ret);
break;
}
cow_len -= num_clusters;
cow_start += num_clusters;
}
if (ocfs2_dealloc_has_cluster(&context->dealloc)) {
ocfs2_schedule_truncate_log_flush(osb, 1);
ocfs2_run_deallocs(osb, &context->dealloc);
}
return ret;
}
/*
* Starting at cpos, try to CoW write_len clusters. Don't CoW
* past max_cpos. This will stop when it runs into a hole or an
* unrefcounted extent.
*/
static int ocfs2_refcount_cow_hunk(struct inode *inode,
struct buffer_head *di_bh,
u32 cpos, u32 write_len, u32 max_cpos)
{
int ret;
u32 cow_start = 0, cow_len = 0;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
struct buffer_head *ref_root_bh = NULL;
struct ocfs2_refcount_tree *ref_tree;
struct ocfs2_cow_context *context = NULL;
BUG_ON(!ocfs2_is_refcount_inode(inode));
ret = ocfs2_refcount_cal_cow_clusters(inode, &di->id2.i_list,
cpos, write_len, max_cpos,
&cow_start, &cow_len);
if (ret) {
mlog_errno(ret);
goto out;
}
trace_ocfs2_refcount_cow_hunk(OCFS2_I(inode)->ip_blkno,
cpos, write_len, max_cpos,
cow_start, cow_len);
BUG_ON(cow_len == 0);
context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
if (!context) {
ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
1, &ref_tree, &ref_root_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
context->inode = inode;
context->cow_start = cow_start;
context->cow_len = cow_len;
context->ref_tree = ref_tree;
context->ref_root_bh = ref_root_bh;
context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_page;
context->get_clusters = ocfs2_di_get_clusters;
ocfs2_init_dinode_extent_tree(&context->data_et,
INODE_CACHE(inode), di_bh);
ret = ocfs2_replace_cow(context);
if (ret)
mlog_errno(ret);
/*
* truncate the extent map here since no matter whether we meet with
* any error during the action, we shouldn't trust cached extent map
* any more.
*/
ocfs2_extent_map_trunc(inode, cow_start);
ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
brelse(ref_root_bh);
out:
kfree(context);
return ret;
}
/*
* CoW any and all clusters between cpos and cpos+write_len.
* Don't CoW past max_cpos. If this returns successfully, all
* clusters between cpos and cpos+write_len are safe to modify.
*/
int ocfs2_refcount_cow(struct inode *inode,
struct buffer_head *di_bh,
u32 cpos, u32 write_len, u32 max_cpos)
{
int ret = 0;
u32 p_cluster, num_clusters;
unsigned int ext_flags;
while (write_len) {
ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
&num_clusters, &ext_flags);
if (ret) {
mlog_errno(ret);
break;
}
if (write_len < num_clusters)
num_clusters = write_len;
if (ext_flags & OCFS2_EXT_REFCOUNTED) {
ret = ocfs2_refcount_cow_hunk(inode, di_bh, cpos,
num_clusters, max_cpos);
if (ret) {
mlog_errno(ret);
break;
}
}
write_len -= num_clusters;
cpos += num_clusters;
}
return ret;
}
static int ocfs2_xattr_value_get_clusters(struct ocfs2_cow_context *context,
u32 v_cluster, u32 *p_cluster,
u32 *num_clusters,
unsigned int *extent_flags)
{
struct inode *inode = context->inode;
struct ocfs2_xattr_value_root *xv = context->cow_object;
return ocfs2_xattr_get_clusters(inode, v_cluster, p_cluster,
num_clusters, &xv->xr_list,
extent_flags);
}
/*
* Given a xattr value root, calculate the most meta/credits we need for
* refcount tree change if we truncate it to 0.
*/
int ocfs2_refcounted_xattr_delete_need(struct inode *inode,
struct ocfs2_caching_info *ref_ci,
struct buffer_head *ref_root_bh,
struct ocfs2_xattr_value_root *xv,
int *meta_add, int *credits)
{
int ret = 0, index, ref_blocks = 0;
u32 p_cluster, num_clusters;
u32 cpos = 0, clusters = le32_to_cpu(xv->xr_clusters);
struct ocfs2_refcount_block *rb;
struct ocfs2_refcount_rec rec;
struct buffer_head *ref_leaf_bh = NULL;
while (cpos < clusters) {
ret = ocfs2_xattr_get_clusters(inode, cpos, &p_cluster,
&num_clusters, &xv->xr_list,
NULL);
if (ret) {
mlog_errno(ret);
goto out;
}
cpos += num_clusters;
while (num_clusters) {
ret = ocfs2_get_refcount_rec(ref_ci, ref_root_bh,
p_cluster, num_clusters,
&rec, &index,
&ref_leaf_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
BUG_ON(!rec.r_refcount);
rb = (struct ocfs2_refcount_block *)ref_leaf_bh->b_data;
/*
* We really don't know whether the other clusters is in
* this refcount block or not, so just take the worst
* case that all the clusters are in this block and each
* one will split a refcount rec, so totally we need
* clusters * 2 new refcount rec.
*/
if (le16_to_cpu(rb->rf_records.rl_used) + clusters * 2 >
le16_to_cpu(rb->rf_records.rl_count))
ref_blocks++;
*credits += 1;
brelse(ref_leaf_bh);
ref_leaf_bh = NULL;
if (num_clusters <= le32_to_cpu(rec.r_clusters))
break;
else
num_clusters -= le32_to_cpu(rec.r_clusters);
p_cluster += num_clusters;
}
}
*meta_add += ref_blocks;
if (!ref_blocks)
goto out;
rb = (struct ocfs2_refcount_block *)ref_root_bh->b_data;
if (le32_to_cpu(rb->rf_flags) & OCFS2_REFCOUNT_TREE_FL)
*credits += OCFS2_EXPAND_REFCOUNT_TREE_CREDITS;
else {
struct ocfs2_extent_tree et;
ocfs2_init_refcount_extent_tree(&et, ref_ci, ref_root_bh);
*credits += ocfs2_calc_extend_credits(inode->i_sb,
et.et_root_el);
}
out:
brelse(ref_leaf_bh);
return ret;
}
/*
* Do CoW for xattr.
*/
int ocfs2_refcount_cow_xattr(struct inode *inode,
struct ocfs2_dinode *di,
struct ocfs2_xattr_value_buf *vb,
struct ocfs2_refcount_tree *ref_tree,
struct buffer_head *ref_root_bh,
u32 cpos, u32 write_len,
struct ocfs2_post_refcount *post)
{
int ret;
struct ocfs2_xattr_value_root *xv = vb->vb_xv;
struct ocfs2_cow_context *context = NULL;
u32 cow_start, cow_len;
BUG_ON(!ocfs2_is_refcount_inode(inode));
ret = ocfs2_refcount_cal_cow_clusters(inode, &xv->xr_list,
cpos, write_len, UINT_MAX,
&cow_start, &cow_len);
if (ret) {
mlog_errno(ret);
goto out;
}
BUG_ON(cow_len == 0);
context = kzalloc(sizeof(struct ocfs2_cow_context), GFP_NOFS);
if (!context) {
ret = -ENOMEM;
mlog_errno(ret);
goto out;
}
context->inode = inode;
context->cow_start = cow_start;
context->cow_len = cow_len;
context->ref_tree = ref_tree;
context->ref_root_bh = ref_root_bh;
context->cow_object = xv;
context->cow_duplicate_clusters = ocfs2_duplicate_clusters_by_jbd;
/* We need the extra credits for duplicate_clusters by jbd. */
context->extra_credits =
ocfs2_clusters_to_blocks(inode->i_sb, 1) * cow_len;
context->get_clusters = ocfs2_xattr_value_get_clusters;
context->post_refcount = post;
ocfs2_init_xattr_value_extent_tree(&context->data_et,
INODE_CACHE(inode), vb);
ret = ocfs2_replace_cow(context);
if (ret)
mlog_errno(ret);
out:
kfree(context);
return ret;
}
/*
* Insert a new extent into refcount tree and mark a extent rec
* as refcounted in the dinode tree.
*/
int ocfs2_add_refcount_flag(struct inode *inode,
struct ocfs2_extent_tree *data_et,
struct ocfs2_caching_info *ref_ci,
struct buffer_head *ref_root_bh,
u32 cpos, u32 p_cluster, u32 num_clusters,
struct ocfs2_cached_dealloc_ctxt *dealloc,
struct ocfs2_post_refcount *post)
{
int ret;
handle_t *handle;
int credits = 1, ref_blocks = 0;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_alloc_context *meta_ac = NULL;
/* We need to be able to handle at least an extent tree split. */
ref_blocks = ocfs2_extend_meta_needed(data_et->et_root_el);
ret = ocfs2_calc_refcount_meta_credits(inode->i_sb,
ref_ci, ref_root_bh,
p_cluster, num_clusters,
&ref_blocks, &credits);
if (ret) {
mlog_errno(ret);
goto out;
}
trace_ocfs2_add_refcount_flag(ref_blocks, credits);
if (ref_blocks) {
ret = ocfs2_reserve_new_metadata_blocks(osb,
ref_blocks, &meta_ac);
if (ret) {
mlog_errno(ret);
goto out;
}
}
if (post)
credits += post->credits;
handle = ocfs2_start_trans(osb, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
mlog_errno(ret);
goto out;
}
ret = ocfs2_mark_extent_refcounted(inode, data_et, handle,
cpos, num_clusters, p_cluster,
meta_ac, dealloc);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
ret = __ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
p_cluster, num_clusters, 0,
meta_ac, dealloc);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
if (post && post->func) {
ret = post->func(inode, handle, post->para);
if (ret)
mlog_errno(ret);
}
out_commit:
ocfs2_commit_trans(osb, handle);
out:
if (meta_ac)
ocfs2_free_alloc_context(meta_ac);
return ret;
}
static int ocfs2_change_ctime(struct inode *inode,
struct buffer_head *di_bh)
{
int ret;
handle_t *handle;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
handle = ocfs2_start_trans(OCFS2_SB(inode->i_sb),
OCFS2_INODE_UPDATE_CREDITS);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
mlog_errno(ret);
goto out;
}
ret = ocfs2_journal_access_di(handle, INODE_CACHE(inode), di_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
inode->i_ctime = current_time(inode);
di->i_ctime = cpu_to_le64(inode->i_ctime.tv_sec);
di->i_ctime_nsec = cpu_to_le32(inode->i_ctime.tv_nsec);
ocfs2_journal_dirty(handle, di_bh);
out_commit:
ocfs2_commit_trans(OCFS2_SB(inode->i_sb), handle);
out:
return ret;
}
static int ocfs2_attach_refcount_tree(struct inode *inode,
struct buffer_head *di_bh)
{
int ret, data_changed = 0;
struct buffer_head *ref_root_bh = NULL;
struct ocfs2_inode_info *oi = OCFS2_I(inode);
struct ocfs2_dinode *di = (struct ocfs2_dinode *)di_bh->b_data;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_refcount_tree *ref_tree;
unsigned int ext_flags;
loff_t size;
u32 cpos, num_clusters, clusters, p_cluster;
struct ocfs2_cached_dealloc_ctxt dealloc;
struct ocfs2_extent_tree di_et;
ocfs2_init_dealloc_ctxt(&dealloc);
if (!ocfs2_is_refcount_inode(inode)) {
ret = ocfs2_create_refcount_tree(inode, di_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
}
BUG_ON(!di->i_refcount_loc);
ret = ocfs2_lock_refcount_tree(osb,
le64_to_cpu(di->i_refcount_loc), 1,
&ref_tree, &ref_root_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
if (oi->ip_dyn_features & OCFS2_INLINE_DATA_FL)
goto attach_xattr;
ocfs2_init_dinode_extent_tree(&di_et, INODE_CACHE(inode), di_bh);
size = i_size_read(inode);
clusters = ocfs2_clusters_for_bytes(inode->i_sb, size);
cpos = 0;
while (cpos < clusters) {
ret = ocfs2_get_clusters(inode, cpos, &p_cluster,
&num_clusters, &ext_flags);
if (ret) {
mlog_errno(ret);
goto unlock;
}
if (p_cluster && !(ext_flags & OCFS2_EXT_REFCOUNTED)) {
ret = ocfs2_add_refcount_flag(inode, &di_et,
&ref_tree->rf_ci,
ref_root_bh, cpos,
p_cluster, num_clusters,
&dealloc, NULL);
if (ret) {
mlog_errno(ret);
goto unlock;
}
data_changed = 1;
}
cpos += num_clusters;
}
attach_xattr:
if (oi->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
ret = ocfs2_xattr_attach_refcount_tree(inode, di_bh,
&ref_tree->rf_ci,
ref_root_bh,
&dealloc);
if (ret) {
mlog_errno(ret);
goto unlock;
}
}
if (data_changed) {
ret = ocfs2_change_ctime(inode, di_bh);
if (ret)
mlog_errno(ret);
}
unlock:
ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
brelse(ref_root_bh);
if (!ret && ocfs2_dealloc_has_cluster(&dealloc)) {
ocfs2_schedule_truncate_log_flush(osb, 1);
ocfs2_run_deallocs(osb, &dealloc);
}
out:
/*
* Empty the extent map so that we may get the right extent
* record from the disk.
*/
ocfs2_extent_map_trunc(inode, 0);
return ret;
}
static int ocfs2_add_refcounted_extent(struct inode *inode,
struct ocfs2_extent_tree *et,
struct ocfs2_caching_info *ref_ci,
struct buffer_head *ref_root_bh,
u32 cpos, u32 p_cluster, u32 num_clusters,
unsigned int ext_flags,
struct ocfs2_cached_dealloc_ctxt *dealloc)
{
int ret;
handle_t *handle;
int credits = 0;
struct ocfs2_super *osb = OCFS2_SB(inode->i_sb);
struct ocfs2_alloc_context *meta_ac = NULL;
ret = ocfs2_lock_refcount_allocators(inode->i_sb,
p_cluster, num_clusters,
et, ref_ci,
ref_root_bh, &meta_ac,
NULL, &credits);
if (ret) {
mlog_errno(ret);
goto out;
}
handle = ocfs2_start_trans(osb, credits);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
mlog_errno(ret);
goto out;
}
ret = ocfs2_insert_extent(handle, et, cpos,
ocfs2_clusters_to_blocks(inode->i_sb, p_cluster),
num_clusters, ext_flags, meta_ac);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
ret = ocfs2_increase_refcount(handle, ref_ci, ref_root_bh,
p_cluster, num_clusters,
meta_ac, dealloc);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
ret = dquot_alloc_space_nodirty(inode,
ocfs2_clusters_to_bytes(osb->sb, num_clusters));
if (ret)
mlog_errno(ret);
out_commit:
ocfs2_commit_trans(osb, handle);
out:
if (meta_ac)
ocfs2_free_alloc_context(meta_ac);
return ret;
}
static int ocfs2_duplicate_inline_data(struct inode *s_inode,
struct buffer_head *s_bh,
struct inode *t_inode,
struct buffer_head *t_bh)
{
int ret;
handle_t *handle;
struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
struct ocfs2_dinode *t_di = (struct ocfs2_dinode *)t_bh->b_data;
BUG_ON(!(OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL));
handle = ocfs2_start_trans(osb, OCFS2_INODE_UPDATE_CREDITS);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
mlog_errno(ret);
goto out;
}
ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
t_di->id2.i_data.id_count = s_di->id2.i_data.id_count;
memcpy(t_di->id2.i_data.id_data, s_di->id2.i_data.id_data,
le16_to_cpu(s_di->id2.i_data.id_count));
spin_lock(&OCFS2_I(t_inode)->ip_lock);
OCFS2_I(t_inode)->ip_dyn_features |= OCFS2_INLINE_DATA_FL;
t_di->i_dyn_features = cpu_to_le16(OCFS2_I(t_inode)->ip_dyn_features);
spin_unlock(&OCFS2_I(t_inode)->ip_lock);
ocfs2_journal_dirty(handle, t_bh);
out_commit:
ocfs2_commit_trans(osb, handle);
out:
return ret;
}
static int ocfs2_duplicate_extent_list(struct inode *s_inode,
struct inode *t_inode,
struct buffer_head *t_bh,
struct ocfs2_caching_info *ref_ci,
struct buffer_head *ref_root_bh,
struct ocfs2_cached_dealloc_ctxt *dealloc)
{
int ret = 0;
u32 p_cluster, num_clusters, clusters, cpos;
loff_t size;
unsigned int ext_flags;
struct ocfs2_extent_tree et;
ocfs2_init_dinode_extent_tree(&et, INODE_CACHE(t_inode), t_bh);
size = i_size_read(s_inode);
clusters = ocfs2_clusters_for_bytes(s_inode->i_sb, size);
cpos = 0;
while (cpos < clusters) {
ret = ocfs2_get_clusters(s_inode, cpos, &p_cluster,
&num_clusters, &ext_flags);
if (ret) {
mlog_errno(ret);
goto out;
}
if (p_cluster) {
ret = ocfs2_add_refcounted_extent(t_inode, &et,
ref_ci, ref_root_bh,
cpos, p_cluster,
num_clusters,
ext_flags,
dealloc);
if (ret) {
mlog_errno(ret);
goto out;
}
}
cpos += num_clusters;
}
out:
return ret;
}
/*
* change the new file's attributes to the src.
*
* reflink creates a snapshot of a file, that means the attributes
* must be identical except for three exceptions - nlink, ino, and ctime.
*/
static int ocfs2_complete_reflink(struct inode *s_inode,
struct buffer_head *s_bh,
struct inode *t_inode,
struct buffer_head *t_bh,
bool preserve)
{
int ret;
handle_t *handle;
struct ocfs2_dinode *s_di = (struct ocfs2_dinode *)s_bh->b_data;
struct ocfs2_dinode *di = (struct ocfs2_dinode *)t_bh->b_data;
loff_t size = i_size_read(s_inode);
handle = ocfs2_start_trans(OCFS2_SB(t_inode->i_sb),
OCFS2_INODE_UPDATE_CREDITS);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
mlog_errno(ret);
return ret;
}
ret = ocfs2_journal_access_di(handle, INODE_CACHE(t_inode), t_bh,
OCFS2_JOURNAL_ACCESS_WRITE);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
spin_lock(&OCFS2_I(t_inode)->ip_lock);
OCFS2_I(t_inode)->ip_clusters = OCFS2_I(s_inode)->ip_clusters;
OCFS2_I(t_inode)->ip_attr = OCFS2_I(s_inode)->ip_attr;
OCFS2_I(t_inode)->ip_dyn_features = OCFS2_I(s_inode)->ip_dyn_features;
spin_unlock(&OCFS2_I(t_inode)->ip_lock);
i_size_write(t_inode, size);
t_inode->i_blocks = s_inode->i_blocks;
di->i_xattr_inline_size = s_di->i_xattr_inline_size;
di->i_clusters = s_di->i_clusters;
di->i_size = s_di->i_size;
di->i_dyn_features = s_di->i_dyn_features;
di->i_attr = s_di->i_attr;
if (preserve) {
t_inode->i_uid = s_inode->i_uid;
t_inode->i_gid = s_inode->i_gid;
t_inode->i_mode = s_inode->i_mode;
di->i_uid = s_di->i_uid;
di->i_gid = s_di->i_gid;
di->i_mode = s_di->i_mode;
/*
* update time.
* we want mtime to appear identical to the source and
* update ctime.
*/
t_inode->i_ctime = current_time(t_inode);
di->i_ctime = cpu_to_le64(t_inode->i_ctime.tv_sec);
di->i_ctime_nsec = cpu_to_le32(t_inode->i_ctime.tv_nsec);
t_inode->i_mtime = s_inode->i_mtime;
di->i_mtime = s_di->i_mtime;
di->i_mtime_nsec = s_di->i_mtime_nsec;
}
ocfs2_journal_dirty(handle, t_bh);
out_commit:
ocfs2_commit_trans(OCFS2_SB(t_inode->i_sb), handle);
return ret;
}
static int ocfs2_create_reflink_node(struct inode *s_inode,
struct buffer_head *s_bh,
struct inode *t_inode,
struct buffer_head *t_bh,
bool preserve)
{
int ret;
struct buffer_head *ref_root_bh = NULL;
struct ocfs2_cached_dealloc_ctxt dealloc;
struct ocfs2_super *osb = OCFS2_SB(s_inode->i_sb);
struct ocfs2_dinode *di = (struct ocfs2_dinode *)s_bh->b_data;
struct ocfs2_refcount_tree *ref_tree;
ocfs2_init_dealloc_ctxt(&dealloc);
ret = ocfs2_set_refcount_tree(t_inode, t_bh,
le64_to_cpu(di->i_refcount_loc));
if (ret) {
mlog_errno(ret);
goto out;
}
if (OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
ret = ocfs2_duplicate_inline_data(s_inode, s_bh,
t_inode, t_bh);
if (ret)
mlog_errno(ret);
goto out;
}
ret = ocfs2_lock_refcount_tree(osb, le64_to_cpu(di->i_refcount_loc),
1, &ref_tree, &ref_root_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
ret = ocfs2_duplicate_extent_list(s_inode, t_inode, t_bh,
&ref_tree->rf_ci, ref_root_bh,
&dealloc);
if (ret) {
mlog_errno(ret);
goto out_unlock_refcount;
}
out_unlock_refcount:
ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
brelse(ref_root_bh);
out:
if (ocfs2_dealloc_has_cluster(&dealloc)) {
ocfs2_schedule_truncate_log_flush(osb, 1);
ocfs2_run_deallocs(osb, &dealloc);
}
return ret;
}
static int __ocfs2_reflink(struct dentry *old_dentry,
struct buffer_head *old_bh,
struct inode *new_inode,
bool preserve)
{
int ret;
struct inode *inode = d_inode(old_dentry);
struct buffer_head *new_bh = NULL;
ocfs2: Zero the tail cluster when extending past i_size. ocfs2's allocation unit is the cluster. This can be larger than a block or even a memory page. This means that a file may have many blocks in its last extent that are beyond the block containing i_size. There also may be more unwritten extents after that. When ocfs2 grows a file, it zeros the entire cluster in order to ensure future i_size growth will see cleared blocks. Unfortunately, block_write_full_page() drops the pages past i_size. This means that ocfs2 is actually leaking garbage data into the tail end of that last cluster. This is a bug. We adjust ocfs2_write_begin_nolock() and ocfs2_extend_file() to detect when a write or truncate is past i_size. They will use ocfs2_zero_extend() to ensure the data is properly zeroed. Older versions of ocfs2_zero_extend() simply zeroed every block between i_size and the zeroing position. This presumes three things: 1) There is allocation for all of these blocks. 2) The extents are not unwritten. 3) The extents are not refcounted. (1) and (2) hold true for non-sparse filesystems, which used to be the only users of ocfs2_zero_extend(). (3) is another bug. Since we're now using ocfs2_zero_extend() for sparse filesystems as well, we teach ocfs2_zero_extend() to check every extent between i_size and the zeroing position. If the extent is unwritten, it is ignored. If it is refcounted, it is CoWed. Then it is zeroed. Signed-off-by: Joel Becker <joel.becker@oracle.com> Cc: stable@kernel.org
2010-07-02 06:13:31 +08:00
if (OCFS2_I(inode)->ip_flags & OCFS2_INODE_SYSTEM_FILE) {
ret = -EINVAL;
mlog_errno(ret);
goto out;
}
ret = filemap_fdatawrite(inode->i_mapping);
if (ret) {
mlog_errno(ret);
goto out;
}
ret = ocfs2_attach_refcount_tree(inode, old_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
inode_lock_nested(new_inode, I_MUTEX_CHILD);
ocfs2: Fix lockdep warning in reflink. This patch change mutex_lock to a new subclass and add a new inode lock subclass for the target inode which caused this lockdep warning. ============================================= [ INFO: possible recursive locking detected ] 2.6.35+ #5 --------------------------------------------- reflink/11086 is trying to acquire lock: (Meta){+++++.}, at: [<ffffffffa06f9d65>] ocfs2_reflink_ioctl+0x898/0x1229 [ocfs2] but task is already holding lock: (Meta){+++++.}, at: [<ffffffffa06f9aa0>] ocfs2_reflink_ioctl+0x5d3/0x1229 [ocfs2] other info that might help us debug this: 6 locks held by reflink/11086: #0: (&sb->s_type->i_mutex_key#15/1){+.+.+.}, at: [<ffffffff820e09ec>] lookup_create+0x26/0x97 #1: (&sb->s_type->i_mutex_key#15){+.+.+.}, at: [<ffffffffa06f99a0>] ocfs2_reflink_ioctl+0x4d3/0x1229 [ocfs2] #2: (Meta){+++++.}, at: [<ffffffffa06f9aa0>] ocfs2_reflink_ioctl+0x5d3/0x1229 [ocfs2] #3: (&oi->ip_xattr_sem){+.+.+.}, at: [<ffffffffa06f9b58>] ocfs2_reflink_ioctl+0x68b/0x1229 [ocfs2] #4: (&oi->ip_alloc_sem){+.+.+.}, at: [<ffffffffa06f9b67>] ocfs2_reflink_ioctl+0x69a/0x1229 [ocfs2] #5: (&sb->s_type->i_mutex_key#15/2){+.+...}, at: [<ffffffffa06f9d4f>] ocfs2_reflink_ioctl+0x882/0x1229 [ocfs2] stack backtrace: Pid: 11086, comm: reflink Not tainted 2.6.35+ #5 Call Trace: [<ffffffff82063dd9>] validate_chain+0x56e/0xd68 [<ffffffff82062275>] ? mark_held_locks+0x49/0x69 [<ffffffff82064d6d>] __lock_acquire+0x79a/0x7f1 [<ffffffff82065a81>] lock_acquire+0xc6/0xed [<ffffffffa06f9d65>] ? ocfs2_reflink_ioctl+0x898/0x1229 [ocfs2] [<ffffffffa06c9ade>] __ocfs2_cluster_lock+0x975/0xa0d [ocfs2] [<ffffffffa06f9d65>] ? ocfs2_reflink_ioctl+0x898/0x1229 [ocfs2] [<ffffffffa06e107b>] ? ocfs2_wait_for_recovery+0x15/0x8a [ocfs2] [<ffffffffa06cb6ea>] ocfs2_inode_lock_full_nested+0x1ac/0xdc5 [ocfs2] [<ffffffffa06f9d65>] ? ocfs2_reflink_ioctl+0x898/0x1229 [ocfs2] [<ffffffff820623a0>] ? trace_hardirqs_on_caller+0x10b/0x12f [<ffffffff82060193>] ? debug_mutex_free_waiter+0x4f/0x53 [<ffffffffa06f9d65>] ocfs2_reflink_ioctl+0x898/0x1229 [ocfs2] [<ffffffffa06ce24a>] ? ocfs2_file_lock_res_init+0x66/0x78 [ocfs2] [<ffffffff820bb2d2>] ? might_fault+0x40/0x8d [<ffffffffa06df9f6>] ocfs2_ioctl+0x61a/0x656 [ocfs2] [<ffffffff820ee5d3>] ? mntput_no_expire+0x1d/0xb0 [<ffffffff820e07b3>] ? path_put+0x2c/0x31 [<ffffffff820e53ac>] vfs_ioctl+0x2a/0x9d [<ffffffff820e5903>] do_vfs_ioctl+0x45d/0x4ae [<ffffffff8233a7f6>] ? _raw_spin_unlock+0x26/0x2a [<ffffffff8200299c>] ? sysret_check+0x27/0x62 [<ffffffff820e59ab>] sys_ioctl+0x57/0x7a [<ffffffff8200296b>] system_call_fastpath+0x16/0x1b Signed-off-by: Tao Ma <tao.ma@oracle.com> Signed-off-by: Joel Becker <joel.becker@oracle.com>
2010-09-07 13:30:06 +08:00
ret = ocfs2_inode_lock_nested(new_inode, &new_bh, 1,
OI_LS_REFLINK_TARGET);
if (ret) {
mlog_errno(ret);
goto out_unlock;
}
ret = ocfs2_create_reflink_node(inode, old_bh,
new_inode, new_bh, preserve);
if (ret) {
mlog_errno(ret);
goto inode_unlock;
}
if (OCFS2_I(inode)->ip_dyn_features & OCFS2_HAS_XATTR_FL) {
ret = ocfs2_reflink_xattrs(inode, old_bh,
new_inode, new_bh,
preserve);
if (ret) {
mlog_errno(ret);
goto inode_unlock;
}
}
ret = ocfs2_complete_reflink(inode, old_bh,
new_inode, new_bh, preserve);
if (ret)
mlog_errno(ret);
inode_unlock:
ocfs2_inode_unlock(new_inode, 1);
brelse(new_bh);
out_unlock:
inode_unlock(new_inode);
out:
if (!ret) {
ret = filemap_fdatawait(inode->i_mapping);
if (ret)
mlog_errno(ret);
}
return ret;
}
static int ocfs2_reflink(struct dentry *old_dentry, struct inode *dir,
struct dentry *new_dentry, bool preserve)
{
ocfs2: take inode cluster lock before moving reflinked inode from orphan dir While reflinking an inode, we create a new inode in orphan directory, then take EX lock on it, reflink the original inode to orphan inode and release EX lock. Once the lock is released another node could request it in EX mode from ocfs2_recover_orphans() which causes downconvert of the lock, on this node, to NL mode. Later we attempt to initialize security acl for the orphan inode and move it to the reflink destination. However, while doing this we dont take EX lock on the inode. This could potentially cause problems because we could be starting transaction, accessing journal and modifying metadata of the inode while holding NL lock and with another node holding EX lock on the inode. Fix this by taking orphan inode cluster lock in EX mode before initializing security and moving orphan inode to reflink destination. Use the __tracker variant while taking inode lock to avoid recursive locking in the ocfs2_init_security_and_acl() call chain. Link: http://lkml.kernel.org/r/1523475107-7639-1-git-send-email-ashish.samant@oracle.com Signed-off-by: Ashish Samant <ashish.samant@oracle.com> Reviewed-by: Joseph Qi <jiangqi903@gmail.com> Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com> Acked-by: Jun Piao <piaojun@huawei.com> Cc: Mark Fasheh <mark@fasheh.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Changwei Ge <ge.changwei@h3c.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-05-12 07:02:07 +08:00
int error, had_lock;
struct inode *inode = d_inode(old_dentry);
struct buffer_head *old_bh = NULL;
struct inode *new_orphan_inode = NULL;
ocfs2: take inode cluster lock before moving reflinked inode from orphan dir While reflinking an inode, we create a new inode in orphan directory, then take EX lock on it, reflink the original inode to orphan inode and release EX lock. Once the lock is released another node could request it in EX mode from ocfs2_recover_orphans() which causes downconvert of the lock, on this node, to NL mode. Later we attempt to initialize security acl for the orphan inode and move it to the reflink destination. However, while doing this we dont take EX lock on the inode. This could potentially cause problems because we could be starting transaction, accessing journal and modifying metadata of the inode while holding NL lock and with another node holding EX lock on the inode. Fix this by taking orphan inode cluster lock in EX mode before initializing security and moving orphan inode to reflink destination. Use the __tracker variant while taking inode lock to avoid recursive locking in the ocfs2_init_security_and_acl() call chain. Link: http://lkml.kernel.org/r/1523475107-7639-1-git-send-email-ashish.samant@oracle.com Signed-off-by: Ashish Samant <ashish.samant@oracle.com> Reviewed-by: Joseph Qi <jiangqi903@gmail.com> Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com> Acked-by: Jun Piao <piaojun@huawei.com> Cc: Mark Fasheh <mark@fasheh.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Changwei Ge <ge.changwei@h3c.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-05-12 07:02:07 +08:00
struct ocfs2_lock_holder oh;
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
return -EOPNOTSUPP;
error = ocfs2_create_inode_in_orphan(dir, inode->i_mode,
&new_orphan_inode);
if (error) {
mlog_errno(error);
goto out;
}
ocfs2: refcount: take rw_lock in ocfs2_reflink This patch tries to fix this crash: #5 [ffff88003c1cd690] do_invalid_op at ffffffff810166d5 #6 [ffff88003c1cd730] invalid_op at ffffffff8159b2de [exception RIP: ocfs2_direct_IO_get_blocks+359] RIP: ffffffffa05dfa27 RSP: ffff88003c1cd7e8 RFLAGS: 00010202 RAX: 0000000000000000 RBX: ffff88003c1cdaa8 RCX: 0000000000000000 RDX: 000000000000000c RSI: ffff880027a95000 RDI: ffff88003c79b540 RBP: ffff88003c1cd858 R8: 0000000000000000 R9: ffffffff815f6ba0 R10: 00000000000001c9 R11: 00000000000001c9 R12: ffff88002d271500 R13: 0000000000000001 R14: 0000000000000000 R15: 0000000000001000 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #7 [ffff88003c1cd860] do_direct_IO at ffffffff811cd31b #8 [ffff88003c1cd950] direct_IO_iovec at ffffffff811cde9c #9 [ffff88003c1cd9b0] do_blockdev_direct_IO at ffffffff811ce764 #10 [ffff88003c1cdb80] __blockdev_direct_IO at ffffffff811ce7cc #11 [ffff88003c1cdbb0] ocfs2_direct_IO at ffffffffa05df756 [ocfs2] #12 [ffff88003c1cdbe0] generic_file_direct_write_iter at ffffffff8112f935 #13 [ffff88003c1cdc40] ocfs2_file_write_iter at ffffffffa0600ccc [ocfs2] #14 [ffff88003c1cdd50] do_aio_write at ffffffff8119126c #15 [ffff88003c1cddc0] aio_rw_vect_retry at ffffffff811d9bb4 #16 [ffff88003c1cddf0] aio_run_iocb at ffffffff811db880 #17 [ffff88003c1cde30] io_submit_one at ffffffff811dc238 #18 [ffff88003c1cde80] do_io_submit at ffffffff811dc437 #19 [ffff88003c1cdf70] sys_io_submit at ffffffff811dc530 #20 [ffff88003c1cdf80] system_call_fastpath at ffffffff8159a159 It crashes at BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED)); in ocfs2_direct_IO_get_blocks. ocfs2_direct_IO_get_blocks is expecting the OCFS2_EXT_REFCOUNTED be removed in ocfs2_prepare_inode_for_write() if it was there. But no cluster lock is taken during the time before (or inside) ocfs2_prepare_inode_for_write() and after ocfs2_direct_IO_get_blocks(). It can happen in this case: Node A(which crashes) Node B ------------------------ --------------------------- ocfs2_file_aio_write ocfs2_prepare_inode_for_write ocfs2_inode_lock ... ocfs2_inode_unlock #no refcount found .... ocfs2_reflink ocfs2_inode_lock ... ocfs2_inode_unlock #now, refcount flag set on extent ... flush change to disk ocfs2_direct_IO_get_blocks ocfs2_get_clusters #extent map miss #buffer_head miss read extents from disk found refcount flag on extent crash.. Fix: Take rw_lock in ocfs2_reflink path Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com> Reviewed-by: Mark Fasheh <mfasheh@suse.de> Cc: Joel Becker <jlbec@evilplan.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-06-24 04:22:08 +08:00
error = ocfs2_rw_lock(inode, 1);
if (error) {
mlog_errno(error);
goto out;
}
error = ocfs2_inode_lock(inode, &old_bh, 1);
if (error) {
mlog_errno(error);
ocfs2: refcount: take rw_lock in ocfs2_reflink This patch tries to fix this crash: #5 [ffff88003c1cd690] do_invalid_op at ffffffff810166d5 #6 [ffff88003c1cd730] invalid_op at ffffffff8159b2de [exception RIP: ocfs2_direct_IO_get_blocks+359] RIP: ffffffffa05dfa27 RSP: ffff88003c1cd7e8 RFLAGS: 00010202 RAX: 0000000000000000 RBX: ffff88003c1cdaa8 RCX: 0000000000000000 RDX: 000000000000000c RSI: ffff880027a95000 RDI: ffff88003c79b540 RBP: ffff88003c1cd858 R8: 0000000000000000 R9: ffffffff815f6ba0 R10: 00000000000001c9 R11: 00000000000001c9 R12: ffff88002d271500 R13: 0000000000000001 R14: 0000000000000000 R15: 0000000000001000 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #7 [ffff88003c1cd860] do_direct_IO at ffffffff811cd31b #8 [ffff88003c1cd950] direct_IO_iovec at ffffffff811cde9c #9 [ffff88003c1cd9b0] do_blockdev_direct_IO at ffffffff811ce764 #10 [ffff88003c1cdb80] __blockdev_direct_IO at ffffffff811ce7cc #11 [ffff88003c1cdbb0] ocfs2_direct_IO at ffffffffa05df756 [ocfs2] #12 [ffff88003c1cdbe0] generic_file_direct_write_iter at ffffffff8112f935 #13 [ffff88003c1cdc40] ocfs2_file_write_iter at ffffffffa0600ccc [ocfs2] #14 [ffff88003c1cdd50] do_aio_write at ffffffff8119126c #15 [ffff88003c1cddc0] aio_rw_vect_retry at ffffffff811d9bb4 #16 [ffff88003c1cddf0] aio_run_iocb at ffffffff811db880 #17 [ffff88003c1cde30] io_submit_one at ffffffff811dc238 #18 [ffff88003c1cde80] do_io_submit at ffffffff811dc437 #19 [ffff88003c1cdf70] sys_io_submit at ffffffff811dc530 #20 [ffff88003c1cdf80] system_call_fastpath at ffffffff8159a159 It crashes at BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED)); in ocfs2_direct_IO_get_blocks. ocfs2_direct_IO_get_blocks is expecting the OCFS2_EXT_REFCOUNTED be removed in ocfs2_prepare_inode_for_write() if it was there. But no cluster lock is taken during the time before (or inside) ocfs2_prepare_inode_for_write() and after ocfs2_direct_IO_get_blocks(). It can happen in this case: Node A(which crashes) Node B ------------------------ --------------------------- ocfs2_file_aio_write ocfs2_prepare_inode_for_write ocfs2_inode_lock ... ocfs2_inode_unlock #no refcount found .... ocfs2_reflink ocfs2_inode_lock ... ocfs2_inode_unlock #now, refcount flag set on extent ... flush change to disk ocfs2_direct_IO_get_blocks ocfs2_get_clusters #extent map miss #buffer_head miss read extents from disk found refcount flag on extent crash.. Fix: Take rw_lock in ocfs2_reflink path Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com> Reviewed-by: Mark Fasheh <mfasheh@suse.de> Cc: Joel Becker <jlbec@evilplan.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-06-24 04:22:08 +08:00
ocfs2_rw_unlock(inode, 1);
goto out;
}
down_write(&OCFS2_I(inode)->ip_xattr_sem);
down_write(&OCFS2_I(inode)->ip_alloc_sem);
error = __ocfs2_reflink(old_dentry, old_bh,
new_orphan_inode, preserve);
up_write(&OCFS2_I(inode)->ip_alloc_sem);
up_write(&OCFS2_I(inode)->ip_xattr_sem);
ocfs2_inode_unlock(inode, 1);
ocfs2: refcount: take rw_lock in ocfs2_reflink This patch tries to fix this crash: #5 [ffff88003c1cd690] do_invalid_op at ffffffff810166d5 #6 [ffff88003c1cd730] invalid_op at ffffffff8159b2de [exception RIP: ocfs2_direct_IO_get_blocks+359] RIP: ffffffffa05dfa27 RSP: ffff88003c1cd7e8 RFLAGS: 00010202 RAX: 0000000000000000 RBX: ffff88003c1cdaa8 RCX: 0000000000000000 RDX: 000000000000000c RSI: ffff880027a95000 RDI: ffff88003c79b540 RBP: ffff88003c1cd858 R8: 0000000000000000 R9: ffffffff815f6ba0 R10: 00000000000001c9 R11: 00000000000001c9 R12: ffff88002d271500 R13: 0000000000000001 R14: 0000000000000000 R15: 0000000000001000 ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018 #7 [ffff88003c1cd860] do_direct_IO at ffffffff811cd31b #8 [ffff88003c1cd950] direct_IO_iovec at ffffffff811cde9c #9 [ffff88003c1cd9b0] do_blockdev_direct_IO at ffffffff811ce764 #10 [ffff88003c1cdb80] __blockdev_direct_IO at ffffffff811ce7cc #11 [ffff88003c1cdbb0] ocfs2_direct_IO at ffffffffa05df756 [ocfs2] #12 [ffff88003c1cdbe0] generic_file_direct_write_iter at ffffffff8112f935 #13 [ffff88003c1cdc40] ocfs2_file_write_iter at ffffffffa0600ccc [ocfs2] #14 [ffff88003c1cdd50] do_aio_write at ffffffff8119126c #15 [ffff88003c1cddc0] aio_rw_vect_retry at ffffffff811d9bb4 #16 [ffff88003c1cddf0] aio_run_iocb at ffffffff811db880 #17 [ffff88003c1cde30] io_submit_one at ffffffff811dc238 #18 [ffff88003c1cde80] do_io_submit at ffffffff811dc437 #19 [ffff88003c1cdf70] sys_io_submit at ffffffff811dc530 #20 [ffff88003c1cdf80] system_call_fastpath at ffffffff8159a159 It crashes at BUG_ON(create && (ext_flags & OCFS2_EXT_REFCOUNTED)); in ocfs2_direct_IO_get_blocks. ocfs2_direct_IO_get_blocks is expecting the OCFS2_EXT_REFCOUNTED be removed in ocfs2_prepare_inode_for_write() if it was there. But no cluster lock is taken during the time before (or inside) ocfs2_prepare_inode_for_write() and after ocfs2_direct_IO_get_blocks(). It can happen in this case: Node A(which crashes) Node B ------------------------ --------------------------- ocfs2_file_aio_write ocfs2_prepare_inode_for_write ocfs2_inode_lock ... ocfs2_inode_unlock #no refcount found .... ocfs2_reflink ocfs2_inode_lock ... ocfs2_inode_unlock #now, refcount flag set on extent ... flush change to disk ocfs2_direct_IO_get_blocks ocfs2_get_clusters #extent map miss #buffer_head miss read extents from disk found refcount flag on extent crash.. Fix: Take rw_lock in ocfs2_reflink path Signed-off-by: Wengang Wang <wen.gang.wang@oracle.com> Reviewed-by: Mark Fasheh <mfasheh@suse.de> Cc: Joel Becker <jlbec@evilplan.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-06-24 04:22:08 +08:00
ocfs2_rw_unlock(inode, 1);
brelse(old_bh);
if (error) {
mlog_errno(error);
goto out;
}
ocfs2: take inode cluster lock before moving reflinked inode from orphan dir While reflinking an inode, we create a new inode in orphan directory, then take EX lock on it, reflink the original inode to orphan inode and release EX lock. Once the lock is released another node could request it in EX mode from ocfs2_recover_orphans() which causes downconvert of the lock, on this node, to NL mode. Later we attempt to initialize security acl for the orphan inode and move it to the reflink destination. However, while doing this we dont take EX lock on the inode. This could potentially cause problems because we could be starting transaction, accessing journal and modifying metadata of the inode while holding NL lock and with another node holding EX lock on the inode. Fix this by taking orphan inode cluster lock in EX mode before initializing security and moving orphan inode to reflink destination. Use the __tracker variant while taking inode lock to avoid recursive locking in the ocfs2_init_security_and_acl() call chain. Link: http://lkml.kernel.org/r/1523475107-7639-1-git-send-email-ashish.samant@oracle.com Signed-off-by: Ashish Samant <ashish.samant@oracle.com> Reviewed-by: Joseph Qi <jiangqi903@gmail.com> Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com> Acked-by: Jun Piao <piaojun@huawei.com> Cc: Mark Fasheh <mark@fasheh.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Changwei Ge <ge.changwei@h3c.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-05-12 07:02:07 +08:00
had_lock = ocfs2_inode_lock_tracker(new_orphan_inode, NULL, 1,
&oh);
if (had_lock < 0) {
error = had_lock;
mlog_errno(error);
goto out;
}
/* If the security isn't preserved, we need to re-initialize them. */
if (!preserve) {
error = ocfs2_init_security_and_acl(dir, new_orphan_inode,
&new_dentry->d_name);
if (error)
mlog_errno(error);
}
if (!error) {
error = ocfs2_mv_orphaned_inode_to_new(dir, new_orphan_inode,
new_dentry);
if (error)
mlog_errno(error);
}
ocfs2: take inode cluster lock before moving reflinked inode from orphan dir While reflinking an inode, we create a new inode in orphan directory, then take EX lock on it, reflink the original inode to orphan inode and release EX lock. Once the lock is released another node could request it in EX mode from ocfs2_recover_orphans() which causes downconvert of the lock, on this node, to NL mode. Later we attempt to initialize security acl for the orphan inode and move it to the reflink destination. However, while doing this we dont take EX lock on the inode. This could potentially cause problems because we could be starting transaction, accessing journal and modifying metadata of the inode while holding NL lock and with another node holding EX lock on the inode. Fix this by taking orphan inode cluster lock in EX mode before initializing security and moving orphan inode to reflink destination. Use the __tracker variant while taking inode lock to avoid recursive locking in the ocfs2_init_security_and_acl() call chain. Link: http://lkml.kernel.org/r/1523475107-7639-1-git-send-email-ashish.samant@oracle.com Signed-off-by: Ashish Samant <ashish.samant@oracle.com> Reviewed-by: Joseph Qi <jiangqi903@gmail.com> Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com> Acked-by: Jun Piao <piaojun@huawei.com> Cc: Mark Fasheh <mark@fasheh.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Changwei Ge <ge.changwei@h3c.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-05-12 07:02:07 +08:00
ocfs2_inode_unlock_tracker(new_orphan_inode, 1, &oh, had_lock);
ocfs2: take inode cluster lock before moving reflinked inode from orphan dir While reflinking an inode, we create a new inode in orphan directory, then take EX lock on it, reflink the original inode to orphan inode and release EX lock. Once the lock is released another node could request it in EX mode from ocfs2_recover_orphans() which causes downconvert of the lock, on this node, to NL mode. Later we attempt to initialize security acl for the orphan inode and move it to the reflink destination. However, while doing this we dont take EX lock on the inode. This could potentially cause problems because we could be starting transaction, accessing journal and modifying metadata of the inode while holding NL lock and with another node holding EX lock on the inode. Fix this by taking orphan inode cluster lock in EX mode before initializing security and moving orphan inode to reflink destination. Use the __tracker variant while taking inode lock to avoid recursive locking in the ocfs2_init_security_and_acl() call chain. Link: http://lkml.kernel.org/r/1523475107-7639-1-git-send-email-ashish.samant@oracle.com Signed-off-by: Ashish Samant <ashish.samant@oracle.com> Reviewed-by: Joseph Qi <jiangqi903@gmail.com> Reviewed-by: Junxiao Bi <junxiao.bi@oracle.com> Acked-by: Jun Piao <piaojun@huawei.com> Cc: Mark Fasheh <mark@fasheh.com> Cc: Joel Becker <jlbec@evilplan.org> Cc: Changwei Ge <ge.changwei@h3c.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2018-05-12 07:02:07 +08:00
out:
if (new_orphan_inode) {
/*
* We need to open_unlock the inode no matter whether we
* succeed or not, so that other nodes can delete it later.
*/
ocfs2_open_unlock(new_orphan_inode);
if (error)
iput(new_orphan_inode);
}
return error;
}
/*
* Below here are the bits used by OCFS2_IOC_REFLINK() to fake
* sys_reflink(). This will go away when vfs_reflink() exists in
* fs/namei.c.
*/
/* copied from may_create in VFS. */
static inline int ocfs2_may_create(struct inode *dir, struct dentry *child)
{
if (d_really_is_positive(child))
return -EEXIST;
if (IS_DEADDIR(dir))
return -ENOENT;
return inode_permission(&init_user_ns, dir, MAY_WRITE | MAY_EXEC);
}
/**
* ocfs2_vfs_reflink - Create a reference-counted link
*
* @old_dentry: source dentry + inode
* @dir: directory to create the target
* @new_dentry: target dentry
* @preserve: if true, preserve all file attributes
*/
static int ocfs2_vfs_reflink(struct dentry *old_dentry, struct inode *dir,
struct dentry *new_dentry, bool preserve)
{
struct inode *inode = d_inode(old_dentry);
int error;
if (!inode)
return -ENOENT;
error = ocfs2_may_create(dir, new_dentry);
if (error)
return error;
if (dir->i_sb != inode->i_sb)
return -EXDEV;
/*
* A reflink to an append-only or immutable file cannot be created.
*/
if (IS_APPEND(inode) || IS_IMMUTABLE(inode))
return -EPERM;
/* Only regular files can be reflinked. */
if (!S_ISREG(inode->i_mode))
return -EPERM;
/*
* If the caller wants to preserve ownership, they require the
* rights to do so.
*/
if (preserve) {
if (!uid_eq(current_fsuid(), inode->i_uid) && !capable(CAP_CHOWN))
return -EPERM;
if (!in_group_p(inode->i_gid) && !capable(CAP_CHOWN))
return -EPERM;
}
/*
* If the caller is modifying any aspect of the attributes, they
* are not creating a snapshot. They need read permission on the
* file.
*/
if (!preserve) {
error = inode_permission(&init_user_ns, inode, MAY_READ);
if (error)
return error;
}
inode_lock(inode);
error = dquot_initialize(dir);
if (!error)
error = ocfs2_reflink(old_dentry, dir, new_dentry, preserve);
inode_unlock(inode);
if (!error)
fsnotify_create(dir, new_dentry);
return error;
}
/*
* Most codes are copied from sys_linkat.
*/
int ocfs2_reflink_ioctl(struct inode *inode,
const char __user *oldname,
const char __user *newname,
bool preserve)
{
struct dentry *new_dentry;
struct path old_path, new_path;
int error;
if (!ocfs2_refcount_tree(OCFS2_SB(inode->i_sb)))
return -EOPNOTSUPP;
error = user_path_at(AT_FDCWD, oldname, 0, &old_path);
if (error) {
mlog_errno(error);
return error;
}
new_dentry = user_path_create(AT_FDCWD, newname, &new_path, 0);
error = PTR_ERR(new_dentry);
if (IS_ERR(new_dentry)) {
mlog_errno(error);
goto out;
}
error = -EXDEV;
if (old_path.mnt != new_path.mnt) {
mlog_errno(error);
goto out_dput;
}
error = ocfs2_vfs_reflink(old_path.dentry,
d_inode(new_path.dentry),
new_dentry, preserve);
out_dput:
done_path_create(&new_path, new_dentry);
out:
path_put(&old_path);
return error;
}
/* Update destination inode size, if necessary. */
int ocfs2_reflink_update_dest(struct inode *dest,
struct buffer_head *d_bh,
loff_t newlen)
{
handle_t *handle;
int ret;
dest->i_blocks = ocfs2_inode_sector_count(dest);
if (newlen <= i_size_read(dest))
return 0;
handle = ocfs2_start_trans(OCFS2_SB(dest->i_sb),
OCFS2_INODE_UPDATE_CREDITS);
if (IS_ERR(handle)) {
ret = PTR_ERR(handle);
mlog_errno(ret);
return ret;
}
/* Extend i_size if needed. */
spin_lock(&OCFS2_I(dest)->ip_lock);
if (newlen > i_size_read(dest))
i_size_write(dest, newlen);
spin_unlock(&OCFS2_I(dest)->ip_lock);
dest->i_ctime = dest->i_mtime = current_time(dest);
ret = ocfs2_mark_inode_dirty(handle, dest, d_bh);
if (ret) {
mlog_errno(ret);
goto out_commit;
}
out_commit:
ocfs2_commit_trans(OCFS2_SB(dest->i_sb), handle);
return ret;
}
/* Remap the range pos_in:len in s_inode to pos_out:len in t_inode. */
static loff_t ocfs2_reflink_remap_extent(struct inode *s_inode,
struct buffer_head *s_bh,
loff_t pos_in,
struct inode *t_inode,
struct buffer_head *t_bh,
loff_t pos_out,
loff_t len,
struct ocfs2_cached_dealloc_ctxt *dealloc)
{
struct ocfs2_extent_tree s_et;
struct ocfs2_extent_tree t_et;
struct ocfs2_dinode *dis;
struct buffer_head *ref_root_bh = NULL;
struct ocfs2_refcount_tree *ref_tree;
struct ocfs2_super *osb;
loff_t remapped_bytes = 0;
loff_t pstart, plen;
u32 p_cluster, num_clusters, slast, spos, tpos, remapped_clus = 0;
unsigned int ext_flags;
int ret = 0;
osb = OCFS2_SB(s_inode->i_sb);
dis = (struct ocfs2_dinode *)s_bh->b_data;
ocfs2_init_dinode_extent_tree(&s_et, INODE_CACHE(s_inode), s_bh);
ocfs2_init_dinode_extent_tree(&t_et, INODE_CACHE(t_inode), t_bh);
spos = ocfs2_bytes_to_clusters(s_inode->i_sb, pos_in);
tpos = ocfs2_bytes_to_clusters(t_inode->i_sb, pos_out);
slast = ocfs2_clusters_for_bytes(s_inode->i_sb, pos_in + len);
while (spos < slast) {
if (fatal_signal_pending(current)) {
ret = -EINTR;
goto out;
}
/* Look up the extent. */
ret = ocfs2_get_clusters(s_inode, spos, &p_cluster,
&num_clusters, &ext_flags);
if (ret) {
mlog_errno(ret);
goto out;
}
num_clusters = min_t(u32, num_clusters, slast - spos);
/* Punch out the dest range. */
pstart = ocfs2_clusters_to_bytes(t_inode->i_sb, tpos);
plen = ocfs2_clusters_to_bytes(t_inode->i_sb, num_clusters);
ret = ocfs2_remove_inode_range(t_inode, t_bh, pstart, plen);
if (ret) {
mlog_errno(ret);
goto out;
}
if (p_cluster == 0)
goto next_loop;
/* Lock the refcount btree... */
ret = ocfs2_lock_refcount_tree(osb,
le64_to_cpu(dis->i_refcount_loc),
1, &ref_tree, &ref_root_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
/* Mark s_inode's extent as refcounted. */
if (!(ext_flags & OCFS2_EXT_REFCOUNTED)) {
ret = ocfs2_add_refcount_flag(s_inode, &s_et,
&ref_tree->rf_ci,
ref_root_bh, spos,
p_cluster, num_clusters,
dealloc, NULL);
if (ret) {
mlog_errno(ret);
goto out_unlock_refcount;
}
}
/* Map in the new extent. */
ext_flags |= OCFS2_EXT_REFCOUNTED;
ret = ocfs2_add_refcounted_extent(t_inode, &t_et,
&ref_tree->rf_ci,
ref_root_bh,
tpos, p_cluster,
num_clusters,
ext_flags,
dealloc);
if (ret) {
mlog_errno(ret);
goto out_unlock_refcount;
}
ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
brelse(ref_root_bh);
next_loop:
spos += num_clusters;
tpos += num_clusters;
remapped_clus += num_clusters;
}
goto out;
out_unlock_refcount:
ocfs2_unlock_refcount_tree(osb, ref_tree, 1);
brelse(ref_root_bh);
out:
remapped_bytes = ocfs2_clusters_to_bytes(t_inode->i_sb, remapped_clus);
remapped_bytes = min_t(loff_t, len, remapped_bytes);
return remapped_bytes > 0 ? remapped_bytes : ret;
}
/* Set up refcount tree and remap s_inode to t_inode. */
loff_t ocfs2_reflink_remap_blocks(struct inode *s_inode,
struct buffer_head *s_bh,
loff_t pos_in,
struct inode *t_inode,
struct buffer_head *t_bh,
loff_t pos_out,
loff_t len)
{
struct ocfs2_cached_dealloc_ctxt dealloc;
struct ocfs2_super *osb;
struct ocfs2_dinode *dis;
struct ocfs2_dinode *dit;
loff_t ret;
osb = OCFS2_SB(s_inode->i_sb);
dis = (struct ocfs2_dinode *)s_bh->b_data;
dit = (struct ocfs2_dinode *)t_bh->b_data;
ocfs2_init_dealloc_ctxt(&dealloc);
/*
* If we're reflinking the entire file and the source is inline
* data, just copy the contents.
*/
if (pos_in == pos_out && pos_in == 0 && len == i_size_read(s_inode) &&
i_size_read(t_inode) <= len &&
(OCFS2_I(s_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL)) {
ret = ocfs2_duplicate_inline_data(s_inode, s_bh, t_inode, t_bh);
if (ret)
mlog_errno(ret);
goto out;
}
/*
* If both inodes belong to two different refcount groups then
* forget it because we don't know how (or want) to go merging
* refcount trees.
*/
ret = -EOPNOTSUPP;
if (ocfs2_is_refcount_inode(s_inode) &&
ocfs2_is_refcount_inode(t_inode) &&
le64_to_cpu(dis->i_refcount_loc) !=
le64_to_cpu(dit->i_refcount_loc))
goto out;
/* Neither inode has a refcount tree. Add one to s_inode. */
if (!ocfs2_is_refcount_inode(s_inode) &&
!ocfs2_is_refcount_inode(t_inode)) {
ret = ocfs2_create_refcount_tree(s_inode, s_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
}
/* Ensure that both inodes end up with the same refcount tree. */
if (!ocfs2_is_refcount_inode(s_inode)) {
ret = ocfs2_set_refcount_tree(s_inode, s_bh,
le64_to_cpu(dit->i_refcount_loc));
if (ret) {
mlog_errno(ret);
goto out;
}
}
if (!ocfs2_is_refcount_inode(t_inode)) {
ret = ocfs2_set_refcount_tree(t_inode, t_bh,
le64_to_cpu(dis->i_refcount_loc));
if (ret) {
mlog_errno(ret);
goto out;
}
}
/* Turn off inline data in the dest file. */
if (OCFS2_I(t_inode)->ip_dyn_features & OCFS2_INLINE_DATA_FL) {
ret = ocfs2_convert_inline_data_to_extents(t_inode, t_bh);
if (ret) {
mlog_errno(ret);
goto out;
}
}
/* Actually remap extents now. */
ret = ocfs2_reflink_remap_extent(s_inode, s_bh, pos_in, t_inode, t_bh,
pos_out, len, &dealloc);
if (ret < 0) {
mlog_errno(ret);
goto out;
}
out:
if (ocfs2_dealloc_has_cluster(&dealloc)) {
ocfs2_schedule_truncate_log_flush(osb, 1);
ocfs2_run_deallocs(osb, &dealloc);
}
return ret;
}
/* Lock an inode and grab a bh pointing to the inode. */
int ocfs2_reflink_inodes_lock(struct inode *s_inode,
struct buffer_head **bh_s,
struct inode *t_inode,
struct buffer_head **bh_t)
{
struct inode *inode1 = s_inode;
struct inode *inode2 = t_inode;
struct ocfs2_inode_info *oi1;
struct ocfs2_inode_info *oi2;
struct buffer_head *bh1 = NULL;
struct buffer_head *bh2 = NULL;
bool same_inode = (s_inode == t_inode);
bool need_swap = (inode1->i_ino > inode2->i_ino);
int status;
/* First grab the VFS and rw locks. */
lock_two_nondirectories(s_inode, t_inode);
if (need_swap)
swap(inode1, inode2);
status = ocfs2_rw_lock(inode1, 1);
if (status) {
mlog_errno(status);
goto out_i1;
}
if (!same_inode) {
status = ocfs2_rw_lock(inode2, 1);
if (status) {
mlog_errno(status);
goto out_i2;
}
}
/* Now go for the cluster locks */
oi1 = OCFS2_I(inode1);
oi2 = OCFS2_I(inode2);
trace_ocfs2_double_lock((unsigned long long)oi1->ip_blkno,
(unsigned long long)oi2->ip_blkno);
/* We always want to lock the one with the lower lockid first. */
if (oi1->ip_blkno > oi2->ip_blkno)
mlog_errno(-ENOLCK);
/* lock id1 */
status = ocfs2_inode_lock_nested(inode1, &bh1, 1,
OI_LS_REFLINK_TARGET);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
goto out_rw2;
}
/* lock id2 */
if (!same_inode) {
status = ocfs2_inode_lock_nested(inode2, &bh2, 1,
OI_LS_REFLINK_TARGET);
if (status < 0) {
if (status != -ENOENT)
mlog_errno(status);
goto out_cl1;
}
} else {
bh2 = bh1;
}
/*
* If we swapped inode order above, we have to swap the buffer heads
* before passing them back to the caller.
*/
if (need_swap)
swap(bh1, bh2);
*bh_s = bh1;
*bh_t = bh2;
trace_ocfs2_double_lock_end(
(unsigned long long)oi1->ip_blkno,
(unsigned long long)oi2->ip_blkno);
return 0;
out_cl1:
ocfs2_inode_unlock(inode1, 1);
brelse(bh1);
out_rw2:
ocfs2_rw_unlock(inode2, 1);
out_i2:
ocfs2_rw_unlock(inode1, 1);
out_i1:
unlock_two_nondirectories(s_inode, t_inode);
return status;
}
/* Unlock both inodes and release buffers. */
void ocfs2_reflink_inodes_unlock(struct inode *s_inode,
struct buffer_head *s_bh,
struct inode *t_inode,
struct buffer_head *t_bh)
{
ocfs2_inode_unlock(s_inode, 1);
ocfs2_rw_unlock(s_inode, 1);
brelse(s_bh);
if (s_inode != t_inode) {
ocfs2_inode_unlock(t_inode, 1);
ocfs2_rw_unlock(t_inode, 1);
brelse(t_bh);
}
unlock_two_nondirectories(s_inode, t_inode);
}