for-5.19-tag

-----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEE8rQSAMVO+zA4DBdWxWXV+ddtWDsFAmKLxJAACgkQxWXV+ddt
 WDvC4BAAnSNwZ15FJKe5Y423f6PS6EXjyMuc5t/fW6UumTTbI+tsS+Glkis+JNBf
 BiDZSlVQmiK9WoQSJe04epZgHaK8MaCARyZaRaxjDC4Nvfq4DlD9mbAU9D6e7tZY
 Mo8M99D8wDW+SB+P8RBpNjwB/oGCMmE3nKC83g+1ObmA0FVRCyQ1Kazf8RzNT1rZ
 DiaJoKTvU1/wDN3/1rw5yG+EfW2m9A14gRCihslhFYaDV7jhpuabl8wLT7MftZtE
 MtJ6EOOQbgIDjnp5BEIrPmowW/N0tKDT/gorF7cWgLG2R1cbSlKgqSH1Sq7CjFUE
 AKj/DwfqZArPLpqMThWklCwy2B9qDEezrQSy7renP/vkeFLbOp8hQuIY5KRzohdG
 oDI8ThlQGtCVjbny6NX/BbCnWRAfTz0TquCgag3Xl8NbkRFgFJtkf/cSxzb+3LW1
 tFeiUyTVLXVDS1cZLwgcb29Rrtp4bjd5/v3uECQlVD+or5pcAqSMkQgOBlyQJGbE
 Xb0nmPRihzQ8D4vINa63WwRyq0+QczVjvBxKj1daas0VEKGd32PIBS/0Qha+EpGl
 uFMiHBMSfqyl8QcShFk0cCbcgPMcNc7I6IAbXCE/WhhFG0ytqm9vpmlLqsTrXmHH
 z7/Eye/waqgACNEXoA8C4pyYzduQ4i1CeLDOdcsvBU6XQSuicSM=
 =lv6P
 -----END PGP SIGNATURE-----

Merge tag 'for-5.19-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs updates from David Sterba:
 "Features:

   - subpage:
      - support for PAGE_SIZE > 4K (previously only 64K)
      - make it work with raid56

   - repair super block num_devices automatically if it does not match
     the number of device items

   - defrag can convert inline extents to regular extents; until now,
     inline files were skipped, though the mount option max_inline could
     affect the decision logic

   - zoned:
      - minimal accepted zone size is explicitly set to 4MiB
      - make zone reclaim less aggressive and don't reclaim if there are
        enough free zones
      - add per-profile sysfs tunable of the reclaim threshold

   - allow automatic block group reclaim for non-zoned filesystems, with
     sysfs tunables

   - tree-checker: new check, compare extent buffer owner against owner
     rootid

  Performance:

   - avoid blocking on space reservation when doing nowait direct io
     writes (+7% throughput for reads and writes)

   - NOCOW write throughput improvement due to refined locking (+3%)

   - send: reduce pressure on the page cache by dropping extent pages
     right after they're processed

  Core:

   - convert all radix trees to xarray

   - add iterators for b-tree node items

   - support printk message index

   - use bulk page allocation for extent buffers

   - switch to bio_alloc API, use on-stack bios where convenient, other
     bio cleanups

   - use rw lock for block groups to favor concurrent reads

   - simplify workqueues, don't allocate high priority threads for all
     normal queues as we need only one

   - refactor scrub, process chunks based on their constraints and
     similarity

   - allocate direct io structures on stack and pass around only
     pointers, avoids allocation and reduces potential error handling

  Fixes:

   - fix count of reserved transaction items for various inode
     operations

   - fix deadlock between concurrent dio writes when low on free data
     space

   - fix a few cases when zones need to be finished

  VFS, iomap:

   - add helper to check if sb write has started (usable for assertions)

   - new helper iomap_dio_alloc_bio, export iomap_dio_bio_end_io"

* tag 'for-5.19-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (173 commits)
  btrfs: zoned: introduce a minimal zone size 4M and reject mount
  btrfs: allow defrag to convert inline extents to regular extents
  btrfs: add "0x" prefix for unsupported optional features
  btrfs: do not account twice for inode ref when reserving metadata units
  btrfs: zoned: fix comparison of alloc_offset vs meta_write_pointer
  btrfs: send: avoid trashing the page cache
  btrfs: send: keep the current inode open while processing it
  btrfs: allocate the btrfs_dio_private as part of the iomap dio bio
  btrfs: move struct btrfs_dio_private to inode.c
  btrfs: remove the disk_bytenr in struct btrfs_dio_private
  btrfs: allocate dio_data on stack
  iomap: add per-iomap_iter private data
  iomap: allow the file system to provide a bio_set for direct I/O
  btrfs: add a btrfs_dio_rw wrapper
  btrfs: zoned: zone finish unused block group
  btrfs: zoned: properly finish block group on metadata write
  btrfs: zoned: finish block group when there are no more allocatable bytes left
  btrfs: zoned: consolidate zone finish functions
  btrfs: zoned: introduce btrfs_zoned_bg_is_full
  btrfs: improve error reporting in lookup_inline_extent_backref
  ...
Linus Torvalds, 2022-05-24 18:52:35 -07:00
commit bd1b7c1384
67 changed files with 4500 additions and 4239 deletions

fs/btrfs/acl.c

@ -55,9 +55,8 @@ struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu)
return acl;
}
static int __btrfs_set_acl(struct btrfs_trans_handle *trans,
struct user_namespace *mnt_userns,
struct inode *inode, struct posix_acl *acl, int type)
int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
struct posix_acl *acl, int type)
{
int ret, size = 0;
const char *name;
@ -123,40 +122,8 @@ int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
if (ret)
return ret;
}
ret = __btrfs_set_acl(NULL, mnt_userns, inode, acl, type);
ret = __btrfs_set_acl(NULL, inode, acl, type);
if (ret)
inode->i_mode = old_mode;
return ret;
}
int btrfs_init_acl(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *dir)
{
struct posix_acl *default_acl, *acl;
int ret = 0;
/* this happens with subvols */
if (!dir)
return 0;
ret = posix_acl_create(dir, &inode->i_mode, &default_acl, &acl);
if (ret)
return ret;
if (default_acl) {
ret = __btrfs_set_acl(trans, &init_user_ns, inode, default_acl,
ACL_TYPE_DEFAULT);
posix_acl_release(default_acl);
}
if (acl) {
if (!ret)
ret = __btrfs_set_acl(trans, &init_user_ns, inode, acl,
ACL_TYPE_ACCESS);
posix_acl_release(acl);
}
if (!default_acl && !acl)
cache_no_acl(inode);
return ret;
}

fs/btrfs/async-thread.c

@ -15,13 +15,12 @@
enum {
WORK_DONE_BIT,
WORK_ORDER_DONE_BIT,
WORK_HIGH_PRIO_BIT,
};
#define NO_THRESHOLD (-1)
#define DFT_THRESHOLD (32)
struct __btrfs_workqueue {
struct btrfs_workqueue {
struct workqueue_struct *normal_wq;
/* File system this workqueue services */
@ -48,12 +47,7 @@ struct __btrfs_workqueue {
spinlock_t thres_lock;
};
struct btrfs_workqueue {
struct __btrfs_workqueue *normal;
struct __btrfs_workqueue *high;
};
struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq)
struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct btrfs_workqueue *wq)
{
return wq->fs_info;
}
@ -66,22 +60,22 @@ struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work)
bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq)
{
/*
* We could compare wq->normal->pending with num_online_cpus()
* We could compare wq->pending with num_online_cpus()
* to support "thresh == NO_THRESHOLD" case, but it requires
* moving up atomic_inc/dec in thresh_queue/exec_hook. Let's
* postpone it until someone needs the support of that case.
*/
if (wq->normal->thresh == NO_THRESHOLD)
if (wq->thresh == NO_THRESHOLD)
return false;
return atomic_read(&wq->normal->pending) > wq->normal->thresh * 2;
return atomic_read(&wq->pending) > wq->thresh * 2;
}
static struct __btrfs_workqueue *
__btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
unsigned int flags, int limit_active, int thresh)
struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
const char *name, unsigned int flags,
int limit_active, int thresh)
{
struct __btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
if (!ret)
return NULL;
@ -105,12 +99,8 @@ __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
ret->thresh = thresh;
}
if (flags & WQ_HIGHPRI)
ret->normal_wq = alloc_workqueue("btrfs-%s-high", flags,
ret->current_active, name);
else
ret->normal_wq = alloc_workqueue("btrfs-%s", flags,
ret->current_active, name);
ret->normal_wq = alloc_workqueue("btrfs-%s", flags, ret->current_active,
name);
if (!ret->normal_wq) {
kfree(ret);
return NULL;
@ -119,41 +109,7 @@ __btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info, const char *name,
INIT_LIST_HEAD(&ret->ordered_list);
spin_lock_init(&ret->list_lock);
spin_lock_init(&ret->thres_lock);
trace_btrfs_workqueue_alloc(ret, name, flags & WQ_HIGHPRI);
return ret;
}
static inline void
__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq);
struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
const char *name,
unsigned int flags,
int limit_active,
int thresh)
{
struct btrfs_workqueue *ret = kzalloc(sizeof(*ret), GFP_KERNEL);
if (!ret)
return NULL;
ret->normal = __btrfs_alloc_workqueue(fs_info, name,
flags & ~WQ_HIGHPRI,
limit_active, thresh);
if (!ret->normal) {
kfree(ret);
return NULL;
}
if (flags & WQ_HIGHPRI) {
ret->high = __btrfs_alloc_workqueue(fs_info, name, flags,
limit_active, thresh);
if (!ret->high) {
__btrfs_destroy_workqueue(ret->normal);
kfree(ret);
return NULL;
}
}
trace_btrfs_workqueue_alloc(ret, name);
return ret;
}
@ -162,7 +118,7 @@ struct btrfs_workqueue *btrfs_alloc_workqueue(struct btrfs_fs_info *fs_info,
* This hook WILL be called in IRQ handler context,
* so workqueue_set_max_active MUST NOT be called in this hook
*/
static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
static inline void thresh_queue_hook(struct btrfs_workqueue *wq)
{
if (wq->thresh == NO_THRESHOLD)
return;
@ -174,7 +130,7 @@ static inline void thresh_queue_hook(struct __btrfs_workqueue *wq)
* This hook is called in kthread context.
* So workqueue_set_max_active is called here.
*/
static inline void thresh_exec_hook(struct __btrfs_workqueue *wq)
static inline void thresh_exec_hook(struct btrfs_workqueue *wq)
{
int new_current_active;
long pending;
@ -217,7 +173,7 @@ out:
}
}
static void run_ordered_work(struct __btrfs_workqueue *wq,
static void run_ordered_work(struct btrfs_workqueue *wq,
struct btrfs_work *self)
{
struct list_head *list = &wq->ordered_list;
@ -305,7 +261,7 @@ static void btrfs_work_helper(struct work_struct *normal_work)
{
struct btrfs_work *work = container_of(normal_work, struct btrfs_work,
normal_work);
struct __btrfs_workqueue *wq;
struct btrfs_workqueue *wq = work->wq;
int need_order = 0;
/*
@ -318,7 +274,6 @@ static void btrfs_work_helper(struct work_struct *normal_work)
*/
if (work->ordered_func)
need_order = 1;
wq = work->wq;
trace_btrfs_work_sched(work);
thresh_exec_hook(wq);
@ -350,8 +305,7 @@ void btrfs_init_work(struct btrfs_work *work, btrfs_func_t func,
work->flags = 0;
}
static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
struct btrfs_work *work)
void btrfs_queue_work(struct btrfs_workqueue *wq, struct btrfs_work *work)
{
unsigned long flags;
@ -366,54 +320,22 @@ static inline void __btrfs_queue_work(struct __btrfs_workqueue *wq,
queue_work(wq->normal_wq, &work->normal_work);
}
void btrfs_queue_work(struct btrfs_workqueue *wq,
struct btrfs_work *work)
{
struct __btrfs_workqueue *dest_wq;
if (test_bit(WORK_HIGH_PRIO_BIT, &work->flags) && wq->high)
dest_wq = wq->high;
else
dest_wq = wq->normal;
__btrfs_queue_work(dest_wq, work);
}
static inline void
__btrfs_destroy_workqueue(struct __btrfs_workqueue *wq)
void btrfs_destroy_workqueue(struct btrfs_workqueue *wq)
{
if (!wq)
return;
destroy_workqueue(wq->normal_wq);
trace_btrfs_workqueue_destroy(wq);
kfree(wq);
}
void btrfs_destroy_workqueue(struct btrfs_workqueue *wq)
{
if (!wq)
return;
if (wq->high)
__btrfs_destroy_workqueue(wq->high);
__btrfs_destroy_workqueue(wq->normal);
kfree(wq);
}
void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int limit_active)
{
if (!wq)
return;
wq->normal->limit_active = limit_active;
if (wq->high)
wq->high->limit_active = limit_active;
}
void btrfs_set_work_high_priority(struct btrfs_work *work)
{
set_bit(WORK_HIGH_PRIO_BIT, &work->flags);
if (wq)
wq->limit_active = limit_active;
}
void btrfs_flush_workqueue(struct btrfs_workqueue *wq)
{
if (wq->high)
flush_workqueue(wq->high->normal_wq);
flush_workqueue(wq->normal->normal_wq);
flush_workqueue(wq->normal_wq);
}

fs/btrfs/async-thread.h

@ -11,8 +11,6 @@
struct btrfs_fs_info;
struct btrfs_workqueue;
/* Internal use only */
struct __btrfs_workqueue;
struct btrfs_work;
typedef void (*btrfs_func_t)(struct btrfs_work *arg);
typedef void (*btrfs_work_func_t)(struct work_struct *arg);
@ -25,7 +23,7 @@ struct btrfs_work {
/* Don't touch things below */
struct work_struct normal_work;
struct list_head ordered_list;
struct __btrfs_workqueue *wq;
struct btrfs_workqueue *wq;
unsigned long flags;
};
@ -40,9 +38,8 @@ void btrfs_queue_work(struct btrfs_workqueue *wq,
struct btrfs_work *work);
void btrfs_destroy_workqueue(struct btrfs_workqueue *wq);
void btrfs_workqueue_set_max(struct btrfs_workqueue *wq, int max);
void btrfs_set_work_high_priority(struct btrfs_work *work);
struct btrfs_fs_info * __pure btrfs_work_owner(const struct btrfs_work *work);
struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct __btrfs_workqueue *wq);
struct btrfs_fs_info * __pure btrfs_workqueue_owner(const struct btrfs_workqueue *wq);
bool btrfs_workqueue_normal_congested(const struct btrfs_workqueue *wq);
void btrfs_flush_workqueue(struct btrfs_workqueue *wq);
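
To make the effect of the simplification concrete, here is a minimal caller-side sketch of the single-level API declared above. fs_info, example_work, the "example" name, and the limit/threshold values are placeholders, not taken from this patch set:

	struct btrfs_workqueue *wq;

	/* One plain workqueue; the separate high-priority sub-queue is gone. */
	wq = btrfs_alloc_workqueue(fs_info, "example", WQ_UNBOUND, 8, 0);
	if (!wq)
		return -ENOMEM;

	/* example_work is a struct btrfs_work set up earlier via btrfs_init_work(). */
	btrfs_queue_work(wq, &example_work);

	btrfs_flush_workqueue(wq);
	btrfs_destroy_workqueue(wq);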

fs/btrfs/block-group.c

@ -168,11 +168,12 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
struct rb_node **p;
struct rb_node *parent = NULL;
struct btrfs_block_group *cache;
bool leftmost = true;
ASSERT(block_group->length != 0);
spin_lock(&info->block_group_cache_lock);
p = &info->block_group_cache_tree.rb_node;
write_lock(&info->block_group_cache_lock);
p = &info->block_group_cache_tree.rb_root.rb_node;
while (*p) {
parent = *p;
@ -181,20 +182,18 @@ static int btrfs_add_block_group_cache(struct btrfs_fs_info *info,
p = &(*p)->rb_left;
} else if (block_group->start > cache->start) {
p = &(*p)->rb_right;
leftmost = false;
} else {
spin_unlock(&info->block_group_cache_lock);
write_unlock(&info->block_group_cache_lock);
return -EEXIST;
}
}
rb_link_node(&block_group->cache_node, parent, p);
rb_insert_color(&block_group->cache_node,
&info->block_group_cache_tree);
rb_insert_color_cached(&block_group->cache_node,
&info->block_group_cache_tree, leftmost);
if (info->first_logical_byte > block_group->start)
info->first_logical_byte = block_group->start;
spin_unlock(&info->block_group_cache_lock);
write_unlock(&info->block_group_cache_lock);
return 0;
}
@ -210,8 +209,8 @@ static struct btrfs_block_group *block_group_cache_tree_search(
struct rb_node *n;
u64 end, start;
spin_lock(&info->block_group_cache_lock);
n = info->block_group_cache_tree.rb_node;
read_lock(&info->block_group_cache_lock);
n = info->block_group_cache_tree.rb_root.rb_node;
while (n) {
cache = rb_entry(n, struct btrfs_block_group, cache_node);
@ -233,12 +232,9 @@ static struct btrfs_block_group *block_group_cache_tree_search(
break;
}
}
if (ret) {
if (ret)
btrfs_get_block_group(ret);
if (bytenr == 0 && info->first_logical_byte > ret->start)
info->first_logical_byte = ret->start;
}
spin_unlock(&info->block_group_cache_lock);
read_unlock(&info->block_group_cache_lock);
return ret;
}
@ -267,15 +263,15 @@ struct btrfs_block_group *btrfs_next_block_group(
struct btrfs_fs_info *fs_info = cache->fs_info;
struct rb_node *node;
spin_lock(&fs_info->block_group_cache_lock);
read_lock(&fs_info->block_group_cache_lock);
/* If our block group was removed, we need a full search. */
if (RB_EMPTY_NODE(&cache->cache_node)) {
const u64 next_bytenr = cache->start + cache->length;
spin_unlock(&fs_info->block_group_cache_lock);
read_unlock(&fs_info->block_group_cache_lock);
btrfs_put_block_group(cache);
cache = btrfs_lookup_first_block_group(fs_info, next_bytenr);
return cache;
return btrfs_lookup_first_block_group(fs_info, next_bytenr);
}
node = rb_next(&cache->cache_node);
btrfs_put_block_group(cache);
@ -284,46 +280,70 @@ struct btrfs_block_group *btrfs_next_block_group(
btrfs_get_block_group(cache);
} else
cache = NULL;
spin_unlock(&fs_info->block_group_cache_lock);
read_unlock(&fs_info->block_group_cache_lock);
return cache;
}
bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
/**
* Check if we can do a NOCOW write for a given extent.
*
* @fs_info: The filesystem information object.
* @bytenr: Logical start address of the extent.
*
* Check if we can do a NOCOW write for the given extent, and increments the
* number of NOCOW writers in the block group that contains the extent, as long
* as the block group exists and it's currently not in read-only mode.
*
* Returns: A non-NULL block group pointer if we can do a NOCOW write, the caller
* is responsible for calling btrfs_dec_nocow_writers() later.
*
* Or NULL if we can not do a NOCOW write
*/
struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
u64 bytenr)
{
struct btrfs_block_group *bg;
bool ret = true;
bool can_nocow = true;
bg = btrfs_lookup_block_group(fs_info, bytenr);
if (!bg)
return false;
return NULL;
spin_lock(&bg->lock);
if (bg->ro)
ret = false;
can_nocow = false;
else
atomic_inc(&bg->nocow_writers);
spin_unlock(&bg->lock);
/* No put on block group, done by btrfs_dec_nocow_writers */
if (!ret)
if (!can_nocow) {
btrfs_put_block_group(bg);
return NULL;
}
return ret;
/* No put on block group, done by btrfs_dec_nocow_writers(). */
return bg;
}
void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr)
/**
* Decrement the number of NOCOW writers in a block group.
*
* @bg: The block group.
*
* This is meant to be called after a previous call to btrfs_inc_nocow_writers(),
* and on the block group returned by that call. Typically this is called after
* creating an ordered extent for a NOCOW write, to prevent races with scrub and
* relocation.
*
* After this call, the caller should not use the block group anymore. If it wants
* to use it, then it should get a reference on it before calling this function.
*/
void btrfs_dec_nocow_writers(struct btrfs_block_group *bg)
{
struct btrfs_block_group *bg;
bg = btrfs_lookup_block_group(fs_info, bytenr);
ASSERT(bg);
if (atomic_dec_and_test(&bg->nocow_writers))
wake_up_var(&bg->nocow_writers);
/*
* Once for our lookup and once for the lookup done by a previous call
* to btrfs_inc_nocow_writers()
*/
btrfs_put_block_group(bg);
/* For the lookup done by a previous call to btrfs_inc_nocow_writers(). */
btrfs_put_block_group(bg);
}
@ -772,10 +792,10 @@ int btrfs_cache_block_group(struct btrfs_block_group *cache, int load_cache_only
cache->has_caching_ctl = 1;
spin_unlock(&cache->lock);
spin_lock(&fs_info->block_group_cache_lock);
write_lock(&fs_info->block_group_cache_lock);
refcount_inc(&caching_ctl->count);
list_add_tail(&caching_ctl->list, &fs_info->caching_block_groups);
spin_unlock(&fs_info->block_group_cache_lock);
write_unlock(&fs_info->block_group_cache_lock);
btrfs_get_block_group(cache);
@ -957,17 +977,15 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
if (ret)
goto out;
spin_lock(&fs_info->block_group_cache_lock);
rb_erase(&block_group->cache_node,
&fs_info->block_group_cache_tree);
write_lock(&fs_info->block_group_cache_lock);
rb_erase_cached(&block_group->cache_node,
&fs_info->block_group_cache_tree);
RB_CLEAR_NODE(&block_group->cache_node);
/* Once for the block groups rbtree */
btrfs_put_block_group(block_group);
if (fs_info->first_logical_byte == block_group->start)
fs_info->first_logical_byte = (u64)-1;
spin_unlock(&fs_info->block_group_cache_lock);
write_unlock(&fs_info->block_group_cache_lock);
down_write(&block_group->space_info->groups_sem);
/*
@ -992,7 +1010,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
if (block_group->cached == BTRFS_CACHE_STARTED)
btrfs_wait_block_group_cache_done(block_group);
if (block_group->has_caching_ctl) {
spin_lock(&fs_info->block_group_cache_lock);
write_lock(&fs_info->block_group_cache_lock);
if (!caching_ctl) {
struct btrfs_caching_control *ctl;
@ -1006,7 +1024,7 @@ int btrfs_remove_block_group(struct btrfs_trans_handle *trans,
}
if (caching_ctl)
list_del_init(&caching_ctl->list);
spin_unlock(&fs_info->block_group_cache_lock);
write_unlock(&fs_info->block_group_cache_lock);
if (caching_ctl) {
/* Once for the caching bgs list and once for us. */
btrfs_put_caching_control(caching_ctl);
@ -1367,6 +1385,14 @@ void btrfs_delete_unused_bgs(struct btrfs_fs_info *fs_info)
goto next;
}
ret = btrfs_zone_finish(block_group);
if (ret < 0) {
btrfs_dec_block_group_ro(block_group);
if (ret == -EAGAIN)
ret = 0;
goto next;
}
/*
* Want to do this before we do anything else so we can recover
* properly if we fail to join the transaction.
@ -1512,6 +1538,13 @@ static int reclaim_bgs_cmp(void *unused, const struct list_head *a,
return bg1->used > bg2->used;
}
static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
{
if (btrfs_is_zoned(fs_info))
return btrfs_zoned_should_reclaim(fs_info);
return true;
}
void btrfs_reclaim_bgs_work(struct work_struct *work)
{
struct btrfs_fs_info *fs_info =
@ -1522,6 +1555,9 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
if (!test_bit(BTRFS_FS_OPEN, &fs_info->flags))
return;
if (!btrfs_should_reclaim(fs_info))
return;
sb_start_write(fs_info->sb);
if (!btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
@ -1692,35 +1728,13 @@ static int find_first_block_group(struct btrfs_fs_info *fs_info,
struct btrfs_root *root = btrfs_block_group_root(fs_info);
int ret;
struct btrfs_key found_key;
struct extent_buffer *leaf;
int slot;
ret = btrfs_search_slot(NULL, root, key, path, 0, 0);
if (ret < 0)
return ret;
while (1) {
slot = path->slots[0];
leaf = path->nodes[0];
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret == 0)
continue;
if (ret < 0)
goto out;
break;
}
btrfs_item_key_to_cpu(leaf, &found_key, slot);
btrfs_for_each_slot(root, key, &found_key, path, ret) {
if (found_key.objectid >= key->objectid &&
found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY) {
ret = read_bg_from_eb(fs_info, &found_key, path);
break;
return read_bg_from_eb(fs_info, &found_key, path);
}
path->slots[0]++;
}
out:
return ret;
}
@ -3220,6 +3234,31 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
return ret;
}
static inline bool should_reclaim_block_group(struct btrfs_block_group *bg,
u64 bytes_freed)
{
const struct btrfs_space_info *space_info = bg->space_info;
const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
const u64 new_val = bg->used;
const u64 old_val = new_val + bytes_freed;
u64 thresh;
if (reclaim_thresh == 0)
return false;
thresh = div_factor_fine(bg->length, reclaim_thresh);
/*
* If we were below the threshold before don't reclaim, we are likely a
* brand new block group and we don't want to relocate new block groups.
*/
if (old_val < thresh)
return false;
if (new_val >= thresh)
return false;
return true;
}
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, bool alloc)
{
@ -3242,6 +3281,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
spin_unlock(&info->delalloc_root_lock);
while (total) {
bool reclaim;
cache = btrfs_lookup_block_group(info, bytenr);
if (!cache) {
ret = -ENOENT;
@ -3287,6 +3328,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
cache->space_info, num_bytes);
cache->space_info->bytes_used -= num_bytes;
cache->space_info->disk_used -= num_bytes * factor;
reclaim = should_reclaim_block_group(cache, num_bytes);
spin_unlock(&cache->lock);
spin_unlock(&cache->space_info->lock);
@ -3313,6 +3356,8 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
if (!alloc && old_val == 0) {
if (!btrfs_test_opt(info, DISCARD_ASYNC))
btrfs_mark_bg_unused(cache);
} else if (!alloc && reclaim) {
btrfs_mark_bg_to_reclaim(cache);
}
btrfs_put_block_group(cache);
@ -3957,14 +4002,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
struct btrfs_caching_control *caching_ctl;
struct rb_node *n;
spin_lock(&info->block_group_cache_lock);
write_lock(&info->block_group_cache_lock);
while (!list_empty(&info->caching_block_groups)) {
caching_ctl = list_entry(info->caching_block_groups.next,
struct btrfs_caching_control, list);
list_del(&caching_ctl->list);
btrfs_put_caching_control(caching_ctl);
}
spin_unlock(&info->block_group_cache_lock);
write_unlock(&info->block_group_cache_lock);
spin_lock(&info->unused_bgs_lock);
while (!list_empty(&info->unused_bgs)) {
@ -3994,14 +4039,14 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
}
spin_unlock(&info->zone_active_bgs_lock);
spin_lock(&info->block_group_cache_lock);
while ((n = rb_last(&info->block_group_cache_tree)) != NULL) {
write_lock(&info->block_group_cache_lock);
while ((n = rb_last(&info->block_group_cache_tree.rb_root)) != NULL) {
block_group = rb_entry(n, struct btrfs_block_group,
cache_node);
rb_erase(&block_group->cache_node,
&info->block_group_cache_tree);
rb_erase_cached(&block_group->cache_node,
&info->block_group_cache_tree);
RB_CLEAR_NODE(&block_group->cache_node);
spin_unlock(&info->block_group_cache_lock);
write_unlock(&info->block_group_cache_lock);
down_write(&block_group->space_info->groups_sem);
list_del(&block_group->list);
@ -4024,9 +4069,9 @@ int btrfs_free_block_groups(struct btrfs_fs_info *info)
ASSERT(block_group->swap_extents == 0);
btrfs_put_block_group(block_group);
spin_lock(&info->block_group_cache_lock);
write_lock(&info->block_group_cache_lock);
}
spin_unlock(&info->block_group_cache_lock);
write_unlock(&info->block_group_cache_lock);
btrfs_release_global_block_rsv(info);

fs/btrfs/block-group.h

@ -212,6 +212,8 @@ struct btrfs_block_group {
u64 meta_write_pointer;
struct map_lookup *physical_map;
struct list_head active_bg_list;
struct work_struct zone_finish_work;
struct extent_buffer *last_eb;
};
static inline u64 btrfs_block_group_end(struct btrfs_block_group *block_group)
@ -254,8 +256,9 @@ void btrfs_put_block_group(struct btrfs_block_group *cache);
void btrfs_dec_block_group_reservations(struct btrfs_fs_info *fs_info,
const u64 start);
void btrfs_wait_block_group_reservations(struct btrfs_block_group *bg);
bool btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
void btrfs_dec_nocow_writers(struct btrfs_fs_info *fs_info, u64 bytenr);
struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
u64 bytenr);
void btrfs_dec_nocow_writers(struct btrfs_block_group *bg);
void btrfs_wait_nocow_writers(struct btrfs_block_group *bg);
void btrfs_wait_block_group_cache_progress(struct btrfs_block_group *cache,
u64 num_bytes);
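
A short usage sketch of the reworked NOCOW writer accounting declared above; fs_info and disk_bytenr stand in for a real caller's values, and the fallback action in the NULL case is illustrative:

	struct btrfs_block_group *bg;

	bg = btrfs_inc_nocow_writers(fs_info, disk_bytenr);
	if (!bg) {
		/* No block group, or it is read-only: fall back to a COW write. */
		return false;
	}

	/* ... create the ordered extent for the NOCOW write ... */

	/* Drops the writer count and the reference taken by btrfs_inc_nocow_writers(). */
	btrfs_dec_nocow_writers(bg);
	return true;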

fs/btrfs/btrfs_inode.h

@ -395,31 +395,6 @@ static inline bool btrfs_inode_can_compress(const struct btrfs_inode *inode)
return true;
}
struct btrfs_dio_private {
struct inode *inode;
/*
* Since DIO can use anonymous page, we cannot use page_offset() to
* grab the file offset, thus need a dedicated member for file offset.
*/
u64 file_offset;
u64 disk_bytenr;
/* Used for bio::bi_size */
u32 bytes;
/*
* References to this structure. There is one reference per in-flight
* bio plus one while we're still setting up.
*/
refcount_t refs;
/* dio_bio came from fs/direct-io.c */
struct bio *dio_bio;
/* Array of checksums */
u8 csums[];
};
/*
* btrfs_inode_item stores flags in a u64, btrfs_inode stores them in two
* separate u32s. These two functions convert between the two representations.

fs/btrfs/check-integrity.c

@ -1552,21 +1552,18 @@ static int btrfsic_read_block(struct btrfsic_state *state,
return -ENOMEM;
block_ctx->datav = block_ctx->mem_to_free;
block_ctx->pagev = (struct page **)(block_ctx->datav + num_pages);
for (i = 0; i < num_pages; i++) {
block_ctx->pagev[i] = alloc_page(GFP_NOFS);
if (!block_ctx->pagev[i])
return -1;
}
ret = btrfs_alloc_page_array(num_pages, block_ctx->pagev);
if (ret)
return ret;
dev_bytenr = block_ctx->dev_bytenr;
for (i = 0; i < num_pages;) {
struct bio *bio;
unsigned int j;
bio = btrfs_bio_alloc(num_pages - i);
bio_set_dev(bio, block_ctx->dev->bdev);
bio = bio_alloc(block_ctx->dev->bdev, num_pages - i,
REQ_OP_READ, GFP_NOFS);
bio->bi_iter.bi_sector = dev_bytenr >> 9;
bio->bi_opf = REQ_OP_READ;
for (j = i; j < num_pages; j++) {
ret = bio_add_page(bio, block_ctx->pagev[j],
@ -2033,7 +2030,7 @@ continue_loop:
static void btrfsic_bio_end_io(struct bio *bp)
{
struct btrfsic_block *block = (struct btrfsic_block *)bp->bi_private;
struct btrfsic_block *block = bp->bi_private;
int iodone_w_error;
/* mutex is not held! This is not safe if IO is not yet completed
@ -2635,102 +2632,95 @@ static struct btrfsic_dev_state *btrfsic_dev_state_lookup(dev_t dev)
&btrfsic_dev_state_hashtable);
}
static void __btrfsic_submit_bio(struct bio *bio)
static void btrfsic_check_write_bio(struct bio *bio, struct btrfsic_dev_state *dev_state)
{
unsigned int segs = bio_segments(bio);
u64 dev_bytenr = 512 * bio->bi_iter.bi_sector;
u64 cur_bytenr = dev_bytenr;
struct bvec_iter iter;
struct bio_vec bvec;
char **mapped_datav;
int bio_is_patched = 0;
int i = 0;
if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
pr_info(
"submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
bio_op(bio), bio->bi_opf, segs,
bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev);
mapped_datav = kmalloc_array(segs, sizeof(*mapped_datav), GFP_NOFS);
if (!mapped_datav)
return;
bio_for_each_segment(bvec, bio, iter) {
BUG_ON(bvec.bv_len != PAGE_SIZE);
mapped_datav[i] = page_address(bvec.bv_page);
i++;
if (dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE)
pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n",
i, cur_bytenr, bvec.bv_len, bvec.bv_offset);
cur_bytenr += bvec.bv_len;
}
btrfsic_process_written_block(dev_state, dev_bytenr, mapped_datav, segs,
bio, &bio_is_patched, bio->bi_opf);
kfree(mapped_datav);
}
static void btrfsic_check_flush_bio(struct bio *bio, struct btrfsic_dev_state *dev_state)
{
if (dev_state->state->print_mask & BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n",
bio_op(bio), bio->bi_opf, bio->bi_bdev);
if (dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
struct btrfsic_block *const block =
&dev_state->dummy_block_for_bio_bh_flush;
block->is_iodone = 0;
block->never_written = 0;
block->iodone_w_error = 0;
block->flush_gen = dev_state->last_flush_gen + 1;
block->submit_bio_bh_rw = bio->bi_opf;
block->orig_bio_private = bio->bi_private;
block->orig_bio_end_io = bio->bi_end_io;
block->next_in_same_bio = NULL;
bio->bi_private = block;
bio->bi_end_io = btrfsic_bio_end_io;
} else if ((dev_state->state->print_mask &
(BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
BTRFSIC_PRINT_MASK_VERBOSE))) {
pr_info(
"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n",
dev_state->bdev);
}
}
void btrfsic_check_bio(struct bio *bio)
{
struct btrfsic_dev_state *dev_state;
if (!btrfsic_is_initialized)
return;
mutex_lock(&btrfsic_mutex);
/* since btrfsic_submit_bio() is also called before
* btrfsic_mount(), this might return NULL */
/*
* We can be called before btrfsic_mount, so there might not be a
* dev_state.
*/
dev_state = btrfsic_dev_state_lookup(bio->bi_bdev->bd_dev);
if (NULL != dev_state &&
(bio_op(bio) == REQ_OP_WRITE) && bio_has_data(bio)) {
int i = 0;
u64 dev_bytenr;
u64 cur_bytenr;
struct bio_vec bvec;
struct bvec_iter iter;
int bio_is_patched;
char **mapped_datav;
unsigned int segs = bio_segments(bio);
dev_bytenr = 512 * bio->bi_iter.bi_sector;
bio_is_patched = 0;
if (dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
pr_info("submit_bio(rw=%d,0x%x, bi_vcnt=%u, bi_sector=%llu (bytenr %llu), bi_bdev=%p)\n",
bio_op(bio), bio->bi_opf, segs,
bio->bi_iter.bi_sector, dev_bytenr, bio->bi_bdev);
mapped_datav = kmalloc_array(segs,
sizeof(*mapped_datav), GFP_NOFS);
if (!mapped_datav)
goto leave;
cur_bytenr = dev_bytenr;
bio_for_each_segment(bvec, bio, iter) {
BUG_ON(bvec.bv_len != PAGE_SIZE);
mapped_datav[i] = page_address(bvec.bv_page);
i++;
if (dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH_VERBOSE)
pr_info("#%u: bytenr=%llu, len=%u, offset=%u\n",
i, cur_bytenr, bvec.bv_len, bvec.bv_offset);
cur_bytenr += bvec.bv_len;
}
btrfsic_process_written_block(dev_state, dev_bytenr,
mapped_datav, segs,
bio, &bio_is_patched,
bio->bi_opf);
kfree(mapped_datav);
} else if (NULL != dev_state && (bio->bi_opf & REQ_PREFLUSH)) {
if (dev_state->state->print_mask &
BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH)
pr_info("submit_bio(rw=%d,0x%x FLUSH, bdev=%p)\n",
bio_op(bio), bio->bi_opf, bio->bi_bdev);
if (!dev_state->dummy_block_for_bio_bh_flush.is_iodone) {
if ((dev_state->state->print_mask &
(BTRFSIC_PRINT_MASK_SUBMIT_BIO_BH |
BTRFSIC_PRINT_MASK_VERBOSE)))
pr_info(
"btrfsic_submit_bio(%pg) with FLUSH but dummy block already in use (ignored)!\n",
dev_state->bdev);
} else {
struct btrfsic_block *const block =
&dev_state->dummy_block_for_bio_bh_flush;
block->is_iodone = 0;
block->never_written = 0;
block->iodone_w_error = 0;
block->flush_gen = dev_state->last_flush_gen + 1;
block->submit_bio_bh_rw = bio->bi_opf;
block->orig_bio_private = bio->bi_private;
block->orig_bio_end_io = bio->bi_end_io;
block->next_in_same_bio = NULL;
bio->bi_private = block;
bio->bi_end_io = btrfsic_bio_end_io;
}
mutex_lock(&btrfsic_mutex);
if (dev_state) {
if (bio_op(bio) == REQ_OP_WRITE && bio_has_data(bio))
btrfsic_check_write_bio(bio, dev_state);
else if (bio->bi_opf & REQ_PREFLUSH)
btrfsic_check_flush_bio(bio, dev_state);
}
leave:
mutex_unlock(&btrfsic_mutex);
}
void btrfsic_submit_bio(struct bio *bio)
{
__btrfsic_submit_bio(bio);
submit_bio(bio);
}
int btrfsic_submit_bio_wait(struct bio *bio)
{
__btrfsic_submit_bio(bio);
return submit_bio_wait(bio);
}
int btrfsic_mount(struct btrfs_fs_info *fs_info,
struct btrfs_fs_devices *fs_devices,
int including_extent_data, u32 print_mask)

fs/btrfs/check-integrity.h

@ -7,11 +7,9 @@
#define BTRFS_CHECK_INTEGRITY_H
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
void btrfsic_submit_bio(struct bio *bio);
int btrfsic_submit_bio_wait(struct bio *bio);
void btrfsic_check_bio(struct bio *bio);
#else
#define btrfsic_submit_bio submit_bio
#define btrfsic_submit_bio_wait submit_bio_wait
static inline void btrfsic_check_bio(struct bio *bio) { }
#endif
int btrfsic_mount(struct btrfs_fs_info *fs_info,
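
With the btrfsic_submit_bio()/btrfsic_submit_bio_wait() wrappers removed, the expectation is that submission sites run the checker and then call the block layer directly. A hedged caller-side sketch; the function name is illustrative, not the exact btrfs submit path:

	static void example_submit(struct bio *bio)
	{
		/* No-op stub unless CONFIG_BTRFS_FS_CHECK_INTEGRITY is enabled. */
		btrfsic_check_bio(bio);
		submit_bio(bio);
	}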

fs/btrfs/compression.c

@ -425,7 +425,6 @@ out:
}
static blk_status_t submit_compressed_bio(struct btrfs_fs_info *fs_info,
struct compressed_bio *cb,
struct bio *bio, int mirror_num)
{
blk_status_t ret;
@ -604,7 +603,7 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
goto finish_cb;
}
ret = submit_compressed_bio(fs_info, cb, bio, 0);
ret = submit_compressed_bio(fs_info, bio, 0);
if (ret)
goto finish_cb;
bio = NULL;
@ -802,15 +801,13 @@ static noinline int add_ra_bio_pages(struct inode *inode,
* After the compressed pages are read, we copy the bytes into the
* bio we were passed and then call the bio end_io calls
*/
blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags)
void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int mirror_num)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct extent_map_tree *em_tree;
struct compressed_bio *cb;
unsigned int compressed_len;
unsigned int nr_pages;
unsigned int pg_index;
struct bio *comp_bio = NULL;
const u64 disk_bytenr = bio->bi_iter.bi_sector << SECTOR_SHIFT;
u64 cur_disk_byte = disk_bytenr;
@ -820,7 +817,8 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
u64 em_start;
struct extent_map *em;
blk_status_t ret;
int faili = 0;
int ret2;
int i;
u8 *sums;
em_tree = &BTRFS_I(inode)->extent_tree;
@ -855,32 +853,26 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
em_len = em->len;
em_start = em->start;
cb->len = bio->bi_iter.bi_size;
cb->compressed_len = compressed_len;
cb->compress_type = em->compress_type;
cb->orig_bio = bio;
free_extent_map(em);
em = NULL;
cb->len = bio->bi_iter.bi_size;
cb->compressed_len = compressed_len;
cb->compress_type = extent_compress_type(bio_flags);
cb->orig_bio = bio;
nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
cb->compressed_pages = kcalloc(nr_pages, sizeof(struct page *),
GFP_NOFS);
cb->nr_pages = DIV_ROUND_UP(compressed_len, PAGE_SIZE);
cb->compressed_pages = kcalloc(cb->nr_pages, sizeof(struct page *), GFP_NOFS);
if (!cb->compressed_pages) {
ret = BLK_STS_RESOURCE;
goto fail1;
goto fail;
}
for (pg_index = 0; pg_index < nr_pages; pg_index++) {
cb->compressed_pages[pg_index] = alloc_page(GFP_NOFS);
if (!cb->compressed_pages[pg_index]) {
faili = pg_index - 1;
ret = BLK_STS_RESOURCE;
goto fail2;
}
ret2 = btrfs_alloc_page_array(cb->nr_pages, cb->compressed_pages);
if (ret2) {
ret = BLK_STS_RESOURCE;
goto fail;
}
faili = nr_pages - 1;
cb->nr_pages = nr_pages;
add_ra_bio_pages(inode, em_start + em_len, cb);
@ -949,28 +941,29 @@ blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
fs_info->sectorsize);
sums += fs_info->csum_size * nr_sectors;
ret = submit_compressed_bio(fs_info, cb, comp_bio, mirror_num);
ret = submit_compressed_bio(fs_info, comp_bio, mirror_num);
if (ret)
goto finish_cb;
comp_bio = NULL;
}
}
return BLK_STS_OK;
return;
fail2:
while (faili >= 0) {
__free_page(cb->compressed_pages[faili]);
faili--;
fail:
if (cb->compressed_pages) {
for (i = 0; i < cb->nr_pages; i++) {
if (cb->compressed_pages[i])
__free_page(cb->compressed_pages[i]);
}
}
kfree(cb->compressed_pages);
fail1:
kfree(cb);
out:
free_extent_map(em);
bio->bi_status = ret;
bio_endio(bio);
return ret;
return;
finish_cb:
if (comp_bio) {
comp_bio->bi_status = ret;
@ -978,7 +971,7 @@ finish_cb:
}
/* All bytes of @cb is submitted, endio will free @cb */
if (cur_disk_byte == disk_bytenr + compressed_len)
return ret;
return;
wait_var_event(cb, refcount_read(&cb->pending_sectors) ==
(disk_bytenr + compressed_len - cur_disk_byte) >>
@ -990,7 +983,6 @@ finish_cb:
ASSERT(refcount_read(&cb->pending_sectors));
/* Now we are the only one referring @cb, can finish it safely. */
finish_compressed_bio_read(cb);
return ret;
}
/*

fs/btrfs/compression.h

@ -102,8 +102,8 @@ blk_status_t btrfs_submit_compressed_write(struct btrfs_inode *inode, u64 start,
unsigned int write_flags,
struct cgroup_subsys_state *blkcg_css,
bool writeback);
blk_status_t btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
void btrfs_submit_compressed_read(struct inode *inode, struct bio *bio,
int mirror_num);
unsigned int btrfs_compress_str2level(unsigned int type, const char *str);

fs/btrfs/ctree.c

@ -16,6 +16,7 @@
#include "volumes.h"
#include "qgroup.h"
#include "tree-mod-log.h"
#include "tree-checker.h"
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path, int level);
@ -342,7 +343,7 @@ static noinline int update_ref_for_cow(struct btrfs_trans_handle *trans,
int level = btrfs_header_level(buf);
ret = btrfs_set_disk_extent_flags(trans, buf,
new_flags, level, 0);
new_flags, level);
if (ret)
return ret;
}
@ -1390,12 +1391,13 @@ static noinline void unlock_up(struct btrfs_path *path, int level,
}
/*
* helper function for btrfs_search_slot. The goal is to find a block
* in cache without setting the path to blocking. If we find the block
* we return zero and the path is unchanged.
* Helper function for btrfs_search_slot() and other functions that do a search
* on a btree. The goal is to find a tree block in the cache (the radix tree at
* fs_info->buffer_radix), but if we can't find it, or it's not up to date, read
* its pages from disk.
*
* If we can't find the block, we set the path blocking and do some
* reada. -EAGAIN is returned and the search must be repeated.
* Returns -EAGAIN, with the path unlocked, if the caller needs to repeat the
* whole btree search, starting again from the current root node.
*/
static int
read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
@ -1409,12 +1411,21 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
struct btrfs_key first_key;
int ret;
int parent_level;
bool unlock_up;
unlock_up = ((level + 1 < BTRFS_MAX_LEVEL) && p->locks[level + 1]);
blocknr = btrfs_node_blockptr(*eb_ret, slot);
gen = btrfs_node_ptr_generation(*eb_ret, slot);
parent_level = btrfs_header_level(*eb_ret);
btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);
/*
* If we need to read an extent buffer from disk and we are holding locks
* on upper level nodes, we unlock all the upper nodes before reading the
* extent buffer, and then return -EAGAIN to the caller as it needs to
* restart the search. We don't release the lock on the current level
* because we need to walk this node to figure out which blocks to read.
*/
tmp = find_extent_buffer(fs_info, blocknr);
if (tmp) {
if (p->reada == READA_FORWARD_ALWAYS)
@ -1436,30 +1447,38 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
return 0;
}
if (unlock_up)
btrfs_unlock_up_safe(p, level + 1);
/* now we're allowed to do a blocking uptodate check */
ret = btrfs_read_buffer(tmp, gen, parent_level - 1, &first_key);
ret = btrfs_read_extent_buffer(tmp, gen, parent_level - 1, &first_key);
if (ret) {
free_extent_buffer(tmp);
btrfs_release_path(p);
return -EIO;
}
*eb_ret = tmp;
return 0;
if (btrfs_check_eb_owner(tmp, root->root_key.objectid)) {
free_extent_buffer(tmp);
btrfs_release_path(p);
return -EUCLEAN;
}
if (unlock_up)
ret = -EAGAIN;
goto out;
}
/*
* reduce lock contention at high levels
* of the btree by dropping locks before
* we read. Don't release the lock on the current
* level because we need to walk this node to figure
* out which blocks to read.
*/
btrfs_unlock_up_safe(p, level + 1);
if (unlock_up) {
btrfs_unlock_up_safe(p, level + 1);
ret = -EAGAIN;
} else {
ret = 0;
}
if (p->reada != READA_NONE)
reada_for_search(fs_info, p, level, slot, key->objectid);
ret = -EAGAIN;
tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid,
gen, parent_level - 1, &first_key);
if (IS_ERR(tmp)) {
@ -1474,9 +1493,15 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
*/
if (!extent_buffer_uptodate(tmp))
ret = -EIO;
free_extent_buffer(tmp);
btrfs_release_path(p);
out:
if (ret == 0) {
*eb_ret = tmp;
} else {
free_extent_buffer(tmp);
btrfs_release_path(p);
}
return ret;
}
@ -2279,6 +2304,43 @@ int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
return ret;
}
/**
* Search for a valid slot for the given path.
*
* @root: The root node of the tree.
* @key: Will contain a valid item if found.
* @path: The starting point to validate the slot.
*
* Return: 0 if the item is valid
* 1 if not found
* <0 if error.
*/
int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
struct btrfs_path *path)
{
while (1) {
int ret;
const int slot = path->slots[0];
const struct extent_buffer *leaf = path->nodes[0];
/* This is where we start walking the path. */
if (slot >= btrfs_header_nritems(leaf)) {
/*
* If we've reached the last slot in this leaf we need
* to go to the next leaf and reset the path.
*/
ret = btrfs_next_leaf(root, path);
if (ret)
return ret;
continue;
}
/* Store the found, valid item in @key. */
btrfs_item_key_to_cpu(leaf, key, slot);
break;
}
return 0;
}
/*
* adjust the pointers going up the tree, starting at level
* making sure the right key of each node points to 'key'.

fs/btrfs/ctree.h

@ -675,13 +675,13 @@ struct btrfs_fs_info {
rwlock_t global_root_lock;
struct rb_root global_root_tree;
spinlock_t fs_roots_radix_lock;
struct radix_tree_root fs_roots_radix;
/* The xarray that holds all the FS roots */
spinlock_t fs_roots_lock;
struct xarray fs_roots;
/* block group cache stuff */
spinlock_t block_group_cache_lock;
u64 first_logical_byte;
struct rb_root block_group_cache_tree;
rwlock_t block_group_cache_lock;
struct rb_root_cached block_group_cache_tree;
/* keep track of unallocated space */
atomic64_t free_chunk_space;
@ -848,12 +848,13 @@ struct btrfs_fs_info {
* two
*/
struct btrfs_workqueue *workers;
struct btrfs_workqueue *hipri_workers;
struct btrfs_workqueue *delalloc_workers;
struct btrfs_workqueue *flush_workers;
struct btrfs_workqueue *endio_workers;
struct btrfs_workqueue *endio_meta_workers;
struct btrfs_workqueue *endio_raid56_workers;
struct btrfs_workqueue *rmw_workers;
struct workqueue_struct *rmw_workers;
struct btrfs_workqueue *endio_meta_write_workers;
struct btrfs_workqueue *endio_write_workers;
struct btrfs_workqueue *endio_freespace_worker;
@ -946,9 +947,9 @@ struct btrfs_fs_info {
* running.
*/
refcount_t scrub_workers_refcnt;
struct btrfs_workqueue *scrub_workers;
struct btrfs_workqueue *scrub_wr_completion_workers;
struct btrfs_workqueue *scrub_parity_workers;
struct workqueue_struct *scrub_workers;
struct workqueue_struct *scrub_wr_completion_workers;
struct workqueue_struct *scrub_parity_workers;
struct btrfs_subpage_info *subpage_info;
struct btrfs_discard_ctl discard_ctl;
@ -994,10 +995,10 @@ struct btrfs_fs_info {
struct btrfs_delayed_root *delayed_root;
/* Extent buffer radix tree */
/* Extent buffer xarray */
spinlock_t buffer_lock;
/* Entries are eb->start / sectorsize */
struct radix_tree_root buffer_radix;
struct xarray extent_buffers;
/* next backup root to be overwritten */
int backup_root_index;
@ -1045,10 +1046,7 @@ struct btrfs_fs_info {
* Zone size > 0 when in ZONED mode, otherwise it's used for a check
* if the mode is enabled
*/
union {
u64 zone_size;
u64 zoned;
};
u64 zone_size;
struct mutex zoned_meta_io_lock;
spinlock_t treelog_bg_lock;
@ -1121,7 +1119,8 @@ enum {
*/
BTRFS_ROOT_SHAREABLE,
BTRFS_ROOT_TRACK_DIRTY,
BTRFS_ROOT_IN_RADIX,
/* The root is tracked in fs_info::fs_roots */
BTRFS_ROOT_REGISTERED,
BTRFS_ROOT_ORPHAN_ITEM_INSERTED,
BTRFS_ROOT_DEFRAG_RUNNING,
BTRFS_ROOT_FORCE_COW,
@ -1225,10 +1224,10 @@ struct btrfs_root {
struct rb_root inode_tree;
/*
* radix tree that keeps track of delayed nodes of every inode,
* protected by inode_lock
* Xarray that keeps track of delayed nodes of every inode, protected
* by inode_lock
*/
struct radix_tree_root delayed_nodes_tree;
struct xarray delayed_nodes;
/*
* right now this just gets used so that a root has its own devid
* for stat. It may be used for more later
@ -2784,7 +2783,8 @@ int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes);
int btrfs_exclude_logged_extents(struct extent_buffer *eb);
int btrfs_cross_ref_exist(struct btrfs_root *root,
u64 objectid, u64 offset, u64 bytenr, bool strict);
u64 objectid, u64 offset, u64 bytenr, bool strict,
struct btrfs_path *path);
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
@ -2811,8 +2811,7 @@ int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf, int full_backref);
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, u64 flags,
int level, int is_data);
struct extent_buffer *eb, u64 flags, int level);
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);
int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
@ -2892,7 +2891,7 @@ void btrfs_subvolume_release_metadata(struct btrfs_root *root,
void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
u64 disk_num_bytes);
u64 disk_num_bytes, bool noflush);
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
u64 start, u64 end);
@ -3039,6 +3038,35 @@ int btrfs_next_old_leaf(struct btrfs_root *root, struct btrfs_path *path,
int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
struct btrfs_path *path);
int btrfs_get_next_valid_item(struct btrfs_root *root, struct btrfs_key *key,
struct btrfs_path *path);
/*
* Search in @root for a given @key, and store the slot found in @found_key.
*
* @root: The root node of the tree.
* @key: The key we are looking for.
* @found_key: Will hold the found item.
* @path: Holds the current slot/leaf.
* @iter_ret: Contains the value returned from btrfs_search_slot or
* btrfs_get_next_valid_item, whichever was executed last.
*
* The @iter_ret is an output variable that will contain the return value of
* btrfs_search_slot, if it encountered an error, or the value returned from
* btrfs_get_next_valid_item otherwise. That return value can be 0, if a valid
* slot was found, 1 if there were no more leaves, and <0 if there was an error.
*
* It's recommended to use a separate variable for iter_ret and then use it to
* set the function return value so there's no confusion of the 0/1/errno
* values stemming from btrfs_search_slot.
*/
#define btrfs_for_each_slot(root, key, found_key, path, iter_ret) \
for (iter_ret = btrfs_search_slot(NULL, (root), (key), (path), 0, 0); \
(iter_ret) >= 0 && \
(iter_ret = btrfs_get_next_valid_item((root), (found_key), (path))) == 0; \
(path)->slots[0]++ \
)
static inline int btrfs_next_old_item(struct btrfs_root *root,
struct btrfs_path *p, u64 time_seq)
{
@ -3190,7 +3218,6 @@ int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
int btrfs_find_orphan_item(struct btrfs_root *root, u64 offset);
/* file-item.c */
struct btrfs_dio_private;
int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len);
blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst);
@ -3224,8 +3251,8 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz
u64 btrfs_file_extent_end(const struct btrfs_path *path);
/* inode.c */
blk_status_t btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
void btrfs_submit_data_bio(struct inode *inode, struct bio *bio,
int mirror_num, enum btrfs_compression_type compress_type);
unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
u32 bio_offset, struct page *page,
u64 start, u64 end);
@ -3255,10 +3282,28 @@ int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
unsigned int extra_bits,
struct extent_state **cached_state);
int btrfs_create_subvol_root(struct btrfs_trans_handle *trans,
struct btrfs_root *new_root,
struct btrfs_root *parent_root,
struct user_namespace *mnt_userns);
struct btrfs_new_inode_args {
/* Input */
struct inode *dir;
struct dentry *dentry;
struct inode *inode;
bool orphan;
bool subvol;
/*
* Output from btrfs_new_inode_prepare(), input to
* btrfs_create_new_inode().
*/
struct posix_acl *default_acl;
struct posix_acl *acl;
};
int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
unsigned int *trans_num_items);
int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
struct btrfs_new_inode_args *args);
void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args);
struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns,
struct inode *dir);
void btrfs_set_delalloc_extent(struct inode *inode, struct extent_state *state,
unsigned *bits);
void btrfs_clear_delalloc_extent(struct inode *inode,
@ -3269,7 +3314,6 @@ void btrfs_split_delalloc_extent(struct inode *inode,
struct extent_state *orig, u64 split);
void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
int btrfs_readpage(struct file *file, struct page *page);
void btrfs_evict_inode(struct inode *inode);
int btrfs_write_inode(struct inode *inode, struct writeback_control *wbc);
struct inode *btrfs_alloc_inode(struct super_block *sb);
@ -3314,9 +3358,9 @@ ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
const struct btrfs_ioctl_encoded_io_args *encoded);
ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_before);
extern const struct dentry_operations btrfs_dentry_operations;
extern const struct iomap_ops btrfs_dio_iomap_ops;
extern const struct iomap_dio_ops btrfs_dio_ops;
/* Inode locking type flags, by default the exclusive lock is taken */
#define BTRFS_ILOCK_SHARED (1U << 0)
@ -3328,6 +3372,7 @@ void btrfs_inode_unlock(struct inode *inode, unsigned int ilock_flags);
void btrfs_update_inode_bytes(struct btrfs_inode *inode,
const u64 add_bytes,
const u64 del_bytes);
void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end);
/* ioctl.c */
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
@ -3403,11 +3448,29 @@ void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
}
#ifdef CONFIG_PRINTK
#ifdef CONFIG_PRINTK_INDEX
#define btrfs_printk(fs_info, fmt, args...) \
do { \
printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt); \
_btrfs_printk(fs_info, fmt, ##args); \
} while (0)
__printf(2, 3)
__cold
void btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
#elif defined(CONFIG_PRINTK)
#define btrfs_printk(fs_info, fmt, args...) \
_btrfs_printk(fs_info, fmt, ##args)
__printf(2, 3)
__cold
void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
#else
#define btrfs_printk(fs_info, fmt, args...) \
btrfs_no_printk(fs_info, fmt, ##args)
#endif
@ -3658,12 +3721,25 @@ do { \
__LINE__, (errno)); \
} while (0)
#ifdef CONFIG_PRINTK_INDEX
#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \
do { \
__btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
(errno), fmt, ##args); \
do { \
printk_index_subsys_emit( \
"BTRFS: error (device %s%s) in %s:%d: errno=%d %s", \
KERN_CRIT, fmt); \
__btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
(errno), fmt, ##args); \
} while (0)
#else
#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \
__btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
(errno), fmt, ##args)
#endif
#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \
&(fs_info)->fs_state)))
#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \
@ -3816,15 +3892,16 @@ static inline int __btrfs_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag)
struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu);
int btrfs_set_acl(struct user_namespace *mnt_userns, struct inode *inode,
struct posix_acl *acl, int type);
int btrfs_init_acl(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *dir);
int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
struct posix_acl *acl, int type);
#else
#define btrfs_get_acl NULL
#define btrfs_set_acl NULL
static inline int btrfs_init_acl(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *dir)
static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans,
struct inode *inode, struct posix_acl *acl,
int type)
{
return 0;
return -EOPNOTSUPP;
}
#endif
@ -3929,7 +4006,7 @@ static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
{
return fs_info->zoned != 0;
return fs_info->zone_size > 0;
}
static inline bool btrfs_is_data_reloc_root(const struct btrfs_root *root)
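
As a usage illustration of the btrfs_for_each_slot() iterator documented above (mirroring the find_first_block_group() conversion earlier in this series), a sketch that walks a tree from a zeroed key; the early break condition is hypothetical:

	struct btrfs_path *path;
	struct btrfs_key key = { 0 };
	struct btrfs_key found_key;
	int iter_ret = 0;
	int ret;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
		/* Each iteration presents one valid item via found_key. */
		if (found_key.type == BTRFS_BLOCK_GROUP_ITEM_KEY)
			break;	/* hypothetical: stop at the first match */
	}
	/* 0 = stopped on a valid slot, 1 = ran out of leaves, <0 = error. */
	ret = (iter_ret < 0) ? iter_ret : 0;

	btrfs_free_path(path);
	return ret;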

fs/btrfs/delalloc-space.c

@ -289,7 +289,7 @@ static void calc_inode_reservations(struct btrfs_fs_info *fs_info,
}
int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
u64 disk_num_bytes)
u64 disk_num_bytes, bool noflush)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@ -308,7 +308,7 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
* If we have a transaction open (can happen if we call truncate_block
* from truncate), then we need FLUSH_LIMIT so we don't deadlock.
*/
if (btrfs_is_free_space_inode(inode)) {
if (noflush || btrfs_is_free_space_inode(inode)) {
flush = BTRFS_RESERVE_NO_FLUSH;
} else {
if (current->journal_info)
@ -333,7 +333,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
*/
calc_inode_reservations(fs_info, num_bytes, disk_num_bytes,
&meta_reserve, &qgroup_reserve);
ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true);
ret = btrfs_qgroup_reserve_meta_prealloc(root, qgroup_reserve, true,
noflush);
if (ret)
return ret;
ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, meta_reserve, flush);
@ -456,7 +457,7 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
ret = btrfs_check_data_free_space(inode, reserved, start, len);
if (ret < 0)
return ret;
ret = btrfs_delalloc_reserve_metadata(inode, len, len);
ret = btrfs_delalloc_reserve_metadata(inode, len, len, false);
if (ret < 0) {
btrfs_free_reserved_data_space(inode, *reserved, start, len);
extent_changeset_free(*reserved);
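
For the new noflush argument, a brief sketch of the two call styles; inode and len stand in for a real caller's values, and the nowait direct IO connection comes from the pull message rather than code shown here:

	/* Ordinary path: allow flushing to satisfy the reservation. */
	ret = btrfs_delalloc_reserve_metadata(inode, len, len, false);

	/*
	 * Nowait direct IO path: noflush=true selects BTRFS_RESERVE_NO_FLUSH,
	 * so the caller fails fast instead of blocking on space flushing.
	 */
	ret = btrfs_delalloc_reserve_metadata(inode, len, len, true);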

fs/btrfs/delayed-inode.c

@ -78,7 +78,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
}
spin_lock(&root->inode_lock);
node = radix_tree_lookup(&root->delayed_nodes_tree, ino);
node = xa_load(&root->delayed_nodes, ino);
if (node) {
if (btrfs_inode->delayed_node) {
@ -90,9 +90,9 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
/*
* It's possible that we're racing into the middle of removing
* this node from the radix tree. In this case, the refcount
* this node from the xarray. In this case, the refcount
* was zero and it should never go back to one. Just return
* NULL like it was never in the radix at all; our release
* NULL like it was never in the xarray at all; our release
* function is in the process of removing it.
*
* Some implementations of refcount_inc refuse to bump the
@ -100,7 +100,7 @@ static struct btrfs_delayed_node *btrfs_get_delayed_node(
* here, refcount_inc() may decide to just WARN_ONCE() instead
* of actually bumping the refcount.
*
* If this node is properly in the radix, we want to bump the
* If this node is properly in the xarray, we want to bump the
* refcount twice, once for the inode and once for this get
* operation.
*/
@ -128,36 +128,30 @@ static struct btrfs_delayed_node *btrfs_get_or_create_delayed_node(
u64 ino = btrfs_ino(btrfs_inode);
int ret;
again:
node = btrfs_get_delayed_node(btrfs_inode);
if (node)
return node;
do {
node = btrfs_get_delayed_node(btrfs_inode);
if (node)
return node;
node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS);
if (!node)
return ERR_PTR(-ENOMEM);
btrfs_init_delayed_node(node, root, ino);
node = kmem_cache_zalloc(delayed_node_cache, GFP_NOFS);
if (!node)
return ERR_PTR(-ENOMEM);
btrfs_init_delayed_node(node, root, ino);
/* cached in the btrfs inode and can be accessed */
refcount_set(&node->refs, 2);
/* Cached in the inode and can be accessed */
refcount_set(&node->refs, 2);
ret = radix_tree_preload(GFP_NOFS);
if (ret) {
kmem_cache_free(delayed_node_cache, node);
return ERR_PTR(ret);
}
spin_lock(&root->inode_lock);
ret = radix_tree_insert(&root->delayed_nodes_tree, ino, node);
if (ret == -EEXIST) {
spin_unlock(&root->inode_lock);
kmem_cache_free(delayed_node_cache, node);
radix_tree_preload_end();
goto again;
}
spin_lock(&root->inode_lock);
ret = xa_insert(&root->delayed_nodes, ino, node, GFP_NOFS);
if (ret) {
spin_unlock(&root->inode_lock);
kmem_cache_free(delayed_node_cache, node);
if (ret != -EBUSY)
return ERR_PTR(ret);
}
} while (ret);
btrfs_inode->delayed_node = node;
spin_unlock(&root->inode_lock);
radix_tree_preload_end();
return node;
}
@ -276,8 +270,7 @@ static void __btrfs_release_delayed_node(
* back up. We can delete it now.
*/
ASSERT(refcount_read(&delayed_node->refs) == 0);
radix_tree_delete(&root->delayed_nodes_tree,
delayed_node->inode_id);
xa_erase(&root->delayed_nodes, delayed_node->inode_id);
spin_unlock(&root->inode_lock);
kmem_cache_free(delayed_node_cache, delayed_node);
}
@ -1870,34 +1863,35 @@ void btrfs_kill_delayed_inode_items(struct btrfs_inode *inode)
void btrfs_kill_all_delayed_nodes(struct btrfs_root *root)
{
u64 inode_id = 0;
unsigned long index = 0;
struct btrfs_delayed_node *delayed_node;
struct btrfs_delayed_node *delayed_nodes[8];
int i, n;
while (1) {
int n = 0;
spin_lock(&root->inode_lock);
n = radix_tree_gang_lookup(&root->delayed_nodes_tree,
(void **)delayed_nodes, inode_id,
ARRAY_SIZE(delayed_nodes));
if (!n) {
if (xa_empty(&root->delayed_nodes)) {
spin_unlock(&root->inode_lock);
break;
return;
}
inode_id = delayed_nodes[n - 1]->inode_id + 1;
for (i = 0; i < n; i++) {
xa_for_each_start(&root->delayed_nodes, index, delayed_node, index) {
/*
* Don't increase refs in case the node is dead and
* about to be removed from the tree in the loop below
*/
if (!refcount_inc_not_zero(&delayed_nodes[i]->refs))
delayed_nodes[i] = NULL;
if (refcount_inc_not_zero(&delayed_node->refs)) {
delayed_nodes[n] = delayed_node;
n++;
}
if (n >= ARRAY_SIZE(delayed_nodes))
break;
}
index++;
spin_unlock(&root->inode_lock);
for (i = 0; i < n; i++) {
if (!delayed_nodes[i])
continue;
for (int i = 0; i < n; i++) {
__btrfs_kill_delayed_node(delayed_nodes[i]);
btrfs_release_delayed_node(delayed_nodes[i]);
}
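
A minimal sketch of the xa_insert() lookup-or-insert pattern the hunks above switch to, replacing the radix_tree_preload()/radix_tree_insert() sequence. The demo_* names are hypothetical, not btrfs code, and the sketch drops the root->inode_lock locking and the double refcount that the real btrfs_get_or_create_delayed_node() keeps; xa_insert() allocates internally and reports an already-present entry as -EBUSY, which is simply retried.

#include <linux/err.h>
#include <linux/refcount.h>
#include <linux/slab.h>
#include <linux/xarray.h>

struct demo_node {
	unsigned long ino;
	refcount_t refs;
};

static DEFINE_XARRAY(demo_nodes);

static struct demo_node *demo_get_or_create(unsigned long ino)
{
	struct demo_node *node;
	int ret;

	do {
		/* Fast path: a node for this inode is already cached. */
		node = xa_load(&demo_nodes, ino);
		if (node)
			return node;

		node = kzalloc(sizeof(*node), GFP_NOFS);
		if (!node)
			return ERR_PTR(-ENOMEM);
		node->ino = ino;
		refcount_set(&node->refs, 1);

		/* No preload step needed: xa_insert() allocates internally. */
		ret = xa_insert(&demo_nodes, ino, node, GFP_NOFS);
		if (ret) {
			kfree(node);
			if (ret != -EBUSY)
				return ERR_PTR(ret);
			/* -EBUSY: another thread inserted first, retry the lookup. */
		}
	} while (ret);

	return node;
}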


@ -930,7 +930,6 @@ int btrfs_add_delayed_tree_ref(struct btrfs_trans_handle *trans,
is_system = (generic_ref->tree_ref.owning_root == BTRFS_CHUNK_TREE_OBJECTID);
ASSERT(generic_ref->type == BTRFS_REF_METADATA && generic_ref->action);
BUG_ON(extent_op && extent_op->is_data);
ref = kmem_cache_alloc(btrfs_delayed_tree_ref_cachep, GFP_NOFS);
if (!ref)
return -ENOMEM;
@ -1103,8 +1102,7 @@ int btrfs_add_delayed_extent_op(struct btrfs_trans_handle *trans,
return -ENOMEM;
init_delayed_ref_head(head_ref, NULL, bytenr, num_bytes, 0, 0,
BTRFS_UPDATE_DELAYED_HEAD, extent_op->is_data,
false);
BTRFS_UPDATE_DELAYED_HEAD, false, false);
head_ref->extent_op = extent_op;
delayed_refs = &trans->transaction->delayed_refs;


@ -58,7 +58,6 @@ struct btrfs_delayed_extent_op {
u8 level;
bool update_key;
bool update_flags;
bool is_data;
u64 flags_to_set;
};


@ -474,6 +474,7 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
struct btrfs_dev_extent *dev_extent = NULL;
struct btrfs_block_group *cache;
struct btrfs_trans_handle *trans;
int iter_ret = 0;
int ret = 0;
u64 chunk_offset;
@ -524,29 +525,8 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
key.type = BTRFS_DEV_EXTENT_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto free_path;
if (ret > 0) {
if (path->slots[0] >=
btrfs_header_nritems(path->nodes[0])) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
goto free_path;
if (ret > 0) {
ret = 0;
goto free_path;
}
} else {
ret = 0;
}
}
while (1) {
btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
struct extent_buffer *leaf = path->nodes[0];
int slot = path->slots[0];
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.objectid != src_dev->devid)
break;
@ -557,30 +537,23 @@ static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
if (found_key.offset < key.offset)
break;
dev_extent = btrfs_item_ptr(leaf, slot, struct btrfs_dev_extent);
dev_extent = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_dev_extent);
chunk_offset = btrfs_dev_extent_chunk_offset(leaf, dev_extent);
cache = btrfs_lookup_block_group(fs_info, chunk_offset);
if (!cache)
goto skip;
continue;
spin_lock(&cache->lock);
cache->to_copy = 1;
spin_unlock(&cache->lock);
btrfs_put_block_group(cache);
skip:
ret = btrfs_next_item(root, path);
if (ret != 0) {
if (ret > 0)
ret = 0;
break;
}
}
if (iter_ret < 0)
ret = iter_ret;
free_path:
btrfs_free_path(path);
unlock:
mutex_unlock(&fs_info->chunk_mutex);
@ -881,6 +854,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
int scrub_ret)
{
struct btrfs_dev_replace *dev_replace = &fs_info->dev_replace;
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *tgt_device;
struct btrfs_device *src_device;
struct btrfs_root *root = fs_info->tree_root;
@ -930,12 +904,12 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
WARN_ON(ret);
/* Prevent write_all_supers() during the finishing procedure */
mutex_lock(&fs_info->fs_devices->device_list_mutex);
mutex_lock(&fs_devices->device_list_mutex);
/* Prevent new chunks being allocated on the source device */
mutex_lock(&fs_info->chunk_mutex);
if (!list_empty(&src_device->post_commit_list)) {
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
mutex_unlock(&fs_devices->device_list_mutex);
mutex_unlock(&fs_info->chunk_mutex);
} else {
break;
@ -972,7 +946,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
error:
up_write(&dev_replace->rwsem);
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
mutex_unlock(&fs_devices->device_list_mutex);
btrfs_rm_dev_replace_blocked(fs_info);
if (tgt_device)
btrfs_destroy_dev_replace_tgtdev(tgt_device);
@ -1001,8 +975,8 @@ error:
btrfs_assign_next_active_device(src_device, tgt_device);
list_add(&tgt_device->dev_alloc_list, &fs_info->fs_devices->alloc_list);
fs_info->fs_devices->rw_devices++;
list_add(&tgt_device->dev_alloc_list, &fs_devices->alloc_list);
fs_devices->rw_devices++;
up_write(&dev_replace->rwsem);
btrfs_rm_dev_replace_blocked(fs_info);
@ -1025,7 +999,7 @@ error:
* belong to this filesystem.
*/
mutex_unlock(&fs_info->chunk_mutex);
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
mutex_unlock(&fs_devices->device_list_mutex);
/* replace the sysfs entry */
btrfs_sysfs_remove_device(src_device);


@ -325,36 +325,15 @@ btrfs_search_dir_index_item(struct btrfs_root *root,
struct btrfs_path *path, u64 dirid,
const char *name, int name_len)
{
struct extent_buffer *leaf;
struct btrfs_dir_item *di;
struct btrfs_key key;
u32 nritems;
int ret;
key.objectid = dirid;
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
return ERR_PTR(ret);
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
while (1) {
if (path->slots[0] >= nritems) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
return ERR_PTR(ret);
if (ret > 0)
break;
leaf = path->nodes[0];
nritems = btrfs_header_nritems(leaf);
continue;
}
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
btrfs_for_each_slot(root, &key, &key, path, ret) {
if (key.objectid != dirid || key.type != BTRFS_DIR_INDEX_KEY)
break;
@ -362,10 +341,12 @@ btrfs_search_dir_index_item(struct btrfs_root *root,
name, name_len);
if (di)
return di;
path->slots[0]++;
}
return NULL;
/* Adjust return code if the key was not found in the next leaf. */
if (ret > 0)
ret = 0;
return ERR_PTR(ret);
}
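
The btrfs_for_each_slot() iterator used above is one of the new b-tree item iterators from this series; its definition is not visible in this diff, so the following is only a usage sketch mirroring the hunks above (the demo function is hypothetical). Iteration starts at the given key, found_key is filled in for every visited slot, and the last argument ends up negative if the tree walk itself failed; a positive value just means the end of the tree was reached.

static int demo_count_dir_index_items(struct btrfs_root *root,
				      struct btrfs_path *path, u64 dirid)
{
	struct btrfs_key key;
	struct btrfs_key found_key;
	int iter_ret;
	int count = 0;

	key.objectid = dirid;
	key.type = BTRFS_DIR_INDEX_KEY;
	key.offset = 0;

	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
		/* Stop once we walk past the items that belong to @dirid. */
		if (found_key.objectid != dirid ||
		    found_key.type != BTRFS_DIR_INDEX_KEY)
			break;
		count++;
	}

	/* Only a negative iter_ret is a real error. */
	return iter_ret < 0 ? iter_ret : count;
}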
struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,


@ -5,7 +5,6 @@
#include <linux/fs.h>
#include <linux/blkdev.h>
#include <linux/radix-tree.h>
#include <linux/writeback.h>
#include <linux/workqueue.h>
#include <linux/kthread.h>
@ -374,9 +373,9 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level,
* @level: expected level, mandatory check
* @first_key: expected key of first slot, skip check if NULL
*/
static int btree_read_extent_buffer_pages(struct extent_buffer *eb,
u64 parent_transid, int level,
struct btrfs_key *first_key)
int btrfs_read_extent_buffer(struct extent_buffer *eb,
u64 parent_transid, int level,
struct btrfs_key *first_key)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct extent_io_tree *io_tree;
@ -486,7 +485,7 @@ static int csum_dirty_subpage_buffers(struct btrfs_fs_info *fs_info,
uptodate = btrfs_subpage_test_uptodate(fs_info, page, cur,
fs_info->nodesize);
/* A dirty eb shouldn't disappear from buffer_radix */
/* A dirty eb shouldn't disappear from extent_buffers */
if (WARN_ON(!eb))
return -EUCLEAN;
@ -519,7 +518,7 @@ static int csum_dirty_buffer(struct btrfs_fs_info *fs_info, struct bio_vec *bvec
u64 found_start;
struct extent_buffer *eb;
if (fs_info->sectorsize < PAGE_SIZE)
if (fs_info->nodesize < PAGE_SIZE)
return csum_dirty_subpage_buffers(fs_info, bvec);
eb = (struct extent_buffer *)page->private;
@ -704,7 +703,7 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
ASSERT(page->private);
if (btrfs_sb(page->mapping->host->i_sb)->sectorsize < PAGE_SIZE)
if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
return validate_subpage_buffer(page, start, end, mirror);
eb = (struct extent_buffer *)page->private;
@ -850,8 +849,7 @@ static void run_one_async_free(struct btrfs_work *work)
}
blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 dio_file_offset,
int mirror_num, u64 dio_file_offset,
extent_submit_bio_start_t *submit_bio_start)
{
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
@ -874,9 +872,9 @@ blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
async->status = 0;
if (op_is_sync(bio->bi_opf))
btrfs_set_work_high_priority(&async->work);
btrfs_queue_work(fs_info->workers, &async->work);
btrfs_queue_work(fs_info->hipri_workers, &async->work);
else
btrfs_queue_work(fs_info->workers, &async->work);
return 0;
}
@ -920,8 +918,7 @@ static bool should_async_write(struct btrfs_fs_info *fs_info,
return true;
}
blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags)
void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
blk_status_t ret;
@ -933,31 +930,25 @@ blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
*/
ret = btrfs_bio_wq_end_io(fs_info, bio,
BTRFS_WQ_ENDIO_METADATA);
if (ret)
goto out_w_error;
ret = btrfs_map_bio(fs_info, bio, mirror_num);
if (!ret)
ret = btrfs_map_bio(fs_info, bio, mirror_num);
} else if (!should_async_write(fs_info, BTRFS_I(inode))) {
ret = btree_csum_one_bio(bio);
if (ret)
goto out_w_error;
ret = btrfs_map_bio(fs_info, bio, mirror_num);
if (!ret)
ret = btrfs_map_bio(fs_info, bio, mirror_num);
} else {
/*
* kthread helpers are used to submit writes so that
* checksumming can happen in parallel across all CPUs
*/
ret = btrfs_wq_submit_bio(inode, bio, mirror_num, 0,
0, btree_submit_bio_start);
btree_submit_bio_start);
}
if (ret)
goto out_w_error;
return 0;
out_w_error:
bio->bi_status = ret;
bio_endio(bio);
return ret;
if (ret) {
bio->bi_status = ret;
bio_endio(bio);
}
}
#ifdef CONFIG_MIGRATION
@ -1118,12 +1109,15 @@ struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
if (IS_ERR(buf))
return buf;
ret = btree_read_extent_buffer_pages(buf, parent_transid,
level, first_key);
ret = btrfs_read_extent_buffer(buf, parent_transid, level, first_key);
if (ret) {
free_extent_buffer_stale(buf);
return ERR_PTR(ret);
}
if (btrfs_check_eb_owner(buf, owner_root)) {
free_extent_buffer_stale(buf);
return ERR_PTR(-EUCLEAN);
}
return buf;
}
@ -1164,7 +1158,7 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->nr_delalloc_inodes = 0;
root->nr_ordered_extents = 0;
root->inode_tree = RB_ROOT;
INIT_RADIX_TREE(&root->delayed_nodes_tree, GFP_ATOMIC);
xa_init_flags(&root->delayed_nodes, GFP_ATOMIC);
btrfs_init_root_block_rsv(root);
@ -1216,9 +1210,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
btrfs_qgroup_init_swapped_blocks(&root->swapped_blocks);
#ifdef CONFIG_BTRFS_DEBUG
INIT_LIST_HEAD(&root->leak_list);
spin_lock(&fs_info->fs_roots_radix_lock);
spin_lock(&fs_info->fs_roots_lock);
list_add_tail(&root->leak_list, &fs_info->allocated_roots);
spin_unlock(&fs_info->fs_roots_radix_lock);
spin_unlock(&fs_info->fs_roots_lock);
#endif
}
@ -1563,6 +1557,23 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
ret = -EIO;
goto fail;
}
/*
* For real fs, and not log/reloc trees, root owner must
* match its root node owner
*/
if (!test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state) &&
root->root_key.objectid != BTRFS_TREE_LOG_OBJECTID &&
root->root_key.objectid != BTRFS_TREE_RELOC_OBJECTID &&
root->root_key.objectid != btrfs_header_owner(root->node)) {
btrfs_crit(fs_info,
"root=%llu block=%llu, tree root owner mismatch, have %llu expect %llu",
root->root_key.objectid, root->node->start,
btrfs_header_owner(root->node),
root->root_key.objectid);
ret = -EUCLEAN;
goto fail;
}
root->commit_root = btrfs_root_node(root);
return root;
fail:
@ -1648,12 +1659,11 @@ static struct btrfs_root *btrfs_lookup_fs_root(struct btrfs_fs_info *fs_info,
{
struct btrfs_root *root;
spin_lock(&fs_info->fs_roots_radix_lock);
root = radix_tree_lookup(&fs_info->fs_roots_radix,
(unsigned long)root_id);
spin_lock(&fs_info->fs_roots_lock);
root = xa_load(&fs_info->fs_roots, (unsigned long)root_id);
if (root)
root = btrfs_grab_root(root);
spin_unlock(&fs_info->fs_roots_radix_lock);
spin_unlock(&fs_info->fs_roots_lock);
return root;
}
@ -1695,20 +1705,14 @@ int btrfs_insert_fs_root(struct btrfs_fs_info *fs_info,
{
int ret;
ret = radix_tree_preload(GFP_NOFS);
if (ret)
return ret;
spin_lock(&fs_info->fs_roots_radix_lock);
ret = radix_tree_insert(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
root);
spin_lock(&fs_info->fs_roots_lock);
ret = xa_insert(&fs_info->fs_roots, (unsigned long)root->root_key.objectid,
root, GFP_NOFS);
if (ret == 0) {
btrfs_grab_root(root);
set_bit(BTRFS_ROOT_IN_RADIX, &root->state);
set_bit(BTRFS_ROOT_REGISTERED, &root->state);
}
spin_unlock(&fs_info->fs_roots_radix_lock);
radix_tree_preload_end();
spin_unlock(&fs_info->fs_roots_lock);
return ret;
}
@ -1964,7 +1968,7 @@ static void end_workqueue_fn(struct btrfs_work *work)
static int cleaner_kthread(void *arg)
{
struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)arg;
struct btrfs_fs_info *fs_info = arg;
int again;
while (1) {
@ -2266,10 +2270,12 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
{
btrfs_destroy_workqueue(fs_info->fixup_workers);
btrfs_destroy_workqueue(fs_info->delalloc_workers);
btrfs_destroy_workqueue(fs_info->hipri_workers);
btrfs_destroy_workqueue(fs_info->workers);
btrfs_destroy_workqueue(fs_info->endio_workers);
btrfs_destroy_workqueue(fs_info->endio_raid56_workers);
btrfs_destroy_workqueue(fs_info->rmw_workers);
if (fs_info->rmw_workers)
destroy_workqueue(fs_info->rmw_workers);
btrfs_destroy_workqueue(fs_info->endio_write_workers);
btrfs_destroy_workqueue(fs_info->endio_freespace_worker);
btrfs_destroy_workqueue(fs_info->delayed_workers);
@ -2336,9 +2342,9 @@ void btrfs_put_root(struct btrfs_root *root)
btrfs_drew_lock_destroy(&root->snapshot_lock);
free_root_extent_buffers(root);
#ifdef CONFIG_BTRFS_DEBUG
spin_lock(&root->fs_info->fs_roots_radix_lock);
spin_lock(&root->fs_info->fs_roots_lock);
list_del_init(&root->leak_list);
spin_unlock(&root->fs_info->fs_roots_radix_lock);
spin_unlock(&root->fs_info->fs_roots_lock);
#endif
kfree(root);
}
@ -2346,28 +2352,21 @@ void btrfs_put_root(struct btrfs_root *root)
void btrfs_free_fs_roots(struct btrfs_fs_info *fs_info)
{
int ret;
struct btrfs_root *gang[8];
int i;
struct btrfs_root *root;
unsigned long index = 0;
while (!list_empty(&fs_info->dead_roots)) {
gang[0] = list_entry(fs_info->dead_roots.next,
struct btrfs_root, root_list);
list_del(&gang[0]->root_list);
root = list_entry(fs_info->dead_roots.next,
struct btrfs_root, root_list);
list_del(&root->root_list);
if (test_bit(BTRFS_ROOT_IN_RADIX, &gang[0]->state))
btrfs_drop_and_free_fs_root(fs_info, gang[0]);
btrfs_put_root(gang[0]);
if (test_bit(BTRFS_ROOT_REGISTERED, &root->state))
btrfs_drop_and_free_fs_root(fs_info, root);
btrfs_put_root(root);
}
while (1) {
ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
(void **)gang, 0,
ARRAY_SIZE(gang));
if (!ret)
break;
for (i = 0; i < ret; i++)
btrfs_drop_and_free_fs_root(fs_info, gang[i]);
xa_for_each(&fs_info->fs_roots, index, root) {
btrfs_drop_and_free_fs_root(fs_info, root);
}
}
@ -2444,7 +2443,9 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
unsigned int flags = WQ_MEM_RECLAIM | WQ_FREEZABLE | WQ_UNBOUND;
fs_info->workers =
btrfs_alloc_workqueue(fs_info, "worker",
btrfs_alloc_workqueue(fs_info, "worker", flags, max_active, 16);
fs_info->hipri_workers =
btrfs_alloc_workqueue(fs_info, "worker-high",
flags | WQ_HIGHPRI, max_active, 16);
fs_info->delalloc_workers =
@ -2476,8 +2477,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
fs_info->endio_raid56_workers =
btrfs_alloc_workqueue(fs_info, "endio-raid56", flags,
max_active, 4);
fs_info->rmw_workers =
btrfs_alloc_workqueue(fs_info, "rmw", flags, max_active, 2);
fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
fs_info->endio_write_workers =
btrfs_alloc_workqueue(fs_info, "endio-write", flags,
max_active, 2);
@ -2492,8 +2492,8 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
fs_info->discard_ctl.discard_workers =
alloc_workqueue("btrfs_discard", WQ_UNBOUND | WQ_FREEZABLE, 1);
if (!(fs_info->workers && fs_info->delalloc_workers &&
fs_info->flush_workers &&
if (!(fs_info->workers && fs_info->hipri_workers &&
fs_info->delalloc_workers && fs_info->flush_workers &&
fs_info->endio_workers && fs_info->endio_meta_workers &&
fs_info->endio_meta_write_workers &&
fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
@ -2815,12 +2815,14 @@ static int validate_super(struct btrfs_fs_info *fs_info,
}
/*
* For 4K page size, we only support 4K sector size.
* For 64K page size, we support 64K and 4K sector sizes.
* We only support at most two sectorsizes: 4K and PAGE_SIZE.
*
* We can support 16K sectorsize with 64K page size without problem,
* but such a sectorsize/pagesize combination doesn't make much sense.
* 4K will be our future standard; PAGE_SIZE has been supported from the
* very beginning.
*/
if ((PAGE_SIZE == SZ_4K && sectorsize != PAGE_SIZE) ||
(PAGE_SIZE == SZ_64K && (sectorsize != SZ_4K &&
sectorsize != SZ_64K))) {
if (sectorsize > PAGE_SIZE || (sectorsize != SZ_4K && sectorsize != PAGE_SIZE)) {
btrfs_err(fs_info,
"sectorsize %llu not yet supported for page size %lu",
sectorsize, PAGE_SIZE);
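In concrete terms, the relaxed check accepts 4K sectors on any page size plus a sectorsize equal to PAGE_SIZE (for example 64K sectors on 64K-page systems), while still rejecting anything larger than the page size and combinations such as 16K sectors on 64K pages.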
@ -3132,8 +3134,8 @@ static int __cold init_tree_roots(struct btrfs_fs_info *fs_info)
void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
{
INIT_RADIX_TREE(&fs_info->fs_roots_radix, GFP_ATOMIC);
INIT_RADIX_TREE(&fs_info->buffer_radix, GFP_ATOMIC);
xa_init_flags(&fs_info->fs_roots, GFP_ATOMIC);
xa_init_flags(&fs_info->extent_buffers, GFP_ATOMIC);
INIT_LIST_HEAD(&fs_info->trans_list);
INIT_LIST_HEAD(&fs_info->dead_roots);
INIT_LIST_HEAD(&fs_info->delayed_iputs);
@ -3141,7 +3143,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&fs_info->caching_block_groups);
spin_lock_init(&fs_info->delalloc_root_lock);
spin_lock_init(&fs_info->trans_lock);
spin_lock_init(&fs_info->fs_roots_radix_lock);
spin_lock_init(&fs_info->fs_roots_lock);
spin_lock_init(&fs_info->delayed_iput_lock);
spin_lock_init(&fs_info->defrag_inodes_lock);
spin_lock_init(&fs_info->super_lock);
@ -3209,9 +3211,8 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
btrfs_init_balance(fs_info);
btrfs_init_async_reclaim_work(fs_info);
spin_lock_init(&fs_info->block_group_cache_lock);
fs_info->block_group_cache_tree = RB_ROOT;
fs_info->first_logical_byte = (u64)-1;
rwlock_init(&fs_info->block_group_cache_lock);
fs_info->block_group_cache_tree = RB_ROOT_CACHED;
extent_io_tree_init(fs_info, &fs_info->excluded_extents,
IO_TREE_FS_EXCLUDED_EXTENTS, NULL);
@ -3295,7 +3296,7 @@ static int init_mount_fs_info(struct btrfs_fs_info *fs_info, struct super_block
static int btrfs_uuid_rescan_kthread(void *data)
{
struct btrfs_fs_info *fs_info = (struct btrfs_fs_info *)data;
struct btrfs_fs_info *fs_info = data;
int ret;
/*
@ -3373,7 +3374,7 @@ int btrfs_start_pre_rw_mount(struct btrfs_fs_info *fs_info)
/*
* btrfs_find_orphan_roots() is responsible for finding all the dead
* roots (with 0 refs), flag them with BTRFS_ROOT_DEAD_TREE and load
* them into the fs_info->fs_roots_radix tree. This must be done before
* them into the fs_info->fs_roots. This must be done before
* calling btrfs_orphan_cleanup() on the tree root. If we don't do it
* first, then btrfs_orphan_cleanup() will delete a dead root's orphan
* item before the root's tree is deleted - this means that if we unmount
@ -3611,7 +3612,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
~BTRFS_FEATURE_INCOMPAT_SUPP;
if (features) {
btrfs_err(fs_info,
"cannot mount because of unsupported optional features (%llx)",
"cannot mount because of unsupported optional features (0x%llx)",
features);
err = -EINVAL;
goto fail_alloc;
@ -3649,7 +3650,7 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
~BTRFS_FEATURE_COMPAT_RO_SUPP;
if (!sb_rdonly(sb) && features) {
btrfs_err(fs_info,
"cannot mount read-write because of unsupported optional features (%llx)",
"cannot mount read-write because of unsupported optional features (0x%llx)",
features);
err = -EINVAL;
goto fail_alloc;
@ -3672,14 +3673,6 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
btrfs_warn(fs_info,
"read-write for sector size %u with page size %lu is experimental",
sectorsize, PAGE_SIZE);
if (btrfs_super_incompat_flags(fs_info->super_copy) &
BTRFS_FEATURE_INCOMPAT_RAID56) {
btrfs_err(fs_info,
"RAID56 is not yet supported for sector size %u with page size %lu",
sectorsize, PAGE_SIZE);
err = -EINVAL;
goto fail_alloc;
}
subpage_info = kzalloc(sizeof(*subpage_info), GFP_KERNEL);
if (!subpage_info)
goto fail_alloc;
@ -4157,7 +4150,8 @@ static int write_dev_supers(struct btrfs_device *device,
if (i == 0 && !btrfs_test_opt(device->fs_info, NOBARRIER))
bio->bi_opf |= REQ_FUA;
btrfsic_submit_bio(bio);
btrfsic_check_bio(bio);
submit_bio(bio);
if (btrfs_advance_sb_log(device, i))
errors++;
@ -4271,7 +4265,8 @@ static void write_dev_flush(struct btrfs_device *device)
init_completion(&device->flush_wait);
bio->bi_private = &device->flush_wait;
btrfsic_submit_bio(bio);
btrfsic_check_bio(bio);
submit_bio(bio);
set_bit(BTRFS_DEV_STATE_FLUSH_SENT, &device->dev_state);
}
@ -4504,12 +4499,11 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
{
bool drop_ref = false;
spin_lock(&fs_info->fs_roots_radix_lock);
radix_tree_delete(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid);
if (test_and_clear_bit(BTRFS_ROOT_IN_RADIX, &root->state))
spin_lock(&fs_info->fs_roots_lock);
xa_erase(&fs_info->fs_roots, (unsigned long)root->root_key.objectid);
if (test_and_clear_bit(BTRFS_ROOT_REGISTERED, &root->state))
drop_ref = true;
spin_unlock(&fs_info->fs_roots_radix_lock);
spin_unlock(&fs_info->fs_roots_lock);
if (BTRFS_FS_ERROR(fs_info)) {
ASSERT(root->log_root == NULL);
@ -4525,50 +4519,48 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info)
{
u64 root_objectid = 0;
struct btrfs_root *gang[8];
int i = 0;
struct btrfs_root *roots[8];
unsigned long index = 0;
int i;
int err = 0;
unsigned int ret = 0;
int grabbed;
while (1) {
spin_lock(&fs_info->fs_roots_radix_lock);
ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
(void **)gang, root_objectid,
ARRAY_SIZE(gang));
if (!ret) {
spin_unlock(&fs_info->fs_roots_radix_lock);
break;
}
root_objectid = gang[ret - 1]->root_key.objectid + 1;
struct btrfs_root *root;
for (i = 0; i < ret; i++) {
/* Avoid to grab roots in dead_roots */
if (btrfs_root_refs(&gang[i]->root_item) == 0) {
gang[i] = NULL;
continue;
}
/* grab all the search result for later use */
gang[i] = btrfs_grab_root(gang[i]);
spin_lock(&fs_info->fs_roots_lock);
if (!xa_find(&fs_info->fs_roots, &index, ULONG_MAX, XA_PRESENT)) {
spin_unlock(&fs_info->fs_roots_lock);
return err;
}
spin_unlock(&fs_info->fs_roots_radix_lock);
for (i = 0; i < ret; i++) {
if (!gang[i])
continue;
root_objectid = gang[i]->root_key.objectid;
err = btrfs_orphan_cleanup(gang[i]);
if (err)
grabbed = 0;
xa_for_each_start(&fs_info->fs_roots, index, root, index) {
/* Avoid grabbing roots in dead_roots */
if (btrfs_root_refs(&root->root_item) > 0)
roots[grabbed++] = btrfs_grab_root(root);
if (grabbed >= ARRAY_SIZE(roots))
break;
btrfs_put_root(gang[i]);
}
root_objectid++;
spin_unlock(&fs_info->fs_roots_lock);
for (i = 0; i < grabbed; i++) {
if (!roots[i])
continue;
index = roots[i]->root_key.objectid;
err = btrfs_orphan_cleanup(roots[i]);
if (err)
goto out;
btrfs_put_root(roots[i]);
}
index++;
}
/* release the uncleaned roots due to error */
for (; i < ret; i++) {
if (gang[i])
btrfs_put_root(gang[i]);
out:
/* Release the roots that remain uncleaned due to error */
for (; i < grabbed; i++) {
if (roots[i])
btrfs_put_root(roots[i]);
}
return err;
}
@ -4863,13 +4855,6 @@ void btrfs_btree_balance_dirty_nodelay(struct btrfs_fs_info *fs_info)
__btrfs_btree_balance_dirty(fs_info, 0);
}
int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
struct btrfs_key *first_key)
{
return btree_read_extent_buffer_pages(buf, parent_transid,
level, first_key);
}
static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
{
/* cleanup FS via transaction */
@ -4885,31 +4870,28 @@ static void btrfs_error_commit_super(struct btrfs_fs_info *fs_info)
static void btrfs_drop_all_logs(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *gang[8];
u64 root_objectid = 0;
int ret;
unsigned long index = 0;
int grabbed = 0;
struct btrfs_root *roots[8];
spin_lock(&fs_info->fs_roots_radix_lock);
while ((ret = radix_tree_gang_lookup(&fs_info->fs_roots_radix,
(void **)gang, root_objectid,
ARRAY_SIZE(gang))) != 0) {
int i;
spin_lock(&fs_info->fs_roots_lock);
while ((grabbed = xa_extract(&fs_info->fs_roots, (void **)roots, index,
ULONG_MAX, 8, XA_PRESENT))) {
for (int i = 0; i < grabbed; i++)
roots[i] = btrfs_grab_root(roots[i]);
spin_unlock(&fs_info->fs_roots_lock);
for (i = 0; i < ret; i++)
gang[i] = btrfs_grab_root(gang[i]);
spin_unlock(&fs_info->fs_roots_radix_lock);
for (i = 0; i < ret; i++) {
if (!gang[i])
for (int i = 0; i < grabbed; i++) {
if (!roots[i])
continue;
root_objectid = gang[i]->root_key.objectid;
btrfs_free_log(NULL, gang[i]);
btrfs_put_root(gang[i]);
index = roots[i]->root_key.objectid;
btrfs_free_log(NULL, roots[i]);
btrfs_put_root(roots[i]);
}
root_objectid++;
spin_lock(&fs_info->fs_roots_radix_lock);
index++;
spin_lock(&fs_info->fs_roots_lock);
}
spin_unlock(&fs_info->fs_roots_radix_lock);
spin_unlock(&fs_info->fs_roots_lock);
btrfs_free_log_root_tree(NULL, fs_info);
}
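
The same conversion replaces radix_tree_gang_lookup() with xa_extract() for batched scans like the one above. Below is a schematic sketch of that "collect a batch under the lock, process it with the lock dropped" shape, using hypothetical demo_* types; the real code additionally pins each root with btrfs_grab_root() before releasing fs_roots_lock.

#include <linux/kernel.h>
#include <linux/spinlock.h>
#include <linux/xarray.h>

struct demo_root {
	unsigned long id;	/* doubles as the entry's index in the xarray */
};

static void demo_process_all(struct xarray *xa, spinlock_t *lock)
{
	struct demo_root *batch[8];
	unsigned long index = 0;
	unsigned int grabbed;

	spin_lock(lock);
	while ((grabbed = xa_extract(xa, (void **)batch, index, ULONG_MAX,
				     ARRAY_SIZE(batch), XA_PRESENT))) {
		/* Pin each batch[i] here, while the lock is still held. */
		spin_unlock(lock);

		for (unsigned int i = 0; i < grabbed; i++) {
			index = batch[i]->id;
			/* ... process batch[i] without holding the lock ... */
		}
		index++;	/* resume the scan after the last processed entry */

		spin_lock(lock);
	}
	spin_unlock(lock);
}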


@ -87,8 +87,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
struct page *page, u64 start, u64 end,
int mirror);
blk_status_t btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags);
void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
#endif
@ -120,13 +119,12 @@ void btrfs_put_root(struct btrfs_root *root);
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);
int btrfs_read_buffer(struct extent_buffer *buf, u64 parent_transid, int level,
struct btrfs_key *first_key);
int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid,
int level, struct btrfs_key *first_key);
blk_status_t btrfs_bio_wq_end_io(struct btrfs_fs_info *info, struct bio *bio,
enum btrfs_wq_endio_type metadata);
blk_status_t btrfs_wq_submit_bio(struct inode *inode, struct bio *bio,
int mirror_num, unsigned long bio_flags,
u64 dio_file_offset,
int mirror_num, u64 dio_file_offset,
extent_submit_bio_start_t *submit_bio_start);
blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio,
int mirror_num);


@ -895,7 +895,13 @@ again:
err = -ENOENT;
while (1) {
if (ptr >= end) {
WARN_ON(ptr > end);
if (ptr > end) {
err = -EUCLEAN;
btrfs_print_leaf(path->nodes[0]);
btrfs_crit(fs_info,
"overrun extent record at slot %d while looking for inline extent for root %llu owner %llu offset %llu parent %llu",
path->slots[0], root_objectid, owner, offset, parent);
}
break;
}
iref = (struct btrfs_extent_inline_ref *)ptr;
@ -1577,12 +1583,12 @@ static int run_delayed_extent_op(struct btrfs_trans_handle *trans,
u32 item_size;
int ret;
int err = 0;
int metadata = !extent_op->is_data;
int metadata = 1;
if (TRANS_ABORTED(trans))
return 0;
if (metadata && !btrfs_fs_incompat(fs_info, SKINNY_METADATA))
if (!btrfs_fs_incompat(fs_info, SKINNY_METADATA))
metadata = 0;
path = btrfs_alloc_path();
@ -2180,7 +2186,7 @@ out:
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, u64 flags,
int level, int is_data)
int level)
{
struct btrfs_delayed_extent_op *extent_op;
int ret;
@ -2192,7 +2198,6 @@ int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
extent_op->flags_to_set = flags;
extent_op->update_flags = true;
extent_op->update_key = false;
extent_op->is_data = is_data ? true : false;
extent_op->level = level;
ret = btrfs_add_delayed_extent_op(trans, eb->start, eb->len, extent_op);
@ -2357,15 +2362,10 @@ out:
}
int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
u64 bytenr, bool strict)
u64 bytenr, bool strict, struct btrfs_path *path)
{
struct btrfs_path *path;
int ret;
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
do {
ret = check_committed_ref(root, path, objectid,
offset, bytenr, strict);
@ -2376,7 +2376,7 @@ int btrfs_cross_ref_exist(struct btrfs_root *root, u64 objectid, u64 offset,
} while (ret == -EAGAIN);
out:
btrfs_free_path(path);
btrfs_release_path(path);
if (btrfs_is_data_reloc_root(root))
WARN_ON(ret > 0);
return ret;
@ -2497,24 +2497,21 @@ static u64 get_alloc_profile_by_root(struct btrfs_root *root, int data)
return ret;
}
static u64 first_logical_byte(struct btrfs_fs_info *fs_info, u64 search_start)
static u64 first_logical_byte(struct btrfs_fs_info *fs_info)
{
struct btrfs_block_group *cache;
u64 bytenr;
struct rb_node *leftmost;
u64 bytenr = 0;
spin_lock(&fs_info->block_group_cache_lock);
bytenr = fs_info->first_logical_byte;
spin_unlock(&fs_info->block_group_cache_lock);
read_lock(&fs_info->block_group_cache_lock);
/* Get the block group with the lowest logical start address. */
leftmost = rb_first_cached(&fs_info->block_group_cache_tree);
if (leftmost) {
struct btrfs_block_group *bg;
if (bytenr < (u64)-1)
return bytenr;
cache = btrfs_lookup_first_block_group(fs_info, search_start);
if (!cache)
return 0;
bytenr = cache->start;
btrfs_put_block_group(cache);
bg = rb_entry(leftmost, struct btrfs_block_group, cache_node);
bytenr = bg->start;
}
read_unlock(&fs_info->block_group_cache_lock);
return bytenr;
}
@ -3803,8 +3800,7 @@ static int do_allocation_zoned(struct btrfs_block_group *block_group,
/* Check RO and no space case before trying to activate it */
spin_lock(&block_group->lock);
if (block_group->ro ||
block_group->alloc_offset == block_group->zone_capacity) {
if (block_group->ro || btrfs_zoned_bg_is_full(block_group)) {
ret = 1;
/*
* May need to clear fs_info->{treelog,data_reloc}_bg.
@ -4272,7 +4268,7 @@ static noinline int find_free_extent(struct btrfs_root *root,
return ret;
ffe_ctl->search_start = max(ffe_ctl->search_start,
first_logical_byte(fs_info, 0));
first_logical_byte(fs_info));
ffe_ctl->search_start = max(ffe_ctl->search_start, ffe_ctl->hint_byte);
if (ffe_ctl->search_start == ffe_ctl->hint_byte) {
block_group = btrfs_lookup_block_group(fs_info,
@ -4959,7 +4955,6 @@ struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
extent_op->flags_to_set = flags;
extent_op->update_key = skinny_metadata ? false : true;
extent_op->update_flags = true;
extent_op->is_data = false;
extent_op->level = level;
btrfs_init_generic_ref(&generic_ref, BTRFS_ADD_DELAYED_EXTENT,
@ -5144,7 +5139,7 @@ static noinline int walk_down_proc(struct btrfs_trans_handle *trans,
ret = btrfs_dec_ref(trans, root, eb, 0);
BUG_ON(ret); /* -ENOMEM */
ret = btrfs_set_disk_extent_flags(trans, eb, flag,
btrfs_header_level(eb), 0);
btrfs_header_level(eb));
BUG_ON(ret); /* -ENOMEM */
wc->flags[level] |= flag;
}
@ -5818,7 +5813,7 @@ int btrfs_drop_snapshot(struct btrfs_root *root, int update_ref, int for_reloc)
btrfs_qgroup_convert_reserved_meta(root, INT_MAX);
btrfs_qgroup_free_meta_all_pertrans(root);
if (test_bit(BTRFS_ROOT_IN_RADIX, &root->state))
if (test_bit(BTRFS_ROOT_REGISTERED, &root->state))
btrfs_add_dropped_root(trans, root);
else
btrfs_put_root(root);

File diff suppressed because it is too large.


@ -7,15 +7,9 @@
#include <linux/refcount.h>
#include <linux/fiemap.h>
#include <linux/btrfs_tree.h>
#include "compression.h"
#include "ulist.h"
/*
* flags for bio submission. The high bits indicate the compression
* type for this bio
*/
#define EXTENT_BIO_COMPRESSED 1
#define EXTENT_BIO_FLAG_SHIFT 16
enum {
EXTENT_BUFFER_UPTODATE,
EXTENT_BUFFER_DIRTY,
@ -32,7 +26,6 @@ enum {
/* write IO error */
EXTENT_BUFFER_WRITE_ERR,
EXTENT_BUFFER_NO_CHECK,
EXTENT_BUFFER_ZONE_FINISH,
};
/* these are flags for __process_pages_contig */
@ -71,9 +64,9 @@ struct btrfs_fs_info;
struct io_failure_record;
struct extent_io_tree;
typedef blk_status_t (submit_bio_hook_t)(struct inode *inode, struct bio *bio,
typedef void (submit_bio_hook_t)(struct inode *inode, struct bio *bio,
int mirror_num,
unsigned long bio_flags);
enum btrfs_compression_type compress_type);
typedef blk_status_t (extent_submit_bio_start_t)(struct inode *inode,
struct bio *bio, u64 dio_file_offset);
@ -102,17 +95,6 @@ struct extent_buffer {
#endif
};
/*
* Structure to record info about the bio being assembled, and other info like
* how many bytes are there before stripe/ordered extent boundary.
*/
struct btrfs_bio_ctrl {
struct bio *bio;
unsigned long bio_flags;
u32 len_to_stripe_boundary;
u32 len_to_oe_boundary;
};
/*
* Structure to record how many bytes and which ranges are set/cleared
*/
@ -158,17 +140,6 @@ static inline void extent_changeset_free(struct extent_changeset *changeset)
kfree(changeset);
}
static inline void extent_set_compress_type(unsigned long *bio_flags,
int compress_type)
{
*bio_flags |= compress_type << EXTENT_BIO_FLAG_SHIFT;
}
static inline int extent_compress_type(unsigned long bio_flags)
{
return bio_flags >> EXTENT_BIO_FLAG_SHIFT;
}
struct extent_map_tree;
typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
@ -178,11 +149,7 @@ typedef struct extent_map *(get_extent_t)(struct btrfs_inode *inode,
int try_release_extent_mapping(struct page *page, gfp_t mask);
int try_release_extent_buffer(struct page *page);
int __must_check submit_one_bio(struct bio *bio, int mirror_num,
unsigned long bio_flags);
int btrfs_do_readpage(struct page *page, struct extent_map **em_cached,
struct btrfs_bio_ctrl *bio_ctrl,
unsigned int read_flags, u64 *prev_em_start);
int btrfs_readpage(struct file *file, struct page *page);
int extent_write_full_page(struct page *page, struct writeback_control *wbc);
int extent_write_locked_range(struct inode *inode, u64 start, u64 end);
int extent_writepages(struct address_space *mapping,
@ -277,8 +244,10 @@ void extent_range_redirty_for_io(struct inode *inode, u64 start, u64 end);
void extent_clear_unlock_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
struct page *locked_page,
u32 bits_to_clear, unsigned long page_ops);
int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array);
struct bio *btrfs_bio_alloc(unsigned int nr_iovecs);
struct bio *btrfs_bio_clone(struct bio *bio);
struct bio *btrfs_bio_clone(struct block_device *bdev, struct bio *bio);
struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size);
void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
@ -297,7 +266,7 @@ struct io_failure_record {
u64 start;
u64 len;
u64 logical;
unsigned long bio_flags;
enum btrfs_compression_type compress_type;
int this_mirror;
int failed_mirror;
};


@ -1460,8 +1460,27 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
return ret;
}
static int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
size_t *write_bytes, bool nowait)
/*
* Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
*
* @pos: File offset.
* @write_bytes: The length to write, will be updated to the nocow writeable
* range.
*
* This function will flush ordered extents in the range to ensure proper
* nocow checks.
*
* Return:
* > 0 If we can nocow, and updates @write_bytes.
* 0 If we can't do a nocow write.
* -EAGAIN If we can't do a nocow write because snapshotting of the inode's
* root is in progress.
* < 0 If an error happened.
*
* NOTE: Callers need to call btrfs_check_nocow_unlock() if we return > 0.
*/
int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
size_t *write_bytes)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_root *root = inode->root;
@ -1472,7 +1491,7 @@ static int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
if (!(inode->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
return 0;
if (!nowait && !btrfs_drew_try_write_lock(&root->snapshot_lock))
if (!btrfs_drew_try_write_lock(&root->snapshot_lock))
return -EAGAIN;
lockstart = round_down(pos, fs_info->sectorsize);
@ -1480,71 +1499,21 @@ static int check_can_nocow(struct btrfs_inode *inode, loff_t pos,
fs_info->sectorsize) - 1;
num_bytes = lockend - lockstart + 1;
if (nowait) {
struct btrfs_ordered_extent *ordered;
if (!try_lock_extent(&inode->io_tree, lockstart, lockend))
return -EAGAIN;
ordered = btrfs_lookup_ordered_range(inode, lockstart,
num_bytes);
if (ordered) {
btrfs_put_ordered_extent(ordered);
ret = -EAGAIN;
goto out_unlock;
}
} else {
btrfs_lock_and_flush_ordered_range(inode, lockstart,
lockend, NULL);
}
btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, NULL);
ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
NULL, NULL, NULL, false);
if (ret <= 0) {
ret = 0;
if (!nowait)
btrfs_drew_write_unlock(&root->snapshot_lock);
btrfs_drew_write_unlock(&root->snapshot_lock);
} else {
*write_bytes = min_t(size_t, *write_bytes ,
num_bytes - pos + lockstart);
}
out_unlock:
unlock_extent(&inode->io_tree, lockstart, lockend);
return ret;
}
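
A short sketch of how a caller is expected to use the contract documented above: only a return value greater than zero grants a (possibly trimmed) nocow range, and that case must be paired with btrfs_check_nocow_unlock(). The demo caller below is hypothetical, not code from this series.

static ssize_t demo_try_nocow_write(struct btrfs_inode *inode, loff_t pos,
				    size_t len)
{
	size_t write_bytes = len;
	int ret;

	ret = btrfs_check_nocow_lock(inode, pos, &write_bytes);
	if (ret <= 0)
		return ret;	/* 0: must COW; < 0: -EAGAIN or other error */

	/*
	 * Only the first @write_bytes bytes (possibly trimmed down from @len)
	 * may be written without COW.
	 * ... do the actual nocow write here ...
	 */

	btrfs_check_nocow_unlock(inode);	/* required in the > 0 case */
	return write_bytes;
}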
static int check_nocow_nolock(struct btrfs_inode *inode, loff_t pos,
size_t *write_bytes)
{
return check_can_nocow(inode, pos, write_bytes, true);
}
/*
* Check if we can do nocow write into the range [@pos, @pos + @write_bytes)
*
* @pos: File offset
* @write_bytes: The length to write, will be updated to the nocow writeable
* range
*
* This function will flush ordered extents in the range to ensure proper
* nocow checks.
*
* Return:
* >0 and update @write_bytes if we can do nocow write
* 0 if we can't do nocow write
* -EAGAIN if we can't get the needed lock or there are ordered extents
* for * (nowait == true) case
* <0 if other error happened
*
* NOTE: Callers need to release the lock by btrfs_check_nocow_unlock().
*/
int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
size_t *write_bytes)
{
return check_can_nocow(inode, pos, write_bytes, false);
}
void btrfs_check_nocow_unlock(struct btrfs_inode *inode)
{
btrfs_drew_write_unlock(&inode->root->snapshot_lock);
@ -1579,20 +1548,15 @@ static int btrfs_write_check(struct kiocb *iocb, struct iov_iter *from,
loff_t oldsize;
loff_t start_pos;
if (iocb->ki_flags & IOCB_NOWAIT) {
size_t nocow_bytes = count;
/* We will allocate space in case nodatacow is not set, so bail */
if (check_nocow_nolock(BTRFS_I(inode), pos, &nocow_bytes) <= 0)
return -EAGAIN;
/*
* There are holes in the range or parts of the range that must
* be COWed (shared extents, RO block groups, etc), so just bail
* out.
*/
if (nocow_bytes < count)
return -EAGAIN;
}
/*
* Quickly bail out on NOWAIT writes if we don't have the nodatacow or
* prealloc flags, as without those flags we always have to COW. We will
* later check if we can really COW into the target range (using
* can_nocow_extent() at btrfs_get_blocks_direct_write()).
*/
if ((iocb->ki_flags & IOCB_NOWAIT) &&
!(BTRFS_I(inode)->flags & (BTRFS_INODE_NODATACOW | BTRFS_INODE_PREALLOC)))
return -EAGAIN;
current->backing_dev_info = inode_to_bdi(inode);
ret = file_remove_privs(file);
@ -1720,7 +1684,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
WARN_ON(reserve_bytes == 0);
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
reserve_bytes,
reserve_bytes);
reserve_bytes, false);
if (ret) {
if (!only_release_metadata)
btrfs_free_reserved_data_space(BTRFS_I(inode),
@ -1965,8 +1929,7 @@ relock:
*/
again:
from->nofault = true;
err = iomap_dio_rw(iocb, from, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
IOMAP_DIO_PARTIAL, written);
err = btrfs_dio_rw(iocb, from, written);
from->nofault = false;
/* No increment (+=) because iomap returns a cumulative value. */
@ -2570,10 +2533,10 @@ static int find_first_non_hole(struct btrfs_inode *inode, u64 *start, u64 *len)
return ret;
}
static int btrfs_punch_hole_lock_range(struct inode *inode,
const u64 lockstart,
const u64 lockend,
struct extent_state **cached_state)
static void btrfs_punch_hole_lock_range(struct inode *inode,
const u64 lockstart,
const u64 lockend,
struct extent_state **cached_state)
{
/*
* For subpage case, if the range is not at page boundary, we could
@ -2587,40 +2550,29 @@ static int btrfs_punch_hole_lock_range(struct inode *inode,
const u64 page_lockend = round_down(lockend + 1, PAGE_SIZE) - 1;
while (1) {
struct btrfs_ordered_extent *ordered;
int ret;
truncate_pagecache_range(inode, lockstart, lockend);
lock_extent_bits(&BTRFS_I(inode)->io_tree, lockstart, lockend,
cached_state);
ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
lockend);
/*
* We need to make sure we have no ordered extents in this range
* and nobody raced in and read a page in this range, if we did
* we need to try again.
* We can't have ordered extents in the range, nor dirty/writeback
* pages, because we have locked the inode's VFS lock in exclusive
* mode, we have locked the inode's i_mmap_lock in exclusive mode,
* we have flushed all delalloc in the range and we have waited
* for any ordered extents in the range to complete.
* We can race with anyone reading pages from this range, so after
* locking the range check if we have pages in the range, and if
* we do, unlock the range and retry.
*/
if ((!ordered ||
(ordered->file_offset + ordered->num_bytes <= lockstart ||
ordered->file_offset > lockend)) &&
!filemap_range_has_page(inode->i_mapping,
page_lockstart, page_lockend)) {
if (ordered)
btrfs_put_ordered_extent(ordered);
if (!filemap_range_has_page(inode->i_mapping, page_lockstart,
page_lockend))
break;
}
if (ordered)
btrfs_put_ordered_extent(ordered);
unlock_extent_cached(&BTRFS_I(inode)->io_tree, lockstart,
lockend, cached_state);
ret = btrfs_wait_ordered_range(inode, lockstart,
lockend - lockstart + 1);
if (ret)
return ret;
}
return 0;
btrfs_assert_inode_range_clean(BTRFS_I(inode), lockstart, lockend);
}
static int btrfs_insert_replace_extent(struct btrfs_trans_handle *trans,
@ -2976,11 +2928,12 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
bool truncated_block = false;
bool updated_inode = false;
btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
ret = btrfs_wait_ordered_range(inode, offset, len);
if (ret)
return ret;
goto out_only_mutex;
btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
ino_size = round_up(inode->i_size, fs_info->sectorsize);
ret = find_first_non_hole(BTRFS_I(inode), &offset, &len);
if (ret < 0)
@ -3072,10 +3025,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
goto out_only_mutex;
}
ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
&cached_state);
if (ret)
goto out_only_mutex;
btrfs_punch_hole_lock_range(inode, lockstart, lockend, &cached_state);
path = btrfs_alloc_path();
if (!path) {
@ -3237,8 +3187,6 @@ static int btrfs_zero_range(struct inode *inode,
u64 bytes_to_reserve = 0;
bool space_reserved = false;
inode_dio_wait(inode);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, alloc_start,
alloc_end - alloc_start);
if (IS_ERR(em)) {
@ -3368,10 +3316,8 @@ reserve_space:
if (ret < 0)
goto out;
space_reserved = true;
ret = btrfs_punch_hole_lock_range(inode, lockstart, lockend,
&cached_state);
if (ret)
goto out;
btrfs_punch_hole_lock_range(inode, lockstart, lockend,
&cached_state);
ret = btrfs_qgroup_reserve_data(BTRFS_I(inode), &data_reserved,
alloc_start, bytes_to_reserve);
if (ret) {
@ -3417,6 +3363,9 @@ static long btrfs_fallocate(struct file *file, int mode,
u64 alloc_hint = 0;
u64 locked_end;
u64 actual_end = 0;
u64 data_space_needed = 0;
u64 data_space_reserved = 0;
u64 qgroup_reserved = 0;
struct extent_map *em;
int blocksize = btrfs_inode_sectorsize(BTRFS_I(inode));
int ret;
@ -3437,18 +3386,6 @@ static long btrfs_fallocate(struct file *file, int mode,
if (mode & FALLOC_FL_PUNCH_HOLE)
return btrfs_punch_hole(file, offset, len);
/*
* Only trigger disk allocation, don't trigger qgroup reserve
*
* For qgroup space, it will be checked later.
*/
if (!(mode & FALLOC_FL_ZERO_RANGE)) {
ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
alloc_end - alloc_start);
if (ret < 0)
return ret;
}
btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
@ -3485,8 +3422,12 @@ static long btrfs_fallocate(struct file *file, int mode,
}
/*
* wait for ordered IO before we have any locks. We'll loop again
* below with the locks held.
* We have locked the inode at the VFS level (in exclusive mode) and we
* have locked the i_mmap_lock (in exclusive mode). Now before
* locking the file range, flush all delalloc in the range and wait for
* all ordered extents in the range to complete. After this we can lock
* the file range and, due to the previous locking we did, we know there
* can't be more delalloc or ordered extents in the range.
*/
ret = btrfs_wait_ordered_range(inode, alloc_start,
alloc_end - alloc_start);
@ -3500,38 +3441,10 @@ static long btrfs_fallocate(struct file *file, int mode,
}
locked_end = alloc_end - 1;
while (1) {
struct btrfs_ordered_extent *ordered;
lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
&cached_state);
/* the extent lock is ordered inside the running
* transaction
*/
lock_extent_bits(&BTRFS_I(inode)->io_tree, alloc_start,
locked_end, &cached_state);
ordered = btrfs_lookup_first_ordered_extent(BTRFS_I(inode),
locked_end);
if (ordered &&
ordered->file_offset + ordered->num_bytes > alloc_start &&
ordered->file_offset < alloc_end) {
btrfs_put_ordered_extent(ordered);
unlock_extent_cached(&BTRFS_I(inode)->io_tree,
alloc_start, locked_end,
&cached_state);
/*
* we can't wait on the range with the transaction
* running or with the extent lock held
*/
ret = btrfs_wait_ordered_range(inode, alloc_start,
alloc_end - alloc_start);
if (ret)
goto out;
} else {
if (ordered)
btrfs_put_ordered_extent(ordered);
break;
}
}
btrfs_assert_inode_range_clean(BTRFS_I(inode), alloc_start, locked_end);
/* First, check if we exceed the qgroup limit */
INIT_LIST_HEAD(&reserve_list);
@ -3548,48 +3461,64 @@ static long btrfs_fallocate(struct file *file, int mode,
if (em->block_start == EXTENT_MAP_HOLE ||
(cur_offset >= inode->i_size &&
!test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
ret = add_falloc_range(&reserve_list, cur_offset,
last_byte - cur_offset);
const u64 range_len = last_byte - cur_offset;
ret = add_falloc_range(&reserve_list, cur_offset, range_len);
if (ret < 0) {
free_extent_map(em);
break;
}
ret = btrfs_qgroup_reserve_data(BTRFS_I(inode),
&data_reserved, cur_offset,
last_byte - cur_offset);
&data_reserved, cur_offset, range_len);
if (ret < 0) {
cur_offset = last_byte;
free_extent_map(em);
break;
}
} else {
/*
* Do not need to reserve unwritten extent for this
* range, free reserved data space first, otherwise
* it'll result in false ENOSPC error.
*/
btrfs_free_reserved_data_space(BTRFS_I(inode),
data_reserved, cur_offset,
last_byte - cur_offset);
qgroup_reserved += range_len;
data_space_needed += range_len;
}
free_extent_map(em);
cur_offset = last_byte;
}
if (!ret && data_space_needed > 0) {
/*
* We are safe to reserve space here as we can't have delalloc
* in the range, see above.
*/
ret = btrfs_alloc_data_chunk_ondemand(BTRFS_I(inode),
data_space_needed);
if (!ret)
data_space_reserved = data_space_needed;
}
/*
* If ret is still 0, means we're OK to fallocate.
* Or just cleanup the list and exit.
*/
list_for_each_entry_safe(range, tmp, &reserve_list, list) {
if (!ret)
if (!ret) {
ret = btrfs_prealloc_file_range(inode, mode,
range->start,
range->len, i_blocksize(inode),
offset + len, &alloc_hint);
else
/*
* btrfs_prealloc_file_range() releases space even
* if it returns an error.
*/
data_space_reserved -= range->len;
qgroup_reserved -= range->len;
} else if (data_space_reserved > 0) {
btrfs_free_reserved_data_space(BTRFS_I(inode),
data_reserved, range->start,
range->len);
data_reserved, range->start,
range->len);
data_space_reserved -= range->len;
qgroup_reserved -= range->len;
} else if (qgroup_reserved > 0) {
btrfs_qgroup_free_data(BTRFS_I(inode), data_reserved,
range->start, range->len);
qgroup_reserved -= range->len;
}
list_del(&range->list);
kfree(range);
}
@ -3606,10 +3535,6 @@ out_unlock:
&cached_state);
out:
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
/* Let go of our reservation. */
if (ret != 0 && !(mode & FALLOC_FL_ZERO_RANGE))
btrfs_free_reserved_data_space(BTRFS_I(inode), data_reserved,
cur_offset, alloc_end - cur_offset);
extent_changeset_free(data_reserved);
return ret;
}
@ -3767,8 +3692,7 @@ again:
*/
pagefault_disable();
to->nofault = true;
ret = iomap_dio_rw(iocb, to, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
IOMAP_DIO_PARTIAL, read);
ret = btrfs_dio_rw(iocb, to, read);
to->nofault = false;
pagefault_enable();


@ -2630,16 +2630,19 @@ out:
static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
u64 bytenr, u64 size, bool used)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct btrfs_space_info *sinfo = block_group->space_info;
struct btrfs_free_space_ctl *ctl = block_group->free_space_ctl;
u64 offset = bytenr - block_group->start;
u64 to_free, to_unusable;
const int bg_reclaim_threshold = READ_ONCE(fs_info->bg_reclaim_threshold);
int bg_reclaim_threshold = 0;
bool initial = (size == block_group->length);
u64 reclaimable_unusable;
WARN_ON(!initial && offset + size > block_group->zone_capacity);
if (!initial)
bg_reclaim_threshold = READ_ONCE(sinfo->bg_reclaim_threshold);
spin_lock(&ctl->tree_lock);
if (!used)
to_free = size;
@ -4069,7 +4072,7 @@ static int cleanup_free_space_cache_v1(struct btrfs_fs_info *fs_info,
btrfs_info(fs_info, "cleaning free space cache v1");
node = rb_first(&fs_info->block_group_cache_tree);
node = rb_first_cached(&fs_info->block_group_cache_tree);
while (node) {
block_group = rb_entry(node, struct btrfs_block_group, cache_node);
ret = btrfs_remove_free_space_inode(trans, NULL, block_group);


@ -1178,7 +1178,7 @@ int btrfs_create_free_space_tree(struct btrfs_fs_info *fs_info)
goto abort;
}
node = rb_first(&fs_info->block_group_cache_tree);
node = rb_first_cached(&fs_info->block_group_cache_tree);
while (node) {
block_group = rb_entry(node, struct btrfs_block_group,
cache_node);

File diff suppressed because it is too large.


@ -540,9 +540,35 @@ int __pure btrfs_is_empty_uuid(u8 *uuid)
return 1;
}
/*
* Calculate the number of transaction items to reserve for creating a subvolume
* or snapshot, not including the inode, directory entries, or parent directory.
*/
static unsigned int create_subvol_num_items(struct btrfs_qgroup_inherit *inherit)
{
/*
* 1 to add root block
* 1 to add root item
* 1 to add root ref
* 1 to add root backref
* 1 to add UUID item
* 1 to add qgroup info
* 1 to add qgroup limit
*
* Ideally the last two would only be accounted if qgroups are enabled,
* but that can change between now and the time we would insert them.
*/
unsigned int num_items = 7;
if (inherit) {
/* 2 to add qgroup relations for each inherited qgroup */
num_items += 2 * inherit->num_qgroups;
}
return num_items;
}
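
As a concrete reading of the helper above: creating a subvolume that inherits two qgroups reserves 7 + 2 * 2 = 11 items here, and create_subvol() below adds that to the item count returned by btrfs_new_inode_prepare() before sizing the temporary block reservation.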
static noinline int create_subvol(struct user_namespace *mnt_userns,
struct inode *dir, struct dentry *dentry,
const char *name, int namelen,
struct btrfs_qgroup_inherit *inherit)
{
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
@ -555,11 +581,15 @@ static noinline int create_subvol(struct user_namespace *mnt_userns,
struct btrfs_root *new_root;
struct btrfs_block_rsv block_rsv;
struct timespec64 cur_time = current_time(dir);
struct inode *inode;
struct btrfs_new_inode_args new_inode_args = {
.dir = dir,
.dentry = dentry,
.subvol = true,
};
unsigned int trans_num_items;
int ret;
dev_t anon_dev = 0;
dev_t anon_dev;
u64 objectid;
u64 index = 0;
root_item = kzalloc(sizeof(*root_item), GFP_KERNEL);
if (!root_item)
@ -567,11 +597,7 @@ static noinline int create_subvol(struct user_namespace *mnt_userns,
ret = btrfs_get_free_objectid(fs_info->tree_root, &objectid);
if (ret)
goto fail_free;
ret = get_anon_bdev(&anon_dev);
if (ret < 0)
goto fail_free;
goto out_root_item;
/*
* Don't create subvolume whose level is not zero. Or qgroup will be
@ -579,36 +605,47 @@ static noinline int create_subvol(struct user_namespace *mnt_userns,
*/
if (btrfs_qgroup_level(objectid)) {
ret = -ENOSPC;
goto fail_free;
goto out_root_item;
}
btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
/*
* The same as the snapshot creation, please see the comment
* of create_snapshot().
*/
ret = btrfs_subvolume_reserve_metadata(root, &block_rsv, 8, false);
ret = get_anon_bdev(&anon_dev);
if (ret < 0)
goto out_root_item;
new_inode_args.inode = btrfs_new_subvol_inode(mnt_userns, dir);
if (!new_inode_args.inode) {
ret = -ENOMEM;
goto out_anon_dev;
}
ret = btrfs_new_inode_prepare(&new_inode_args, &trans_num_items);
if (ret)
goto fail_free;
goto out_inode;
trans_num_items += create_subvol_num_items(inherit);
btrfs_init_block_rsv(&block_rsv, BTRFS_BLOCK_RSV_TEMP);
ret = btrfs_subvolume_reserve_metadata(root, &block_rsv,
trans_num_items, false);
if (ret)
goto out_new_inode_args;
trans = btrfs_start_transaction(root, 0);
if (IS_ERR(trans)) {
ret = PTR_ERR(trans);
btrfs_subvolume_release_metadata(root, &block_rsv);
goto fail_free;
goto out_new_inode_args;
}
trans->block_rsv = &block_rsv;
trans->bytes_reserved = block_rsv.size;
ret = btrfs_qgroup_inherit(trans, 0, objectid, inherit);
if (ret)
goto fail;
goto out;
leaf = btrfs_alloc_tree_block(trans, root, 0, objectid, NULL, 0, 0, 0,
BTRFS_NESTING_NORMAL);
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
goto fail;
goto out;
}
btrfs_mark_buffer_dirty(leaf);
@ -663,75 +700,46 @@ static noinline int create_subvol(struct user_namespace *mnt_userns,
btrfs_tree_unlock(leaf);
btrfs_free_tree_block(trans, objectid, leaf, 0, 1);
free_extent_buffer(leaf);
goto fail;
goto out;
}
free_extent_buffer(leaf);
leaf = NULL;
key.offset = (u64)-1;
new_root = btrfs_get_new_fs_root(fs_info, objectid, anon_dev);
if (IS_ERR(new_root)) {
free_anon_bdev(anon_dev);
ret = PTR_ERR(new_root);
btrfs_abort_transaction(trans, ret);
goto fail;
goto out;
}
/* Freeing will be done in btrfs_put_root() of new_root */
/* anon_dev is owned by new_root now. */
anon_dev = 0;
BTRFS_I(new_inode_args.inode)->root = new_root;
/* ... and new_root is owned by new_inode_args.inode now. */
ret = btrfs_record_root_in_trans(trans, new_root);
if (ret) {
btrfs_put_root(new_root);
btrfs_abort_transaction(trans, ret);
goto fail;
}
ret = btrfs_create_subvol_root(trans, new_root, root, mnt_userns);
btrfs_put_root(new_root);
if (ret) {
/* We potentially lose an unused inode item here */
btrfs_abort_transaction(trans, ret);
goto fail;
}
/*
* insert the directory item
*/
ret = btrfs_set_inode_index(BTRFS_I(dir), &index);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
ret = btrfs_insert_dir_item(trans, name, namelen, BTRFS_I(dir), &key,
BTRFS_FT_DIR, index);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
btrfs_i_size_write(BTRFS_I(dir), dir->i_size + namelen * 2);
ret = btrfs_update_inode(trans, root, BTRFS_I(dir));
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
}
ret = btrfs_add_root_ref(trans, objectid, root->root_key.objectid,
btrfs_ino(BTRFS_I(dir)), index, name, namelen);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
goto out;
}
ret = btrfs_uuid_tree_add(trans, root_item->uuid,
BTRFS_UUID_KEY_SUBVOL, objectid);
if (ret)
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
fail:
kfree(root_item);
ret = btrfs_create_new_inode(trans, &new_inode_args);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto out;
}
d_instantiate_new(dentry, new_inode_args.inode);
new_inode_args.inode = NULL;
out:
trans->block_rsv = NULL;
trans->bytes_reserved = 0;
btrfs_subvolume_release_metadata(root, &block_rsv);
@ -740,18 +748,14 @@ fail:
btrfs_end_transaction(trans);
else
ret = btrfs_commit_transaction(trans);
if (!ret) {
inode = btrfs_lookup_dentry(dir, dentry);
if (IS_ERR(inode))
return PTR_ERR(inode);
d_instantiate(dentry, inode);
}
return ret;
fail_free:
out_new_inode_args:
btrfs_new_inode_args_destroy(&new_inode_args);
out_inode:
iput(new_inode_args.inode);
out_anon_dev:
if (anon_dev)
free_anon_bdev(anon_dev);
out_root_item:
kfree(root_item);
return ret;
}
@ -763,6 +767,7 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
struct btrfs_fs_info *fs_info = btrfs_sb(dir->i_sb);
struct inode *inode;
struct btrfs_pending_snapshot *pending_snapshot;
unsigned int trans_num_items;
struct btrfs_trans_handle *trans;
int ret;
@ -800,16 +805,14 @@ static int create_snapshot(struct btrfs_root *root, struct inode *dir,
btrfs_init_block_rsv(&pending_snapshot->block_rsv,
BTRFS_BLOCK_RSV_TEMP);
/*
* 1 - parent dir inode
* 2 - dir entries
* 1 - root item
* 2 - root ref/backref
* 1 - root of snapshot
* 1 - UUID item
* 1 to add dir item
* 1 to add dir index
* 1 to update parent inode item
*/
trans_num_items = create_subvol_num_items(inherit) + 3;
ret = btrfs_subvolume_reserve_metadata(BTRFS_I(dir)->root,
&pending_snapshot->block_rsv, 8,
false);
&pending_snapshot->block_rsv,
trans_num_items, false);
if (ret)
goto free_pending;
@ -979,7 +982,7 @@ static noinline int btrfs_mksubvol(const struct path *parent,
if (snap_src)
error = create_snapshot(snap_src, dir, dentry, readonly, inherit);
else
error = create_subvol(mnt_userns, dir, dentry, name, namelen, inherit);
error = create_subvol(mnt_userns, dir, dentry, inherit);
if (!error)
fsnotify_mkdir(dir, dentry);
@ -1413,8 +1416,19 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
if (!em)
break;
/* Skip hole/inline/preallocated extents */
if (em->block_start >= EXTENT_MAP_LAST_BYTE ||
/*
* If the file extent is an inlined one, we may still want to
* defrag it (fall through) if defragging it would produce a regular
* extent. This is for users who want to convert inline extents to
* regular ones through the max_inline= mount option.
*/
if (em->block_start == EXTENT_MAP_INLINE &&
em->len <= inode->root->fs_info->max_inline)
goto next;
/* Skip hole/delalloc/preallocated extents */
if (em->block_start == EXTENT_MAP_HOLE ||
em->block_start == EXTENT_MAP_DELALLOC ||
test_bit(EXTENT_FLAG_PREALLOC, &em->flags))
goto next;
@ -1473,6 +1487,15 @@ static int defrag_collect_targets(struct btrfs_inode *inode,
if (em->len >= get_extent_max_capacity(em))
goto next;
/*
* Normally there are no more extents after an inline one, thus
* @next_mergeable will normally be false and the inline extent would
* not get defragged. So if an inline extent passed all the above
* checks, just add it for defrag so it can be converted to a regular
* extent.
*/
if (em->block_start == EXTENT_MAP_INLINE)
goto add;
next_mergeable = defrag_check_next_extent(&inode->vfs_inode, em,
extent_thresh, newer_than, locked);
if (!next_mergeable) {
@ -2594,7 +2617,7 @@ err:
static noinline int btrfs_ioctl_tree_search(struct inode *inode,
void __user *argp)
{
struct btrfs_ioctl_search_args __user *uargs;
struct btrfs_ioctl_search_args __user *uargs = argp;
struct btrfs_ioctl_search_key sk;
int ret;
size_t buf_size;
@ -2602,8 +2625,6 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode,
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
uargs = (struct btrfs_ioctl_search_args __user *)argp;
if (copy_from_user(&sk, &uargs->key, sizeof(sk)))
return -EFAULT;
@ -2626,7 +2647,7 @@ static noinline int btrfs_ioctl_tree_search(struct inode *inode,
static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode,
void __user *argp)
{
struct btrfs_ioctl_search_args_v2 __user *uarg;
struct btrfs_ioctl_search_args_v2 __user *uarg = argp;
struct btrfs_ioctl_search_args_v2 args;
int ret;
size_t buf_size;
@ -2636,7 +2657,6 @@ static noinline int btrfs_ioctl_tree_search_v2(struct inode *inode,
return -EPERM;
/* copy search header and buffer size */
uarg = (struct btrfs_ioctl_search_args_v2 __user *)argp;
if (copy_from_user(&args, uarg, sizeof(args)))
return -EFAULT;
@ -4344,10 +4364,6 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
bool need_unlock; /* for mut. excl. ops lock */
int ret;
if (!arg)
btrfs_warn(fs_info,
"IOC_BALANCE ioctl (v1) is deprecated and will be removed in kernel 5.18");
if (!capable(CAP_SYS_ADMIN))
return -EPERM;
@ -4355,6 +4371,13 @@ static long btrfs_ioctl_balance(struct file *file, void __user *arg)
if (ret)
return ret;
bargs = memdup_user(arg, sizeof(*bargs));
if (IS_ERR(bargs)) {
ret = PTR_ERR(bargs);
bargs = NULL;
goto out;
}
again:
if (btrfs_exclop_start(fs_info, BTRFS_EXCLOP_BALANCE)) {
mutex_lock(&fs_info->balance_mutex);
@ -4402,59 +4425,42 @@ again:
}
locked:
if (arg) {
bargs = memdup_user(arg, sizeof(*bargs));
if (IS_ERR(bargs)) {
ret = PTR_ERR(bargs);
if (bargs->flags & BTRFS_BALANCE_RESUME) {
if (!fs_info->balance_ctl) {
ret = -ENOTCONN;
goto out_unlock;
}
if (bargs->flags & BTRFS_BALANCE_RESUME) {
if (!fs_info->balance_ctl) {
ret = -ENOTCONN;
goto out_bargs;
}
bctl = fs_info->balance_ctl;
spin_lock(&fs_info->balance_lock);
bctl->flags |= BTRFS_BALANCE_RESUME;
spin_unlock(&fs_info->balance_lock);
btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE);
bctl = fs_info->balance_ctl;
spin_lock(&fs_info->balance_lock);
bctl->flags |= BTRFS_BALANCE_RESUME;
spin_unlock(&fs_info->balance_lock);
btrfs_exclop_balance(fs_info, BTRFS_EXCLOP_BALANCE);
goto do_balance;
}
goto do_balance;
}
} else {
bargs = NULL;
if (bargs->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) {
ret = -EINVAL;
goto out_unlock;
}
if (fs_info->balance_ctl) {
ret = -EINPROGRESS;
goto out_bargs;
goto out_unlock;
}
bctl = kzalloc(sizeof(*bctl), GFP_KERNEL);
if (!bctl) {
ret = -ENOMEM;
goto out_bargs;
goto out_unlock;
}
if (arg) {
memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
bctl->flags = bargs->flags;
} else {
/* balance everything - no filters */
bctl->flags |= BTRFS_BALANCE_TYPE_MASK;
}
if (bctl->flags & ~(BTRFS_BALANCE_ARGS_MASK | BTRFS_BALANCE_TYPE_MASK)) {
ret = -EINVAL;
goto out_bctl;
}
memcpy(&bctl->data, &bargs->data, sizeof(bctl->data));
memcpy(&bctl->meta, &bargs->meta, sizeof(bctl->meta));
memcpy(&bctl->sys, &bargs->sys, sizeof(bctl->sys));
bctl->flags = bargs->flags;
do_balance:
/*
* Ownership of bctl and exclusive operation goes to btrfs_balance.
@ -4467,21 +4473,19 @@ do_balance:
ret = btrfs_balance(fs_info, bctl, bargs);
bctl = NULL;
if ((ret == 0 || ret == -ECANCELED) && arg) {
if (ret == 0 || ret == -ECANCELED) {
if (copy_to_user(arg, bargs, sizeof(*bargs)))
ret = -EFAULT;
}
out_bctl:
kfree(bctl);
out_bargs:
kfree(bargs);
out_unlock:
mutex_unlock(&fs_info->balance_mutex);
if (need_unlock)
btrfs_exclop_finish(fs_info);
out:
mnt_drop_write_file(file);
kfree(bargs);
return ret;
}


@ -380,9 +380,8 @@ static struct prop_handler prop_handlers[] = {
},
};
static int inherit_props(struct btrfs_trans_handle *trans,
struct inode *inode,
struct inode *parent)
int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
struct inode *inode, struct inode *parent)
{
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_fs_info *fs_info = root->fs_info;
@ -457,41 +456,6 @@ static int inherit_props(struct btrfs_trans_handle *trans,
return 0;
}
int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
struct inode *inode,
struct inode *dir)
{
if (!dir)
return 0;
return inherit_props(trans, inode, dir);
}
int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_root *parent_root)
{
struct super_block *sb = root->fs_info->sb;
struct inode *parent_inode, *child_inode;
int ret;
parent_inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, parent_root);
if (IS_ERR(parent_inode))
return PTR_ERR(parent_inode);
child_inode = btrfs_iget(sb, BTRFS_FIRST_FREE_OBJECTID, root);
if (IS_ERR(child_inode)) {
iput(parent_inode);
return PTR_ERR(child_inode);
}
ret = inherit_props(trans, child_inode, parent_inode);
iput(child_inode);
iput(parent_inode);
return ret;
}
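With btrfs_subvol_inherit_props() gone and inherit_props() exported as btrfs_inode_inherit_props(), new subvolumes presumably inherit properties through the same inode-creation path as regular inodes; a hedged usage sketch (the caller context is an assumption, not lifted from this diff):

	ret = btrfs_inode_inherit_props(trans, inode, dir);
	if (ret)
		btrfs_err(BTRFS_I(inode)->root->fs_info,
			  "error inheriting props for ino %llu: %d",
			  btrfs_ino(BTRFS_I(inode)), ret);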
void __init btrfs_props_init(void)
{
int i;


@ -23,8 +23,4 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
struct inode *inode,
struct inode *dir);
int btrfs_subvol_inherit_props(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_root *parent_root);
#endif


@ -2290,7 +2290,7 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
return 0;
if (!extent_buffer_uptodate(root_eb)) {
ret = btrfs_read_buffer(root_eb, root_gen, root_level, NULL);
ret = btrfs_read_extent_buffer(root_eb, root_gen, root_level, NULL);
if (ret)
goto out;
}
@ -3939,12 +3939,13 @@ int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
}
int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
enum btrfs_qgroup_rsv_type type, bool enforce)
enum btrfs_qgroup_rsv_type type, bool enforce,
bool noflush)
{
int ret;
ret = btrfs_qgroup_reserve_meta(root, num_bytes, type, enforce);
if (ret <= 0 && ret != -EDQUOT)
if ((ret <= 0 && ret != -EDQUOT) || noflush)
return ret;
ret = try_flush_qgroup(root);


@ -364,19 +364,23 @@ int btrfs_qgroup_free_data(struct btrfs_inode *inode,
int btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
enum btrfs_qgroup_rsv_type type, bool enforce);
int __btrfs_qgroup_reserve_meta(struct btrfs_root *root, int num_bytes,
enum btrfs_qgroup_rsv_type type, bool enforce);
enum btrfs_qgroup_rsv_type type, bool enforce,
bool noflush);
/* Reserve metadata space for pertrans and prealloc type */
static inline int btrfs_qgroup_reserve_meta_pertrans(struct btrfs_root *root,
int num_bytes, bool enforce)
{
return __btrfs_qgroup_reserve_meta(root, num_bytes,
BTRFS_QGROUP_RSV_META_PERTRANS, enforce);
BTRFS_QGROUP_RSV_META_PERTRANS,
enforce, false);
}
static inline int btrfs_qgroup_reserve_meta_prealloc(struct btrfs_root *root,
int num_bytes, bool enforce)
int num_bytes, bool enforce,
bool noflush)
{
return __btrfs_qgroup_reserve_meta(root, num_bytes,
BTRFS_QGROUP_RSV_META_PREALLOC, enforce);
BTRFS_QGROUP_RSV_META_PREALLOC,
enforce, noflush);
}
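For illustration, a hedged sketch of how the new noflush argument would be used by a caller that must not block (the -EAGAIN translation is an assumption, not taken from this series):

	/* Try to reserve without triggering a qgroup flush. */
	ret = btrfs_qgroup_reserve_meta_prealloc(root, num_bytes, true, true);
	if (ret == -EDQUOT)
		return -EAGAIN;	/* let the caller retry via a blocking path */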
void __btrfs_qgroup_free_meta(struct btrfs_root *root, int num_bytes,

File diff suppressed because it is too large


@ -31,15 +31,14 @@ struct btrfs_raid_bio;
struct btrfs_device;
int raid56_parity_recover(struct bio *bio, struct btrfs_io_context *bioc,
u64 stripe_len, int mirror_num, int generic_io);
int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc,
u64 stripe_len);
u32 stripe_len, int mirror_num, int generic_io);
int raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc, u32 stripe_len);
void raid56_add_scrub_pages(struct btrfs_raid_bio *rbio, struct page *page,
u64 logical);
unsigned int pgoff, u64 logical);
struct btrfs_raid_bio *raid56_parity_alloc_scrub_rbio(struct bio *bio,
struct btrfs_io_context *bioc, u64 stripe_len,
struct btrfs_io_context *bioc, u32 stripe_len,
struct btrfs_device *scrub_dev,
unsigned long *dbitmap, int stripe_nsectors);
void raid56_parity_submit_scrub_rbio(struct btrfs_raid_bio *rbio);


@ -614,14 +614,23 @@ static void btrfs_double_extent_unlock(struct inode *inode1, u64 loff1,
static void btrfs_double_extent_lock(struct inode *inode1, u64 loff1,
struct inode *inode2, u64 loff2, u64 len)
{
u64 range1_end = loff1 + len - 1;
u64 range2_end = loff2 + len - 1;
if (inode1 < inode2) {
swap(inode1, inode2);
swap(loff1, loff2);
swap(range1_end, range2_end);
} else if (inode1 == inode2 && loff2 < loff1) {
swap(loff1, loff2);
swap(range1_end, range2_end);
}
lock_extent(&BTRFS_I(inode1)->io_tree, loff1, loff1 + len - 1);
lock_extent(&BTRFS_I(inode2)->io_tree, loff2, loff2 + len - 1);
lock_extent(&BTRFS_I(inode1)->io_tree, loff1, range1_end);
lock_extent(&BTRFS_I(inode2)->io_tree, loff2, range2_end);
btrfs_assert_inode_range_clean(BTRFS_I(inode1), loff1, range1_end);
btrfs_assert_inode_range_clean(BTRFS_I(inode2), loff2, range2_end);
}
static void btrfs_double_mmap_lock(struct inode *inode1, struct inode *inode2)
@ -771,7 +780,6 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
struct inode *inode_in = file_inode(file_in);
struct inode *inode_out = file_inode(file_out);
u64 bs = BTRFS_I(inode_out)->root->fs_info->sb->s_blocksize;
bool same_inode = inode_out == inode_in;
u64 wb_len;
int ret;
@ -809,15 +817,6 @@ static int btrfs_remap_file_range_prep(struct file *file_in, loff_t pos_in,
else
wb_len = ALIGN(*len, bs);
/*
* Since we don't lock ranges, wait for ongoing lockless dio writes (as
* any in progress could create its ordered extents after we wait for
* existing ordered extents below).
*/
inode_dio_wait(inode_in);
if (!same_inode)
inode_dio_wait(inode_out);
/*
* Workaround to make sure NOCOW buffered write reach disk as NOCOW.
*


@ -362,7 +362,7 @@ struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr)
rb_node = rb_simple_search(&rc->reloc_root_tree.rb_root, bytenr);
if (rb_node) {
node = rb_entry(rb_node, struct mapping_node, rb_node);
root = (struct btrfs_root *)node->data;
root = node->data;
}
spin_unlock(&rc->reloc_root_tree.lock);
return btrfs_grab_root(root);
@ -2997,7 +2997,8 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
/* Reserve metadata for this range */
ret = btrfs_delalloc_reserve_metadata(BTRFS_I(inode),
clamped_len, clamped_len);
clamped_len, clamped_len,
false);
if (ret)
goto release_page;
@ -3845,8 +3846,7 @@ out:
btrfs_end_transaction(trans);
btrfs_btree_balance_dirty(fs_info);
if (err) {
if (inode)
iput(inode);
iput(inode);
inode = ERR_PTR(err);
}
return inode;
@ -3977,6 +3977,17 @@ int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start)
if (!bg)
return -ENOENT;
/*
* Relocation of a data block group creates ordered extents. Without
* sb_start_write(), we can freeze the filesystem while unfinished
* ordered extents are left. Such ordered extents can cause a deadlock
* e.g. when syncfs() is waiting for their completion but they can't
* finish because they block when joining a transaction, due to the
* fact that the freeze locks are being held in write mode.
*/
if (bg->flags & BTRFS_BLOCK_GROUP_DATA)
ASSERT(sb_write_started(fs_info->sb));
if (btrfs_pinned_by_swapfile(fs_info, bg)) {
btrfs_put_block_group(bg);
return -ETXTBSY;
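A hedged sketch of the caller pattern that the sb_write_started() assertion above relies on (simplified; the real balance/relocation path takes this protection higher up):

	sb_start_write(fs_info->sb);
	ret = btrfs_relocate_block_group(fs_info, group_start);
	sb_end_write(fs_info->sb);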


@ -509,7 +509,8 @@ int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
/* One for parent inode, two for dir entries */
qgroup_num_bytes = 3 * fs_info->nodesize;
ret = btrfs_qgroup_reserve_meta_prealloc(root,
qgroup_num_bytes, true);
qgroup_num_bytes, true,
false);
if (ret)
return ret;
}

File diff suppressed because it is too large


@ -10,7 +10,6 @@
#include <linux/mount.h>
#include <linux/xattr.h>
#include <linux/posix_acl_xattr.h>
#include <linux/radix-tree.h>
#include <linux/vmalloc.h>
#include <linux/string.h>
#include <linux/compat.h>
@ -128,11 +127,18 @@ struct send_ctx {
struct list_head new_refs;
struct list_head deleted_refs;
struct radix_tree_root name_cache;
struct xarray name_cache;
struct list_head name_cache_list;
int name_cache_size;
/*
* The inode we are currently processing. It's not NULL only when we
* need to issue write commands for data extents from this inode.
*/
struct inode *cur_inode;
struct file_ra_state ra;
u64 page_cache_clear_start;
bool clean_page_cache;
/*
* We process inodes by their increasing order, so if before an
@ -262,14 +268,13 @@ struct orphan_dir_info {
struct name_cache_entry {
struct list_head list;
/*
* radix_tree has only 32bit entries but we need to handle 64bit inums.
* We use the lower 32bit of the 64bit inum to store it in the tree. If
* more then one inum would fall into the same entry, we use radix_list
* to store the additional entries. radix_list is also used to store
* entries where two entries have the same inum but different
* generations.
* On 32bit kernels, xarray has only 32bit indices, but we need to
* handle 64bit inums. We use the lower 32bit of the 64bit inum to store
* it in the tree. If more than one inum would fall into the same entry,
* we use inum_aliases to store the additional entries. inum_aliases is
* also used to store entries with the same inum but different generations.
*/
struct list_head radix_list;
struct list_head inum_aliases;
u64 ino;
u64 gen;
u64 parent_ino;
@ -2019,9 +2024,9 @@ out:
}
/*
* Insert a name cache entry. On 32bit kernels the radix tree index is 32bit,
* Insert a name cache entry. On 32bit kernels the xarray index is 32bit,
* so we need to do some special handling in case we have clashes. This function
* takes care of this with the help of name_cache_entry::radix_list.
* takes care of this with the help of name_cache_entry::inum_aliases.
* In case of error, nce is kfreed.
*/
static int name_cache_insert(struct send_ctx *sctx,
@ -2030,8 +2035,7 @@ static int name_cache_insert(struct send_ctx *sctx,
int ret = 0;
struct list_head *nce_head;
nce_head = radix_tree_lookup(&sctx->name_cache,
(unsigned long)nce->ino);
nce_head = xa_load(&sctx->name_cache, (unsigned long)nce->ino);
if (!nce_head) {
nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL);
if (!nce_head) {
@ -2040,14 +2044,14 @@ static int name_cache_insert(struct send_ctx *sctx,
}
INIT_LIST_HEAD(nce_head);
ret = radix_tree_insert(&sctx->name_cache, nce->ino, nce_head);
ret = xa_insert(&sctx->name_cache, nce->ino, nce_head, GFP_KERNEL);
if (ret < 0) {
kfree(nce_head);
kfree(nce);
return ret;
}
}
list_add_tail(&nce->radix_list, nce_head);
list_add_tail(&nce->inum_aliases, nce_head);
list_add_tail(&nce->list, &sctx->name_cache_list);
sctx->name_cache_size++;
@ -2059,15 +2063,14 @@ static void name_cache_delete(struct send_ctx *sctx,
{
struct list_head *nce_head;
nce_head = radix_tree_lookup(&sctx->name_cache,
(unsigned long)nce->ino);
nce_head = xa_load(&sctx->name_cache, (unsigned long)nce->ino);
if (!nce_head) {
btrfs_err(sctx->send_root->fs_info,
"name_cache_delete lookup failed ino %llu cache size %d, leaking memory",
nce->ino, sctx->name_cache_size);
}
list_del(&nce->radix_list);
list_del(&nce->inum_aliases);
list_del(&nce->list);
sctx->name_cache_size--;
@ -2075,7 +2078,7 @@ static void name_cache_delete(struct send_ctx *sctx,
* We may not get to the final release of nce_head if the lookup fails
*/
if (nce_head && list_empty(nce_head)) {
radix_tree_delete(&sctx->name_cache, (unsigned long)nce->ino);
xa_erase(&sctx->name_cache, (unsigned long)nce->ino);
kfree(nce_head);
}
}
@ -2086,11 +2089,11 @@ static struct name_cache_entry *name_cache_search(struct send_ctx *sctx,
struct list_head *nce_head;
struct name_cache_entry *cur;
nce_head = radix_tree_lookup(&sctx->name_cache, (unsigned long)ino);
nce_head = xa_load(&sctx->name_cache, (unsigned long)ino);
if (!nce_head)
return NULL;
list_for_each_entry(cur, nce_head, radix_list) {
list_for_each_entry(cur, nce_head, inum_aliases) {
if (cur->ino == ino && cur->gen == gen)
return cur;
}
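The name cache conversion above maps radix_tree_lookup(), radix_tree_insert() and radix_tree_delete() onto xa_load(), xa_insert() and xa_erase(). A minimal sketch of that pattern (error handling abbreviated, identifiers as used in this file):

	struct list_head *nce_head;

	nce_head = xa_load(&sctx->name_cache, (unsigned long)ino);
	if (!nce_head) {
		nce_head = kmalloc(sizeof(*nce_head), GFP_KERNEL);
		if (!nce_head)
			return -ENOMEM;
		INIT_LIST_HEAD(nce_head);
		/* xa_insert() returns -EBUSY if the index is already in use */
		ret = xa_insert(&sctx->name_cache, (unsigned long)ino, nce_head, GFP_KERNEL);
		if (ret < 0) {
			kfree(nce_head);
			return ret;
		}
	}
	/* ... and when the last alias goes away: */
	xa_erase(&sctx->name_cache, (unsigned long)ino);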
@ -2675,61 +2678,43 @@ out:
static int did_create_dir(struct send_ctx *sctx, u64 dir)
{
int ret = 0;
int iter_ret = 0;
struct btrfs_path *path = NULL;
struct btrfs_key key;
struct btrfs_key found_key;
struct btrfs_key di_key;
struct extent_buffer *eb;
struct btrfs_dir_item *di;
int slot;
path = alloc_path_for_send();
if (!path) {
ret = -ENOMEM;
goto out;
}
if (!path)
return -ENOMEM;
key.objectid = dir;
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, sctx->send_root, &key, path, 0, 0);
if (ret < 0)
goto out;
while (1) {
eb = path->nodes[0];
slot = path->slots[0];
if (slot >= btrfs_header_nritems(eb)) {
ret = btrfs_next_leaf(sctx->send_root, path);
if (ret < 0) {
goto out;
} else if (ret > 0) {
ret = 0;
break;
}
continue;
}
btrfs_for_each_slot(sctx->send_root, &key, &found_key, path, iter_ret) {
struct extent_buffer *eb = path->nodes[0];
btrfs_item_key_to_cpu(eb, &found_key, slot);
if (found_key.objectid != key.objectid ||
found_key.type != key.type) {
ret = 0;
goto out;
break;
}
di = btrfs_item_ptr(eb, slot, struct btrfs_dir_item);
di = btrfs_item_ptr(eb, path->slots[0], struct btrfs_dir_item);
btrfs_dir_item_key_to_cpu(eb, di, &di_key);
if (di_key.type != BTRFS_ROOT_ITEM_KEY &&
di_key.objectid < sctx->send_progress) {
ret = 1;
goto out;
break;
}
path->slots[0]++;
}
/* Catch error found during iteration */
if (iter_ret < 0)
ret = iter_ret;
out:
btrfs_free_path(path);
return ret;
}
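did_create_dir() above is the first of several loops in this file converted to the new btrfs_for_each_slot() iterator; the general shape, sketched with placeholder item processing, is:

	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
		if (found_key.objectid != key.objectid ||
		    found_key.type != key.type)
			break;
		/* process the item at path->nodes[0], path->slots[0] */
	}
	/* iter_ret < 0 reports a failed search or leaf switch */
	if (iter_ret < 0)
		ret = iter_ret;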
@ -2933,6 +2918,7 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
u64 send_progress)
{
int ret = 0;
int iter_ret = 0;
struct btrfs_root *root = sctx->parent_root;
struct btrfs_path *path;
struct btrfs_key key;
@ -2959,23 +2945,9 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
if (odi)
key.offset = odi->last_dir_index_offset;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
while (1) {
btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
struct waiting_dir_move *dm;
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0])) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
goto out;
else if (ret > 0)
break;
continue;
}
btrfs_item_key_to_cpu(path->nodes[0], &found_key,
path->slots[0]);
if (found_key.objectid != key.objectid ||
found_key.type != key.type)
break;
@ -3010,8 +2982,10 @@ static int can_rmdir(struct send_ctx *sctx, u64 dir, u64 dir_gen,
ret = 0;
goto out;
}
path->slots[0]++;
}
if (iter_ret < 0) {
ret = iter_ret;
goto out;
}
free_orphan_dir_info(sctx, odi);
@ -3579,7 +3553,7 @@ static int check_ino_in_path(struct btrfs_root *root,
}
/*
* Check if ino ino1 is an ancestor of inode ino2 in the given root for any
* Check if inode ino1 is an ancestor of inode ino2 in the given root for any
* possible path (in case ino2 is not a directory and has multiple hard links).
* Return 1 if true, 0 if false and < 0 on error.
*/
@ -3591,6 +3565,7 @@ static int is_ancestor(struct btrfs_root *root,
{
bool free_fs_path = false;
int ret = 0;
int iter_ret = 0;
struct btrfs_path *path = NULL;
struct btrfs_key key;
@ -3611,26 +3586,12 @@ static int is_ancestor(struct btrfs_root *root,
key.type = BTRFS_INODE_REF_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
while (true) {
btrfs_for_each_slot(root, &key, &key, path, iter_ret) {
struct extent_buffer *leaf = path->nodes[0];
int slot = path->slots[0];
u32 cur_offset = 0;
u32 item_size;
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
goto out;
if (ret > 0)
break;
continue;
}
btrfs_item_key_to_cpu(leaf, &key, slot);
if (key.objectid != ino2)
break;
if (key.type != BTRFS_INODE_REF_KEY &&
@ -3668,10 +3629,12 @@ static int is_ancestor(struct btrfs_root *root,
if (ret)
goto out;
}
path->slots[0]++;
}
ret = 0;
out:
if (iter_ret < 0)
ret = iter_ret;
out:
btrfs_free_path(path);
if (free_fs_path)
fs_path_free(fs_path);
@ -4551,13 +4514,12 @@ out:
static int process_all_refs(struct send_ctx *sctx,
enum btrfs_compare_tree_result cmd)
{
int ret;
int ret = 0;
int iter_ret = 0;
struct btrfs_root *root;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key found_key;
struct extent_buffer *eb;
int slot;
iterate_inode_ref_t cb;
int pending_move = 0;
@ -4581,24 +4543,7 @@ static int process_all_refs(struct send_ctx *sctx,
key.objectid = sctx->cmp_key->objectid;
key.type = BTRFS_INODE_REF_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
while (1) {
eb = path->nodes[0];
slot = path->slots[0];
if (slot >= btrfs_header_nritems(eb)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
goto out;
else if (ret > 0)
break;
continue;
}
btrfs_item_key_to_cpu(eb, &found_key, slot);
btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
if (found_key.objectid != key.objectid ||
(found_key.type != BTRFS_INODE_REF_KEY &&
found_key.type != BTRFS_INODE_EXTREF_KEY))
@ -4607,8 +4552,11 @@ static int process_all_refs(struct send_ctx *sctx,
ret = iterate_inode_ref(root, path, &found_key, 0, cb, sctx);
if (ret < 0)
goto out;
path->slots[0]++;
}
/* Catch error found during iteration */
if (iter_ret < 0) {
ret = iter_ret;
goto out;
}
btrfs_release_path(path);
@ -4870,13 +4818,12 @@ out:
static int process_all_new_xattrs(struct send_ctx *sctx)
{
int ret;
int ret = 0;
int iter_ret = 0;
struct btrfs_root *root;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key found_key;
struct extent_buffer *eb;
int slot;
path = alloc_path_for_send();
if (!path)
@ -4887,39 +4834,21 @@ static int process_all_new_xattrs(struct send_ctx *sctx)
key.objectid = sctx->cmp_key->objectid;
key.type = BTRFS_XATTR_ITEM_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
while (1) {
eb = path->nodes[0];
slot = path->slots[0];
if (slot >= btrfs_header_nritems(eb)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0) {
goto out;
} else if (ret > 0) {
ret = 0;
break;
}
continue;
}
btrfs_item_key_to_cpu(eb, &found_key, slot);
btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
if (found_key.objectid != key.objectid ||
found_key.type != key.type) {
ret = 0;
goto out;
break;
}
ret = iterate_dir_item(root, path, __process_new_xattr, sctx);
if (ret < 0)
goto out;
path->slots[0]++;
break;
}
/* Catch error found during iteration */
if (iter_ret < 0)
ret = iter_ret;
out:
btrfs_free_path(path);
return ret;
}
@ -4946,7 +4875,6 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
{
struct btrfs_root *root = sctx->send_root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct inode *inode;
struct page *page;
pgoff_t index = offset >> PAGE_SHIFT;
pgoff_t last_index;
@ -4957,37 +4885,30 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
if (ret)
return ret;
inode = btrfs_iget(fs_info->sb, sctx->cur_ino, root);
if (IS_ERR(inode))
return PTR_ERR(inode);
last_index = (offset + len - 1) >> PAGE_SHIFT;
/* initial readahead */
memset(&sctx->ra, 0, sizeof(struct file_ra_state));
file_ra_state_init(&sctx->ra, inode->i_mapping);
while (index <= last_index) {
unsigned cur_len = min_t(unsigned, len,
PAGE_SIZE - pg_offset);
page = find_lock_page(inode->i_mapping, index);
page = find_lock_page(sctx->cur_inode->i_mapping, index);
if (!page) {
page_cache_sync_readahead(inode->i_mapping, &sctx->ra,
NULL, index, last_index + 1 - index);
page_cache_sync_readahead(sctx->cur_inode->i_mapping,
&sctx->ra, NULL, index,
last_index + 1 - index);
page = find_or_create_page(inode->i_mapping, index,
GFP_KERNEL);
page = find_or_create_page(sctx->cur_inode->i_mapping,
index, GFP_KERNEL);
if (!page) {
ret = -ENOMEM;
break;
}
}
if (PageReadahead(page)) {
page_cache_async_readahead(inode->i_mapping, &sctx->ra,
NULL, page, index, last_index + 1 - index);
}
if (PageReadahead(page))
page_cache_async_readahead(sctx->cur_inode->i_mapping,
&sctx->ra, NULL, page, index,
last_index + 1 - index);
if (!PageUptodate(page)) {
btrfs_readpage(NULL, page);
@ -5013,7 +4934,7 @@ static int put_file_data(struct send_ctx *sctx, u64 offset, u32 len)
len -= cur_len;
sctx->send_size += cur_len;
}
iput(inode);
return ret;
}
@ -5220,12 +5141,49 @@ static int send_extent_data(struct send_ctx *sctx,
const u64 offset,
const u64 len)
{
const u64 end = offset + len;
u64 read_size = max_send_read_size(sctx);
u64 sent = 0;
if (sctx->flags & BTRFS_SEND_FLAG_NO_FILE_DATA)
return send_update_extent(sctx, offset, len);
if (sctx->cur_inode == NULL) {
struct btrfs_root *root = sctx->send_root;
sctx->cur_inode = btrfs_iget(root->fs_info->sb, sctx->cur_ino, root);
if (IS_ERR(sctx->cur_inode)) {
int err = PTR_ERR(sctx->cur_inode);
sctx->cur_inode = NULL;
return err;
}
memset(&sctx->ra, 0, sizeof(struct file_ra_state));
file_ra_state_init(&sctx->ra, sctx->cur_inode->i_mapping);
/*
* It's very likely there are no pages from this inode in the page
* cache, so after reading extents and sending their data, we clean
* the page cache to avoid trashing the page cache (adding pressure
* to the page cache and forcing eviction of other data more useful
* for applications).
*
* We decide if we should clean the page cache simply by checking
* if the inode's mapping nrpages is 0 when we first open it, and
* not by using something like filemap_range_has_page() before
* reading an extent because when we ask the readahead code to
* read a given file range, it may (and almost always does) read
* pages from beyond that range (see the documentation for
* page_cache_sync_readahead()), so it would not be reliable,
* because after reading the first extent future calls to
* filemap_range_has_page() would return true because the readahead
* on the previous extent resulted in reading pages of the current
* extent as well.
*/
sctx->clean_page_cache = (sctx->cur_inode->i_mapping->nrpages == 0);
sctx->page_cache_clear_start = round_down(offset, PAGE_SIZE);
}
while (sent < len) {
u64 size = min(len - sent, read_size);
int ret;
@ -5235,6 +5193,37 @@ static int send_extent_data(struct send_ctx *sctx,
return ret;
sent += size;
}
if (sctx->clean_page_cache && IS_ALIGNED(end, PAGE_SIZE)) {
/*
* Always operate only on ranges that are a multiple of the page
* size. This is not only to prevent zeroing parts of a page in
* the case of subpage sector size, but also to guarantee we evict
* pages, as passing a range that is smaller than page size does
* not evict the respective page (only zeroes part of its content).
*
* Always start from the end offset of the last range cleared.
* This is because the readahead code may (and very often does)
* read pages beyond the range we request for readahead. So if
* we have an extent layout like this:
*
* [ extent A ] [ extent B ] [ extent C ]
*
* When we ask page_cache_sync_readahead() to read extent A, it
* may also trigger reads for pages of extent B. If we are doing
* an incremental send and extent B has not changed between the
* parent and send snapshots, some or all of its pages may end
* up being read and placed in the page cache. So when truncating
* the page cache we always start from the end offset of the
* previously processed extent up to the end of the current
* extent.
*/
truncate_inode_pages_range(&sctx->cur_inode->i_data,
sctx->page_cache_clear_start,
end - 1);
sctx->page_cache_clear_start = end;
}
return 0;
}
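A worked example of the clearing logic above, assuming 4K pages (illustrative only):

	/*
	 * Sending an extent for [0, 1M) truncates the cache for [0, 1M - 1]
	 * and advances page_cache_clear_start to 1M, since 1M is page
	 * aligned. An extent ending at 1M + 4K + 512 is not page aligned,
	 * so its pages are only dropped later, either when a following
	 * extent ends on a page boundary or by close_current_inode()
	 * further down, which truncates up to round_up(i_size, PAGE_SIZE) - 1.
	 */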
@ -5965,13 +5954,12 @@ out:
static int process_all_extents(struct send_ctx *sctx)
{
int ret;
int ret = 0;
int iter_ret = 0;
struct btrfs_root *root;
struct btrfs_path *path;
struct btrfs_key key;
struct btrfs_key found_key;
struct extent_buffer *eb;
int slot;
root = sctx->send_root;
path = alloc_path_for_send();
@ -5981,41 +5969,21 @@ static int process_all_extents(struct send_ctx *sctx)
key.objectid = sctx->cmp_key->objectid;
key.type = BTRFS_EXTENT_DATA_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto out;
while (1) {
eb = path->nodes[0];
slot = path->slots[0];
if (slot >= btrfs_header_nritems(eb)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0) {
goto out;
} else if (ret > 0) {
ret = 0;
break;
}
continue;
}
btrfs_item_key_to_cpu(eb, &found_key, slot);
btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
if (found_key.objectid != key.objectid ||
found_key.type != key.type) {
ret = 0;
goto out;
break;
}
ret = process_extent(sctx, path, &found_key);
if (ret < 0)
goto out;
path->slots[0]++;
break;
}
/* Catch error found during iteration */
if (iter_ret < 0)
ret = iter_ret;
out:
btrfs_free_path(path);
return ret;
}
@ -6205,8 +6173,11 @@ static int btrfs_unlink_all_paths(struct send_ctx *sctx)
{
LIST_HEAD(deleted_refs);
struct btrfs_path *path;
struct btrfs_root *root = sctx->parent_root;
struct btrfs_key key;
struct btrfs_key found_key;
struct parent_paths_ctx ctx;
int iter_ret = 0;
int ret;
path = alloc_path_for_send();
@ -6216,39 +6187,26 @@ static int btrfs_unlink_all_paths(struct send_ctx *sctx)
key.objectid = sctx->cur_ino;
key.type = BTRFS_INODE_REF_KEY;
key.offset = 0;
ret = btrfs_search_slot(NULL, sctx->parent_root, &key, path, 0, 0);
if (ret < 0)
goto out;
ctx.refs = &deleted_refs;
ctx.sctx = sctx;
while (true) {
struct extent_buffer *eb = path->nodes[0];
int slot = path->slots[0];
if (slot >= btrfs_header_nritems(eb)) {
ret = btrfs_next_leaf(sctx->parent_root, path);
if (ret < 0)
goto out;
else if (ret > 0)
break;
continue;
}
btrfs_item_key_to_cpu(eb, &key, slot);
if (key.objectid != sctx->cur_ino)
btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
if (found_key.objectid != key.objectid)
break;
if (key.type != BTRFS_INODE_REF_KEY &&
key.type != BTRFS_INODE_EXTREF_KEY)
if (found_key.type != key.type &&
found_key.type != BTRFS_INODE_EXTREF_KEY)
break;
ret = iterate_inode_ref(sctx->parent_root, path, &key, 1,
ret = iterate_inode_ref(root, path, &found_key, 1,
record_parent_ref, &ctx);
if (ret < 0)
goto out;
path->slots[0]++;
}
/* Catch error found during iteration */
if (iter_ret < 0) {
ret = iter_ret;
goto out;
}
while (!list_empty(&deleted_refs)) {
@ -6270,6 +6228,30 @@ out:
return ret;
}
static void close_current_inode(struct send_ctx *sctx)
{
u64 i_size;
if (sctx->cur_inode == NULL)
return;
i_size = i_size_read(sctx->cur_inode);
/*
* If we are doing an incremental send, we may have extents between the
* last processed extent and the i_size that have not been processed
* because they haven't changed but we may have read some of their pages
* through readahead, see the comments at send_extent_data().
*/
if (sctx->clean_page_cache && sctx->page_cache_clear_start < i_size)
truncate_inode_pages_range(&sctx->cur_inode->i_data,
sctx->page_cache_clear_start,
round_up(i_size, PAGE_SIZE) - 1);
iput(sctx->cur_inode);
sctx->cur_inode = NULL;
}
static int changed_inode(struct send_ctx *sctx,
enum btrfs_compare_tree_result result)
{
@ -6280,6 +6262,8 @@ static int changed_inode(struct send_ctx *sctx,
u64 left_gen = 0;
u64 right_gen = 0;
close_current_inode(sctx);
sctx->cur_ino = key->objectid;
sctx->cur_inode_new_gen = 0;
sctx->cur_inode_last_extent = (u64)-1;
@ -7534,7 +7518,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
INIT_LIST_HEAD(&sctx->new_refs);
INIT_LIST_HEAD(&sctx->deleted_refs);
INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL);
xa_init_flags(&sctx->name_cache, GFP_KERNEL);
INIT_LIST_HEAD(&sctx->name_cache_list);
sctx->flags = arg->flags;
@ -7766,6 +7750,8 @@ out:
name_cache_free(sctx);
close_current_inode(sctx);
kfree(sctx);
}


@ -181,6 +181,12 @@ void btrfs_clear_space_info_full(struct btrfs_fs_info *info)
found->full = 0;
}
/*
* Block groups with more than this percentage of unusable space will be
* scheduled for background reclaim.
*/
#define BTRFS_DEFAULT_ZONED_RECLAIM_THRESH (75)
static int create_space_info(struct btrfs_fs_info *info, u64 flags)
{
@ -203,6 +209,9 @@ static int create_space_info(struct btrfs_fs_info *info, u64 flags)
INIT_LIST_HEAD(&space_info->priority_tickets);
space_info->clamp = 1;
if (btrfs_is_zoned(info))
space_info->bg_reclaim_threshold = BTRFS_DEFAULT_ZONED_RECLAIM_THRESH;
ret = btrfs_sysfs_add_space_info_type(info, space_info);
if (ret)
return ret;
@ -519,7 +528,7 @@ static void shrink_delalloc(struct btrfs_fs_info *fs_info,
items = calc_reclaim_items_nr(fs_info, to_reclaim) * 2;
}
trans = (struct btrfs_trans_handle *)current->journal_info;
trans = current->journal_info;
/*
* If we are doing more ordered than delalloc we need to just wait on


@ -3,6 +3,8 @@
#ifndef BTRFS_SPACE_INFO_H
#define BTRFS_SPACE_INFO_H
#include "volumes.h"
struct btrfs_space_info {
spinlock_t lock;
@ -24,6 +26,12 @@ struct btrfs_space_info {
the space info if we had an ENOSPC in the
allocator. */
/*
* Once a block group drops below this threshold (given in percent) we'll
* schedule it for reclaim.
*/
int bg_reclaim_threshold;
int clamp; /* Used to scale our threshold for preemptive
flushing. The value is >> clamp, so turns
out to be a 2^clamp divisor. */


@ -63,6 +63,29 @@
* This means a slightly higher tree locking latency.
*/
bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page)
{
if (fs_info->sectorsize >= PAGE_SIZE)
return false;
/*
* Only data pages (either through DIO or compression) can have no
* mapping. And if page->mapping->host is a data inode, it's subpage,
* as we have already ruled out the sectorsize >= PAGE_SIZE case.
*/
if (!page->mapping || !page->mapping->host ||
is_data_inode(page->mapping->host))
return true;
/*
* Now the only remaining case is metadata, which goes through the
* subpage routine only if nodesize < PAGE_SIZE.
*/
if (fs_info->nodesize < PAGE_SIZE)
return true;
return false;
}
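A few illustrative combinations for the new helper (common configurations only, derived from the checks above):

/*
 *   sectorsize   nodesize   PAGE_SIZE   data page   metadata page
 *   4K           16K        4K          false       false
 *   4K           16K        64K         true        true
 *   4K           64K        64K         true        false
 */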
void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize)
{
unsigned int cur = 0;
@ -107,7 +130,7 @@ int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
ASSERT(PageLocked(page));
/* Either not subpage, or the page already has private attached */
if (fs_info->sectorsize == PAGE_SIZE || PagePrivate(page))
if (!btrfs_is_subpage(fs_info, page) || PagePrivate(page))
return 0;
subpage = btrfs_alloc_subpage(fs_info, type);
@ -124,10 +147,10 @@ void btrfs_detach_subpage(const struct btrfs_fs_info *fs_info,
struct btrfs_subpage *subpage;
/* Either not subpage, or already detached */
if (fs_info->sectorsize == PAGE_SIZE || !PagePrivate(page))
if (!btrfs_is_subpage(fs_info, page) || !PagePrivate(page))
return;
subpage = (struct btrfs_subpage *)detach_page_private(page);
subpage = detach_page_private(page);
ASSERT(subpage);
btrfs_free_subpage(subpage);
}
@ -175,7 +198,7 @@ void btrfs_page_inc_eb_refs(const struct btrfs_fs_info *fs_info,
{
struct btrfs_subpage *subpage;
if (fs_info->sectorsize == PAGE_SIZE)
if (!btrfs_is_subpage(fs_info, page))
return;
ASSERT(PagePrivate(page) && page->mapping);
@ -190,7 +213,7 @@ void btrfs_page_dec_eb_refs(const struct btrfs_fs_info *fs_info,
{
struct btrfs_subpage *subpage;
if (fs_info->sectorsize == PAGE_SIZE)
if (!btrfs_is_subpage(fs_info, page))
return;
ASSERT(PagePrivate(page) && page->mapping);
@ -319,7 +342,7 @@ bool btrfs_subpage_end_and_test_writer(const struct btrfs_fs_info *fs_info,
int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) {
if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) {
lock_page(page);
return 0;
}
@ -336,7 +359,7 @@ int btrfs_page_start_writer_lock(const struct btrfs_fs_info *fs_info,
void btrfs_page_end_writer_lock(const struct btrfs_fs_info *fs_info,
struct page *page, u64 start, u32 len)
{
if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE)
if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page))
return unlock_page(page);
btrfs_subpage_clamp_range(page, &start, &len);
if (btrfs_subpage_end_and_test_writer(fs_info, page, start, len))
@ -620,7 +643,7 @@ IMPLEMENT_BTRFS_SUBPAGE_TEST_OP(checked);
void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \
struct page *page, u64 start, u32 len) \
{ \
if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \
if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
set_page_func(page); \
return; \
} \
@ -629,7 +652,7 @@ void btrfs_page_set_##name(const struct btrfs_fs_info *fs_info, \
void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \
struct page *page, u64 start, u32 len) \
{ \
if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \
if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
clear_page_func(page); \
return; \
} \
@ -638,14 +661,14 @@ void btrfs_page_clear_##name(const struct btrfs_fs_info *fs_info, \
bool btrfs_page_test_##name(const struct btrfs_fs_info *fs_info, \
struct page *page, u64 start, u32 len) \
{ \
if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \
if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \
return test_page_func(page); \
return btrfs_subpage_test_##name(fs_info, page, start, len); \
} \
void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
struct page *page, u64 start, u32 len) \
{ \
if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \
if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
set_page_func(page); \
return; \
} \
@ -655,7 +678,7 @@ void btrfs_page_clamp_set_##name(const struct btrfs_fs_info *fs_info, \
void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
struct page *page, u64 start, u32 len) \
{ \
if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) { \
if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) { \
clear_page_func(page); \
return; \
} \
@ -665,7 +688,7 @@ void btrfs_page_clamp_clear_##name(const struct btrfs_fs_info *fs_info, \
bool btrfs_page_clamp_test_##name(const struct btrfs_fs_info *fs_info, \
struct page *page, u64 start, u32 len) \
{ \
if (unlikely(!fs_info) || fs_info->sectorsize == PAGE_SIZE) \
if (unlikely(!fs_info) || !btrfs_is_subpage(fs_info, page)) \
return test_page_func(page); \
btrfs_subpage_clamp_range(page, &start, &len); \
return btrfs_subpage_test_##name(fs_info, page, start, len); \
@ -694,7 +717,7 @@ void btrfs_page_assert_not_dirty(const struct btrfs_fs_info *fs_info,
return;
ASSERT(!PageDirty(page));
if (fs_info->sectorsize == PAGE_SIZE)
if (!btrfs_is_subpage(fs_info, page))
return;
ASSERT(PagePrivate(page) && page->private);
@ -722,8 +745,8 @@ void btrfs_page_unlock_writer(struct btrfs_fs_info *fs_info, struct page *page,
struct btrfs_subpage *subpage;
ASSERT(PageLocked(page));
/* For regular page size case, we just unlock the page */
if (fs_info->sectorsize == PAGE_SIZE)
/* For non-subpage case, we just unlock the page */
if (!btrfs_is_subpage(fs_info, page))
return unlock_page(page);
ASSERT(PagePrivate(page) && page->private);


@ -74,6 +74,8 @@ enum btrfs_subpage_type {
BTRFS_SUBPAGE_DATA,
};
bool btrfs_is_subpage(const struct btrfs_fs_info *fs_info, struct page *page);
void btrfs_init_subpage_info(struct btrfs_subpage_info *subpage_info, u32 sectorsize);
int btrfs_attach_subpage(const struct btrfs_fs_info *fs_info,
struct page *page, enum btrfs_subpage_type type);


@ -261,7 +261,7 @@ static struct ratelimit_state printk_limits[] = {
RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100),
};
void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0";
struct va_format vaf;
@ -292,10 +292,10 @@ void __cold btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, .
char statestr[STATE_STRING_BUF_LEN];
btrfs_state_to_string(fs_info, statestr);
printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type,
_printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type,
fs_info->sb->s_id, statestr, &vaf);
} else {
printk("%sBTRFS %s: %pV\n", lvl, type, &vaf);
_printk("%sBTRFS %s: %pV\n", lvl, type, &vaf);
}
}
@ -1903,6 +1903,7 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
old_pool_size, new_pool_size);
btrfs_workqueue_set_max(fs_info->workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->hipri_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->delalloc_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->caching_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_workers, new_pool_size);
@ -1912,8 +1913,6 @@ static void btrfs_resize_thread_pool(struct btrfs_fs_info *fs_info,
btrfs_workqueue_set_max(fs_info->endio_write_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->endio_freespace_worker, new_pool_size);
btrfs_workqueue_set_max(fs_info->delayed_workers, new_pool_size);
btrfs_workqueue_set_max(fs_info->scrub_wr_completion_workers,
new_pool_size);
}
static inline void btrfs_remount_begin(struct btrfs_fs_info *fs_info,


@ -394,11 +394,9 @@ static ssize_t supported_sectorsizes_show(struct kobject *kobj,
{
ssize_t ret = 0;
/* 4K sector size is also supported with 64K page size */
if (PAGE_SIZE == SZ_64K)
/* An artificial limit to only support 4K and PAGE_SIZE */
if (PAGE_SIZE > SZ_4K)
ret += sysfs_emit_at(buf, ret, "%u ", SZ_4K);
/* Only sectorsize == PAGE_SIZE is now supported */
ret += sysfs_emit_at(buf, ret, "%lu\n", PAGE_SIZE);
return ret;
@ -722,6 +720,42 @@ SPACE_INFO_ATTR(bytes_zone_unusable);
SPACE_INFO_ATTR(disk_used);
SPACE_INFO_ATTR(disk_total);
static ssize_t btrfs_sinfo_bg_reclaim_threshold_show(struct kobject *kobj,
struct kobj_attribute *a,
char *buf)
{
struct btrfs_space_info *space_info = to_space_info(kobj);
ssize_t ret;
ret = sysfs_emit(buf, "%d\n", READ_ONCE(space_info->bg_reclaim_threshold));
return ret;
}
static ssize_t btrfs_sinfo_bg_reclaim_threshold_store(struct kobject *kobj,
struct kobj_attribute *a,
const char *buf, size_t len)
{
struct btrfs_space_info *space_info = to_space_info(kobj);
int thresh;
int ret;
ret = kstrtoint(buf, 10, &thresh);
if (ret)
return ret;
if (thresh < 0 || thresh > 100)
return -EINVAL;
WRITE_ONCE(space_info->bg_reclaim_threshold, thresh);
return len;
}
BTRFS_ATTR_RW(space_info, bg_reclaim_threshold,
btrfs_sinfo_bg_reclaim_threshold_show,
btrfs_sinfo_bg_reclaim_threshold_store);
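The consumer of this knob is outside the hunks shown here; as a hedged sketch of the intended semantics (an assumption based on the zone-unusable accounting, not code from this series), a block group would be queued for reclaim roughly like this:

/* Sketch only: reclaim once unusable space exceeds the threshold. */
static bool example_should_reclaim(const struct btrfs_block_group *bg, int thresh)
{
	if (thresh == 0)
		return false;	/* 0 disables automatic reclaim */
	return bg->zone_unusable * 100 >= bg->length * (u64)thresh;
}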
/*
* Allocation information about block group types.
*
@ -738,6 +772,7 @@ static struct attribute *space_info_attrs[] = {
BTRFS_ATTR_PTR(space_info, bytes_zone_unusable),
BTRFS_ATTR_PTR(space_info, disk_used),
BTRFS_ATTR_PTR(space_info, disk_total),
BTRFS_ATTR_PTR(space_info, bg_reclaim_threshold),
NULL,
};
ATTRIBUTE_GROUPS(space_info);


@ -150,8 +150,8 @@ struct btrfs_fs_info *btrfs_alloc_dummy_fs_info(u32 nodesize, u32 sectorsize)
void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
{
struct radix_tree_iter iter;
void **slot;
unsigned long index;
struct extent_buffer *eb;
struct btrfs_device *dev, *tmp;
if (!fs_info)
@ -163,25 +163,9 @@ void btrfs_free_dummy_fs_info(struct btrfs_fs_info *fs_info)
test_mnt->mnt_sb->s_fs_info = NULL;
spin_lock(&fs_info->buffer_lock);
radix_tree_for_each_slot(slot, &fs_info->buffer_radix, &iter, 0) {
struct extent_buffer *eb;
eb = radix_tree_deref_slot_protected(slot, &fs_info->buffer_lock);
if (!eb)
continue;
/* Shouldn't happen but that kind of thinking creates CVE's */
if (radix_tree_exception(eb)) {
if (radix_tree_deref_retry(eb))
slot = radix_tree_iter_retry(&iter);
continue;
}
slot = radix_tree_iter_resume(slot, &iter);
spin_unlock(&fs_info->buffer_lock);
xa_for_each(&fs_info->extent_buffers, index, eb) {
free_extent_buffer_stale(eb);
spin_lock(&fs_info->buffer_lock);
}
spin_unlock(&fs_info->buffer_lock);
btrfs_mapping_tree_free(&fs_info->mapping_tree);
list_for_each_entry_safe(dev, tmp, &fs_info->fs_devices->devices,
@ -202,7 +186,7 @@ void btrfs_free_dummy_root(struct btrfs_root *root)
if (!root)
return;
/* Will be freed by btrfs_free_fs_roots */
if (WARN_ON(test_bit(BTRFS_ROOT_IN_RADIX, &root->state)))
if (WARN_ON(test_bit(BTRFS_ROOT_REGISTERED, &root->state)))
return;
btrfs_global_root_delete(root);
btrfs_put_root(root);


@ -23,7 +23,7 @@
#include "space-info.h"
#include "zoned.h"
#define BTRFS_ROOT_TRANS_TAG 0
#define BTRFS_ROOT_TRANS_TAG XA_MARK_0
/*
* Transaction states and transitions
@ -221,7 +221,7 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
* the caching thread will re-start its search from 3, and thus find
* the hole from [4,6) to add to the free space cache.
*/
spin_lock(&fs_info->block_group_cache_lock);
write_lock(&fs_info->block_group_cache_lock);
list_for_each_entry_safe(caching_ctl, next,
&fs_info->caching_block_groups, list) {
struct btrfs_block_group *cache = caching_ctl->block_group;
@ -234,7 +234,7 @@ static noinline void switch_commit_roots(struct btrfs_trans_handle *trans)
cache->last_byte_to_unpin = caching_ctl->progress;
}
}
spin_unlock(&fs_info->block_group_cache_lock);
write_unlock(&fs_info->block_group_cache_lock);
up_write(&fs_info->commit_root_sem);
}
@ -437,15 +437,15 @@ static int record_root_in_trans(struct btrfs_trans_handle *trans,
*/
smp_wmb();
spin_lock(&fs_info->fs_roots_radix_lock);
spin_lock(&fs_info->fs_roots_lock);
if (root->last_trans == trans->transid && !force) {
spin_unlock(&fs_info->fs_roots_radix_lock);
spin_unlock(&fs_info->fs_roots_lock);
return 0;
}
radix_tree_tag_set(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
BTRFS_ROOT_TRANS_TAG);
spin_unlock(&fs_info->fs_roots_radix_lock);
xa_set_mark(&fs_info->fs_roots,
(unsigned long)root->root_key.objectid,
BTRFS_ROOT_TRANS_TAG);
spin_unlock(&fs_info->fs_roots_lock);
root->last_trans = trans->transid;
/* this is pretty tricky. We don't want to
@ -487,11 +487,9 @@ void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
spin_unlock(&cur_trans->dropped_roots_lock);
/* Make sure we don't try to update the root at commit time */
spin_lock(&fs_info->fs_roots_radix_lock);
radix_tree_tag_clear(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
BTRFS_ROOT_TRANS_TAG);
spin_unlock(&fs_info->fs_roots_radix_lock);
xa_clear_mark(&fs_info->fs_roots,
(unsigned long)root->root_key.objectid,
BTRFS_ROOT_TRANS_TAG);
}
int btrfs_record_root_in_trans(struct btrfs_trans_handle *trans,
@ -1404,9 +1402,8 @@ void btrfs_add_dead_root(struct btrfs_root *root)
static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_root *gang[8];
int i;
int ret;
struct btrfs_root *root;
unsigned long index;
/*
* At this point no one can be using this transaction to modify any tree
@ -1414,57 +1411,46 @@ static noinline int commit_fs_roots(struct btrfs_trans_handle *trans)
*/
ASSERT(trans->transaction->state == TRANS_STATE_COMMIT_DOING);
spin_lock(&fs_info->fs_roots_radix_lock);
while (1) {
ret = radix_tree_gang_lookup_tag(&fs_info->fs_roots_radix,
(void **)gang, 0,
ARRAY_SIZE(gang),
BTRFS_ROOT_TRANS_TAG);
if (ret == 0)
break;
for (i = 0; i < ret; i++) {
struct btrfs_root *root = gang[i];
int ret2;
spin_lock(&fs_info->fs_roots_lock);
xa_for_each_marked(&fs_info->fs_roots, index, root, BTRFS_ROOT_TRANS_TAG) {
int ret;
/*
* At this point we can neither have tasks logging inodes
* from a root nor trying to commit a log tree.
*/
ASSERT(atomic_read(&root->log_writers) == 0);
ASSERT(atomic_read(&root->log_commit[0]) == 0);
ASSERT(atomic_read(&root->log_commit[1]) == 0);
/*
* At this point we can neither have tasks logging inodes
* from a root nor trying to commit a log tree.
*/
ASSERT(atomic_read(&root->log_writers) == 0);
ASSERT(atomic_read(&root->log_commit[0]) == 0);
ASSERT(atomic_read(&root->log_commit[1]) == 0);
radix_tree_tag_clear(&fs_info->fs_roots_radix,
(unsigned long)root->root_key.objectid,
BTRFS_ROOT_TRANS_TAG);
spin_unlock(&fs_info->fs_roots_radix_lock);
xa_clear_mark(&fs_info->fs_roots,
(unsigned long)root->root_key.objectid,
BTRFS_ROOT_TRANS_TAG);
spin_unlock(&fs_info->fs_roots_lock);
btrfs_free_log(trans, root);
ret2 = btrfs_update_reloc_root(trans, root);
if (ret2)
return ret2;
btrfs_free_log(trans, root);
ret = btrfs_update_reloc_root(trans, root);
if (ret)
return ret;
/* see comments in should_cow_block() */
clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
smp_mb__after_atomic();
/* See comments in should_cow_block() */
clear_bit(BTRFS_ROOT_FORCE_COW, &root->state);
smp_mb__after_atomic();
if (root->commit_root != root->node) {
list_add_tail(&root->dirty_list,
&trans->transaction->switch_commits);
btrfs_set_root_node(&root->root_item,
root->node);
}
ret2 = btrfs_update_root(trans, fs_info->tree_root,
&root->root_key,
&root->root_item);
if (ret2)
return ret2;
spin_lock(&fs_info->fs_roots_radix_lock);
btrfs_qgroup_free_meta_all_pertrans(root);
if (root->commit_root != root->node) {
list_add_tail(&root->dirty_list,
&trans->transaction->switch_commits);
btrfs_set_root_node(&root->root_item, root->node);
}
ret = btrfs_update_root(trans, fs_info->tree_root,
&root->root_key, &root->root_item);
if (ret)
return ret;
spin_lock(&fs_info->fs_roots_lock);
btrfs_qgroup_free_meta_all_pertrans(root);
}
spin_unlock(&fs_info->fs_roots_radix_lock);
spin_unlock(&fs_info->fs_roots_lock);
return 0;
}
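The tag handling above follows the generic xarray mark API; a condensed sketch of the lifecycle (locking omitted, identifiers as used in this file):

	/* record_root_in_trans(): remember that the root was modified */
	xa_set_mark(&fs_info->fs_roots, (unsigned long)objectid, BTRFS_ROOT_TRANS_TAG);

	/* commit_fs_roots(): visit only the marked roots and clear the mark */
	xa_for_each_marked(&fs_info->fs_roots, index, root, BTRFS_ROOT_TRANS_TAG) {
		xa_clear_mark(&fs_info->fs_roots, index, BTRFS_ROOT_TRANS_TAG);
		/* update the root item, switch commit roots, ... */
	}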


@ -1855,3 +1855,58 @@ out:
return ret;
}
ALLOW_ERROR_INJECTION(btrfs_check_node, ERRNO);
int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner)
{
const bool is_subvol = is_fstree(root_owner);
const u64 eb_owner = btrfs_header_owner(eb);
/*
* Skip dummy fs, as selftests don't create unique ebs for each dummy
* root.
*/
if (test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &eb->fs_info->fs_state))
return 0;
/*
* There are several call sites (backref walking, qgroup, and data
* reloc) passing 0 as @root_owner, as they are not holding the
* tree root. In that case, we can not do a reliable ownership check,
* so just exit.
*/
if (root_owner == 0)
return 0;
/*
* These trees use key.offset as their owner, but our callers don't have
* the extra capacity to pass key.offset here. So we just skip them.
*/
if (root_owner == BTRFS_TREE_LOG_OBJECTID ||
root_owner == BTRFS_TREE_RELOC_OBJECTID)
return 0;
if (!is_subvol) {
/* For non-subvolume trees, the eb owner should match root owner */
if (unlikely(root_owner != eb_owner)) {
btrfs_crit(eb->fs_info,
"corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect %llu",
btrfs_header_level(eb) == 0 ? "leaf" : "node",
root_owner, btrfs_header_bytenr(eb), eb_owner,
root_owner);
return -EUCLEAN;
}
return 0;
}
/*
* For subvolume trees, owners can mismatch, but they should all belong
* to subvolume trees.
*/
if (unlikely(is_subvol != is_fstree(eb_owner))) {
btrfs_crit(eb->fs_info,
"corrupted %s, root=%llu block=%llu owner mismatch, have %llu expect [%llu, %llu]",
btrfs_header_level(eb) == 0 ? "leaf" : "node",
root_owner, btrfs_header_bytenr(eb), eb_owner,
BTRFS_FIRST_FREE_OBJECTID, BTRFS_LAST_FREE_OBJECTID);
return -EUCLEAN;
}
return 0;
}
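A hedged sketch of how a metadata read path would consume the new check (the surrounding context is illustrative, not lifted from this series):

	ret = btrfs_check_eb_owner(eb, root->root_key.objectid);
	if (ret) {
		free_extent_buffer(eb);
		return ERR_PTR(ret);
	}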


@ -25,5 +25,6 @@ int btrfs_check_node(struct extent_buffer *node);
int btrfs_check_chunk_valid(struct extent_buffer *leaf,
struct btrfs_chunk *chunk, u64 logical);
int btrfs_check_eb_owner(const struct extent_buffer *eb, u64 root_owner);
#endif


@ -333,7 +333,7 @@ static int process_one_buffer(struct btrfs_root *log,
* pin down any logged extents, so we have to read the block.
*/
if (btrfs_fs_incompat(fs_info, MIXED_GROUPS)) {
ret = btrfs_read_buffer(eb, gen, level, NULL);
ret = btrfs_read_extent_buffer(eb, gen, level, NULL);
if (ret)
return ret;
}
@ -894,8 +894,7 @@ update_inode:
btrfs_update_inode_bytes(BTRFS_I(inode), nbytes, drop_args.bytes_found);
ret = btrfs_update_inode(trans, root, BTRFS_I(inode));
out:
if (inode)
iput(inode);
iput(inode);
return ret;
}
@ -2575,7 +2574,7 @@ static int replay_one_buffer(struct btrfs_root *log, struct extent_buffer *eb,
int i;
int ret;
ret = btrfs_read_buffer(eb, gen, level, NULL);
ret = btrfs_read_extent_buffer(eb, gen, level, NULL);
if (ret)
return ret;
@ -2786,7 +2785,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
path->slots[*level]++;
if (wc->free) {
ret = btrfs_read_buffer(next, ptr_gen,
ret = btrfs_read_extent_buffer(next, ptr_gen,
*level - 1, &first_key);
if (ret) {
free_extent_buffer(next);
@ -2815,7 +2814,7 @@ static noinline int walk_down_log_tree(struct btrfs_trans_handle *trans,
free_extent_buffer(next);
continue;
}
ret = btrfs_read_buffer(next, ptr_gen, *level - 1, &first_key);
ret = btrfs_read_extent_buffer(next, ptr_gen, *level - 1, &first_key);
if (ret) {
free_extent_buffer(next);
return ret;

View File

@ -164,24 +164,12 @@ const struct btrfs_raid_attr btrfs_raid_array[BTRFS_NR_RAID_TYPES] = {
*/
enum btrfs_raid_types __attribute_const__ btrfs_bg_flags_to_raid_index(u64 flags)
{
if (flags & BTRFS_BLOCK_GROUP_RAID10)
return BTRFS_RAID_RAID10;
else if (flags & BTRFS_BLOCK_GROUP_RAID1)
return BTRFS_RAID_RAID1;
else if (flags & BTRFS_BLOCK_GROUP_RAID1C3)
return BTRFS_RAID_RAID1C3;
else if (flags & BTRFS_BLOCK_GROUP_RAID1C4)
return BTRFS_RAID_RAID1C4;
else if (flags & BTRFS_BLOCK_GROUP_DUP)
return BTRFS_RAID_DUP;
else if (flags & BTRFS_BLOCK_GROUP_RAID0)
return BTRFS_RAID_RAID0;
else if (flags & BTRFS_BLOCK_GROUP_RAID5)
return BTRFS_RAID_RAID5;
else if (flags & BTRFS_BLOCK_GROUP_RAID6)
return BTRFS_RAID_RAID6;
const u64 profile = (flags & BTRFS_BLOCK_GROUP_PROFILE_MASK);
return BTRFS_RAID_SINGLE; /* BTRFS_BLOCK_GROUP_SINGLE */
if (!profile)
return BTRFS_RAID_SINGLE;
return BTRFS_BG_FLAG_TO_INDEX(profile);
}
const char *btrfs_bg_type_to_raid_name(u64 flags)
@ -4062,13 +4050,6 @@ static inline int validate_convert_profile(struct btrfs_fs_info *fs_info,
if (!(bargs->flags & BTRFS_BALANCE_ARGS_CONVERT))
return true;
if (fs_info->sectorsize < PAGE_SIZE &&
bargs->target & BTRFS_BLOCK_GROUP_RAID56_MASK) {
btrfs_err(fs_info,
"RAID56 is not yet supported for sectorsize %u with page size %lu",
fs_info->sectorsize, PAGE_SIZE);
return false;
}
/* Profile is valid and does not have bits outside of the allowed set */
if (alloc_profile_is_valid(bargs->target, 1) &&
(bargs->target & ~allowed) == 0)
@ -6312,7 +6293,7 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
u64 offset;
u64 stripe_offset;
u64 stripe_nr;
u64 stripe_len;
u32 stripe_len;
u64 raid56_full_stripe_start = (u64)-1;
int data_stripes;
@ -6323,19 +6304,13 @@ int btrfs_get_io_geometry(struct btrfs_fs_info *fs_info, struct extent_map *em,
offset = logical - em->start;
/* Len of a stripe in a chunk */
stripe_len = map->stripe_len;
/* Stripe where this block falls in */
stripe_nr = div64_u64(offset, stripe_len);
/* Offset of stripe in the chunk */
stripe_offset = stripe_nr * stripe_len;
if (offset < stripe_offset) {
btrfs_crit(fs_info,
"stripe math has gone wrong, stripe_offset=%llu offset=%llu start=%llu logical=%llu stripe_len=%llu",
stripe_offset, offset, em->start, logical, stripe_len);
return -EINVAL;
}
/*
* stripe_nr is the stripe number this block falls into;
* stripe_offset is the offset of this block within that stripe.
*/
stripe_nr = div64_u64_rem(offset, stripe_len, &stripe_offset);
ASSERT(stripe_offset < U32_MAX);
/* stripe_offset is the offset of this block in its stripe */
stripe_offset = offset - stripe_offset;
data_stripes = nr_data_stripes(map);
/* Only stripe-based profiles need to check against the stripe length. */
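
Worked example with illustrative numbers: for stripe_len = 64 KiB and offset = 200 KiB into the chunk, div64_u64_rem() yields stripe_nr = 3 and a remainder of 8 KiB, the same pair the removed multiply-and-subtract sequence computed, and the ASSERT holds because the remainder is always smaller than stripe_len, which is now a u32.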
@ -6737,11 +6712,11 @@ static void submit_stripe_bio(struct btrfs_io_context *bioc, struct bio *bio,
bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
(unsigned long)dev->bdev->bd_dev, rcu_str_deref(dev->name),
dev->devid, bio->bi_iter.bi_size);
bio_set_dev(bio, dev->bdev);
btrfs_bio_counter_inc_noblocked(fs_info);
btrfsic_submit_bio(bio);
btrfsic_check_bio(bio);
submit_bio(bio);
}
static void bioc_error(struct btrfs_io_context *bioc, struct bio *bio, u64 logical)
@ -6823,10 +6798,12 @@ blk_status_t btrfs_map_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
continue;
}
if (dev_nr < total_devs - 1)
bio = btrfs_bio_clone(first_bio);
else
if (dev_nr < total_devs - 1) {
bio = btrfs_bio_clone(dev->bdev, first_bio);
} else {
bio = first_bio;
bio_set_dev(bio, dev->bdev);
}
submit_stripe_bio(bioc, bio, bioc->stripes[dev_nr].physical, dev);
}
@ -7359,7 +7336,6 @@ static int read_one_dev(struct extent_buffer *leaf,
int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
{
struct btrfs_root *root = fs_info->tree_root;
struct btrfs_super_block *super_copy = fs_info->super_copy;
struct extent_buffer *sb;
struct btrfs_disk_key *disk_key;
@ -7375,30 +7351,16 @@ int btrfs_read_sys_array(struct btrfs_fs_info *fs_info)
struct btrfs_key key;
ASSERT(BTRFS_SUPER_INFO_SIZE <= fs_info->nodesize);
/*
* This will create extent buffer of nodesize, superblock size is
* fixed to BTRFS_SUPER_INFO_SIZE. If nodesize > sb size, this will
* overallocate but we can keep it as-is, only the first page is used.
* We allocated a dummy extent, just to use extent buffer accessors.
* There will be unused space after BTRFS_SUPER_INFO_SIZE, but
* that's fine, we will not go beyond system chunk array anyway.
*/
sb = btrfs_find_create_tree_block(fs_info, BTRFS_SUPER_INFO_OFFSET,
root->root_key.objectid, 0);
if (IS_ERR(sb))
return PTR_ERR(sb);
sb = alloc_dummy_extent_buffer(fs_info, BTRFS_SUPER_INFO_OFFSET);
if (!sb)
return -ENOMEM;
set_extent_buffer_uptodate(sb);
/*
* The sb extent buffer is artificial and just used to read the system array.
* set_extent_buffer_uptodate() call does not properly mark all its
* pages up-to-date when the page is larger: extent does not cover the
* whole page and consequently check_page_uptodate does not find all
* the page's extents up-to-date (the hole beyond sb),
* write_extent_buffer then triggers a WARN_ON.
*
* Regular short extents go through mark_extent_buffer_dirty/writeback cycle,
* but sb spans only this function. Add an explicit SetPageUptodate call
* to silence the warning eg. on PowerPC 64.
*/
if (PAGE_SIZE > BTRFS_SUPER_INFO_SIZE)
SetPageUptodate(sb->pages[0]);
write_extent_buffer(sb, super_copy, 0, BTRFS_SUPER_INFO_SIZE);
array_size = btrfs_super_sys_array_size(super_copy);
@ -7561,6 +7523,7 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
struct btrfs_key found_key;
int ret;
int slot;
int iter_ret = 0;
u64 total_dev = 0;
u64 last_ra_node = 0;
@ -7604,30 +7567,18 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
key.objectid = BTRFS_DEV_ITEMS_OBJECTID;
key.offset = 0;
key.type = 0;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto error;
while (1) {
struct extent_buffer *node;
btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
struct extent_buffer *node = path->nodes[1];
leaf = path->nodes[0];
slot = path->slots[0];
if (slot >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret == 0)
continue;
if (ret < 0)
goto error;
break;
}
node = path->nodes[1];
if (node) {
if (last_ra_node != node->start) {
readahead_tree_node_children(node);
last_ra_node = node->start;
}
}
btrfs_item_key_to_cpu(leaf, &found_key, slot);
if (found_key.type == BTRFS_DEV_ITEM_KEY) {
struct btrfs_dev_item *dev_item;
dev_item = btrfs_item_ptr(leaf, slot,
@ -7652,7 +7603,11 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
if (ret)
goto error;
}
path->slots[0]++;
}
/* Catch error found during iteration */
if (iter_ret < 0) {
ret = iter_ret;
goto error;
}
/*
@ -7660,12 +7615,12 @@ int btrfs_read_chunk_tree(struct btrfs_fs_info *fs_info)
* do another round of validation checks.
*/
if (total_dev != fs_info->fs_devices->total_devices) {
btrfs_err(fs_info,
"super_num_devices %llu mismatch with num_devices %llu found here",
btrfs_warn(fs_info,
"super block num_devices %llu mismatch with DEV_ITEM count %llu, will be repaired on next transaction commit",
btrfs_super_num_devices(fs_info->super_copy),
total_dev);
ret = -EINVAL;
goto error;
fs_info->fs_devices->total_devices = total_dev;
btrfs_set_super_num_devices(fs_info->super_copy, total_dev);
}
if (btrfs_super_total_bytes(fs_info->super_copy) <
fs_info->fs_devices->total_rw_bytes) {
@ -8277,7 +8232,7 @@ bool btrfs_pinned_by_swapfile(struct btrfs_fs_info *fs_info, void *ptr)
static int relocating_repair_kthread(void *data)
{
struct btrfs_block_group *cache = (struct btrfs_block_group *)data;
struct btrfs_block_group *cache = data;
struct btrfs_fs_info *fs_info = cache->fs_info;
u64 target;
int ret = 0;

View File

@ -17,17 +17,51 @@ extern struct mutex uuid_mutex;
#define BTRFS_STRIPE_LEN SZ_64K
/* Used by sanity check for btrfs_raid_types. */
#define const_ffs(n) (__builtin_ctzll(n) + 1)
/*
* The conversion from BTRFS_BLOCK_GROUP_* bits to btrfs_raid_type requires
* RAID0 always to be the lowest profile bit.
* Although it's part of on-disk format and should never change, do extra
* compile-time sanity checks.
*/
static_assert(const_ffs(BTRFS_BLOCK_GROUP_RAID0) <
const_ffs(BTRFS_BLOCK_GROUP_PROFILE_MASK & ~BTRFS_BLOCK_GROUP_RAID0));
static_assert(const_ilog2(BTRFS_BLOCK_GROUP_RAID0) >
ilog2(BTRFS_BLOCK_GROUP_TYPE_MASK));
/* ilog2() can handle both constants and variables */
#define BTRFS_BG_FLAG_TO_INDEX(profile) \
ilog2((profile) >> (ilog2(BTRFS_BLOCK_GROUP_RAID0) - 1))
enum btrfs_raid_types {
/* SINGLE is the special one as it doesn't have on-disk bit. */
BTRFS_RAID_SINGLE = 0,
BTRFS_RAID_RAID0 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID0),
BTRFS_RAID_RAID1 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1),
BTRFS_RAID_DUP = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_DUP),
BTRFS_RAID_RAID10 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID10),
BTRFS_RAID_RAID5 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID5),
BTRFS_RAID_RAID6 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID6),
BTRFS_RAID_RAID1C3 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1C3),
BTRFS_RAID_RAID1C4 = BTRFS_BG_FLAG_TO_INDEX(BTRFS_BLOCK_GROUP_RAID1C4),
BTRFS_NR_RAID_TYPES
};
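
To make the mapping concrete, here is a worked example under the current on-disk bit layout, where BTRFS_BLOCK_GROUP_RAID0 is 1ULL << 3 and the macro therefore reduces to ilog2(profile >> 2); these extra static_asserts are illustrative, not part of the series:

/* RAID0: ilog2((1ULL << 3) >> 2) == ilog2(2) == 1 */
static_assert(BTRFS_RAID_RAID0 == 1);
/* RAID1: ilog2((1ULL << 4) >> 2) == ilog2(4) == 2 */
static_assert(BTRFS_RAID_RAID1 == 2);
/* Index 0 stays reserved for SINGLE, which has no on-disk profile bit. */
static_assert(BTRFS_RAID_SINGLE == 0);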
struct btrfs_io_geometry {
/* remaining bytes before crossing a stripe */
u64 len;
/* offset of logical address in chunk */
u64 offset;
/* length of single IO stripe */
u64 stripe_len;
u32 stripe_len;
/* offset of address in stripe */
u32 stripe_offset;
/* number of stripe where address falls */
u64 stripe_nr;
/* offset of address in stripe */
u64 stripe_offset;
/* offset of raid56 stripe into the chunk */
u64 raid56_stripe_offset;
};
@ -430,7 +464,7 @@ struct map_lookup {
u64 type;
int io_align;
int io_width;
u64 stripe_len;
u32 stripe_len;
int num_stripes;
int sub_stripes;
int verified_stripes; /* For mount time dev extent verification */

View File

@ -272,10 +272,12 @@ out:
ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
{
struct btrfs_key found_key;
struct btrfs_key key;
struct inode *inode = d_inode(dentry);
struct btrfs_root *root = BTRFS_I(inode)->root;
struct btrfs_path *path;
int iter_ret = 0;
int ret = 0;
size_t total_size = 0, size_left = size;
@ -294,44 +296,23 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
path->reada = READA_FORWARD;
/* search for our xattrs */
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto err;
while (1) {
btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
struct extent_buffer *leaf;
int slot;
struct btrfs_dir_item *di;
struct btrfs_key found_key;
u32 item_size;
u32 cur;
leaf = path->nodes[0];
slot = path->slots[0];
/* this is where we start walking through the path */
if (slot >= btrfs_header_nritems(leaf)) {
/*
* if we've reached the last slot in this leaf we need
* to go to the next leaf and reset everything
*/
ret = btrfs_next_leaf(root, path);
if (ret < 0)
goto err;
else if (ret > 0)
break;
continue;
}
btrfs_item_key_to_cpu(leaf, &found_key, slot);
/* check to make sure this item is what we want */
if (found_key.objectid != key.objectid)
break;
if (found_key.type > BTRFS_XATTR_ITEM_KEY)
break;
if (found_key.type < BTRFS_XATTR_ITEM_KEY)
goto next_item;
continue;
di = btrfs_item_ptr(leaf, slot, struct btrfs_dir_item);
item_size = btrfs_item_size(leaf, slot);
@ -351,8 +332,8 @@ ssize_t btrfs_listxattr(struct dentry *dentry, char *buffer, size_t size)
goto next;
if (!buffer || (name_len + 1) > size_left) {
ret = -ERANGE;
goto err;
iter_ret = -ERANGE;
break;
}
read_extent_buffer(leaf, buffer, name_ptr, name_len);
@ -364,12 +345,13 @@ next:
cur += this_len;
di = (struct btrfs_dir_item *)((char *)di + this_len);
}
next_item:
path->slots[0]++;
}
ret = total_size;
err:
if (iter_ret < 0)
ret = iter_ret;
else
ret = total_size;
btrfs_free_path(path);
return ret;
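
Both conversions above follow the same shape. A distilled, hedged sketch of the iterator pattern follows; example_walk_objectid() and its empty per-item body are placeholders rather than code from the series:

static int example_walk_objectid(struct btrfs_root *root, u64 objectid)
{
	struct btrfs_path *path;
	struct btrfs_key key = { .objectid = objectid, .type = 0, .offset = 0 };
	struct btrfs_key found_key;
	int iter_ret = 0;
	int ret = 0;

	path = btrfs_alloc_path();
	if (!path)
		return -ENOMEM;

	btrfs_for_each_slot(root, &key, &found_key, path, iter_ret) {
		/* Stop once the iteration leaves the requested object. */
		if (found_key.objectid != objectid)
			break;
		/* Per-item work goes here, reading from path->nodes[0] / path->slots[0]. */
	}
	/* After the loop: iter_ret < 0 reports an error, > 0 means the tree ran out. */
	if (iter_ret < 0)
		ret = iter_ret;

	btrfs_free_path(path);
	return ret;
}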

View File

@ -51,11 +51,13 @@
#define BTRFS_MIN_ACTIVE_ZONES (BTRFS_SUPER_MIRROR_MAX + 5)
/*
* Maximum supported zone size. Currently, SMR disks have a zone size of
* 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range. We do not
* expect the zone size to become larger than 8GiB in the near future.
* Minimum / maximum supported zone size. Currently, SMR disks have a zone
* size of 256MiB, and we are expecting ZNS drives to be in the 1-4GiB range.
* We do not expect the zone size to become larger than 8GiB or smaller than
* 4MiB in the near future.
*/
#define BTRFS_MAX_ZONE_SIZE SZ_8G
#define BTRFS_MIN_ZONE_SIZE SZ_4M
#define SUPER_INFO_SECTORS ((u64)BTRFS_SUPER_INFO_SIZE >> SECTOR_SHIFT)
@ -401,6 +403,13 @@ int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache)
zone_info->zone_size, BTRFS_MAX_ZONE_SIZE);
ret = -EINVAL;
goto out;
} else if (zone_info->zone_size < BTRFS_MIN_ZONE_SIZE) {
btrfs_err_in_rcu(fs_info,
"zoned: %s: zone size %llu smaller than supported minimum %u",
rcu_str_deref(device->name),
zone_info->zone_size, BTRFS_MIN_ZONE_SIZE);
ret = -EINVAL;
goto out;
}
nr_sectors = bdev_nr_sectors(bdev);
@ -1835,7 +1844,7 @@ bool btrfs_zone_activate(struct btrfs_block_group *block_group)
}
/* No space left */
if (block_group->alloc_offset == block_group->zone_capacity) {
if (btrfs_zoned_bg_is_full(block_group)) {
ret = false;
goto out_unlock;
}
@ -1872,20 +1881,14 @@ out_unlock:
return ret;
}
int btrfs_zone_finish(struct btrfs_block_group *block_group)
static int do_zone_finish(struct btrfs_block_group *block_group, bool fully_written)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
struct map_lookup *map;
struct btrfs_device *device;
u64 physical;
bool need_zone_finish;
int ret = 0;
int i;
if (!btrfs_is_zoned(fs_info))
return 0;
map = block_group->physical_map;
spin_lock(&block_group->lock);
if (!block_group->zone_is_active) {
spin_unlock(&block_group->lock);
@ -1895,39 +1898,55 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group)
/* Check if we have unwritten allocated space */
if ((block_group->flags &
(BTRFS_BLOCK_GROUP_METADATA | BTRFS_BLOCK_GROUP_SYSTEM)) &&
block_group->alloc_offset > block_group->meta_write_pointer) {
block_group->start + block_group->alloc_offset > block_group->meta_write_pointer) {
spin_unlock(&block_group->lock);
return -EAGAIN;
}
spin_unlock(&block_group->lock);
ret = btrfs_inc_block_group_ro(block_group, false);
if (ret)
return ret;
/* Ensure all writes in this block group finish */
btrfs_wait_block_group_reservations(block_group);
/* No need to wait for NOCOW writers. Zoned mode does not allow that. */
btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
block_group->length);
spin_lock(&block_group->lock);
/*
* Bail out if someone already deactivated the block group, or
* allocated space is left in the block group.
* If we are sure that the block group is full (= no more room left for
* new allocation) and the IO for the last usable block is completed, we
* don't need to wait for the other IOs. This holds because we ensure
* the sequential IO submissions using the ZONE_APPEND command for data
* and block_group->meta_write_pointer for metadata.
*/
if (!block_group->zone_is_active) {
if (!fully_written) {
spin_unlock(&block_group->lock);
btrfs_dec_block_group_ro(block_group);
return 0;
ret = btrfs_inc_block_group_ro(block_group, false);
if (ret)
return ret;
/* Ensure all writes in this block group finish */
btrfs_wait_block_group_reservations(block_group);
/* No need to wait for NOCOW writers. Zoned mode does not allow that */
btrfs_wait_ordered_roots(fs_info, U64_MAX, block_group->start,
block_group->length);
spin_lock(&block_group->lock);
/*
* Bail out if someone already deactivated the block group, or
* allocated space is left in the block group.
*/
if (!block_group->zone_is_active) {
spin_unlock(&block_group->lock);
btrfs_dec_block_group_ro(block_group);
return 0;
}
if (block_group->reserved) {
spin_unlock(&block_group->lock);
btrfs_dec_block_group_ro(block_group);
return -EAGAIN;
}
}
if (block_group->reserved) {
spin_unlock(&block_group->lock);
btrfs_dec_block_group_ro(block_group);
return -EAGAIN;
}
/*
* The block group is not fully allocated, so not fully written yet. We
* need to send ZONE_FINISH command to free up an active zone.
*/
need_zone_finish = !btrfs_zoned_bg_is_full(block_group);
block_group->zone_is_active = 0;
block_group->alloc_offset = block_group->zone_capacity;
@ -1936,24 +1955,29 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group)
btrfs_clear_data_reloc_bg(block_group);
spin_unlock(&block_group->lock);
map = block_group->physical_map;
for (i = 0; i < map->num_stripes; i++) {
device = map->stripes[i].dev;
physical = map->stripes[i].physical;
struct btrfs_device *device = map->stripes[i].dev;
const u64 physical = map->stripes[i].physical;
if (device->zone_info->max_active_zones == 0)
continue;
ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
physical >> SECTOR_SHIFT,
device->zone_info->zone_size >> SECTOR_SHIFT,
GFP_NOFS);
if (need_zone_finish) {
ret = blkdev_zone_mgmt(device->bdev, REQ_OP_ZONE_FINISH,
physical >> SECTOR_SHIFT,
device->zone_info->zone_size >> SECTOR_SHIFT,
GFP_NOFS);
if (ret)
return ret;
if (ret)
return ret;
}
btrfs_dev_clear_active_zone(device, physical);
}
btrfs_dec_block_group_ro(block_group);
if (!fully_written)
btrfs_dec_block_group_ro(block_group);
spin_lock(&fs_info->zone_active_bgs_lock);
ASSERT(!list_empty(&block_group->active_bg_list));
@ -1966,6 +1990,14 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group)
return 0;
}
int btrfs_zone_finish(struct btrfs_block_group *block_group)
{
if (!btrfs_is_zoned(block_group->fs_info))
return 0;
return do_zone_finish(block_group, false);
}
bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
{
struct btrfs_fs_info *fs_info = fs_devices->fs_info;
@ -1997,9 +2029,7 @@ bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags)
void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 length)
{
struct btrfs_block_group *block_group;
struct map_lookup *map;
struct btrfs_device *device;
u64 physical;
u64 min_alloc_bytes;
if (!btrfs_is_zoned(fs_info))
return;
@ -2007,44 +2037,54 @@ void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical, u64 len
block_group = btrfs_lookup_block_group(fs_info, logical);
ASSERT(block_group);
if (logical + length < block_group->start + block_group->zone_capacity)
/* No MIXED_BG on zoned btrfs. */
if (block_group->flags & BTRFS_BLOCK_GROUP_DATA)
min_alloc_bytes = fs_info->sectorsize;
else
min_alloc_bytes = fs_info->nodesize;
/* Bail out if we can allocate more data from this block group. */
if (logical + length + min_alloc_bytes <=
block_group->start + block_group->zone_capacity)
goto out;
spin_lock(&block_group->lock);
if (!block_group->zone_is_active) {
spin_unlock(&block_group->lock);
goto out;
}
block_group->zone_is_active = 0;
/* We should have consumed all the free space */
ASSERT(block_group->alloc_offset == block_group->zone_capacity);
ASSERT(block_group->free_space_ctl->free_space == 0);
btrfs_clear_treelog_bg(block_group);
btrfs_clear_data_reloc_bg(block_group);
spin_unlock(&block_group->lock);
map = block_group->physical_map;
device = map->stripes[0].dev;
physical = map->stripes[0].physical;
if (!device->zone_info->max_active_zones)
goto out;
btrfs_dev_clear_active_zone(device, physical);
spin_lock(&fs_info->zone_active_bgs_lock);
ASSERT(!list_empty(&block_group->active_bg_list));
list_del_init(&block_group->active_bg_list);
spin_unlock(&fs_info->zone_active_bgs_lock);
btrfs_put_block_group(block_group);
do_zone_finish(block_group, true);
out:
btrfs_put_block_group(block_group);
}
static void btrfs_zone_finish_endio_workfn(struct work_struct *work)
{
struct btrfs_block_group *bg =
container_of(work, struct btrfs_block_group, zone_finish_work);
wait_on_extent_buffer_writeback(bg->last_eb);
free_extent_buffer(bg->last_eb);
btrfs_zone_finish_endio(bg->fs_info, bg->start, bg->length);
btrfs_put_block_group(bg);
}
void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
struct extent_buffer *eb)
{
if (!bg->seq_zone || eb->start + eb->len * 2 <= bg->start + bg->zone_capacity)
return;
if (WARN_ON(bg->zone_finish_work.func == btrfs_zone_finish_endio_workfn)) {
btrfs_err(bg->fs_info, "double scheduling of bg %llu zone finishing",
bg->start);
return;
}
/* For the work */
btrfs_get_block_group(bg);
atomic_inc(&eb->refs);
bg->last_eb = eb;
INIT_WORK(&bg->zone_finish_work, btrfs_zone_finish_endio_workfn);
queue_work(system_unbound_wq, &bg->zone_finish_work);
}
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
{
struct btrfs_fs_info *fs_info = bg->fs_info;
@ -2072,3 +2112,30 @@ void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
}
mutex_unlock(&fs_devices->device_list_mutex);
}
bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
{
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
u64 used = 0;
u64 total = 0;
u64 factor;
ASSERT(btrfs_is_zoned(fs_info));
if (fs_info->bg_reclaim_threshold == 0)
return false;
mutex_lock(&fs_devices->device_list_mutex);
list_for_each_entry(device, &fs_devices->devices, dev_list) {
if (!device->bdev)
continue;
total += device->disk_total_bytes;
used += device->bytes_used;
}
mutex_unlock(&fs_devices->device_list_mutex);
factor = div64_u64(used * 100, total);
return factor >= fs_info->bg_reclaim_threshold;
}
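
Worked example with illustrative numbers: two zoned devices of 256 GiB each give total = 512 GiB; with 400 GiB allocated to block groups, factor = 400 * 100 / 512 = 78, which meets the default threshold of 75 and triggers reclaim, while setting bg_reclaim_threshold to 0 takes the early return above and disables the check entirely.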

View File

@ -10,11 +10,7 @@
#include "block-group.h"
#include "btrfs_inode.h"
/*
* Block groups with more than this value (percents) of unusable space will be
* scheduled for background reclaim.
*/
#define BTRFS_DEFAULT_RECLAIM_THRESH 75
#define BTRFS_DEFAULT_RECLAIM_THRESH (75)
struct btrfs_zoned_device_info {
/*
@ -76,8 +72,11 @@ int btrfs_zone_finish(struct btrfs_block_group *block_group);
bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices, u64 flags);
void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
u64 length);
void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
struct extent_buffer *eb);
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info);
#else /* CONFIG_BLK_DEV_ZONED */
static inline int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
struct blk_zone *zone)
@ -233,9 +232,17 @@ static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
static inline void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
u64 logical, u64 length) { }
static inline void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
struct extent_buffer *eb) { }
static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg) { }
static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info) { }
static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
{
return false;
}
#endif
static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
@ -370,4 +377,10 @@ static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode)
mutex_unlock(&root->fs_info->zoned_data_reloc_io_lock);
}
static inline bool btrfs_zoned_bg_is_full(const struct btrfs_block_group *bg)
{
ASSERT(btrfs_is_zoned(bg->fs_info));
return (bg->alloc_offset == bg->zone_capacity);
}
#endif

View File

@ -93,22 +93,26 @@ static inline struct workspace *list_to_workspace(struct list_head *list)
void zstd_free_workspace(struct list_head *ws);
struct list_head *zstd_alloc_workspace(unsigned int level);
/*
* zstd_reclaim_timer_fn - reclaim timer
/**
* Timer callback to free unused workspaces.
*
* @t: timer
*
* This scans the lru_list and attempts to reclaim any workspace that hasn't
* been used for ZSTD_BTRFS_RECLAIM_JIFFIES.
*
* The context is softirq and does not need the _bh locking primitives.
*/
static void zstd_reclaim_timer_fn(struct timer_list *timer)
{
unsigned long reclaim_threshold = jiffies - ZSTD_BTRFS_RECLAIM_JIFFIES;
struct list_head *pos, *next;
spin_lock_bh(&wsm.lock);
spin_lock(&wsm.lock);
if (list_empty(&wsm.lru_list)) {
spin_unlock_bh(&wsm.lock);
spin_unlock(&wsm.lock);
return;
}
@ -137,7 +141,7 @@ static void zstd_reclaim_timer_fn(struct timer_list *timer)
if (!list_empty(&wsm.lru_list))
mod_timer(&wsm.timer, jiffies + ZSTD_BTRFS_RECLAIM_JIFFIES);
spin_unlock_bh(&wsm.lock);
spin_unlock(&wsm.lock);
}
/*

View File

@ -399,7 +399,7 @@ static ssize_t erofs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
if (!err)
return iomap_dio_rw(iocb, to, &erofs_iomap_ops,
NULL, 0, 0);
NULL, 0, NULL, 0);
if (err < 0)
return err;
}

View File

@ -76,7 +76,7 @@ static ssize_t ext4_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
return generic_file_read_iter(iocb, to);
}
ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, 0);
ret = iomap_dio_rw(iocb, to, &ext4_iomap_ops, NULL, 0, NULL, 0);
inode_unlock_shared(inode);
file_accessed(iocb->ki_filp);
@ -565,7 +565,7 @@ static ssize_t ext4_dio_write_iter(struct kiocb *iocb, struct iov_iter *from)
iomap_ops = &ext4_iomap_overwrite_ops;
ret = iomap_dio_rw(iocb, from, iomap_ops, &ext4_dio_write_ops,
(unaligned_io || extend) ? IOMAP_DIO_FORCE_WAIT : 0,
0);
NULL, 0);
if (ret == -ENOTBLK)
ret = 0;

View File

@ -4308,7 +4308,7 @@ static ssize_t f2fs_dio_read_iter(struct kiocb *iocb, struct iov_iter *to)
*/
inc_page_count(sbi, F2FS_DIO_READ);
dio = __iomap_dio_rw(iocb, to, &f2fs_iomap_ops,
&f2fs_iomap_dio_read_ops, 0, 0);
&f2fs_iomap_dio_read_ops, 0, NULL, 0);
if (IS_ERR_OR_NULL(dio)) {
ret = PTR_ERR_OR_ZERO(dio);
if (ret != -EIOCBQUEUED)
@ -4526,7 +4526,7 @@ static ssize_t f2fs_dio_write_iter(struct kiocb *iocb, struct iov_iter *from,
if (pos + count > inode->i_size)
dio_flags |= IOMAP_DIO_FORCE_WAIT;
dio = __iomap_dio_rw(iocb, from, &f2fs_iomap_ops,
&f2fs_iomap_dio_write_ops, dio_flags, 0);
&f2fs_iomap_dio_write_ops, dio_flags, NULL, 0);
if (IS_ERR_OR_NULL(dio)) {
ret = PTR_ERR_OR_ZERO(dio);
if (ret == -ENOTBLK)

View File

@ -835,7 +835,7 @@ retry:
pagefault_disable();
to->nofault = true;
ret = iomap_dio_rw(iocb, to, &gfs2_iomap_ops, NULL,
IOMAP_DIO_PARTIAL, read);
IOMAP_DIO_PARTIAL, NULL, read);
to->nofault = false;
pagefault_enable();
if (ret <= 0 && ret != -EFAULT)
@ -898,7 +898,7 @@ retry:
from->nofault = true;
ret = iomap_dio_rw(iocb, from, &gfs2_iomap_ops, NULL,
IOMAP_DIO_PARTIAL, written);
IOMAP_DIO_PARTIAL, NULL, written);
from->nofault = false;
if (ret <= 0) {
if (ret == -ENOTBLK)

View File

@ -51,6 +51,15 @@ struct iomap_dio {
};
};
static struct bio *iomap_dio_alloc_bio(const struct iomap_iter *iter,
struct iomap_dio *dio, unsigned short nr_vecs, unsigned int opf)
{
if (dio->dops && dio->dops->bio_set)
return bio_alloc_bioset(iter->iomap.bdev, nr_vecs, opf,
GFP_KERNEL, dio->dops->bio_set);
return bio_alloc(iter->iomap.bdev, nr_vecs, opf, GFP_KERNEL);
}
static void iomap_dio_submit_bio(const struct iomap_iter *iter,
struct iomap_dio *dio, struct bio *bio, loff_t pos)
{
@ -145,7 +154,7 @@ static inline void iomap_dio_set_error(struct iomap_dio *dio, int ret)
cmpxchg(&dio->error, 0, ret);
}
static void iomap_dio_bio_end_io(struct bio *bio)
void iomap_dio_bio_end_io(struct bio *bio)
{
struct iomap_dio *dio = bio->bi_private;
bool should_dirty = (dio->flags & IOMAP_DIO_DIRTY);
@ -177,16 +186,16 @@ static void iomap_dio_bio_end_io(struct bio *bio)
bio_put(bio);
}
}
EXPORT_SYMBOL_GPL(iomap_dio_bio_end_io);
static void iomap_dio_zero(const struct iomap_iter *iter, struct iomap_dio *dio,
loff_t pos, unsigned len)
{
struct inode *inode = file_inode(dio->iocb->ki_filp);
struct page *page = ZERO_PAGE(0);
int flags = REQ_SYNC | REQ_IDLE;
struct bio *bio;
bio = bio_alloc(iter->iomap.bdev, 1, REQ_OP_WRITE | flags, GFP_KERNEL);
bio = iomap_dio_alloc_bio(iter, dio, 1, REQ_OP_WRITE | REQ_SYNC | REQ_IDLE);
fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
GFP_KERNEL);
bio->bi_iter.bi_sector = iomap_sector(&iter->iomap, pos);
@ -311,7 +320,7 @@ static loff_t iomap_dio_bio_iter(const struct iomap_iter *iter,
goto out;
}
bio = bio_alloc(iomap->bdev, nr_pages, bio_opf, GFP_KERNEL);
bio = iomap_dio_alloc_bio(iter, dio, nr_pages, bio_opf);
fscrypt_set_bio_crypt_ctx(bio, inode, pos >> inode->i_blkbits,
GFP_KERNEL);
bio->bi_iter.bi_sector = iomap_sector(iomap, pos);
@ -474,7 +483,7 @@ static loff_t iomap_dio_iter(const struct iomap_iter *iter,
struct iomap_dio *
__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
unsigned int dio_flags, size_t done_before)
unsigned int dio_flags, void *private, size_t done_before)
{
struct address_space *mapping = iocb->ki_filp->f_mapping;
struct inode *inode = file_inode(iocb->ki_filp);
@ -483,6 +492,7 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
.pos = iocb->ki_pos,
.len = iov_iter_count(iter),
.flags = IOMAP_DIRECT,
.private = private,
};
loff_t end = iomi.pos + iomi.len - 1, ret = 0;
bool wait_for_completion =
@ -672,11 +682,12 @@ EXPORT_SYMBOL_GPL(__iomap_dio_rw);
ssize_t
iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
unsigned int dio_flags, size_t done_before)
unsigned int dio_flags, void *private, size_t done_before)
{
struct iomap_dio *dio;
dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, done_before);
dio = __iomap_dio_rw(iocb, iter, ops, dops, dio_flags, private,
done_before);
if (IS_ERR_OR_NULL(dio))
return PTR_ERR_OR_ZERO(dio);
return iomap_dio_complete(dio);

View File

@ -225,7 +225,7 @@ xfs_file_dio_read(
ret = xfs_ilock_iocb(iocb, XFS_IOLOCK_SHARED);
if (ret)
return ret;
ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, 0);
ret = iomap_dio_rw(iocb, to, &xfs_read_iomap_ops, NULL, 0, NULL, 0);
xfs_iunlock(ip, XFS_IOLOCK_SHARED);
return ret;
@ -534,7 +534,7 @@ xfs_file_dio_write_aligned(
}
trace_xfs_file_direct_write(iocb, from);
ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
&xfs_dio_write_ops, 0, 0);
&xfs_dio_write_ops, 0, NULL, 0);
out_unlock:
if (iolock)
xfs_iunlock(ip, iolock);
@ -612,7 +612,7 @@ retry_exclusive:
trace_xfs_file_direct_write(iocb, from);
ret = iomap_dio_rw(iocb, from, &xfs_direct_write_iomap_ops,
&xfs_dio_write_ops, flags, 0);
&xfs_dio_write_ops, flags, NULL, 0);
/*
* Retry unaligned I/O with exclusive blocking semantics if the DIO

View File

@ -900,7 +900,7 @@ static ssize_t zonefs_file_dio_write(struct kiocb *iocb, struct iov_iter *from)
ret = zonefs_file_dio_append(iocb, from);
else
ret = iomap_dio_rw(iocb, from, &zonefs_iomap_ops,
&zonefs_write_dio_ops, 0, 0);
&zonefs_write_dio_ops, 0, NULL, 0);
if (zi->i_ztype == ZONEFS_ZTYPE_SEQ &&
(ret > 0 || ret == -EIOCBQUEUED)) {
if (ret > 0)
@ -1042,7 +1042,7 @@ static ssize_t zonefs_file_read_iter(struct kiocb *iocb, struct iov_iter *to)
}
file_accessed(iocb->ki_filp);
ret = iomap_dio_rw(iocb, to, &zonefs_iomap_ops,
&zonefs_read_dio_ops, 0, 0);
&zonefs_read_dio_ops, 0, NULL, 0);
} else {
ret = generic_file_read_iter(iocb, to);
if (ret == -EIO)

View File

@ -1708,6 +1708,11 @@ static inline bool __sb_start_write_trylock(struct super_block *sb, int level)
#define __sb_writers_release(sb, lev) \
percpu_rwsem_release(&(sb)->s_writers.rw_sem[(lev)-1], 1, _THIS_IP_)
static inline bool sb_write_started(const struct super_block *sb)
{
return lockdep_is_held_type(sb->s_writers.rw_sem + SB_FREEZE_WRITE - 1, 1);
}
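
A hedged usage sketch (the myfs_ function is hypothetical); since the helper is built on lockdep state, it is mainly useful as an assertion on debug builds:

static ssize_t myfs_do_write(struct kiocb *iocb, struct iov_iter *from)
{
	struct inode *inode = file_inode(iocb->ki_filp);

	/* The caller is expected to hold sb_start_write() on this superblock. */
	WARN_ON_ONCE(!sb_write_started(inode->i_sb));

	/* ... the actual write path would go here ... */
	return 0;
}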
/**
* sb_end_write - drop write access to a superblock
* @sb: the super we wrote to

View File

@ -188,6 +188,7 @@ struct iomap_iter {
unsigned flags;
struct iomap iomap;
struct iomap srcmap;
void *private;
};
int iomap_iter(struct iomap_iter *iter, const struct iomap_ops *ops);
@ -320,6 +321,16 @@ struct iomap_dio_ops {
unsigned flags);
void (*submit_io)(const struct iomap_iter *iter, struct bio *bio,
loff_t file_offset);
/*
* Filesystems wishing to attach private information to a direct io bio
* must provide a ->submit_io method that attaches the additional
* information to the bio and changes the ->bi_end_io callback to a
* custom function. This function should, at a minimum, perform any
* relevant post-processing of the bio and end with a call to
* iomap_dio_bio_end_io.
*/
struct bio_set *bio_set;
};
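
The comment above describes the contract; a hedged sketch of how a filesystem might wire ->bio_set and ->submit_io together follows. Everything prefixed myfs_ is hypothetical and only meant to illustrate keeping bi_private intact and finishing through iomap_dio_bio_end_io():

struct myfs_dio_bio {
	u64 file_offset;	/* example of extra per-bio state */
	struct bio bio;		/* must be last: bio_set front_pad == offsetof(struct myfs_dio_bio, bio) */
};

/*
 * Initialized once at module load, e.g. with
 * bioset_init(&myfs_dio_bioset, BIO_POOL_SIZE,
 *	       offsetof(struct myfs_dio_bio, bio), 0);
 */
static struct bio_set myfs_dio_bioset;

static void myfs_dio_end_io(struct bio *bio)
{
	struct myfs_dio_bio *mbio = container_of(bio, struct myfs_dio_bio, bio);

	/* Filesystem specific post-processing of mbio goes here. */
	pr_debug("dio bio at file offset %llu completed\n", mbio->file_offset);

	/* Hand the bio back to iomap; bi_private still points at the iomap_dio. */
	iomap_dio_bio_end_io(bio);
}

static void myfs_dio_submit_io(const struct iomap_iter *iter, struct bio *bio,
			       loff_t file_offset)
{
	struct myfs_dio_bio *mbio = container_of(bio, struct myfs_dio_bio, bio);

	mbio->file_offset = file_offset;
	bio->bi_end_io = myfs_dio_end_io;	/* keep bi_private untouched */
	submit_bio(bio);
}

static const struct iomap_dio_ops myfs_dio_ops = {
	.submit_io	= myfs_dio_submit_io,
	.bio_set	= &myfs_dio_bioset,
};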
/*
@ -344,11 +355,12 @@ struct iomap_dio_ops {
ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
unsigned int dio_flags, size_t done_before);
unsigned int dio_flags, void *private, size_t done_before);
struct iomap_dio *__iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
unsigned int dio_flags, size_t done_before);
unsigned int dio_flags, void *private, size_t done_before);
ssize_t iomap_dio_complete(struct iomap_dio *dio);
void iomap_dio_bio_end_io(struct bio *bio);
#ifdef CONFIG_SWAP
struct file;

View File

@ -24,7 +24,7 @@ struct btrfs_free_cluster;
struct map_lookup;
struct extent_buffer;
struct btrfs_work;
struct __btrfs_workqueue;
struct btrfs_workqueue;
struct btrfs_qgroup_extent_record;
struct btrfs_qgroup;
struct extent_io_tree;
@ -1457,42 +1457,36 @@ DEFINE_EVENT(btrfs__work, btrfs_ordered_sched,
TP_ARGS(work)
);
DECLARE_EVENT_CLASS(btrfs__workqueue,
DECLARE_EVENT_CLASS(btrfs_workqueue,
TP_PROTO(const struct __btrfs_workqueue *wq,
const char *name, int high),
TP_PROTO(const struct btrfs_workqueue *wq, const char *name),
TP_ARGS(wq, name, high),
TP_ARGS(wq, name),
TP_STRUCT__entry_btrfs(
__field( const void *, wq )
__string( name, name )
__field( int , high )
),
TP_fast_assign_btrfs(btrfs_workqueue_owner(wq),
__entry->wq = wq;
__assign_str(name, name);
__entry->high = high;
),
TP_printk_btrfs("name=%s%s wq=%p", __get_str(name),
__print_flags(__entry->high, "",
{(WQ_HIGHPRI), "-high"}),
TP_printk_btrfs("name=%s wq=%p", __get_str(name),
__entry->wq)
);
DEFINE_EVENT(btrfs__workqueue, btrfs_workqueue_alloc,
DEFINE_EVENT(btrfs_workqueue, btrfs_workqueue_alloc,
TP_PROTO(const struct __btrfs_workqueue *wq,
const char *name, int high),
TP_PROTO(const struct btrfs_workqueue *wq, const char *name),
TP_ARGS(wq, name, high)
TP_ARGS(wq, name)
);
DECLARE_EVENT_CLASS(btrfs__workqueue_done,
DECLARE_EVENT_CLASS(btrfs_workqueue_done,
TP_PROTO(const struct __btrfs_workqueue *wq),
TP_PROTO(const struct btrfs_workqueue *wq),
TP_ARGS(wq),
@ -1507,9 +1501,9 @@ DECLARE_EVENT_CLASS(btrfs__workqueue_done,
TP_printk_btrfs("wq=%p", __entry->wq)
);
DEFINE_EVENT(btrfs__workqueue_done, btrfs_workqueue_destroy,
DEFINE_EVENT(btrfs_workqueue_done, btrfs_workqueue_destroy,
TP_PROTO(const struct __btrfs_workqueue *wq),
TP_PROTO(const struct btrfs_workqueue *wq),
TP_ARGS(wq)
);

View File

@ -880,19 +880,6 @@ struct btrfs_dev_replace_item {
#define BTRFS_BLOCK_GROUP_RESERVED (BTRFS_AVAIL_ALLOC_BIT_SINGLE | \
BTRFS_SPACE_INFO_GLOBAL_RSV)
enum btrfs_raid_types {
BTRFS_RAID_RAID10,
BTRFS_RAID_RAID1,
BTRFS_RAID_DUP,
BTRFS_RAID_RAID0,
BTRFS_RAID_SINGLE,
BTRFS_RAID_RAID5,
BTRFS_RAID_RAID6,
BTRFS_RAID_RAID1C3,
BTRFS_RAID_RAID1C4,
BTRFS_NR_RAID_TYPES
};
#define BTRFS_BLOCK_GROUP_TYPE_MASK (BTRFS_BLOCK_GROUP_DATA | \
BTRFS_BLOCK_GROUP_SYSTEM | \
BTRFS_BLOCK_GROUP_METADATA)