for-6.2-tag

-----BEGIN PGP SIGNATURE-----
 
 iQIzBAABCgAdFiEE8rQSAMVO+zA4DBdWxWXV+ddtWDsFAmOSLtIACgkQxWXV+ddt
 WDvpQA//dQ3Wosz5puFNiZvoSUn/BnYJueZHjwF0bWY8OYINkF1PvDenu/WotyFz
 Ozf4Yl4Afxncz+FjDnOtlpr6KsSU5NqdGM3NrY0eNsxd2t1KrTsN0LgkA4m24p8b
 YsYp7pygbMm7c+h0X4uFpebY4lABkEPCBXnI//ktsls0xG5sOvGfZA3rdUP0bou2
 JTn6hk+s0cLTNoTiOCGNHRJbeTzHLR0viZj/E4LCJfCeJvAmOLZamUjqe9sBNYAg
 YtsrZTpUIL3JgmRi5B6jG4fHSXOnE14mKmRIR3xPME6J6eoYyNOeuSh1oNmJEuoE
 B7nD5We+x5+isjXNw/V5CQrs7FF09UbdpbNb9NF5CYQWv40OCeefuai1opGtBUxX
 dvbfmf1blYpWW/wfFOKQwMOsl8kZIZYx68FW2OBUNglB6yRpX/3QgFSGb8kPCr83
 DW2ttqwkpSNPMKk92I/owIc4BRvZ+LMR/PimEHB/Sa2apZA2/L+7RGwoaaei1QNX
 1tJxHWeJFLDZ+YRxjO1eKqhWdGQPn1kkq8LoXLi3tGaNF4kYQfhWOSM3WRowvx1q
 f99XRgA8JQnqZS83zqRIspWlpFK0CFdvzG1Zlqx+eoxERfeaMNA2fHxv1YCyFV4+
 TiXgsnCo+PIBwlvL/HjUWZgYE9+AD+NN5vyoE2UDYff4AgBFTE8=
 =Nqg9
 -----END PGP SIGNATURE-----

Merge tag 'for-6.2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux

Pull btrfs updates from David Sterba:
 "This round there are a lot of cleanups and moved code so the diffstat
  looks huge, otherwise there are some nice performance improvements and
  an update to raid56 reliability.

  User visible features:

   - raid56 reliability vs performance trade off:
      - fix destructive RMW for raid5 data (raid6 still needs work): do
        full checksum verification for all data during RMW cycle, this
        should prevent rewriting potentially corrupted data without
        notice
      - stripes are cached in memory which should reduce the performance
        impact but still can hurt some workloads
      - checksums are verified after repair again
      - this is the last option without introducing additional features
        (write intent bitmap, journal, another tree), the extra checksum
        read/verification was supposed to be avoided by the original
        implementation exactly for performance reasons but that caused
        all the reliability problems

   - discard=async by default for devices that support it

   - implement emergency flush reserve to avoid almost all unnecessary
     transaction aborts due to ENOSPC in cases where there are too many
     delayed refs or delayed allocation

   - skip block group synchronization if there's no change in used
     bytes, can reduce transaction commit count for some workloads

  Performance improvements:

   - fiemap and lseek:
      - overall speedup due to skipping unnecessary or duplicate
        searches (-40% run time)
      - cache some data structures and sharedness of extents (-30% run
        time)

   - send:
      - faster backref resolution when finding clones
      - cached leaf to root mapping for faster backref walking
      - improved clone/sharing detection
      - overall run time improvements (-70%)

  Core:

   - module initialization converted to a table of function pointers run
     in a sequence

   - preparation for fscrypt, extend passing file names across calls,
     dir item can store encryption status

   - raid56 updates:
      - more accurate error tracking of sectors within stripe
      - simplify recovery path and remove dedicated endio worker kthread
      - simplify scrub call paths
      - refactoring to support the extra data checksum verification
        during RMW cycle

   - tree block parentness checks consolidated and done at metadata read
     time

   - improved error handling

   - cleanups:
      - move a lot of code for better synchronization between kernel and
        user space sources, split big files
      - enum cleanups
      - GFP flag cleanups
      - header file cleanups, prototypes, dependencies
      - redundant parameter cleanups
      - inline extent handling simplifications
      - inode parameter conversion
      - data structure cleanups, reductions, renames, merges"

* tag 'for-6.2-tag' of git://git.kernel.org/pub/scm/linux/kernel/git/kdave/linux: (249 commits)
  btrfs: print transaction aborted messages with an error level
  btrfs: sync some cleanups from progs into uapi/btrfs.h
  btrfs: do not BUG_ON() on ENOMEM when dropping extent items for a range
  btrfs: fix extent map use-after-free when handling missing device in read_one_chunk
  btrfs: remove outdated logic from overwrite_item() and add assertion
  btrfs: unify overwrite_item() and do_overwrite_item()
  btrfs: replace strncpy() with strscpy()
  btrfs: fix uninitialized variable in find_first_clear_extent_bit
  btrfs: fix uninitialized parent in insert_state
  btrfs: add might_sleep() annotations
  btrfs: add stack helpers for a few btrfs items
  btrfs: add nr_global_roots to the super block definition
  btrfs: remove BTRFS_LEAF_DATA_OFFSET
  btrfs: add helpers for manipulating leaf items and data
  btrfs: add eb to btrfs_node_key_ptr_offset
  btrfs: pass the extent buffer for the btrfs_item_nr helpers
  btrfs: move the csum helpers into ctree.h
  btrfs: move eb offset helpers into extent_io.h
  btrfs: move file_extent_item helpers into file-item.h
  btrfs: move leaf_data_end into ctree.c
  ...
This commit is contained in:
Linus Torvalds 2022-12-12 20:47:51 -08:00
commit 149c51f876
121 changed files with 11247 additions and 9489 deletions

View File

@ -23,15 +23,15 @@ obj-$(CONFIG_BTRFS_FS) := btrfs.o
btrfs-y += super.o ctree.o extent-tree.o print-tree.o root-tree.o dir-item.o \
file-item.o inode-item.o disk-io.o \
transaction.o inode.o file.o tree-defrag.o \
extent_map.o sysfs.o struct-funcs.o xattr.o ordered-data.o \
transaction.o inode.o file.o defrag.o \
extent_map.o sysfs.o accessors.o xattr.o ordered-data.o \
extent_io.o volumes.o async-thread.o ioctl.o locking.o orphan.o \
export.o tree-log.o free-space-cache.o zlib.o lzo.o zstd.o \
compression.o delayed-ref.o relocation.o delayed-inode.o scrub.o \
backref.o ulist.o qgroup.o send.o dev-replace.o raid56.o \
uuid-tree.o props.o free-space-tree.o tree-checker.o space-info.o \
block-rsv.o delalloc-space.o block-group.o discard.o reflink.o \
subpage.o tree-mod-log.o extent-io-tree.o
subpage.o tree-mod-log.o extent-io-tree.o fs.o messages.o bio.o
btrfs-$(CONFIG_BTRFS_FS_POSIX_ACL) += acl.o
btrfs-$(CONFIG_BTRFS_FS_CHECK_INTEGRITY) += check-integrity.o

View File

@ -4,8 +4,9 @@
*/
#include <asm/unaligned.h>
#include "messages.h"
#include "ctree.h"
#include "accessors.h"
static bool check_setget_bounds(const struct extent_buffer *eb,
const void *ptr, unsigned off, int size)
@ -23,6 +24,13 @@ static bool check_setget_bounds(const struct extent_buffer *eb,
return true;
}
void btrfs_init_map_token(struct btrfs_map_token *token, struct extent_buffer *eb)
{
token->eb = eb;
token->kaddr = page_address(eb->pages[0]);
token->offset = 0;
}
/*
* Macro templates that define helpers to read/write extent buffer data of a
* given size, that are also used via ctree.h for access to item members by
@ -160,7 +168,7 @@ DEFINE_BTRFS_SETGET_BITS(64)
void btrfs_node_key(const struct extent_buffer *eb,
struct btrfs_disk_key *disk_key, int nr)
{
unsigned long ptr = btrfs_node_key_ptr_offset(nr);
unsigned long ptr = btrfs_node_key_ptr_offset(eb, nr);
read_eb_member(eb, (struct btrfs_key_ptr *)ptr,
struct btrfs_key_ptr, key, disk_key);
}

1073
fs/btrfs/accessors.h Normal file

File diff suppressed because it is too large Load Diff

View File

@ -11,10 +11,10 @@
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/slab.h>
#include "ctree.h"
#include "btrfs_inode.h"
#include "xattr.h"
#include "acl.h"
struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu)
{

27
fs/btrfs/acl.h Normal file
View File

@ -0,0 +1,27 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BTRFS_ACL_H
#define BTRFS_ACL_H
#ifdef CONFIG_BTRFS_FS_POSIX_ACL
struct posix_acl *btrfs_get_acl(struct inode *inode, int type, bool rcu);
int btrfs_set_acl(struct user_namespace *mnt_userns, struct dentry *dentry,
struct posix_acl *acl, int type);
int __btrfs_set_acl(struct btrfs_trans_handle *trans, struct inode *inode,
struct posix_acl *acl, int type);
#else
#define btrfs_get_acl NULL
#define btrfs_set_acl NULL
static inline int __btrfs_set_acl(struct btrfs_trans_handle *trans,
struct inode *inode, struct posix_acl *acl,
int type)
{
return -EOPNOTSUPP;
}
#endif
#endif

File diff suppressed because it is too large Load Diff

View File

@ -7,10 +7,128 @@
#define BTRFS_BACKREF_H
#include <linux/btrfs.h>
#include "messages.h"
#include "ulist.h"
#include "disk-io.h"
#include "extent_io.h"
/*
* Used by implementations of iterate_extent_inodes_t (see definition below) to
* signal that backref iteration can stop immediately and no error happened.
* The value must be non-negative and must not be 0, 1 (which is a common return
* value from things like btrfs_search_slot() and used internally in the backref
* walking code) and different from BACKREF_FOUND_SHARED and
* BACKREF_FOUND_NOT_SHARED
*/
#define BTRFS_ITERATE_EXTENT_INODES_STOP 5
/*
* Should return 0 if no errors happened and iteration of backrefs should
* continue. Can return BTRFS_ITERATE_EXTENT_INODES_STOP or any other non-zero
* value to immediately stop iteration and possibly signal an error back to
* the caller.
*/
typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 num_bytes,
u64 root, void *ctx);
/*
* Context and arguments for backref walking functions. Some of the fields are
* to be filled by the caller of such functions while other are filled by the
* functions themselves, as described below.
*/
struct btrfs_backref_walk_ctx {
/*
* The address of the extent for which we are doing backref walking.
* Can be either a data extent or a metadata extent.
*
* Must always be set by the top level caller.
*/
u64 bytenr;
/*
* Offset relative to the target extent. This is only used for data
* extents, and it's meaningful because we can have file extent items
* that point only to a section of a data extent ("bookend" extents),
* and we want to filter out any that don't point to a section of the
* data extent containing the given offset.
*
* Must always be set by the top level caller.
*/
u64 extent_item_pos;
/*
* If true and bytenr corresponds to a data extent, then references from
* all file extent items that point to the data extent are considered,
* @extent_item_pos is ignored.
*/
bool ignore_extent_item_pos;
/* A valid transaction handle or NULL. */
struct btrfs_trans_handle *trans;
/*
* The file system's info object, can not be NULL.
*
* Must always be set by the top level caller.
*/
struct btrfs_fs_info *fs_info;
/*
* Time sequence acquired from btrfs_get_tree_mod_seq(), in case the
* caller joined the tree mod log to get a consistent view of b+trees
* while we do backref walking, or BTRFS_SEQ_LAST.
* When using BTRFS_SEQ_LAST, delayed refs are not checked and it uses
* commit roots when searching b+trees - this is a special case for
* qgroups used during a transaction commit.
*/
u64 time_seq;
/*
* Used to collect the bytenr of metadata extents that point to the
* target extent.
*/
struct ulist *refs;
/*
* List used to collect the IDs of the roots from which the target
* extent is accessible. Can be NULL in case the caller does not care
* about collecting root IDs.
*/
struct ulist *roots;
/*
* Used by iterate_extent_inodes() and the main backref walk code
* (find_parent_nodes()). Lookup and store functions for an optional
* cache which maps the logical address (bytenr) of leaves to an array
* of root IDs.
*/
bool (*cache_lookup)(u64 leaf_bytenr, void *user_ctx,
const u64 **root_ids_ret, int *root_count_ret);
void (*cache_store)(u64 leaf_bytenr, const struct ulist *root_ids,
void *user_ctx);
/*
* If this is not NULL, then the backref walking code will call this
* for each indirect data extent reference as soon as it finds one,
* before collecting all the remaining backrefs and before resolving
* indirect backrefs. This allows for the caller to terminate backref
* walking as soon as it finds one backref that matches some specific
* criteria. The @cache_lookup and @cache_store callbacks should not
* be NULL in order to use this callback.
*/
iterate_extent_inodes_t *indirect_ref_iterator;
/*
* If this is not NULL, then the backref walking code will call this for
* each extent item it's meant to process before it actually starts
* processing it. If this returns anything other than 0, then it stops
* the backref walking code immediately.
*/
int (*check_extent_item)(u64 bytenr, const struct btrfs_extent_item *ei,
const struct extent_buffer *leaf, void *user_ctx);
/*
* If this is not NULL, then the backref walking code will call this for
* each extent data ref it finds (BTRFS_EXTENT_DATA_REF_KEY keys) before
* processing that data ref. If this callback return false, then it will
* ignore this data ref and it will never resolve the indirect data ref,
* saving time searching for leaves in a fs tree with file extent items
* matching the data ref.
*/
bool (*skip_data_ref)(u64 root, u64 ino, u64 offset, void *user_ctx);
/* Context object to pass to the callbacks defined above. */
void *user_ctx;
};
struct inode_fs_paths {
struct btrfs_path *btrfs_path;
struct btrfs_root *fs_root;
@ -23,17 +141,59 @@ struct btrfs_backref_shared_cache_entry {
bool is_shared;
};
struct btrfs_backref_shared_cache {
#define BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE 8
struct btrfs_backref_share_check_ctx {
/* Ulists used during backref walking. */
struct ulist refs;
/*
* The current leaf the caller of btrfs_is_data_extent_shared() is at.
* Typically the caller (at the moment only fiemap) tries to determine
* the sharedness of data extents point by file extent items from entire
* leaves.
*/
u64 curr_leaf_bytenr;
/*
* The previous leaf the caller was at in the previous call to
* btrfs_is_data_extent_shared(). This may be the same as the current
* leaf. On the first call it must be 0.
*/
u64 prev_leaf_bytenr;
/*
* A path from a root to a leaf that has a file extent item pointing to
* a given data extent should never exceed the maximum b+tree height.
*/
struct btrfs_backref_shared_cache_entry entries[BTRFS_MAX_LEVEL];
bool use_cache;
struct btrfs_backref_shared_cache_entry path_cache_entries[BTRFS_MAX_LEVEL];
bool use_path_cache;
/*
* Cache the sharedness result for the last few extents we have found,
* but only for extents for which we have multiple file extent items
* that point to them.
* It's very common to have several file extent items that point to the
* same extent (bytenr) but with different offsets and lengths. This
* typically happens for COW writes, partial writes into prealloc
* extents, NOCOW writes after snapshoting a root, hole punching or
* reflinking within the same file (less common perhaps).
* So keep a small cache with the lookup results for the extent pointed
* by the last few file extent items. This cache is checked, with a
* linear scan, whenever btrfs_is_data_extent_shared() is called, so
* it must be small so that it does not negatively affect performance in
* case we don't have multiple file extent items that point to the same
* data extent.
*/
struct {
u64 bytenr;
bool is_shared;
} prev_extents_cache[BTRFS_BACKREF_CTX_PREV_EXTENTS_SIZE];
/*
* The slot in the prev_extents_cache array that will be used for
* storing the sharedness result of a new data extent.
*/
int prev_extents_cache_slot;
};
typedef int (iterate_extent_inodes_t)(u64 inum, u64 offset, u64 root,
void *ctx);
struct btrfs_backref_share_check_ctx *btrfs_alloc_backref_share_check_ctx(void);
void btrfs_free_backref_share_ctx(struct btrfs_backref_share_check_ctx *ctx);
int extent_from_logical(struct btrfs_fs_info *fs_info, u64 logical,
struct btrfs_path *path, struct btrfs_key *found_key,
@ -43,11 +203,9 @@ int tree_backref_for_extent(unsigned long *ptr, struct extent_buffer *eb,
struct btrfs_key *key, struct btrfs_extent_item *ei,
u32 item_size, u64 *out_root, u8 *out_level);
int iterate_extent_inodes(struct btrfs_fs_info *fs_info,
u64 extent_item_objectid,
u64 extent_offset, int search_commit_root,
iterate_extent_inodes_t *iterate, void *ctx,
bool ignore_offset);
int iterate_extent_inodes(struct btrfs_backref_walk_ctx *ctx,
bool search_commit_root,
iterate_extent_inodes_t *iterate, void *user_ctx);
int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
struct btrfs_path *path, void *ctx,
@ -55,13 +213,8 @@ int iterate_inodes_from_logical(u64 logical, struct btrfs_fs_info *fs_info,
int paths_from_inode(u64 inum, struct inode_fs_paths *ipath);
int btrfs_find_all_leafs(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 time_seq, struct ulist **leafs,
const u64 *extent_item_pos, bool ignore_offset);
int btrfs_find_all_roots(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 time_seq, struct ulist **roots,
int btrfs_find_all_leafs(struct btrfs_backref_walk_ctx *ctx);
int btrfs_find_all_roots(struct btrfs_backref_walk_ctx *ctx,
bool skip_commit_root_sem);
char *btrfs_ref_to_path(struct btrfs_root *fs_root, struct btrfs_path *path,
u32 name_len, unsigned long name_off,
@ -77,10 +230,9 @@ int btrfs_find_one_extref(struct btrfs_root *root, u64 inode_objectid,
u64 start_off, struct btrfs_path *path,
struct btrfs_inode_extref **ret_extref,
u64 *found_off);
int btrfs_is_data_extent_shared(struct btrfs_root *root, u64 inum, u64 bytenr,
int btrfs_is_data_extent_shared(struct btrfs_inode *inode, u64 bytenr,
u64 extent_gen,
struct ulist *roots, struct ulist *tmp,
struct btrfs_backref_shared_cache *cache);
struct btrfs_backref_share_check_ctx *ctx);
int __init btrfs_prelim_ref_init(void);
void __cold btrfs_prelim_ref_exit(void);
@ -111,8 +263,7 @@ struct btrfs_backref_iter {
u32 end_ptr;
};
struct btrfs_backref_iter *btrfs_backref_iter_alloc(
struct btrfs_fs_info *fs_info, gfp_t gfp_flag);
struct btrfs_backref_iter *btrfs_backref_iter_alloc(struct btrfs_fs_info *fs_info);
static inline void btrfs_backref_iter_free(struct btrfs_backref_iter *iter)
{

381
fs/btrfs/bio.c Normal file
View File

@ -0,0 +1,381 @@
// SPDX-License-Identifier: GPL-2.0
/*
* Copyright (C) 2007 Oracle. All rights reserved.
* Copyright (C) 2022 Christoph Hellwig.
*/
#include <linux/bio.h>
#include "bio.h"
#include "ctree.h"
#include "volumes.h"
#include "raid56.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "dev-replace.h"
#include "rcu-string.h"
#include "zoned.h"
static struct bio_set btrfs_bioset;
/*
* Initialize a btrfs_bio structure. This skips the embedded bio itself as it
* is already initialized by the block layer.
*/
static inline void btrfs_bio_init(struct btrfs_bio *bbio,
btrfs_bio_end_io_t end_io, void *private)
{
memset(bbio, 0, offsetof(struct btrfs_bio, bio));
bbio->end_io = end_io;
bbio->private = private;
}
/*
* Allocate a btrfs_bio structure. The btrfs_bio is the main I/O container for
* btrfs, and is used for all I/O submitted through btrfs_submit_bio.
*
* Just like the underlying bio_alloc_bioset it will not fail as it is backed by
* a mempool.
*/
struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
btrfs_bio_end_io_t end_io, void *private)
{
struct bio *bio;
bio = bio_alloc_bioset(NULL, nr_vecs, opf, GFP_NOFS, &btrfs_bioset);
btrfs_bio_init(btrfs_bio(bio), end_io, private);
return bio;
}
struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size,
btrfs_bio_end_io_t end_io, void *private)
{
struct bio *bio;
struct btrfs_bio *bbio;
ASSERT(offset <= UINT_MAX && size <= UINT_MAX);
bio = bio_alloc_clone(orig->bi_bdev, orig, GFP_NOFS, &btrfs_bioset);
bbio = btrfs_bio(bio);
btrfs_bio_init(bbio, end_io, private);
bio_trim(bio, offset >> 9, size >> 9);
bbio->iter = bio->bi_iter;
return bio;
}
static void btrfs_log_dev_io_error(struct bio *bio, struct btrfs_device *dev)
{
if (!dev || !dev->bdev)
return;
if (bio->bi_status != BLK_STS_IOERR && bio->bi_status != BLK_STS_TARGET)
return;
if (btrfs_op(bio) == BTRFS_MAP_WRITE)
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
if (!(bio->bi_opf & REQ_RAHEAD))
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_READ_ERRS);
if (bio->bi_opf & REQ_PREFLUSH)
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_FLUSH_ERRS);
}
static struct workqueue_struct *btrfs_end_io_wq(struct btrfs_fs_info *fs_info,
struct bio *bio)
{
if (bio->bi_opf & REQ_META)
return fs_info->endio_meta_workers;
return fs_info->endio_workers;
}
static void btrfs_end_bio_work(struct work_struct *work)
{
struct btrfs_bio *bbio = container_of(work, struct btrfs_bio, end_io_work);
bbio->end_io(bbio);
}
static void btrfs_simple_end_io(struct bio *bio)
{
struct btrfs_fs_info *fs_info = bio->bi_private;
struct btrfs_bio *bbio = btrfs_bio(bio);
btrfs_bio_counter_dec(fs_info);
if (bio->bi_status)
btrfs_log_dev_io_error(bio, bbio->device);
if (bio_op(bio) == REQ_OP_READ) {
INIT_WORK(&bbio->end_io_work, btrfs_end_bio_work);
queue_work(btrfs_end_io_wq(fs_info, bio), &bbio->end_io_work);
} else {
bbio->end_io(bbio);
}
}
static void btrfs_raid56_end_io(struct bio *bio)
{
struct btrfs_io_context *bioc = bio->bi_private;
struct btrfs_bio *bbio = btrfs_bio(bio);
btrfs_bio_counter_dec(bioc->fs_info);
bbio->mirror_num = bioc->mirror_num;
bbio->end_io(bbio);
btrfs_put_bioc(bioc);
}
static void btrfs_orig_write_end_io(struct bio *bio)
{
struct btrfs_io_stripe *stripe = bio->bi_private;
struct btrfs_io_context *bioc = stripe->bioc;
struct btrfs_bio *bbio = btrfs_bio(bio);
btrfs_bio_counter_dec(bioc->fs_info);
if (bio->bi_status) {
atomic_inc(&bioc->error);
btrfs_log_dev_io_error(bio, stripe->dev);
}
/*
* Only send an error to the higher layers if it is beyond the tolerance
* threshold.
*/
if (atomic_read(&bioc->error) > bioc->max_errors)
bio->bi_status = BLK_STS_IOERR;
else
bio->bi_status = BLK_STS_OK;
bbio->end_io(bbio);
btrfs_put_bioc(bioc);
}
static void btrfs_clone_write_end_io(struct bio *bio)
{
struct btrfs_io_stripe *stripe = bio->bi_private;
if (bio->bi_status) {
atomic_inc(&stripe->bioc->error);
btrfs_log_dev_io_error(bio, stripe->dev);
}
/* Pass on control to the original bio this one was cloned from */
bio_endio(stripe->bioc->orig_bio);
bio_put(bio);
}
static void btrfs_submit_dev_bio(struct btrfs_device *dev, struct bio *bio)
{
if (!dev || !dev->bdev ||
test_bit(BTRFS_DEV_STATE_MISSING, &dev->dev_state) ||
(btrfs_op(bio) == BTRFS_MAP_WRITE &&
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state))) {
bio_io_error(bio);
return;
}
bio_set_dev(bio, dev->bdev);
/*
* For zone append writing, bi_sector must point the beginning of the
* zone
*/
if (bio_op(bio) == REQ_OP_ZONE_APPEND) {
u64 physical = bio->bi_iter.bi_sector << SECTOR_SHIFT;
if (btrfs_dev_is_sequential(dev, physical)) {
u64 zone_start = round_down(physical,
dev->fs_info->zone_size);
bio->bi_iter.bi_sector = zone_start >> SECTOR_SHIFT;
} else {
bio->bi_opf &= ~REQ_OP_ZONE_APPEND;
bio->bi_opf |= REQ_OP_WRITE;
}
}
btrfs_debug_in_rcu(dev->fs_info,
"%s: rw %d 0x%x, sector=%llu, dev=%lu (%s id %llu), size=%u",
__func__, bio_op(bio), bio->bi_opf, bio->bi_iter.bi_sector,
(unsigned long)dev->bdev->bd_dev, btrfs_dev_name(dev),
dev->devid, bio->bi_iter.bi_size);
btrfsic_check_bio(bio);
submit_bio(bio);
}
static void btrfs_submit_mirrored_bio(struct btrfs_io_context *bioc, int dev_nr)
{
struct bio *orig_bio = bioc->orig_bio, *bio;
ASSERT(bio_op(orig_bio) != REQ_OP_READ);
/* Reuse the bio embedded into the btrfs_bio for the last mirror */
if (dev_nr == bioc->num_stripes - 1) {
bio = orig_bio;
bio->bi_end_io = btrfs_orig_write_end_io;
} else {
bio = bio_alloc_clone(NULL, orig_bio, GFP_NOFS, &fs_bio_set);
bio_inc_remaining(orig_bio);
bio->bi_end_io = btrfs_clone_write_end_io;
}
bio->bi_private = &bioc->stripes[dev_nr];
bio->bi_iter.bi_sector = bioc->stripes[dev_nr].physical >> SECTOR_SHIFT;
bioc->stripes[dev_nr].bioc = bioc;
btrfs_submit_dev_bio(bioc->stripes[dev_nr].dev, bio);
}
void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio, int mirror_num)
{
u64 logical = bio->bi_iter.bi_sector << 9;
u64 length = bio->bi_iter.bi_size;
u64 map_length = length;
struct btrfs_io_context *bioc = NULL;
struct btrfs_io_stripe smap;
int ret;
btrfs_bio_counter_inc_blocked(fs_info);
ret = __btrfs_map_block(fs_info, btrfs_op(bio), logical, &map_length,
&bioc, &smap, &mirror_num, 1);
if (ret) {
btrfs_bio_counter_dec(fs_info);
btrfs_bio_end_io(btrfs_bio(bio), errno_to_blk_status(ret));
return;
}
if (map_length < length) {
btrfs_crit(fs_info,
"mapping failed logical %llu bio len %llu len %llu",
logical, length, map_length);
BUG();
}
if (!bioc) {
/* Single mirror read/write fast path */
btrfs_bio(bio)->mirror_num = mirror_num;
btrfs_bio(bio)->device = smap.dev;
bio->bi_iter.bi_sector = smap.physical >> SECTOR_SHIFT;
bio->bi_private = fs_info;
bio->bi_end_io = btrfs_simple_end_io;
btrfs_submit_dev_bio(smap.dev, bio);
} else if (bioc->map_type & BTRFS_BLOCK_GROUP_RAID56_MASK) {
/* Parity RAID write or read recovery */
bio->bi_private = bioc;
bio->bi_end_io = btrfs_raid56_end_io;
if (bio_op(bio) == REQ_OP_READ)
raid56_parity_recover(bio, bioc, mirror_num);
else
raid56_parity_write(bio, bioc);
} else {
/* Write to multiple mirrors */
int total_devs = bioc->num_stripes;
int dev_nr;
bioc->orig_bio = bio;
for (dev_nr = 0; dev_nr < total_devs; dev_nr++)
btrfs_submit_mirrored_bio(bioc, dev_nr);
}
}
/*
* Submit a repair write.
*
* This bypasses btrfs_submit_bio deliberately, as that writes all copies in a
* RAID setup. Here we only want to write the one bad copy, so we do the
* mapping ourselves and submit the bio directly.
*
* The I/O is issued sychronously to block the repair read completion from
* freeing the bio.
*/
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
u64 length, u64 logical, struct page *page,
unsigned int pg_offset, int mirror_num)
{
struct btrfs_device *dev;
struct bio_vec bvec;
struct bio bio;
u64 map_length = 0;
u64 sector;
struct btrfs_io_context *bioc = NULL;
int ret = 0;
ASSERT(!(fs_info->sb->s_flags & SB_RDONLY));
BUG_ON(!mirror_num);
if (btrfs_repair_one_zone(fs_info, logical))
return 0;
map_length = length;
/*
* Avoid races with device replace and make sure our bioc has devices
* associated to its stripes that don't go away while we are doing the
* read repair operation.
*/
btrfs_bio_counter_inc_blocked(fs_info);
if (btrfs_is_parity_mirror(fs_info, logical, length)) {
/*
* Note that we don't use BTRFS_MAP_WRITE because it's supposed
* to update all raid stripes, but here we just want to correct
* bad stripe, thus BTRFS_MAP_READ is abused to only get the bad
* stripe's dev and sector.
*/
ret = btrfs_map_block(fs_info, BTRFS_MAP_READ, logical,
&map_length, &bioc, 0);
if (ret)
goto out_counter_dec;
ASSERT(bioc->mirror_num == 1);
} else {
ret = btrfs_map_block(fs_info, BTRFS_MAP_WRITE, logical,
&map_length, &bioc, mirror_num);
if (ret)
goto out_counter_dec;
BUG_ON(mirror_num != bioc->mirror_num);
}
sector = bioc->stripes[bioc->mirror_num - 1].physical >> 9;
dev = bioc->stripes[bioc->mirror_num - 1].dev;
btrfs_put_bioc(bioc);
if (!dev || !dev->bdev ||
!test_bit(BTRFS_DEV_STATE_WRITEABLE, &dev->dev_state)) {
ret = -EIO;
goto out_counter_dec;
}
bio_init(&bio, dev->bdev, &bvec, 1, REQ_OP_WRITE | REQ_SYNC);
bio.bi_iter.bi_sector = sector;
__bio_add_page(&bio, page, length, pg_offset);
btrfsic_check_bio(&bio);
ret = submit_bio_wait(&bio);
if (ret) {
/* try to remap that extent elsewhere? */
btrfs_dev_stat_inc_and_print(dev, BTRFS_DEV_STAT_WRITE_ERRS);
goto out_bio_uninit;
}
btrfs_info_rl_in_rcu(fs_info,
"read error corrected: ino %llu off %llu (dev %s sector %llu)",
ino, start, btrfs_dev_name(dev), sector);
ret = 0;
out_bio_uninit:
bio_uninit(&bio);
out_counter_dec:
btrfs_bio_counter_dec(fs_info);
return ret;
}
int __init btrfs_bioset_init(void)
{
if (bioset_init(&btrfs_bioset, BIO_POOL_SIZE,
offsetof(struct btrfs_bio, bio),
BIOSET_NEED_BVECS))
return -ENOMEM;
return 0;
}
void __cold btrfs_bioset_exit(void)
{
bioset_exit(&btrfs_bioset);
}

127
fs/btrfs/bio.h Normal file
View File

@ -0,0 +1,127 @@
/* SPDX-License-Identifier: GPL-2.0 */
/*
* Copyright (C) 2007 Oracle. All rights reserved.
* Copyright (C) 2022 Christoph Hellwig.
*/
#ifndef BTRFS_BIO_H
#define BTRFS_BIO_H
#include <linux/bio.h>
#include <linux/workqueue.h>
#include "tree-checker.h"
struct btrfs_bio;
struct btrfs_fs_info;
#define BTRFS_BIO_INLINE_CSUM_SIZE 64
/*
* Maximum number of sectors for a single bio to limit the size of the
* checksum array. This matches the number of bio_vecs per bio and thus the
* I/O size for buffered I/O.
*/
#define BTRFS_MAX_BIO_SECTORS (256)
typedef void (*btrfs_bio_end_io_t)(struct btrfs_bio *bbio);
/*
* Additional info to pass along bio.
*
* Mostly for btrfs specific features like csum and mirror_num.
*/
struct btrfs_bio {
unsigned int mirror_num:7;
/*
* Extra indicator for metadata bios.
* For some btrfs bios they use pages without a mapping, thus
* we can not rely on page->mapping->host to determine if
* it's a metadata bio.
*/
unsigned int is_metadata:1;
struct bvec_iter iter;
/* for direct I/O */
u64 file_offset;
/* @device is for stripe IO submission. */
struct btrfs_device *device;
union {
/* For data checksum verification. */
struct {
u8 *csum;
u8 csum_inline[BTRFS_BIO_INLINE_CSUM_SIZE];
};
/* For metadata parentness verification. */
struct btrfs_tree_parent_check parent_check;
};
/* End I/O information supplied to btrfs_bio_alloc */
btrfs_bio_end_io_t end_io;
void *private;
/* For read end I/O handling */
struct work_struct end_io_work;
/*
* This member must come last, bio_alloc_bioset will allocate enough
* bytes for entire btrfs_bio but relies on bio being last.
*/
struct bio bio;
};
static inline struct btrfs_bio *btrfs_bio(struct bio *bio)
{
return container_of(bio, struct btrfs_bio, bio);
}
int __init btrfs_bioset_init(void);
void __cold btrfs_bioset_exit(void);
struct bio *btrfs_bio_alloc(unsigned int nr_vecs, blk_opf_t opf,
btrfs_bio_end_io_t end_io, void *private);
struct bio *btrfs_bio_clone_partial(struct bio *orig, u64 offset, u64 size,
btrfs_bio_end_io_t end_io, void *private);
static inline void btrfs_bio_end_io(struct btrfs_bio *bbio, blk_status_t status)
{
bbio->bio.bi_status = status;
bbio->end_io(bbio);
}
static inline void btrfs_bio_free_csum(struct btrfs_bio *bbio)
{
if (bbio->is_metadata)
return;
if (bbio->csum != bbio->csum_inline) {
kfree(bbio->csum);
bbio->csum = NULL;
}
}
/*
* Iterate through a btrfs_bio (@bbio) on a per-sector basis.
*
* bvl - struct bio_vec
* bbio - struct btrfs_bio
* iters - struct bvec_iter
* bio_offset - unsigned int
*/
#define btrfs_bio_for_each_sector(fs_info, bvl, bbio, iter, bio_offset) \
for ((iter) = (bbio)->iter, (bio_offset) = 0; \
(iter).bi_size && \
(((bvl) = bio_iter_iovec((&(bbio)->bio), (iter))), 1); \
(bio_offset) += fs_info->sectorsize, \
bio_advance_iter_single(&(bbio)->bio, &(iter), \
(fs_info)->sectorsize))
void btrfs_submit_bio(struct btrfs_fs_info *fs_info, struct bio *bio,
int mirror_num);
int btrfs_repair_io_failure(struct btrfs_fs_info *fs_info, u64 ino, u64 start,
u64 length, u64 logical, struct page *page,
unsigned int pg_offset, int mirror_num);
#endif

View File

@ -17,6 +17,21 @@
#include "discard.h"
#include "raid56.h"
#include "zoned.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#ifdef CONFIG_BTRFS_DEBUG
int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) &&
block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
(btrfs_test_opt(fs_info, FRAGMENT_DATA) &&
block_group->flags & BTRFS_BLOCK_GROUP_DATA);
}
#endif
/*
* Return target flags in extended format or 0 if restripe for this chunk_type
@ -284,7 +299,7 @@ struct btrfs_block_group *btrfs_next_block_group(
return cache;
}
/**
/*
* Check if we can do a NOCOW write for a given extent.
*
* @fs_info: The filesystem information object.
@ -325,11 +340,9 @@ struct btrfs_block_group *btrfs_inc_nocow_writers(struct btrfs_fs_info *fs_info,
return bg;
}
/**
/*
* Decrement the number of NOCOW writers in a block group.
*
* @bg: The block group.
*
* This is meant to be called after a previous call to btrfs_inc_nocow_writers(),
* and on the block group returned by that call. Typically this is called after
* creating an ordered extent for a NOCOW write, to prevent races with scrub and
@ -1527,6 +1540,30 @@ static inline bool btrfs_should_reclaim(struct btrfs_fs_info *fs_info)
return true;
}
static bool should_reclaim_block_group(struct btrfs_block_group *bg, u64 bytes_freed)
{
const struct btrfs_space_info *space_info = bg->space_info;
const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
const u64 new_val = bg->used;
const u64 old_val = new_val + bytes_freed;
u64 thresh;
if (reclaim_thresh == 0)
return false;
thresh = mult_perc(bg->length, reclaim_thresh);
/*
* If we were below the threshold before don't reclaim, we are likely a
* brand new block group and we don't want to relocate new block groups.
*/
if (old_val < thresh)
return false;
if (new_val >= thresh)
return false;
return true;
}
void btrfs_reclaim_bgs_work(struct work_struct *work)
{
struct btrfs_fs_info *fs_info =
@ -1594,6 +1631,40 @@ void btrfs_reclaim_bgs_work(struct work_struct *work)
up_write(&space_info->groups_sem);
goto next;
}
if (bg->used == 0) {
/*
* It is possible that we trigger relocation on a block
* group as its extents are deleted and it first goes
* below the threshold, then shortly after goes empty.
*
* In this case, relocating it does delete it, but has
* some overhead in relocation specific metadata, looking
* for the non-existent extents and running some extra
* transactions, which we can avoid by using one of the
* other mechanisms for dealing with empty block groups.
*/
if (!btrfs_test_opt(fs_info, DISCARD_ASYNC))
btrfs_mark_bg_unused(bg);
spin_unlock(&bg->lock);
up_write(&space_info->groups_sem);
goto next;
}
/*
* The block group might no longer meet the reclaim condition by
* the time we get around to reclaiming it, so to avoid
* reclaiming overly full block_groups, skip reclaiming them.
*
* Since the decision making process also depends on the amount
* being freed, pass in a fake giant value to skip that extra
* check, which is more meaningful when adding to the list in
* the first place.
*/
if (!should_reclaim_block_group(bg, bg->length)) {
spin_unlock(&bg->lock);
up_write(&space_info->groups_sem);
goto next;
}
spin_unlock(&bg->lock);
/* Get out fast, in case we're unmounting the filesystem */
@ -1740,8 +1811,8 @@ static void set_avail_alloc_bits(struct btrfs_fs_info *fs_info, u64 flags)
write_sequnlock(&fs_info->profiles_lock);
}
/**
* Map a physical disk address to a list of logical addresses
/*
* Map a physical disk address to a list of logical addresses.
*
* @fs_info: the filesystem
* @chunk_start: logical address of block group
@ -2001,6 +2072,7 @@ static int read_one_block_group(struct btrfs_fs_info *info,
cache->length = key->offset;
cache->used = btrfs_stack_block_group_used(bgi);
cache->commit_used = cache->used;
cache->flags = btrfs_stack_block_group_flags(bgi);
cache->global_root_id = btrfs_stack_block_group_chunk_objectid(bgi);
@ -2481,7 +2553,7 @@ struct btrfs_block_group *btrfs_make_block_group(struct btrfs_trans_handle *tran
cache->global_root_id = calculate_global_root_id(fs_info, cache->start);
if (btrfs_fs_compat_ro(fs_info, FREE_SPACE_TREE))
cache->needs_free_space = 1;
set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags);
ret = btrfs_load_block_group_zone_info(cache, true);
if (ret) {
@ -2692,6 +2764,25 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
struct extent_buffer *leaf;
struct btrfs_block_group_item bgi;
struct btrfs_key key;
u64 old_commit_used;
u64 used;
/*
* Block group items update can be triggered out of commit transaction
* critical section, thus we need a consistent view of used bytes.
* We cannot use cache->used directly outside of the spin lock, as it
* may be changed.
*/
spin_lock(&cache->lock);
old_commit_used = cache->commit_used;
used = cache->used;
/* No change in used bytes, can safely skip it. */
if (cache->commit_used == used) {
spin_unlock(&cache->lock);
return 0;
}
cache->commit_used = used;
spin_unlock(&cache->lock);
key.objectid = cache->start;
key.type = BTRFS_BLOCK_GROUP_ITEM_KEY;
@ -2706,7 +2797,7 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
bi = btrfs_item_ptr_offset(leaf, path->slots[0]);
btrfs_set_stack_block_group_used(&bgi, cache->used);
btrfs_set_stack_block_group_used(&bgi, used);
btrfs_set_stack_block_group_chunk_objectid(&bgi,
cache->global_root_id);
btrfs_set_stack_block_group_flags(&bgi, cache->flags);
@ -2714,6 +2805,12 @@ static int update_block_group_item(struct btrfs_trans_handle *trans,
btrfs_mark_buffer_dirty(leaf);
fail:
btrfs_release_path(path);
/* We didn't update the block group item, need to revert @commit_used. */
if (ret < 0) {
spin_lock(&cache->lock);
cache->commit_used = old_commit_used;
spin_unlock(&cache->lock);
}
return ret;
}
@ -3211,31 +3308,6 @@ int btrfs_write_dirty_block_groups(struct btrfs_trans_handle *trans)
return ret;
}
static inline bool should_reclaim_block_group(struct btrfs_block_group *bg,
u64 bytes_freed)
{
const struct btrfs_space_info *space_info = bg->space_info;
const int reclaim_thresh = READ_ONCE(space_info->bg_reclaim_threshold);
const u64 new_val = bg->used;
const u64 old_val = new_val + bytes_freed;
u64 thresh;
if (reclaim_thresh == 0)
return false;
thresh = div_factor_fine(bg->length, reclaim_thresh);
/*
* If we were below the threshold before don't reclaim, we are likely a
* brand new block group and we don't want to relocate new block groups.
*/
if (old_val < thresh)
return false;
if (new_val >= thresh)
return false;
return true;
}
int btrfs_update_block_group(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes, bool alloc)
{
@ -3347,8 +3419,9 @@ int btrfs_update_block_group(struct btrfs_trans_handle *trans,
return ret;
}
/**
* btrfs_add_reserved_bytes - update the block_group and space info counters
/*
* Update the block_group and space info counters.
*
* @cache: The cache we are manipulating
* @ram_bytes: The number of bytes of file content, and will be same to
* @num_bytes except for the compress path.
@ -3391,8 +3464,9 @@ int btrfs_add_reserved_bytes(struct btrfs_block_group *cache,
return ret;
}
/**
* btrfs_free_reserved_bytes - update the block_group and space info counters
/*
* Update the block_group and space info counters.
*
* @cache: The cache we are manipulating
* @num_bytes: The number of bytes in question
* @delalloc: The blocks are allocated for the delalloc write
@ -3449,13 +3523,13 @@ static int should_alloc_chunk(struct btrfs_fs_info *fs_info,
*/
if (force == CHUNK_ALLOC_LIMITED) {
thresh = btrfs_super_total_bytes(fs_info->super_copy);
thresh = max_t(u64, SZ_64M, div_factor_fine(thresh, 1));
thresh = max_t(u64, SZ_64M, mult_perc(thresh, 1));
if (sinfo->total_bytes - bytes_used < thresh)
return 1;
}
if (bytes_used + SZ_2M < div_factor(sinfo->total_bytes, 8))
if (bytes_used + SZ_2M < mult_perc(sinfo->total_bytes, 80))
return 0;
return 1;
}

View File

@ -55,6 +55,10 @@ enum btrfs_block_group_flags {
BLOCK_GROUP_FLAG_CHUNK_ITEM_INSERTED,
BLOCK_GROUP_FLAG_ZONE_IS_ACTIVE,
BLOCK_GROUP_FLAG_ZONED_DATA_RELOC,
/* Does the block group need to be added to the free space tree? */
BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE,
/* Indicate that the block group is placed on a sequential zone */
BLOCK_GROUP_FLAG_SEQUENTIAL_ZONE,
};
enum btrfs_caching_type {
@ -99,6 +103,12 @@ struct btrfs_block_group {
u64 cache_generation;
u64 global_root_id;
/*
* The last committed used bytes of this block group, if the above @used
* is still the same as @commit_used, we don't need to update block
* group item of this block group.
*/
u64 commit_used;
/*
* If the free space extent count exceeds this number, convert the block
* group to bitmaps.
@ -202,15 +212,6 @@ struct btrfs_block_group {
/* Lock for free space tree operations. */
struct mutex free_space_lock;
/*
* Does the block group need to be added to the free space tree?
* Protected by free_space_lock.
*/
int needs_free_space;
/* Flag indicating this block group is placed on a sequential zone */
bool seq_zone;
/*
* Number of extents in this block group used for swap files.
* All accesses protected by the spinlock 'lock'.
@ -251,16 +252,7 @@ static inline bool btrfs_is_block_group_data_only(
}
#ifdef CONFIG_BTRFS_DEBUG
static inline int btrfs_should_fragment_free_space(
struct btrfs_block_group *block_group)
{
struct btrfs_fs_info *fs_info = block_group->fs_info;
return (btrfs_test_opt(fs_info, FRAGMENT_METADATA) &&
block_group->flags & BTRFS_BLOCK_GROUP_METADATA) ||
(btrfs_test_opt(fs_info, FRAGMENT_DATA) &&
block_group->flags & BTRFS_BLOCK_GROUP_DATA);
}
int btrfs_should_fragment_free_space(struct btrfs_block_group *block_group);
#endif
struct btrfs_block_group *btrfs_lookup_first_block_group(

View File

@ -7,6 +7,8 @@
#include "transaction.h"
#include "block-group.h"
#include "disk-io.h"
#include "fs.h"
#include "accessors.h"
/*
* HOW DO BLOCK RESERVES WORK
@ -225,7 +227,7 @@ int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
return ret;
}
int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent)
{
u64 num_bytes = 0;
int ret = -ENOSPC;
@ -234,7 +236,7 @@ int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor)
return 0;
spin_lock(&block_rsv->lock);
num_bytes = div_factor(block_rsv->size, min_factor);
num_bytes = mult_perc(block_rsv->size, min_percent);
if (block_rsv->reserved >= num_bytes)
ret = 0;
spin_unlock(&block_rsv->lock);
@ -323,31 +325,6 @@ void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
spin_unlock(&block_rsv->lock);
}
int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *dest, u64 num_bytes,
int min_factor)
{
struct btrfs_block_rsv *global_rsv = &fs_info->global_block_rsv;
u64 min_bytes;
if (global_rsv->space_info != dest->space_info)
return -ENOSPC;
spin_lock(&global_rsv->lock);
min_bytes = div_factor(global_rsv->size, min_factor);
if (global_rsv->reserved < min_bytes + num_bytes) {
spin_unlock(&global_rsv->lock);
return -ENOSPC;
}
global_rsv->reserved -= num_bytes;
if (global_rsv->reserved < global_rsv->size)
global_rsv->full = false;
spin_unlock(&global_rsv->lock);
btrfs_block_rsv_add_bytes(dest, num_bytes, true);
return 0;
}
void btrfs_update_global_block_rsv(struct btrfs_fs_info *fs_info)
{
struct btrfs_block_rsv *block_rsv = &fs_info->global_block_rsv;
@ -552,5 +529,17 @@ try_reserve:
if (!ret)
return global_rsv;
}
/*
* All hope is lost, but of course our reservations are overly
* pessimistic, so instead of possibly having an ENOSPC abort here, try
* one last time to force a reservation if there's enough actual space
* on disk to make the reservation.
*/
ret = btrfs_reserve_metadata_bytes(fs_info, block_rsv, blocksize,
BTRFS_RESERVE_FLUSH_EMERGENCY);
if (!ret)
return block_rsv;
return ERR_PTR(ret);
}

View File

@ -4,6 +4,7 @@
#define BTRFS_BLOCK_RSV_H
struct btrfs_trans_handle;
struct btrfs_root;
enum btrfs_reserve_flush_enum;
/*
@ -62,7 +63,7 @@ void btrfs_free_block_rsv(struct btrfs_fs_info *fs_info,
int btrfs_block_rsv_add(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv, u64 num_bytes,
enum btrfs_reserve_flush_enum flush);
int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_factor);
int btrfs_block_rsv_check(struct btrfs_block_rsv *block_rsv, int min_percent);
int btrfs_block_rsv_refill(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *block_rsv, u64 min_reserved,
enum btrfs_reserve_flush_enum flush);
@ -70,9 +71,6 @@ int btrfs_block_rsv_migrate(struct btrfs_block_rsv *src_rsv,
struct btrfs_block_rsv *dst_rsv, u64 num_bytes,
bool update_size);
int btrfs_block_rsv_use_bytes(struct btrfs_block_rsv *block_rsv, u64 num_bytes);
int btrfs_cond_migrate_bytes(struct btrfs_fs_info *fs_info,
struct btrfs_block_rsv *dest, u64 num_bytes,
int min_factor);
void btrfs_block_rsv_add_bytes(struct btrfs_block_rsv *block_rsv,
u64 num_bytes, bool update_size);
u64 btrfs_block_rsv_release(struct btrfs_fs_info *fs_info,

View File

@ -411,29 +411,142 @@ static inline void btrfs_inode_split_flags(u64 inode_item_flags,
#define CSUM_FMT "0x%*phN"
#define CSUM_FMT_VALUE(size, bytes) size, bytes
static inline void btrfs_print_data_csum_error(struct btrfs_inode *inode,
u64 logical_start, u8 *csum, u8 *csum_expected, int mirror_num)
{
struct btrfs_root *root = inode->root;
const u32 csum_size = root->fs_info->csum_size;
void btrfs_submit_data_write_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num);
void btrfs_submit_data_read_bio(struct btrfs_inode *inode, struct bio *bio,
int mirror_num, enum btrfs_compression_type compress_type);
void btrfs_submit_dio_repair_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num);
blk_status_t btrfs_submit_bio_start(struct btrfs_inode *inode, struct bio *bio);
blk_status_t btrfs_submit_bio_start_direct_io(struct btrfs_inode *inode,
struct bio *bio,
u64 dio_file_offset);
int btrfs_check_sector_csum(struct btrfs_fs_info *fs_info, struct page *page,
u32 pgoff, u8 *csum, const u8 * const csum_expected);
int btrfs_check_data_csum(struct btrfs_inode *inode, struct btrfs_bio *bbio,
u32 bio_offset, struct page *page, u32 pgoff);
unsigned int btrfs_verify_data_csum(struct btrfs_bio *bbio,
u32 bio_offset, struct page *page,
u64 start, u64 end);
noinline int can_nocow_extent(struct inode *inode, u64 offset, u64 *len,
u64 *orig_start, u64 *orig_block_len,
u64 *ram_bytes, bool nowait, bool strict);
/* Output minus objectid, which is more meaningful */
if (root->root_key.objectid >= BTRFS_LAST_FREE_OBJECTID)
btrfs_warn_rl(root->fs_info,
"csum failed root %lld ino %lld off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
root->root_key.objectid, btrfs_ino(inode),
logical_start,
CSUM_FMT_VALUE(csum_size, csum),
CSUM_FMT_VALUE(csum_size, csum_expected),
mirror_num);
else
btrfs_warn_rl(root->fs_info,
"csum failed root %llu ino %llu off %llu csum " CSUM_FMT " expected csum " CSUM_FMT " mirror %d",
root->root_key.objectid, btrfs_ino(inode),
logical_start,
CSUM_FMT_VALUE(csum_size, csum),
CSUM_FMT_VALUE(csum_size, csum_expected),
mirror_num);
}
void __btrfs_del_delalloc_inode(struct btrfs_root *root, struct btrfs_inode *inode);
struct inode *btrfs_lookup_dentry(struct inode *dir, struct dentry *dentry);
int btrfs_set_inode_index(struct btrfs_inode *dir, u64 *index);
int btrfs_unlink_inode(struct btrfs_trans_handle *trans,
struct btrfs_inode *dir, struct btrfs_inode *inode,
const struct fscrypt_str *name);
int btrfs_add_link(struct btrfs_trans_handle *trans,
struct btrfs_inode *parent_inode, struct btrfs_inode *inode,
const struct fscrypt_str *name, int add_backref, u64 index);
int btrfs_delete_subvolume(struct btrfs_inode *dir, struct dentry *dentry);
int btrfs_truncate_block(struct btrfs_inode *inode, loff_t from, loff_t len,
int front);
int btrfs_start_delalloc_snapshot(struct btrfs_root *root, bool in_reclaim_context);
int btrfs_start_delalloc_roots(struct btrfs_fs_info *fs_info, long nr,
bool in_reclaim_context);
int btrfs_set_extent_delalloc(struct btrfs_inode *inode, u64 start, u64 end,
unsigned int extra_bits,
struct extent_state **cached_state);
struct btrfs_new_inode_args {
/* Input */
struct inode *dir;
struct dentry *dentry;
struct inode *inode;
bool orphan;
bool subvol;
/* Output from btrfs_new_inode_prepare(), input to btrfs_create_new_inode(). */
struct posix_acl *default_acl;
struct posix_acl *acl;
struct fscrypt_name fname;
};
int btrfs_new_inode_prepare(struct btrfs_new_inode_args *args,
unsigned int *trans_num_items);
int btrfs_create_new_inode(struct btrfs_trans_handle *trans,
struct btrfs_new_inode_args *args);
void btrfs_new_inode_args_destroy(struct btrfs_new_inode_args *args);
struct inode *btrfs_new_subvol_inode(struct user_namespace *mnt_userns,
struct inode *dir);
void btrfs_set_delalloc_extent(struct btrfs_inode *inode, struct extent_state *state,
u32 bits);
void btrfs_clear_delalloc_extent(struct btrfs_inode *inode,
struct extent_state *state, u32 bits);
void btrfs_merge_delalloc_extent(struct btrfs_inode *inode, struct extent_state *new,
struct extent_state *other);
void btrfs_split_delalloc_extent(struct btrfs_inode *inode,
struct extent_state *orig, u64 split);
void btrfs_set_range_writeback(struct btrfs_inode *inode, u64 start, u64 end);
vm_fault_t btrfs_page_mkwrite(struct vm_fault *vmf);
void btrfs_evict_inode(struct inode *inode);
struct inode *btrfs_alloc_inode(struct super_block *sb);
void btrfs_destroy_inode(struct inode *inode);
void btrfs_free_inode(struct inode *inode);
int btrfs_drop_inode(struct inode *inode);
int __init btrfs_init_cachep(void);
void __cold btrfs_destroy_cachep(void);
struct inode *btrfs_iget_path(struct super_block *s, u64 ino,
struct btrfs_root *root, struct btrfs_path *path);
struct inode *btrfs_iget(struct super_block *s, u64 ino, struct btrfs_root *root);
struct extent_map *btrfs_get_extent(struct btrfs_inode *inode,
struct page *page, size_t pg_offset,
u64 start, u64 end);
int btrfs_update_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode);
int btrfs_update_inode_fallback(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode);
int btrfs_orphan_add(struct btrfs_trans_handle *trans, struct btrfs_inode *inode);
int btrfs_orphan_cleanup(struct btrfs_root *root);
int btrfs_cont_expand(struct btrfs_inode *inode, loff_t oldsize, loff_t size);
void btrfs_add_delayed_iput(struct btrfs_inode *inode);
void btrfs_run_delayed_iputs(struct btrfs_fs_info *fs_info);
int btrfs_wait_on_delayed_iputs(struct btrfs_fs_info *fs_info);
int btrfs_prealloc_file_range(struct inode *inode, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint);
int btrfs_prealloc_file_range_trans(struct inode *inode,
struct btrfs_trans_handle *trans, int mode,
u64 start, u64 num_bytes, u64 min_size,
loff_t actual_len, u64 *alloc_hint);
int btrfs_run_delalloc_range(struct btrfs_inode *inode, struct page *locked_page,
u64 start, u64 end, int *page_started,
unsigned long *nr_written, struct writeback_control *wbc);
int btrfs_writepage_cow_fixup(struct page *page);
void btrfs_writepage_endio_finish_ordered(struct btrfs_inode *inode,
struct page *page, u64 start,
u64 end, bool uptodate);
int btrfs_encoded_io_compression_from_extent(struct btrfs_fs_info *fs_info,
int compress_type);
int btrfs_encoded_read_regular_fill_pages(struct btrfs_inode *inode,
u64 file_offset, u64 disk_bytenr,
u64 disk_io_size,
struct page **pages);
ssize_t btrfs_encoded_read(struct kiocb *iocb, struct iov_iter *iter,
struct btrfs_ioctl_encoded_io_args *encoded);
ssize_t btrfs_do_encoded_write(struct kiocb *iocb, struct iov_iter *from,
const struct btrfs_ioctl_encoded_io_args *encoded);
ssize_t btrfs_dio_read(struct kiocb *iocb, struct iov_iter *iter,
size_t done_before);
struct iomap_dio *btrfs_dio_write(struct kiocb *iocb, struct iov_iter *iter,
size_t done_before);
extern const struct dentry_operations btrfs_dentry_operations;
/* Inode locking type flags, by default the exclusive lock is taken. */
enum btrfs_ilock_type {
ENUM_BIT(BTRFS_ILOCK_SHARED),
ENUM_BIT(BTRFS_ILOCK_TRY),
ENUM_BIT(BTRFS_ILOCK_MMAP),
};
int btrfs_inode_lock(struct btrfs_inode *inode, unsigned int ilock_flags);
void btrfs_inode_unlock(struct btrfs_inode *inode, unsigned int ilock_flags);
void btrfs_update_inode_bytes(struct btrfs_inode *inode, const u64 add_bytes,
const u64 del_bytes);
void btrfs_assert_inode_range_clean(struct btrfs_inode *inode, u64 start, u64 end);
#endif

View File

@ -82,6 +82,7 @@
#include <linux/mm.h>
#include <linux/string.h>
#include <crypto/hash.h>
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@ -92,6 +93,7 @@
#include "check-integrity.h"
#include "rcu-string.h"
#include "compression.h"
#include "accessors.h"
#define BTRFSIC_BLOCK_HASHTABLE_SIZE 0x10000
#define BTRFSIC_BLOCK_LINK_HASHTABLE_SIZE 0x10000
@ -755,7 +757,7 @@ static int btrfsic_process_superblock_dev_mirror(
btrfs_info_in_rcu(fs_info,
"new initial S-block (bdev %p, %s) @%llu (%pg/%llu/%d)",
superblock_bdev,
rcu_str_deref(device->name), dev_bytenr,
btrfs_dev_name(device), dev_bytenr,
dev_state->bdev, dev_bytenr,
superblock_mirror_num);
list_add(&superblock_tmp->all_blocks_node,

View File

@ -23,16 +23,19 @@
#include <crypto/hash.h>
#include "misc.h"
#include "ctree.h"
#include "fs.h"
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "bio.h"
#include "ordered-data.h"
#include "compression.h"
#include "extent_io.h"
#include "extent_map.h"
#include "subpage.h"
#include "zoned.h"
#include "file-item.h"
#include "super.h"
static const char* const btrfs_compress_types[] = { "", "zlib", "lzo", "zstd" };
@ -116,7 +119,7 @@ static int compression_decompress_bio(struct list_head *ws,
}
static int compression_decompress(int type, struct list_head *ws,
unsigned char *data_in, struct page *dest_page,
const u8 *data_in, struct page *dest_page,
unsigned long start_byte, size_t srclen, size_t destlen)
{
switch (type) {
@ -183,7 +186,7 @@ static void end_compressed_bio_read(struct btrfs_bio *bbio)
u64 start = bbio->file_offset + offset;
if (!status &&
(!csum || !btrfs_check_data_csum(inode, bbio, offset,
(!csum || !btrfs_check_data_csum(bi, bbio, offset,
bv.bv_page, bv.bv_offset))) {
btrfs_clean_io_failure(bi, start, bv.bv_page,
bv.bv_offset);
@ -191,9 +194,9 @@ static void end_compressed_bio_read(struct btrfs_bio *bbio)
int ret;
refcount_inc(&cb->pending_ios);
ret = btrfs_repair_one_sector(inode, bbio, offset,
ret = btrfs_repair_one_sector(BTRFS_I(inode), bbio, offset,
bv.bv_page, bv.bv_offset,
btrfs_submit_data_read_bio);
true);
if (ret) {
refcount_dec(&cb->pending_ios);
status = errno_to_blk_status(ret);
@ -1229,7 +1232,7 @@ static int btrfs_decompress_bio(struct compressed_bio *cb)
* single page, and we want to read a single page out of it.
* start_byte tells us the offset into the compressed data we're interested in
*/
int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
unsigned long start_byte, size_t srclen, size_t destlen)
{
struct list_head *workspace;
@ -1243,12 +1246,13 @@ int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
return ret;
}
void __init btrfs_init_compress(void)
int __init btrfs_init_compress(void)
{
btrfs_init_workspace_manager(BTRFS_COMPRESS_NONE);
btrfs_init_workspace_manager(BTRFS_COMPRESS_ZLIB);
btrfs_init_workspace_manager(BTRFS_COMPRESS_LZO);
zstd_init_workspace_manager();
return 0;
}
void __cold btrfs_exit_compress(void)

View File

@ -6,6 +6,7 @@
#ifndef BTRFS_COMPRESSION_H
#define BTRFS_COMPRESSION_H
#include <linux/blk_types.h>
#include <linux/sizes.h>
struct btrfs_inode;
@ -77,7 +78,7 @@ static inline unsigned int btrfs_compress_level(unsigned int type_level)
return ((type_level & 0xF0) >> 4);
}
void __init btrfs_init_compress(void);
int __init btrfs_init_compress(void);
void __cold btrfs_exit_compress(void);
int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
@ -85,7 +86,7 @@ int btrfs_compress_pages(unsigned int type_level, struct address_space *mapping,
unsigned long *out_pages,
unsigned long *total_in,
unsigned long *total_out);
int btrfs_decompress(int type, unsigned char *data_in, struct page *dest_page,
int btrfs_decompress(int type, const u8 *data_in, struct page *dest_page,
unsigned long start_byte, size_t srclen, size_t destlen);
int btrfs_decompress_buf2page(const char *buf, u32 buf_len,
struct compressed_bio *cb, u32 decompressed);
@ -149,7 +150,7 @@ int zlib_compress_pages(struct list_head *ws, struct address_space *mapping,
u64 start, struct page **pages, unsigned long *out_pages,
unsigned long *total_in, unsigned long *total_out);
int zlib_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int zlib_decompress(struct list_head *ws, unsigned char *data_in,
int zlib_decompress(struct list_head *ws, const u8 *data_in,
struct page *dest_page, unsigned long start_byte, size_t srclen,
size_t destlen);
struct list_head *zlib_alloc_workspace(unsigned int level);
@ -160,7 +161,7 @@ int lzo_compress_pages(struct list_head *ws, struct address_space *mapping,
u64 start, struct page **pages, unsigned long *out_pages,
unsigned long *total_in, unsigned long *total_out);
int lzo_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int lzo_decompress(struct list_head *ws, unsigned char *data_in,
int lzo_decompress(struct list_head *ws, const u8 *data_in,
struct page *dest_page, unsigned long start_byte, size_t srclen,
size_t destlen);
struct list_head *lzo_alloc_workspace(unsigned int level);
@ -170,7 +171,7 @@ int zstd_compress_pages(struct list_head *ws, struct address_space *mapping,
u64 start, struct page **pages, unsigned long *out_pages,
unsigned long *total_in, unsigned long *total_out);
int zstd_decompress_bio(struct list_head *ws, struct compressed_bio *cb);
int zstd_decompress(struct list_head *ws, unsigned char *data_in,
int zstd_decompress(struct list_head *ws, const u8 *data_in,
struct page *dest_page, unsigned long start_byte, size_t srclen,
size_t destlen);
void zstd_init_workspace_manager(void);

View File

@ -8,6 +8,7 @@
#include <linux/rbtree.h>
#include <linux/mm.h>
#include <linux/error-injection.h>
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
@ -17,6 +18,13 @@
#include "qgroup.h"
#include "tree-mod-log.h"
#include "tree-checker.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "relocation.h"
#include "file-item.h"
static struct kmem_cache *btrfs_path_cachep;
static int split_node(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path, int level);
@ -44,6 +52,104 @@ static const struct btrfs_csums {
.driver = "blake2b-256" },
};
/*
* The leaf data grows from end-to-front in the node. this returns the address
* of the start of the last item, which is the stop of the leaf data stack.
*/
static unsigned int leaf_data_end(const struct extent_buffer *leaf)
{
u32 nr = btrfs_header_nritems(leaf);
if (nr == 0)
return BTRFS_LEAF_DATA_SIZE(leaf->fs_info);
return btrfs_item_offset(leaf, nr - 1);
}
/*
* Move data in a @leaf (using memmove, safe for overlapping ranges).
*
* @leaf: leaf that we're doing a memmove on
* @dst_offset: item data offset we're moving to
* @src_offset: item data offset were' moving from
* @len: length of the data we're moving
*
* Wrapper around memmove_extent_buffer() that takes into account the header on
* the leaf. The btrfs_item offset's start directly after the header, so we
* have to adjust any offsets to account for the header in the leaf. This
* handles that math to simplify the callers.
*/
static inline void memmove_leaf_data(const struct extent_buffer *leaf,
unsigned long dst_offset,
unsigned long src_offset,
unsigned long len)
{
memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, 0) + dst_offset,
btrfs_item_nr_offset(leaf, 0) + src_offset, len);
}
/*
* Copy item data from @src into @dst at the given @offset.
*
* @dst: destination leaf that we're copying into
* @src: source leaf that we're copying from
* @dst_offset: item data offset we're copying to
* @src_offset: item data offset were' copying from
* @len: length of the data we're copying
*
* Wrapper around copy_extent_buffer() that takes into account the header on
* the leaf. The btrfs_item offset's start directly after the header, so we
* have to adjust any offsets to account for the header in the leaf. This
* handles that math to simplify the callers.
*/
static inline void copy_leaf_data(const struct extent_buffer *dst,
const struct extent_buffer *src,
unsigned long dst_offset,
unsigned long src_offset, unsigned long len)
{
copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, 0) + dst_offset,
btrfs_item_nr_offset(src, 0) + src_offset, len);
}
/*
* Move items in a @leaf (using memmove).
*
* @dst: destination leaf for the items
* @dst_item: the item nr we're copying into
* @src_item: the item nr we're copying from
* @nr_items: the number of items to copy
*
* Wrapper around memmove_extent_buffer() that does the math to get the
* appropriate offsets into the leaf from the item numbers.
*/
static inline void memmove_leaf_items(const struct extent_buffer *leaf,
int dst_item, int src_item, int nr_items)
{
memmove_extent_buffer(leaf, btrfs_item_nr_offset(leaf, dst_item),
btrfs_item_nr_offset(leaf, src_item),
nr_items * sizeof(struct btrfs_item));
}
/*
* Copy items from @src into @dst at the given @offset.
*
* @dst: destination leaf for the items
* @src: source leaf for the items
* @dst_item: the item nr we're copying into
* @src_item: the item nr we're copying from
* @nr_items: the number of items to copy
*
* Wrapper around copy_extent_buffer() that does the math to get the
* appropriate offsets into the leaf from the item numbers.
*/
static inline void copy_leaf_items(const struct extent_buffer *dst,
const struct extent_buffer *src,
int dst_item, int src_item, int nr_items)
{
copy_extent_buffer(dst, src, btrfs_item_nr_offset(dst, dst_item),
btrfs_item_nr_offset(src, src_item),
nr_items * sizeof(struct btrfs_item));
}
int btrfs_super_csum_size(const struct btrfs_super_block *s)
{
u16 t = btrfs_super_csum_type(s);
@ -78,6 +184,8 @@ size_t __attribute_const__ btrfs_get_num_csums(void)
struct btrfs_path *btrfs_alloc_path(void)
{
might_sleep();
return kmem_cache_zalloc(btrfs_path_cachep, GFP_NOFS);
}
@ -487,7 +595,7 @@ static noinline int __btrfs_cow_block(struct btrfs_trans_handle *trans,
} else {
WARN_ON(trans->transid != btrfs_header_generation(parent));
btrfs_tree_mod_log_insert_key(parent, parent_slot,
BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
BTRFS_MOD_LOG_KEY_REPLACE);
btrfs_set_node_blockptr(parent, parent_slot,
cow->start);
btrfs_set_node_ptr_generation(parent, parent_slot,
@ -850,19 +958,22 @@ struct extent_buffer *btrfs_read_node_slot(struct extent_buffer *parent,
int slot)
{
int level = btrfs_header_level(parent);
struct btrfs_tree_parent_check check = { 0 };
struct extent_buffer *eb;
struct btrfs_key first_key;
if (slot < 0 || slot >= btrfs_header_nritems(parent))
return ERR_PTR(-ENOENT);
BUG_ON(level == 0);
btrfs_node_key_to_cpu(parent, &first_key, slot);
check.level = level - 1;
check.transid = btrfs_node_ptr_generation(parent, slot);
check.owner_root = btrfs_header_owner(parent);
check.has_first_key = true;
btrfs_node_key_to_cpu(parent, &check.first_key, slot);
eb = read_tree_block(parent->fs_info, btrfs_node_blockptr(parent, slot),
btrfs_header_owner(parent),
btrfs_node_ptr_generation(parent, slot),
level - 1, &first_key);
&check);
if (IS_ERR(eb))
return eb;
if (!extent_buffer_uptodate(eb)) {
@ -1016,7 +1127,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
struct btrfs_disk_key right_key;
btrfs_node_key(right, &right_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
BTRFS_MOD_LOG_KEY_REPLACE);
BUG_ON(ret < 0);
btrfs_set_node_key(parent, &right_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
@ -1062,7 +1173,7 @@ static noinline int balance_level(struct btrfs_trans_handle *trans,
struct btrfs_disk_key mid_key;
btrfs_node_key(mid, &mid_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot,
BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
BTRFS_MOD_LOG_KEY_REPLACE);
BUG_ON(ret < 0);
btrfs_set_node_key(parent, &mid_key, pslot);
btrfs_mark_buffer_dirty(parent);
@ -1164,7 +1275,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
orig_slot += left_nr;
btrfs_node_key(mid, &disk_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot,
BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
BTRFS_MOD_LOG_KEY_REPLACE);
BUG_ON(ret < 0);
btrfs_set_node_key(parent, &disk_key, pslot);
btrfs_mark_buffer_dirty(parent);
@ -1218,7 +1329,7 @@ static noinline int push_nodes_for_insert(struct btrfs_trans_handle *trans,
btrfs_node_key(right, &disk_key, 0);
ret = btrfs_tree_mod_log_insert_key(parent, pslot + 1,
BTRFS_MOD_LOG_KEY_REPLACE, GFP_NOFS);
BTRFS_MOD_LOG_KEY_REPLACE);
BUG_ON(ret < 0);
btrfs_set_node_key(parent, &disk_key, pslot + 1);
btrfs_mark_buffer_dirty(parent);
@ -1421,10 +1532,10 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
const struct btrfs_key *key)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_tree_parent_check check = { 0 };
u64 blocknr;
u64 gen;
struct extent_buffer *tmp;
struct btrfs_key first_key;
int ret;
int parent_level;
bool unlock_up;
@ -1433,7 +1544,11 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
blocknr = btrfs_node_blockptr(*eb_ret, slot);
gen = btrfs_node_ptr_generation(*eb_ret, slot);
parent_level = btrfs_header_level(*eb_ret);
btrfs_node_key_to_cpu(*eb_ret, &first_key, slot);
btrfs_node_key_to_cpu(*eb_ret, &check.first_key, slot);
check.has_first_key = true;
check.level = parent_level - 1;
check.transid = gen;
check.owner_root = root->root_key.objectid;
/*
* If we need to read an extent buffer from disk and we are holding locks
@ -1455,7 +1570,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
* parents (shared tree blocks).
*/
if (btrfs_verify_level_key(tmp,
parent_level - 1, &first_key, gen)) {
parent_level - 1, &check.first_key, gen)) {
free_extent_buffer(tmp);
return -EUCLEAN;
}
@ -1472,7 +1587,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
btrfs_unlock_up_safe(p, level + 1);
/* now we're allowed to do a blocking uptodate check */
ret = btrfs_read_extent_buffer(tmp, gen, parent_level - 1, &first_key);
ret = btrfs_read_extent_buffer(tmp, &check);
if (ret) {
free_extent_buffer(tmp);
btrfs_release_path(p);
@ -1502,8 +1617,7 @@ read_block_for_search(struct btrfs_root *root, struct btrfs_path *p,
if (p->reada != READA_NONE)
reada_for_search(fs_info, p, level, slot, key->objectid);
tmp = read_tree_block(fs_info, blocknr, root->root_key.objectid,
gen, parent_level - 1, &first_key);
tmp = read_tree_block(fs_info, blocknr, &check);
if (IS_ERR(tmp)) {
btrfs_release_path(p);
return PTR_ERR(tmp);
@ -1934,6 +2048,8 @@ int btrfs_search_slot(struct btrfs_trans_handle *trans, struct btrfs_root *root,
int min_write_lock_level;
int prev_cmp;
might_sleep();
lowest_level = p->lowest_level;
WARN_ON(lowest_level && ins_len > 0);
WARN_ON(p->nodes[0] != NULL);
@ -2357,7 +2473,7 @@ int btrfs_search_backwards(struct btrfs_root *root, struct btrfs_key *key,
return ret;
}
/**
/*
* Search for a valid slot for the given path.
*
* @root: The root node of the tree.
@ -2416,7 +2532,7 @@ static void fixup_low_keys(struct btrfs_path *path,
break;
t = path->nodes[i];
ret = btrfs_tree_mod_log_insert_key(t, tslot,
BTRFS_MOD_LOG_KEY_REPLACE, GFP_ATOMIC);
BTRFS_MOD_LOG_KEY_REPLACE);
BUG_ON(ret < 0);
btrfs_set_node_key(t, key, tslot);
btrfs_mark_buffer_dirty(path->nodes[i]);
@ -2585,8 +2701,8 @@ static int push_node_left(struct btrfs_trans_handle *trans,
return ret;
}
copy_extent_buffer(dst, src,
btrfs_node_key_ptr_offset(dst_nritems),
btrfs_node_key_ptr_offset(0),
btrfs_node_key_ptr_offset(dst, dst_nritems),
btrfs_node_key_ptr_offset(src, 0),
push_items * sizeof(struct btrfs_key_ptr));
if (push_items < src_nritems) {
@ -2594,8 +2710,8 @@ static int push_node_left(struct btrfs_trans_handle *trans,
* Don't call btrfs_tree_mod_log_insert_move() here, key removal
* was already fully logged by btrfs_tree_mod_log_eb_copy() above.
*/
memmove_extent_buffer(src, btrfs_node_key_ptr_offset(0),
btrfs_node_key_ptr_offset(push_items),
memmove_extent_buffer(src, btrfs_node_key_ptr_offset(src, 0),
btrfs_node_key_ptr_offset(src, push_items),
(src_nritems - push_items) *
sizeof(struct btrfs_key_ptr));
}
@ -2655,8 +2771,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
}
ret = btrfs_tree_mod_log_insert_move(dst, push_items, 0, dst_nritems);
BUG_ON(ret < 0);
memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(push_items),
btrfs_node_key_ptr_offset(0),
memmove_extent_buffer(dst, btrfs_node_key_ptr_offset(dst, push_items),
btrfs_node_key_ptr_offset(dst, 0),
(dst_nritems) *
sizeof(struct btrfs_key_ptr));
@ -2667,8 +2783,8 @@ static int balance_node_right(struct btrfs_trans_handle *trans,
return ret;
}
copy_extent_buffer(dst, src,
btrfs_node_key_ptr_offset(0),
btrfs_node_key_ptr_offset(src_nritems - push_items),
btrfs_node_key_ptr_offset(dst, 0),
btrfs_node_key_ptr_offset(src, src_nritems - push_items),
push_items * sizeof(struct btrfs_key_ptr));
btrfs_set_header_nritems(src, src_nritems - push_items);
@ -2771,13 +2887,13 @@ static void insert_ptr(struct btrfs_trans_handle *trans,
BUG_ON(ret < 0);
}
memmove_extent_buffer(lower,
btrfs_node_key_ptr_offset(slot + 1),
btrfs_node_key_ptr_offset(slot),
btrfs_node_key_ptr_offset(lower, slot + 1),
btrfs_node_key_ptr_offset(lower, slot),
(nritems - slot) * sizeof(struct btrfs_key_ptr));
}
if (level) {
ret = btrfs_tree_mod_log_insert_key(lower, slot,
BTRFS_MOD_LOG_KEY_ADD, GFP_NOFS);
BTRFS_MOD_LOG_KEY_ADD);
BUG_ON(ret < 0);
}
btrfs_set_node_key(lower, key, slot);
@ -2854,8 +2970,8 @@ static noinline int split_node(struct btrfs_trans_handle *trans,
return ret;
}
copy_extent_buffer(split, c,
btrfs_node_key_ptr_offset(0),
btrfs_node_key_ptr_offset(mid),
btrfs_node_key_ptr_offset(split, 0),
btrfs_node_key_ptr_offset(c, mid),
(c_nritems - mid) * sizeof(struct btrfs_key_ptr));
btrfs_set_header_nritems(split, c_nritems - mid);
btrfs_set_header_nritems(c, mid);
@ -2995,25 +3111,17 @@ static noinline int __push_leaf_right(struct btrfs_path *path,
/* make room in the right data area */
data_end = leaf_data_end(right);
memmove_extent_buffer(right,
BTRFS_LEAF_DATA_OFFSET + data_end - push_space,
BTRFS_LEAF_DATA_OFFSET + data_end,
memmove_leaf_data(right, data_end - push_space, data_end,
BTRFS_LEAF_DATA_SIZE(fs_info) - data_end);
/* copy from the left data area */
copy_extent_buffer(right, left, BTRFS_LEAF_DATA_OFFSET +
BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
BTRFS_LEAF_DATA_OFFSET + leaf_data_end(left),
push_space);
copy_leaf_data(right, left, BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
leaf_data_end(left), push_space);
memmove_extent_buffer(right, btrfs_item_nr_offset(push_items),
btrfs_item_nr_offset(0),
right_nritems * sizeof(struct btrfs_item));
memmove_leaf_items(right, push_items, 0, right_nritems);
/* copy the items from left to right */
copy_extent_buffer(right, left, btrfs_item_nr_offset(0),
btrfs_item_nr_offset(left_nritems - push_items),
push_items * sizeof(struct btrfs_item));
copy_leaf_items(right, left, 0, left_nritems - push_items, push_items);
/* update the item pointers */
btrfs_init_map_token(&token, right);
@ -3205,19 +3313,13 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
WARN_ON(!empty && push_items == btrfs_header_nritems(right));
/* push data from right to left */
copy_extent_buffer(left, right,
btrfs_item_nr_offset(btrfs_header_nritems(left)),
btrfs_item_nr_offset(0),
push_items * sizeof(struct btrfs_item));
copy_leaf_items(left, right, btrfs_header_nritems(left), 0, push_items);
push_space = BTRFS_LEAF_DATA_SIZE(fs_info) -
btrfs_item_offset(right, push_items - 1);
copy_extent_buffer(left, right, BTRFS_LEAF_DATA_OFFSET +
leaf_data_end(left) - push_space,
BTRFS_LEAF_DATA_OFFSET +
btrfs_item_offset(right, push_items - 1),
push_space);
copy_leaf_data(left, right, leaf_data_end(left) - push_space,
btrfs_item_offset(right, push_items - 1), push_space);
old_left_nritems = btrfs_header_nritems(left);
BUG_ON(old_left_nritems <= 0);
@ -3240,15 +3342,12 @@ static noinline int __push_leaf_left(struct btrfs_path *path, int data_size,
if (push_items < right_nritems) {
push_space = btrfs_item_offset(right, push_items - 1) -
leaf_data_end(right);
memmove_extent_buffer(right, BTRFS_LEAF_DATA_OFFSET +
memmove_leaf_data(right,
BTRFS_LEAF_DATA_SIZE(fs_info) - push_space,
BTRFS_LEAF_DATA_OFFSET +
leaf_data_end(right), push_space);
memmove_extent_buffer(right, btrfs_item_nr_offset(0),
btrfs_item_nr_offset(push_items),
(btrfs_header_nritems(right) - push_items) *
sizeof(struct btrfs_item));
memmove_leaf_items(right, 0, push_items,
btrfs_header_nritems(right) - push_items);
}
btrfs_init_map_token(&token, right);
@ -3380,13 +3479,9 @@ static noinline void copy_for_split(struct btrfs_trans_handle *trans,
btrfs_set_header_nritems(right, nritems);
data_copy_size = btrfs_item_data_end(l, mid) - leaf_data_end(l);
copy_extent_buffer(right, l, btrfs_item_nr_offset(0),
btrfs_item_nr_offset(mid),
nritems * sizeof(struct btrfs_item));
copy_leaf_items(right, l, 0, mid, nritems);
copy_extent_buffer(right, l,
BTRFS_LEAF_DATA_OFFSET + BTRFS_LEAF_DATA_SIZE(fs_info) -
data_copy_size, BTRFS_LEAF_DATA_OFFSET +
copy_leaf_data(right, l, BTRFS_LEAF_DATA_SIZE(fs_info) - data_copy_size,
leaf_data_end(l), data_copy_size);
rt_data_off = BTRFS_LEAF_DATA_SIZE(fs_info) - btrfs_item_data_end(l, mid);
@ -3757,9 +3852,7 @@ static noinline int split_item(struct btrfs_path *path,
nritems = btrfs_header_nritems(leaf);
if (slot != nritems) {
/* shift the items */
memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + 1),
btrfs_item_nr_offset(slot),
(nritems - slot) * sizeof(struct btrfs_item));
memmove_leaf_items(leaf, slot + 1, slot, nritems - slot);
}
btrfs_cpu_key_to_disk(&disk_key, new_key);
@ -3870,9 +3963,8 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
/* shift the data */
if (from_end) {
memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
data_end, old_data_start + new_size - data_end);
memmove_leaf_data(leaf, data_end + size_diff, data_end,
old_data_start + new_size - data_end);
} else {
struct btrfs_disk_key disk_key;
u64 offset;
@ -3897,9 +3989,8 @@ void btrfs_truncate_item(struct btrfs_path *path, u32 new_size, int from_end)
}
}
memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
data_end + size_diff, BTRFS_LEAF_DATA_OFFSET +
data_end, old_data_start - data_end);
memmove_leaf_data(leaf, data_end + size_diff, data_end,
old_data_start - data_end);
offset = btrfs_disk_key_offset(&disk_key);
btrfs_set_disk_key_offset(&disk_key, offset + size_diff);
@ -3964,9 +4055,8 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
}
/* shift the data */
memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
data_end - data_size, BTRFS_LEAF_DATA_OFFSET +
data_end, old_data - data_end);
memmove_leaf_data(leaf, data_end - data_size, data_end,
old_data - data_end);
data_end = old_data;
old_size = btrfs_item_size(leaf, slot);
@ -3979,14 +4069,15 @@ void btrfs_extend_item(struct btrfs_path *path, u32 data_size)
}
}
/**
* setup_items_for_insert - Helper called before inserting one or more items
* to a leaf. Main purpose is to save stack depth by doing the bulk of the work
* in a function that doesn't call btrfs_search_slot
/*
* Make space in the node before inserting one or more items.
*
* @root: root we are inserting items to
* @path: points to the leaf/slot where we are going to insert new items
* @batch: information about the batch of items to insert
*
* Main purpose is to save stack depth by doing the bulk of the work in a
* function that doesn't call btrfs_search_slot
*/
static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *path,
const struct btrfs_item_batch *batch)
@ -4049,15 +4140,11 @@ static void setup_items_for_insert(struct btrfs_root *root, struct btrfs_path *p
ioff - batch->total_data_size);
}
/* shift the items */
memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot + batch->nr),
btrfs_item_nr_offset(slot),
(nritems - slot) * sizeof(struct btrfs_item));
memmove_leaf_items(leaf, slot + batch->nr, slot, nritems - slot);
/* shift the data */
memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
data_end - batch->total_data_size,
BTRFS_LEAF_DATA_OFFSET + data_end,
old_data - data_end);
memmove_leaf_data(leaf, data_end - batch->total_data_size,
data_end, old_data - data_end);
data_end = old_data;
}
@ -4211,13 +4298,13 @@ static void del_ptr(struct btrfs_root *root, struct btrfs_path *path,
BUG_ON(ret < 0);
}
memmove_extent_buffer(parent,
btrfs_node_key_ptr_offset(slot),
btrfs_node_key_ptr_offset(slot + 1),
btrfs_node_key_ptr_offset(parent, slot),
btrfs_node_key_ptr_offset(parent, slot + 1),
sizeof(struct btrfs_key_ptr) *
(nritems - slot - 1));
} else if (level) {
ret = btrfs_tree_mod_log_insert_key(parent, slot,
BTRFS_MOD_LOG_KEY_REMOVE, GFP_NOFS);
BTRFS_MOD_LOG_KEY_REMOVE);
BUG_ON(ret < 0);
}
@ -4292,9 +4379,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
for (i = 0; i < nr; i++)
dsize += btrfs_item_size(leaf, slot + i);
memmove_extent_buffer(leaf, BTRFS_LEAF_DATA_OFFSET +
data_end + dsize,
BTRFS_LEAF_DATA_OFFSET + data_end,
memmove_leaf_data(leaf, data_end + dsize, data_end,
last_off - data_end);
btrfs_init_map_token(&token, leaf);
@ -4305,10 +4390,7 @@ int btrfs_del_items(struct btrfs_trans_handle *trans, struct btrfs_root *root,
btrfs_set_token_item_offset(&token, i, ioff + dsize);
}
memmove_extent_buffer(leaf, btrfs_item_nr_offset(slot),
btrfs_item_nr_offset(slot + nr),
sizeof(struct btrfs_item) *
(nritems - slot - nr));
memmove_leaf_items(leaf, slot, slot + nr, nritems - slot - nr);
}
btrfs_set_header_nritems(leaf, nritems - nr);
nritems -= nr;
@ -4850,6 +4932,14 @@ done:
return ret;
}
int btrfs_next_old_item(struct btrfs_root *root, struct btrfs_path *path, u64 time_seq)
{
path->slots[0]++;
if (path->slots[0] >= btrfs_header_nritems(path->nodes[0]))
return btrfs_next_old_leaf(root, path, time_seq);
return 0;
}
/*
* this uses btrfs_prev_leaf to walk backwards in the tree, and keeps
* searching until it gets past min_objectid or finds an item of 'type'
@ -4933,3 +5023,18 @@ int btrfs_previous_extent_item(struct btrfs_root *root,
}
return 1;
}
int __init btrfs_ctree_init(void)
{
btrfs_path_cachep = kmem_cache_create("btrfs_path",
sizeof(struct btrfs_path), 0,
SLAB_MEM_SPREAD, NULL);
if (!btrfs_path_cachep)
return -ENOMEM;
return 0;
}
void __cold btrfs_ctree_exit(void)
{
kmem_cache_destroy(btrfs_path_cachep);
}

File diff suppressed because it is too large Load Diff

1376
fs/btrfs/defrag.c Normal file

File diff suppressed because it is too large Load Diff

22
fs/btrfs/defrag.h Normal file
View File

@ -0,0 +1,22 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BTRFS_DEFRAG_H
#define BTRFS_DEFRAG_H
int btrfs_defrag_file(struct inode *inode, struct file_ra_state *ra,
struct btrfs_ioctl_defrag_range_args *range,
u64 newer_than, unsigned long max_to_defrag);
int __init btrfs_auto_defrag_init(void);
void __cold btrfs_auto_defrag_exit(void);
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u32 extent_thresh);
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info);
void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info);
int btrfs_defrag_leaves(struct btrfs_trans_handle *trans, struct btrfs_root *root);
static inline int btrfs_defrag_cancelled(struct btrfs_fs_info *fs_info)
{
return signal_pending(current);
}
#endif

View File

@ -1,5 +1,6 @@
// SPDX-License-Identifier: GPL-2.0
#include "messages.h"
#include "ctree.h"
#include "delalloc-space.h"
#include "block-rsv.h"
@ -8,6 +9,7 @@
#include "transaction.h"
#include "qgroup.h"
#include "block-group.h"
#include "fs.h"
/*
* HOW DOES THIS WORK
@ -200,8 +202,8 @@ void btrfs_free_reserved_data_space(struct btrfs_inode *inode,
btrfs_qgroup_free_data(inode, reserved, start, len);
}
/**
* Release any excessive reservation
/*
* Release any excessive reservations for an inode.
*
* @inode: the inode we need to release from
* @qgroup_free: free or convert qgroup meta. Unlike normal operation, qgroup
@ -375,8 +377,8 @@ int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
return 0;
}
/**
* Release a metadata reservation for an inode
/*
* Release a metadata reservation for an inode.
*
* @inode: the inode to release the reservation for.
* @num_bytes: the number of bytes we are releasing.
@ -403,8 +405,9 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
btrfs_inode_rsv_release(inode, qgroup_free);
}
/**
* btrfs_delalloc_release_extents - release our outstanding_extents
/*
* Release our outstanding_extents for an inode.
*
* @inode: the inode to balance the reservation for.
* @num_bytes: the number of bytes we originally reserved with
*
@ -431,9 +434,9 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
btrfs_inode_rsv_release(inode, true);
}
/**
* btrfs_delalloc_reserve_space - reserve data and metadata space for
* delalloc
/*
* Reserve data and metadata space for delalloc
*
* @inode: inode we're writing to
* @start: start range we are writing to
* @len: how long the range we are writing to
@ -442,19 +445,19 @@ void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes)
*
* This will do the following things
*
* - reserve space in data space info for num bytes
* and reserve precious corresponding qgroup space
* - reserve space in data space info for num bytes and reserve precious
* corresponding qgroup space
* (Done in check_data_free_space)
*
* - reserve space for metadata space, based on the number of outstanding
* extents and how much csums will be needed
* also reserve metadata space in a per root over-reserve method.
* extents and how much csums will be needed also reserve metadata space in a
* per root over-reserve method.
* - add to the inodes->delalloc_bytes
* - add it to the fs_info's delalloc inodes list.
* (Above 3 all done in delalloc_reserve_metadata)
*
* Return 0 for success
* Return <0 for error(-ENOSPC or -EQUOT)
* Return <0 for error(-ENOSPC or -EDQUOT)
*/
int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len)
@ -473,7 +476,7 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
return ret;
}
/**
/*
* Release data and metadata space for delalloc
*
* @inode: inode we're releasing space for
@ -482,10 +485,10 @@ int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
* @len: length of the space already reserved
* @qgroup_free: should qgroup reserved-space also be freed
*
* This function will release the metadata space that was not used and will
* decrement ->delalloc_bytes and remove it from the fs_info delalloc_inodes
* list if there are no delalloc bytes left.
* Also it will handle the qgroup reserved space.
* Release the metadata space that was not used and will decrement
* ->delalloc_bytes and remove it from the fs_info->delalloc_inodes list if
* there are no delalloc bytes left. Also it will handle the qgroup reserved
* space.
*/
void btrfs_delalloc_release_space(struct btrfs_inode *inode,
struct extent_changeset *reserved,

View File

@ -20,5 +20,8 @@ void btrfs_delalloc_release_metadata(struct btrfs_inode *inode, u64 num_bytes,
bool qgroup_free);
int btrfs_delalloc_reserve_space(struct btrfs_inode *inode,
struct extent_changeset **reserved, u64 start, u64 len);
int btrfs_delalloc_reserve_metadata(struct btrfs_inode *inode, u64 num_bytes,
u64 disk_num_bytes, bool noflush);
void btrfs_delalloc_release_extents(struct btrfs_inode *inode, u64 num_bytes);
#endif /* BTRFS_DELALLOC_SPACE_H */

View File

@ -6,14 +6,19 @@
#include <linux/slab.h>
#include <linux/iversion.h>
#include "ctree.h"
#include "fs.h"
#include "messages.h"
#include "misc.h"
#include "delayed-inode.h"
#include "disk-io.h"
#include "transaction.h"
#include "ctree.h"
#include "qgroup.h"
#include "locking.h"
#include "inode-item.h"
#include "space-info.h"
#include "accessors.h"
#include "file-item.h"
#define BTRFS_DELAYED_WRITEBACK 512
#define BTRFS_DELAYED_BACKGROUND 128
@ -1412,7 +1417,7 @@ void btrfs_balance_delayed_items(struct btrfs_fs_info *fs_info)
int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
const char *name, int name_len,
struct btrfs_inode *dir,
struct btrfs_disk_key *disk_key, u8 type,
struct btrfs_disk_key *disk_key, u8 flags,
u64 index)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
@ -1443,7 +1448,7 @@ int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
btrfs_set_stack_dir_transid(dir_item, trans->transid);
btrfs_set_stack_dir_data_len(dir_item, 0);
btrfs_set_stack_dir_name_len(dir_item, name_len);
btrfs_set_stack_dir_type(dir_item, type);
btrfs_set_stack_dir_flags(dir_item, flags);
memcpy((char *)(dir_item + 1), name, name_len);
data_len = delayed_item->data_len + sizeof(struct btrfs_item);
@ -1641,8 +1646,8 @@ bool btrfs_readdir_get_delayed_items(struct inode *inode,
* We can only do one readdir with delayed items at a time because of
* item->readdir_list.
*/
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
btrfs_inode_lock(inode, 0);
btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
btrfs_inode_lock(BTRFS_I(inode), 0);
mutex_lock(&delayed_node->mutex);
item = __btrfs_first_delayed_insertion_item(delayed_node);
@ -1753,7 +1758,7 @@ int btrfs_readdir_delayed_dir_index(struct dir_context *ctx,
name = (char *)(di + 1);
name_len = btrfs_stack_dir_name_len(di);
d_type = fs_ftype_to_dtype(di->type);
d_type = fs_ftype_to_dtype(btrfs_dir_flags_to_ftype(di->type));
btrfs_disk_key_to_cpu(&location, &di->location);
over = !dir_emit(ctx, name, name_len,

View File

@ -113,7 +113,7 @@ static inline void btrfs_init_delayed_root(
int btrfs_insert_delayed_dir_index(struct btrfs_trans_handle *trans,
const char *name, int name_len,
struct btrfs_inode *dir,
struct btrfs_disk_key *disk_key, u8 type,
struct btrfs_disk_key *disk_key, u8 flags,
u64 index);
int btrfs_delete_delayed_dir_index(struct btrfs_trans_handle *trans,

View File

@ -6,12 +6,14 @@
#include <linux/sched.h>
#include <linux/slab.h>
#include <linux/sort.h>
#include "messages.h"
#include "ctree.h"
#include "delayed-ref.h"
#include "transaction.h"
#include "qgroup.h"
#include "space-info.h"
#include "tree-mod-log.h"
#include "fs.h"
struct kmem_cache *btrfs_delayed_ref_head_cachep;
struct kmem_cache *btrfs_delayed_tree_ref_cachep;
@ -69,14 +71,14 @@ int btrfs_should_throttle_delayed_refs(struct btrfs_trans_handle *trans)
return btrfs_check_space_for_delayed_refs(trans->fs_info);
}
/**
* Release a ref head's reservation
/*
* Release a ref head's reservation.
*
* @fs_info: the filesystem
* @nr: number of items to drop
*
* This drops the delayed ref head's count from the delayed refs rsv and frees
* any excess reservation we had.
* Drops the delayed ref head's count from the delayed refs rsv and free any
* excess reservation we had.
*/
void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
{
@ -102,8 +104,7 @@ void btrfs_delayed_refs_rsv_release(struct btrfs_fs_info *fs_info, int nr)
}
/*
* btrfs_update_delayed_refs_rsv - adjust the size of the delayed refs rsv
* @trans - the trans that may have generated delayed refs
* Adjust the size of the delayed refs rsv.
*
* This is to be called anytime we may have adjusted trans->delayed_ref_updates,
* it'll calculate the additional size and add it to the delayed_refs_rsv.
@ -137,8 +138,8 @@ void btrfs_update_delayed_refs_rsv(struct btrfs_trans_handle *trans)
trans->delayed_ref_updates = 0;
}
/**
* Transfer bytes to our delayed refs rsv
/*
* Transfer bytes to our delayed refs rsv.
*
* @fs_info: the filesystem
* @src: source block rsv to transfer from
@ -186,8 +187,8 @@ void btrfs_migrate_to_delayed_refs_rsv(struct btrfs_fs_info *fs_info,
delayed_refs_rsv->space_info, to_free);
}
/**
* Refill based on our delayed refs usage
/*
* Refill based on our delayed refs usage.
*
* @fs_info: the filesystem
* @flush: control how we can flush for this reservation.

View File

@ -18,11 +18,13 @@
#include "volumes.h"
#include "async-thread.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "dev-replace.h"
#include "sysfs.h"
#include "zoned.h"
#include "block-group.h"
#include "fs.h"
#include "accessors.h"
#include "scrub.h"
/*
* Device replace overview
@ -246,7 +248,6 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
struct btrfs_fs_devices *fs_devices = fs_info->fs_devices;
struct btrfs_device *device;
struct block_device *bdev;
struct rcu_string *name;
u64 devid = BTRFS_DEV_REPLACE_DEVID;
int ret = 0;
@ -290,19 +291,12 @@ static int btrfs_init_dev_replace_tgtdev(struct btrfs_fs_info *fs_info,
}
device = btrfs_alloc_device(NULL, &devid, NULL);
device = btrfs_alloc_device(NULL, &devid, NULL, device_path);
if (IS_ERR(device)) {
ret = PTR_ERR(device);
goto error;
}
name = rcu_string_strdup(device_path, GFP_KERNEL);
if (!name) {
btrfs_free_device(device);
ret = -ENOMEM;
goto error;
}
rcu_assign_pointer(device->name, name);
ret = lookup_bdev(device_path, &device->devt);
if (ret)
goto error;
@ -456,14 +450,6 @@ out:
return ret;
}
static char* btrfs_dev_name(struct btrfs_device *device)
{
if (!device || test_bit(BTRFS_DEV_STATE_MISSING, &device->dev_state))
return "<missing disk>";
else
return rcu_str_deref(device->name);
}
static int mark_block_group_to_copy(struct btrfs_fs_info *fs_info,
struct btrfs_device *src_dev)
{
@ -679,7 +665,7 @@ static int btrfs_dev_replace_start(struct btrfs_fs_info *fs_info,
"dev_replace from %s (devid %llu) to %s started",
btrfs_dev_name(src_device),
src_device->devid,
rcu_str_deref(tgt_device->name));
btrfs_dev_name(tgt_device));
/*
* from now on, the writes to the srcdev are all duplicated to
@ -938,7 +924,7 @@ static int btrfs_dev_replace_finishing(struct btrfs_fs_info *fs_info,
"btrfs_scrub_dev(%s, %llu, %s) failed %d",
btrfs_dev_name(src_device),
src_device->devid,
rcu_str_deref(tgt_device->name), scrub_ret);
btrfs_dev_name(tgt_device), scrub_ret);
error:
up_write(&dev_replace->rwsem);
mutex_unlock(&fs_info->chunk_mutex);
@ -956,7 +942,7 @@ error:
"dev_replace from %s (devid %llu) to %s finished",
btrfs_dev_name(src_device),
src_device->devid,
rcu_str_deref(tgt_device->name));
btrfs_dev_name(tgt_device));
clear_bit(BTRFS_DEV_STATE_REPLACE_TGT, &tgt_device->dev_state);
tgt_device->devid = src_device->devid;
src_device->devid = BTRFS_DEV_REPLACE_DEVID;

View File

@ -25,5 +25,13 @@ int __pure btrfs_dev_replace_is_ongoing(struct btrfs_dev_replace *dev_replace);
bool btrfs_finish_block_group_to_copy(struct btrfs_device *srcdev,
struct btrfs_block_group *cache,
u64 physical);
void btrfs_bio_counter_inc_blocked(struct btrfs_fs_info *fs_info);
void btrfs_bio_counter_sub(struct btrfs_fs_info *fs_info, s64 amount);
static inline void btrfs_bio_counter_dec(struct btrfs_fs_info *fs_info)
{
btrfs_bio_counter_sub(fs_info, 1);
}
#endif

View File

@ -3,9 +3,12 @@
* Copyright (C) 2007 Oracle. All rights reserved.
*/
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "accessors.h"
#include "dir-item.h"
/*
* insert a name into a directory, doing overflow properly if there is a hash
@ -81,7 +84,7 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
leaf = path->nodes[0];
btrfs_cpu_key_to_disk(&disk_key, &location);
btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
btrfs_set_dir_type(leaf, dir_item, BTRFS_FT_XATTR);
btrfs_set_dir_flags(leaf, dir_item, BTRFS_FT_XATTR);
btrfs_set_dir_name_len(leaf, dir_item, name_len);
btrfs_set_dir_transid(leaf, dir_item, trans->transid);
btrfs_set_dir_data_len(leaf, dir_item, data_len);
@ -103,8 +106,8 @@ int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
* to use for the second index (if one is created).
* Will return 0 or -ENOMEM
*/
int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name,
int name_len, struct btrfs_inode *dir,
int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
const struct fscrypt_str *name, struct btrfs_inode *dir,
struct btrfs_key *location, u8 type, u64 index)
{
int ret = 0;
@ -120,7 +123,7 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name,
key.objectid = btrfs_ino(dir);
key.type = BTRFS_DIR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
key.offset = btrfs_name_hash(name->name, name->len);
path = btrfs_alloc_path();
if (!path)
@ -128,9 +131,9 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name,
btrfs_cpu_key_to_disk(&disk_key, location);
data_size = sizeof(*dir_item) + name_len;
data_size = sizeof(*dir_item) + name->len;
dir_item = insert_with_overflow(trans, root, path, &key, data_size,
name, name_len);
name->name, name->len);
if (IS_ERR(dir_item)) {
ret = PTR_ERR(dir_item);
if (ret == -EEXIST)
@ -138,15 +141,18 @@ int btrfs_insert_dir_item(struct btrfs_trans_handle *trans, const char *name,
goto out_free;
}
if (IS_ENCRYPTED(&dir->vfs_inode))
type |= BTRFS_FT_ENCRYPTED;
leaf = path->nodes[0];
btrfs_set_dir_item_key(leaf, dir_item, &disk_key);
btrfs_set_dir_type(leaf, dir_item, type);
btrfs_set_dir_flags(leaf, dir_item, type);
btrfs_set_dir_data_len(leaf, dir_item, 0);
btrfs_set_dir_name_len(leaf, dir_item, name_len);
btrfs_set_dir_name_len(leaf, dir_item, name->len);
btrfs_set_dir_transid(leaf, dir_item, trans->transid);
name_ptr = (unsigned long)(dir_item + 1);
write_extent_buffer(leaf, name, name_ptr, name_len);
write_extent_buffer(leaf, name->name, name_ptr, name->len);
btrfs_mark_buffer_dirty(leaf);
second_insert:
@ -157,7 +163,7 @@ second_insert:
}
btrfs_release_path(path);
ret2 = btrfs_insert_delayed_dir_index(trans, name, name_len, dir,
ret2 = btrfs_insert_delayed_dir_index(trans, name->name, name->len, dir,
&disk_key, type, index);
out_free:
btrfs_free_path(path);
@ -206,7 +212,7 @@ static struct btrfs_dir_item *btrfs_lookup_match_dir(
struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 dir,
const char *name, int name_len,
const struct fscrypt_str *name,
int mod)
{
struct btrfs_key key;
@ -214,9 +220,10 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
key.objectid = dir;
key.type = BTRFS_DIR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
key.offset = btrfs_name_hash(name->name, name->len);
di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
di = btrfs_lookup_match_dir(trans, root, path, &key, name->name,
name->len, mod);
if (IS_ERR(di) && PTR_ERR(di) == -ENOENT)
return NULL;
@ -224,7 +231,7 @@ struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
}
int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
const char *name, int name_len)
const struct fscrypt_str *name)
{
int ret;
struct btrfs_key key;
@ -240,9 +247,10 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
key.objectid = dir;
key.type = BTRFS_DIR_ITEM_KEY;
key.offset = btrfs_name_hash(name, name_len);
key.offset = btrfs_name_hash(name->name, name->len);
di = btrfs_lookup_match_dir(NULL, root, path, &key, name, name_len, 0);
di = btrfs_lookup_match_dir(NULL, root, path, &key, name->name,
name->len, 0);
if (IS_ERR(di)) {
ret = PTR_ERR(di);
/* Nothing found, we're safe */
@ -262,11 +270,8 @@ int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
goto out;
}
/*
* see if there is room in the item to insert this
* name
*/
data_size = sizeof(*di) + name_len;
/* See if there is room in the item to insert this name. */
data_size = sizeof(*di) + name->len;
leaf = path->nodes[0];
slot = path->slots[0];
if (data_size + btrfs_item_size(leaf, slot) +
@ -303,8 +308,7 @@ struct btrfs_dir_item *
btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 dir,
u64 index, const char *name, int name_len,
int mod)
u64 index, const struct fscrypt_str *name, int mod)
{
struct btrfs_dir_item *di;
struct btrfs_key key;
@ -313,7 +317,8 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
key.type = BTRFS_DIR_INDEX_KEY;
key.offset = index;
di = btrfs_lookup_match_dir(trans, root, path, &key, name, name_len, mod);
di = btrfs_lookup_match_dir(trans, root, path, &key, name->name,
name->len, mod);
if (di == ERR_PTR(-ENOENT))
return NULL;
@ -321,9 +326,8 @@ btrfs_lookup_dir_index_item(struct btrfs_trans_handle *trans,
}
struct btrfs_dir_item *
btrfs_search_dir_index_item(struct btrfs_root *root,
struct btrfs_path *path, u64 dirid,
const char *name, int name_len)
btrfs_search_dir_index_item(struct btrfs_root *root, struct btrfs_path *path,
u64 dirid, const struct fscrypt_str *name)
{
struct btrfs_dir_item *di;
struct btrfs_key key;
@ -338,7 +342,7 @@ btrfs_search_dir_index_item(struct btrfs_root *root,
break;
di = btrfs_match_dir_item_name(root->fs_info, path,
name, name_len);
name->name, name->len);
if (di)
return di;
}

42
fs/btrfs/dir-item.h Normal file
View File

@ -0,0 +1,42 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BTRFS_DIR_ITEM_H
#define BTRFS_DIR_ITEM_H
int btrfs_check_dir_item_collision(struct btrfs_root *root, u64 dir,
const struct fscrypt_str *name);
int btrfs_insert_dir_item(struct btrfs_trans_handle *trans,
const struct fscrypt_str *name, struct btrfs_inode *dir,
struct btrfs_key *location, u8 type, u64 index);
struct btrfs_dir_item *btrfs_lookup_dir_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 dir,
const struct fscrypt_str *name, int mod);
struct btrfs_dir_item *btrfs_lookup_dir_index_item(
struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 dir,
u64 index, const struct fscrypt_str *name, int mod);
struct btrfs_dir_item *btrfs_search_dir_index_item(struct btrfs_root *root,
struct btrfs_path *path, u64 dirid,
const struct fscrypt_str *name);
int btrfs_delete_one_dir_name(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
struct btrfs_dir_item *di);
int btrfs_insert_xattr_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid,
const char *name, u16 name_len,
const void *data, u16 data_len);
struct btrfs_dir_item *btrfs_lookup_xattr(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 dir,
const char *name, u16 name_len,
int mod);
struct btrfs_dir_item *btrfs_match_dir_item_name(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
const char *name,
int name_len);
#endif

View File

@ -11,6 +11,7 @@
#include "block-group.h"
#include "discard.h"
#include "free-space-cache.h"
#include "fs.h"
/*
* This contains the logic to handle async discard.
@ -61,7 +62,7 @@
#define BTRFS_DISCARD_MAX_DELAY_MSEC (1000UL)
#define BTRFS_DISCARD_MAX_IOPS (10U)
/* Montonically decreasing minimum length filters after index 0 */
/* Monotonically decreasing minimum length filters after index 0 */
static int discard_minlen[BTRFS_NR_DISCARD_LISTS] = {
0,
BTRFS_ASYNC_DISCARD_MAX_FILTER,
@ -146,8 +147,9 @@ static bool remove_from_discard_list(struct btrfs_discard_ctl *discard_ctl,
return running;
}
/**
* find_next_block_group - find block_group that's up next for discarding
/*
* Find block_group that's up next for discarding.
*
* @discard_ctl: discard control
* @now: current time
*
@ -184,17 +186,17 @@ static struct btrfs_block_group *find_next_block_group(
return ret_block_group;
}
/**
* Wrap find_next_block_group()
/*
* Look up next block group and set it for use.
*
* @discard_ctl: discard control
* @discard_state: the discard_state of the block_group after state management
* @discard_index: the discard_index of the block_group after state management
* @now: time when discard was invoked, in ns
*
* This wraps find_next_block_group() and sets the block_group to be in use.
* discard_state's control flow is managed here. Variables related to
* discard_state are reset here as needed (eg discard_cursor). @discard_state
* Wrap find_next_block_group() and set the block_group to be in use.
* @discard_state's control flow is managed here. Variables related to
* @discard_state are reset here as needed (eg. @discard_cursor). @discard_state
* and @discard_index are remembered as it may change while we're discarding,
* but we want the discard to execute in the context determined here.
*/
@ -233,8 +235,9 @@ again:
return block_group;
}
/**
* btrfs_discard_check_filter - updates a block groups filters
/*
* Update a block group's filters.
*
* @block_group: block group of interest
* @bytes: recently freed region size after coalescing
*
@ -271,8 +274,9 @@ void btrfs_discard_check_filter(struct btrfs_block_group *block_group,
}
}
/**
* btrfs_update_discard_index - moves a block group along the discard lists
/*
* Move a block group along the discard lists.
*
* @discard_ctl: discard control
* @block_group: block_group of interest
*
@ -291,13 +295,14 @@ static void btrfs_update_discard_index(struct btrfs_discard_ctl *discard_ctl,
add_to_discard_list(discard_ctl, block_group);
}
/**
* btrfs_discard_cancel_work - remove a block_group from the discard lists
/*
* Remove a block_group from the discard lists.
*
* @discard_ctl: discard control
* @block_group: block_group of interest
*
* This removes @block_group from the discard lists. If necessary, it waits on
* the current work and then reschedules the delayed work.
* Remove @block_group from the discard lists. If necessary, wait on the
* current work and then reschedule the delayed work.
*/
void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
struct btrfs_block_group *block_group)
@ -308,12 +313,13 @@ void btrfs_discard_cancel_work(struct btrfs_discard_ctl *discard_ctl,
}
}
/**
* btrfs_discard_queue_work - handles queuing the block_groups
/*
* Handles queuing the block_groups.
*
* @discard_ctl: discard control
* @block_group: block_group of interest
*
* This maintains the LRU order of the discard lists.
* Maintain the LRU order of the discard lists.
*/
void btrfs_discard_queue_work(struct btrfs_discard_ctl *discard_ctl,
struct btrfs_block_group *block_group)
@ -383,7 +389,8 @@ static void __btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
}
/*
* btrfs_discard_schedule_work - responsible for scheduling the discard work
* Responsible for scheduling the discard work.
*
* @discard_ctl: discard control
* @override: override the current timer
*
@ -401,15 +408,16 @@ void btrfs_discard_schedule_work(struct btrfs_discard_ctl *discard_ctl,
spin_unlock(&discard_ctl->lock);
}
/**
* btrfs_finish_discard_pass - determine next step of a block_group
/*
* Determine next step of a block_group.
*
* @discard_ctl: discard control
* @block_group: block_group of interest
*
* This determines the next step for a block group after it's finished going
* through a pass on a discard list. If it is unused and fully trimmed, we can
* mark it unused and send it to the unused_bgs path. Otherwise, pass it onto
* the appropriate filter list or let it fall off.
* Determine the next step for a block group after it's finished going through
* a pass on a discard list. If it is unused and fully trimmed, we can mark it
* unused and send it to the unused_bgs path. Otherwise, pass it onto the
* appropriate filter list or let it fall off.
*/
static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl,
struct btrfs_block_group *block_group)
@ -426,12 +434,13 @@ static void btrfs_finish_discard_pass(struct btrfs_discard_ctl *discard_ctl,
}
}
/**
* btrfs_discard_workfn - discard work function
/*
* Discard work queue callback
*
* @work: work
*
* This finds the next block_group to start discarding and then discards a
* single region. It does this in a two-pass fashion: first extents and second
* Find the next block_group to start discarding and then discard a single
* region. It does this in a two-pass fashion: first extents and second
* bitmaps. Completely discarded block groups are sent to the unused_bgs path.
*/
static void btrfs_discard_workfn(struct work_struct *work)
@ -507,11 +516,12 @@ static void btrfs_discard_workfn(struct work_struct *work)
spin_unlock(&discard_ctl->lock);
}
/**
* btrfs_run_discard_work - determines if async discard should be running
/*
* Determine if async discard should be running.
*
* @discard_ctl: discard control
*
* Checks if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
* Check if the file system is writeable and BTRFS_FS_DISCARD_RUNNING is set.
*/
bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl)
{
@ -523,8 +533,9 @@ bool btrfs_run_discard_work(struct btrfs_discard_ctl *discard_ctl)
test_bit(BTRFS_FS_DISCARD_RUNNING, &fs_info->flags));
}
/**
* btrfs_discard_calc_delay - recalculate the base delay
/*
* Recalculate the base delay.
*
* @discard_ctl: discard control
*
* Recalculate the base delay which is based off the total number of
@ -545,7 +556,7 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
spin_lock(&discard_ctl->lock);
/*
* The following is to fix a potential -1 discrepenancy that we're not
* The following is to fix a potential -1 discrepancy that we're not
* sure how to reproduce. But given that this is the only place that
* utilizes these numbers and this is only called by from
* btrfs_finish_extent_commit() which is synchronized, we can correct
@ -578,13 +589,14 @@ void btrfs_discard_calc_delay(struct btrfs_discard_ctl *discard_ctl)
spin_unlock(&discard_ctl->lock);
}
/**
* btrfs_discard_update_discardable - propagate discard counters
/*
* Propagate discard counters.
*
* @block_group: block_group of interest
*
* This propagates deltas of counters up to the discard_ctl. It maintains a
* current counter and a previous counter passing the delta up to the global
* stat. Then the current counter value becomes the previous counter value.
* Propagate deltas of counters up to the discard_ctl. It maintains a current
* counter and a previous counter passing the delta up to the global stat.
* Then the current counter value becomes the previous counter value.
*/
void btrfs_discard_update_discardable(struct btrfs_block_group *block_group)
{
@ -619,8 +631,9 @@ void btrfs_discard_update_discardable(struct btrfs_block_group *block_group)
}
}
/**
* btrfs_discard_punt_unused_bgs_list - punt unused_bgs list to discard lists
/*
* Punt unused_bgs list to discard lists.
*
* @fs_info: fs_info of interest
*
* The unused_bgs list needs to be punted to the discard lists because the
@ -644,8 +657,9 @@ void btrfs_discard_punt_unused_bgs_list(struct btrfs_fs_info *fs_info)
spin_unlock(&fs_info->unused_bgs_lock);
}
/**
* btrfs_discard_purge_list - purge discard lists
/*
* Purge discard lists.
*
* @discard_ctl: discard control
*
* If we are disabling async discard, we may have intercepted block groups that

View File

@ -23,7 +23,7 @@
#include "disk-io.h"
#include "transaction.h"
#include "btrfs_inode.h"
#include "volumes.h"
#include "bio.h"
#include "print-tree.h"
#include "locking.h"
#include "tree-log.h"
@ -43,6 +43,15 @@
#include "space-info.h"
#include "zoned.h"
#include "subpage.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "defrag.h"
#include "uuid-tree.h"
#include "relocation.h"
#include "scrub.h"
#include "super.h"
#define BTRFS_SUPER_FLAG_SUPP (BTRFS_HEADER_FLAG_WRITTEN |\
BTRFS_HEADER_FLAG_RELOC |\
@ -75,12 +84,12 @@ static void btrfs_free_csum_hash(struct btrfs_fs_info *fs_info)
* just before they are sent down the IO stack.
*/
struct async_submit_bio {
struct inode *inode;
struct btrfs_inode *inode;
struct bio *bio;
extent_submit_bio_start_t *submit_bio_start;
enum btrfs_wq_submit_cmd submit_cmd;
int mirror_num;
/* Optional parameter for submit_bio_start used by direct io */
/* Optional parameter for used by direct io */
u64 dio_file_offset;
struct btrfs_work work;
blk_status_t status;
@ -246,40 +255,54 @@ int btrfs_verify_level_key(struct extent_buffer *eb, int level,
return ret;
}
static int btrfs_repair_eb_io_failure(const struct extent_buffer *eb,
int mirror_num)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
u64 start = eb->start;
int i, num_pages = num_extent_pages(eb);
int ret = 0;
if (sb_rdonly(fs_info->sb))
return -EROFS;
for (i = 0; i < num_pages; i++) {
struct page *p = eb->pages[i];
ret = btrfs_repair_io_failure(fs_info, 0, start, PAGE_SIZE,
start, p, start - page_offset(p), mirror_num);
if (ret)
break;
start += PAGE_SIZE;
}
return ret;
}
/*
* helper to read a given tree block, doing retries as required when
* the checksums don't match and we have alternate mirrors to try.
*
* @parent_transid: expected transid, skip check if 0
* @level: expected level, mandatory check
* @first_key: expected key of first slot, skip check if NULL
* @check: expected tree parentness check, see the comments of the
* structure for details.
*/
int btrfs_read_extent_buffer(struct extent_buffer *eb,
u64 parent_transid, int level,
struct btrfs_key *first_key)
struct btrfs_tree_parent_check *check)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
struct extent_io_tree *io_tree;
int failed = 0;
int ret;
int num_copies = 0;
int mirror_num = 0;
int failed_mirror = 0;
io_tree = &BTRFS_I(fs_info->btree_inode)->io_tree;
ASSERT(check);
while (1) {
clear_bit(EXTENT_BUFFER_CORRUPT, &eb->bflags);
ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num);
if (!ret) {
if (verify_parent_transid(io_tree, eb,
parent_transid, 0))
ret = -EIO;
else if (btrfs_verify_level_key(eb, level,
first_key, parent_transid))
ret = -EUCLEAN;
else
ret = read_extent_buffer_pages(eb, WAIT_COMPLETE, mirror_num, check);
if (!ret)
break;
}
num_copies = btrfs_num_copies(fs_info,
eb->start, eb->len);
@ -455,7 +478,8 @@ static int check_tree_block_fsid(struct extent_buffer *eb)
}
/* Do basic extent buffer checks at read time */
static int validate_extent_buffer(struct extent_buffer *eb)
static int validate_extent_buffer(struct extent_buffer *eb,
struct btrfs_tree_parent_check *check)
{
struct btrfs_fs_info *fs_info = eb->fs_info;
u64 found_start;
@ -465,6 +489,8 @@ static int validate_extent_buffer(struct extent_buffer *eb)
const u8 *header_csum;
int ret = 0;
ASSERT(check);
found_start = btrfs_header_bytenr(eb);
if (found_start != eb->start) {
btrfs_err_rl(fs_info,
@ -503,6 +529,45 @@ static int validate_extent_buffer(struct extent_buffer *eb)
goto out;
}
if (found_level != check->level) {
ret = -EIO;
goto out;
}
if (unlikely(check->transid &&
btrfs_header_generation(eb) != check->transid)) {
btrfs_err_rl(eb->fs_info,
"parent transid verify failed on logical %llu mirror %u wanted %llu found %llu",
eb->start, eb->read_mirror, check->transid,
btrfs_header_generation(eb));
ret = -EIO;
goto out;
}
if (check->has_first_key) {
struct btrfs_key *expect_key = &check->first_key;
struct btrfs_key found_key;
if (found_level)
btrfs_node_key_to_cpu(eb, &found_key, 0);
else
btrfs_item_key_to_cpu(eb, &found_key, 0);
if (unlikely(btrfs_comp_cpu_keys(expect_key, &found_key))) {
btrfs_err(fs_info,
"tree first key mismatch detected, bytenr=%llu parent_transid=%llu key expected=(%llu,%u,%llu) has=(%llu,%u,%llu)",
eb->start, check->transid,
expect_key->objectid,
expect_key->type, expect_key->offset,
found_key.objectid, found_key.type,
found_key.offset);
ret = -EUCLEAN;
goto out;
}
}
if (check->owner_root) {
ret = btrfs_check_eb_owner(eb, check->owner_root);
if (ret < 0)
goto out;
}
/*
* If this is a leaf block and it is corrupt, set the corrupt bit so
* that we don't try and read the other copies of this block, just
@ -527,13 +592,15 @@ out:
}
static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
int mirror)
int mirror, struct btrfs_tree_parent_check *check)
{
struct btrfs_fs_info *fs_info = btrfs_sb(page->mapping->host->i_sb);
struct extent_buffer *eb;
bool reads_done;
int ret = 0;
ASSERT(check);
/*
* We don't allow bio merge for subpage metadata read, so we should
* only get one eb for each endio hook.
@ -557,7 +624,7 @@ static int validate_subpage_buffer(struct page *page, u64 start, u64 end,
ret = -EIO;
goto err;
}
ret = validate_extent_buffer(eb);
ret = validate_extent_buffer(eb, check);
if (ret < 0)
goto err;
@ -587,7 +654,8 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
ASSERT(page->private);
if (btrfs_sb(page->mapping->host->i_sb)->nodesize < PAGE_SIZE)
return validate_subpage_buffer(page, start, end, mirror);
return validate_subpage_buffer(page, start, end, mirror,
&bbio->parent_check);
eb = (struct extent_buffer *)page->private;
@ -606,7 +674,7 @@ int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
ret = -EIO;
goto err;
}
ret = validate_extent_buffer(eb);
ret = validate_extent_buffer(eb, &bbio->parent_check);
err:
if (ret) {
/*
@ -628,8 +696,18 @@ static void run_one_async_start(struct btrfs_work *work)
blk_status_t ret;
async = container_of(work, struct async_submit_bio, work);
ret = async->submit_bio_start(async->inode, async->bio,
async->dio_file_offset);
switch (async->submit_cmd) {
case WQ_SUBMIT_METADATA:
ret = btree_submit_bio_start(async->bio);
break;
case WQ_SUBMIT_DATA:
ret = btrfs_submit_bio_start(async->inode, async->bio);
break;
case WQ_SUBMIT_DATA_DIO:
ret = btrfs_submit_bio_start_direct_io(async->inode,
async->bio, async->dio_file_offset);
break;
}
if (ret)
async->status = ret;
}
@ -646,7 +724,7 @@ static void run_one_async_done(struct btrfs_work *work)
{
struct async_submit_bio *async =
container_of(work, struct async_submit_bio, work);
struct inode *inode = async->inode;
struct btrfs_inode *inode = async->inode;
struct btrfs_bio *bbio = btrfs_bio(async->bio);
/* If an error occurred we just want to clean up the bio and move on */
@ -661,7 +739,7 @@ static void run_one_async_done(struct btrfs_work *work)
* This changes nothing when cgroups aren't in use.
*/
async->bio->bi_opf |= REQ_CGROUP_PUNT;
btrfs_submit_bio(btrfs_sb(inode->i_sb), async->bio, async->mirror_num);
btrfs_submit_bio(inode->root->fs_info, async->bio, async->mirror_num);
}
static void run_one_async_free(struct btrfs_work *work)
@ -679,11 +757,10 @@ static void run_one_async_free(struct btrfs_work *work)
* - true if the work has been succesfuly submitted
* - false in case of error
*/
bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num,
u64 dio_file_offset,
extent_submit_bio_start_t *submit_bio_start)
bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num,
u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd)
{
struct btrfs_fs_info *fs_info = BTRFS_I(inode)->root->fs_info;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct async_submit_bio *async;
async = kmalloc(sizeof(*async), GFP_NOFS);
@ -693,7 +770,7 @@ bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num,
async->inode = inode;
async->bio = bio;
async->mirror_num = mirror_num;
async->submit_bio_start = submit_bio_start;
async->submit_cmd = cmd;
btrfs_init_work(&async->work, run_one_async_start, run_one_async_done,
run_one_async_free);
@ -727,8 +804,7 @@ static blk_status_t btree_csum_one_bio(struct bio *bio)
return errno_to_blk_status(ret);
}
static blk_status_t btree_submit_bio_start(struct inode *inode, struct bio *bio,
u64 dio_file_offset)
blk_status_t btree_submit_bio_start(struct bio *bio)
{
/*
* when we're called for a write, we're already in the async
@ -749,13 +825,14 @@ static bool should_async_write(struct btrfs_fs_info *fs_info,
return true;
}
void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num)
void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num)
{
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_bio *bbio = btrfs_bio(bio);
blk_status_t ret;
bio->bi_opf |= REQ_META;
bbio->is_metadata = 1;
if (btrfs_op(bio) != BTRFS_MAP_WRITE) {
btrfs_submit_bio(fs_info, bio, mirror_num);
@ -766,8 +843,8 @@ void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_
* Kthread helpers are used to submit writes so that checksumming can
* happen in parallel across all CPUs.
*/
if (should_async_write(fs_info, BTRFS_I(inode)) &&
btrfs_wq_submit_bio(inode, bio, mirror_num, 0, btree_submit_bio_start))
if (should_async_write(fs_info, inode) &&
btrfs_wq_submit_bio(inode, bio, mirror_num, 0, WQ_SUBMIT_METADATA))
return;
ret = btree_csum_one_bio(bio);
@ -919,28 +996,28 @@ struct extent_buffer *btrfs_find_create_tree_block(
* Read tree block at logical address @bytenr and do variant basic but critical
* verification.
*
* @owner_root: the objectid of the root owner for this block.
* @parent_transid: expected transid of this tree block, skip check if 0
* @level: expected level, mandatory check
* @first_key: expected key in slot 0, skip check if NULL
* @check: expected tree parentness check, see comments of the
* structure for details.
*/
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
u64 owner_root, u64 parent_transid,
int level, struct btrfs_key *first_key)
struct btrfs_tree_parent_check *check)
{
struct extent_buffer *buf = NULL;
int ret;
buf = btrfs_find_create_tree_block(fs_info, bytenr, owner_root, level);
ASSERT(check);
buf = btrfs_find_create_tree_block(fs_info, bytenr, check->owner_root,
check->level);
if (IS_ERR(buf))
return buf;
ret = btrfs_read_extent_buffer(buf, parent_transid, level, first_key);
ret = btrfs_read_extent_buffer(buf, check);
if (ret) {
free_extent_buffer_stale(buf);
return ERR_PTR(ret);
}
if (btrfs_check_eb_owner(buf, owner_root)) {
if (btrfs_check_eb_owner(buf, check->owner_root)) {
free_extent_buffer_stale(buf);
return ERR_PTR(-EUCLEAN);
}
@ -1027,9 +1104,9 @@ static void __setup_root(struct btrfs_root *root, struct btrfs_fs_info *fs_info,
root->anon_dev = 0;
if (!dummy) {
extent_io_tree_init(fs_info, &root->dirty_log_pages,
IO_TREE_ROOT_DIRTY_LOG_PAGES, NULL);
IO_TREE_ROOT_DIRTY_LOG_PAGES);
extent_io_tree_init(fs_info, &root->log_csum_range,
IO_TREE_LOG_CSUM_RANGE, NULL);
IO_TREE_LOG_CSUM_RANGE);
}
spin_lock_init(&root->root_item_lock);
@ -1167,6 +1244,13 @@ struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr)
return btrfs_global_root(fs_info, &key);
}
struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
{
if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE))
return fs_info->block_group_root;
return btrfs_extent_root(fs_info, 0);
}
struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
u64 objectid)
{
@ -1197,7 +1281,7 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
if (IS_ERR(leaf)) {
ret = PTR_ERR(leaf);
leaf = NULL;
goto fail_unlock;
goto fail;
}
root->node = leaf;
@ -1232,9 +1316,6 @@ struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
return root;
fail_unlock:
if (leaf)
btrfs_tree_unlock(leaf);
fail:
btrfs_put_root(root);
@ -1352,6 +1433,7 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
struct btrfs_key *key)
{
struct btrfs_root *root;
struct btrfs_tree_parent_check check = { 0 };
struct btrfs_fs_info *fs_info = tree_root->fs_info;
u64 generation;
int ret;
@ -1371,9 +1453,11 @@ static struct btrfs_root *read_tree_root_path(struct btrfs_root *tree_root,
generation = btrfs_root_generation(&root->root_item);
level = btrfs_root_level(&root->root_item);
root->node = read_tree_block(fs_info,
btrfs_root_bytenr(&root->root_item),
key->objectid, generation, level, NULL);
check.level = level;
check.transid = generation;
check.owner_root = key->objectid;
root->node = read_tree_block(fs_info, btrfs_root_bytenr(&root->root_item),
&check);
if (IS_ERR(root->node)) {
ret = PTR_ERR(root->node);
root->node = NULL;
@ -2084,8 +2168,6 @@ static void btrfs_stop_all_workers(struct btrfs_fs_info *fs_info)
btrfs_destroy_workqueue(fs_info->workers);
if (fs_info->endio_workers)
destroy_workqueue(fs_info->endio_workers);
if (fs_info->endio_raid56_workers)
destroy_workqueue(fs_info->endio_raid56_workers);
if (fs_info->rmw_workers)
destroy_workqueue(fs_info->rmw_workers);
if (fs_info->compressed_write_workers)
@ -2231,7 +2313,7 @@ static void btrfs_init_btree_inode(struct btrfs_fs_info *fs_info)
RB_CLEAR_NODE(&BTRFS_I(inode)->rb_node);
extent_io_tree_init(fs_info, &BTRFS_I(inode)->io_tree,
IO_TREE_BTREE_INODE_IO, NULL);
IO_TREE_BTREE_INODE_IO);
extent_map_tree_init(&BTRFS_I(inode)->extent_tree);
BTRFS_I(inode)->root = btrfs_grab_root(fs_info->tree_root);
@ -2291,8 +2373,6 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
alloc_workqueue("btrfs-endio", flags, max_active);
fs_info->endio_meta_workers =
alloc_workqueue("btrfs-endio-meta", flags, max_active);
fs_info->endio_raid56_workers =
alloc_workqueue("btrfs-endio-raid56", flags, max_active);
fs_info->rmw_workers = alloc_workqueue("btrfs-rmw", flags, max_active);
fs_info->endio_write_workers =
btrfs_alloc_workqueue(fs_info, "endio-write", flags,
@ -2314,7 +2394,7 @@ static int btrfs_init_workqueues(struct btrfs_fs_info *fs_info)
fs_info->delalloc_workers && fs_info->flush_workers &&
fs_info->endio_workers && fs_info->endio_meta_workers &&
fs_info->compressed_write_workers &&
fs_info->endio_write_workers && fs_info->endio_raid56_workers &&
fs_info->endio_write_workers &&
fs_info->endio_freespace_worker && fs_info->rmw_workers &&
fs_info->caching_workers && fs_info->fixup_workers &&
fs_info->delayed_workers && fs_info->qgroup_rescan_workers &&
@ -2350,6 +2430,7 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
struct btrfs_fs_devices *fs_devices)
{
int ret;
struct btrfs_tree_parent_check check = { 0 };
struct btrfs_root *log_tree_root;
struct btrfs_super_block *disk_super = fs_info->super_copy;
u64 bytenr = btrfs_super_log_root(disk_super);
@ -2365,10 +2446,10 @@ static int btrfs_replay_log(struct btrfs_fs_info *fs_info,
if (!log_tree_root)
return -ENOMEM;
log_tree_root->node = read_tree_block(fs_info, bytenr,
BTRFS_TREE_LOG_OBJECTID,
fs_info->generation + 1, level,
NULL);
check.level = level;
check.transid = fs_info->generation + 1;
check.owner_root = BTRFS_TREE_LOG_OBJECTID;
log_tree_root->node = read_tree_block(fs_info, bytenr, &check);
if (IS_ERR(log_tree_root->node)) {
btrfs_warn(fs_info, "failed to read log tree");
ret = PTR_ERR(log_tree_root->node);
@ -2846,10 +2927,14 @@ out:
static int load_super_root(struct btrfs_root *root, u64 bytenr, u64 gen, int level)
{
struct btrfs_tree_parent_check check = {
.level = level,
.transid = gen,
.owner_root = root->root_key.objectid
};
int ret = 0;
root->node = read_tree_block(root->fs_info, bytenr,
root->root_key.objectid, gen, level, NULL);
root->node = read_tree_block(root->fs_info, bytenr, &check);
if (IS_ERR(root->node)) {
ret = PTR_ERR(root->node);
root->node = NULL;
@ -3057,7 +3142,7 @@ void btrfs_init_fs_info(struct btrfs_fs_info *fs_info)
fs_info->block_group_cache_tree = RB_ROOT_CACHED;
extent_io_tree_init(fs_info, &fs_info->excluded_extents,
IO_TREE_FS_EXCLUDED_EXTENTS, NULL);
IO_TREE_FS_EXCLUDED_EXTENTS);
mutex_init(&fs_info->ordered_operations_mutex);
mutex_init(&fs_info->tree_log_mutex);
@ -3743,10 +3828,18 @@ int __cold open_ctree(struct super_block *sb, struct btrfs_fs_devices *fs_device
}
/*
* Mount does not set all options immediately, we can do it now and do
* not have to wait for transaction commit
* For devices supporting discard turn on discard=async automatically,
* unless it's already set or disabled. This could be turned off by
* nodiscard for the same mount.
*/
btrfs_apply_pending_changes(fs_info);
if (!(btrfs_test_opt(fs_info, DISCARD_SYNC) ||
btrfs_test_opt(fs_info, DISCARD_ASYNC) ||
btrfs_test_opt(fs_info, NODISCARD)) &&
fs_info->fs_devices->discardable) {
btrfs_set_and_info(fs_info, DISCARD_ASYNC,
"auto enabling async discard");
btrfs_clear_opt(fs_info->mount_opt, NODISCARD);
}
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
if (btrfs_test_opt(fs_info, CHECK_INTEGRITY)) {
@ -3875,7 +3968,7 @@ static void btrfs_end_super_write(struct bio *bio)
if (bio->bi_status) {
btrfs_warn_rl_in_rcu(device->fs_info,
"lost page write due to IO error on %s (%d)",
rcu_str_deref(device->name),
btrfs_dev_name(device),
blk_status_to_errno(bio->bi_status));
ClearPageUptodate(page);
SetPageError(page);

View File

@ -27,14 +27,14 @@ static inline u64 btrfs_sb_offset(int mirror)
struct btrfs_device;
struct btrfs_fs_devices;
struct btrfs_tree_parent_check;
void btrfs_check_leaked_roots(struct btrfs_fs_info *fs_info);
void btrfs_init_fs_info(struct btrfs_fs_info *fs_info);
int btrfs_verify_level_key(struct extent_buffer *eb, int level,
struct btrfs_key *first_key, u64 parent_transid);
struct extent_buffer *read_tree_block(struct btrfs_fs_info *fs_info, u64 bytenr,
u64 owner_root, u64 parent_transid,
int level, struct btrfs_key *first_key);
struct btrfs_tree_parent_check *check);
struct extent_buffer *btrfs_find_create_tree_block(
struct btrfs_fs_info *fs_info,
u64 bytenr, u64 owner_root,
@ -75,6 +75,7 @@ struct btrfs_root *btrfs_global_root(struct btrfs_fs_info *fs_info,
struct btrfs_key *key);
struct btrfs_root *btrfs_csum_root(struct btrfs_fs_info *fs_info, u64 bytenr);
struct btrfs_root *btrfs_extent_root(struct btrfs_fs_info *fs_info, u64 bytenr);
struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info);
void btrfs_free_fs_info(struct btrfs_fs_info *fs_info);
int btrfs_cleanup_fs_roots(struct btrfs_fs_info *fs_info);
@ -85,7 +86,7 @@ void btrfs_drop_and_free_fs_root(struct btrfs_fs_info *fs_info,
int btrfs_validate_metadata_buffer(struct btrfs_bio *bbio,
struct page *page, u64 start, u64 end,
int mirror);
void btrfs_submit_metadata_bio(struct inode *inode, struct bio *bio, int mirror_num);
void btrfs_submit_metadata_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num);
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
struct btrfs_root *btrfs_alloc_dummy_root(struct btrfs_fs_info *fs_info);
#endif
@ -106,24 +107,22 @@ static inline struct btrfs_root *btrfs_grab_root(struct btrfs_root *root)
return NULL;
}
static inline struct btrfs_root *btrfs_block_group_root(struct btrfs_fs_info *fs_info)
{
if (btrfs_fs_compat_ro(fs_info, BLOCK_GROUP_TREE))
return fs_info->block_group_root;
return btrfs_extent_root(fs_info, 0);
}
void btrfs_put_root(struct btrfs_root *root);
void btrfs_mark_buffer_dirty(struct extent_buffer *buf);
int btrfs_buffer_uptodate(struct extent_buffer *buf, u64 parent_transid,
int atomic);
int btrfs_read_extent_buffer(struct extent_buffer *buf, u64 parent_transid,
int level, struct btrfs_key *first_key);
bool btrfs_wq_submit_bio(struct inode *inode, struct bio *bio, int mirror_num,
u64 dio_file_offset,
extent_submit_bio_start_t *submit_bio_start);
blk_status_t btrfs_submit_bio_done(void *private_data, struct bio *bio,
int mirror_num);
int btrfs_read_extent_buffer(struct extent_buffer *buf,
struct btrfs_tree_parent_check *check);
enum btrfs_wq_submit_cmd {
WQ_SUBMIT_METADATA,
WQ_SUBMIT_DATA,
WQ_SUBMIT_DATA_DIO,
};
bool btrfs_wq_submit_bio(struct btrfs_inode *inode, struct bio *bio, int mirror_num,
u64 dio_file_offset, enum btrfs_wq_submit_cmd cmd);
blk_status_t btree_submit_bio_start(struct bio *bio);
int btrfs_alloc_log_tree_node(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_init_log_root_tree(struct btrfs_trans_handle *trans,
@ -136,8 +135,6 @@ void btrfs_cleanup_one_transaction(struct btrfs_transaction *trans,
struct btrfs_fs_info *fs_info);
struct btrfs_root *btrfs_create_tree(struct btrfs_trans_handle *trans,
u64 objectid);
int btree_lock_page_hook(struct page *page, void *data,
void (*flush_fn)(void *));
int btrfs_get_num_tolerated_disk_barrier_failures(u64 flags);
int btrfs_get_free_objectid(struct btrfs_root *root, u64 *objectid);
int btrfs_init_root_free_objectid(struct btrfs_root *root);

View File

@ -7,6 +7,8 @@
#include "btrfs_inode.h"
#include "print-tree.h"
#include "export.h"
#include "accessors.h"
#include "super.h"
#define BTRFS_FID_SIZE_NON_CONNECTABLE (offsetof(struct btrfs_fid, \
parent_objectid) / 4)
@ -57,9 +59,20 @@ static int btrfs_encode_fh(struct inode *inode, u32 *fh, int *max_len,
return type;
}
/*
* Read dentry of inode with @objectid from filesystem root @root_objectid.
*
* @sb: the filesystem super block
* @objectid: inode objectid
* @root_objectid: object id of the subvolume root where to look up the inode
* @generation: optional, if not zero, verify that the found inode
* generation matches
*
* Return dentry alias for the inode, otherwise an error. In case the
* generation does not match return ESTALE.
*/
struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
u64 root_objectid, u64 generation,
int check_generation)
u64 root_objectid, u64 generation)
{
struct btrfs_fs_info *fs_info = btrfs_sb(sb);
struct btrfs_root *root;
@ -77,7 +90,7 @@ struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
if (IS_ERR(inode))
return ERR_CAST(inode);
if (check_generation && generation != inode->i_generation) {
if (generation != 0 && generation != inode->i_generation) {
iput(inode);
return ERR_PTR(-ESTALE);
}
@ -106,7 +119,7 @@ static struct dentry *btrfs_fh_to_parent(struct super_block *sb, struct fid *fh,
objectid = fid->parent_objectid;
generation = fid->parent_gen;
return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
return btrfs_get_dentry(sb, objectid, root_objectid, generation);
}
static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
@ -128,7 +141,7 @@ static struct dentry *btrfs_fh_to_dentry(struct super_block *sb, struct fid *fh,
root_objectid = fid->root_objectid;
generation = fid->gen;
return btrfs_get_dentry(sb, objectid, root_objectid, generation, 1);
return btrfs_get_dentry(sb, objectid, root_objectid, generation);
}
struct dentry *btrfs_get_parent(struct dentry *child)
@ -188,7 +201,7 @@ struct dentry *btrfs_get_parent(struct dentry *child)
if (found_key.type == BTRFS_ROOT_BACKREF_KEY) {
return btrfs_get_dentry(fs_info->sb, key.objectid,
found_key.offset, 0, 0);
found_key.offset, 0);
}
return d_obtain_alias(btrfs_iget(fs_info->sb, key.objectid, root));

View File

@ -19,8 +19,7 @@ struct btrfs_fid {
} __attribute__ ((packed));
struct dentry *btrfs_get_dentry(struct super_block *sb, u64 objectid,
u64 root_objectid, u64 generation,
int check_generation);
u64 root_objectid, u64 generation);
struct dentry *btrfs_get_parent(struct dentry *child);
#endif

View File

@ -2,6 +2,7 @@
#include <linux/slab.h>
#include <trace/events/btrfs.h>
#include "messages.h"
#include "ctree.h"
#include "extent-io-tree.h"
#include "btrfs_inode.h"
@ -57,17 +58,17 @@ static inline void __btrfs_debug_check_extent_io_range(const char *caller,
struct extent_io_tree *tree,
u64 start, u64 end)
{
struct inode *inode = tree->private_data;
struct btrfs_inode *inode = tree->inode;
u64 isize;
if (!inode)
return;
isize = i_size_read(inode);
isize = i_size_read(&inode->vfs_inode);
if (end >= PAGE_SIZE && (end % 2) == 0 && end != isize - 1) {
btrfs_debug_rl(BTRFS_I(inode)->root->fs_info,
btrfs_debug_rl(inode->root->fs_info,
"%s: ino %llu isize %llu odd range [%llu,%llu]",
caller, btrfs_ino(BTRFS_I(inode)), isize, start, end);
caller, btrfs_ino(inode), isize, start, end);
}
}
#else
@ -93,13 +94,12 @@ struct tree_entry {
};
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
struct extent_io_tree *tree, unsigned int owner,
void *private_data)
struct extent_io_tree *tree, unsigned int owner)
{
tree->fs_info = fs_info;
tree->state = RB_ROOT;
spin_lock_init(&tree->lock);
tree->private_data = private_data;
tree->inode = NULL;
tree->owner = owner;
if (owner == IO_TREE_INODE_FILE_EXTENT)
lockdep_set_class(&tree->lock, &file_extent_tree_class);
@ -346,9 +346,8 @@ static void merge_state(struct extent_io_tree *tree, struct extent_state *state)
other = prev_state(state);
if (other && other->end == state->start - 1 &&
other->state == state->state) {
if (tree->private_data)
btrfs_merge_delalloc_extent(tree->private_data,
state, other);
if (tree->inode)
btrfs_merge_delalloc_extent(tree->inode, state, other);
state->start = other->start;
rb_erase(&other->rb_node, &tree->state);
RB_CLEAR_NODE(&other->rb_node);
@ -357,9 +356,8 @@ static void merge_state(struct extent_io_tree *tree, struct extent_state *state)
other = next_state(state);
if (other && other->start == state->end + 1 &&
other->state == state->state) {
if (tree->private_data)
btrfs_merge_delalloc_extent(tree->private_data, state,
other);
if (tree->inode)
btrfs_merge_delalloc_extent(tree->inode, state, other);
state->end = other->end;
rb_erase(&other->rb_node, &tree->state);
RB_CLEAR_NODE(&other->rb_node);
@ -374,8 +372,8 @@ static void set_state_bits(struct extent_io_tree *tree,
u32 bits_to_set = bits & ~EXTENT_CTLBITS;
int ret;
if (tree->private_data)
btrfs_set_delalloc_extent(tree->private_data, state, bits);
if (tree->inode)
btrfs_set_delalloc_extent(tree->inode, state, bits);
ret = add_extent_changeset(state, bits_to_set, changeset, 1);
BUG_ON(ret < 0);
@ -397,7 +395,7 @@ static int insert_state(struct extent_io_tree *tree,
u32 bits, struct extent_changeset *changeset)
{
struct rb_node **node;
struct rb_node *parent;
struct rb_node *parent = NULL;
const u64 end = state->end;
set_state_bits(tree, state, bits, changeset);
@ -462,8 +460,8 @@ static int split_state(struct extent_io_tree *tree, struct extent_state *orig,
struct rb_node *parent = NULL;
struct rb_node **node;
if (tree->private_data)
btrfs_split_delalloc_extent(tree->private_data, orig, split);
if (tree->inode)
btrfs_split_delalloc_extent(tree->inode, orig, split);
prealloc->start = orig->start;
prealloc->end = split - 1;
@ -510,8 +508,8 @@ static struct extent_state *clear_state_bit(struct extent_io_tree *tree,
u32 bits_to_clear = bits & ~EXTENT_CTLBITS;
int ret;
if (tree->private_data)
btrfs_clear_delalloc_extent(tree->private_data, state, bits);
if (tree->inode)
btrfs_clear_delalloc_extent(tree->inode, state, bits);
ret = add_extent_changeset(state, bits_to_clear, changeset, 0);
BUG_ON(ret < 0);
@ -572,7 +570,7 @@ int __clear_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
if (bits & (EXTENT_LOCKED | EXTENT_BOUNDARY))
clear = 1;
again:
if (!prealloc && gfpflags_allow_blocking(mask)) {
if (!prealloc) {
/*
* Don't care for allocation failure here because we might end
* up not needing the pre-allocated extent state at all, which
@ -636,7 +634,8 @@ hit_next:
if (state->start < start) {
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
if (!prealloc)
goto search_again;
err = split_state(tree, state, prealloc, start);
if (err)
extent_io_tree_panic(tree, err);
@ -657,7 +656,8 @@ hit_next:
*/
if (state->start <= end && state->end > end) {
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
if (!prealloc)
goto search_again;
err = split_state(tree, state, prealloc, end + 1);
if (err)
extent_io_tree_panic(tree, err);
@ -714,7 +714,8 @@ static void wait_on_state(struct extent_io_tree *tree,
* The range [start, end] is inclusive.
* The tree lock is taken by this function
*/
void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits)
void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
struct extent_state **cached_state)
{
struct extent_state *state;
@ -722,6 +723,16 @@ void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits)
spin_lock(&tree->lock);
again:
/*
* Maintain cached_state, as we may not remove it from the tree if there
* are more bits than the bits we're waiting on set on this state.
*/
if (cached_state && *cached_state) {
state = *cached_state;
if (extent_state_in_tree(state) &&
state->start <= start && start < state->end)
goto process_node;
}
while (1) {
/*
* This search will find all the extents that end after our
@ -752,6 +763,12 @@ process_node:
}
}
out:
/* This state is no longer useful, clear it and free it up. */
if (cached_state && *cached_state) {
state = *cached_state;
*cached_state = NULL;
free_extent_state(state);
}
spin_unlock(&tree->lock);
}
@ -939,13 +956,17 @@ out:
* sleeping, so the gfp mask is used to indicate what is allowed.
*
* If any of the exclusive bits are set, this will fail with -EEXIST if some
* part of the range already has the desired bits set. The start of the
* existing range is returned in failed_start in this case.
* part of the range already has the desired bits set. The extent_state of the
* existing range is returned in failed_state in this case, and the start of the
* existing range is returned in failed_start. failed_state is used as an
* optimization for wait_extent_bit, failed_start must be used as the source of
* truth as failed_state may have changed since we returned.
*
* [start, end] is inclusive This takes the tree lock.
*/
static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, u64 *failed_start,
struct extent_state **failed_state,
struct extent_state **cached_state,
struct extent_changeset *changeset, gfp_t mask)
{
@ -964,9 +985,9 @@ static int __set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
if (exclusive_bits)
ASSERT(failed_start);
else
ASSERT(failed_start == NULL);
ASSERT(failed_start == NULL && failed_state == NULL);
again:
if (!prealloc && gfpflags_allow_blocking(mask)) {
if (!prealloc) {
/*
* Don't care for allocation failure here because we might end
* up not needing the pre-allocated extent state at all, which
@ -991,7 +1012,8 @@ again:
state = tree_search_for_insert(tree, start, &p, &parent);
if (!state) {
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
if (!prealloc)
goto search_again;
prealloc->start = start;
prealloc->end = end;
insert_state_fast(tree, prealloc, p, parent, bits, changeset);
@ -1012,6 +1034,7 @@ hit_next:
if (state->start == start && state->end <= end) {
if (state->state & exclusive_bits) {
*failed_start = state->start;
cache_state(state, failed_state);
err = -EEXIST;
goto out;
}
@ -1047,6 +1070,7 @@ hit_next:
if (state->start < start) {
if (state->state & exclusive_bits) {
*failed_start = start;
cache_state(state, failed_state);
err = -EEXIST;
goto out;
}
@ -1062,7 +1086,8 @@ hit_next:
}
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
if (!prealloc)
goto search_again;
err = split_state(tree, state, prealloc, start);
if (err)
extent_io_tree_panic(tree, err);
@ -1099,7 +1124,8 @@ hit_next:
this_end = last_start - 1;
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
if (!prealloc)
goto search_again;
/*
* Avoid to free 'prealloc' if it can be merged with the later
@ -1125,12 +1151,14 @@ hit_next:
if (state->start <= end && state->end > end) {
if (state->state & exclusive_bits) {
*failed_start = start;
cache_state(state, failed_state);
err = -EEXIST;
goto out;
}
prealloc = alloc_extent_state_atomic(prealloc);
BUG_ON(!prealloc);
if (!prealloc)
goto search_again;
err = split_state(tree, state, prealloc, end + 1);
if (err)
extent_io_tree_panic(tree, err);
@ -1162,8 +1190,8 @@ out:
int set_extent_bit(struct extent_io_tree *tree, u64 start, u64 end,
u32 bits, struct extent_state **cached_state, gfp_t mask)
{
return __set_extent_bit(tree, start, end, bits, NULL, cached_state,
NULL, mask);
return __set_extent_bit(tree, start, end, bits, NULL, NULL,
cached_state, NULL, mask);
}
/*
@ -1397,7 +1425,7 @@ void find_first_clear_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, u32 bits)
{
struct extent_state *state;
struct extent_state *prev = NULL, *next;
struct extent_state *prev = NULL, *next = NULL;
spin_lock(&tree->lock);
@ -1487,15 +1515,37 @@ out:
}
/*
* Count the number of bytes in the tree that have a given bit(s) set. This
* can be fairly slow, except for EXTENT_DIRTY which is cached. The total
* number found is returned.
* Count the number of bytes in the tree that have a given bit(s) set for a
* given range.
*
* @tree: The io tree to search.
* @start: The start offset of the range. This value is updated to the
* offset of the first byte found with the given bit(s), so it
* can end up being bigger than the initial value.
* @search_end: The end offset (inclusive value) of the search range.
* @max_bytes: The maximum byte count we are interested. The search stops
* once it reaches this count.
* @bits: The bits the range must have in order to be accounted for.
* If multiple bits are set, then only subranges that have all
* the bits set are accounted for.
* @contig: Indicate if we should ignore holes in the range or not. If
* this is true, then stop once we find a hole.
* @cached_state: A cached state to be used across multiple calls to this
* function in order to speedup searches. Use NULL if this is
* called only once or if each call does not start where the
* previous one ended.
*
* Returns the total number of bytes found within the given range that have
* all given bits set. If the returned number of bytes is greater than zero
* then @start is updated with the offset of the first byte with the bits set.
*/
u64 count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end, u64 max_bytes,
u32 bits, int contig)
u32 bits, int contig,
struct extent_state **cached_state)
{
struct extent_state *state;
struct extent_state *state = NULL;
struct extent_state *cached;
u64 cur_start = *start;
u64 total_bytes = 0;
u64 last = 0;
@ -1506,11 +1556,41 @@ u64 count_range_bits(struct extent_io_tree *tree,
spin_lock(&tree->lock);
if (!cached_state || !*cached_state)
goto search;
cached = *cached_state;
if (!extent_state_in_tree(cached))
goto search;
if (cached->start <= cur_start && cur_start <= cached->end) {
state = cached;
} else if (cached->start > cur_start) {
struct extent_state *prev;
/*
* The cached state starts after our search range's start. Check
* if the previous state record starts at or before the range we
* are looking for, and if so, use it - this is a common case
* when there are holes between records in the tree. If there is
* no previous state record, we can start from our cached state.
*/
prev = prev_state(cached);
if (!prev)
state = cached;
else if (prev->start <= cur_start && cur_start <= prev->end)
state = prev;
}
/*
* This search will find all the extents that end after our range
* starts.
*/
search:
if (!state)
state = tree_search(tree, cur_start);
while (state) {
if (state->start > search_end)
break;
@ -1531,7 +1611,16 @@ u64 count_range_bits(struct extent_io_tree *tree,
}
state = next_state(state);
}
if (cached_state) {
free_extent_state(*cached_state);
*cached_state = state;
if (state)
refcount_inc(&state->refs);
}
spin_unlock(&tree->lock);
return total_bytes;
}
@ -1598,8 +1687,8 @@ int set_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
*/
ASSERT(!(bits & EXTENT_LOCKED));
return __set_extent_bit(tree, start, end, bits, NULL, NULL, changeset,
GFP_NOFS);
return __set_extent_bit(tree, start, end, bits, NULL, NULL, NULL,
changeset, GFP_NOFS);
}
int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
@ -1615,17 +1704,18 @@ int clear_record_extent_bits(struct extent_io_tree *tree, u64 start, u64 end,
changeset);
}
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached)
{
int err;
u64 failed_start;
err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
NULL, NULL, GFP_NOFS);
NULL, cached, NULL, GFP_NOFS);
if (err == -EEXIST) {
if (failed_start > start)
clear_extent_bit(tree, start, failed_start - 1,
EXTENT_LOCKED, NULL);
EXTENT_LOCKED, cached);
return 0;
}
return 1;
@ -1638,20 +1728,22 @@ int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end)
int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached_state)
{
struct extent_state *failed_state = NULL;
int err;
u64 failed_start;
err = __set_extent_bit(tree, start, end, EXTENT_LOCKED, &failed_start,
cached_state, NULL, GFP_NOFS);
&failed_state, cached_state, NULL, GFP_NOFS);
while (err == -EEXIST) {
if (failed_start != start)
clear_extent_bit(tree, start, failed_start - 1,
EXTENT_LOCKED, cached_state);
wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED);
wait_extent_bit(tree, failed_start, end, EXTENT_LOCKED,
&failed_state);
err = __set_extent_bit(tree, start, end, EXTENT_LOCKED,
&failed_start, cached_state, NULL,
GFP_NOFS);
&failed_start, &failed_state,
cached_state, NULL, GFP_NOFS);
}
return err;
}

View File

@ -3,43 +3,48 @@
#ifndef BTRFS_EXTENT_IO_TREE_H
#define BTRFS_EXTENT_IO_TREE_H
#include "misc.h"
struct extent_changeset;
struct io_failure_record;
/* Bits for the extent state */
#define EXTENT_DIRTY (1U << 0)
#define EXTENT_UPTODATE (1U << 1)
#define EXTENT_LOCKED (1U << 2)
#define EXTENT_NEW (1U << 3)
#define EXTENT_DELALLOC (1U << 4)
#define EXTENT_DEFRAG (1U << 5)
#define EXTENT_BOUNDARY (1U << 6)
#define EXTENT_NODATASUM (1U << 7)
#define EXTENT_CLEAR_META_RESV (1U << 8)
#define EXTENT_NEED_WAIT (1U << 9)
#define EXTENT_NORESERVE (1U << 11)
#define EXTENT_QGROUP_RESERVED (1U << 12)
#define EXTENT_CLEAR_DATA_RESV (1U << 13)
/*
* Must be cleared only during ordered extent completion or on error paths if we
* did not manage to submit bios and create the ordered extents for the range.
* Should not be cleared during page release and page invalidation (if there is
* an ordered extent in flight), that is left for the ordered extent completion.
enum {
ENUM_BIT(EXTENT_DIRTY),
ENUM_BIT(EXTENT_UPTODATE),
ENUM_BIT(EXTENT_LOCKED),
ENUM_BIT(EXTENT_NEW),
ENUM_BIT(EXTENT_DELALLOC),
ENUM_BIT(EXTENT_DEFRAG),
ENUM_BIT(EXTENT_BOUNDARY),
ENUM_BIT(EXTENT_NODATASUM),
ENUM_BIT(EXTENT_CLEAR_META_RESV),
ENUM_BIT(EXTENT_NEED_WAIT),
ENUM_BIT(EXTENT_NORESERVE),
ENUM_BIT(EXTENT_QGROUP_RESERVED),
ENUM_BIT(EXTENT_CLEAR_DATA_RESV),
/*
* Must be cleared only during ordered extent completion or on error
* paths if we did not manage to submit bios and create the ordered
* extents for the range. Should not be cleared during page release
* and page invalidation (if there is an ordered extent in flight),
* that is left for the ordered extent completion.
*/
#define EXTENT_DELALLOC_NEW (1U << 14)
/*
* When an ordered extent successfully completes for a region marked as a new
* delalloc range, use this flag when clearing a new delalloc range to indicate
* that the VFS' inode number of bytes should be incremented and the inode's new
* delalloc bytes decremented, in an atomic way to prevent races with stat(2).
ENUM_BIT(EXTENT_DELALLOC_NEW),
/*
* When an ordered extent successfully completes for a region marked as
* a new delalloc range, use this flag when clearing a new delalloc
* range to indicate that the VFS' inode number of bytes should be
* incremented and the inode's new delalloc bytes decremented, in an
* atomic way to prevent races with stat(2).
*/
#define EXTENT_ADD_INODE_BYTES (1U << 15)
/*
* Set during truncate when we're clearing an entire range and we just want the
* extent states to go away.
ENUM_BIT(EXTENT_ADD_INODE_BYTES),
/*
* Set during truncate when we're clearing an entire range and we just
* want the extent states to go away.
*/
#define EXTENT_CLEAR_ALL_BITS (1U << 16)
ENUM_BIT(EXTENT_CLEAR_ALL_BITS),
};
#define EXTENT_DO_ACCOUNTING (EXTENT_CLEAR_META_RESV | \
EXTENT_CLEAR_DATA_RESV)
@ -75,7 +80,8 @@ enum {
struct extent_io_tree {
struct rb_root state;
struct btrfs_fs_info *fs_info;
void *private_data;
/* Inode associated with this tree, or NULL. */
struct btrfs_inode *inode;
/* Who owns this io tree, should be one of IO_TREE_* */
u8 owner;
@ -99,21 +105,22 @@ struct extent_state {
};
void extent_io_tree_init(struct btrfs_fs_info *fs_info,
struct extent_io_tree *tree, unsigned int owner,
void *private_data);
struct extent_io_tree *tree, unsigned int owner);
void extent_io_tree_release(struct extent_io_tree *tree);
int lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached);
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end);
int try_lock_extent(struct extent_io_tree *tree, u64 start, u64 end,
struct extent_state **cached);
int __init extent_state_init_cachep(void);
void __cold extent_state_free_cachep(void);
u64 count_range_bits(struct extent_io_tree *tree,
u64 *start, u64 search_end,
u64 max_bytes, u32 bits, int contig);
u64 max_bytes, u32 bits, int contig,
struct extent_state **cached_state);
void free_extent_state(struct extent_state *state);
int test_range_bit(struct extent_io_tree *tree, u64 start, u64 end,
@ -139,13 +146,6 @@ static inline int unlock_extent(struct extent_io_tree *tree, u64 start, u64 end,
GFP_NOFS, NULL);
}
static inline int unlock_extent_atomic(struct extent_io_tree *tree, u64 start,
u64 end, struct extent_state **cached)
{
return __clear_extent_bit(tree, start, end, EXTENT_LOCKED, cached,
GFP_ATOMIC, NULL);
}
static inline int clear_extent_bits(struct extent_io_tree *tree, u64 start,
u64 end, u32 bits)
{
@ -217,13 +217,6 @@ static inline int set_extent_new(struct extent_io_tree *tree, u64 start,
return set_extent_bit(tree, start, end, EXTENT_NEW, NULL, GFP_NOFS);
}
static inline int set_extent_uptodate(struct extent_io_tree *tree, u64 start,
u64 end, struct extent_state **cached_state, gfp_t mask)
{
return set_extent_bit(tree, start, end, EXTENT_UPTODATE,
cached_state, mask);
}
int find_first_extent_bit(struct extent_io_tree *tree, u64 start,
u64 *start_ret, u64 *end_ret, u32 bits,
struct extent_state **cached_state);
@ -234,6 +227,7 @@ int find_contiguous_extent_bit(struct extent_io_tree *tree, u64 start,
bool btrfs_find_delalloc_range(struct extent_io_tree *tree, u64 *start,
u64 *end, u64 max_bytes,
struct extent_state **cached_state);
void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits);
void wait_extent_bit(struct extent_io_tree *tree, u64 start, u64 end, u32 bits,
struct extent_state **cached_state);
#endif /* BTRFS_EXTENT_IO_TREE_H */

View File

@ -36,6 +36,13 @@
#include "rcu-string.h"
#include "zoned.h"
#include "dev-replace.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "file-item.h"
#include "orphan.h"
#include "tree-checker.h"
#undef SCRAMBLE_DELAYED_REFS
@ -5255,8 +5262,8 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
u64 bytenr;
u64 generation;
u64 parent;
struct btrfs_tree_parent_check check = { 0 };
struct btrfs_key key;
struct btrfs_key first_key;
struct btrfs_ref ref = { 0 };
struct extent_buffer *next;
int level = wc->level;
@ -5278,7 +5285,12 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
}
bytenr = btrfs_node_blockptr(path->nodes[level], path->slots[level]);
btrfs_node_key_to_cpu(path->nodes[level], &first_key,
check.level = level - 1;
check.transid = generation;
check.owner_root = root->root_key.objectid;
check.has_first_key = true;
btrfs_node_key_to_cpu(path->nodes[level], &check.first_key,
path->slots[level]);
next = find_extent_buffer(fs_info, bytenr);
@ -5340,8 +5352,7 @@ static noinline int do_walk_down(struct btrfs_trans_handle *trans,
if (!next) {
if (reada && level == 1)
reada_walk_down(trans, root, wc, path);
next = read_tree_block(fs_info, bytenr, root->root_key.objectid,
generation, level - 1, &first_key);
next = read_tree_block(fs_info, bytenr, &check);
if (IS_ERR(next)) {
return PTR_ERR(next);
} else if (!extent_buffer_uptodate(next)) {
@ -5973,40 +5984,6 @@ int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
return ret;
}
/*
* helper to account the unused space of all the readonly block group in the
* space_info. takes mirrors into account.
*/
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
{
struct btrfs_block_group *block_group;
u64 free_bytes = 0;
int factor;
/* It's df, we don't care if it's racy */
if (list_empty(&sinfo->ro_bgs))
return 0;
spin_lock(&sinfo->lock);
list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
spin_lock(&block_group->lock);
if (!block_group->ro) {
spin_unlock(&block_group->lock);
continue;
}
factor = btrfs_bg_type_to_factor(block_group->flags);
free_bytes += (block_group->length -
block_group->used) * factor;
spin_unlock(&block_group->lock);
}
spin_unlock(&sinfo->lock);
return free_bytes;
}
int btrfs_error_unpin_extent_range(struct btrfs_fs_info *fs_info,
u64 start, u64 end)
{
@ -6072,7 +6049,7 @@ static int btrfs_trim_free_extents(struct btrfs_device *device, u64 *trimmed)
btrfs_warn_in_rcu(fs_info,
"ignoring attempt to trim beyond device size: offset %llu length %llu device %s device size %llu",
start, end - start + 1,
rcu_str_deref(device->name),
btrfs_dev_name(device),
device->total_bytes);
mutex_unlock(&fs_info->chunk_mutex);
ret = 0;

78
fs/btrfs/extent-tree.h Normal file
View File

@ -0,0 +1,78 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BTRFS_EXTENT_TREE_H
#define BTRFS_EXTENT_TREE_H
enum btrfs_inline_ref_type {
BTRFS_REF_TYPE_INVALID,
BTRFS_REF_TYPE_BLOCK,
BTRFS_REF_TYPE_DATA,
BTRFS_REF_TYPE_ANY,
};
int btrfs_get_extent_inline_ref_type(const struct extent_buffer *eb,
struct btrfs_extent_inline_ref *iref,
enum btrfs_inline_ref_type is_data);
u64 hash_extent_data_ref(u64 root_objectid, u64 owner, u64 offset);
int btrfs_add_excluded_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 num_bytes);
void btrfs_free_excluded_extents(struct btrfs_block_group *cache);
int btrfs_run_delayed_refs(struct btrfs_trans_handle *trans, unsigned long count);
void btrfs_cleanup_ref_head_accounting(struct btrfs_fs_info *fs_info,
struct btrfs_delayed_ref_root *delayed_refs,
struct btrfs_delayed_ref_head *head);
int btrfs_lookup_data_extent(struct btrfs_fs_info *fs_info, u64 start, u64 len);
int btrfs_lookup_extent_info(struct btrfs_trans_handle *trans,
struct btrfs_fs_info *fs_info, u64 bytenr,
u64 offset, int metadata, u64 *refs, u64 *flags);
int btrfs_pin_extent(struct btrfs_trans_handle *trans, u64 bytenr, u64 num,
int reserved);
int btrfs_pin_extent_for_log_replay(struct btrfs_trans_handle *trans,
u64 bytenr, u64 num_bytes);
int btrfs_exclude_logged_extents(struct extent_buffer *eb);
int btrfs_cross_ref_exist(struct btrfs_root *root,
u64 objectid, u64 offset, u64 bytenr, bool strict,
struct btrfs_path *path);
struct extent_buffer *btrfs_alloc_tree_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
u64 parent, u64 root_objectid,
const struct btrfs_disk_key *key,
int level, u64 hint,
u64 empty_size,
enum btrfs_lock_nesting nest);
void btrfs_free_tree_block(struct btrfs_trans_handle *trans,
u64 root_id,
struct extent_buffer *buf,
u64 parent, int last_ref);
int btrfs_alloc_reserved_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 owner,
u64 offset, u64 ram_bytes,
struct btrfs_key *ins);
int btrfs_alloc_logged_file_extent(struct btrfs_trans_handle *trans,
u64 root_objectid, u64 owner, u64 offset,
struct btrfs_key *ins);
int btrfs_reserve_extent(struct btrfs_root *root, u64 ram_bytes, u64 num_bytes,
u64 min_alloc_size, u64 empty_size, u64 hint_byte,
struct btrfs_key *ins, int is_data, int delalloc);
int btrfs_inc_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf, int full_backref);
int btrfs_dec_ref(struct btrfs_trans_handle *trans, struct btrfs_root *root,
struct extent_buffer *buf, int full_backref);
int btrfs_set_disk_extent_flags(struct btrfs_trans_handle *trans,
struct extent_buffer *eb, u64 flags, int level);
int btrfs_free_extent(struct btrfs_trans_handle *trans, struct btrfs_ref *ref);
int btrfs_free_reserved_extent(struct btrfs_fs_info *fs_info,
u64 start, u64 len, int delalloc);
int btrfs_pin_reserved_extent(struct btrfs_trans_handle *trans, u64 start, u64 len);
int btrfs_finish_extent_commit(struct btrfs_trans_handle *trans);
int btrfs_inc_extent_ref(struct btrfs_trans_handle *trans, struct btrfs_ref *generic_ref);
int __must_check btrfs_drop_snapshot(struct btrfs_root *root, int update_ref,
int for_reloc);
int btrfs_drop_subtree(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct extent_buffer *node,
struct extent_buffer *parent);
#endif

File diff suppressed because it is too large Load Diff

View File

@ -9,6 +9,7 @@
#include <linux/btrfs_tree.h>
#include "compression.h"
#include "ulist.h"
#include "misc.h"
enum {
EXTENT_BUFFER_UPTODATE,
@ -29,13 +30,15 @@ enum {
};
/* these are flags for __process_pages_contig */
#define PAGE_UNLOCK (1 << 0)
/* Page starts writeback, clear dirty bit and set writeback bit */
#define PAGE_START_WRITEBACK (1 << 1)
#define PAGE_END_WRITEBACK (1 << 2)
#define PAGE_SET_ORDERED (1 << 3)
#define PAGE_SET_ERROR (1 << 4)
#define PAGE_LOCK (1 << 5)
enum {
ENUM_BIT(PAGE_UNLOCK),
/* Page starts writeback, clear dirty bit and set writeback bit */
ENUM_BIT(PAGE_START_WRITEBACK),
ENUM_BIT(PAGE_END_WRITEBACK),
ENUM_BIT(PAGE_SET_ORDERED),
ENUM_BIT(PAGE_SET_ERROR),
ENUM_BIT(PAGE_LOCK),
};
/*
* page->private values. Every page that is controlled by the extent
@ -63,17 +66,11 @@ struct btrfs_inode;
struct btrfs_fs_info;
struct io_failure_record;
struct extent_io_tree;
struct btrfs_tree_parent_check;
int __init extent_buffer_init_cachep(void);
void __cold extent_buffer_free_cachep(void);
typedef void (submit_bio_hook_t)(struct inode *inode, struct bio *bio,
int mirror_num,
enum btrfs_compression_type compress_type);
typedef blk_status_t (extent_submit_bio_start_t)(struct inode *inode,
struct bio *bio, u64 dio_file_offset);
#define INLINE_EXTENT_BUFFER_PAGES (BTRFS_MAX_METADATA_BLOCKSIZE / PAGE_SIZE)
struct extent_buffer {
u64 start;
@ -98,6 +95,39 @@ struct extent_buffer {
#endif
};
/*
* Get the correct offset inside the page of extent buffer.
*
* @eb: target extent buffer
* @start: offset inside the extent buffer
*
* Will handle both sectorsize == PAGE_SIZE and sectorsize < PAGE_SIZE cases.
*/
static inline size_t get_eb_offset_in_page(const struct extent_buffer *eb,
unsigned long offset)
{
/*
* For sectorsize == PAGE_SIZE case, eb->start will always be aligned
* to PAGE_SIZE, thus adding it won't cause any difference.
*
* For sectorsize < PAGE_SIZE, we must only read the data that belongs
* to the eb, thus we have to take the eb->start into consideration.
*/
return offset_in_page(offset + eb->start);
}
static inline unsigned long get_eb_page_index(unsigned long offset)
{
/*
* For sectorsize == PAGE_SIZE case, plain >> PAGE_SHIFT is enough.
*
* For sectorsize < PAGE_SIZE case, we only support 64K PAGE_SIZE,
* and have ensured that all tree blocks are contained in one page,
* thus we always get index == 0.
*/
return offset >> PAGE_SHIFT;
}
/*
* Structure to record how many bytes and which ranges are set/cleared
*/
@ -174,8 +204,8 @@ void free_extent_buffer_stale(struct extent_buffer *eb);
#define WAIT_NONE 0
#define WAIT_COMPLETE 1
#define WAIT_PAGE_LOCK 2
int read_extent_buffer_pages(struct extent_buffer *eb, int wait,
int mirror_num);
int read_extent_buffer_pages(struct extent_buffer *eb, int wait, int mirror_num,
struct btrfs_tree_parent_check *parent_check);
void wait_on_extent_buffer_writeback(struct extent_buffer *eb);
void btrfs_readahead_tree_block(struct btrfs_fs_info *fs_info,
u64 bytenr, u64 owner_root, u64 gen, int level);
@ -248,7 +278,6 @@ int extent_invalidate_folio(struct extent_io_tree *tree,
int btrfs_alloc_page_array(unsigned int nr_pages, struct page **page_array);
void end_extent_writepage(struct page *page, int err, u64 start, u64 end);
int btrfs_repair_eb_io_failure(const struct extent_buffer *eb, int mirror_num);
/*
* When IO fails, either with EIO or csum verification fails, we
@ -272,9 +301,9 @@ struct io_failure_record {
int num_copies;
};
int btrfs_repair_one_sector(struct inode *inode, struct btrfs_bio *failed_bbio,
int btrfs_repair_one_sector(struct btrfs_inode *inode, struct btrfs_bio *failed_bbio,
u32 bio_offset, struct page *page, unsigned int pgoff,
submit_bio_hook_t *submit_bio_hook);
bool submit_buffered);
void btrfs_free_io_failure_record(struct btrfs_inode *inode, u64 start, u64 end);
int btrfs_clean_io_failure(struct btrfs_inode *inode, u64 start,
struct page *page, unsigned int pg_offset);

View File

@ -3,6 +3,7 @@
#include <linux/err.h>
#include <linux/slab.h>
#include <linux/spinlock.h>
#include "messages.h"
#include "ctree.h"
#include "volumes.h"
#include "extent_map.h"
@ -27,12 +28,9 @@ void __cold extent_map_exit(void)
kmem_cache_destroy(extent_map_cache);
}
/**
* extent_map_tree_init - initialize extent map tree
* @tree: tree to initialize
*
* Initialize the extent tree @tree. Should be called for each new inode
* or other user of the extent_map interface.
/*
* Initialize the extent tree @tree. Should be called for each new inode or
* other user of the extent_map interface.
*/
void extent_map_tree_init(struct extent_map_tree *tree)
{
@ -41,12 +39,9 @@ void extent_map_tree_init(struct extent_map_tree *tree)
rwlock_init(&tree->lock);
}
/**
* alloc_extent_map - allocate new extent map structure
*
* Allocate a new extent_map structure. The new structure is
* returned with a reference count of one and needs to be
* freed using free_extent_map()
/*
* Allocate a new extent_map structure. The new structure is returned with a
* reference count of one and needs to be freed using free_extent_map()
*/
struct extent_map *alloc_extent_map(void)
{
@ -61,12 +56,9 @@ struct extent_map *alloc_extent_map(void)
return em;
}
/**
* free_extent_map - drop reference count of an extent_map
* @em: extent map being released
*
* Drops the reference out on @em by one and free the structure
* if the reference count hits zero.
/*
* Drop the reference out on @em by one and free the structure if the reference
* count hits zero.
*/
void free_extent_map(struct extent_map *em)
{
@ -81,7 +73,7 @@ void free_extent_map(struct extent_map *em)
}
}
/* simple helper to do math around the end of an extent, handling wrap */
/* Do the math around the end of an extent, handling wrapping. */
static u64 range_end(u64 start, u64 len)
{
if (start + len < start)
@ -137,8 +129,8 @@ static int tree_insert(struct rb_root_cached *root, struct extent_map *em)
}
/*
* search through the tree for an extent_map with a given offset. If
* it can't be found, try to find some neighboring extents
* Search through the tree for an extent_map with a given offset. If it can't
* be found, try to find some neighboring extents
*/
static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
struct rb_node **prev_or_next_ret)
@ -190,7 +182,7 @@ static struct rb_node *__tree_search(struct rb_root *root, u64 offset,
return NULL;
}
/* check to see if two extent_map structs are adjacent and safe to merge */
/* Check to see if two extent_map structs are adjacent and safe to merge. */
static int mergable_maps(struct extent_map *prev, struct extent_map *next)
{
if (test_bit(EXTENT_FLAG_PINNED, &prev->flags))
@ -288,8 +280,9 @@ static void try_merge_map(struct extent_map_tree *tree, struct extent_map *em)
}
}
/**
* unpin_extent_cache - unpin an extent from the cache
/*
* Unpin an extent from the cache.
*
* @tree: tree to unpin the extent in
* @start: logical offset in the file
* @len: length of the extent
@ -392,7 +385,7 @@ static void extent_map_device_clear_bits(struct extent_map *em, unsigned bits)
}
}
/**
/*
* Add new extent map to the extent tree
*
* @tree: tree to insert new map in
@ -451,8 +444,9 @@ __lookup_extent_mapping(struct extent_map_tree *tree,
return em;
}
/**
* lookup_extent_mapping - lookup extent_map
/*
* Lookup extent_map that intersects @start + @len range.
*
* @tree: tree to lookup in
* @start: byte offset to start the search
* @len: length of the lookup range
@ -468,8 +462,9 @@ struct extent_map *lookup_extent_mapping(struct extent_map_tree *tree,
return __lookup_extent_mapping(tree, start, len, 1);
}
/**
* search_extent_mapping - find a nearby extent map
/*
* Find a nearby extent map intersecting @start + @len (not an exact search).
*
* @tree: tree to lookup in
* @start: byte offset to start the search
* @len: length of the lookup range
@ -485,13 +480,14 @@ struct extent_map *search_extent_mapping(struct extent_map_tree *tree,
return __lookup_extent_mapping(tree, start, len, 0);
}
/**
* remove_extent_mapping - removes an extent_map from the extent tree
/*
* Remove an extent_map from the extent tree.
*
* @tree: extent tree to remove from
* @em: extent map being removed
*
* Removes @em from @tree. No reference counts are dropped, and no checks
* are done to see if the range is in use
* Remove @em from @tree. No reference counts are dropped, and no checks
* are done to see if the range is in use.
*/
void remove_extent_mapping(struct extent_map_tree *tree, struct extent_map *em)
{
@ -523,7 +519,7 @@ void replace_extent_mapping(struct extent_map_tree *tree,
setup_extent_mapping(tree, new, modified);
}
static struct extent_map *next_extent_map(struct extent_map *em)
static struct extent_map *next_extent_map(const struct extent_map *em)
{
struct rb_node *next;
@ -585,8 +581,8 @@ static noinline int merge_extent_mapping(struct extent_map_tree *em_tree,
return add_extent_mapping(em_tree, em, 0);
}
/**
* Add extent mapping into em_tree
/*
* Add extent mapping into em_tree.
*
* @fs_info: the filesystem
* @em_tree: extent tree into which we want to insert the extent mapping
@ -613,6 +609,13 @@ int btrfs_add_extent_mapping(struct btrfs_fs_info *fs_info,
int ret;
struct extent_map *em = *em_in;
/*
* Tree-checker should have rejected any inline extent with non-zero
* file offset. Here just do a sanity check.
*/
if (em->block_start == EXTENT_MAP_INLINE)
ASSERT(em->start == 0);
ret = add_extent_mapping(em_tree, em, 0);
/* it is possible that someone inserted the extent into the tree
* while we had the lock dropped. It is also possible that

View File

@ -9,13 +9,18 @@
#include <linux/highmem.h>
#include <linux/sched/mm.h>
#include <crypto/hash.h>
#include "messages.h"
#include "misc.h"
#include "ctree.h"
#include "disk-io.h"
#include "transaction.h"
#include "volumes.h"
#include "bio.h"
#include "print-tree.h"
#include "compression.h"
#include "fs.h"
#include "accessors.h"
#include "file-item.h"
#include "super.h"
#define __MAX_CSUM_ITEMS(r, size) ((unsigned long)(((BTRFS_LEAF_DATA_SIZE(r) - \
sizeof(struct btrfs_item) * 2) / \
@ -24,8 +29,8 @@
#define MAX_CSUM_ITEMS(r, size) (min_t(u32, __MAX_CSUM_ITEMS(r, size), \
PAGE_SIZE))
/**
* Set inode's size according to filesystem options
/*
* Set inode's size according to filesystem options.
*
* @inode: inode we want to update the disk_i_size for
* @new_i_size: i_size we want to set to, 0 if we use i_size
@ -64,8 +69,8 @@ void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_siz
spin_unlock(&inode->lock);
}
/**
* Mark range within a file as having a new extent inserted
/*
* Mark range within a file as having a new extent inserted.
*
* @inode: inode being modified
* @start: start file offset of the file extent we've inserted
@ -92,8 +97,8 @@ int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start,
EXTENT_DIRTY);
}
/**
* Marks an inode range as not having a backing extent
/*
* Mark an inode range as not having a backing extent.
*
* @inode: inode being modified
* @start: start file offset of the file extent we've inserted
@ -121,12 +126,26 @@ int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
start + len - 1, EXTENT_DIRTY, NULL);
}
static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info,
u16 csum_size)
static size_t bytes_to_csum_size(const struct btrfs_fs_info *fs_info, u32 bytes)
{
u32 ncsums = (PAGE_SIZE - sizeof(struct btrfs_ordered_sum)) / csum_size;
ASSERT(IS_ALIGNED(bytes, fs_info->sectorsize));
return ncsums * fs_info->sectorsize;
return (bytes >> fs_info->sectorsize_bits) * fs_info->csum_size;
}
static size_t csum_size_to_bytes(const struct btrfs_fs_info *fs_info, u32 csum_size)
{
ASSERT(IS_ALIGNED(csum_size, fs_info->csum_size));
return (csum_size / fs_info->csum_size) << fs_info->sectorsize_bits;
}
static inline u32 max_ordered_sum_bytes(const struct btrfs_fs_info *fs_info)
{
u32 max_csum_size = round_down(PAGE_SIZE - sizeof(struct btrfs_ordered_sum),
fs_info->csum_size);
return csum_size_to_bytes(fs_info, max_csum_size);
}
/*
@ -135,9 +154,7 @@ static inline u32 max_ordered_sum_bytes(struct btrfs_fs_info *fs_info,
*/
static int btrfs_ordered_sum_size(struct btrfs_fs_info *fs_info, unsigned long bytes)
{
int num_sectors = (int)DIV_ROUND_UP(bytes, fs_info->sectorsize);
return sizeof(struct btrfs_ordered_sum) + num_sectors * fs_info->csum_size;
return sizeof(struct btrfs_ordered_sum) + bytes_to_csum_size(fs_info, bytes);
}
int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
@ -254,7 +271,7 @@ int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
/*
* Find checksums for logical bytenr range [disk_bytenr, disk_bytenr + len) and
* estore the result to @dst.
* store the result to @dst.
*
* Return >0 for the number of sectors we found.
* Return 0 for the range [disk_bytenr, disk_bytenr + sectorsize) has no csum
@ -360,7 +377,7 @@ static int search_file_offset_in_bio(struct bio *bio, struct inode *inode,
return ret;
}
/**
/*
* Lookup the checksum for the read bio in csum tree.
*
* @inode: inode that the bio is for.
@ -510,7 +527,7 @@ blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst
return ret;
}
int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit,
bool nowait)
{
@ -521,11 +538,7 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
struct btrfs_ordered_sum *sums;
struct btrfs_csum_item *item;
LIST_HEAD(tmplist);
unsigned long offset;
int ret;
size_t size;
u64 csum_end;
const u32 csum_size = fs_info->csum_size;
ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
IS_ALIGNED(end + 1, fs_info->sectorsize));
@ -551,16 +564,33 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
if (ret > 0 && path->slots[0] > 0) {
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
/*
* There are two cases we can hit here for the previous csum
* item:
*
* |<- search range ->|
* |<- csum item ->|
*
* Or
* |<- search range ->|
* |<- csum item ->|
*
* Check if the previous csum item covers the leading part of
* the search range. If so we have to start from previous csum
* item.
*/
if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
key.type == BTRFS_EXTENT_CSUM_KEY) {
offset = (start - key.offset) >> fs_info->sectorsize_bits;
if (offset * csum_size <
if (bytes_to_csum_size(fs_info, start - key.offset) <
btrfs_item_size(leaf, path->slots[0] - 1))
path->slots[0]--;
}
}
while (start <= end) {
u64 csum_end;
leaf = path->nodes[0];
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
@ -580,8 +610,8 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
if (key.offset > start)
start = key.offset;
size = btrfs_item_size(leaf, path->slots[0]);
csum_end = key.offset + (size / csum_size) * fs_info->sectorsize;
csum_end = key.offset + csum_size_to_bytes(fs_info,
btrfs_item_size(leaf, path->slots[0]));
if (csum_end <= start) {
path->slots[0]++;
continue;
@ -591,8 +621,11 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_csum_item);
while (start < csum_end) {
unsigned long offset;
size_t size;
size = min_t(size_t, csum_end - start,
max_ordered_sum_bytes(fs_info, csum_size));
max_ordered_sum_bytes(fs_info));
sums = kzalloc(btrfs_ordered_sum_size(fs_info, size),
GFP_NOFS);
if (!sums) {
@ -603,16 +636,14 @@ int btrfs_lookup_csums_range(struct btrfs_root *root, u64 start, u64 end,
sums->bytenr = start;
sums->len = (int)size;
offset = (start - key.offset) >> fs_info->sectorsize_bits;
offset *= csum_size;
size >>= fs_info->sectorsize_bits;
offset = bytes_to_csum_size(fs_info, start - key.offset);
read_extent_buffer(path->nodes[0],
sums->sums,
((unsigned long)item) + offset,
csum_size * size);
bytes_to_csum_size(fs_info, size));
start += fs_info->sectorsize * size;
start += size;
list_add_tail(&sums->list, &tmplist);
}
path->slots[0]++;
@ -630,8 +661,129 @@ fail:
return ret;
}
/**
* Calculate checksums of the data contained inside a bio
/*
* Do the same work as btrfs_lookup_csums_list(), the difference is in how
* we return the result.
*
* This version will set the corresponding bits in @csum_bitmap to represent
* that there is a csum found.
* Each bit represents a sector. Thus caller should ensure @csum_buf passed
* in is large enough to contain all csums.
*/
int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end,
u8 *csum_buf, unsigned long *csum_bitmap)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_key key;
struct btrfs_path *path;
struct extent_buffer *leaf;
struct btrfs_csum_item *item;
const u64 orig_start = start;
int ret;
ASSERT(IS_ALIGNED(start, fs_info->sectorsize) &&
IS_ALIGNED(end + 1, fs_info->sectorsize));
path = btrfs_alloc_path();
if (!path)
return -ENOMEM;
key.objectid = BTRFS_EXTENT_CSUM_OBJECTID;
key.type = BTRFS_EXTENT_CSUM_KEY;
key.offset = start;
ret = btrfs_search_slot(NULL, root, &key, path, 0, 0);
if (ret < 0)
goto fail;
if (ret > 0 && path->slots[0] > 0) {
leaf = path->nodes[0];
btrfs_item_key_to_cpu(leaf, &key, path->slots[0] - 1);
/*
* There are two cases we can hit here for the previous csum
* item:
*
* |<- search range ->|
* |<- csum item ->|
*
* Or
* |<- search range ->|
* |<- csum item ->|
*
* Check if the previous csum item covers the leading part of
* the search range. If so we have to start from previous csum
* item.
*/
if (key.objectid == BTRFS_EXTENT_CSUM_OBJECTID &&
key.type == BTRFS_EXTENT_CSUM_KEY) {
if (bytes_to_csum_size(fs_info, start - key.offset) <
btrfs_item_size(leaf, path->slots[0] - 1))
path->slots[0]--;
}
}
while (start <= end) {
u64 csum_end;
leaf = path->nodes[0];
if (path->slots[0] >= btrfs_header_nritems(leaf)) {
ret = btrfs_next_leaf(root, path);
if (ret < 0)
goto fail;
if (ret > 0)
break;
leaf = path->nodes[0];
}
btrfs_item_key_to_cpu(leaf, &key, path->slots[0]);
if (key.objectid != BTRFS_EXTENT_CSUM_OBJECTID ||
key.type != BTRFS_EXTENT_CSUM_KEY ||
key.offset > end)
break;
if (key.offset > start)
start = key.offset;
csum_end = key.offset + csum_size_to_bytes(fs_info,
btrfs_item_size(leaf, path->slots[0]));
if (csum_end <= start) {
path->slots[0]++;
continue;
}
csum_end = min(csum_end, end + 1);
item = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_csum_item);
while (start < csum_end) {
unsigned long offset;
size_t size;
u8 *csum_dest = csum_buf + bytes_to_csum_size(fs_info,
start - orig_start);
size = min_t(size_t, csum_end - start, end + 1 - start);
offset = bytes_to_csum_size(fs_info, start - key.offset);
read_extent_buffer(path->nodes[0], csum_dest,
((unsigned long)item) + offset,
bytes_to_csum_size(fs_info, size));
bitmap_set(csum_bitmap,
(start - orig_start) >> fs_info->sectorsize_bits,
size >> fs_info->sectorsize_bits);
start += size;
}
path->slots[0]++;
}
ret = 0;
fail:
btrfs_free_path(path);
return ret;
}
/*
* Calculate checksums of the data contained inside a bio.
*
* @inode: Owner of the data inside the bio
* @bio: Contains the data to be checksummed
@ -746,15 +898,16 @@ blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
}
/*
* helper function for csum removal, this expects the
* key to describe the csum pointed to by the path, and it expects
* the csum to overlap the range [bytenr, len]
* Remove one checksum overlapping a range.
*
* The csum should not be entirely contained in the range and the
* range should not be entirely contained in the csum.
* This expects the key to describe the csum pointed to by the path, and it
* expects the csum to overlap the range [bytenr, len]
*
* This calls btrfs_truncate_item with the correct args based on the
* overlap, and fixes up the key as required.
* The csum should not be entirely contained in the range and the range should
* not be entirely contained in the csum.
*
* This calls btrfs_truncate_item with the correct args based on the overlap,
* and fixes up the key as required.
*/
static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info,
struct btrfs_path *path,
@ -803,8 +956,7 @@ static noinline void truncate_one_csum(struct btrfs_fs_info *fs_info,
}
/*
* deletes the csum items from the csum tree for a given
* range of bytes.
* Delete the csum items from the csum tree for a given range of bytes.
*/
int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len)
@ -1209,7 +1361,6 @@ out:
void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
const struct btrfs_path *path,
struct btrfs_file_extent_item *fi,
const bool new_inline,
struct extent_map *em)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
@ -1261,10 +1412,9 @@ void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
*/
em->orig_start = EXTENT_MAP_HOLE;
em->block_len = (u64)-1;
if (!new_inline && compress_type != BTRFS_COMPRESS_NONE) {
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
em->compress_type = compress_type;
}
if (compress_type != BTRFS_COMPRESS_NONE)
set_bit(EXTENT_FLAG_COMPRESSED, &em->flags);
} else {
btrfs_err(fs_info,
"unknown file extent item type %d, inode %llu, offset %llu, "

69
fs/btrfs/file-item.h Normal file
View File

@ -0,0 +1,69 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BTRFS_FILE_ITEM_H
#define BTRFS_FILE_ITEM_H
#include "accessors.h"
#define BTRFS_FILE_EXTENT_INLINE_DATA_START \
(offsetof(struct btrfs_file_extent_item, disk_bytenr))
static inline u32 BTRFS_MAX_INLINE_DATA_SIZE(const struct btrfs_fs_info *info)
{
return BTRFS_MAX_ITEM_SIZE(info) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
}
/*
* Return the number of bytes used by the item on disk, minus the size of any
* extent headers. If a file is compressed on disk, this is the compressed
* size.
*/
static inline u32 btrfs_file_extent_inline_item_len(
const struct extent_buffer *eb,
int nr)
{
return btrfs_item_size(eb, nr) - BTRFS_FILE_EXTENT_INLINE_DATA_START;
}
static inline unsigned long btrfs_file_extent_inline_start(
const struct btrfs_file_extent_item *e)
{
return (unsigned long)e + BTRFS_FILE_EXTENT_INLINE_DATA_START;
}
static inline u32 btrfs_file_extent_calc_inline_size(u32 datasize)
{
return BTRFS_FILE_EXTENT_INLINE_DATA_START + datasize;
}
int btrfs_del_csums(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 bytenr, u64 len);
blk_status_t btrfs_lookup_bio_sums(struct inode *inode, struct bio *bio, u8 *dst);
int btrfs_insert_hole_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 objectid, u64 pos,
u64 num_bytes);
int btrfs_lookup_file_extent(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid,
u64 bytenr, int mod);
int btrfs_csum_file_blocks(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_ordered_sum *sums);
blk_status_t btrfs_csum_one_bio(struct btrfs_inode *inode, struct bio *bio,
u64 offset, bool one_ordered);
int btrfs_lookup_csums_list(struct btrfs_root *root, u64 start, u64 end,
struct list_head *list, int search_commit,
bool nowait);
int btrfs_lookup_csums_bitmap(struct btrfs_root *root, u64 start, u64 end,
u8 *csum_buf, unsigned long *csum_bitmap);
void btrfs_extent_item_to_extent_map(struct btrfs_inode *inode,
const struct btrfs_path *path,
struct btrfs_file_extent_item *fi,
struct extent_map *em);
int btrfs_inode_clear_file_extent_range(struct btrfs_inode *inode, u64 start,
u64 len);
int btrfs_inode_set_file_extent_range(struct btrfs_inode *inode, u64 start, u64 len);
void btrfs_inode_safe_disk_i_size_write(struct btrfs_inode *inode, u64 new_i_size);
u64 btrfs_file_extent_end(const struct btrfs_path *path);
#endif

View File

@ -30,329 +30,13 @@
#include "delalloc-space.h"
#include "reflink.h"
#include "subpage.h"
static struct kmem_cache *btrfs_inode_defrag_cachep;
/*
* when auto defrag is enabled we
* queue up these defrag structs to remember which
* inodes need defragging passes
*/
struct inode_defrag {
struct rb_node rb_node;
/* objectid */
u64 ino;
/*
* transid where the defrag was added, we search for
* extents newer than this
*/
u64 transid;
/* root objectid */
u64 root;
/*
* The extent size threshold for autodefrag.
*
* This value is different for compressed/non-compressed extents,
* thus needs to be passed from higher layer.
* (aka, inode_should_defrag())
*/
u32 extent_thresh;
};
static int __compare_inode_defrag(struct inode_defrag *defrag1,
struct inode_defrag *defrag2)
{
if (defrag1->root > defrag2->root)
return 1;
else if (defrag1->root < defrag2->root)
return -1;
else if (defrag1->ino > defrag2->ino)
return 1;
else if (defrag1->ino < defrag2->ino)
return -1;
else
return 0;
}
/* pop a record for an inode into the defrag tree. The lock
* must be held already
*
* If you're inserting a record for an older transid than an
* existing record, the transid already in the tree is lowered
*
* If an existing record is found the defrag item you
* pass in is freed
*/
static int __btrfs_add_inode_defrag(struct btrfs_inode *inode,
struct inode_defrag *defrag)
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct inode_defrag *entry;
struct rb_node **p;
struct rb_node *parent = NULL;
int ret;
p = &fs_info->defrag_inodes.rb_node;
while (*p) {
parent = *p;
entry = rb_entry(parent, struct inode_defrag, rb_node);
ret = __compare_inode_defrag(defrag, entry);
if (ret < 0)
p = &parent->rb_left;
else if (ret > 0)
p = &parent->rb_right;
else {
/* if we're reinserting an entry for
* an old defrag run, make sure to
* lower the transid of our existing record
*/
if (defrag->transid < entry->transid)
entry->transid = defrag->transid;
entry->extent_thresh = min(defrag->extent_thresh,
entry->extent_thresh);
return -EEXIST;
}
}
set_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags);
rb_link_node(&defrag->rb_node, parent, p);
rb_insert_color(&defrag->rb_node, &fs_info->defrag_inodes);
return 0;
}
static inline int __need_auto_defrag(struct btrfs_fs_info *fs_info)
{
if (!btrfs_test_opt(fs_info, AUTO_DEFRAG))
return 0;
if (btrfs_fs_closing(fs_info))
return 0;
return 1;
}
/*
* insert a defrag record for this inode if auto defrag is
* enabled
*/
int btrfs_add_inode_defrag(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u32 extent_thresh)
{
struct btrfs_root *root = inode->root;
struct btrfs_fs_info *fs_info = root->fs_info;
struct inode_defrag *defrag;
u64 transid;
int ret;
if (!__need_auto_defrag(fs_info))
return 0;
if (test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags))
return 0;
if (trans)
transid = trans->transid;
else
transid = inode->root->last_trans;
defrag = kmem_cache_zalloc(btrfs_inode_defrag_cachep, GFP_NOFS);
if (!defrag)
return -ENOMEM;
defrag->ino = btrfs_ino(inode);
defrag->transid = transid;
defrag->root = root->root_key.objectid;
defrag->extent_thresh = extent_thresh;
spin_lock(&fs_info->defrag_inodes_lock);
if (!test_bit(BTRFS_INODE_IN_DEFRAG, &inode->runtime_flags)) {
/*
* If we set IN_DEFRAG flag and evict the inode from memory,
* and then re-read this inode, this new inode doesn't have
* IN_DEFRAG flag. At the case, we may find the existed defrag.
*/
ret = __btrfs_add_inode_defrag(inode, defrag);
if (ret)
kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
} else {
kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
}
spin_unlock(&fs_info->defrag_inodes_lock);
return 0;
}
/*
* pick the defragable inode that we want, if it doesn't exist, we will get
* the next one.
*/
static struct inode_defrag *
btrfs_pick_defrag_inode(struct btrfs_fs_info *fs_info, u64 root, u64 ino)
{
struct inode_defrag *entry = NULL;
struct inode_defrag tmp;
struct rb_node *p;
struct rb_node *parent = NULL;
int ret;
tmp.ino = ino;
tmp.root = root;
spin_lock(&fs_info->defrag_inodes_lock);
p = fs_info->defrag_inodes.rb_node;
while (p) {
parent = p;
entry = rb_entry(parent, struct inode_defrag, rb_node);
ret = __compare_inode_defrag(&tmp, entry);
if (ret < 0)
p = parent->rb_left;
else if (ret > 0)
p = parent->rb_right;
else
goto out;
}
if (parent && __compare_inode_defrag(&tmp, entry) > 0) {
parent = rb_next(parent);
if (parent)
entry = rb_entry(parent, struct inode_defrag, rb_node);
else
entry = NULL;
}
out:
if (entry)
rb_erase(parent, &fs_info->defrag_inodes);
spin_unlock(&fs_info->defrag_inodes_lock);
return entry;
}
void btrfs_cleanup_defrag_inodes(struct btrfs_fs_info *fs_info)
{
struct inode_defrag *defrag;
struct rb_node *node;
spin_lock(&fs_info->defrag_inodes_lock);
node = rb_first(&fs_info->defrag_inodes);
while (node) {
rb_erase(node, &fs_info->defrag_inodes);
defrag = rb_entry(node, struct inode_defrag, rb_node);
kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
cond_resched_lock(&fs_info->defrag_inodes_lock);
node = rb_first(&fs_info->defrag_inodes);
}
spin_unlock(&fs_info->defrag_inodes_lock);
}
#define BTRFS_DEFRAG_BATCH 1024
static int __btrfs_run_defrag_inode(struct btrfs_fs_info *fs_info,
struct inode_defrag *defrag)
{
struct btrfs_root *inode_root;
struct inode *inode;
struct btrfs_ioctl_defrag_range_args range;
int ret = 0;
u64 cur = 0;
again:
if (test_bit(BTRFS_FS_STATE_REMOUNTING, &fs_info->fs_state))
goto cleanup;
if (!__need_auto_defrag(fs_info))
goto cleanup;
/* get the inode */
inode_root = btrfs_get_fs_root(fs_info, defrag->root, true);
if (IS_ERR(inode_root)) {
ret = PTR_ERR(inode_root);
goto cleanup;
}
inode = btrfs_iget(fs_info->sb, defrag->ino, inode_root);
btrfs_put_root(inode_root);
if (IS_ERR(inode)) {
ret = PTR_ERR(inode);
goto cleanup;
}
if (cur >= i_size_read(inode)) {
iput(inode);
goto cleanup;
}
/* do a chunk of defrag */
clear_bit(BTRFS_INODE_IN_DEFRAG, &BTRFS_I(inode)->runtime_flags);
memset(&range, 0, sizeof(range));
range.len = (u64)-1;
range.start = cur;
range.extent_thresh = defrag->extent_thresh;
sb_start_write(fs_info->sb);
ret = btrfs_defrag_file(inode, NULL, &range, defrag->transid,
BTRFS_DEFRAG_BATCH);
sb_end_write(fs_info->sb);
iput(inode);
if (ret < 0)
goto cleanup;
cur = max(cur + fs_info->sectorsize, range.start);
goto again;
cleanup:
kmem_cache_free(btrfs_inode_defrag_cachep, defrag);
return ret;
}
/*
* run through the list of inodes in the FS that need
* defragging
*/
int btrfs_run_defrag_inodes(struct btrfs_fs_info *fs_info)
{
struct inode_defrag *defrag;
u64 first_ino = 0;
u64 root_objectid = 0;
atomic_inc(&fs_info->defrag_running);
while (1) {
/* Pause the auto defragger. */
if (test_bit(BTRFS_FS_STATE_REMOUNTING,
&fs_info->fs_state))
break;
if (!__need_auto_defrag(fs_info))
break;
/* find an inode to defrag */
defrag = btrfs_pick_defrag_inode(fs_info, root_objectid,
first_ino);
if (!defrag) {
if (root_objectid || first_ino) {
root_objectid = 0;
first_ino = 0;
continue;
} else {
break;
}
}
first_ino = defrag->ino + 1;
root_objectid = defrag->root;
__btrfs_run_defrag_inode(fs_info, defrag);
}
atomic_dec(&fs_info->defrag_running);
/*
* during unmount, we use the transaction_wait queue to
* wait for the defragger to stop
*/
wake_up(&fs_info->transaction_wait);
return 0;
}
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "file-item.h"
#include "ioctl.h"
#include "file.h"
#include "super.h"
/* simple helper to fault in pages and copy. This should go away
* and be replaced with calls into generic code.
@ -696,7 +380,10 @@ next_slot:
args->start - extent_offset,
0, false);
ret = btrfs_inc_extent_ref(trans, &ref);
BUG_ON(ret); /* -ENOMEM */
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
}
key.offset = args->start;
}
@ -783,7 +470,10 @@ delete_extent_item:
key.offset - extent_offset, 0,
false);
ret = btrfs_free_extent(trans, &ref);
BUG_ON(ret); /* -ENOMEM */
if (ret) {
btrfs_abort_transaction(trans, ret);
break;
}
args->bytes_found += extent_end - key.offset;
}
@ -1302,7 +992,8 @@ lock_and_cleanup_extent_if_need(struct btrfs_inode *inode, struct page **pages,
struct btrfs_ordered_extent *ordered;
if (nowait) {
if (!try_lock_extent(&inode->io_tree, start_pos, last_pos)) {
if (!try_lock_extent(&inode->io_tree, start_pos, last_pos,
cached_state)) {
for (i = 0; i < num_pages; i++) {
unlock_page(pages[i]);
put_page(pages[i]);
@ -1372,6 +1063,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
{
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct btrfs_root *root = inode->root;
struct extent_state *cached_state = NULL;
u64 lockstart, lockend;
u64 num_bytes;
int ret;
@ -1388,12 +1080,14 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
num_bytes = lockend - lockstart + 1;
if (nowait) {
if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend)) {
if (!btrfs_try_lock_ordered_range(inode, lockstart, lockend,
&cached_state)) {
btrfs_drew_write_unlock(&root->snapshot_lock);
return -EAGAIN;
}
} else {
btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend, NULL);
btrfs_lock_and_flush_ordered_range(inode, lockstart, lockend,
&cached_state);
}
ret = can_nocow_extent(&inode->vfs_inode, lockstart, &num_bytes,
NULL, NULL, NULL, nowait, false);
@ -1402,7 +1096,7 @@ int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
else
*write_bytes = min_t(size_t, *write_bytes ,
num_bytes - pos + lockstart);
unlock_extent(&inode->io_tree, lockstart, lockend, NULL);
unlock_extent(&inode->io_tree, lockstart, lockend, &cached_state);
return ret;
}
@ -1505,7 +1199,7 @@ static noinline ssize_t btrfs_buffered_write(struct kiocb *iocb,
if (nowait)
ilock_flags |= BTRFS_ILOCK_TRY;
ret = btrfs_inode_lock(inode, ilock_flags);
ret = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
if (ret < 0)
return ret;
@ -1740,7 +1434,7 @@ again:
iocb->ki_pos += num_written;
}
out:
btrfs_inode_unlock(inode, ilock_flags);
btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
return num_written ? num_written : ret;
}
@ -1780,19 +1474,19 @@ static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
ilock_flags |= BTRFS_ILOCK_SHARED;
relock:
err = btrfs_inode_lock(inode, ilock_flags);
err = btrfs_inode_lock(BTRFS_I(inode), ilock_flags);
if (err < 0)
return err;
err = generic_write_checks(iocb, from);
if (err <= 0) {
btrfs_inode_unlock(inode, ilock_flags);
btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
return err;
}
err = btrfs_write_check(iocb, from, err);
if (err < 0) {
btrfs_inode_unlock(inode, ilock_flags);
btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
goto out;
}
@ -1803,13 +1497,13 @@ relock:
*/
if ((ilock_flags & BTRFS_ILOCK_SHARED) &&
pos + iov_iter_count(from) > i_size_read(inode)) {
btrfs_inode_unlock(inode, ilock_flags);
btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
ilock_flags &= ~BTRFS_ILOCK_SHARED;
goto relock;
}
if (check_direct_IO(fs_info, from, pos)) {
btrfs_inode_unlock(inode, ilock_flags);
btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
goto buffered;
}
@ -1840,7 +1534,7 @@ relock:
* iocb, and that needs to lock the inode. So unlock it before calling
* iomap_dio_complete() to avoid a deadlock.
*/
btrfs_inode_unlock(inode, ilock_flags);
btrfs_inode_unlock(BTRFS_I(inode), ilock_flags);
if (IS_ERR_OR_NULL(dio))
err = PTR_ERR_OR_ZERO(dio);
@ -1887,8 +1581,8 @@ buffered:
/*
* If we are in a NOWAIT context, then return -EAGAIN to signal the caller
* it must retry the operation in a context where blocking is acceptable,
* since we currently don't have NOWAIT semantics support for buffered IO
* and may block there for many reasons (reserving space for example).
* because even if we end up not blocking during the buffered IO attempt
* below, we will block when flushing and waiting for the IO.
*/
if (iocb->ki_flags & IOCB_NOWAIT) {
err = -EAGAIN;
@ -1928,7 +1622,7 @@ static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
loff_t count;
ssize_t ret;
btrfs_inode_lock(inode, 0);
btrfs_inode_lock(BTRFS_I(inode), 0);
count = encoded->len;
ret = generic_write_checks_count(iocb, &count);
if (ret == 0 && count != encoded->len) {
@ -1947,7 +1641,7 @@ static ssize_t btrfs_encoded_write(struct kiocb *iocb, struct iov_iter *from,
ret = btrfs_do_encoded_write(iocb, from, encoded);
out:
btrfs_inode_unlock(inode, 0);
btrfs_inode_unlock(BTRFS_I(inode), 0);
return ret;
}
@ -2008,10 +1702,12 @@ int btrfs_release_file(struct inode *inode, struct file *filp)
{
struct btrfs_file_private *private = filp->private_data;
if (private && private->filldir_buf)
if (private) {
kfree(private->filldir_buf);
free_extent_state(private->llseek_cached_state);
kfree(private);
filp->private_data = NULL;
}
/*
* Set by setattr when we are about to truncate a file from a non-zero
@ -2118,7 +1814,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
if (ret)
goto out;
btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
atomic_inc(&root->log_batch);
@ -2142,7 +1838,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
*/
ret = start_ordered_ops(inode, start, end);
if (ret) {
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
goto out;
}
@ -2245,7 +1941,7 @@ int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync)
* file again, but that will end up using the synchronization
* inside btrfs_sync_log to keep things safe.
*/
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
if (ret == BTRFS_NO_LOG_SYNC) {
ret = btrfs_end_transaction(trans);
@ -2313,7 +2009,7 @@ out:
out_release_extents:
btrfs_release_log_ctx_extents(&ctx);
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
goto out;
}
@ -2908,7 +2604,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
bool truncated_block = false;
bool updated_inode = false;
btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
ret = btrfs_wait_ordered_range(inode, offset, len);
if (ret)
@ -2956,7 +2652,7 @@ static int btrfs_punch_hole(struct file *file, loff_t offset, loff_t len)
truncated_block = true;
ret = btrfs_truncate_block(BTRFS_I(inode), offset, 0, 0);
if (ret) {
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
return ret;
}
}
@ -3055,7 +2751,7 @@ out_only_mutex:
ret = ret2;
}
}
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
return ret;
}
@ -3366,7 +3062,7 @@ static long btrfs_fallocate(struct file *file, int mode,
if (mode & FALLOC_FL_PUNCH_HOLE)
return btrfs_punch_hole(file, offset, len);
btrfs_inode_lock(inode, BTRFS_ILOCK_MMAP);
btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
if (!(mode & FALLOC_FL_KEEP_SIZE) && offset + len > inode->i_size) {
ret = inode_newsize_ok(inode, offset + len);
@ -3416,7 +3112,7 @@ static long btrfs_fallocate(struct file *file, int mode,
if (mode & FALLOC_FL_ZERO_RANGE) {
ret = btrfs_zero_range(inode, offset, len, mode);
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
return ret;
}
@ -3514,7 +3210,7 @@ out_unlock:
unlock_extent(&BTRFS_I(inode)->io_tree, alloc_start, locked_end,
&cached_state);
out:
btrfs_inode_unlock(inode, BTRFS_ILOCK_MMAP);
btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_MMAP);
extent_changeset_free(data_reserved);
return ret;
}
@ -3526,117 +3222,106 @@ out:
* looping while it gets adjacent subranges, and merging them together.
*/
static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end,
struct extent_state **cached_state,
bool *search_io_tree,
u64 *delalloc_start_ret, u64 *delalloc_end_ret)
{
const u64 len = end + 1 - start;
struct extent_map_tree *em_tree = &inode->extent_tree;
struct extent_map *em;
u64 em_end;
u64 delalloc_len;
u64 len = end + 1 - start;
u64 delalloc_len = 0;
struct btrfs_ordered_extent *oe;
u64 oe_start;
u64 oe_end;
/*
* Search the io tree first for EXTENT_DELALLOC. If we find any, it
* means we have delalloc (dirty pages) for which writeback has not
* started yet.
*/
if (*search_io_tree) {
spin_lock(&inode->lock);
if (inode->delalloc_bytes > 0) {
spin_unlock(&inode->lock);
*delalloc_start_ret = start;
delalloc_len = count_range_bits(&inode->io_tree, delalloc_start_ret, end,
len, EXTENT_DELALLOC, 1);
delalloc_len = count_range_bits(&inode->io_tree,
delalloc_start_ret, end,
len, EXTENT_DELALLOC, 1,
cached_state);
} else {
spin_unlock(&inode->lock);
}
}
if (delalloc_len > 0) {
/*
* If delalloc was found then *delalloc_start_ret has a sector size
* aligned value (rounded down).
*/
if (delalloc_len > 0)
*delalloc_end_ret = *delalloc_start_ret + delalloc_len - 1;
if (*delalloc_start_ret == start) {
/* Delalloc for the whole range, nothing more to do. */
if (*delalloc_end_ret == end)
return true;
/* Else trim our search range for ordered extents. */
start = *delalloc_end_ret + 1;
len = end + 1 - start;
}
} else {
/* No delalloc, future calls don't need to search again. */
*search_io_tree = false;
}
/*
* Now also check if there's any extent map in the range that does not
* map to a hole or prealloc extent. We do this because:
* Now also check if there's any ordered extent in the range.
* We do this because:
*
* 1) When delalloc is flushed, the file range is locked, we clear the
* EXTENT_DELALLOC bit from the io tree and create an extent map for
* an allocated extent. So we might just have been called after
* delalloc is flushed and before the ordered extent completes and
* inserts the new file extent item in the subvolume's btree;
* EXTENT_DELALLOC bit from the io tree and create an extent map and
* an ordered extent for the write. So we might just have been called
* after delalloc is flushed and before the ordered extent completes
* and inserts the new file extent item in the subvolume's btree;
*
* 2) We may have an extent map created by flushing delalloc for a
* 2) We may have an ordered extent created by flushing delalloc for a
* subrange that starts before the subrange we found marked with
* EXTENT_DELALLOC in the io tree.
*
* We could also use the extent map tree to find such delalloc that is
* being flushed, but using the ordered extents tree is more efficient
* because it's usually much smaller as ordered extents are removed from
* the tree once they complete. With the extent maps, we mau have them
* in the extent map tree for a very long time, and they were either
* created by previous writes or loaded by read operations.
*/
read_lock(&em_tree->lock);
em = lookup_extent_mapping(em_tree, start, len);
read_unlock(&em_tree->lock);
/* extent_map_end() returns a non-inclusive end offset. */
em_end = em ? extent_map_end(em) : 0;
/*
* If we have a hole/prealloc extent map, check the next one if this one
* ends before our range's end.
*/
if (em && (em->block_start == EXTENT_MAP_HOLE ||
test_bit(EXTENT_FLAG_PREALLOC, &em->flags)) && em_end < end) {
struct extent_map *next_em;
read_lock(&em_tree->lock);
next_em = lookup_extent_mapping(em_tree, em_end, len - em_end);
read_unlock(&em_tree->lock);
free_extent_map(em);
em_end = next_em ? extent_map_end(next_em) : 0;
em = next_em;
}
if (em && (em->block_start == EXTENT_MAP_HOLE ||
test_bit(EXTENT_FLAG_PREALLOC, &em->flags))) {
free_extent_map(em);
em = NULL;
}
/*
* No extent map or one for a hole or prealloc extent. Use the delalloc
* range we found in the io tree if we have one.
*/
if (!em)
oe = btrfs_lookup_first_ordered_range(inode, start, len);
if (!oe)
return (delalloc_len > 0);
/*
* We don't have any range as EXTENT_DELALLOC in the io tree, so the
* extent map is the only subrange representing delalloc.
*/
/* The ordered extent may span beyond our search range. */
oe_start = max(oe->file_offset, start);
oe_end = min(oe->file_offset + oe->num_bytes - 1, end);
btrfs_put_ordered_extent(oe);
/* Don't have unflushed delalloc, return the ordered extent range. */
if (delalloc_len == 0) {
*delalloc_start_ret = em->start;
*delalloc_end_ret = min(end, em_end - 1);
free_extent_map(em);
*delalloc_start_ret = oe_start;
*delalloc_end_ret = oe_end;
return true;
}
/*
* The extent map represents a delalloc range that starts before the
* delalloc range we found in the io tree.
* We have both unflushed delalloc (io_tree) and an ordered extent.
* If the ranges are adjacent returned a combined range, otherwise
* return the leftmost range.
*/
if (em->start < *delalloc_start_ret) {
*delalloc_start_ret = em->start;
/*
* If the ranges are adjacent, return a combined range.
* Otherwise return the extent map's range.
*/
if (em_end < *delalloc_start_ret)
*delalloc_end_ret = min(end, em_end - 1);
free_extent_map(em);
return true;
if (oe_start < *delalloc_start_ret) {
if (oe_end < *delalloc_start_ret)
*delalloc_end_ret = oe_end;
*delalloc_start_ret = oe_start;
} else if (*delalloc_end_ret + 1 == oe_start) {
*delalloc_end_ret = oe_end;
}
/*
* The extent map starts after the delalloc range we found in the io
* tree. If it's adjacent, return a combined range, otherwise return
* the range found in the io tree.
*/
if (*delalloc_end_ret + 1 == em->start)
*delalloc_end_ret = min(end, em_end - 1);
free_extent_map(em);
return true;
}
@ -3648,6 +3333,8 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end
* sector size aligned.
* @end: The end offset (inclusive value) of the search range.
* It does not need to be sector size aligned.
* @cached_state: Extent state record used for speeding up delalloc
* searches in the inode's io_tree. Can be NULL.
* @delalloc_start_ret: Output argument, set to the start offset of the
* subrange found with delalloc (may not be sector size
* aligned).
@ -3659,10 +3346,12 @@ static bool find_delalloc_subrange(struct btrfs_inode *inode, u64 start, u64 end
* end offsets of the subrange.
*/
bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
struct extent_state **cached_state,
u64 *delalloc_start_ret, u64 *delalloc_end_ret)
{
u64 cur_offset = round_down(start, inode->root->fs_info->sectorsize);
u64 prev_delalloc_end = 0;
bool search_io_tree = true;
bool ret = false;
while (cur_offset < end) {
@ -3671,6 +3360,7 @@ bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
bool delalloc;
delalloc = find_delalloc_subrange(inode, cur_offset, end,
cached_state, &search_io_tree,
&delalloc_start,
&delalloc_end);
if (!delalloc)
@ -3716,13 +3406,14 @@ bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
* is found, it updates @start_ret with the start of the subrange.
*/
static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence,
struct extent_state **cached_state,
u64 start, u64 end, u64 *start_ret)
{
u64 delalloc_start;
u64 delalloc_end;
bool delalloc;
delalloc = btrfs_find_delalloc_in_range(inode, start, end,
delalloc = btrfs_find_delalloc_in_range(inode, start, end, cached_state,
&delalloc_start, &delalloc_end);
if (delalloc && whence == SEEK_DATA) {
*start_ret = delalloc_start;
@ -3765,11 +3456,13 @@ static bool find_desired_extent_in_hole(struct btrfs_inode *inode, int whence,
return false;
}
static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
int whence)
static loff_t find_desired_extent(struct file *file, loff_t offset, int whence)
{
struct btrfs_inode *inode = BTRFS_I(file->f_mapping->host);
struct btrfs_file_private *private = file->private_data;
struct btrfs_fs_info *fs_info = inode->root->fs_info;
struct extent_state *cached_state = NULL;
struct extent_state **delalloc_cached_state;
const loff_t i_size = i_size_read(&inode->vfs_inode);
const u64 ino = btrfs_ino(inode);
struct btrfs_root *root = inode->root;
@ -3794,6 +3487,22 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
inode_get_bytes(&inode->vfs_inode) == i_size)
return i_size;
if (!private) {
private = kzalloc(sizeof(*private), GFP_KERNEL);
/*
* No worries if memory allocation failed.
* The private structure is used only for speeding up multiple
* lseek SEEK_HOLE/DATA calls to a file when there's delalloc,
* so everything will still be correct.
*/
file->private_data = private;
}
if (private)
delalloc_cached_state = &private->llseek_cached_state;
else
delalloc_cached_state = NULL;
/*
* offset can be negative, in this case we start finding DATA/HOLE from
* the very start of the file.
@ -3871,6 +3580,7 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
search_start = offset;
found = find_desired_extent_in_hole(inode, whence,
delalloc_cached_state,
search_start,
key.offset - 1,
&found_start);
@ -3905,6 +3615,7 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
search_start = offset;
found = find_desired_extent_in_hole(inode, whence,
delalloc_cached_state,
search_start,
extent_end - 1,
&found_start);
@ -3946,7 +3657,8 @@ static loff_t find_desired_extent(struct btrfs_inode *inode, loff_t offset,
/* We have an implicit hole from the last extent found up to i_size. */
if (!found && start < i_size) {
found = find_desired_extent_in_hole(inode, whence, start,
found = find_desired_extent_in_hole(inode, whence,
delalloc_cached_state, start,
i_size - 1, &start);
if (!found)
start = i_size;
@ -3974,9 +3686,9 @@ static loff_t btrfs_file_llseek(struct file *file, loff_t offset, int whence)
return generic_file_llseek(file, offset, whence);
case SEEK_DATA:
case SEEK_HOLE:
btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
offset = find_desired_extent(BTRFS_I(inode), offset, whence);
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
offset = find_desired_extent(file, offset, whence);
btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
break;
}
@ -4031,7 +3743,7 @@ static ssize_t btrfs_direct_read(struct kiocb *iocb, struct iov_iter *to)
if (check_direct_read(btrfs_sb(inode->i_sb), to, iocb->ki_pos))
return 0;
btrfs_inode_lock(inode, BTRFS_ILOCK_SHARED);
btrfs_inode_lock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
again:
/*
* This is similar to what we do for direct IO writes, see the comment
@ -4080,7 +3792,7 @@ again:
goto again;
}
}
btrfs_inode_unlock(inode, BTRFS_ILOCK_SHARED);
btrfs_inode_unlock(BTRFS_I(inode), BTRFS_ILOCK_SHARED);
return ret < 0 ? ret : read;
}
@ -4117,23 +3829,6 @@ const struct file_operations btrfs_file_operations = {
.remap_file_range = btrfs_remap_file_range,
};
void __cold btrfs_auto_defrag_exit(void)
{
kmem_cache_destroy(btrfs_inode_defrag_cachep);
}
int __init btrfs_auto_defrag_init(void)
{
btrfs_inode_defrag_cachep = kmem_cache_create("btrfs_inode_defrag",
sizeof(struct inode_defrag), 0,
SLAB_MEM_SPREAD,
NULL);
if (!btrfs_inode_defrag_cachep)
return -ENOMEM;
return 0;
}
int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end)
{
int ret;

33
fs/btrfs/file.h Normal file
View File

@ -0,0 +1,33 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BTRFS_FILE_H
#define BTRFS_FILE_H
extern const struct file_operations btrfs_file_operations;
int btrfs_sync_file(struct file *file, loff_t start, loff_t end, int datasync);
int btrfs_drop_extents(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_inode *inode,
struct btrfs_drop_extents_args *args);
int btrfs_replace_file_extents(struct btrfs_inode *inode,
struct btrfs_path *path, const u64 start,
const u64 end,
struct btrfs_replace_extent_info *extent_info,
struct btrfs_trans_handle **trans_out);
int btrfs_mark_extent_written(struct btrfs_trans_handle *trans,
struct btrfs_inode *inode, u64 start, u64 end);
ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
const struct btrfs_ioctl_encoded_io_args *encoded);
int btrfs_release_file(struct inode *inode, struct file *file);
int btrfs_dirty_pages(struct btrfs_inode *inode, struct page **pages,
size_t num_pages, loff_t pos, size_t write_bytes,
struct extent_state **cached, bool noreserve);
int btrfs_fdatawrite_range(struct inode *inode, loff_t start, loff_t end);
int btrfs_check_nocow_lock(struct btrfs_inode *inode, loff_t pos,
size_t *write_bytes, bool nowait);
void btrfs_check_nocow_unlock(struct btrfs_inode *inode);
bool btrfs_find_delalloc_in_range(struct btrfs_inode *inode, u64 start, u64 end,
struct extent_state **cached_state,
u64 *delalloc_start_ret, u64 *delalloc_end_ret);
#endif

View File

@ -11,8 +11,10 @@
#include <linux/ratelimit.h>
#include <linux/error-injection.h>
#include <linux/sched/mm.h>
#include "misc.h"
#include "ctree.h"
#include "fs.h"
#include "messages.h"
#include "misc.h"
#include "free-space-cache.h"
#include "transaction.h"
#include "disk-io.h"
@ -24,11 +26,18 @@
#include "discard.h"
#include "subpage.h"
#include "inode-item.h"
#include "accessors.h"
#include "file-item.h"
#include "file.h"
#include "super.h"
#define BITS_PER_BITMAP (PAGE_SIZE * 8UL)
#define MAX_CACHE_BYTES_PER_GIG SZ_64K
#define FORCE_EXTENT_THRESHOLD SZ_1M
static struct kmem_cache *btrfs_free_space_cachep;
static struct kmem_cache *btrfs_free_space_bitmap_cachep;
struct btrfs_trim_range {
u64 start;
u64 bytes;
@ -251,7 +260,7 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
}
ret = btrfs_orphan_add(trans, BTRFS_I(inode));
if (ret) {
btrfs_add_delayed_iput(inode);
btrfs_add_delayed_iput(BTRFS_I(inode));
goto out;
}
clear_nlink(inode);
@ -265,7 +274,7 @@ int btrfs_remove_free_space_inode(struct btrfs_trans_handle *trans,
spin_unlock(&block_group->lock);
}
/* One for the lookup ref */
btrfs_add_delayed_iput(inode);
btrfs_add_delayed_iput(BTRFS_I(inode));
key.objectid = BTRFS_FREE_SPACE_OBJECTID;
key.type = 0;
@ -1363,8 +1372,8 @@ int btrfs_wait_cache_io(struct btrfs_trans_handle *trans,
path, block_group->start);
}
/**
* Write out cached info to an inode
/*
* Write out cached info to an inode.
*
* @root: root the inode belongs to
* @inode: freespace inode we are writing out
@ -2717,8 +2726,7 @@ static int __btrfs_add_free_space_zoned(struct btrfs_block_group *block_group,
btrfs_mark_bg_unused(block_group);
} else if (bg_reclaim_threshold &&
reclaimable_unusable >=
div_factor_fine(block_group->zone_capacity,
bg_reclaim_threshold)) {
mult_perc(block_group->zone_capacity, bg_reclaim_threshold)) {
btrfs_mark_bg_to_reclaim(block_group);
}
@ -3028,10 +3036,7 @@ void btrfs_remove_free_space_cache(struct btrfs_block_group *block_group)
}
/**
* btrfs_is_free_space_trimmed - see if everything is trimmed
* @block_group: block_group of interest
*
/*
* Walk @block_group's free space rb_tree to determine if everything is trimmed.
*/
bool btrfs_is_free_space_trimmed(struct btrfs_block_group *block_group)
@ -4132,6 +4137,31 @@ out:
return ret;
}
int __init btrfs_free_space_init(void)
{
btrfs_free_space_cachep = kmem_cache_create("btrfs_free_space",
sizeof(struct btrfs_free_space), 0,
SLAB_MEM_SPREAD, NULL);
if (!btrfs_free_space_cachep)
return -ENOMEM;
btrfs_free_space_bitmap_cachep = kmem_cache_create("btrfs_free_space_bitmap",
PAGE_SIZE, PAGE_SIZE,
SLAB_MEM_SPREAD, NULL);
if (!btrfs_free_space_bitmap_cachep) {
kmem_cache_destroy(btrfs_free_space_cachep);
return -ENOMEM;
}
return 0;
}
void __cold btrfs_free_space_exit(void)
{
kmem_cache_destroy(btrfs_free_space_cachep);
kmem_cache_destroy(btrfs_free_space_bitmap_cachep);
}
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
/*
* Use this if you need to make a bitmap or extent entry specifically, it

View File

@ -43,6 +43,17 @@ static inline bool btrfs_free_space_trimming_bitmap(
return (info->trim_state == BTRFS_TRIM_STATE_TRIMMING);
}
/*
* Deltas are an effective way to populate global statistics. Give macro names
* to make it clear what we're doing. An example is discard_extents in
* btrfs_free_space_ctl.
*/
enum {
BTRFS_STAT_CURR,
BTRFS_STAT_PREV,
BTRFS_STAT_NR_ENTRIES,
};
struct btrfs_free_space_ctl {
spinlock_t tree_lock;
struct rb_root free_space_offset;
@ -79,6 +90,8 @@ struct btrfs_io_ctl {
int bitmaps;
};
int __init btrfs_free_space_init(void);
void __cold btrfs_free_space_exit(void);
struct inode *lookup_free_space_inode(struct btrfs_block_group *block_group,
struct btrfs_path *path);
int create_free_space_inode(struct btrfs_trans_handle *trans,

View File

@ -5,12 +5,17 @@
#include <linux/kernel.h>
#include <linux/sched/mm.h>
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "locking.h"
#include "free-space-tree.h"
#include "transaction.h"
#include "block-group.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
struct btrfs_block_group *block_group,
@ -803,7 +808,7 @@ int __remove_from_free_space_tree(struct btrfs_trans_handle *trans,
u32 flags;
int ret;
if (block_group->needs_free_space) {
if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) {
ret = __add_block_group_free_space(trans, block_group, path);
if (ret)
return ret;
@ -996,7 +1001,7 @@ int __add_to_free_space_tree(struct btrfs_trans_handle *trans,
u32 flags;
int ret;
if (block_group->needs_free_space) {
if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) {
ret = __add_block_group_free_space(trans, block_group, path);
if (ret)
return ret;
@ -1299,7 +1304,7 @@ static int __add_block_group_free_space(struct btrfs_trans_handle *trans,
{
int ret;
block_group->needs_free_space = 0;
clear_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags);
ret = add_new_free_space_info(trans, block_group, path);
if (ret)
@ -1321,7 +1326,7 @@ int add_block_group_free_space(struct btrfs_trans_handle *trans,
return 0;
mutex_lock(&block_group->free_space_lock);
if (!block_group->needs_free_space)
if (!test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags))
goto out;
path = btrfs_alloc_path();
@ -1354,7 +1359,7 @@ int remove_block_group_free_space(struct btrfs_trans_handle *trans,
if (!btrfs_fs_compat_ro(trans->fs_info, FREE_SPACE_TREE))
return 0;
if (block_group->needs_free_space) {
if (test_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &block_group->runtime_flags)) {
/* We never added this block group to the free space tree. */
return 0;
}

94
fs/btrfs/fs.c Normal file
View File

@ -0,0 +1,94 @@
// SPDX-License-Identifier: GPL-2.0
#include "messages.h"
#include "ctree.h"
#include "fs.h"
#include "accessors.h"
void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
const char *name)
{
struct btrfs_super_block *disk_super;
u64 features;
disk_super = fs_info->super_copy;
features = btrfs_super_incompat_flags(disk_super);
if (!(features & flag)) {
spin_lock(&fs_info->super_lock);
features = btrfs_super_incompat_flags(disk_super);
if (!(features & flag)) {
features |= flag;
btrfs_set_super_incompat_flags(disk_super, features);
btrfs_info(fs_info,
"setting incompat feature flag for %s (0x%llx)",
name, flag);
}
spin_unlock(&fs_info->super_lock);
}
}
void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
const char *name)
{
struct btrfs_super_block *disk_super;
u64 features;
disk_super = fs_info->super_copy;
features = btrfs_super_incompat_flags(disk_super);
if (features & flag) {
spin_lock(&fs_info->super_lock);
features = btrfs_super_incompat_flags(disk_super);
if (features & flag) {
features &= ~flag;
btrfs_set_super_incompat_flags(disk_super, features);
btrfs_info(fs_info,
"clearing incompat feature flag for %s (0x%llx)",
name, flag);
}
spin_unlock(&fs_info->super_lock);
}
}
void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
const char *name)
{
struct btrfs_super_block *disk_super;
u64 features;
disk_super = fs_info->super_copy;
features = btrfs_super_compat_ro_flags(disk_super);
if (!(features & flag)) {
spin_lock(&fs_info->super_lock);
features = btrfs_super_compat_ro_flags(disk_super);
if (!(features & flag)) {
features |= flag;
btrfs_set_super_compat_ro_flags(disk_super, features);
btrfs_info(fs_info,
"setting compat-ro feature flag for %s (0x%llx)",
name, flag);
}
spin_unlock(&fs_info->super_lock);
}
}
void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
const char *name)
{
struct btrfs_super_block *disk_super;
u64 features;
disk_super = fs_info->super_copy;
features = btrfs_super_compat_ro_flags(disk_super);
if (features & flag) {
spin_lock(&fs_info->super_lock);
features = btrfs_super_compat_ro_flags(disk_super);
if (features & flag) {
features &= ~flag;
btrfs_set_super_compat_ro_flags(disk_super, features);
btrfs_info(fs_info,
"clearing compat-ro feature flag for %s (0x%llx)",
name, flag);
}
spin_unlock(&fs_info->super_lock);
}
}

976
fs/btrfs/fs.h Normal file
View File

@ -0,0 +1,976 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BTRFS_FS_H
#define BTRFS_FS_H
#include <linux/fs.h>
#include <linux/btrfs_tree.h>
#include <linux/sizes.h>
#include "extent-io-tree.h"
#include "extent_map.h"
#include "async-thread.h"
#include "block-rsv.h"
#define BTRFS_MAX_EXTENT_SIZE SZ_128M
#define BTRFS_OLDEST_GENERATION 0ULL
#define BTRFS_EMPTY_DIR_SIZE 0
#define BTRFS_DIRTY_METADATA_THRESH SZ_32M
#define BTRFS_SUPER_INFO_OFFSET SZ_64K
#define BTRFS_SUPER_INFO_SIZE 4096
static_assert(sizeof(struct btrfs_super_block) == BTRFS_SUPER_INFO_SIZE);
/*
* The reserved space at the beginning of each device. It covers the primary
* super block and leaves space for potential use by other tools like
* bootloaders or to lower potential damage of accidental overwrite.
*/
#define BTRFS_DEVICE_RANGE_RESERVED (SZ_1M)
/*
* Runtime (in-memory) states of filesystem
*/
enum {
/* Global indicator of serious filesystem errors */
BTRFS_FS_STATE_ERROR,
/*
* Filesystem is being remounted, allow to skip some operations, like
* defrag
*/
BTRFS_FS_STATE_REMOUNTING,
/* Filesystem in RO mode */
BTRFS_FS_STATE_RO,
/* Track if a transaction abort has been reported on this filesystem */
BTRFS_FS_STATE_TRANS_ABORTED,
/*
* Bio operations should be blocked on this filesystem because a source
* or target device is being destroyed as part of a device replace
*/
BTRFS_FS_STATE_DEV_REPLACING,
/* The btrfs_fs_info created for self-tests */
BTRFS_FS_STATE_DUMMY_FS_INFO,
BTRFS_FS_STATE_NO_CSUMS,
/* Indicates there was an error cleaning up a log tree. */
BTRFS_FS_STATE_LOG_CLEANUP_ERROR,
BTRFS_FS_STATE_COUNT
};
enum {
BTRFS_FS_CLOSING_START,
BTRFS_FS_CLOSING_DONE,
BTRFS_FS_LOG_RECOVERING,
BTRFS_FS_OPEN,
BTRFS_FS_QUOTA_ENABLED,
BTRFS_FS_UPDATE_UUID_TREE_GEN,
BTRFS_FS_CREATING_FREE_SPACE_TREE,
BTRFS_FS_BTREE_ERR,
BTRFS_FS_LOG1_ERR,
BTRFS_FS_LOG2_ERR,
BTRFS_FS_QUOTA_OVERRIDE,
/* Used to record internally whether fs has been frozen */
BTRFS_FS_FROZEN,
/*
* Indicate that balance has been set up from the ioctl and is in the
* main phase. The fs_info::balance_ctl is initialized.
*/
BTRFS_FS_BALANCE_RUNNING,
/*
* Indicate that relocation of a chunk has started, it's set per chunk
* and is toggled between chunks.
*/
BTRFS_FS_RELOC_RUNNING,
/* Indicate that the cleaner thread is awake and doing something. */
BTRFS_FS_CLEANER_RUNNING,
/*
* The checksumming has an optimized version and is considered fast,
* so we don't need to offload checksums to workqueues.
*/
BTRFS_FS_CSUM_IMPL_FAST,
/* Indicate that the discard workqueue can service discards. */
BTRFS_FS_DISCARD_RUNNING,
/* Indicate that we need to cleanup space cache v1 */
BTRFS_FS_CLEANUP_SPACE_CACHE_V1,
/* Indicate that we can't trust the free space tree for caching yet */
BTRFS_FS_FREE_SPACE_TREE_UNTRUSTED,
/* Indicate whether there are any tree modification log users */
BTRFS_FS_TREE_MOD_LOG_USERS,
/* Indicate that we want the transaction kthread to commit right now. */
BTRFS_FS_COMMIT_TRANS,
/* Indicate we have half completed snapshot deletions pending. */
BTRFS_FS_UNFINISHED_DROPS,
/* Indicate we have to finish a zone to do next allocation. */
BTRFS_FS_NEED_ZONE_FINISH,
/* Indicate that we want to commit the transaction. */
BTRFS_FS_NEED_TRANS_COMMIT,
#if BITS_PER_LONG == 32
/* Indicate if we have error/warn message printed on 32bit systems */
BTRFS_FS_32BIT_ERROR,
BTRFS_FS_32BIT_WARN,
#endif
};
/*
* Flags for mount options.
*
* Note: don't forget to add new options to btrfs_show_options()
*/
enum {
BTRFS_MOUNT_NODATASUM = (1UL << 0),
BTRFS_MOUNT_NODATACOW = (1UL << 1),
BTRFS_MOUNT_NOBARRIER = (1UL << 2),
BTRFS_MOUNT_SSD = (1UL << 3),
BTRFS_MOUNT_DEGRADED = (1UL << 4),
BTRFS_MOUNT_COMPRESS = (1UL << 5),
BTRFS_MOUNT_NOTREELOG = (1UL << 6),
BTRFS_MOUNT_FLUSHONCOMMIT = (1UL << 7),
BTRFS_MOUNT_SSD_SPREAD = (1UL << 8),
BTRFS_MOUNT_NOSSD = (1UL << 9),
BTRFS_MOUNT_DISCARD_SYNC = (1UL << 10),
BTRFS_MOUNT_FORCE_COMPRESS = (1UL << 11),
BTRFS_MOUNT_SPACE_CACHE = (1UL << 12),
BTRFS_MOUNT_CLEAR_CACHE = (1UL << 13),
BTRFS_MOUNT_USER_SUBVOL_RM_ALLOWED = (1UL << 14),
BTRFS_MOUNT_ENOSPC_DEBUG = (1UL << 15),
BTRFS_MOUNT_AUTO_DEFRAG = (1UL << 16),
BTRFS_MOUNT_USEBACKUPROOT = (1UL << 17),
BTRFS_MOUNT_SKIP_BALANCE = (1UL << 18),
BTRFS_MOUNT_CHECK_INTEGRITY = (1UL << 19),
BTRFS_MOUNT_CHECK_INTEGRITY_DATA = (1UL << 20),
BTRFS_MOUNT_PANIC_ON_FATAL_ERROR = (1UL << 21),
BTRFS_MOUNT_RESCAN_UUID_TREE = (1UL << 22),
BTRFS_MOUNT_FRAGMENT_DATA = (1UL << 23),
BTRFS_MOUNT_FRAGMENT_METADATA = (1UL << 24),
BTRFS_MOUNT_FREE_SPACE_TREE = (1UL << 25),
BTRFS_MOUNT_NOLOGREPLAY = (1UL << 26),
BTRFS_MOUNT_REF_VERIFY = (1UL << 27),
BTRFS_MOUNT_DISCARD_ASYNC = (1UL << 28),
BTRFS_MOUNT_IGNOREBADROOTS = (1UL << 29),
BTRFS_MOUNT_IGNOREDATACSUMS = (1UL << 30),
BTRFS_MOUNT_NODISCARD = (1UL << 31),
};
/*
* Compat flags that we support. If any incompat flags are set other than the
* ones specified below then we will fail to mount
*/
#define BTRFS_FEATURE_COMPAT_SUPP 0ULL
#define BTRFS_FEATURE_COMPAT_SAFE_SET 0ULL
#define BTRFS_FEATURE_COMPAT_SAFE_CLEAR 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SUPP \
(BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE | \
BTRFS_FEATURE_COMPAT_RO_FREE_SPACE_TREE_VALID | \
BTRFS_FEATURE_COMPAT_RO_VERITY | \
BTRFS_FEATURE_COMPAT_RO_BLOCK_GROUP_TREE)
#define BTRFS_FEATURE_COMPAT_RO_SAFE_SET 0ULL
#define BTRFS_FEATURE_COMPAT_RO_SAFE_CLEAR 0ULL
#ifdef CONFIG_BTRFS_DEBUG
/*
* Extent tree v2 supported only with CONFIG_BTRFS_DEBUG
*/
#define BTRFS_FEATURE_INCOMPAT_SUPP \
(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \
BTRFS_FEATURE_INCOMPAT_RAID56 | \
BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \
BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
BTRFS_FEATURE_INCOMPAT_ZONED | \
BTRFS_FEATURE_INCOMPAT_EXTENT_TREE_V2)
#else
#define BTRFS_FEATURE_INCOMPAT_SUPP \
(BTRFS_FEATURE_INCOMPAT_MIXED_BACKREF | \
BTRFS_FEATURE_INCOMPAT_DEFAULT_SUBVOL | \
BTRFS_FEATURE_INCOMPAT_MIXED_GROUPS | \
BTRFS_FEATURE_INCOMPAT_BIG_METADATA | \
BTRFS_FEATURE_INCOMPAT_COMPRESS_LZO | \
BTRFS_FEATURE_INCOMPAT_COMPRESS_ZSTD | \
BTRFS_FEATURE_INCOMPAT_RAID56 | \
BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF | \
BTRFS_FEATURE_INCOMPAT_SKINNY_METADATA | \
BTRFS_FEATURE_INCOMPAT_NO_HOLES | \
BTRFS_FEATURE_INCOMPAT_METADATA_UUID | \
BTRFS_FEATURE_INCOMPAT_RAID1C34 | \
BTRFS_FEATURE_INCOMPAT_ZONED)
#endif
#define BTRFS_FEATURE_INCOMPAT_SAFE_SET \
(BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
#define BTRFS_FEATURE_INCOMPAT_SAFE_CLEAR 0ULL
#define BTRFS_DEFAULT_COMMIT_INTERVAL (30)
#define BTRFS_DEFAULT_MAX_INLINE (2048)
struct btrfs_dev_replace {
/* See #define above */
u64 replace_state;
/* Seconds since 1-Jan-1970 */
time64_t time_started;
/* Seconds since 1-Jan-1970 */
time64_t time_stopped;
atomic64_t num_write_errors;
atomic64_t num_uncorrectable_read_errors;
u64 cursor_left;
u64 committed_cursor_left;
u64 cursor_left_last_write_of_item;
u64 cursor_right;
/* See #define above */
u64 cont_reading_from_srcdev_mode;
int is_valid;
int item_needs_writeback;
struct btrfs_device *srcdev;
struct btrfs_device *tgtdev;
struct mutex lock_finishing_cancel_unmount;
struct rw_semaphore rwsem;
struct btrfs_scrub_progress scrub_progress;
struct percpu_counter bio_counter;
wait_queue_head_t replace_wait;
};
/*
* Free clusters are used to claim free space in relatively large chunks,
* allowing us to do less seeky writes. They are used for all metadata
* allocations. In ssd_spread mode they are also used for data allocations.
*/
struct btrfs_free_cluster {
spinlock_t lock;
spinlock_t refill_lock;
struct rb_root root;
/* Largest extent in this cluster */
u64 max_size;
/* First extent starting offset */
u64 window_start;
/* We did a full search and couldn't create a cluster */
bool fragmented;
struct btrfs_block_group *block_group;
/*
* When a cluster is allocated from a block group, we put the cluster
* onto a list in the block group so that it can be freed before the
* block group is freed.
*/
struct list_head block_group_list;
};
/* Discard control. */
/*
* Async discard uses multiple lists to differentiate the discard filter
* parameters. Index 0 is for completely free block groups where we need to
* ensure the entire block group is trimmed without being lossy. Indices
* afterwards represent monotonically decreasing discard filter sizes to
* prioritize what should be discarded next.
*/
#define BTRFS_NR_DISCARD_LISTS 3
#define BTRFS_DISCARD_INDEX_UNUSED 0
#define BTRFS_DISCARD_INDEX_START 1
struct btrfs_discard_ctl {
struct workqueue_struct *discard_workers;
struct delayed_work work;
spinlock_t lock;
struct btrfs_block_group *block_group;
struct list_head discard_list[BTRFS_NR_DISCARD_LISTS];
u64 prev_discard;
u64 prev_discard_time;
atomic_t discardable_extents;
atomic64_t discardable_bytes;
u64 max_discard_size;
u64 delay_ms;
u32 iops_limit;
u32 kbps_limit;
u64 discard_extent_bytes;
u64 discard_bitmap_bytes;
atomic64_t discard_bytes_saved;
};
/*
* Exclusive operations (device replace, resize, device add/remove, balance)
*/
enum btrfs_exclusive_operation {
BTRFS_EXCLOP_NONE,
BTRFS_EXCLOP_BALANCE_PAUSED,
BTRFS_EXCLOP_BALANCE,
BTRFS_EXCLOP_DEV_ADD,
BTRFS_EXCLOP_DEV_REMOVE,
BTRFS_EXCLOP_DEV_REPLACE,
BTRFS_EXCLOP_RESIZE,
BTRFS_EXCLOP_SWAP_ACTIVATE,
};
/* Store data about transaction commits, exported via sysfs. */
struct btrfs_commit_stats {
/* Total number of commits */
u64 commit_count;
/* The maximum commit duration so far in ns */
u64 max_commit_dur;
/* The last commit duration in ns */
u64 last_commit_dur;
/* The total commit duration in ns */
u64 total_commit_dur;
};
struct btrfs_fs_info {
u8 chunk_tree_uuid[BTRFS_UUID_SIZE];
unsigned long flags;
struct btrfs_root *tree_root;
struct btrfs_root *chunk_root;
struct btrfs_root *dev_root;
struct btrfs_root *fs_root;
struct btrfs_root *quota_root;
struct btrfs_root *uuid_root;
struct btrfs_root *data_reloc_root;
struct btrfs_root *block_group_root;
/* The log root tree is a directory of all the other log roots */
struct btrfs_root *log_root_tree;
/* The tree that holds the global roots (csum, extent, etc) */
rwlock_t global_root_lock;
struct rb_root global_root_tree;
spinlock_t fs_roots_radix_lock;
struct radix_tree_root fs_roots_radix;
/* Block group cache stuff */
rwlock_t block_group_cache_lock;
struct rb_root_cached block_group_cache_tree;
/* Keep track of unallocated space */
atomic64_t free_chunk_space;
/* Track ranges which are used by log trees blocks/logged data extents */
struct extent_io_tree excluded_extents;
/* logical->physical extent mapping */
struct extent_map_tree mapping_tree;
/*
* Block reservation for extent, checksum, root tree and delayed dir
* index item.
*/
struct btrfs_block_rsv global_block_rsv;
/* Block reservation for metadata operations */
struct btrfs_block_rsv trans_block_rsv;
/* Block reservation for chunk tree */
struct btrfs_block_rsv chunk_block_rsv;
/* Block reservation for delayed operations */
struct btrfs_block_rsv delayed_block_rsv;
/* Block reservation for delayed refs */
struct btrfs_block_rsv delayed_refs_rsv;
struct btrfs_block_rsv empty_block_rsv;
u64 generation;
u64 last_trans_committed;
/*
* Generation of the last transaction used for block group relocation
* since the filesystem was last mounted (or 0 if none happened yet).
* Must be written and read while holding btrfs_fs_info::commit_root_sem.
*/
u64 last_reloc_trans;
u64 avg_delayed_ref_runtime;
/*
* This is updated to the current trans every time a full commit is
* required instead of the faster short fsync log commits
*/
u64 last_trans_log_full_commit;
unsigned long mount_opt;
unsigned long compress_type:4;
unsigned int compress_level;
u32 commit_interval;
/*
* It is a suggestive number, the read side is safe even it gets a
* wrong number because we will write out the data into a regular
* extent. The write side(mount/remount) is under ->s_umount lock,
* so it is also safe.
*/
u64 max_inline;
struct btrfs_transaction *running_transaction;
wait_queue_head_t transaction_throttle;
wait_queue_head_t transaction_wait;
wait_queue_head_t transaction_blocked_wait;
wait_queue_head_t async_submit_wait;
/*
* Used to protect the incompat_flags, compat_flags, compat_ro_flags
* when they are updated.
*
* Because we do not clear the flags for ever, so we needn't use
* the lock on the read side.
*
* We also needn't use the lock when we mount the fs, because
* there is no other task which will update the flag.
*/
spinlock_t super_lock;
struct btrfs_super_block *super_copy;
struct btrfs_super_block *super_for_commit;
struct super_block *sb;
struct inode *btree_inode;
struct mutex tree_log_mutex;
struct mutex transaction_kthread_mutex;
struct mutex cleaner_mutex;
struct mutex chunk_mutex;
/*
* This is taken to make sure we don't set block groups ro after the
* free space cache has been allocated on them.
*/
struct mutex ro_block_group_mutex;
/*
* This is used during read/modify/write to make sure no two ios are
* trying to mod the same stripe at the same time.
*/
struct btrfs_stripe_hash_table *stripe_hash_table;
/*
* This protects the ordered operations list only while we are
* processing all of the entries on it. This way we make sure the
* commit code doesn't find the list temporarily empty because another
* function happens to be doing non-waiting preflush before jumping
* into the main commit.
*/
struct mutex ordered_operations_mutex;
struct rw_semaphore commit_root_sem;
struct rw_semaphore cleanup_work_sem;
struct rw_semaphore subvol_sem;
spinlock_t trans_lock;
/*
* The reloc mutex goes with the trans lock, it is taken during commit
* to protect us from the relocation code.
*/
struct mutex reloc_mutex;
struct list_head trans_list;
struct list_head dead_roots;
struct list_head caching_block_groups;
spinlock_t delayed_iput_lock;
struct list_head delayed_iputs;
atomic_t nr_delayed_iputs;
wait_queue_head_t delayed_iputs_wait;
atomic64_t tree_mod_seq;
/* This protects tree_mod_log and tree_mod_seq_list */
rwlock_t tree_mod_log_lock;
struct rb_root tree_mod_log;
struct list_head tree_mod_seq_list;
atomic_t async_delalloc_pages;
/* This is used to protect the following list -- ordered_roots. */
spinlock_t ordered_root_lock;
/*
* All fs/file tree roots in which there are data=ordered extents
* pending writeback are added into this list.
*
* These can span multiple transactions and basically include every
* dirty data page that isn't from nodatacow.
*/
struct list_head ordered_roots;
struct mutex delalloc_root_mutex;
spinlock_t delalloc_root_lock;
/* All fs/file tree roots that have delalloc inodes. */
struct list_head delalloc_roots;
/*
* There is a pool of worker threads for checksumming during writes and
* a pool for checksumming after reads. This is because readers can
* run with FS locks held, and the writers may be waiting for those
* locks. We don't want ordering in the pending list to cause
* deadlocks, and so the two are serviced separately.
*
* A third pool does submit_bio to avoid deadlocking with the other two.
*/
struct btrfs_workqueue *workers;
struct btrfs_workqueue *hipri_workers;
struct btrfs_workqueue *delalloc_workers;
struct btrfs_workqueue *flush_workers;
struct workqueue_struct *endio_workers;
struct workqueue_struct *endio_meta_workers;
struct workqueue_struct *rmw_workers;
struct workqueue_struct *compressed_write_workers;
struct btrfs_workqueue *endio_write_workers;
struct btrfs_workqueue *endio_freespace_worker;
struct btrfs_workqueue *caching_workers;
/*
* Fixup workers take dirty pages that didn't properly go through the
* cow mechanism and make them safe to write. It happens for the
* sys_munmap function call path.
*/
struct btrfs_workqueue *fixup_workers;
struct btrfs_workqueue *delayed_workers;
struct task_struct *transaction_kthread;
struct task_struct *cleaner_kthread;
u32 thread_pool_size;
struct kobject *space_info_kobj;
struct kobject *qgroups_kobj;
struct kobject *discard_kobj;
/* Used to keep from writing metadata until there is a nice batch */
struct percpu_counter dirty_metadata_bytes;
struct percpu_counter delalloc_bytes;
struct percpu_counter ordered_bytes;
s32 dirty_metadata_batch;
s32 delalloc_batch;
struct list_head dirty_cowonly_roots;
struct btrfs_fs_devices *fs_devices;
/*
* The space_info list is effectively read only after initial setup.
* It is populated at mount time and cleaned up after all block groups
* are removed. RCU is used to protect it.
*/
struct list_head space_info;
struct btrfs_space_info *data_sinfo;
struct reloc_control *reloc_ctl;
/* data_alloc_cluster is only used in ssd_spread mode */
struct btrfs_free_cluster data_alloc_cluster;
/* All metadata allocations go through this cluster. */
struct btrfs_free_cluster meta_alloc_cluster;
/* Auto defrag inodes go here. */
spinlock_t defrag_inodes_lock;
struct rb_root defrag_inodes;
atomic_t defrag_running;
/* Used to protect avail_{data, metadata, system}_alloc_bits */
seqlock_t profiles_lock;
/*
* These three are in extended format (availability of single chunks is
* denoted by BTRFS_AVAIL_ALLOC_BIT_SINGLE bit, other types are denoted
* by corresponding BTRFS_BLOCK_GROUP_* bits)
*/
u64 avail_data_alloc_bits;
u64 avail_metadata_alloc_bits;
u64 avail_system_alloc_bits;
/* Balance state */
spinlock_t balance_lock;
struct mutex balance_mutex;
atomic_t balance_pause_req;
atomic_t balance_cancel_req;
struct btrfs_balance_control *balance_ctl;
wait_queue_head_t balance_wait_q;
/* Cancellation requests for chunk relocation */
atomic_t reloc_cancel_req;
u32 data_chunk_allocations;
u32 metadata_ratio;
void *bdev_holder;
/* Private scrub information */
struct mutex scrub_lock;
atomic_t scrubs_running;
atomic_t scrub_pause_req;
atomic_t scrubs_paused;
atomic_t scrub_cancel_req;
wait_queue_head_t scrub_pause_wait;
/*
* The worker pointers are NULL iff the refcount is 0, ie. scrub is not
* running.
*/
refcount_t scrub_workers_refcnt;
struct workqueue_struct *scrub_workers;
struct workqueue_struct *scrub_wr_completion_workers;
struct workqueue_struct *scrub_parity_workers;
struct btrfs_subpage_info *subpage_info;
struct btrfs_discard_ctl discard_ctl;
#ifdef CONFIG_BTRFS_FS_CHECK_INTEGRITY
u32 check_integrity_print_mask;
#endif
/* Is qgroup tracking in a consistent state? */
u64 qgroup_flags;
/* Holds configuration and tracking. Protected by qgroup_lock. */
struct rb_root qgroup_tree;
spinlock_t qgroup_lock;
/*
* Used to avoid frequently calling ulist_alloc()/ulist_free()
* when doing qgroup accounting, it must be protected by qgroup_lock.
*/
struct ulist *qgroup_ulist;
/*
* Protect user change for quota operations. If a transaction is needed,
* it must be started before locking this lock.
*/
struct mutex qgroup_ioctl_lock;
/* List of dirty qgroups to be written at next commit. */
struct list_head dirty_qgroups;
/* Used by qgroup for an efficient tree traversal. */
u64 qgroup_seq;
/* Qgroup rescan items. */
/* Protects the progress item */
struct mutex qgroup_rescan_lock;
struct btrfs_key qgroup_rescan_progress;
struct btrfs_workqueue *qgroup_rescan_workers;
struct completion qgroup_rescan_completion;
struct btrfs_work qgroup_rescan_work;
/* Protected by qgroup_rescan_lock */
bool qgroup_rescan_running;
u8 qgroup_drop_subtree_thres;
/* Filesystem state */
unsigned long fs_state;
struct btrfs_delayed_root *delayed_root;
/* Extent buffer radix tree */
spinlock_t buffer_lock;
/* Entries are eb->start / sectorsize */
struct radix_tree_root buffer_radix;
/* Next backup root to be overwritten */
int backup_root_index;
/* Device replace state */
struct btrfs_dev_replace dev_replace;
struct semaphore uuid_tree_rescan_sem;
/* Used to reclaim the metadata space in the background. */
struct work_struct async_reclaim_work;
struct work_struct async_data_reclaim_work;
struct work_struct preempt_reclaim_work;
/* Reclaim partially filled block groups in the background */
struct work_struct reclaim_bgs_work;
struct list_head reclaim_bgs;
int bg_reclaim_threshold;
spinlock_t unused_bgs_lock;
struct list_head unused_bgs;
struct mutex unused_bg_unpin_mutex;
/* Protect block groups that are going to be deleted */
struct mutex reclaim_bgs_lock;
/* Cached block sizes */
u32 nodesize;
u32 sectorsize;
/* ilog2 of sectorsize, use to avoid 64bit division */
u32 sectorsize_bits;
u32 csum_size;
u32 csums_per_leaf;
u32 stripesize;
/*
* Maximum size of an extent. BTRFS_MAX_EXTENT_SIZE on regular
* filesystem, on zoned it depends on the device constraints.
*/
u64 max_extent_size;
/* Block groups and devices containing active swapfiles. */
spinlock_t swapfile_pins_lock;
struct rb_root swapfile_pins;
struct crypto_shash *csum_shash;
/* Type of exclusive operation running, protected by super_lock */
enum btrfs_exclusive_operation exclusive_operation;
/*
* Zone size > 0 when in ZONED mode, otherwise it's used for a check
* if the mode is enabled
*/
u64 zone_size;
/* Max size to emit ZONE_APPEND write command */
u64 max_zone_append_size;
struct mutex zoned_meta_io_lock;
spinlock_t treelog_bg_lock;
u64 treelog_bg;
/*
* Start of the dedicated data relocation block group, protected by
* relocation_bg_lock.
*/
spinlock_t relocation_bg_lock;
u64 data_reloc_bg;
struct mutex zoned_data_reloc_io_lock;
u64 nr_global_roots;
spinlock_t zone_active_bgs_lock;
struct list_head zone_active_bgs;
/* Updates are not protected by any lock */
struct btrfs_commit_stats commit_stats;
/*
* Last generation where we dropped a non-relocation root.
* Use btrfs_set_last_root_drop_gen() and btrfs_get_last_root_drop_gen()
* to change it and to read it, respectively.
*/
u64 last_root_drop_gen;
/*
* Annotations for transaction events (structures are empty when
* compiled without lockdep).
*/
struct lockdep_map btrfs_trans_num_writers_map;
struct lockdep_map btrfs_trans_num_extwriters_map;
struct lockdep_map btrfs_state_change_map[4];
struct lockdep_map btrfs_trans_pending_ordered_map;
struct lockdep_map btrfs_ordered_extent_map;
#ifdef CONFIG_BTRFS_FS_REF_VERIFY
spinlock_t ref_verify_lock;
struct rb_root block_tree;
#endif
#ifdef CONFIG_BTRFS_DEBUG
struct kobject *debug_kobj;
struct list_head allocated_roots;
spinlock_t eb_leak_lock;
struct list_head allocated_ebs;
#endif
};
static inline void btrfs_set_last_root_drop_gen(struct btrfs_fs_info *fs_info,
u64 gen)
{
WRITE_ONCE(fs_info->last_root_drop_gen, gen);
}
static inline u64 btrfs_get_last_root_drop_gen(const struct btrfs_fs_info *fs_info)
{
return READ_ONCE(fs_info->last_root_drop_gen);
}
/*
* Take the number of bytes to be checksummed and figure out how many leaves
* it would require to store the csums for that many bytes.
*/
static inline u64 btrfs_csum_bytes_to_leaves(
const struct btrfs_fs_info *fs_info, u64 csum_bytes)
{
const u64 num_csums = csum_bytes >> fs_info->sectorsize_bits;
return DIV_ROUND_UP_ULL(num_csums, fs_info->csums_per_leaf);
}
/*
* Use this if we would be adding new items, as we could split nodes as we cow
* down the tree.
*/
static inline u64 btrfs_calc_insert_metadata_size(struct btrfs_fs_info *fs_info,
unsigned num_items)
{
return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * 2 * num_items;
}
/*
* Doing a truncate or a modification won't result in new nodes or leaves, just
* what we need for COW.
*/
static inline u64 btrfs_calc_metadata_size(struct btrfs_fs_info *fs_info,
unsigned num_items)
{
return (u64)fs_info->nodesize * BTRFS_MAX_LEVEL * num_items;
}
#define BTRFS_MAX_EXTENT_ITEM_SIZE(r) ((BTRFS_LEAF_DATA_SIZE(r->fs_info) >> 4) - \
sizeof(struct btrfs_item))
static inline bool btrfs_is_zoned(const struct btrfs_fs_info *fs_info)
{
return fs_info->zone_size > 0;
}
/*
* Count how many fs_info->max_extent_size cover the @size
*/
static inline u32 count_max_extents(struct btrfs_fs_info *fs_info, u64 size)
{
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
if (!fs_info)
return div_u64(size + BTRFS_MAX_EXTENT_SIZE - 1, BTRFS_MAX_EXTENT_SIZE);
#endif
return div_u64(size + fs_info->max_extent_size - 1, fs_info->max_extent_size);
}
bool btrfs_exclop_start(struct btrfs_fs_info *fs_info,
enum btrfs_exclusive_operation type);
bool btrfs_exclop_start_try_lock(struct btrfs_fs_info *fs_info,
enum btrfs_exclusive_operation type);
void btrfs_exclop_start_unlock(struct btrfs_fs_info *fs_info);
void btrfs_exclop_finish(struct btrfs_fs_info *fs_info);
void btrfs_exclop_balance(struct btrfs_fs_info *fs_info,
enum btrfs_exclusive_operation op);
/* Compatibility and incompatibility defines */
void __btrfs_set_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
const char *name);
void __btrfs_clear_fs_incompat(struct btrfs_fs_info *fs_info, u64 flag,
const char *name);
void __btrfs_set_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
const char *name);
void __btrfs_clear_fs_compat_ro(struct btrfs_fs_info *fs_info, u64 flag,
const char *name);
#define __btrfs_fs_incompat(fs_info, flags) \
(!!(btrfs_super_incompat_flags((fs_info)->super_copy) & (flags)))
#define __btrfs_fs_compat_ro(fs_info, flags) \
(!!(btrfs_super_compat_ro_flags((fs_info)->super_copy) & (flags)))
#define btrfs_set_fs_incompat(__fs_info, opt) \
__btrfs_set_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, #opt)
#define btrfs_clear_fs_incompat(__fs_info, opt) \
__btrfs_clear_fs_incompat((__fs_info), BTRFS_FEATURE_INCOMPAT_##opt, #opt)
#define btrfs_fs_incompat(fs_info, opt) \
__btrfs_fs_incompat((fs_info), BTRFS_FEATURE_INCOMPAT_##opt)
#define btrfs_set_fs_compat_ro(__fs_info, opt) \
__btrfs_set_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, #opt)
#define btrfs_clear_fs_compat_ro(__fs_info, opt) \
__btrfs_clear_fs_compat_ro((__fs_info), BTRFS_FEATURE_COMPAT_RO_##opt, #opt)
#define btrfs_fs_compat_ro(fs_info, opt) \
__btrfs_fs_compat_ro((fs_info), BTRFS_FEATURE_COMPAT_RO_##opt)
#define btrfs_clear_opt(o, opt) ((o) &= ~BTRFS_MOUNT_##opt)
#define btrfs_set_opt(o, opt) ((o) |= BTRFS_MOUNT_##opt)
#define btrfs_raw_test_opt(o, opt) ((o) & BTRFS_MOUNT_##opt)
#define btrfs_test_opt(fs_info, opt) ((fs_info)->mount_opt & \
BTRFS_MOUNT_##opt)
#define btrfs_set_and_info(fs_info, opt, fmt, args...) \
do { \
if (!btrfs_test_opt(fs_info, opt)) \
btrfs_info(fs_info, fmt, ##args); \
btrfs_set_opt(fs_info->mount_opt, opt); \
} while (0)
#define btrfs_clear_and_info(fs_info, opt, fmt, args...) \
do { \
if (btrfs_test_opt(fs_info, opt)) \
btrfs_info(fs_info, fmt, ##args); \
btrfs_clear_opt(fs_info->mount_opt, opt); \
} while (0)
static inline int btrfs_fs_closing(struct btrfs_fs_info *fs_info)
{
/* Do it this way so we only ever do one test_bit in the normal case. */
if (test_bit(BTRFS_FS_CLOSING_START, &fs_info->flags)) {
if (test_bit(BTRFS_FS_CLOSING_DONE, &fs_info->flags))
return 2;
return 1;
}
return 0;
}
/*
* If we remount the fs to be R/O or umount the fs, the cleaner needn't do
* anything except sleeping. This function is used to check the status of
* the fs.
* We check for BTRFS_FS_STATE_RO to avoid races with a concurrent remount,
* since setting and checking for SB_RDONLY in the superblock's flags is not
* atomic.
*/
static inline int btrfs_need_cleaner_sleep(struct btrfs_fs_info *fs_info)
{
return test_bit(BTRFS_FS_STATE_RO, &fs_info->fs_state) ||
btrfs_fs_closing(fs_info);
}
static inline void btrfs_wake_unfinished_drop(struct btrfs_fs_info *fs_info)
{
clear_and_wake_up_bit(BTRFS_FS_UNFINISHED_DROPS, &fs_info->flags);
}
#define BTRFS_FS_ERROR(fs_info) (unlikely(test_bit(BTRFS_FS_STATE_ERROR, \
&(fs_info)->fs_state)))
#define BTRFS_FS_LOG_CLEANUP_ERROR(fs_info) \
(unlikely(test_bit(BTRFS_FS_STATE_LOG_CLEANUP_ERROR, \
&(fs_info)->fs_state)))
#ifdef CONFIG_BTRFS_FS_RUN_SANITY_TESTS
#define EXPORT_FOR_TESTS
static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
{
return test_bit(BTRFS_FS_STATE_DUMMY_FS_INFO, &fs_info->fs_state);
}
void btrfs_test_destroy_inode(struct inode *inode);
#else
#define EXPORT_FOR_TESTS static
static inline int btrfs_is_testing(struct btrfs_fs_info *fs_info)
{
return 0;
}
#endif
#endif

View File

@ -4,14 +4,20 @@
*/
#include "ctree.h"
#include "fs.h"
#include "messages.h"
#include "inode-item.h"
#include "disk-io.h"
#include "transaction.h"
#include "print-tree.h"
#include "space-info.h"
#include "accessors.h"
#include "extent-tree.h"
#include "file-item.h"
struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
int slot, const char *name,
int name_len)
int slot,
const struct fscrypt_str *name)
{
struct btrfs_inode_ref *ref;
unsigned long ptr;
@ -27,9 +33,10 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
len = btrfs_inode_ref_name_len(leaf, ref);
name_ptr = (unsigned long)(ref + 1);
cur_offset += len + sizeof(*ref);
if (len != name_len)
if (len != name->len)
continue;
if (memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0)
if (memcmp_extent_buffer(leaf, name->name, name_ptr,
name->len) == 0)
return ref;
}
return NULL;
@ -37,7 +44,7 @@ struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
struct extent_buffer *leaf, int slot, u64 ref_objectid,
const char *name, int name_len)
const struct fscrypt_str *name)
{
struct btrfs_inode_extref *extref;
unsigned long ptr;
@ -60,9 +67,10 @@ struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
name_ptr = (unsigned long)(&extref->name);
ref_name_len = btrfs_inode_extref_name_len(leaf, extref);
if (ref_name_len == name_len &&
if (ref_name_len == name->len &&
btrfs_inode_extref_parent(leaf, extref) == ref_objectid &&
(memcmp_extent_buffer(leaf, name, name_ptr, name_len) == 0))
(memcmp_extent_buffer(leaf, name->name, name_ptr,
name->len) == 0))
return extref;
cur_offset += ref_name_len + sizeof(*extref);
@ -75,7 +83,7 @@ struct btrfs_inode_extref *
btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
const char *name, int name_len,
const struct fscrypt_str *name,
u64 inode_objectid, u64 ref_objectid, int ins_len,
int cow)
{
@ -84,7 +92,7 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
key.objectid = inode_objectid;
key.type = BTRFS_INODE_EXTREF_KEY;
key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len);
ret = btrfs_search_slot(trans, root, &key, path, ins_len, cow);
if (ret < 0)
@ -92,13 +100,13 @@ btrfs_lookup_inode_extref(struct btrfs_trans_handle *trans,
if (ret > 0)
return NULL;
return btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0],
ref_objectid, name, name_len);
ref_objectid, name);
}
static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
const struct fscrypt_str *name,
u64 inode_objectid, u64 ref_objectid,
u64 *index)
{
@ -107,14 +115,14 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
struct btrfs_inode_extref *extref;
struct extent_buffer *leaf;
int ret;
int del_len = name_len + sizeof(*extref);
int del_len = name->len + sizeof(*extref);
unsigned long ptr;
unsigned long item_start;
u32 item_size;
key.objectid = inode_objectid;
key.type = BTRFS_INODE_EXTREF_KEY;
key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len);
path = btrfs_alloc_path();
if (!path)
@ -132,7 +140,7 @@ static int btrfs_del_inode_extref(struct btrfs_trans_handle *trans,
* readonly.
*/
extref = btrfs_find_name_in_ext_backref(path->nodes[0], path->slots[0],
ref_objectid, name, name_len);
ref_objectid, name);
if (!extref) {
btrfs_handle_fs_error(root->fs_info, -ENOENT, NULL);
ret = -EROFS;
@ -168,8 +176,7 @@ out:
}
int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
struct btrfs_root *root, const struct fscrypt_str *name,
u64 inode_objectid, u64 ref_objectid, u64 *index)
{
struct btrfs_path *path;
@ -182,7 +189,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
u32 sub_item_len;
int ret;
int search_ext_refs = 0;
int del_len = name_len + sizeof(*ref);
int del_len = name->len + sizeof(*ref);
key.objectid = inode_objectid;
key.offset = ref_objectid;
@ -201,8 +208,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
goto out;
}
ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], name,
name_len);
ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0], name);
if (!ref) {
ret = -ENOENT;
search_ext_refs = 1;
@ -219,7 +225,7 @@ int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
goto out;
}
ptr = (unsigned long)ref;
sub_item_len = name_len + sizeof(*ref);
sub_item_len = name->len + sizeof(*ref);
item_start = btrfs_item_ptr_offset(leaf, path->slots[0]);
memmove_extent_buffer(leaf, ptr, ptr + sub_item_len,
item_size - (ptr + sub_item_len - item_start));
@ -233,7 +239,7 @@ out:
* name in our ref array. Find and remove the extended
* inode ref then.
*/
return btrfs_del_inode_extref(trans, root, name, name_len,
return btrfs_del_inode_extref(trans, root, name,
inode_objectid, ref_objectid, index);
}
@ -247,12 +253,13 @@ out:
*/
static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
u64 inode_objectid, u64 ref_objectid, u64 index)
const struct fscrypt_str *name,
u64 inode_objectid, u64 ref_objectid,
u64 index)
{
struct btrfs_inode_extref *extref;
int ret;
int ins_len = name_len + sizeof(*extref);
int ins_len = name->len + sizeof(*extref);
unsigned long ptr;
struct btrfs_path *path;
struct btrfs_key key;
@ -260,7 +267,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
key.objectid = inode_objectid;
key.type = BTRFS_INODE_EXTREF_KEY;
key.offset = btrfs_extref_hash(ref_objectid, name, name_len);
key.offset = btrfs_extref_hash(ref_objectid, name->name, name->len);
path = btrfs_alloc_path();
if (!path)
@ -272,7 +279,7 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
if (btrfs_find_name_in_ext_backref(path->nodes[0],
path->slots[0],
ref_objectid,
name, name_len))
name))
goto out;
btrfs_extend_item(path, ins_len);
@ -286,12 +293,12 @@ static int btrfs_insert_inode_extref(struct btrfs_trans_handle *trans,
ptr += btrfs_item_size(leaf, path->slots[0]) - ins_len;
extref = (struct btrfs_inode_extref *)ptr;
btrfs_set_inode_extref_name_len(path->nodes[0], extref, name_len);
btrfs_set_inode_extref_name_len(path->nodes[0], extref, name->len);
btrfs_set_inode_extref_index(path->nodes[0], extref, index);
btrfs_set_inode_extref_parent(path->nodes[0], extref, ref_objectid);
ptr = (unsigned long)&extref->name;
write_extent_buffer(path->nodes[0], name, ptr, name_len);
write_extent_buffer(path->nodes[0], name->name, ptr, name->len);
btrfs_mark_buffer_dirty(path->nodes[0]);
out:
@ -301,8 +308,7 @@ out:
/* Will return 0, -ENOMEM, -EMLINK, or -EEXIST or anything from the CoW path */
int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
struct btrfs_root *root, const struct fscrypt_str *name,
u64 inode_objectid, u64 ref_objectid, u64 index)
{
struct btrfs_fs_info *fs_info = root->fs_info;
@ -311,7 +317,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_inode_ref *ref;
unsigned long ptr;
int ret;
int ins_len = name_len + sizeof(*ref);
int ins_len = name->len + sizeof(*ref);
key.objectid = inode_objectid;
key.offset = ref_objectid;
@ -327,7 +333,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
if (ret == -EEXIST) {
u32 old_size;
ref = btrfs_find_name_in_backref(path->nodes[0], path->slots[0],
name, name_len);
name);
if (ref)
goto out;
@ -336,7 +342,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_ref);
ref = (struct btrfs_inode_ref *)((unsigned long)ref + old_size);
btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
btrfs_set_inode_ref_name_len(path->nodes[0], ref, name->len);
btrfs_set_inode_ref_index(path->nodes[0], ref, index);
ptr = (unsigned long)(ref + 1);
ret = 0;
@ -344,7 +350,7 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
if (ret == -EOVERFLOW) {
if (btrfs_find_name_in_backref(path->nodes[0],
path->slots[0],
name, name_len))
name))
ret = -EEXIST;
else
ret = -EMLINK;
@ -353,11 +359,11 @@ int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
} else {
ref = btrfs_item_ptr(path->nodes[0], path->slots[0],
struct btrfs_inode_ref);
btrfs_set_inode_ref_name_len(path->nodes[0], ref, name_len);
btrfs_set_inode_ref_name_len(path->nodes[0], ref, name->len);
btrfs_set_inode_ref_index(path->nodes[0], ref, index);
ptr = (unsigned long)(ref + 1);
}
write_extent_buffer(path->nodes[0], name, ptr, name_len);
write_extent_buffer(path->nodes[0], name->name, ptr, name->len);
btrfs_mark_buffer_dirty(path->nodes[0]);
out:
@ -370,7 +376,6 @@ out:
if (btrfs_super_incompat_flags(disk_super)
& BTRFS_FEATURE_INCOMPAT_EXTENDED_IREF)
ret = btrfs_insert_inode_extref(trans, root, name,
name_len,
inode_objectid,
ref_objectid, index);
}

View File

@ -64,33 +64,31 @@ int btrfs_truncate_inode_items(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_truncate_control *control);
int btrfs_insert_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
struct btrfs_root *root, const struct fscrypt_str *name,
u64 inode_objectid, u64 ref_objectid, u64 index);
int btrfs_del_inode_ref(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
const char *name, int name_len,
struct btrfs_root *root, const struct fscrypt_str *name,
u64 inode_objectid, u64 ref_objectid, u64 *index);
int btrfs_insert_empty_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path, u64 objectid);
int btrfs_lookup_inode(struct btrfs_trans_handle *trans, struct btrfs_root
*root, struct btrfs_path *path,
int btrfs_lookup_inode(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct btrfs_path *path,
struct btrfs_key *location, int mod);
struct btrfs_inode_extref *btrfs_lookup_inode_extref(
struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_path *path,
const char *name, int name_len,
const struct fscrypt_str *name,
u64 inode_objectid, u64 ref_objectid, int ins_len,
int cow);
struct btrfs_inode_ref *btrfs_find_name_in_backref(struct extent_buffer *leaf,
int slot, const char *name,
int name_len);
int slot,
const struct fscrypt_str *name);
struct btrfs_inode_extref *btrfs_find_name_in_ext_backref(
struct extent_buffer *leaf, int slot, u64 ref_objectid,
const char *name, int name_len);
const struct fscrypt_str *name);
#endif

File diff suppressed because it is too large Load Diff

File diff suppressed because it is too large Load Diff

17
fs/btrfs/ioctl.h Normal file
View File

@ -0,0 +1,17 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BTRFS_IOCTL_H
#define BTRFS_IOCTL_H
long btrfs_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
long btrfs_compat_ioctl(struct file *file, unsigned int cmd, unsigned long arg);
int btrfs_fileattr_get(struct dentry *dentry, struct fileattr *fa);
int btrfs_fileattr_set(struct user_namespace *mnt_userns,
struct dentry *dentry, struct fileattr *fa);
int btrfs_ioctl_get_supported_features(void __user *arg);
void btrfs_sync_inode_flags_to_i_flags(struct inode *inode);
int __pure btrfs_is_empty_uuid(u8 *uuid);
void btrfs_update_ioctl_balance_args(struct btrfs_fs_info *fs_info,
struct btrfs_ioctl_balance_args *bargs);
#endif

View File

@ -12,6 +12,7 @@
#include "ctree.h"
#include "extent_io.h"
#include "locking.h"
#include "accessors.h"
/*
* Lockdep class keys for extent_buffer->lock's in this root. For a given

View File

@ -78,6 +78,82 @@ enum btrfs_lock_nesting {
BTRFS_NESTING_MAX,
};
enum btrfs_lockdep_trans_states {
BTRFS_LOCKDEP_TRANS_COMMIT_START,
BTRFS_LOCKDEP_TRANS_UNBLOCKED,
BTRFS_LOCKDEP_TRANS_SUPER_COMMITTED,
BTRFS_LOCKDEP_TRANS_COMPLETED,
};
/*
* Lockdep annotation for wait events.
*
* @owner: The struct where the lockdep map is defined
* @lock: The lockdep map corresponding to a wait event
*
* This macro is used to annotate a wait event. In this case a thread acquires
* the lockdep map as writer (exclusive lock) because it has to block until all
* the threads that hold the lock as readers signal the condition for the wait
* event and release their locks.
*/
#define btrfs_might_wait_for_event(owner, lock) \
do { \
rwsem_acquire(&owner->lock##_map, 0, 0, _THIS_IP_); \
rwsem_release(&owner->lock##_map, _THIS_IP_); \
} while (0)
/*
* Protection for the resource/condition of a wait event.
*
* @owner: The struct where the lockdep map is defined
* @lock: The lockdep map corresponding to a wait event
*
* Many threads can modify the condition for the wait event at the same time
* and signal the threads that block on the wait event. The threads that modify
* the condition and do the signaling acquire the lock as readers (shared
* lock).
*/
#define btrfs_lockdep_acquire(owner, lock) \
rwsem_acquire_read(&owner->lock##_map, 0, 0, _THIS_IP_)
/*
* Used after signaling the condition for a wait event to release the lockdep
* map held by a reader thread.
*/
#define btrfs_lockdep_release(owner, lock) \
rwsem_release(&owner->lock##_map, _THIS_IP_)
/*
* Macros for the transaction states wait events, similar to the generic wait
* event macros.
*/
#define btrfs_might_wait_for_state(owner, i) \
do { \
rwsem_acquire(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_); \
rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_); \
} while (0)
#define btrfs_trans_state_lockdep_acquire(owner, i) \
rwsem_acquire_read(&owner->btrfs_state_change_map[i], 0, 0, _THIS_IP_)
#define btrfs_trans_state_lockdep_release(owner, i) \
rwsem_release(&owner->btrfs_state_change_map[i], _THIS_IP_)
/* Initialization of the lockdep map */
#define btrfs_lockdep_init_map(owner, lock) \
do { \
static struct lock_class_key lock##_key; \
lockdep_init_map(&owner->lock##_map, #lock, &lock##_key, 0); \
} while (0)
/* Initialization of the transaction states lockdep maps. */
#define btrfs_state_lockdep_init_map(owner, lock, state) \
do { \
static struct lock_class_key lock##_key; \
lockdep_init_map(&owner->btrfs_state_change_map[state], #lock, \
&lock##_key, 0); \
} while (0)
static_assert(BTRFS_NESTING_MAX <= MAX_LOCKDEP_SUBCLASSES,
"too many lock subclasses defined");

View File

@ -13,8 +13,10 @@
#include <linux/bio.h>
#include <linux/lzo.h>
#include <linux/refcount.h>
#include "messages.h"
#include "compression.h"
#include "ctree.h"
#include "super.h"
#define LZO_LEN 4
@ -425,7 +427,7 @@ out:
return ret;
}
int lzo_decompress(struct list_head *ws, unsigned char *data_in,
int lzo_decompress(struct list_head *ws, const u8 *data_in,
struct page *dest_page, unsigned long start_byte, size_t srclen,
size_t destlen)
{

353
fs/btrfs/messages.c Normal file
View File

@ -0,0 +1,353 @@
// SPDX-License-Identifier: GPL-2.0
#include "fs.h"
#include "messages.h"
#include "discard.h"
#include "transaction.h"
#include "space-info.h"
#include "super.h"
#ifdef CONFIG_PRINTK
#define STATE_STRING_PREFACE ": state "
#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT)
/*
* Characters to print to indicate error conditions or uncommon filesystem state.
* RO is not an error.
*/
static const char fs_state_chars[] = {
[BTRFS_FS_STATE_ERROR] = 'E',
[BTRFS_FS_STATE_REMOUNTING] = 'M',
[BTRFS_FS_STATE_RO] = 0,
[BTRFS_FS_STATE_TRANS_ABORTED] = 'A',
[BTRFS_FS_STATE_DEV_REPLACING] = 'R',
[BTRFS_FS_STATE_DUMMY_FS_INFO] = 0,
[BTRFS_FS_STATE_NO_CSUMS] = 'C',
[BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L',
};
static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf)
{
unsigned int bit;
bool states_printed = false;
unsigned long fs_state = READ_ONCE(info->fs_state);
char *curr = buf;
memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE));
curr += sizeof(STATE_STRING_PREFACE) - 1;
for_each_set_bit(bit, &fs_state, sizeof(fs_state)) {
WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT);
if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) {
*curr++ = fs_state_chars[bit];
states_printed = true;
}
}
/* If no states were printed, reset the buffer */
if (!states_printed)
curr = buf;
*curr++ = 0;
}
#endif
/*
* Generally the error codes correspond to their respective errors, but there
* are a few special cases.
*
* EUCLEAN: Any sort of corruption that we encounter. The tree-checker for
* instance will return EUCLEAN if any of the blocks are corrupted in
* a way that is problematic. We want to reserve EUCLEAN for these
* sort of corruptions.
*
* EROFS: If we check BTRFS_FS_STATE_ERROR and fail out with a return error, we
* need to use EROFS for this case. We will have no idea of the
* original failure, that will have been reported at the time we tripped
* over the error. Each subsequent error that doesn't have any context
* of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR.
*/
const char * __attribute_const__ btrfs_decode_error(int errno)
{
char *errstr = "unknown";
switch (errno) {
case -ENOENT: /* -2 */
errstr = "No such entry";
break;
case -EIO: /* -5 */
errstr = "IO failure";
break;
case -ENOMEM: /* -12*/
errstr = "Out of memory";
break;
case -EEXIST: /* -17 */
errstr = "Object already exists";
break;
case -ENOSPC: /* -28 */
errstr = "No space left";
break;
case -EROFS: /* -30 */
errstr = "Readonly filesystem";
break;
case -EOPNOTSUPP: /* -95 */
errstr = "Operation not supported";
break;
case -EUCLEAN: /* -117 */
errstr = "Filesystem corrupted";
break;
case -EDQUOT: /* -122 */
errstr = "Quota exceeded";
break;
}
return errstr;
}
/*
* __btrfs_handle_fs_error decodes expected errors from the caller and
* invokes the appropriate error response.
*/
__cold
void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...)
{
struct super_block *sb = fs_info->sb;
#ifdef CONFIG_PRINTK
char statestr[STATE_STRING_BUF_LEN];
const char *errstr;
#endif
#ifdef CONFIG_PRINTK_INDEX
printk_index_subsys_emit(
"BTRFS: error (device %s%s) in %s:%d: errno=%d %s", KERN_CRIT, fmt);
#endif
/*
* Special case: if the error is EROFS, and we're already under
* SB_RDONLY, then it is safe here.
*/
if (errno == -EROFS && sb_rdonly(sb))
return;
#ifdef CONFIG_PRINTK
errstr = btrfs_decode_error(errno);
btrfs_state_to_string(fs_info, statestr);
if (fmt) {
struct va_format vaf;
va_list args;
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n",
sb->s_id, statestr, function, line, errno, errstr, &vaf);
va_end(args);
} else {
pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n",
sb->s_id, statestr, function, line, errno, errstr);
}
#endif
/*
* Today we only save the error info to memory. Long term we'll also
* send it down to the disk.
*/
set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
/* Don't go through full error handling during mount. */
if (!(sb->s_flags & SB_BORN))
return;
if (sb_rdonly(sb))
return;
btrfs_discard_stop(fs_info);
/* Handle error by forcing the filesystem readonly. */
btrfs_set_sb_rdonly(sb);
btrfs_info(fs_info, "forced readonly");
/*
* Note that a running device replace operation is not canceled here
* although there is no way to update the progress. It would add the
* risk of a deadlock, therefore the canceling is omitted. The only
* penalty is that some I/O remains active until the procedure
* completes. The next time when the filesystem is mounted writable
* again, the device replace operation continues.
*/
}
#ifdef CONFIG_PRINTK
static const char * const logtypes[] = {
"emergency",
"alert",
"critical",
"error",
"warning",
"notice",
"info",
"debug",
};
/*
* Use one ratelimit state per log level so that a flood of less important
* messages doesn't cause more important ones to be dropped.
*/
static struct ratelimit_state printk_limits[] = {
RATELIMIT_STATE_INIT(printk_limits[0], DEFAULT_RATELIMIT_INTERVAL, 100),
RATELIMIT_STATE_INIT(printk_limits[1], DEFAULT_RATELIMIT_INTERVAL, 100),
RATELIMIT_STATE_INIT(printk_limits[2], DEFAULT_RATELIMIT_INTERVAL, 100),
RATELIMIT_STATE_INIT(printk_limits[3], DEFAULT_RATELIMIT_INTERVAL, 100),
RATELIMIT_STATE_INIT(printk_limits[4], DEFAULT_RATELIMIT_INTERVAL, 100),
RATELIMIT_STATE_INIT(printk_limits[5], DEFAULT_RATELIMIT_INTERVAL, 100),
RATELIMIT_STATE_INIT(printk_limits[6], DEFAULT_RATELIMIT_INTERVAL, 100),
RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100),
};
void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0";
struct va_format vaf;
va_list args;
int kern_level;
const char *type = logtypes[4];
struct ratelimit_state *ratelimit = &printk_limits[4];
#ifdef CONFIG_PRINTK_INDEX
printk_index_subsys_emit("%sBTRFS %s (device %s): ", NULL, fmt);
#endif
va_start(args, fmt);
while ((kern_level = printk_get_level(fmt)) != 0) {
size_t size = printk_skip_level(fmt) - fmt;
if (kern_level >= '0' && kern_level <= '7') {
memcpy(lvl, fmt, size);
lvl[size] = '\0';
type = logtypes[kern_level - '0'];
ratelimit = &printk_limits[kern_level - '0'];
}
fmt += size;
}
vaf.fmt = fmt;
vaf.va = &args;
if (__ratelimit(ratelimit)) {
if (fs_info) {
char statestr[STATE_STRING_BUF_LEN];
btrfs_state_to_string(fs_info, statestr);
_printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type,
fs_info->sb->s_id, statestr, &vaf);
} else {
_printk("%sBTRFS %s: %pV\n", lvl, type, &vaf);
}
}
va_end(args);
}
#endif
#ifdef CONFIG_BTRFS_ASSERT
void __cold btrfs_assertfail(const char *expr, const char *file, int line)
{
pr_err("assertion failed: %s, in %s:%d\n", expr, file, line);
BUG();
}
#endif
void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info)
{
btrfs_err(fs_info,
"Unsupported V0 extent filesystem detected. Aborting. Please re-create your filesystem with a newer kernel");
}
#if BITS_PER_LONG == 32
void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info)
{
if (!test_and_set_bit(BTRFS_FS_32BIT_WARN, &fs_info->flags)) {
btrfs_warn(fs_info, "reaching 32bit limit for logical addresses");
btrfs_warn(fs_info,
"due to page cache limit on 32bit systems, btrfs can't access metadata at or beyond %lluT",
BTRFS_32BIT_MAX_FILE_SIZE >> 40);
btrfs_warn(fs_info,
"please consider upgrading to 64bit kernel/hardware");
}
}
void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info)
{
if (!test_and_set_bit(BTRFS_FS_32BIT_ERROR, &fs_info->flags)) {
btrfs_err(fs_info, "reached 32bit limit for logical addresses");
btrfs_err(fs_info,
"due to page cache limit on 32bit systems, metadata beyond %lluT can't be accessed",
BTRFS_32BIT_MAX_FILE_SIZE >> 40);
btrfs_err(fs_info,
"please consider upgrading to 64bit kernel/hardware");
}
}
#endif
/*
* We only mark the transaction aborted and then set the file system read-only.
* This will prevent new transactions from starting or trying to join this
* one.
*
* This means that error recovery at the call site is limited to freeing
* any local memory allocations and passing the error code up without
* further cleanup. The transaction should complete as it normally would
* in the call path but will return -EIO.
*
* We'll complete the cleanup in btrfs_end_transaction and
* btrfs_commit_transaction.
*/
__cold
void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
const char *function,
unsigned int line, int errno, bool first_hit)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
WRITE_ONCE(trans->aborted, errno);
WRITE_ONCE(trans->transaction->aborted, errno);
if (first_hit && errno == -ENOSPC)
btrfs_dump_space_info_for_trans_abort(fs_info);
/* Wake up anybody who may be waiting on this transaction */
wake_up(&fs_info->transaction_wait);
wake_up(&fs_info->transaction_blocked_wait);
__btrfs_handle_fs_error(fs_info, function, line, errno, NULL);
}
/*
* __btrfs_panic decodes unexpected, fatal errors from the caller, issues an
* alert, and either panics or BUGs, depending on mount options.
*/
__cold
void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...)
{
char *s_id = "<unknown>";
const char *errstr;
struct va_format vaf = { .fmt = fmt };
va_list args;
if (fs_info)
s_id = fs_info->sb->s_id;
va_start(args, fmt);
vaf.va = &args;
errstr = btrfs_decode_error(errno);
if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR)))
panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n",
s_id, function, line, &vaf, errno, errstr);
btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)",
function, line, &vaf, errno, errstr);
va_end(args);
/* Caller calls BUG() */
}

245
fs/btrfs/messages.h Normal file
View File

@ -0,0 +1,245 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BTRFS_MESSAGES_H
#define BTRFS_MESSAGES_H
#include <linux/types.h>
struct btrfs_fs_info;
struct btrfs_trans_handle;
static inline __printf(2, 3) __cold
void btrfs_no_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
}
#ifdef CONFIG_PRINTK
#define btrfs_printk(fs_info, fmt, args...) \
_btrfs_printk(fs_info, fmt, ##args)
__printf(2, 3)
__cold
void _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...);
#else
#define btrfs_printk(fs_info, fmt, args...) \
btrfs_no_printk(fs_info, fmt, ##args)
#endif
#define btrfs_emerg(fs_info, fmt, args...) \
btrfs_printk(fs_info, KERN_EMERG fmt, ##args)
#define btrfs_alert(fs_info, fmt, args...) \
btrfs_printk(fs_info, KERN_ALERT fmt, ##args)
#define btrfs_crit(fs_info, fmt, args...) \
btrfs_printk(fs_info, KERN_CRIT fmt, ##args)
#define btrfs_err(fs_info, fmt, args...) \
btrfs_printk(fs_info, KERN_ERR fmt, ##args)
#define btrfs_warn(fs_info, fmt, args...) \
btrfs_printk(fs_info, KERN_WARNING fmt, ##args)
#define btrfs_notice(fs_info, fmt, args...) \
btrfs_printk(fs_info, KERN_NOTICE fmt, ##args)
#define btrfs_info(fs_info, fmt, args...) \
btrfs_printk(fs_info, KERN_INFO fmt, ##args)
/*
* Wrappers that use printk_in_rcu
*/
#define btrfs_emerg_in_rcu(fs_info, fmt, args...) \
btrfs_printk_in_rcu(fs_info, KERN_EMERG fmt, ##args)
#define btrfs_alert_in_rcu(fs_info, fmt, args...) \
btrfs_printk_in_rcu(fs_info, KERN_ALERT fmt, ##args)
#define btrfs_crit_in_rcu(fs_info, fmt, args...) \
btrfs_printk_in_rcu(fs_info, KERN_CRIT fmt, ##args)
#define btrfs_err_in_rcu(fs_info, fmt, args...) \
btrfs_printk_in_rcu(fs_info, KERN_ERR fmt, ##args)
#define btrfs_warn_in_rcu(fs_info, fmt, args...) \
btrfs_printk_in_rcu(fs_info, KERN_WARNING fmt, ##args)
#define btrfs_notice_in_rcu(fs_info, fmt, args...) \
btrfs_printk_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
#define btrfs_info_in_rcu(fs_info, fmt, args...) \
btrfs_printk_in_rcu(fs_info, KERN_INFO fmt, ##args)
/*
* Wrappers that use a ratelimited printk_in_rcu
*/
#define btrfs_emerg_rl_in_rcu(fs_info, fmt, args...) \
btrfs_printk_rl_in_rcu(fs_info, KERN_EMERG fmt, ##args)
#define btrfs_alert_rl_in_rcu(fs_info, fmt, args...) \
btrfs_printk_rl_in_rcu(fs_info, KERN_ALERT fmt, ##args)
#define btrfs_crit_rl_in_rcu(fs_info, fmt, args...) \
btrfs_printk_rl_in_rcu(fs_info, KERN_CRIT fmt, ##args)
#define btrfs_err_rl_in_rcu(fs_info, fmt, args...) \
btrfs_printk_rl_in_rcu(fs_info, KERN_ERR fmt, ##args)
#define btrfs_warn_rl_in_rcu(fs_info, fmt, args...) \
btrfs_printk_rl_in_rcu(fs_info, KERN_WARNING fmt, ##args)
#define btrfs_notice_rl_in_rcu(fs_info, fmt, args...) \
btrfs_printk_rl_in_rcu(fs_info, KERN_NOTICE fmt, ##args)
#define btrfs_info_rl_in_rcu(fs_info, fmt, args...) \
btrfs_printk_rl_in_rcu(fs_info, KERN_INFO fmt, ##args)
/*
* Wrappers that use a ratelimited printk
*/
#define btrfs_emerg_rl(fs_info, fmt, args...) \
btrfs_printk_ratelimited(fs_info, KERN_EMERG fmt, ##args)
#define btrfs_alert_rl(fs_info, fmt, args...) \
btrfs_printk_ratelimited(fs_info, KERN_ALERT fmt, ##args)
#define btrfs_crit_rl(fs_info, fmt, args...) \
btrfs_printk_ratelimited(fs_info, KERN_CRIT fmt, ##args)
#define btrfs_err_rl(fs_info, fmt, args...) \
btrfs_printk_ratelimited(fs_info, KERN_ERR fmt, ##args)
#define btrfs_warn_rl(fs_info, fmt, args...) \
btrfs_printk_ratelimited(fs_info, KERN_WARNING fmt, ##args)
#define btrfs_notice_rl(fs_info, fmt, args...) \
btrfs_printk_ratelimited(fs_info, KERN_NOTICE fmt, ##args)
#define btrfs_info_rl(fs_info, fmt, args...) \
btrfs_printk_ratelimited(fs_info, KERN_INFO fmt, ##args)
#if defined(CONFIG_DYNAMIC_DEBUG)
#define btrfs_debug(fs_info, fmt, args...) \
_dynamic_func_call_no_desc(fmt, btrfs_printk, \
fs_info, KERN_DEBUG fmt, ##args)
#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
_dynamic_func_call_no_desc(fmt, btrfs_printk_in_rcu, \
fs_info, KERN_DEBUG fmt, ##args)
#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
_dynamic_func_call_no_desc(fmt, btrfs_printk_rl_in_rcu, \
fs_info, KERN_DEBUG fmt, ##args)
#define btrfs_debug_rl(fs_info, fmt, args...) \
_dynamic_func_call_no_desc(fmt, btrfs_printk_ratelimited, \
fs_info, KERN_DEBUG fmt, ##args)
#elif defined(DEBUG)
#define btrfs_debug(fs_info, fmt, args...) \
btrfs_printk(fs_info, KERN_DEBUG fmt, ##args)
#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
btrfs_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
btrfs_printk_rl_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
#define btrfs_debug_rl(fs_info, fmt, args...) \
btrfs_printk_ratelimited(fs_info, KERN_DEBUG fmt, ##args)
#else
#define btrfs_debug(fs_info, fmt, args...) \
btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args)
#define btrfs_debug_in_rcu(fs_info, fmt, args...) \
btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
#define btrfs_debug_rl_in_rcu(fs_info, fmt, args...) \
btrfs_no_printk_in_rcu(fs_info, KERN_DEBUG fmt, ##args)
#define btrfs_debug_rl(fs_info, fmt, args...) \
btrfs_no_printk(fs_info, KERN_DEBUG fmt, ##args)
#endif
#define btrfs_printk_in_rcu(fs_info, fmt, args...) \
do { \
rcu_read_lock(); \
btrfs_printk(fs_info, fmt, ##args); \
rcu_read_unlock(); \
} while (0)
#define btrfs_no_printk_in_rcu(fs_info, fmt, args...) \
do { \
rcu_read_lock(); \
btrfs_no_printk(fs_info, fmt, ##args); \
rcu_read_unlock(); \
} while (0)
#define btrfs_printk_ratelimited(fs_info, fmt, args...) \
do { \
static DEFINE_RATELIMIT_STATE(_rs, \
DEFAULT_RATELIMIT_INTERVAL, \
DEFAULT_RATELIMIT_BURST); \
if (__ratelimit(&_rs)) \
btrfs_printk(fs_info, fmt, ##args); \
} while (0)
#define btrfs_printk_rl_in_rcu(fs_info, fmt, args...) \
do { \
rcu_read_lock(); \
btrfs_printk_ratelimited(fs_info, fmt, ##args); \
rcu_read_unlock(); \
} while (0)
#ifdef CONFIG_BTRFS_ASSERT
void __cold btrfs_assertfail(const char *expr, const char *file, int line);
#define ASSERT(expr) \
(likely(expr) ? (void)0 : btrfs_assertfail(#expr, __FILE__, __LINE__))
#else
#define ASSERT(expr) (void)(expr)
#endif
void __cold btrfs_print_v0_err(struct btrfs_fs_info *fs_info);
__printf(5, 6)
__cold
void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...);
const char * __attribute_const__ btrfs_decode_error(int errno);
__cold
void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
const char *function,
unsigned int line, int errno, bool first_hit);
bool __cold abort_should_print_stack(int errno);
/*
* Call btrfs_abort_transaction as early as possible when an error condition is
* detected, that way the exact stack trace is reported for some errors.
*/
#define btrfs_abort_transaction(trans, errno) \
do { \
bool first = false; \
/* Report first abort since mount */ \
if (!test_and_set_bit(BTRFS_FS_STATE_TRANS_ABORTED, \
&((trans)->fs_info->fs_state))) { \
first = true; \
if (WARN(abort_should_print_stack(errno), \
KERN_ERR \
"BTRFS: Transaction aborted (error %d)\n", \
(errno))) { \
/* Stack trace printed. */ \
} else { \
btrfs_err((trans)->fs_info, \
"Transaction aborted (error %d)", \
(errno)); \
} \
} \
__btrfs_abort_transaction((trans), __func__, \
__LINE__, (errno), first); \
} while (0)
#define btrfs_handle_fs_error(fs_info, errno, fmt, args...) \
__btrfs_handle_fs_error((fs_info), __func__, __LINE__, \
(errno), fmt, ##args)
__printf(5, 6)
__cold
void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...);
/*
* If BTRFS_MOUNT_PANIC_ON_FATAL_ERROR is in mount_opt, __btrfs_panic
* will panic(). Otherwise we BUG() here.
*/
#define btrfs_panic(fs_info, errno, fmt, args...) \
do { \
__btrfs_panic(fs_info, __func__, __LINE__, errno, fmt, ##args); \
BUG(); \
} while (0)
#if BITS_PER_LONG == 32
#define BTRFS_32BIT_MAX_FILE_SIZE (((u64)ULONG_MAX + 1) << PAGE_SHIFT)
/*
* The warning threshold is 5/8th of the MAX_LFS_FILESIZE that limits the logical
* addresses of extents.
*
* For 4K page size it's about 10T, for 64K it's 160T.
*/
#define BTRFS_32BIT_EARLY_WARN_THRESHOLD (BTRFS_32BIT_MAX_FILE_SIZE * 5 / 8)
void btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info);
void btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info);
#endif
#endif

View File

@ -10,6 +10,14 @@
#define in_range(b, first, len) ((b) >= (first) && (b) < (first) + (len))
/*
* Enumerate bits using enum autoincrement. Define the @name as the n-th bit.
*/
#define ENUM_BIT(name) \
__ ## name ## _BIT, \
name = (1U << __ ## name ## _BIT), \
__ ## name ## _SEQ = __ ## name ## _BIT
static inline void cond_wake_up(struct wait_queue_head *wq)
{
/*
@ -32,22 +40,10 @@ static inline void cond_wake_up_nomb(struct wait_queue_head *wq)
wake_up(wq);
}
static inline u64 div_factor(u64 num, int factor)
static inline u64 mult_perc(u64 num, u32 percent)
{
if (factor == 10)
return num;
num *= factor;
return div_u64(num, 10);
return div_u64(num * percent, 100);
}
static inline u64 div_factor_fine(u64 num, int factor)
{
if (factor == 100)
return num;
num *= factor;
return div_u64(num, 100);
}
/* Copy of is_power_of_two that is 64bit safe */
static inline bool is_power_of_two_u64(u64 n)
{

View File

@ -7,6 +7,7 @@
#include <linux/blkdev.h>
#include <linux/writeback.h>
#include <linux/sched/mm.h>
#include "messages.h"
#include "misc.h"
#include "ctree.h"
#include "transaction.h"
@ -17,6 +18,8 @@
#include "delalloc-space.h"
#include "qgroup.h"
#include "subpage.h"
#include "file.h"
#include "super.h"
static struct kmem_cache *btrfs_ordered_extent_cache;
@ -143,7 +146,7 @@ static inline struct rb_node *tree_search(struct btrfs_ordered_inode_tree *tree,
return ret;
}
/**
/*
* Add an ordered extent to the per-inode tree.
*
* @inode: Inode that this extent is for.
@ -501,7 +504,7 @@ void btrfs_put_ordered_extent(struct btrfs_ordered_extent *entry)
ASSERT(list_empty(&entry->log_list));
ASSERT(RB_EMPTY_NODE(&entry->rb_node));
if (entry->inode)
btrfs_add_delayed_iput(entry->inode);
btrfs_add_delayed_iput(BTRFS_I(entry->inode));
while (!list_empty(&entry->list)) {
cur = entry->list.next;
sum = list_entry(cur, struct btrfs_ordered_sum, list);
@ -1019,17 +1022,18 @@ out:
}
/*
* btrfs_flush_ordered_range - Lock the passed range and ensures all pending
* ordered extents in it are run to completion.
* Lock the passed range and ensures all pending ordered extents in it are run
* to completion.
*
* @inode: Inode whose ordered tree is to be searched
* @start: Beginning of range to flush
* @end: Last byte of range to lock
* @cached_state: If passed, will return the extent state responsible for the
* locked range. It's the caller's responsibility to free the cached state.
* locked range. It's the caller's responsibility to free the
* cached state.
*
* This function always returns with the given range locked, ensuring after it's
* called no order extent can be pending.
* Always return with the given range locked, ensuring after it's called no
* order extent can be pending.
*/
void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
u64 end,
@ -1069,11 +1073,12 @@ void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
* Return true if btrfs_lock_ordered_range does not return any extents,
* otherwise false.
*/
bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end)
bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
struct extent_state **cached_state)
{
struct btrfs_ordered_extent *ordered;
if (!try_lock_extent(&inode->io_tree, start, end))
if (!try_lock_extent(&inode->io_tree, start, end, cached_state))
return false;
ordered = btrfs_lookup_ordered_range(inode, start, end - start + 1);
@ -1081,7 +1086,7 @@ bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end)
return true;
btrfs_put_ordered_extent(ordered);
unlock_extent(&inode->io_tree, start, end, NULL);
unlock_extent(&inode->io_tree, start, end, cached_state);
return false;
}

View File

@ -206,7 +206,8 @@ void btrfs_wait_ordered_roots(struct btrfs_fs_info *fs_info, u64 nr,
void btrfs_lock_and_flush_ordered_range(struct btrfs_inode *inode, u64 start,
u64 end,
struct extent_state **cached_state);
bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end);
bool btrfs_try_lock_ordered_range(struct btrfs_inode *inode, u64 start, u64 end,
struct extent_state **cached_state);
int btrfs_split_ordered_extent(struct btrfs_ordered_extent *ordered, u64 pre,
u64 post);
int __init ordered_data_init(void);

View File

@ -5,6 +5,7 @@
#include "ctree.h"
#include "disk-io.h"
#include "orphan.h"
int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 offset)

11
fs/btrfs/orphan.h Normal file
View File

@ -0,0 +1,11 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BTRFS_ORPHAN_H
#define BTRFS_ORPHAN_H
int btrfs_insert_orphan_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 offset);
int btrfs_del_orphan_item(struct btrfs_trans_handle *trans,
struct btrfs_root *root, u64 offset);
#endif

View File

@ -3,9 +3,12 @@
* Copyright (C) 2007 Oracle. All rights reserved.
*/
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "print-tree.h"
#include "accessors.h"
#include "tree-checker.h"
struct root_name_map {
u64 id;
@ -240,9 +243,9 @@ void btrfs_print_leaf(struct extent_buffer *l)
case BTRFS_DIR_ITEM_KEY:
di = btrfs_item_ptr(l, i, struct btrfs_dir_item);
btrfs_dir_item_key_to_cpu(l, di, &found_key);
pr_info("\t\tdir oid %llu type %u\n",
pr_info("\t\tdir oid %llu flags %u\n",
found_key.objectid,
btrfs_dir_type(l, di));
btrfs_dir_flags(l, di));
break;
case BTRFS_ROOT_ITEM_KEY:
ri = btrfs_item_ptr(l, i, struct btrfs_root_item);
@ -384,14 +387,16 @@ void btrfs_print_tree(struct extent_buffer *c, bool follow)
if (!follow)
return;
for (i = 0; i < nr; i++) {
struct btrfs_key first_key;
struct btrfs_tree_parent_check check = {
.level = level - 1,
.transid = btrfs_node_ptr_generation(c, i),
.owner_root = btrfs_header_owner(c),
.has_first_key = true
};
struct extent_buffer *next;
btrfs_node_key_to_cpu(c, &first_key, i);
next = read_tree_block(fs_info, btrfs_node_blockptr(c, i),
btrfs_header_owner(c),
btrfs_node_ptr_generation(c, i),
level - 1, &first_key);
btrfs_node_key_to_cpu(c, &check.first_key, i);
next = read_tree_block(fs_info, btrfs_node_blockptr(c, i), &check);
if (IS_ERR(next))
continue;
if (!extent_buffer_uptodate(next)) {

View File

@ -4,12 +4,17 @@
*/
#include <linux/hashtable.h>
#include "messages.h"
#include "props.h"
#include "btrfs_inode.h"
#include "transaction.h"
#include "ctree.h"
#include "xattr.h"
#include "compression.h"
#include "space-info.h"
#include "fs.h"
#include "accessors.h"
#include "super.h"
#define BTRFS_PROP_HANDLERS_HT_BITS 8
static DEFINE_HASHTABLE(prop_handlers_ht, BTRFS_PROP_HANDLERS_HT_BITS);
@ -453,7 +458,7 @@ int btrfs_inode_inherit_props(struct btrfs_trans_handle *trans,
return 0;
}
void __init btrfs_props_init(void)
int __init btrfs_props_init(void)
{
int i;
@ -463,5 +468,6 @@ void __init btrfs_props_init(void)
hash_add(prop_handlers_ht, &p->node, h);
}
return 0;
}

View File

@ -8,7 +8,7 @@
#include "ctree.h"
void __init btrfs_props_init(void);
int __init btrfs_props_init(void);
int btrfs_set_prop(struct btrfs_trans_handle *trans, struct inode *inode,
const char *name, const char *value, size_t value_len,

View File

@ -24,6 +24,11 @@
#include "block-group.h"
#include "sysfs.h"
#include "tree-mod-log.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "tree-checker.h"
/*
* Helpers to access qgroup reservation
@ -1790,8 +1795,7 @@ int btrfs_qgroup_trace_extent_nolock(struct btrfs_fs_info *fs_info,
int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
struct btrfs_qgroup_extent_record *qrecord)
{
struct ulist *old_root;
u64 bytenr = qrecord->bytenr;
struct btrfs_backref_walk_ctx ctx = { 0 };
int ret;
/*
@ -1818,8 +1822,10 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
if (trans->fs_info->qgroup_flags & BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)
return 0;
ret = btrfs_find_all_roots(NULL, trans->fs_info, bytenr, 0, &old_root,
true);
ctx.bytenr = qrecord->bytenr;
ctx.fs_info = trans->fs_info;
ret = btrfs_find_all_roots(&ctx, true);
if (ret < 0) {
qgroup_mark_inconsistent(trans->fs_info);
btrfs_warn(trans->fs_info,
@ -1835,12 +1841,12 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
*
* So modifying qrecord->old_roots is safe here
*/
qrecord->old_roots = old_root;
qrecord->old_roots = ctx.roots;
return 0;
}
int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
u64 num_bytes, gfp_t gfp_flag)
u64 num_bytes)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
struct btrfs_qgroup_extent_record *record;
@ -1850,7 +1856,7 @@ int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
if (!test_bit(BTRFS_FS_QUOTA_ENABLED, &fs_info->flags)
|| bytenr == 0 || num_bytes == 0)
return 0;
record = kzalloc(sizeof(*record), gfp_flag);
record = kzalloc(sizeof(*record), GFP_NOFS);
if (!record)
return -ENOMEM;
@ -1902,8 +1908,7 @@ int btrfs_qgroup_trace_leaf_items(struct btrfs_trans_handle *trans,
num_bytes = btrfs_file_extent_disk_num_bytes(eb, fi);
ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes,
GFP_NOFS);
ret = btrfs_qgroup_trace_extent(trans, bytenr, num_bytes);
if (ret)
return ret;
}
@ -2102,12 +2107,11 @@ static int qgroup_trace_extent_swap(struct btrfs_trans_handle* trans,
* blocks for qgroup accounting.
*/
ret = btrfs_qgroup_trace_extent(trans, src_path->nodes[dst_level]->start,
nodesize, GFP_NOFS);
nodesize);
if (ret < 0)
goto out;
ret = btrfs_qgroup_trace_extent(trans,
dst_path->nodes[dst_level]->start,
nodesize, GFP_NOFS);
ret = btrfs_qgroup_trace_extent(trans, dst_path->nodes[dst_level]->start,
nodesize);
if (ret < 0)
goto out;
@ -2336,7 +2340,13 @@ int btrfs_qgroup_trace_subtree(struct btrfs_trans_handle *trans,
}
if (!extent_buffer_uptodate(root_eb)) {
ret = btrfs_read_extent_buffer(root_eb, root_gen, root_level, NULL);
struct btrfs_tree_parent_check check = {
.has_first_key = false,
.transid = root_gen,
.level = root_level
};
ret = btrfs_read_extent_buffer(root_eb, &check);
if (ret)
goto out;
}
@ -2391,8 +2401,7 @@ walk_down:
path->locks[level] = BTRFS_READ_LOCK;
ret = btrfs_qgroup_trace_extent(trans, child_bytenr,
fs_info->nodesize,
GFP_NOFS);
fs_info->nodesize);
if (ret)
goto out;
}
@ -2749,17 +2758,22 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
if (!ret && !(fs_info->qgroup_flags &
BTRFS_QGROUP_RUNTIME_FLAG_NO_ACCOUNTING)) {
struct btrfs_backref_walk_ctx ctx = { 0 };
ctx.bytenr = record->bytenr;
ctx.fs_info = fs_info;
/*
* Old roots should be searched when inserting qgroup
* extent record
*/
if (WARN_ON(!record->old_roots)) {
/* Search commit root to find old_roots */
ret = btrfs_find_all_roots(NULL, fs_info,
record->bytenr, 0,
&record->old_roots, false);
ret = btrfs_find_all_roots(&ctx, false);
if (ret < 0)
goto cleanup;
record->old_roots = ctx.roots;
ctx.roots = NULL;
}
/* Free the reserved data space */
@ -2772,10 +2786,11 @@ int btrfs_qgroup_account_extents(struct btrfs_trans_handle *trans)
* which doesn't lock tree or delayed_refs and search
* current root. It's safe inside commit_transaction().
*/
ret = btrfs_find_all_roots(trans, fs_info,
record->bytenr, BTRFS_SEQ_LAST, &new_roots, false);
ctx.trans = trans;
ret = btrfs_find_all_roots(&ctx, false);
if (ret < 0)
goto cleanup;
new_roots = ctx.roots;
if (qgroup_to_skip) {
ulist_del(new_roots, qgroup_to_skip, 0);
ulist_del(record->old_roots, qgroup_to_skip,
@ -3241,7 +3256,6 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
struct btrfs_root *extent_root;
struct btrfs_key found;
struct extent_buffer *scratch_leaf = NULL;
struct ulist *roots = NULL;
u64 num_bytes;
bool done;
int slot;
@ -3291,6 +3305,8 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
mutex_unlock(&fs_info->qgroup_rescan_lock);
for (; slot < btrfs_header_nritems(scratch_leaf); ++slot) {
struct btrfs_backref_walk_ctx ctx = { 0 };
btrfs_item_key_to_cpu(scratch_leaf, &found, slot);
if (found.type != BTRFS_EXTENT_ITEM_KEY &&
found.type != BTRFS_METADATA_ITEM_KEY)
@ -3300,13 +3316,15 @@ static int qgroup_rescan_leaf(struct btrfs_trans_handle *trans,
else
num_bytes = found.offset;
ret = btrfs_find_all_roots(NULL, fs_info, found.objectid, 0,
&roots, false);
ctx.bytenr = found.objectid;
ctx.fs_info = fs_info;
ret = btrfs_find_all_roots(&ctx, false);
if (ret < 0)
goto out;
/* For rescan, just pass old_roots as NULL */
ret = btrfs_qgroup_account_extent(trans, found.objectid,
num_bytes, NULL, roots);
num_bytes, NULL, ctx.roots);
if (ret < 0)
goto out;
}
@ -4292,6 +4310,7 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
struct extent_buffer *subvol_eb)
{
struct btrfs_fs_info *fs_info = root->fs_info;
struct btrfs_tree_parent_check check = { 0 };
struct btrfs_qgroup_swapped_blocks *blocks = &root->swapped_blocks;
struct btrfs_qgroup_swapped_block *block;
struct extent_buffer *reloc_eb = NULL;
@ -4340,10 +4359,13 @@ int btrfs_qgroup_trace_subtree_after_cow(struct btrfs_trans_handle *trans,
blocks->swapped = swapped;
spin_unlock(&blocks->lock);
check.level = block->level;
check.transid = block->reloc_generation;
check.has_first_key = true;
memcpy(&check.first_key, &block->first_key, sizeof(check.first_key));
/* Read out reloc subtree root */
reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, 0,
block->reloc_generation, block->level,
&block->first_key);
reloc_eb = read_tree_block(fs_info, block->reloc_bytenr, &check);
if (IS_ERR(reloc_eb)) {
ret = PTR_ERR(reloc_eb);
reloc_eb = NULL;

View File

@ -11,6 +11,7 @@
#include <linux/kobject.h>
#include "ulist.h"
#include "delayed-ref.h"
#include "misc.h"
/*
* Btrfs qgroup overview
@ -242,9 +243,11 @@ static inline u64 btrfs_qgroup_subvolid(u64 qgroupid)
/*
* For qgroup event trace points only
*/
#define QGROUP_RESERVE (1<<0)
#define QGROUP_RELEASE (1<<1)
#define QGROUP_FREE (1<<2)
enum {
ENUM_BIT(QGROUP_RESERVE),
ENUM_BIT(QGROUP_RELEASE),
ENUM_BIT(QGROUP_FREE),
};
int btrfs_quota_enable(struct btrfs_fs_info *fs_info);
int btrfs_quota_disable(struct btrfs_fs_info *fs_info);
@ -318,7 +321,7 @@ int btrfs_qgroup_trace_extent_post(struct btrfs_trans_handle *trans,
* (NULL trans)
*/
int btrfs_qgroup_trace_extent(struct btrfs_trans_handle *trans, u64 bytenr,
u64 num_bytes, gfp_t gfp_flag);
u64 num_bytes);
/*
* Inform qgroup to trace all leaf items of data

File diff suppressed because it is too large Load Diff

View File

@ -74,12 +74,6 @@ struct btrfs_raid_bio {
/* How many sectors there are for each stripe */
u8 stripe_nsectors;
/* First bad stripe, -1 means no corruption */
s8 faila;
/* Second bad stripe (for RAID6 use) */
s8 failb;
/* Stripe number that we're scrubbing */
u8 scrubp;
@ -93,9 +87,7 @@ struct btrfs_raid_bio {
atomic_t stripes_pending;
atomic_t error;
struct work_struct end_io_work;
wait_queue_head_t io_wait;
/* Bitmap to record which horizontal stripe has data */
unsigned long dbitmap;
@ -126,6 +118,29 @@ struct btrfs_raid_bio {
/* Allocated with real_stripes-many pointers for finish_*() calls */
void **finish_pointers;
/*
* The bitmap recording where IO errors happened.
* Each bit is corresponding to one sector in either bio_sectors[] or
* stripe_sectors[] array.
*
* The reason we don't use another bit in sector_ptr is, we have two
* arrays of sectors, and a lot of IO can use sectors in both arrays.
* Thus making it much harder to iterate.
*/
unsigned long *error_bitmap;
/*
* Checksum buffer if the rbio is for data. The buffer should cover
* all data sectors (exlcuding P/Q sectors).
*/
u8 *csum_buf;
/*
* Each bit represents if the corresponding sector has data csum found.
* Should only cover data sectors (excluding P/Q sectors).
*/
unsigned long *csum_bitmap;
};
/*

View File

@ -18,7 +18,11 @@ static inline struct rcu_string *rcu_string_strdup(const char *src, gfp_t mask)
(len * sizeof(char)), mask);
if (!ret)
return ret;
strncpy(ret->str, src, len);
/* Warn if the source got unexpectedly truncated. */
if (WARN_ON(strscpy(ret->str, src, len) < 0)) {
kfree(ret);
return NULL;
}
return ret;
}

View File

@ -5,11 +5,14 @@
#include <linux/sched.h>
#include <linux/stacktrace.h>
#include "messages.h"
#include "ctree.h"
#include "disk-io.h"
#include "locking.h"
#include "delayed-ref.h"
#include "ref-verify.h"
#include "fs.h"
#include "accessors.h"
/*
* Used to keep track the roots and number of refs each root has for a given

View File

@ -2,13 +2,19 @@
#include <linux/blkdev.h>
#include <linux/iversion.h>
#include "compression.h"
#include "ctree.h"
#include "fs.h"
#include "messages.h"
#include "compression.h"
#include "delalloc-space.h"
#include "disk-io.h"
#include "reflink.h"
#include "transaction.h"
#include "subpage.h"
#include "accessors.h"
#include "file-item.h"
#include "file.h"
#include "super.h"
#define BTRFS_MAX_DEDUPE_LEN SZ_16M
@ -318,8 +324,8 @@ copy_to_page:
goto out;
}
/**
* btrfs_clone() - clone a range from inode file to another
/*
* Clone a range from inode file to another.
*
* @src: Inode to clone from
* @inode: Inode to clone to
@ -887,7 +893,7 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
return -EINVAL;
if (same_inode) {
btrfs_inode_lock(src_inode, BTRFS_ILOCK_MMAP);
btrfs_inode_lock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP);
} else {
lock_two_nondirectories(src_inode, dst_inode);
btrfs_double_mmap_lock(src_inode, dst_inode);
@ -905,7 +911,7 @@ loff_t btrfs_remap_file_range(struct file *src_file, loff_t off,
out_unlock:
if (same_inode) {
btrfs_inode_unlock(src_inode, BTRFS_ILOCK_MMAP);
btrfs_inode_unlock(BTRFS_I(src_inode), BTRFS_ILOCK_MMAP);
} else {
btrfs_double_mmap_unlock(src_inode, dst_inode);
unlock_two_nondirectories(src_inode, dst_inode);

View File

@ -27,6 +27,15 @@
#include "subpage.h"
#include "zoned.h"
#include "inode-item.h"
#include "space-info.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "file-item.h"
#include "relocation.h"
#include "super.h"
#include "tree-checker.h"
/*
* Relocation overview
@ -470,7 +479,7 @@ static noinline_for_stack struct btrfs_backref_node *build_backref_tree(
int ret;
int err = 0;
iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info, GFP_NOFS);
iter = btrfs_backref_iter_alloc(rc->extent_root->fs_info);
if (!iter)
return ERR_PTR(-ENOMEM);
path = btrfs_alloc_path();
@ -1109,10 +1118,12 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
inode = find_next_inode(root, key.objectid);
first = 0;
} else if (inode && btrfs_ino(BTRFS_I(inode)) < key.objectid) {
btrfs_add_delayed_iput(inode);
btrfs_add_delayed_iput(BTRFS_I(inode));
inode = find_next_inode(root, key.objectid);
}
if (inode && btrfs_ino(BTRFS_I(inode)) == key.objectid) {
struct extent_state *cached_state = NULL;
end = key.offset +
btrfs_file_extent_num_bytes(leaf, fi);
WARN_ON(!IS_ALIGNED(key.offset,
@ -1120,14 +1131,15 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
WARN_ON(!IS_ALIGNED(end, fs_info->sectorsize));
end--;
ret = try_lock_extent(&BTRFS_I(inode)->io_tree,
key.offset, end);
key.offset, end,
&cached_state);
if (!ret)
continue;
btrfs_drop_extent_map_range(BTRFS_I(inode),
key.offset, end, true);
unlock_extent(&BTRFS_I(inode)->io_tree,
key.offset, end, NULL);
key.offset, end, &cached_state);
}
}
@ -1170,7 +1182,7 @@ int replace_file_extents(struct btrfs_trans_handle *trans,
if (dirty)
btrfs_mark_buffer_dirty(leaf);
if (inode)
btrfs_add_delayed_iput(inode);
btrfs_add_delayed_iput(BTRFS_I(inode));
return ret;
}
@ -1516,6 +1528,8 @@ static int invalidate_extent_cache(struct btrfs_root *root,
objectid = min_key->objectid;
while (1) {
struct extent_state *cached_state = NULL;
cond_resched();
iput(inode);
@ -1566,9 +1580,9 @@ static int invalidate_extent_cache(struct btrfs_root *root,
}
/* the lock_extent waits for read_folio to complete */
lock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL);
lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
btrfs_drop_extent_map_range(BTRFS_I(inode), start, end, true);
unlock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL);
unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
}
return 0;
}
@ -2597,10 +2611,14 @@ static int tree_block_processed(u64 bytenr, struct reloc_control *rc)
static int get_tree_block_key(struct btrfs_fs_info *fs_info,
struct tree_block *block)
{
struct btrfs_tree_parent_check check = {
.level = block->level,
.owner_root = block->owner,
.transid = block->key.offset
};
struct extent_buffer *eb;
eb = read_tree_block(fs_info, block->bytenr, block->owner,
block->key.offset, block->level, NULL);
eb = read_tree_block(fs_info, block->bytenr, &check);
if (IS_ERR(eb))
return PTR_ERR(eb);
if (!extent_buffer_uptodate(eb)) {
@ -2861,25 +2879,27 @@ static noinline_for_stack int prealloc_file_extent_cluster(
if (ret)
return ret;
btrfs_inode_lock(&inode->vfs_inode, 0);
btrfs_inode_lock(inode, 0);
for (nr = 0; nr < cluster->nr; nr++) {
struct extent_state *cached_state = NULL;
start = cluster->boundary[nr] - offset;
if (nr + 1 < cluster->nr)
end = cluster->boundary[nr + 1] - 1 - offset;
else
end = cluster->end - offset;
lock_extent(&inode->io_tree, start, end, NULL);
lock_extent(&inode->io_tree, start, end, &cached_state);
num_bytes = end + 1 - start;
ret = btrfs_prealloc_file_range(&inode->vfs_inode, 0, start,
num_bytes, num_bytes,
end + 1, &alloc_hint);
cur_offset = end + 1;
unlock_extent(&inode->io_tree, start, end, NULL);
unlock_extent(&inode->io_tree, start, end, &cached_state);
if (ret)
break;
}
btrfs_inode_unlock(&inode->vfs_inode, 0);
btrfs_inode_unlock(inode, 0);
if (cur_offset < prealloc_end)
btrfs_free_reserved_data_space_noquota(inode->root->fs_info,
@ -2891,6 +2911,7 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod
u64 start, u64 end, u64 block_start)
{
struct extent_map *em;
struct extent_state *cached_state = NULL;
int ret = 0;
em = alloc_extent_map();
@ -2903,9 +2924,9 @@ static noinline_for_stack int setup_relocation_extent_mapping(struct inode *inod
em->block_start = block_start;
set_bit(EXTENT_FLAG_PINNED, &em->flags);
lock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL);
lock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
ret = btrfs_replace_extent_map_range(BTRFS_I(inode), em, false);
unlock_extent(&BTRFS_I(inode)->io_tree, start, end, NULL);
unlock_extent(&BTRFS_I(inode)->io_tree, start, end, &cached_state);
free_extent_map(em);
return ret;
@ -2983,6 +3004,7 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
*/
cur = max(page_start, cluster->boundary[*cluster_nr] - offset);
while (cur <= page_end) {
struct extent_state *cached_state = NULL;
u64 extent_start = cluster->boundary[*cluster_nr] - offset;
u64 extent_end = get_cluster_boundary_end(cluster,
*cluster_nr) - offset;
@ -2998,13 +3020,15 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
goto release_page;
/* Mark the range delalloc and dirty for later writeback */
lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, NULL);
lock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end,
&cached_state);
ret = btrfs_set_extent_delalloc(BTRFS_I(inode), clamped_start,
clamped_end, 0, NULL);
clamped_end, 0, &cached_state);
if (ret) {
clear_extent_bits(&BTRFS_I(inode)->io_tree,
clear_extent_bit(&BTRFS_I(inode)->io_tree,
clamped_start, clamped_end,
EXTENT_LOCKED | EXTENT_BOUNDARY);
EXTENT_LOCKED | EXTENT_BOUNDARY,
&cached_state);
btrfs_delalloc_release_metadata(BTRFS_I(inode),
clamped_len, true);
btrfs_delalloc_release_extents(BTRFS_I(inode),
@ -3031,7 +3055,8 @@ static int relocate_one_page(struct inode *inode, struct file_ra_state *ra,
boundary_start, boundary_end,
EXTENT_BOUNDARY);
}
unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end, NULL);
unlock_extent(&BTRFS_I(inode)->io_tree, clamped_start, clamped_end,
&cached_state);
btrfs_delalloc_release_extents(BTRFS_I(inode), clamped_len);
cur += clamped_len;
@ -3388,24 +3413,28 @@ int add_data_references(struct reloc_control *rc,
struct btrfs_path *path,
struct rb_root *blocks)
{
struct btrfs_fs_info *fs_info = rc->extent_root->fs_info;
struct ulist *leaves = NULL;
struct btrfs_backref_walk_ctx ctx = { 0 };
struct ulist_iterator leaf_uiter;
struct ulist_node *ref_node = NULL;
const u32 blocksize = fs_info->nodesize;
const u32 blocksize = rc->extent_root->fs_info->nodesize;
int ret = 0;
btrfs_release_path(path);
ret = btrfs_find_all_leafs(NULL, fs_info, extent_key->objectid,
0, &leaves, NULL, true);
ctx.bytenr = extent_key->objectid;
ctx.ignore_extent_item_pos = true;
ctx.fs_info = rc->extent_root->fs_info;
ret = btrfs_find_all_leafs(&ctx);
if (ret < 0)
return ret;
ULIST_ITER_INIT(&leaf_uiter);
while ((ref_node = ulist_next(leaves, &leaf_uiter))) {
while ((ref_node = ulist_next(ctx.refs, &leaf_uiter))) {
struct btrfs_tree_parent_check check = { 0 };
struct extent_buffer *eb;
eb = read_tree_block(fs_info, ref_node->val, 0, 0, 0, NULL);
eb = read_tree_block(ctx.fs_info, ref_node->val, &check);
if (IS_ERR(eb)) {
ret = PTR_ERR(eb);
break;
@ -3421,7 +3450,7 @@ int add_data_references(struct reloc_control *rc,
}
if (ret < 0)
free_block_list(blocks);
ulist_free(leaves);
ulist_free(ctx.refs);
return ret;
}
@ -3905,8 +3934,7 @@ static struct reloc_control *alloc_reloc_control(struct btrfs_fs_info *fs_info)
INIT_LIST_HEAD(&rc->dirty_subvol_roots);
btrfs_backref_init_cache(fs_info, &rc->backref_cache, 1);
mapping_tree_init(&rc->reloc_root_tree);
extent_io_tree_init(fs_info, &rc->processed_blocks,
IO_TREE_RELOC_BLOCKS, NULL);
extent_io_tree_init(fs_info, &rc->processed_blocks, IO_TREE_RELOC_BLOCKS);
return rc;
}
@ -4330,7 +4358,7 @@ int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len)
disk_bytenr = file_pos + inode->index_cnt;
csum_root = btrfs_csum_root(fs_info, disk_bytenr);
ret = btrfs_lookup_csums_range(csum_root, disk_bytenr,
ret = btrfs_lookup_csums_list(csum_root, disk_bytenr,
disk_bytenr + len - 1, &list, 0, false);
if (ret)
goto out;

23
fs/btrfs/relocation.h Normal file
View File

@ -0,0 +1,23 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BTRFS_RELOCATION_H
#define BTRFS_RELOCATION_H
int btrfs_relocate_block_group(struct btrfs_fs_info *fs_info, u64 group_start);
int btrfs_init_reloc_root(struct btrfs_trans_handle *trans, struct btrfs_root *root);
int btrfs_update_reloc_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
int btrfs_recover_relocation(struct btrfs_fs_info *fs_info);
int btrfs_reloc_clone_csums(struct btrfs_inode *inode, u64 file_pos, u64 len);
int btrfs_reloc_cow_block(struct btrfs_trans_handle *trans,
struct btrfs_root *root, struct extent_buffer *buf,
struct extent_buffer *cow);
void btrfs_reloc_pre_snapshot(struct btrfs_pending_snapshot *pending,
u64 *bytes_to_reserve);
int btrfs_reloc_post_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_pending_snapshot *pending);
int btrfs_should_cancel_balance(struct btrfs_fs_info *fs_info);
struct btrfs_root *find_reloc_root(struct btrfs_fs_info *fs_info, u64 bytenr);
int btrfs_should_ignore_reloc_root(struct btrfs_root *root);
#endif

View File

@ -6,11 +6,16 @@
#include <linux/err.h>
#include <linux/uuid.h>
#include "ctree.h"
#include "fs.h"
#include "messages.h"
#include "transaction.h"
#include "disk-io.h"
#include "print-tree.h"
#include "qgroup.h"
#include "space-info.h"
#include "accessors.h"
#include "root-tree.h"
#include "orphan.h"
/*
* Read a root item from the tree. In case we detect a root item smaller then
@ -327,9 +332,8 @@ out:
}
int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
u64 ref_id, u64 dirid, u64 *sequence, const char *name,
int name_len)
u64 ref_id, u64 dirid, u64 *sequence,
const struct fscrypt_str *name)
{
struct btrfs_root *tree_root = trans->fs_info->tree_root;
struct btrfs_path *path;
@ -356,8 +360,8 @@ again:
struct btrfs_root_ref);
ptr = (unsigned long)(ref + 1);
if ((btrfs_root_ref_dirid(leaf, ref) != dirid) ||
(btrfs_root_ref_name_len(leaf, ref) != name_len) ||
memcmp_extent_buffer(leaf, name, ptr, name_len)) {
(btrfs_root_ref_name_len(leaf, ref) != name->len) ||
memcmp_extent_buffer(leaf, name->name, ptr, name->len)) {
ret = -ENOENT;
goto out;
}
@ -400,8 +404,8 @@ out:
* Will return 0, -ENOMEM, or anything from the CoW path
*/
int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
u64 ref_id, u64 dirid, u64 sequence, const char *name,
int name_len)
u64 ref_id, u64 dirid, u64 sequence,
const struct fscrypt_str *name)
{
struct btrfs_root *tree_root = trans->fs_info->tree_root;
struct btrfs_key key;
@ -420,7 +424,7 @@ int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
key.offset = ref_id;
again:
ret = btrfs_insert_empty_item(trans, tree_root, path, &key,
sizeof(*ref) + name_len);
sizeof(*ref) + name->len);
if (ret) {
btrfs_abort_transaction(trans, ret);
btrfs_free_path(path);
@ -431,9 +435,9 @@ again:
ref = btrfs_item_ptr(leaf, path->slots[0], struct btrfs_root_ref);
btrfs_set_root_ref_dirid(leaf, ref, dirid);
btrfs_set_root_ref_sequence(leaf, ref, sequence);
btrfs_set_root_ref_name_len(leaf, ref, name_len);
btrfs_set_root_ref_name_len(leaf, ref, name->len);
ptr = (unsigned long)(ref + 1);
write_extent_buffer(leaf, name, ptr, name_len);
write_extent_buffer(leaf, name->name, ptr, name->len);
btrfs_mark_buffer_dirty(leaf);
if (key.type == BTRFS_ROOT_BACKREF_KEY) {

34
fs/btrfs/root-tree.h Normal file
View File

@ -0,0 +1,34 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BTRFS_ROOT_TREE_H
#define BTRFS_ROOT_TREE_H
int btrfs_subvolume_reserve_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv,
int nitems, bool use_global_rsv);
void btrfs_subvolume_release_metadata(struct btrfs_root *root,
struct btrfs_block_rsv *rsv);
int btrfs_add_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
u64 ref_id, u64 dirid, u64 sequence,
const struct fscrypt_str *name);
int btrfs_del_root_ref(struct btrfs_trans_handle *trans, u64 root_id,
u64 ref_id, u64 dirid, u64 *sequence,
const struct fscrypt_str *name);
int btrfs_del_root(struct btrfs_trans_handle *trans, const struct btrfs_key *key);
int btrfs_insert_root(struct btrfs_trans_handle *trans, struct btrfs_root *root,
const struct btrfs_key *key,
struct btrfs_root_item *item);
int __must_check btrfs_update_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root,
struct btrfs_key *key,
struct btrfs_root_item *item);
int btrfs_find_root(struct btrfs_root *root, const struct btrfs_key *search_key,
struct btrfs_path *path, struct btrfs_root_item *root_item,
struct btrfs_key *root_key);
int btrfs_find_orphan_roots(struct btrfs_fs_info *fs_info);
void btrfs_set_root_node(struct btrfs_root_item *item,
struct extent_buffer *node);
void btrfs_check_and_init_root_item(struct btrfs_root_item *item);
void btrfs_update_root_times(struct btrfs_trans_handle *trans, struct btrfs_root *root);
#endif

View File

@ -17,10 +17,13 @@
#include "extent_io.h"
#include "dev-replace.h"
#include "check-integrity.h"
#include "rcu-string.h"
#include "raid56.h"
#include "block-group.h"
#include "zoned.h"
#include "fs.h"
#include "accessors.h"
#include "file-item.h"
#include "scrub.h"
/*
* This is only the first step towards a full-features scrub. It reads all
@ -56,6 +59,17 @@ struct scrub_ctx;
#define SCRUB_MAX_PAGES (DIV_ROUND_UP(BTRFS_MAX_METADATA_BLOCKSIZE, PAGE_SIZE))
/*
* Maximum number of mirrors that can be available for all profiles counting
* the target device of dev-replace as one. During an active device replace
* procedure, the target device of the copy operation is a mirror for the
* filesystem data as well that can be used to read data in order to repair
* read errors on other disks.
*
* Current value is derived from RAID1C4 with 4 copies.
*/
#define BTRFS_MAX_MIRRORS (4 + 1)
struct scrub_recover {
refcount_t refs;
struct btrfs_io_context *bioc;
@ -284,7 +298,7 @@ static struct scrub_block *alloc_scrub_block(struct scrub_ctx *sctx,
* Will also allocate new pages for @sblock if needed.
*/
static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock,
u64 logical, gfp_t gfp)
u64 logical)
{
const pgoff_t page_index = (logical - sblock->logical) >> PAGE_SHIFT;
struct scrub_sector *ssector;
@ -292,7 +306,7 @@ static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock,
/* We must never have scrub_block exceed U32_MAX in size. */
ASSERT(logical - sblock->logical < U32_MAX);
ssector = kzalloc(sizeof(*ssector), gfp);
ssector = kzalloc(sizeof(*ssector), GFP_KERNEL);
if (!ssector)
return NULL;
@ -300,7 +314,7 @@ static struct scrub_sector *alloc_scrub_sector(struct scrub_block *sblock,
if (!sblock->pages[page_index]) {
int ret;
sblock->pages[page_index] = alloc_page(gfp);
sblock->pages[page_index] = alloc_page(GFP_KERNEL);
if (!sblock->pages[page_index]) {
kfree(ssector);
return NULL;
@ -794,8 +808,8 @@ nomem:
return ERR_PTR(-ENOMEM);
}
static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
void *warn_ctx)
static int scrub_print_warning_inode(u64 inum, u64 offset, u64 num_bytes,
u64 root, void *warn_ctx)
{
u32 nlink;
int ret;
@ -862,7 +876,7 @@ static int scrub_print_warning_inode(u64 inum, u64 offset, u64 root,
btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu, length %u, links %u (path: %s)",
swarn->errstr, swarn->logical,
rcu_str_deref(swarn->dev->name),
btrfs_dev_name(swarn->dev),
swarn->physical,
root, inum, offset,
fs_info->sectorsize, nlink,
@ -876,7 +890,7 @@ err:
btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu, root %llu, inode %llu, offset %llu: path resolving failed with ret=%d",
swarn->errstr, swarn->logical,
rcu_str_deref(swarn->dev->name),
btrfs_dev_name(swarn->dev),
swarn->physical,
root, inum, offset, ret);
@ -894,7 +908,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
struct btrfs_extent_item *ei;
struct scrub_warning swarn;
unsigned long ptr = 0;
u64 extent_item_pos;
u64 flags = 0;
u64 ref_root;
u32 item_size;
@ -908,8 +921,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
/* Super block error, no need to search extent tree. */
if (sblock->sectors[0]->flags & BTRFS_EXTENT_FLAG_SUPER) {
btrfs_warn_in_rcu(fs_info, "%s on device %s, physical %llu",
errstr, rcu_str_deref(dev->name),
sblock->physical);
errstr, btrfs_dev_name(dev), sblock->physical);
return;
}
path = btrfs_alloc_path();
@ -926,7 +938,6 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
if (ret < 0)
goto out;
extent_item_pos = swarn.logical - found_key.objectid;
swarn.extent_item_size = found_key.offset;
eb = path->nodes[0];
@ -941,7 +952,7 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
btrfs_warn_in_rcu(fs_info,
"%s at logical %llu on dev %s, physical %llu: metadata %s (level %d) in tree %llu",
errstr, swarn.logical,
rcu_str_deref(dev->name),
btrfs_dev_name(dev),
swarn.physical,
ref_level ? "node" : "leaf",
ret < 0 ? -1 : ref_level,
@ -949,12 +960,18 @@ static void scrub_print_warning(const char *errstr, struct scrub_block *sblock)
} while (ret != 1);
btrfs_release_path(path);
} else {
struct btrfs_backref_walk_ctx ctx = { 0 };
btrfs_release_path(path);
ctx.bytenr = found_key.objectid;
ctx.extent_item_pos = swarn.logical - found_key.objectid;
ctx.fs_info = fs_info;
swarn.path = path;
swarn.dev = dev;
iterate_extent_inodes(fs_info, found_key.objectid,
extent_item_pos, 1,
scrub_print_warning_inode, &swarn, false);
iterate_extent_inodes(&ctx, true, scrub_print_warning_inode, &swarn);
}
out:
@ -1358,7 +1375,7 @@ corrected_error:
spin_unlock(&sctx->stat_lock);
btrfs_err_rl_in_rcu(fs_info,
"fixed up error at logical %llu on dev %s",
logical, rcu_str_deref(dev->name));
logical, btrfs_dev_name(dev));
}
} else {
did_not_correct_error:
@ -1367,7 +1384,7 @@ did_not_correct_error:
spin_unlock(&sctx->stat_lock);
btrfs_err_rl_in_rcu(fs_info,
"unable to fixup (regular) error at logical %llu on dev %s",
logical, rcu_str_deref(dev->name));
logical, btrfs_dev_name(dev));
}
out:
@ -1480,7 +1497,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
return -EIO;
}
recover = kzalloc(sizeof(struct scrub_recover), GFP_NOFS);
recover = kzalloc(sizeof(struct scrub_recover), GFP_KERNEL);
if (!recover) {
btrfs_put_bioc(bioc);
btrfs_bio_counter_dec(fs_info);
@ -1503,7 +1520,7 @@ static int scrub_setup_recheck_block(struct scrub_block *original_sblock,
sblock = sblocks_for_recheck[mirror_index];
sblock->sctx = sctx;
sector = alloc_scrub_sector(sblock, logical, GFP_NOFS);
sector = alloc_scrub_sector(sblock, logical);
if (!sector) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@ -2313,14 +2330,14 @@ static void scrub_missing_raid56_worker(struct work_struct *work)
spin_unlock(&sctx->stat_lock);
btrfs_err_rl_in_rcu(fs_info,
"IO error rebuilding logical %llu for dev %s",
logical, rcu_str_deref(dev->name));
logical, btrfs_dev_name(dev));
} else if (sblock->header_error || sblock->checksum_error) {
spin_lock(&sctx->stat_lock);
sctx->stat.uncorrectable_errors++;
spin_unlock(&sctx->stat_lock);
btrfs_err_rl_in_rcu(fs_info,
"failed to rebuild valid logical %llu for dev %s",
logical, rcu_str_deref(dev->name));
logical, btrfs_dev_name(dev));
} else {
scrub_write_block_to_dev_replace(sblock);
}
@ -2425,7 +2442,7 @@ static int scrub_sectors(struct scrub_ctx *sctx, u64 logical, u32 len,
*/
u32 l = min(sectorsize, len);
sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL);
sector = alloc_scrub_sector(sblock, logical);
if (!sector) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@ -2756,7 +2773,7 @@ static int scrub_sectors_for_parity(struct scrub_parity *sparity,
for (index = 0; len > 0; index++) {
struct scrub_sector *sector;
sector = alloc_scrub_sector(sblock, logical, GFP_KERNEL);
sector = alloc_scrub_sector(sblock, logical);
if (!sector) {
spin_lock(&sctx->stat_lock);
sctx->stat.malloc_errors++;
@ -3221,7 +3238,7 @@ static int scrub_raid56_data_stripe_for_parity(struct scrub_ctx *sctx,
extent_dev = bioc->stripes[0].dev;
btrfs_put_bioc(bioc);
ret = btrfs_lookup_csums_range(csum_root, extent_start,
ret = btrfs_lookup_csums_list(csum_root, extent_start,
extent_start + extent_size - 1,
&sctx->csum_list, 1, false);
if (ret) {
@ -3447,7 +3464,7 @@ static int scrub_simple_mirror(struct scrub_ctx *sctx,
cur_logical;
if (extent_flags & BTRFS_EXTENT_FLAG_DATA) {
ret = btrfs_lookup_csums_range(csum_root, cur_logical,
ret = btrfs_lookup_csums_list(csum_root, cur_logical,
cur_logical + scrub_len - 1,
&sctx->csum_list, 1, false);
if (ret)
@ -4284,7 +4301,7 @@ int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
mutex_unlock(&fs_info->fs_devices->device_list_mutex);
btrfs_err_in_rcu(fs_info,
"scrub on devid %llu: filesystem on %s is not writable",
devid, rcu_str_deref(dev->name));
devid, btrfs_dev_name(dev));
ret = -EROFS;
goto out;
}

16
fs/btrfs/scrub.h Normal file
View File

@ -0,0 +1,16 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BTRFS_SCRUB_H
#define BTRFS_SCRUB_H
int btrfs_scrub_dev(struct btrfs_fs_info *fs_info, u64 devid, u64 start,
u64 end, struct btrfs_scrub_progress *progress,
int readonly, int is_dev_replace);
void btrfs_scrub_pause(struct btrfs_fs_info *fs_info);
void btrfs_scrub_continue(struct btrfs_fs_info *fs_info);
int btrfs_scrub_cancel(struct btrfs_fs_info *info);
int btrfs_scrub_cancel_dev(struct btrfs_device *dev);
int btrfs_scrub_progress(struct btrfs_fs_info *fs_info, u64 devid,
struct btrfs_scrub_progress *progress);
#endif

View File

@ -27,6 +27,11 @@
#include "compression.h"
#include "xattr.h"
#include "print-tree.h"
#include "accessors.h"
#include "dir-item.h"
#include "file-item.h"
#include "ioctl.h"
#include "verity.h"
/*
* Maximum number of references an extent can have in order for us to attempt to
@ -34,7 +39,7 @@
* avoid hitting limitations of the backreference walking code (taking a lot of
* time and using too much memory for extents with large number of references).
*/
#define SEND_MAX_EXTENT_REFS 64
#define SEND_MAX_EXTENT_REFS 1024
/*
* A fs_path is a helper to dynamically build path names with unknown size.
@ -71,13 +76,46 @@ struct clone_root {
struct btrfs_root *root;
u64 ino;
u64 offset;
u64 found_refs;
u64 num_bytes;
bool found_ref;
};
#define SEND_CTX_MAX_NAME_CACHE_SIZE 128
#define SEND_CTX_NAME_CACHE_CLEAN_SIZE (SEND_CTX_MAX_NAME_CACHE_SIZE * 2)
/*
* Limit the root_ids array of struct backref_cache_entry to 12 elements.
* This makes the size of a cache entry to be exactly 128 bytes on x86_64.
* The most common case is to have a single root for cloning, which corresponds
* to the send root. Having the user specify more than 11 clone roots is not
* common, and in such rare cases we simply don't use caching if the number of
* cloning roots that lead down to a leaf is more than 12.
*/
#define SEND_MAX_BACKREF_CACHE_ROOTS 12
/*
* Max number of entries in the cache.
* With SEND_MAX_BACKREF_CACHE_ROOTS as 12, the size in bytes, excluding
* maple tree's internal nodes, is 16K.
*/
#define SEND_MAX_BACKREF_CACHE_SIZE 128
/*
* A backref cache entry maps a leaf to a list of IDs of roots from which the
* leaf is accessible and we can use for clone operations.
* With SEND_MAX_BACKREF_CACHE_ROOTS as 12, each cache entry is 128 bytes (on
* x86_64).
*/
struct backref_cache_entry {
/* List to link to the cache's lru list. */
struct list_head list;
/* The key for this entry in the cache. */
u64 key;
u64 root_ids[SEND_MAX_BACKREF_CACHE_ROOTS];
/* Number of valid elements in the root_ids array. */
int num_roots;
};
struct send_ctx {
struct file *send_filp;
loff_t send_off;
@ -246,6 +284,14 @@ struct send_ctx {
struct rb_root rbtree_new_refs;
struct rb_root rbtree_deleted_refs;
struct {
u64 last_reloc_trans;
struct list_head lru_list;
struct maple_tree entries;
/* Number of entries stored in the cache. */
int size;
} backref_cache;
};
struct pending_dir_move {
@ -1093,7 +1139,7 @@ static int iterate_dir_item(struct btrfs_root *root, struct btrfs_path *path,
data_len = btrfs_dir_data_len(eb, di);
btrfs_dir_item_key_to_cpu(eb, di, &di_key);
if (btrfs_dir_type(eb, di) == BTRFS_FT_XATTR) {
if (btrfs_dir_ftype(eb, di) == BTRFS_FT_XATTR) {
if (name_len > XATTR_NAME_MAX) {
ret = -ENAMETOOLONG;
goto out;
@ -1236,8 +1282,12 @@ struct backref_ctx {
/* may be truncated in case it's the last extent in a file */
u64 extent_len;
/* Just to check for bugs in backref resolving */
int found_itself;
/* The bytenr the file extent item we are processing refers to. */
u64 bytenr;
/* The owner (root id) of the data backref for the current extent. */
u64 backref_owner;
/* The offset of the data backref for the current extent. */
u64 backref_offset;
};
static int __clone_root_cmp_bsearch(const void *key, const void *elt)
@ -1266,32 +1316,33 @@ static int __clone_root_cmp_sort(const void *e1, const void *e2)
/*
* Called for every backref that is found for the current extent.
* Results are collected in sctx->clone_roots->ino/offset/found_refs
* Results are collected in sctx->clone_roots->ino/offset.
*/
static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
static int iterate_backrefs(u64 ino, u64 offset, u64 num_bytes, u64 root_id,
void *ctx_)
{
struct backref_ctx *bctx = ctx_;
struct clone_root *found;
struct clone_root *clone_root;
/* First check if the root is in the list of accepted clone sources */
found = bsearch((void *)(uintptr_t)root, bctx->sctx->clone_roots,
clone_root = bsearch((void *)(uintptr_t)root_id, bctx->sctx->clone_roots,
bctx->sctx->clone_roots_cnt,
sizeof(struct clone_root),
__clone_root_cmp_bsearch);
if (!found)
if (!clone_root)
return 0;
if (found->root == bctx->sctx->send_root &&
/* This is our own reference, bail out as we can't clone from it. */
if (clone_root->root == bctx->sctx->send_root &&
ino == bctx->cur_objectid &&
offset == bctx->cur_offset) {
bctx->found_itself = 1;
}
offset == bctx->cur_offset)
return 0;
/*
* Make sure we don't consider clones from send_root that are
* behind the current inode/offset.
*/
if (found->root == bctx->sctx->send_root) {
if (clone_root->root == bctx->sctx->send_root) {
/*
* If the source inode was not yet processed we can't issue a
* clone operation, as the source extent does not exist yet at
@ -1312,21 +1363,217 @@ static int __iterate_backrefs(u64 ino, u64 offset, u64 root, void *ctx_)
}
bctx->found++;
found->found_refs++;
if (ino < found->ino) {
found->ino = ino;
found->offset = offset;
} else if (found->ino == ino) {
clone_root->found_ref = true;
/*
* same extent found more then once in the same file.
* If the given backref refers to a file extent item with a larger
* number of bytes than what we found before, use the new one so that
* we clone more optimally and end up doing less writes and getting
* less exclusive, non-shared extents at the destination.
*/
if (found->offset > offset + bctx->extent_len)
found->offset = offset;
if (num_bytes > clone_root->num_bytes) {
clone_root->ino = ino;
clone_root->offset = offset;
clone_root->num_bytes = num_bytes;
/*
* Found a perfect candidate, so there's no need to continue
* backref walking.
*/
if (num_bytes >= bctx->extent_len)
return BTRFS_ITERATE_EXTENT_INODES_STOP;
}
return 0;
}
static void empty_backref_cache(struct send_ctx *sctx)
{
struct backref_cache_entry *entry;
struct backref_cache_entry *tmp;
list_for_each_entry_safe(entry, tmp, &sctx->backref_cache.lru_list, list)
kfree(entry);
INIT_LIST_HEAD(&sctx->backref_cache.lru_list);
mtree_destroy(&sctx->backref_cache.entries);
sctx->backref_cache.size = 0;
}
static bool lookup_backref_cache(u64 leaf_bytenr, void *ctx,
const u64 **root_ids_ret, int *root_count_ret)
{
struct backref_ctx *bctx = ctx;
struct send_ctx *sctx = bctx->sctx;
struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
const u64 key = leaf_bytenr >> fs_info->sectorsize_bits;
struct backref_cache_entry *entry;
if (sctx->backref_cache.size == 0)
return false;
/*
* If relocation happened since we first filled the cache, then we must
* empty the cache and can not use it, because even though we operate on
* read-only roots, their leaves and nodes may have been reallocated and
* now be used for different nodes/leaves of the same tree or some other
* tree.
*
* We are called from iterate_extent_inodes() while either holding a
* transaction handle or holding fs_info->commit_root_sem, so no need
* to take any lock here.
*/
if (fs_info->last_reloc_trans > sctx->backref_cache.last_reloc_trans) {
empty_backref_cache(sctx);
return false;
}
entry = mtree_load(&sctx->backref_cache.entries, key);
if (!entry)
return false;
*root_ids_ret = entry->root_ids;
*root_count_ret = entry->num_roots;
list_move_tail(&entry->list, &sctx->backref_cache.lru_list);
return true;
}
static void store_backref_cache(u64 leaf_bytenr, const struct ulist *root_ids,
void *ctx)
{
struct backref_ctx *bctx = ctx;
struct send_ctx *sctx = bctx->sctx;
struct btrfs_fs_info *fs_info = sctx->send_root->fs_info;
struct backref_cache_entry *new_entry;
struct ulist_iterator uiter;
struct ulist_node *node;
int ret;
/*
* We're called while holding a transaction handle or while holding
* fs_info->commit_root_sem (at iterate_extent_inodes()), so must do a
* NOFS allocation.
*/
new_entry = kmalloc(sizeof(struct backref_cache_entry), GFP_NOFS);
/* No worries, cache is optional. */
if (!new_entry)
return;
new_entry->key = leaf_bytenr >> fs_info->sectorsize_bits;
new_entry->num_roots = 0;
ULIST_ITER_INIT(&uiter);
while ((node = ulist_next(root_ids, &uiter)) != NULL) {
const u64 root_id = node->val;
struct clone_root *root;
root = bsearch((void *)(uintptr_t)root_id, sctx->clone_roots,
sctx->clone_roots_cnt, sizeof(struct clone_root),
__clone_root_cmp_bsearch);
if (!root)
continue;
/* Too many roots, just exit, no worries as caching is optional. */
if (new_entry->num_roots >= SEND_MAX_BACKREF_CACHE_ROOTS) {
kfree(new_entry);
return;
}
new_entry->root_ids[new_entry->num_roots] = root_id;
new_entry->num_roots++;
}
/*
* We may have not added any roots to the new cache entry, which means
* none of the roots is part of the list of roots from which we are
* allowed to clone. Cache the new entry as it's still useful to avoid
* backref walking to determine which roots have a path to the leaf.
*/
if (sctx->backref_cache.size >= SEND_MAX_BACKREF_CACHE_SIZE) {
struct backref_cache_entry *lru_entry;
struct backref_cache_entry *mt_entry;
lru_entry = list_first_entry(&sctx->backref_cache.lru_list,
struct backref_cache_entry, list);
mt_entry = mtree_erase(&sctx->backref_cache.entries, lru_entry->key);
ASSERT(mt_entry == lru_entry);
list_del(&mt_entry->list);
kfree(mt_entry);
sctx->backref_cache.size--;
}
ret = mtree_insert(&sctx->backref_cache.entries, new_entry->key,
new_entry, GFP_NOFS);
ASSERT(ret == 0 || ret == -ENOMEM);
if (ret) {
/* Caching is optional, no worries. */
kfree(new_entry);
return;
}
list_add_tail(&new_entry->list, &sctx->backref_cache.lru_list);
/*
* We are called from iterate_extent_inodes() while either holding a
* transaction handle or holding fs_info->commit_root_sem, so no need
* to take any lock here.
*/
if (sctx->backref_cache.size == 0)
sctx->backref_cache.last_reloc_trans = fs_info->last_reloc_trans;
sctx->backref_cache.size++;
}
static int check_extent_item(u64 bytenr, const struct btrfs_extent_item *ei,
const struct extent_buffer *leaf, void *ctx)
{
const u64 refs = btrfs_extent_refs(leaf, ei);
const struct backref_ctx *bctx = ctx;
const struct send_ctx *sctx = bctx->sctx;
if (bytenr == bctx->bytenr) {
const u64 flags = btrfs_extent_flags(leaf, ei);
if (WARN_ON(flags & BTRFS_EXTENT_FLAG_TREE_BLOCK))
return -EUCLEAN;
/*
* If we have only one reference and only the send root as a
* clone source - meaning no clone roots were given in the
* struct btrfs_ioctl_send_args passed to the send ioctl - then
* it's our reference and there's no point in doing backref
* walking which is expensive, so exit early.
*/
if (refs == 1 && sctx->clone_roots_cnt == 1)
return -ENOENT;
}
/*
* Backreference walking (iterate_extent_inodes() below) is currently
* too expensive when an extent has a large number of references, both
* in time spent and used memory. So for now just fallback to write
* operations instead of clone operations when an extent has more than
* a certain amount of references.
*/
if (refs > SEND_MAX_EXTENT_REFS)
return -ENOENT;
return 0;
}
static bool skip_self_data_ref(u64 root, u64 ino, u64 offset, void *ctx)
{
const struct backref_ctx *bctx = ctx;
if (ino == bctx->cur_objectid &&
root == bctx->backref_owner &&
offset == bctx->backref_offset)
return true;
return false;
}
/*
* Given an inode, offset and extent item, it finds a good clone for a clone
* instruction. Returns -ENOENT when none could be found. The function makes
@ -1348,79 +1595,36 @@ static int find_extent_clone(struct send_ctx *sctx,
u64 logical;
u64 disk_byte;
u64 num_bytes;
u64 extent_item_pos;
u64 flags = 0;
struct btrfs_file_extent_item *fi;
struct extent_buffer *eb = path->nodes[0];
struct backref_ctx backref_ctx = {0};
struct backref_ctx backref_ctx = { 0 };
struct btrfs_backref_walk_ctx backref_walk_ctx = { 0 };
struct clone_root *cur_clone_root;
struct btrfs_key found_key;
struct btrfs_path *tmp_path;
struct btrfs_extent_item *ei;
int compressed;
u32 i;
tmp_path = alloc_path_for_send();
if (!tmp_path)
return -ENOMEM;
/* We only use this path under the commit sem */
tmp_path->need_commit_sem = 0;
if (data_offset >= ino_size) {
/*
* There may be extents that lie behind the file's size.
* I at least had this in combination with snapshotting while
* writing large files.
* With fallocate we can get prealloc extents beyond the inode's i_size,
* so we don't do anything here because clone operations can not clone
* to a range beyond i_size without increasing the i_size of the
* destination inode.
*/
ret = 0;
goto out;
}
if (data_offset >= ino_size)
return 0;
fi = btrfs_item_ptr(eb, path->slots[0],
struct btrfs_file_extent_item);
fi = btrfs_item_ptr(eb, path->slots[0], struct btrfs_file_extent_item);
extent_type = btrfs_file_extent_type(eb, fi);
if (extent_type == BTRFS_FILE_EXTENT_INLINE) {
ret = -ENOENT;
goto out;
}
compressed = btrfs_file_extent_compression(eb, fi);
if (extent_type == BTRFS_FILE_EXTENT_INLINE)
return -ENOENT;
num_bytes = btrfs_file_extent_num_bytes(eb, fi);
disk_byte = btrfs_file_extent_disk_bytenr(eb, fi);
if (disk_byte == 0) {
ret = -ENOENT;
goto out;
}
if (disk_byte == 0)
return -ENOENT;
compressed = btrfs_file_extent_compression(eb, fi);
num_bytes = btrfs_file_extent_num_bytes(eb, fi);
logical = disk_byte + btrfs_file_extent_offset(eb, fi);
down_read(&fs_info->commit_root_sem);
ret = extent_from_logical(fs_info, disk_byte, tmp_path,
&found_key, &flags);
up_read(&fs_info->commit_root_sem);
if (ret < 0)
goto out;
if (flags & BTRFS_EXTENT_FLAG_TREE_BLOCK) {
ret = -EIO;
goto out;
}
ei = btrfs_item_ptr(tmp_path->nodes[0], tmp_path->slots[0],
struct btrfs_extent_item);
/*
* Backreference walking (iterate_extent_inodes() below) is currently
* too expensive when an extent has a large number of references, both
* in time spent and used memory. So for now just fallback to write
* operations instead of clone operations when an extent has more than
* a certain amount of references.
*/
if (btrfs_extent_refs(tmp_path->nodes[0], ei) > SEND_MAX_EXTENT_REFS) {
ret = -ENOENT;
goto out;
}
btrfs_release_path(tmp_path);
/*
* Setup the clone roots.
*/
@ -1428,37 +1632,59 @@ static int find_extent_clone(struct send_ctx *sctx,
cur_clone_root = sctx->clone_roots + i;
cur_clone_root->ino = (u64)-1;
cur_clone_root->offset = 0;
cur_clone_root->found_refs = 0;
cur_clone_root->num_bytes = 0;
cur_clone_root->found_ref = false;
}
backref_ctx.sctx = sctx;
backref_ctx.found = 0;
backref_ctx.cur_objectid = ino;
backref_ctx.cur_offset = data_offset;
backref_ctx.found_itself = 0;
backref_ctx.extent_len = num_bytes;
backref_ctx.bytenr = disk_byte;
/*
* Use the header owner and not the send root's id, because in case of a
* snapshot we can have shared subtrees.
*/
backref_ctx.backref_owner = btrfs_header_owner(eb);
backref_ctx.backref_offset = data_offset - btrfs_file_extent_offset(eb, fi);
/*
* The last extent of a file may be too large due to page alignment.
* We need to adjust extent_len in this case so that the checks in
* __iterate_backrefs work.
* iterate_backrefs() work.
*/
if (data_offset + num_bytes >= ino_size)
backref_ctx.extent_len = ino_size - data_offset;
else
backref_ctx.extent_len = num_bytes;
/*
* Now collect all backrefs.
*/
backref_walk_ctx.bytenr = disk_byte;
if (compressed == BTRFS_COMPRESS_NONE)
extent_item_pos = logical - found_key.objectid;
else
extent_item_pos = 0;
ret = iterate_extent_inodes(fs_info, found_key.objectid,
extent_item_pos, 1, __iterate_backrefs,
&backref_ctx, false);
backref_walk_ctx.extent_item_pos = btrfs_file_extent_offset(eb, fi);
backref_walk_ctx.fs_info = fs_info;
backref_walk_ctx.cache_lookup = lookup_backref_cache;
backref_walk_ctx.cache_store = store_backref_cache;
backref_walk_ctx.indirect_ref_iterator = iterate_backrefs;
backref_walk_ctx.check_extent_item = check_extent_item;
backref_walk_ctx.user_ctx = &backref_ctx;
/*
* If have a single clone root, then it's the send root and we can tell
* the backref walking code to skip our own backref and not resolve it,
* since we can not use it for cloning - the source and destination
* ranges can't overlap and in case the leaf is shared through a subtree
* due to snapshots, we can't use those other roots since they are not
* in the list of clone roots.
*/
if (sctx->clone_roots_cnt == 1)
backref_walk_ctx.skip_data_ref = skip_self_data_ref;
ret = iterate_extent_inodes(&backref_walk_ctx, true, iterate_backrefs,
&backref_ctx);
if (ret < 0)
goto out;
return ret;
down_read(&fs_info->commit_root_sem);
if (fs_info->last_reloc_trans > sctx->last_reloc_trans) {
@ -1475,37 +1701,42 @@ static int find_extent_clone(struct send_ctx *sctx,
* was already reallocated after the relocation.
*/
up_read(&fs_info->commit_root_sem);
ret = -ENOENT;
goto out;
return -ENOENT;
}
up_read(&fs_info->commit_root_sem);
if (!backref_ctx.found_itself) {
/* found a bug in backref code? */
ret = -EIO;
btrfs_err(fs_info,
"did not find backref in send_root. inode=%llu, offset=%llu, disk_byte=%llu found extent=%llu",
ino, data_offset, disk_byte, found_key.objectid);
goto out;
}
btrfs_debug(fs_info,
"find_extent_clone: data_offset=%llu, ino=%llu, num_bytes=%llu, logical=%llu",
data_offset, ino, num_bytes, logical);
if (!backref_ctx.found)
if (!backref_ctx.found) {
btrfs_debug(fs_info, "no clones found");
return -ENOENT;
}
cur_clone_root = NULL;
for (i = 0; i < sctx->clone_roots_cnt; i++) {
if (sctx->clone_roots[i].found_refs) {
if (!cur_clone_root)
cur_clone_root = sctx->clone_roots + i;
else if (sctx->clone_roots[i].root == sctx->send_root)
/* prefer clones from send_root over others */
cur_clone_root = sctx->clone_roots + i;
}
struct clone_root *clone_root = &sctx->clone_roots[i];
if (!clone_root->found_ref)
continue;
/*
* Choose the root from which we can clone more bytes, to
* minimize write operations and therefore have more extent
* sharing at the destination (the same as in the source).
*/
if (!cur_clone_root ||
clone_root->num_bytes > cur_clone_root->num_bytes) {
cur_clone_root = clone_root;
/*
* We found an optimal clone candidate (any inode from
* any root is fine), so we're done.
*/
if (clone_root->num_bytes >= backref_ctx.extent_len)
break;
}
}
if (cur_clone_root) {
@ -1515,8 +1746,6 @@ static int find_extent_clone(struct send_ctx *sctx,
ret = -ENOENT;
}
out:
btrfs_free_path(tmp_path);
return ret;
}
@ -1596,13 +1825,17 @@ static int gen_unique_name(struct send_ctx *sctx,
return -ENOMEM;
while (1) {
struct fscrypt_str tmp_name;
len = snprintf(tmp, sizeof(tmp), "o%llu-%llu-%llu",
ino, gen, idx);
ASSERT(len < sizeof(tmp));
tmp_name.name = tmp;
tmp_name.len = strlen(tmp);
di = btrfs_lookup_dir_item(NULL, sctx->send_root,
path, BTRFS_FIRST_FREE_OBJECTID,
tmp, strlen(tmp), 0);
&tmp_name, 0);
btrfs_release_path(path);
if (IS_ERR(di)) {
ret = PTR_ERR(di);
@ -1622,7 +1855,7 @@ static int gen_unique_name(struct send_ctx *sctx,
di = btrfs_lookup_dir_item(NULL, sctx->parent_root,
path, BTRFS_FIRST_FREE_OBJECTID,
tmp, strlen(tmp), 0);
&tmp_name, 0);
btrfs_release_path(path);
if (IS_ERR(di)) {
ret = PTR_ERR(di);
@ -1752,13 +1985,13 @@ static int lookup_dir_item_inode(struct btrfs_root *root,
struct btrfs_dir_item *di;
struct btrfs_key key;
struct btrfs_path *path;
struct fscrypt_str name_str = FSTR_INIT((char *)name, name_len);
path = alloc_path_for_send();
if (!path)
return -ENOMEM;
di = btrfs_lookup_dir_item(NULL, root, path,
dir, name, name_len, 0);
di = btrfs_lookup_dir_item(NULL, root, path, dir, &name_str, 0);
if (IS_ERR_OR_NULL(di)) {
ret = di ? PTR_ERR(di) : -ENOENT;
goto out;
@ -7863,6 +8096,9 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
INIT_RADIX_TREE(&sctx->name_cache, GFP_KERNEL);
INIT_LIST_HEAD(&sctx->name_cache_list);
INIT_LIST_HEAD(&sctx->backref_cache.lru_list);
mt_init(&sctx->backref_cache.entries);
sctx->flags = arg->flags;
if (arg->flags & BTRFS_SEND_FLAG_VERSION) {
@ -7901,7 +8137,7 @@ long btrfs_ioctl_send(struct inode *inode, struct btrfs_ioctl_send_args *arg)
if (sctx->proto >= 2) {
u32 send_buf_num_pages;
sctx->send_max_size = ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE);
sctx->send_max_size = BTRFS_SEND_BUF_SIZE_V2;
sctx->send_buf = vmalloc(sctx->send_max_size);
if (!sctx->send_buf) {
ret = -ENOMEM;
@ -8125,6 +8361,8 @@ out:
close_current_inode(sctx);
empty_backref_cache(sctx);
kfree(sctx);
}

View File

@ -18,10 +18,12 @@
#endif
/*
* In send stream v1, no command is larger than 64K. In send stream v2, no limit
* should be assumed.
* In send stream v1, no command is larger than 64K. In send stream v2, no
* limit should be assumed, the buffer size is set to be a header with
* compressed extent size.
*/
#define BTRFS_SEND_BUF_SIZE_V1 SZ_64K
#define BTRFS_SEND_BUF_SIZE_V2 ALIGN(SZ_16K + BTRFS_MAX_COMPRESSED, PAGE_SIZE)
struct inode;
struct btrfs_ioctl_send_args;

View File

@ -10,6 +10,9 @@
#include "transaction.h"
#include "block-group.h"
#include "zoned.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
/*
* HOW DOES SPACE RESERVATION WORK
@ -856,7 +859,7 @@ static bool need_preemptive_reclaim(struct btrfs_fs_info *fs_info,
u64 thresh;
u64 used;
thresh = div_factor_fine(total, 90);
thresh = mult_perc(total, 90);
lockdep_assert_held(&space_info->lock);
@ -974,7 +977,7 @@ static bool steal_from_global_rsv(struct btrfs_fs_info *fs_info,
return false;
spin_lock(&global_rsv->lock);
min_bytes = div_factor(global_rsv->size, 1);
min_bytes = mult_perc(global_rsv->size, 10);
if (global_rsv->reserved < min_bytes + ticket->bytes) {
spin_unlock(&global_rsv->lock);
return false;
@ -1490,8 +1493,8 @@ static void wait_reserve_ticket(struct btrfs_fs_info *fs_info,
spin_unlock(&space_info->lock);
}
/**
* Do the appropriate flushing and waiting for a ticket
/*
* Do the appropriate flushing and waiting for a ticket.
*
* @fs_info: the filesystem
* @space_info: space info for the reservation
@ -1583,8 +1586,18 @@ static inline bool can_steal(enum btrfs_reserve_flush_enum flush)
flush == BTRFS_RESERVE_FLUSH_EVICT);
}
/**
* Try to reserve bytes from the block_rsv's space
/*
* NO_FLUSH and FLUSH_EMERGENCY don't want to create a ticket, they just want to
* fail as quickly as possible.
*/
static inline bool can_ticket(enum btrfs_reserve_flush_enum flush)
{
return (flush != BTRFS_RESERVE_NO_FLUSH &&
flush != BTRFS_RESERVE_FLUSH_EMERGENCY);
}
/*
* Try to reserve bytes from the block_rsv's space.
*
* @fs_info: the filesystem
* @space_info: space info we want to allocate from
@ -1644,6 +1657,21 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
ret = 0;
}
/*
* Things are dire, we need to make a reservation so we don't abort. We
* will let this reservation go through as long as we have actual space
* left to allocate for the block.
*/
if (ret && unlikely(flush == BTRFS_RESERVE_FLUSH_EMERGENCY)) {
used = btrfs_space_info_used(space_info, false);
if (used + orig_bytes <=
writable_total_bytes(fs_info, space_info)) {
btrfs_space_info_update_bytes_may_use(fs_info, space_info,
orig_bytes);
ret = 0;
}
}
/*
* If we couldn't make a reservation then setup our reservation ticket
* and kick the async worker if it's not already running.
@ -1651,7 +1679,7 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
* If we are a priority flusher then we just need to add our ticket to
* the list and we will do our own flushing further down.
*/
if (ret && flush != BTRFS_RESERVE_NO_FLUSH) {
if (ret && can_ticket(flush)) {
ticket.bytes = orig_bytes;
ticket.error = 0;
space_info->reclaim_size += ticket.bytes;
@ -1701,15 +1729,15 @@ static int __reserve_bytes(struct btrfs_fs_info *fs_info,
}
}
spin_unlock(&space_info->lock);
if (!ret || flush == BTRFS_RESERVE_NO_FLUSH)
if (!ret || !can_ticket(flush))
return ret;
return handle_reserve_ticket(fs_info, space_info, &ticket, start_ns,
orig_bytes, flush);
}
/**
* Trye to reserve metadata bytes from the block_rsv's space
/*
* Try to reserve metadata bytes from the block_rsv's space.
*
* @fs_info: the filesystem
* @block_rsv: block_rsv we're allocating for
@ -1743,8 +1771,8 @@ int btrfs_reserve_metadata_bytes(struct btrfs_fs_info *fs_info,
return ret;
}
/**
* Try to reserve data bytes for an allocation
/*
* Try to reserve data bytes for an allocation.
*
* @fs_info: the filesystem
* @bytes: number of bytes we need
@ -1787,3 +1815,37 @@ __cold void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info)
}
dump_global_block_rsv(fs_info);
}
/*
* Account the unused space of all the readonly block group in the space_info.
* takes mirrors into account.
*/
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo)
{
struct btrfs_block_group *block_group;
u64 free_bytes = 0;
int factor;
/* It's df, we don't care if it's racy */
if (list_empty(&sinfo->ro_bgs))
return 0;
spin_lock(&sinfo->lock);
list_for_each_entry(block_group, &sinfo->ro_bgs, ro_list) {
spin_lock(&block_group->lock);
if (!block_group->ro) {
spin_unlock(&block_group->lock);
continue;
}
factor = btrfs_bg_type_to_factor(block_group->flags);
free_bytes += (block_group->length -
block_group->used) * factor;
spin_unlock(&block_group->lock);
}
spin_unlock(&sinfo->lock);
return free_bytes;
}

View File

@ -5,6 +5,83 @@
#include "volumes.h"
/*
* Different levels for to flush space when doing space reservations.
*
* The higher the level, the more methods we try to reclaim space.
*/
enum btrfs_reserve_flush_enum {
/* If we are in the transaction, we can't flush anything.*/
BTRFS_RESERVE_NO_FLUSH,
/*
* Flush space by:
* - Running delayed inode items
* - Allocating a new chunk
*/
BTRFS_RESERVE_FLUSH_LIMIT,
/*
* Flush space by:
* - Running delayed inode items
* - Running delayed refs
* - Running delalloc and waiting for ordered extents
* - Allocating a new chunk
*/
BTRFS_RESERVE_FLUSH_EVICT,
/*
* Flush space by above mentioned methods and by:
* - Running delayed iputs
* - Committing transaction
*
* Can be interrupted by a fatal signal.
*/
BTRFS_RESERVE_FLUSH_DATA,
BTRFS_RESERVE_FLUSH_FREE_SPACE_INODE,
BTRFS_RESERVE_FLUSH_ALL,
/*
* Pretty much the same as FLUSH_ALL, but can also steal space from
* global rsv.
*
* Can be interrupted by a fatal signal.
*/
BTRFS_RESERVE_FLUSH_ALL_STEAL,
/*
* This is for btrfs_use_block_rsv only. We have exhausted our block
* rsv and our global block rsv. This can happen for things like
* delalloc where we are overwriting a lot of extents with a single
* extent and didn't reserve enough space. Alternatively it can happen
* with delalloc where we reserve 1 extents worth for a large extent but
* fragmentation leads to multiple extents being created. This will
* give us the reservation in the case of
*
* if (num_bytes < (space_info->total_bytes -
* btrfs_space_info_used(space_info, false))
*
* Which ignores bytes_may_use. This is potentially dangerous, but our
* reservation system is generally pessimistic so is able to absorb this
* style of mistake.
*/
BTRFS_RESERVE_FLUSH_EMERGENCY,
};
enum btrfs_flush_state {
FLUSH_DELAYED_ITEMS_NR = 1,
FLUSH_DELAYED_ITEMS = 2,
FLUSH_DELAYED_REFS_NR = 3,
FLUSH_DELAYED_REFS = 4,
FLUSH_DELALLOC = 5,
FLUSH_DELALLOC_WAIT = 6,
FLUSH_DELALLOC_FULL = 7,
ALLOC_CHUNK = 8,
ALLOC_CHUNK_FORCE = 9,
RUN_DELAYED_IPUTS = 10,
COMMIT_TRANS = 11,
};
struct btrfs_space_info {
spinlock_t lock;
@ -159,5 +236,6 @@ int btrfs_reserve_data_bytes(struct btrfs_fs_info *fs_info, u64 bytes,
enum btrfs_reserve_flush_enum flush);
void btrfs_dump_space_info_for_trans_abort(struct btrfs_fs_info *fs_info);
void btrfs_init_async_reclaim_work(struct btrfs_fs_info *fs_info);
u64 btrfs_account_ro_block_groups_free_space(struct btrfs_space_info *sinfo);
#endif /* BTRFS_SPACE_INFO_H */

View File

@ -1,6 +1,7 @@
// SPDX-License-Identifier: GPL-2.0
#include <linux/slab.h>
#include "messages.h"
#include "ctree.h"
#include "subpage.h"
#include "btrfs_inode.h"

View File

@ -26,6 +26,7 @@
#include <linux/ratelimit.h>
#include <linux/crc32c.h>
#include <linux/btrfs.h>
#include "messages.h"
#include "delayed-inode.h"
#include "ctree.h"
#include "disk-io.h"
@ -34,7 +35,7 @@
#include "print-tree.h"
#include "props.h"
#include "xattr.h"
#include "volumes.h"
#include "bio.h"
#include "export.h"
#include "compression.h"
#include "rcu-string.h"
@ -49,6 +50,14 @@
#include "discard.h"
#include "qgroup.h"
#include "raid56.h"
#include "fs.h"
#include "accessors.h"
#include "defrag.h"
#include "dir-item.h"
#include "ioctl.h"
#include "scrub.h"
#include "verity.h"
#include "super.h"
#define CREATE_TRACE_POINTS
#include <trace/events/btrfs.h>
@ -67,328 +76,6 @@ static struct file_system_type btrfs_root_fs_type;
static int btrfs_remount(struct super_block *sb, int *flags, char *data);
#ifdef CONFIG_PRINTK
#define STATE_STRING_PREFACE ": state "
#define STATE_STRING_BUF_LEN (sizeof(STATE_STRING_PREFACE) + BTRFS_FS_STATE_COUNT)
/*
* Characters to print to indicate error conditions or uncommon filesystem state.
* RO is not an error.
*/
static const char fs_state_chars[] = {
[BTRFS_FS_STATE_ERROR] = 'E',
[BTRFS_FS_STATE_REMOUNTING] = 'M',
[BTRFS_FS_STATE_RO] = 0,
[BTRFS_FS_STATE_TRANS_ABORTED] = 'A',
[BTRFS_FS_STATE_DEV_REPLACING] = 'R',
[BTRFS_FS_STATE_DUMMY_FS_INFO] = 0,
[BTRFS_FS_STATE_NO_CSUMS] = 'C',
[BTRFS_FS_STATE_LOG_CLEANUP_ERROR] = 'L',
};
static void btrfs_state_to_string(const struct btrfs_fs_info *info, char *buf)
{
unsigned int bit;
bool states_printed = false;
unsigned long fs_state = READ_ONCE(info->fs_state);
char *curr = buf;
memcpy(curr, STATE_STRING_PREFACE, sizeof(STATE_STRING_PREFACE));
curr += sizeof(STATE_STRING_PREFACE) - 1;
for_each_set_bit(bit, &fs_state, sizeof(fs_state)) {
WARN_ON_ONCE(bit >= BTRFS_FS_STATE_COUNT);
if ((bit < BTRFS_FS_STATE_COUNT) && fs_state_chars[bit]) {
*curr++ = fs_state_chars[bit];
states_printed = true;
}
}
/* If no states were printed, reset the buffer */
if (!states_printed)
curr = buf;
*curr++ = 0;
}
#endif
/*
* Generally the error codes correspond to their respective errors, but there
* are a few special cases.
*
* EUCLEAN: Any sort of corruption that we encounter. The tree-checker for
* instance will return EUCLEAN if any of the blocks are corrupted in
* a way that is problematic. We want to reserve EUCLEAN for these
* sort of corruptions.
*
* EROFS: If we check BTRFS_FS_STATE_ERROR and fail out with a return error, we
* need to use EROFS for this case. We will have no idea of the
* original failure, that will have been reported at the time we tripped
* over the error. Each subsequent error that doesn't have any context
* of the original error should use EROFS when handling BTRFS_FS_STATE_ERROR.
*/
const char * __attribute_const__ btrfs_decode_error(int errno)
{
char *errstr = "unknown";
switch (errno) {
case -ENOENT: /* -2 */
errstr = "No such entry";
break;
case -EIO: /* -5 */
errstr = "IO failure";
break;
case -ENOMEM: /* -12*/
errstr = "Out of memory";
break;
case -EEXIST: /* -17 */
errstr = "Object already exists";
break;
case -ENOSPC: /* -28 */
errstr = "No space left";
break;
case -EROFS: /* -30 */
errstr = "Readonly filesystem";
break;
case -EOPNOTSUPP: /* -95 */
errstr = "Operation not supported";
break;
case -EUCLEAN: /* -117 */
errstr = "Filesystem corrupted";
break;
case -EDQUOT: /* -122 */
errstr = "Quota exceeded";
break;
}
return errstr;
}
/*
* __btrfs_handle_fs_error decodes expected errors from the caller and
* invokes the appropriate error response.
*/
__cold
void __btrfs_handle_fs_error(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...)
{
struct super_block *sb = fs_info->sb;
#ifdef CONFIG_PRINTK
char statestr[STATE_STRING_BUF_LEN];
const char *errstr;
#endif
/*
* Special case: if the error is EROFS, and we're already
* under SB_RDONLY, then it is safe here.
*/
if (errno == -EROFS && sb_rdonly(sb))
return;
#ifdef CONFIG_PRINTK
errstr = btrfs_decode_error(errno);
btrfs_state_to_string(fs_info, statestr);
if (fmt) {
struct va_format vaf;
va_list args;
va_start(args, fmt);
vaf.fmt = fmt;
vaf.va = &args;
pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s (%pV)\n",
sb->s_id, statestr, function, line, errno, errstr, &vaf);
va_end(args);
} else {
pr_crit("BTRFS: error (device %s%s) in %s:%d: errno=%d %s\n",
sb->s_id, statestr, function, line, errno, errstr);
}
#endif
/*
* Today we only save the error info to memory. Long term we'll
* also send it down to the disk
*/
set_bit(BTRFS_FS_STATE_ERROR, &fs_info->fs_state);
/* Don't go through full error handling during mount */
if (!(sb->s_flags & SB_BORN))
return;
if (sb_rdonly(sb))
return;
btrfs_discard_stop(fs_info);
/* btrfs handle error by forcing the filesystem readonly */
btrfs_set_sb_rdonly(sb);
btrfs_info(fs_info, "forced readonly");
/*
* Note that a running device replace operation is not canceled here
* although there is no way to update the progress. It would add the
* risk of a deadlock, therefore the canceling is omitted. The only
* penalty is that some I/O remains active until the procedure
* completes. The next time when the filesystem is mounted writable
* again, the device replace operation continues.
*/
}
#ifdef CONFIG_PRINTK
static const char * const logtypes[] = {
"emergency",
"alert",
"critical",
"error",
"warning",
"notice",
"info",
"debug",
};
/*
* Use one ratelimit state per log level so that a flood of less important
* messages doesn't cause more important ones to be dropped.
*/
static struct ratelimit_state printk_limits[] = {
RATELIMIT_STATE_INIT(printk_limits[0], DEFAULT_RATELIMIT_INTERVAL, 100),
RATELIMIT_STATE_INIT(printk_limits[1], DEFAULT_RATELIMIT_INTERVAL, 100),
RATELIMIT_STATE_INIT(printk_limits[2], DEFAULT_RATELIMIT_INTERVAL, 100),
RATELIMIT_STATE_INIT(printk_limits[3], DEFAULT_RATELIMIT_INTERVAL, 100),
RATELIMIT_STATE_INIT(printk_limits[4], DEFAULT_RATELIMIT_INTERVAL, 100),
RATELIMIT_STATE_INIT(printk_limits[5], DEFAULT_RATELIMIT_INTERVAL, 100),
RATELIMIT_STATE_INIT(printk_limits[6], DEFAULT_RATELIMIT_INTERVAL, 100),
RATELIMIT_STATE_INIT(printk_limits[7], DEFAULT_RATELIMIT_INTERVAL, 100),
};
void __cold _btrfs_printk(const struct btrfs_fs_info *fs_info, const char *fmt, ...)
{
char lvl[PRINTK_MAX_SINGLE_HEADER_LEN + 1] = "\0";
struct va_format vaf;
va_list args;
int kern_level;
const char *type = logtypes[4];
struct ratelimit_state *ratelimit = &printk_limits[4];
va_start(args, fmt);
while ((kern_level = printk_get_level(fmt)) != 0) {
size_t size = printk_skip_level(fmt) - fmt;
if (kern_level >= '0' && kern_level <= '7') {
memcpy(lvl, fmt, size);
lvl[size] = '\0';
type = logtypes[kern_level - '0'];
ratelimit = &printk_limits[kern_level - '0'];
}
fmt += size;
}
vaf.fmt = fmt;
vaf.va = &args;
if (__ratelimit(ratelimit)) {
if (fs_info) {
char statestr[STATE_STRING_BUF_LEN];
btrfs_state_to_string(fs_info, statestr);
_printk("%sBTRFS %s (device %s%s): %pV\n", lvl, type,
fs_info->sb->s_id, statestr, &vaf);
} else {
_printk("%sBTRFS %s: %pV\n", lvl, type, &vaf);
}
}
va_end(args);
}
#endif
#if BITS_PER_LONG == 32
void __cold btrfs_warn_32bit_limit(struct btrfs_fs_info *fs_info)
{
if (!test_and_set_bit(BTRFS_FS_32BIT_WARN, &fs_info->flags)) {
btrfs_warn(fs_info, "reaching 32bit limit for logical addresses");
btrfs_warn(fs_info,
"due to page cache limit on 32bit systems, btrfs can't access metadata at or beyond %lluT",
BTRFS_32BIT_MAX_FILE_SIZE >> 40);
btrfs_warn(fs_info,
"please consider upgrading to 64bit kernel/hardware");
}
}
void __cold btrfs_err_32bit_limit(struct btrfs_fs_info *fs_info)
{
if (!test_and_set_bit(BTRFS_FS_32BIT_ERROR, &fs_info->flags)) {
btrfs_err(fs_info, "reached 32bit limit for logical addresses");
btrfs_err(fs_info,
"due to page cache limit on 32bit systems, metadata beyond %lluT can't be accessed",
BTRFS_32BIT_MAX_FILE_SIZE >> 40);
btrfs_err(fs_info,
"please consider upgrading to 64bit kernel/hardware");
}
}
#endif
/*
* We only mark the transaction aborted and then set the file system read-only.
* This will prevent new transactions from starting or trying to join this
* one.
*
* This means that error recovery at the call site is limited to freeing
* any local memory allocations and passing the error code up without
* further cleanup. The transaction should complete as it normally would
* in the call path but will return -EIO.
*
* We'll complete the cleanup in btrfs_end_transaction and
* btrfs_commit_transaction.
*/
__cold
void __btrfs_abort_transaction(struct btrfs_trans_handle *trans,
const char *function,
unsigned int line, int errno, bool first_hit)
{
struct btrfs_fs_info *fs_info = trans->fs_info;
WRITE_ONCE(trans->aborted, errno);
WRITE_ONCE(trans->transaction->aborted, errno);
if (first_hit && errno == -ENOSPC)
btrfs_dump_space_info_for_trans_abort(fs_info);
/* Wake up anybody who may be waiting on this transaction */
wake_up(&fs_info->transaction_wait);
wake_up(&fs_info->transaction_blocked_wait);
__btrfs_handle_fs_error(fs_info, function, line, errno, NULL);
}
/*
* __btrfs_panic decodes unexpected, fatal errors from the caller,
* issues an alert, and either panics or BUGs, depending on mount options.
*/
__cold
void __btrfs_panic(struct btrfs_fs_info *fs_info, const char *function,
unsigned int line, int errno, const char *fmt, ...)
{
char *s_id = "<unknown>";
const char *errstr;
struct va_format vaf = { .fmt = fmt };
va_list args;
if (fs_info)
s_id = fs_info->sb->s_id;
va_start(args, fmt);
vaf.va = &args;
errstr = btrfs_decode_error(errno);
if (fs_info && (btrfs_test_opt(fs_info, PANIC_ON_FATAL_ERROR)))
panic(KERN_CRIT "BTRFS panic (device %s) in %s:%d: %pV (errno=%d %s)\n",
s_id, function, line, &vaf, errno, errstr);
btrfs_crit(fs_info, "panic in %s:%d: %pV (errno=%d %s)",
function, line, &vaf, errno, errstr);
va_end(args);
/* Caller calls BUG() */
}
static void btrfs_put_super(struct super_block *sb)
{
close_ctree(btrfs_sb(sb));
@ -918,12 +605,14 @@ int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
ret = -EINVAL;
goto out;
}
btrfs_clear_opt(info->mount_opt, NODISCARD);
break;
case Opt_nodiscard:
btrfs_clear_and_info(info, DISCARD_SYNC,
"turning off discard");
btrfs_clear_and_info(info, DISCARD_ASYNC,
"turning off async discard");
btrfs_set_opt(info->mount_opt, NODISCARD);
break;
case Opt_space_cache:
case Opt_space_cache_version:
@ -1394,6 +1083,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec
struct btrfs_dir_item *di;
struct btrfs_path *path;
struct btrfs_key location;
struct fscrypt_str name = FSTR_INIT("default", 7);
u64 dir_id;
path = btrfs_alloc_path();
@ -1406,7 +1096,7 @@ static int get_default_subvol_objectid(struct btrfs_fs_info *fs_info, u64 *objec
* to mount.
*/
dir_id = btrfs_super_root_dir(fs_info->super_copy);
di = btrfs_lookup_dir_item(NULL, root, path, dir_id, "default", 7, 0);
di = btrfs_lookup_dir_item(NULL, root, path, dir_id, &name, 0);
if (IS_ERR(di)) {
btrfs_free_path(path);
return PTR_ERR(di);
@ -1507,7 +1197,8 @@ int btrfs_sync_fs(struct super_block *sb, int wait)
* Exit unless we have some pending changes
* that need to go through commit
*/
if (fs_info->pending_changes == 0)
if (!test_bit(BTRFS_FS_NEED_TRANS_COMMIT,
&fs_info->flags))
return 0;
/*
* A non-blocking test if the fs is frozen. We must not
@ -2645,7 +2336,7 @@ static int btrfs_show_devname(struct seq_file *m, struct dentry *root)
* the end of RCU grace period.
*/
rcu_read_lock();
seq_escape(m, rcu_str_deref(fs_info->fs_devices->latest_dev->name), " \t\n\\");
seq_escape(m, btrfs_dev_name(fs_info->fs_devices->latest_dev), " \t\n\\");
rcu_read_unlock();
return 0;
@ -2694,7 +2385,7 @@ static __cold void btrfs_interface_exit(void)
misc_deregister(&btrfs_misc);
}
static void __init btrfs_print_mod_info(void)
static int __init btrfs_print_mod_info(void)
{
static const char options[] = ""
#ifdef CONFIG_BTRFS_DEBUG
@ -2721,122 +2412,125 @@ static void __init btrfs_print_mod_info(void)
#endif
;
pr_info("Btrfs loaded, crc32c=%s%s\n", crc32c_impl(), options);
return 0;
}
static int __init init_btrfs_fs(void)
static int register_btrfs(void)
{
int err;
return register_filesystem(&btrfs_fs_type);
}
btrfs_props_init();
static void unregister_btrfs(void)
{
unregister_filesystem(&btrfs_fs_type);
}
err = btrfs_init_sysfs();
if (err)
return err;
/* Helper structure for long init/exit functions. */
struct init_sequence {
int (*init_func)(void);
/* Can be NULL if the init_func doesn't need cleanup. */
void (*exit_func)(void);
};
btrfs_init_compress();
static const struct init_sequence mod_init_seq[] = {
{
.init_func = btrfs_props_init,
.exit_func = NULL,
}, {
.init_func = btrfs_init_sysfs,
.exit_func = btrfs_exit_sysfs,
}, {
.init_func = btrfs_init_compress,
.exit_func = btrfs_exit_compress,
}, {
.init_func = btrfs_init_cachep,
.exit_func = btrfs_destroy_cachep,
}, {
.init_func = btrfs_transaction_init,
.exit_func = btrfs_transaction_exit,
}, {
.init_func = btrfs_ctree_init,
.exit_func = btrfs_ctree_exit,
}, {
.init_func = btrfs_free_space_init,
.exit_func = btrfs_free_space_exit,
}, {
.init_func = extent_state_init_cachep,
.exit_func = extent_state_free_cachep,
}, {
.init_func = extent_buffer_init_cachep,
.exit_func = extent_buffer_free_cachep,
}, {
.init_func = btrfs_bioset_init,
.exit_func = btrfs_bioset_exit,
}, {
.init_func = extent_map_init,
.exit_func = extent_map_exit,
}, {
.init_func = ordered_data_init,
.exit_func = ordered_data_exit,
}, {
.init_func = btrfs_delayed_inode_init,
.exit_func = btrfs_delayed_inode_exit,
}, {
.init_func = btrfs_auto_defrag_init,
.exit_func = btrfs_auto_defrag_exit,
}, {
.init_func = btrfs_delayed_ref_init,
.exit_func = btrfs_delayed_ref_exit,
}, {
.init_func = btrfs_prelim_ref_init,
.exit_func = btrfs_prelim_ref_exit,
}, {
.init_func = btrfs_interface_init,
.exit_func = btrfs_interface_exit,
}, {
.init_func = btrfs_print_mod_info,
.exit_func = NULL,
}, {
.init_func = btrfs_run_sanity_tests,
.exit_func = NULL,
}, {
.init_func = register_btrfs,
.exit_func = unregister_btrfs,
}
};
err = btrfs_init_cachep();
if (err)
goto free_compress;
static bool mod_init_result[ARRAY_SIZE(mod_init_seq)];
err = extent_state_init_cachep();
if (err)
goto free_cachep;
static __always_inline void btrfs_exit_btrfs_fs(void)
{
int i;
err = extent_buffer_init_cachep();
if (err)
goto free_extent_cachep;
err = btrfs_bioset_init();
if (err)
goto free_eb_cachep;
err = extent_map_init();
if (err)
goto free_bioset;
err = ordered_data_init();
if (err)
goto free_extent_map;
err = btrfs_delayed_inode_init();
if (err)
goto free_ordered_data;
err = btrfs_auto_defrag_init();
if (err)
goto free_delayed_inode;
err = btrfs_delayed_ref_init();
if (err)
goto free_auto_defrag;
err = btrfs_prelim_ref_init();
if (err)
goto free_delayed_ref;
err = btrfs_interface_init();
if (err)
goto free_prelim_ref;
btrfs_print_mod_info();
err = btrfs_run_sanity_tests();
if (err)
goto unregister_ioctl;
err = register_filesystem(&btrfs_fs_type);
if (err)
goto unregister_ioctl;
return 0;
unregister_ioctl:
btrfs_interface_exit();
free_prelim_ref:
btrfs_prelim_ref_exit();
free_delayed_ref:
btrfs_delayed_ref_exit();
free_auto_defrag:
btrfs_auto_defrag_exit();
free_delayed_inode:
btrfs_delayed_inode_exit();
free_ordered_data:
ordered_data_exit();
free_extent_map:
extent_map_exit();
free_bioset:
btrfs_bioset_exit();
free_eb_cachep:
extent_buffer_free_cachep();
free_extent_cachep:
extent_state_free_cachep();
free_cachep:
btrfs_destroy_cachep();
free_compress:
btrfs_exit_compress();
btrfs_exit_sysfs();
return err;
for (i = ARRAY_SIZE(mod_init_seq) - 1; i >= 0; i--) {
if (!mod_init_result[i])
continue;
if (mod_init_seq[i].exit_func)
mod_init_seq[i].exit_func();
mod_init_result[i] = false;
}
}
static void __exit exit_btrfs_fs(void)
{
btrfs_destroy_cachep();
btrfs_delayed_ref_exit();
btrfs_auto_defrag_exit();
btrfs_delayed_inode_exit();
btrfs_prelim_ref_exit();
ordered_data_exit();
extent_map_exit();
btrfs_bioset_exit();
extent_state_free_cachep();
extent_buffer_free_cachep();
btrfs_interface_exit();
unregister_filesystem(&btrfs_fs_type);
btrfs_exit_sysfs();
btrfs_cleanup_fs_uuids();
btrfs_exit_compress();
btrfs_exit_btrfs_fs();
}
static int __init init_btrfs_fs(void)
{
int ret;
int i;
for (i = 0; i < ARRAY_SIZE(mod_init_seq); i++) {
ASSERT(!mod_init_result[i]);
ret = mod_init_seq[i].init_func();
if (ret < 0) {
btrfs_exit_btrfs_fs();
return ret;
}
mod_init_result[i] = true;
}
return 0;
}
late_initcall(init_btrfs_fs);

29
fs/btrfs/super.h Normal file
View File

@ -0,0 +1,29 @@
/* SPDX-License-Identifier: GPL-2.0 */
#ifndef BTRFS_SUPER_H
#define BTRFS_SUPER_H
int btrfs_parse_options(struct btrfs_fs_info *info, char *options,
unsigned long new_flags);
int btrfs_sync_fs(struct super_block *sb, int wait);
char *btrfs_get_subvol_name_from_objectid(struct btrfs_fs_info *fs_info,
u64 subvol_objectid);
static inline struct btrfs_fs_info *btrfs_sb(struct super_block *sb)
{
return sb->s_fs_info;
}
static inline void btrfs_set_sb_rdonly(struct super_block *sb)
{
sb->s_flags |= SB_RDONLY;
set_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state);
}
static inline void btrfs_clear_sb_rdonly(struct super_block *sb)
{
sb->s_flags &= ~SB_RDONLY;
clear_bit(BTRFS_FS_STATE_RO, &btrfs_sb(sb)->fs_state);
}
#endif

View File

@ -10,7 +10,7 @@
#include <linux/completion.h>
#include <linux/bug.h>
#include <crypto/hash.h>
#include "messages.h"
#include "ctree.h"
#include "discard.h"
#include "disk-io.h"
@ -22,6 +22,8 @@
#include "block-group.h"
#include "qgroup.h"
#include "misc.h"
#include "fs.h"
#include "accessors.h"
/*
* Structure name Path
@ -248,7 +250,7 @@ static ssize_t btrfs_feature_attr_store(struct kobject *kobj,
/*
* We don't want to do full transaction commit from inside sysfs
*/
btrfs_set_pending(fs_info, COMMIT);
set_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags);
wake_up_process(fs_info->transaction_kthread);
return count;
@ -762,7 +764,7 @@ static ssize_t btrfs_chunk_size_store(struct kobject *kobj,
val = min(val, BTRFS_MAX_DATA_CHUNK_SIZE);
/* Limit stripe size to 10% of available space. */
val = min(div_factor(fs_info->fs_devices->total_rw_bytes, 1), val);
val = min(mult_perc(fs_info->fs_devices->total_rw_bytes, 10), val);
/* Must be multiple of 256M. */
val &= ~((u64)SZ_256M - 1);
@ -959,7 +961,7 @@ static ssize_t btrfs_label_store(struct kobject *kobj,
/*
* We don't want to do full transaction commit from inside sysfs
*/
btrfs_set_pending(fs_info, COMMIT);
set_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags);
wake_up_process(fs_info->transaction_kthread);
return len;
@ -1160,16 +1162,16 @@ static ssize_t btrfs_read_policy_show(struct kobject *kobj,
for (i = 0; i < BTRFS_NR_READ_POLICY; i++) {
if (fs_devices->read_policy == i)
ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s[%s]",
ret += sysfs_emit_at(buf, ret, "%s[%s]",
(ret == 0 ? "" : " "),
btrfs_read_policy_name[i]);
else
ret += scnprintf(buf + ret, PAGE_SIZE - ret, "%s%s",
ret += sysfs_emit_at(buf, ret, "%s%s",
(ret == 0 ? "" : " "),
btrfs_read_policy_name[i]);
}
ret += scnprintf(buf + ret, PAGE_SIZE - ret, "\n");
ret += sysfs_emit_at(buf, ret, "\n");
return ret;
}

View File

@ -16,6 +16,7 @@
#include "../disk-io.h"
#include "../qgroup.h"
#include "../block-group.h"
#include "../fs.h"
static struct vfsmount *test_mnt = NULL;
@ -101,7 +102,7 @@ struct btrfs_device *btrfs_alloc_dummy_device(struct btrfs_fs_info *fs_info)
if (!dev)
return ERR_PTR(-ENOMEM);
extent_io_tree_init(NULL, &dev->alloc_state, 0, NULL);
extent_io_tree_init(NULL, &dev->alloc_state, 0);
INIT_LIST_HEAD(&dev->dev_list);
list_add(&dev->dev_list, &fs_info->fs_devices->devices);

View File

@ -8,6 +8,7 @@
#include "../ctree.h"
#include "../extent_io.h"
#include "../disk-io.h"
#include "../accessors.h"
static int test_btrfs_split_item(u32 sectorsize, u32 nodesize)
{

View File

@ -132,7 +132,7 @@ static int test_find_delalloc(u32 sectorsize)
* Passing NULL as we don't have fs_info but tracepoints are not used
* at this point
*/
extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST, NULL);
extent_io_tree_init(NULL, tmp, IO_TREE_SELFTEST);
/*
* First go through and create and mark all of our pages dirty, we pin
@ -489,7 +489,7 @@ static int test_find_first_clear_extent_bit(void)
test_msg("running find_first_clear_extent_bit test");
extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST, NULL);
extent_io_tree_init(NULL, &tree, IO_TREE_SELFTEST);
/* Test correct handling of empty tree */
find_first_clear_extent_bit(&tree, 0, &start, &end, CHUNK_TRIMMED);

View File

@ -10,6 +10,7 @@
#include "../free-space-tree.h"
#include "../transaction.h"
#include "../block-group.h"
#include "../accessors.h"
struct free_space_extent {
u64 start;
@ -470,7 +471,7 @@ static int run_test(test_func_t test_func, int bitmaps, u32 sectorsize,
}
cache->bitmap_low_thresh = 0;
cache->bitmap_high_thresh = (u32)-1;
cache->needs_free_space = 1;
set_bit(BLOCK_GROUP_FLAG_NEEDS_FREE_SPACE, &cache->runtime_flags);
cache->fs_info = root->fs_info;
btrfs_init_dummy_trans(&trans, root->fs_info);

View File

@ -11,6 +11,7 @@
#include "../extent_io.h"
#include "../volumes.h"
#include "../compression.h"
#include "../accessors.h"
static void insert_extent(struct btrfs_root *root, u64 start, u64 len,
u64 ram_bytes, u64 offset, u64 disk_bytenr,
@ -72,8 +73,8 @@ static void insert_inode_item_key(struct btrfs_root *root)
* diagram of how the extents will look though this may not be possible we still
* want to make sure everything acts normally (the last number is not inclusive)
*
* [0 - 5][5 - 6][ 6 - 4096 ][ 4096 - 4100][4100 - 8195][8195 - 12291]
* [hole ][inline][hole but no extent][ hole ][ regular ][regular1 split]
* [0 - 6][ 6 - 4096 ][ 4096 - 4100][4100 - 8195][8195 - 12291]
* [inline][hole but no extent][ hole ][ regular ][regular1 split]
*
* [12291 - 16387][16387 - 24579][24579 - 28675][ 28675 - 32771][32771 - 36867 ]
* [ hole ][regular1 split][ prealloc ][ prealloc1 ][prealloc1 written]
@ -90,19 +91,12 @@ static void setup_file_extents(struct btrfs_root *root, u32 sectorsize)
u64 disk_bytenr = SZ_1M;
u64 offset = 0;
/* First we want a hole */
insert_extent(root, offset, 5, 5, 0, 0, 0, BTRFS_FILE_EXTENT_REG, 0,
slot);
slot++;
offset += 5;
/*
* Now we want an inline extent, I don't think this is possible but hey
* why not? Also keep in mind if we have an inline extent it counts as
* the whole first page. If we were to expand it we would have to cow
* and we wouldn't have an inline extent anymore.
* Tree-checker has strict limits on inline extents that they can only
* exist at file offset 0, thus we can only have one inline file extent
* at most.
*/
insert_extent(root, offset, 1, 1, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0,
insert_extent(root, offset, 6, 6, 0, 0, 0, BTRFS_FILE_EXTENT_INLINE, 0,
slot);
slot++;
offset = sectorsize;
@ -281,37 +275,25 @@ static noinline int test_btrfs_get_extent(u32 sectorsize, u32 nodesize)
test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start != EXTENT_MAP_HOLE) {
test_err("expected a hole, got %llu", em->block_start);
goto out;
}
if (em->start != 0 || em->len != 5) {
test_err(
"unexpected extent wanted start 0 len 5, got start %llu len %llu",
em->start, em->len);
goto out;
}
if (em->flags != 0) {
test_err("unexpected flags set, want 0 have %lu", em->flags);
goto out;
}
offset = em->start + em->len;
free_extent_map(em);
em = btrfs_get_extent(BTRFS_I(inode), NULL, 0, offset, sectorsize);
if (IS_ERR(em)) {
test_err("got an error when we shouldn't have");
goto out;
}
if (em->block_start != EXTENT_MAP_INLINE) {
test_err("expected an inline, got %llu", em->block_start);
goto out;
}
if (em->start != offset || em->len != (sectorsize - 5)) {
/*
* For inline extent, we always round up the em to sectorsize, as
* they are either:
*
* a) a hidden hole
* The range will be zeroed at inline extent read time.
*
* b) a file extent with unaligned bytenr
* Tree checker will reject it.
*/
if (em->start != 0 || em->len != sectorsize) {
test_err(
"unexpected extent wanted start %llu len 1, got start %llu len %llu",
offset, em->start, em->len);
"unexpected extent wanted start 0 len %u, got start %llu len %llu",
sectorsize, em->start, em->len);
goto out;
}
if (em->flags != 0) {

View File

@ -10,6 +10,8 @@
#include "../disk-io.h"
#include "../qgroup.h"
#include "../backref.h"
#include "../fs.h"
#include "../accessors.h"
static int insert_normal_tree_ref(struct btrfs_root *root, u64 bytenr,
u64 num_bytes, u64 parent, u64 root_objectid)
@ -203,6 +205,7 @@ static int remove_extent_ref(struct btrfs_root *root, u64 bytenr,
static int test_no_shared_qgroup(struct btrfs_root *root,
u32 sectorsize, u32 nodesize)
{
struct btrfs_backref_walk_ctx ctx = { 0 };
struct btrfs_trans_handle trans;
struct btrfs_fs_info *fs_info = root->fs_info;
struct ulist *old_roots = NULL;
@ -218,16 +221,22 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
return ret;
}
ctx.bytenr = nodesize;
ctx.trans = &trans;
ctx.fs_info = fs_info;
/*
* Since the test trans doesn't have the complicated delayed refs,
* we can only call btrfs_qgroup_account_extent() directly to test
* quota.
*/
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
ret = btrfs_find_all_roots(&ctx, false);
if (ret) {
test_err("couldn't find old roots: %d", ret);
return ret;
}
old_roots = ctx.roots;
ctx.roots = NULL;
ret = insert_normal_tree_ref(root, nodesize, nodesize, 0,
BTRFS_FS_TREE_OBJECTID);
@ -236,12 +245,14 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
return ret;
}
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
ret = btrfs_find_all_roots(&ctx, false);
if (ret) {
ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
new_roots = ctx.roots;
ctx.roots = NULL;
ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots,
new_roots);
@ -260,11 +271,13 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
return -EINVAL;
}
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
ret = btrfs_find_all_roots(&ctx, false);
if (ret) {
test_err("couldn't find old roots: %d", ret);
return ret;
}
old_roots = ctx.roots;
ctx.roots = NULL;
ret = remove_extent_item(root, nodesize, nodesize);
if (ret) {
@ -272,12 +285,14 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
return -EINVAL;
}
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
ret = btrfs_find_all_roots(&ctx, false);
if (ret) {
ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
new_roots = ctx.roots;
ctx.roots = NULL;
ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots,
new_roots);
@ -302,6 +317,7 @@ static int test_no_shared_qgroup(struct btrfs_root *root,
static int test_multiple_refs(struct btrfs_root *root,
u32 sectorsize, u32 nodesize)
{
struct btrfs_backref_walk_ctx ctx = { 0 };
struct btrfs_trans_handle trans;
struct btrfs_fs_info *fs_info = root->fs_info;
struct ulist *old_roots = NULL;
@ -322,11 +338,17 @@ static int test_multiple_refs(struct btrfs_root *root,
return ret;
}
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
ctx.bytenr = nodesize;
ctx.trans = &trans;
ctx.fs_info = fs_info;
ret = btrfs_find_all_roots(&ctx, false);
if (ret) {
test_err("couldn't find old roots: %d", ret);
return ret;
}
old_roots = ctx.roots;
ctx.roots = NULL;
ret = insert_normal_tree_ref(root, nodesize, nodesize, 0,
BTRFS_FS_TREE_OBJECTID);
@ -335,12 +357,14 @@ static int test_multiple_refs(struct btrfs_root *root,
return ret;
}
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
ret = btrfs_find_all_roots(&ctx, false);
if (ret) {
ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
new_roots = ctx.roots;
ctx.roots = NULL;
ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots,
new_roots);
@ -355,11 +379,13 @@ static int test_multiple_refs(struct btrfs_root *root,
return -EINVAL;
}
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
ret = btrfs_find_all_roots(&ctx, false);
if (ret) {
test_err("couldn't find old roots: %d", ret);
return ret;
}
old_roots = ctx.roots;
ctx.roots = NULL;
ret = add_tree_ref(root, nodesize, nodesize, 0,
BTRFS_FIRST_FREE_OBJECTID);
@ -368,12 +394,14 @@ static int test_multiple_refs(struct btrfs_root *root,
return ret;
}
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
ret = btrfs_find_all_roots(&ctx, false);
if (ret) {
ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
new_roots = ctx.roots;
ctx.roots = NULL;
ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots,
new_roots);
@ -394,11 +422,13 @@ static int test_multiple_refs(struct btrfs_root *root,
return -EINVAL;
}
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &old_roots, false);
ret = btrfs_find_all_roots(&ctx, false);
if (ret) {
test_err("couldn't find old roots: %d", ret);
return ret;
}
old_roots = ctx.roots;
ctx.roots = NULL;
ret = remove_extent_ref(root, nodesize, nodesize, 0,
BTRFS_FIRST_FREE_OBJECTID);
@ -407,12 +437,14 @@ static int test_multiple_refs(struct btrfs_root *root,
return ret;
}
ret = btrfs_find_all_roots(&trans, fs_info, nodesize, 0, &new_roots, false);
ret = btrfs_find_all_roots(&ctx, false);
if (ret) {
ulist_free(old_roots);
test_err("couldn't find old roots: %d", ret);
return ret;
}
new_roots = ctx.roots;
ctx.roots = NULL;
ret = btrfs_qgroup_account_extent(&trans, nodesize, nodesize, old_roots,
new_roots);

View File

@ -6,6 +6,7 @@
#include <linux/fs.h>
#include <linux/slab.h>
#include <linux/sched.h>
#include <linux/sched/mm.h>
#include <linux/writeback.h>
#include <linux/pagemap.h>
#include <linux/blkdev.h>
@ -23,6 +24,18 @@
#include "block-group.h"
#include "space-info.h"
#include "zoned.h"
#include "fs.h"
#include "accessors.h"
#include "extent-tree.h"
#include "root-tree.h"
#include "defrag.h"
#include "dir-item.h"
#include "uuid-tree.h"
#include "ioctl.h"
#include "relocation.h"
#include "scrub.h"
static struct kmem_cache *btrfs_trans_handle_cachep;
#define BTRFS_ROOT_TRANS_TAG 0
@ -365,9 +378,9 @@ loop:
spin_lock_init(&cur_trans->releasing_ebs_lock);
list_add_tail(&cur_trans->list, &fs_info->trans_list);
extent_io_tree_init(fs_info, &cur_trans->dirty_pages,
IO_TREE_TRANS_DIRTY_PAGES, NULL);
IO_TREE_TRANS_DIRTY_PAGES);
extent_io_tree_init(fs_info, &cur_trans->pinned_extents,
IO_TREE_FS_PINNED_EXTENTS, NULL);
IO_TREE_FS_PINNED_EXTENTS);
fs_info->generation++;
cur_trans->transid = fs_info->generation;
fs_info->running_transaction = cur_trans;
@ -936,7 +949,7 @@ static bool should_end_transaction(struct btrfs_trans_handle *trans)
if (btrfs_check_space_for_delayed_refs(fs_info))
return true;
return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 5);
return !!btrfs_block_rsv_check(&fs_info->global_block_rsv, 50);
}
bool btrfs_should_end_transaction(struct btrfs_trans_handle *trans)
@ -1607,10 +1620,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
struct btrfs_root *root = pending->root;
struct btrfs_root *parent_root;
struct btrfs_block_rsv *rsv;
struct inode *parent_inode;
struct inode *parent_inode = pending->dir;
struct btrfs_path *path;
struct btrfs_dir_item *dir_item;
struct dentry *dentry;
struct extent_buffer *tmp;
struct extent_buffer *old;
struct timespec64 cur_time;
@ -1619,6 +1631,8 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
u64 index = 0;
u64 objectid;
u64 root_flags;
unsigned int nofs_flags;
struct fscrypt_name fname;
ASSERT(pending->path);
path = pending->path;
@ -1626,9 +1640,22 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
ASSERT(pending->root_item);
new_root_item = pending->root_item;
/*
* We're inside a transaction and must make sure that any potential
* allocations with GFP_KERNEL in fscrypt won't recurse back to
* filesystem.
*/
nofs_flags = memalloc_nofs_save();
pending->error = fscrypt_setup_filename(parent_inode,
&pending->dentry->d_name, 0,
&fname);
memalloc_nofs_restore(nofs_flags);
if (pending->error)
goto free_pending;
pending->error = btrfs_get_free_objectid(tree_root, &objectid);
if (pending->error)
goto no_free_objectid;
goto free_fname;
/*
* Make qgroup to skip current new snapshot's qgroupid, as it is
@ -1657,8 +1684,6 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
trace_btrfs_space_reservation(fs_info, "transaction",
trans->transid,
trans->bytes_reserved, 1);
dentry = pending->dentry;
parent_inode = pending->dir;
parent_root = BTRFS_I(parent_inode)->root;
ret = record_root_in_trans(trans, parent_root, 0);
if (ret)
@ -1674,8 +1699,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
/* check if there is a file/dir which has the same name. */
dir_item = btrfs_lookup_dir_item(NULL, parent_root, path,
btrfs_ino(BTRFS_I(parent_inode)),
dentry->d_name.name,
dentry->d_name.len, 0);
&fname.disk_name, 0);
if (dir_item != NULL && !IS_ERR(dir_item)) {
pending->error = -EEXIST;
goto dir_item_existed;
@ -1770,7 +1794,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
ret = btrfs_add_root_ref(trans, objectid,
parent_root->root_key.objectid,
btrfs_ino(BTRFS_I(parent_inode)), index,
dentry->d_name.name, dentry->d_name.len);
&fname.disk_name);
if (ret) {
btrfs_abort_transaction(trans, ret);
goto fail;
@ -1802,9 +1826,9 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
if (ret < 0)
goto fail;
ret = btrfs_insert_dir_item(trans, dentry->d_name.name,
dentry->d_name.len, BTRFS_I(parent_inode),
&key, BTRFS_FT_DIR, index);
ret = btrfs_insert_dir_item(trans, &fname.disk_name,
BTRFS_I(parent_inode), &key, BTRFS_FT_DIR,
index);
/* We have check then name at the beginning, so it is impossible. */
BUG_ON(ret == -EEXIST || ret == -EOVERFLOW);
if (ret) {
@ -1813,7 +1837,7 @@ static noinline int create_pending_snapshot(struct btrfs_trans_handle *trans,
}
btrfs_i_size_write(BTRFS_I(parent_inode), parent_inode->i_size +
dentry->d_name.len * 2);
fname.disk_name.len * 2);
parent_inode->i_mtime = current_time(parent_inode);
parent_inode->i_ctime = parent_inode->i_mtime;
ret = btrfs_update_inode_fallback(trans, parent_root, BTRFS_I(parent_inode));
@ -1845,7 +1869,9 @@ dir_item_existed:
trans->bytes_reserved = 0;
clear_skip_qgroup:
btrfs_clear_skip_qgroup(trans);
no_free_objectid:
free_fname:
fscrypt_free_filename(&fname);
free_pending:
kfree(new_root_item);
pending->root_item = NULL;
btrfs_free_path(path);
@ -2101,6 +2127,8 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
ASSERT(refcount_read(&trans->use_count) == 1);
btrfs_trans_state_lockdep_acquire(fs_info, BTRFS_LOCKDEP_TRANS_COMMIT_START);
clear_bit(BTRFS_FS_NEED_TRANS_COMMIT, &fs_info->flags);
/* Stop the commit early if ->aborted is set */
if (TRANS_ABORTED(cur_trans)) {
ret = cur_trans->aborted;
@ -2354,12 +2382,6 @@ int btrfs_commit_transaction(struct btrfs_trans_handle *trans)
if (ret)
goto unlock_reloc;
/*
* Since the transaction is done, we can apply the pending changes
* before the next transaction.
*/
btrfs_apply_pending_changes(fs_info);
/* commit_fs_roots gets rid of all the tree log roots, it is now
* safe to free the root of tree log roots
*/
@ -2582,21 +2604,17 @@ int btrfs_clean_one_deleted_snapshot(struct btrfs_fs_info *fs_info)
return (ret < 0) ? 0 : 1;
}
void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info)
int __init btrfs_transaction_init(void)
{
unsigned long prev;
unsigned long bit;
prev = xchg(&fs_info->pending_changes, 0);
if (!prev)
return;
bit = 1 << BTRFS_PENDING_COMMIT;
if (prev & bit)
btrfs_debug(fs_info, "pending commit done");
prev &= ~bit;
if (prev)
btrfs_warn(fs_info,
"unknown pending changes left 0x%lx, ignoring", prev);
btrfs_trans_handle_cachep = kmem_cache_create("btrfs_trans_handle",
sizeof(struct btrfs_trans_handle), 0,
SLAB_TEMPORARY | SLAB_MEM_SPREAD, NULL);
if (!btrfs_trans_handle_cachep)
return -ENOMEM;
return 0;
}
void __cold btrfs_transaction_exit(void)
{
kmem_cache_destroy(btrfs_trans_handle_cachep);
}

View File

@ -10,6 +10,7 @@
#include "btrfs_inode.h"
#include "delayed-ref.h"
#include "ctree.h"
#include "misc.h"
enum btrfs_trans_state {
TRANS_STATE_RUNNING,
@ -98,14 +99,15 @@ struct btrfs_transaction {
struct list_head releasing_ebs;
};
#define __TRANS_FREEZABLE (1U << 0)
#define __TRANS_START (1U << 9)
#define __TRANS_ATTACH (1U << 10)
#define __TRANS_JOIN (1U << 11)
#define __TRANS_JOIN_NOLOCK (1U << 12)
#define __TRANS_DUMMY (1U << 13)
#define __TRANS_JOIN_NOSTART (1U << 14)
enum {
ENUM_BIT(__TRANS_FREEZABLE),
ENUM_BIT(__TRANS_START),
ENUM_BIT(__TRANS_ATTACH),
ENUM_BIT(__TRANS_JOIN),
ENUM_BIT(__TRANS_JOIN_NOLOCK),
ENUM_BIT(__TRANS_DUMMY),
ENUM_BIT(__TRANS_JOIN_NOSTART),
};
#define TRANS_START (__TRANS_START | __TRANS_FREEZABLE)
#define TRANS_ATTACH (__TRANS_ATTACH)
@ -231,9 +233,11 @@ int btrfs_wait_tree_log_extents(struct btrfs_root *root, int mark);
int btrfs_transaction_blocked(struct btrfs_fs_info *info);
int btrfs_transaction_in_commit(struct btrfs_fs_info *info);
void btrfs_put_transaction(struct btrfs_transaction *transaction);
void btrfs_apply_pending_changes(struct btrfs_fs_info *fs_info);
void btrfs_add_dropped_root(struct btrfs_trans_handle *trans,
struct btrfs_root *root);
void btrfs_trans_release_chunk_metadata(struct btrfs_trans_handle *trans);
int __init btrfs_transaction_init(void);
void __cold btrfs_transaction_exit(void);
#endif

View File

@ -18,6 +18,7 @@
#include <linux/types.h>
#include <linux/stddef.h>
#include <linux/error-injection.h>
#include "messages.h"
#include "ctree.h"
#include "tree-checker.h"
#include "disk-io.h"
@ -25,6 +26,9 @@
#include "volumes.h"
#include "misc.h"
#include "btrfs_inode.h"
#include "fs.h"
#include "accessors.h"
#include "file-item.h"
/*
* Error message should follow the following format:
@ -528,7 +532,7 @@ static int check_dir_item(struct extent_buffer *leaf,
}
/* dir type check */
dir_type = btrfs_dir_type(leaf, di);
dir_type = btrfs_dir_ftype(leaf, di);
if (unlikely(dir_type >= BTRFS_FT_MAX)) {
dir_item_err(leaf, slot,
"invalid dir item type, have %u expect [0, %u)",
@ -1780,10 +1784,10 @@ static int check_leaf(struct extent_buffer *leaf, bool check_item_data)
/* Also check if the item pointer overlaps with btrfs item. */
if (unlikely(btrfs_item_ptr_offset(leaf, slot) <
btrfs_item_nr_offset(slot) + sizeof(struct btrfs_item))) {
btrfs_item_nr_offset(leaf, slot) + sizeof(struct btrfs_item))) {
generic_err(leaf, slot,
"slot overlaps with its data, item end %lu data start %lu",
btrfs_item_nr_offset(slot) +
btrfs_item_nr_offset(leaf, slot) +
sizeof(struct btrfs_item),
btrfs_item_ptr_offset(leaf, slot));
return -EUCLEAN;

View File

@ -6,8 +6,39 @@
#ifndef BTRFS_TREE_CHECKER_H
#define BTRFS_TREE_CHECKER_H
#include "ctree.h"
#include "extent_io.h"
#include <uapi/linux/btrfs_tree.h>
struct extent_buffer;
struct btrfs_chunk;
/* All the extra info needed to verify the parentness of a tree block. */
struct btrfs_tree_parent_check {
/*
* The owner check against the tree block.
*
* Can be 0 to skip the owner check.
*/
u64 owner_root;
/*
* Expected transid, can be 0 to skip the check, but such skip
* should only be utlized for backref walk related code.
*/
u64 transid;
/*
* The expected first key.
*
* This check can be skipped if @has_first_key is false, such skip
* can happen for case where we don't have the parent node key,
* e.g. reading the tree root, doing backref walk.
*/
struct btrfs_key first_key;
bool has_first_key;
/* The expected level. Should always be set. */
u8 level;
};
/*
* Comprehensive leaf checker.

Some files were not shown because too many files have changed in this diff Show More