diff --git a/fs/btrfs/block-rsv.c b/fs/btrfs/block-rsv.c index d07bd41a7c1e..e46dc3688983 100644 --- a/fs/btrfs/block-rsv.c +++ b/fs/btrfs/block-rsv.c @@ -6,6 +6,98 @@ #include "space-info.h" #include "transaction.h" +/* + * HOW DO BLOCK RESERVES WORK + * + * Think of block_rsv's as buckets for logically grouped metadata + * reservations. Each block_rsv has a ->size and a ->reserved. ->size is + * how large we want our block rsv to be, ->reserved is how much space is + * currently reserved for this block reserve. + * + * ->failfast exists for the truncate case, and is described below. + * + * NORMAL OPERATION + * + * -> Reserve + * Entrance: btrfs_block_rsv_add, btrfs_block_rsv_refill + * + * We call into btrfs_reserve_metadata_bytes() with our bytes, which is + * accounted for in space_info->bytes_may_use, and then add the bytes to + * ->reserved, and ->size in the case of btrfs_block_rsv_add. + * + * ->size is an over-estimation of how much we may use for a particular + * operation. + * + * -> Use + * Entrance: btrfs_use_block_rsv + * + * When we do a btrfs_alloc_tree_block() we call into btrfs_use_block_rsv() + * to determine the appropriate block_rsv to use, and then verify that + * ->reserved has enough space for our tree block allocation. Once + * successful we subtract fs_info->nodesize from ->reserved. + * + * -> Finish + * Entrance: btrfs_block_rsv_release + * + * We are finished with our operation, subtract our individual reservation + * from ->size, and then subtract ->size from ->reserved and free up the + * excess if there is any. + * + * There is some logic here to refill the delayed refs rsv or the global rsv + * as needed, otherwise the excess is subtracted from + * space_info->bytes_may_use. 
+ * + * TYPES OF BLOCK RESERVES + * + * BLOCK_RSV_TRANS, BLOCK_RSV_DELOPS, BLOCK_RSV_CHUNK + * These behave normally, as described above, just within the confines of the + * lifetime of their particular operation (transaction for the whole trans + * handle lifetime, for example). + * + * BLOCK_RSV_GLOBAL + * It is impossible to properly account for all the space that may be required + * to make our extent tree updates. This block reserve acts as an overflow + * buffer in case our delayed refs reserve does not reserve enough space to + * update the extent tree. + * + * We can steal from this in some cases as well, notably on evict() or + * truncate() in order to help users recover from ENOSPC conditions. + * + * BLOCK_RSV_DELALLOC + * The individual item sizes are determined by the per-inode size + * calculations, which are described with the delalloc code. This is pretty + * straightforward; it's just that the calculation of ->size encodes a lot of + * different items, and thus it gets used when updating inodes, inserting file + * extents, and inserting checksums. + * + * BLOCK_RSV_DELREFS + * We keep a running tally of how many delayed refs we have on the system. + * We assume each one of these delayed refs is going to use a full + * reservation. We use the transaction items and pre-reserve space for every + * operation, and use this reservation to refill any gap between ->size and + * ->reserved that may exist. + * + * From there it's straightforward: removing a delayed ref means we remove its + * count from ->size and free up reservations as necessary. Since this is + * the most dynamic block reserve in the system, we will try to refill this + * block reserve first with any excess returned by any other block reserve. + * + * BLOCK_RSV_EMPTY + * This is the fallback block reserve to make us try to reserve space if we + * don't have a specific bucket for this allocation. 
It is mostly used for + * updating the device tree and such; since that is a separate pool, we're + * content to just reserve space from the space_info on demand. + * + * BLOCK_RSV_TEMP + * This is used by things like truncate and iput. We will temporarily + * allocate a block reserve, set it to some size, and then truncate bytes + * until we have no space left. With ->failfast set we'll simply return + * ENOSPC from btrfs_use_block_rsv() to signal that we need to unwind and try + * to make a new reservation. This is because these operations are + * unbounded, so we want to do as much work as we can, and then back off and + * re-reserve. + */ + static u64 block_rsv_release_bytes(struct btrfs_fs_info *fs_info, struct btrfs_block_rsv *block_rsv, struct btrfs_block_rsv *dest, u64 num_bytes,