2020-11-10 19:26:07 +08:00
|
|
|
/* SPDX-License-Identifier: GPL-2.0 */
|
|
|
|
|
|
|
|
#ifndef BTRFS_ZONED_H
|
|
|
|
#define BTRFS_ZONED_H
|
|
|
|
|
|
|
|
#include <linux/types.h>
|
2020-11-10 19:26:08 +08:00
|
|
|
#include <linux/blkdev.h>
|
2022-10-19 22:50:49 +08:00
|
|
|
#include "messages.h"
|
btrfs: implement log-structured superblock for ZONED mode
Superblock (and its copies) is the only data structure in btrfs which
has a fixed location on a device. Since we cannot overwrite in a
sequential write required zone, we cannot place superblock in the zone.
One easy solution is limiting superblock and copies to be placed only in
conventional zones. However, this method has two downsides: one is
reduced number of superblock copies. The location of the second copy of
superblock is 256GB, which is in a sequential write required zone on
typical devices in the market today. So, the number of superblock and
copies is limited to be two. Second downside is that we cannot support
devices which have no conventional zones at all.
To solve these two problems, we employ superblock log writing. It uses
two adjacent zones as a circular buffer to write updated superblocks.
Once the first zone is filled up, start writing into the second one.
Then, when both zones are filled up and before starting to write to the
first zone again, it resets the first zone.
We can determine the position of the latest superblock by reading write
pointer information from a device. One corner case is when both zones
are full. For this situation, we read out the last superblock of each
zone, and compare them to determine which zone is older.
The following zones are reserved as the circular buffer on ZONED btrfs.
- The primary superblock: zones 0 and 1
- The first copy: zones 16 and 17
- The second copy: zones 1024 or zone at 256GB which is minimum, and
next to it
If these reserved zones are conventional, superblock is written fixed at
the start of the zone without logging.
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-10 19:26:14 +08:00
|
|
|
#include "volumes.h"
|
|
|
|
#include "disk-io.h"
|
2021-02-04 18:22:18 +08:00
|
|
|
#include "block-group.h"
|
2021-12-07 22:28:34 +08:00
|
|
|
#include "btrfs_inode.h"
|
2020-11-10 19:26:07 +08:00
|
|
|
|
2022-03-29 16:56:06 +08:00
|
|
|
#define BTRFS_DEFAULT_RECLAIM_THRESH (75)
|
2021-04-19 15:41:02 +08:00
|
|
|
|
2020-11-10 19:26:07 +08:00
|
|
|
struct btrfs_zoned_device_info {
|
|
|
|
/*
|
|
|
|
* Number of zones, zone size and types of zones if bdev is a
|
|
|
|
* zoned block device.
|
|
|
|
*/
|
|
|
|
u64 zone_size;
|
|
|
|
u8 zone_size_shift;
|
|
|
|
u32 nr_zones;
|
2021-08-19 20:19:15 +08:00
|
|
|
unsigned int max_active_zones;
|
|
|
|
atomic_t active_zones_left;
|
2020-11-10 19:26:07 +08:00
|
|
|
unsigned long *seq_zones;
|
|
|
|
unsigned long *empty_zones;
|
2021-08-19 20:19:15 +08:00
|
|
|
unsigned long *active_zones;
|
2021-11-11 13:14:38 +08:00
|
|
|
struct blk_zone *zone_cache;
|
btrfs: implement log-structured superblock for ZONED mode
Superblock (and its copies) is the only data structure in btrfs which
has a fixed location on a device. Since we cannot overwrite in a
sequential write required zone, we cannot place superblock in the zone.
One easy solution is limiting superblock and copies to be placed only in
conventional zones. However, this method has two downsides: one is
reduced number of superblock copies. The location of the second copy of
superblock is 256GB, which is in a sequential write required zone on
typical devices in the market today. So, the number of superblock and
copies is limited to be two. Second downside is that we cannot support
devices which have no conventional zones at all.
To solve these two problems, we employ superblock log writing. It uses
two adjacent zones as a circular buffer to write updated superblocks.
Once the first zone is filled up, start writing into the second one.
Then, when both zones are filled up and before starting to write to the
first zone again, it reset the first zone.
We can determine the position of the latest superblock by reading write
pointer information from a device. One corner case is when both zones
are full. For this situation, we read out the last superblock of each
zone, and compare them to determine which zone is older.
The following zones are reserved as the circular buffer on ZONED btrfs.
- The primary superblock: zones 0 and 1
- The first copy: zones 16 and 17
- The second copy: zones 1024 or zone at 256GB which is minimum, and
next to it
If these reserved zones are conventional, superblock is written fixed at
the start of the zone without logging.
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-10 19:26:14 +08:00
|
|
|
struct blk_zone sb_zones[2 * BTRFS_SUPER_MIRROR_MAX];
|
2020-11-10 19:26:07 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
#ifdef CONFIG_BLK_DEV_ZONED
|
|
|
|
int btrfs_get_dev_zone(struct btrfs_device *device, u64 pos,
|
|
|
|
struct blk_zone *zone);
|
2021-02-04 18:21:42 +08:00
|
|
|
int btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info);
|
2021-11-11 13:14:38 +08:00
|
|
|
int btrfs_get_dev_zone_info(struct btrfs_device *device, bool populate_cache);
|
2020-11-10 19:26:07 +08:00
|
|
|
void btrfs_destroy_dev_zone_info(struct btrfs_device *device);
|
2022-11-04 22:12:33 +08:00
|
|
|
struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(struct btrfs_device *orig_dev);
|
2020-11-10 19:26:08 +08:00
|
|
|
int btrfs_check_zoned_mode(struct btrfs_fs_info *fs_info);
|
2020-11-10 19:26:10 +08:00
|
|
|
int btrfs_check_mountopts_zoned(struct btrfs_fs_info *info);
|
btrfs: implement log-structured superblock for ZONED mode
Superblock (and its copies) is the only data structure in btrfs which
has a fixed location on a device. Since we cannot overwrite in a
sequential write required zone, we cannot place superblock in the zone.
One easy solution is limiting superblock and copies to be placed only in
conventional zones. However, this method has two downsides: one is
reduced number of superblock copies. The location of the second copy of
superblock is 256GB, which is in a sequential write required zone on
typical devices in the market today. So, the number of superblock and
copies is limited to be two. Second downside is that we cannot support
devices which have no conventional zones at all.
To solve these two problems, we employ superblock log writing. It uses
two adjacent zones as a circular buffer to write updated superblocks.
Once the first zone is filled up, start writing into the second one.
Then, when both zones are filled up and before starting to write to the
first zone again, it resets the first zone.
We can determine the position of the latest superblock by reading write
pointer information from a device. One corner case is when both zones
are full. For this situation, we read out the last superblock of each
zone, and compare them to determine which zone is older.
The following zones are reserved as the circular buffer on ZONED btrfs.
- The primary superblock: zones 0 and 1
- The first copy: zones 16 and 17
- The second copy: zones 1024 or zone at 256GB which is minimum, and
next to it
If these reserved zones are conventional, superblock is written fixed at
the start of the zone without logging.
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-10 19:26:14 +08:00
|
|
|
int btrfs_sb_log_location_bdev(struct block_device *bdev, int mirror, int rw,
|
|
|
|
u64 *bytenr_ret);
|
|
|
|
int btrfs_sb_log_location(struct btrfs_device *device, int mirror, int rw,
|
|
|
|
u64 *bytenr_ret);
|
2021-08-19 20:19:14 +08:00
|
|
|
int btrfs_advance_sb_log(struct btrfs_device *device, int mirror);
|
btrfs: implement log-structured superblock for ZONED mode
Superblock (and its copies) is the only data structure in btrfs which
has a fixed location on a device. Since we cannot overwrite in a
sequential write required zone, we cannot place superblock in the zone.
One easy solution is limiting superblock and copies to be placed only in
conventional zones. However, this method has two downsides: one is
reduced number of superblock copies. The location of the second copy of
superblock is 256GB, which is in a sequential write required zone on
typical devices in the market today. So, the number of superblock and
copies is limited to be two. Second downside is that we cannot support
devices which have no conventional zones at all.
To solve these two problems, we employ superblock log writing. It uses
two adjacent zones as a circular buffer to write updated superblocks.
Once the first zone is filled up, start writing into the second one.
Then, when both zones are filled up and before starting to write to the
first zone again, it resets the first zone.
We can determine the position of the latest superblock by reading write
pointer information from a device. One corner case is when both zones
are full. For this situation, we read out the last superblock of each
zone, and compare them to determine which zone is older.
The following zones are reserved as the circular buffer on ZONED btrfs.
- The primary superblock: zones 0 and 1
- The first copy: zones 16 and 17
- The second copy: zones 1024 or zone at 256GB which is minimum, and
next to it
If these reserved zones are conventional, superblock is written fixed at
the start of the zone without logging.
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-10 19:26:14 +08:00
|
|
|
int btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror);
|
2021-02-04 18:21:48 +08:00
|
|
|
u64 btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
|
|
|
|
u64 hole_end, u64 num_bytes);
|
|
|
|
int btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
|
|
|
|
u64 length, u64 *bytes);
|
|
|
|
int btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size);
|
2021-02-04 18:21:51 +08:00
|
|
|
int btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new);
|
2021-02-04 18:21:52 +08:00
|
|
|
void btrfs_calc_zone_unusable(struct btrfs_block_group *cache);
|
2021-02-04 18:21:54 +08:00
|
|
|
void btrfs_redirty_list_add(struct btrfs_transaction *trans,
|
|
|
|
struct extent_buffer *eb);
|
|
|
|
void btrfs_free_redirty_list(struct btrfs_transaction *trans);
|
2022-12-12 15:37:21 +08:00
|
|
|
bool btrfs_use_zone_append(struct btrfs_bio *bbio);
|
2023-01-21 14:50:18 +08:00
|
|
|
void btrfs_record_physical_zoned(struct btrfs_bio *bbio);
|
2021-02-04 18:22:05 +08:00
|
|
|
void btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered);
|
2021-02-04 18:22:08 +08:00
|
|
|
bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
|
|
|
|
struct extent_buffer *eb,
|
|
|
|
struct btrfs_block_group **cache_ret);
|
|
|
|
void btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
|
|
|
|
struct extent_buffer *eb);
|
2021-02-04 18:22:13 +08:00
|
|
|
int btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical, u64 length);
|
2021-02-04 18:22:14 +08:00
|
|
|
int btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
|
|
|
|
u64 physical_start, u64 physical_pos);
|
2021-08-19 20:19:17 +08:00
|
|
|
bool btrfs_zone_activate(struct btrfs_block_group *block_group);
|
|
|
|
int btrfs_zone_finish(struct btrfs_block_group *block_group);
|
btrfs: zoned: fix chunk allocation condition for zoned allocator
The ZNS specification defines a limit on the number of "active"
zones. That limit forces us to limit the number of block groups which
can be used for an allocation at the same time. Not to exceed the
limit, we reuse the existing active block groups as much as possible
when we can't activate any other zones without sacrificing an already
activated block group in commit a85f05e59bc1 ("btrfs: zoned: avoid
chunk allocation if active block group has enough space").
However, the check is wrong in two ways. First, it checks the
condition for every raid index (ffe_ctl->index). Even if it reaches
the condition and "ffe_ctl->max_extent_size >=
ffe_ctl->min_alloc_size" is met, there can be other block groups
having enough space to hold ffe_ctl->num_bytes. (Actually, this won't
happen in the current zoned code as it only supports SINGLE
profile. But, it can happen once it enables other RAID types.)
Second, it checks the active zone availability depending on the
raid index. The raid index is just an index for
space_info->block_groups, so it has nothing to do with chunk allocation.
These mistakes are causing a faulty allocation in a certain
situation. Consider we are running zoned btrfs on a device whose
max_active_zone == 0 (no limit). And, suppose no block group has
room to fit ffe_ctl->num_bytes but some have room to meet
ffe_ctl->min_alloc_size (i.e. max_extent_size > num_bytes >=
min_alloc_size).
In this situation, the following occur:
- With SINGLE raid_index, it reaches the chunk allocation checking
code
- The check returns true because we can activate a new zone (no limit)
- But, before allocating the chunk, it iterates to the next raid index
(RAID5)
- Since there are no RAID5 block groups on zoned mode, it again
reaches the check code
- The check returns false because of btrfs_can_activate_zone()'s "if
(raid_index != BTRFS_RAID_SINGLE)" part
- That results in returning -ENOSPC without allocating a new chunk
As a result, we end up hitting -ENOSPC too early.
Move the check to the right place in the can_allocate_chunk() hook,
and do the active zone check depending on the allocation flag, not on
the raid index.
CC: stable@vger.kernel.org # 5.16
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-12-07 23:35:49 +08:00
|
|
|
bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
			     u64 flags);
void btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info,
			     u64 logical, u64 length);
void btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
				   struct extent_buffer *eb);
void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg);
void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info);
bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info);
|
btrfs: zoned: prevent allocation from previous data relocation BG
After commit 5f0addf7b890 ("btrfs: zoned: use dedicated lock for data
relocation"), we observe IO errors on e.g, btrfs/232 like below.
[09.0][T4038707] WARNING: CPU: 3 PID: 4038707 at fs/btrfs/extent-tree.c:2381 btrfs_cross_ref_exist+0xfc/0x120 [btrfs]
<snip>
[09.9][T4038707] Call Trace:
[09.5][T4038707] <TASK>
[09.3][T4038707] run_delalloc_nocow+0x7f1/0x11a0 [btrfs]
[09.6][T4038707] ? test_range_bit+0x174/0x320 [btrfs]
[09.2][T4038707] ? fallback_to_cow+0x980/0x980 [btrfs]
[09.3][T4038707] ? find_lock_delalloc_range+0x33e/0x3e0 [btrfs]
[09.5][T4038707] btrfs_run_delalloc_range+0x445/0x1320 [btrfs]
[09.2][T4038707] ? test_range_bit+0x320/0x320 [btrfs]
[09.4][T4038707] ? lock_downgrade+0x6a0/0x6a0
[09.2][T4038707] ? orc_find.part.0+0x1ed/0x300
[09.5][T4038707] ? __module_address.part.0+0x25/0x300
[09.0][T4038707] writepage_delalloc+0x159/0x310 [btrfs]
<snip>
[09.4][ C3] sd 10:0:1:0: [sde] tag#2620 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
[09.5][ C3] sd 10:0:1:0: [sde] tag#2620 Sense Key : Illegal Request [current]
[09.9][ C3] sd 10:0:1:0: [sde] tag#2620 Add. Sense: Unaligned write command
[09.5][ C3] sd 10:0:1:0: [sde] tag#2620 CDB: Write(16) 8a 00 00 00 00 00 02 f3 63 87 00 00 00 2c 00 00
[09.4][ C3] critical target error, dev sde, sector 396041272 op 0x1:(WRITE) flags 0x800 phys_seg 3 prio class 0
[09.9][ C3] BTRFS error (device dm-1): bdev /dev/mapper/dml_102_2 errs: wr 1, rd 0, flush 0, corrupt 0, gen 0
The IO errors occur when we allocate a regular extent in previous data
relocation block group.
On zoned btrfs, we use a dedicated block group to relocate a data
extent. Thus, we allocate relocating data extents (pre-alloc) only from
the dedicated block group and vice versa. Once the free space in the
dedicated block group gets tight, a relocating extent may not fit into
the block group. In that case, we need to switch the dedicated block
group to the next one. Then, the previous one is now freed up for
allocating a regular extent. The BG is already not enough to allocate
the relocating extent, but there is still room to allocate a smaller
extent. Now the problem happens. By allocating a regular extent while
nocow IOs for the relocation is still on-going, we will issue WRITE IOs
(for relocation) and ZONE APPEND IOs (for the regular writes) at the
same time. Those mixed IOs confuse the write pointer and give rise to
unaligned write errors.
This commit introduces a new bit 'zoned_data_reloc_ongoing' to the
btrfs_block_group. We set this bit before releasing the dedicated block
group, and no extents are allocated from a block group having this bit
set. This bit is similar to setting block_group->ro, but is different from
it by allowing nocow writes to start.
Once all the nocow IO for relocation is done (hooked from
btrfs_finish_ordered_io), we reset the bit to release the block group for
further allocation.
Fixes: c2707a255623 ("btrfs: zoned: add a dedicated data relocation block group")
CC: stable@vger.kernel.org # 5.16+
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-06-07 15:08:29 +08:00
|
|
|
void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info, u64 logical,
|
|
|
|
u64 length);
|
2022-07-09 07:18:44 +08:00
|
|
|
int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info);
|
2022-07-09 07:18:47 +08:00
|
|
|
int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
|
|
|
|
struct btrfs_space_info *space_info, bool do_finish);
|
2020-11-10 19:26:07 +08:00
|
|
|
#else /* CONFIG_BLK_DEV_ZONED */
|
|
|
|
/* !CONFIG_BLK_DEV_ZONED stub: leaves @zone untouched and reports success. */
static inline int btrfs_get_dev_zone(struct btrfs_device *device,
				     u64 pos, struct blk_zone *zone)
{
	return 0;
}
|
|
|
|
|
2021-02-04 18:21:42 +08:00
|
|
|
/* !CONFIG_BLK_DEV_ZONED stub: nothing to collect, always returns 0. */
static inline int
btrfs_get_dev_zone_info_all_devices(struct btrfs_fs_info *fs_info)
{
	return 0;
}
|
|
|
|
|
2021-11-11 13:14:38 +08:00
|
|
|
static inline int btrfs_get_dev_zone_info(struct btrfs_device *device,
|
|
|
|
bool populate_cache)
|
2020-11-10 19:26:07 +08:00
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* !CONFIG_BLK_DEV_ZONED stub: no-op. */
static inline void btrfs_destroy_dev_zone_info(struct btrfs_device *device)
{
}
|
|
|
|
|
2022-11-04 22:12:33 +08:00
|
|
|
/*
|
|
|
|
* In case the kernel is compiled without CONFIG_BLK_DEV_ZONED we'll never call
|
|
|
|
* into btrfs_clone_dev_zone_info() so it's safe to return NULL here.
|
|
|
|
*/
|
|
|
|
static inline struct btrfs_zoned_device_info *btrfs_clone_dev_zone_info(
|
|
|
|
struct btrfs_device *orig_dev)
|
|
|
|
{
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
|
2020-11-10 19:26:08 +08:00
|
|
|
static inline int btrfs_check_zoned_mode(const struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
if (!btrfs_is_zoned(fs_info))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
btrfs_err(fs_info, "zoned block devices support is not enabled");
|
|
|
|
return -EOPNOTSUPP;
|
|
|
|
}
|
|
|
|
|
2020-11-10 19:26:10 +08:00
|
|
|
/* !CONFIG_BLK_DEV_ZONED stub: no mount options to reject, returns 0. */
static inline int
btrfs_check_mountopts_zoned(struct btrfs_fs_info *info)
{
	return 0;
}
|
|
|
|
|
btrfs: implement log-structured superblock for ZONED mode
Superblock (and its copies) is the only data structure in btrfs which
has a fixed location on a device. Since we cannot overwrite in a
sequential write required zone, we cannot place superblock in the zone.
One easy solution is limiting superblock and copies to be placed only in
conventional zones. However, this method has two downsides: one is
reduced number of superblock copies. The location of the second copy of
superblock is 256GB, which is in a sequential write required zone on
typical devices in the market today. So, the number of superblock and
copies is limited to be two. Second downside is that we cannot support
devices which have no conventional zones at all.
To solve these two problems, we employ superblock log writing. It uses
two adjacent zones as a circular buffer to write updated superblocks.
Once the first zone is filled up, start writing into the second one.
Then, when both zones are filled up and before starting to write to the
first zone again, it resets the first zone.
We can determine the position of the latest superblock by reading write
pointer information from a device. One corner case is when both zones
are full. For this situation, we read out the last superblock of each
zone, and compare them to determine which zone is older.
The following zones are reserved as the circular buffer on ZONED btrfs.
- The primary superblock: zones 0 and 1
- The first copy: zones 16 and 17
- The second copy: zones 1024 or zone at 256GB which is minimum, and
next to it
If these reserved zones are conventional, superblock is written fixed at
the start of the zone without logging.
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-10 19:26:14 +08:00
|
|
|
/*
 * !CONFIG_BLK_DEV_ZONED variant: no superblock logging, so the location of
 * @mirror is simply btrfs_sb_offset(mirror). @bdev and @rw are unused.
 * Stores the offset in *@bytenr_ret and returns 0.
 */
static inline int btrfs_sb_log_location_bdev(struct block_device *bdev,
					     int mirror, int rw,
					     u64 *bytenr_ret)
{
	u64 fixed_offset = btrfs_sb_offset(mirror);

	*bytenr_ret = fixed_offset;
	return 0;
}
|
|
|
|
|
|
|
|
/*
 * !CONFIG_BLK_DEV_ZONED variant: stores btrfs_sb_offset(mirror) in
 * *@bytenr_ret and returns 0. @device and @rw are unused.
 */
static inline int btrfs_sb_log_location(struct btrfs_device *device,
					int mirror, int rw,
					u64 *bytenr_ret)
{
	u64 fixed_offset = btrfs_sb_offset(mirror);

	*bytenr_ret = fixed_offset;
	return 0;
}
|
|
|
|
|
2021-08-19 20:19:14 +08:00
|
|
|
/* !CONFIG_BLK_DEV_ZONED stub: no log to advance, returns 0. */
static inline int
btrfs_advance_sb_log(struct btrfs_device *device, int mirror)
{
	return 0;
}
|
btrfs: implement log-structured superblock for ZONED mode
Superblock (and its copies) is the only data structure in btrfs which
has a fixed location on a device. Since we cannot overwrite in a
sequential write required zone, we cannot place superblock in the zone.
One easy solution is limiting superblock and copies to be placed only in
conventional zones. However, this method has two downsides: one is
reduced number of superblock copies. The location of the second copy of
superblock is 256GB, which is in a sequential write required zone on
typical devices in the market today. So, the number of superblock and
copies is limited to be two. Second downside is that we cannot support
devices which have no conventional zones at all.
To solve these two problems, we employ superblock log writing. It uses
two adjacent zones as a circular buffer to write updated superblocks.
Once the first zone is filled up, start writing into the second one.
Then, when both zones are filled up and before starting to write to the
first zone again, it resets the first zone.
We can determine the position of the latest superblock by reading write
pointer information from a device. One corner case is when both zones
are full. For this situation, we read out the last superblock of each
zone, and compare them to determine which zone is older.
The following zones are reserved as the circular buffer on ZONED btrfs.
- The primary superblock: zones 0 and 1
- The first copy: zones 16 and 17
- The second copy: zones 1024 or zone at 256GB which is minimum, and
next to it
If these reserved zones are conventional, superblock is written fixed at
the start of the zone without logging.
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-10 19:26:14 +08:00
|
|
|
|
|
|
|
/* !CONFIG_BLK_DEV_ZONED stub: nothing to reset, returns 0. */
static inline int
btrfs_reset_sb_log_zones(struct block_device *bdev, int mirror)
{
	return 0;
}
|
|
|
|
|
2021-02-04 18:21:48 +08:00
|
|
|
/*
 * !CONFIG_BLK_DEV_ZONED stub: returns @hole_start unchanged; the other
 * arguments are ignored.
 */
static inline u64
btrfs_find_allocatable_zones(struct btrfs_device *device, u64 hole_start,
			     u64 hole_end, u64 num_bytes)
{
	return hole_start;
}
|
|
|
|
|
|
|
|
/*
 * !CONFIG_BLK_DEV_ZONED stub: resets nothing. Sets *@bytes to 0 and
 * returns 0.
 */
static inline int
btrfs_reset_device_zone(struct btrfs_device *device, u64 physical,
			u64 length, u64 *bytes)
{
	*bytes = 0;

	return 0;
}
|
|
|
|
|
|
|
|
/* !CONFIG_BLK_DEV_ZONED stub: always returns 0. */
static inline int
btrfs_ensure_empty_zones(struct btrfs_device *device, u64 start, u64 size)
{
	return 0;
}
|
|
|
|
|
2021-02-04 18:21:50 +08:00
|
|
|
/* !CONFIG_BLK_DEV_ZONED stub: no zone info to load, returns 0. */
static inline int
btrfs_load_block_group_zone_info(struct btrfs_block_group *cache, bool new)
{
	return 0;
}
|
|
|
|
|
2021-02-04 18:21:52 +08:00
|
|
|
/* !CONFIG_BLK_DEV_ZONED stub: no-op. */
static inline void btrfs_calc_zone_unusable(struct btrfs_block_group *cache)
{
}
|
|
|
|
|
2021-02-04 18:21:54 +08:00
|
|
|
/* !CONFIG_BLK_DEV_ZONED stub: no-op. */
static inline void
btrfs_redirty_list_add(struct btrfs_transaction *trans,
		       struct extent_buffer *eb)
{
}
|
|
|
|
/* !CONFIG_BLK_DEV_ZONED stub: no-op. */
static inline void btrfs_free_redirty_list(struct btrfs_transaction *trans)
{
}
|
|
|
|
|
2022-12-12 15:37:21 +08:00
|
|
|
static inline bool btrfs_use_zone_append(struct btrfs_bio *bbio)
|
2021-02-04 18:22:03 +08:00
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
2021-02-04 18:22:05 +08:00
|
|
|
|
2023-01-21 14:50:18 +08:00
|
|
|
/* !CONFIG_BLK_DEV_ZONED stub: no-op. */
static inline void
btrfs_record_physical_zoned(struct btrfs_bio *bbio) { }
|
|
|
|
|
|
|
|
/* !CONFIG_BLK_DEV_ZONED stub: no-op. */
static inline void
btrfs_rewrite_logical_zoned(struct btrfs_ordered_extent *ordered)
{
}
|
|
|
|
|
2021-02-04 18:22:08 +08:00
|
|
|
static inline bool btrfs_check_meta_write_pointer(struct btrfs_fs_info *fs_info,
|
|
|
|
struct extent_buffer *eb,
|
|
|
|
struct btrfs_block_group **cache_ret)
|
|
|
|
{
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* !CONFIG_BLK_DEV_ZONED stub: no-op. */
static inline void
btrfs_revert_meta_write_pointer(struct btrfs_block_group *cache,
				struct extent_buffer *eb) { }
|
|
|
|
|
2021-02-04 18:22:13 +08:00
|
|
|
/* !CONFIG_BLK_DEV_ZONED stub: unsupported, always returns -EOPNOTSUPP. */
static inline int
btrfs_zoned_issue_zeroout(struct btrfs_device *device, u64 physical,
			  u64 length)
{
	return -EOPNOTSUPP;
}
|
|
|
|
|
2021-02-04 18:22:14 +08:00
|
|
|
/* !CONFIG_BLK_DEV_ZONED stub: unsupported, always returns -EOPNOTSUPP. */
static inline int
btrfs_sync_zone_write_pointer(struct btrfs_device *tgt_dev, u64 logical,
			      u64 physical_start, u64 physical_pos)
{
	return -EOPNOTSUPP;
}
|
|
|
|
|
2021-08-19 20:19:17 +08:00
|
|
|
static inline bool btrfs_zone_activate(struct btrfs_block_group *block_group)
|
|
|
|
{
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* !CONFIG_BLK_DEV_ZONED stub: nothing to finish, returns 0. */
static inline int
btrfs_zone_finish(struct btrfs_block_group *block_group)
{
	return 0;
}
|
|
|
|
|
2021-08-19 20:19:22 +08:00
|
|
|
static inline bool btrfs_can_activate_zone(struct btrfs_fs_devices *fs_devices,
|
btrfs: zoned: fix chunk allocation condition for zoned allocator
The ZNS specification defines a limit on the number of "active"
zones. That limit impose us to limit the number of block groups which
can be used for an allocation at the same time. Not to exceed the
limit, we reuse the existing active block groups as much as possible
when we can't activate any other zones without sacrificing an already
activated block group in commit a85f05e59bc1 ("btrfs: zoned: avoid
chunk allocation if active block group has enough space").
However, the check is wrong in two ways. First, it checks the
condition for every raid index (ffe_ctl->index). Even if it reaches
the condition and "ffe_ctl->max_extent_size >=
ffe_ctl->min_alloc_size" is met, there can be other block groups
having enough space to hold ffe_ctl->num_bytes. (Actually, this won't
happen in the current zoned code as it only supports SINGLE
profile. But, it can happen once it enables other RAID types.)
Second, it checks the active zone availability depending on the
raid index. The raid index is just an index for
space_info->block_groups, so it has nothing to do with chunk allocation.
These mistakes are causing a faulty allocation in a certain
situation. Consider we are running zoned btrfs on a device whose
max_active_zone == 0 (no limit). And, suppose no block group have a
room to fit ffe_ctl->num_bytes but some room to meet
ffe_ctl->min_alloc_size (i.e. max_extent_size > num_bytes >=
min_alloc_size).
In this situation, the following occur:
- With SINGLE raid_index, it reaches the chunk allocation checking
code
- The check returns true because we can activate a new zone (no limit)
- But, before allocating the chunk, it iterates to the next raid index
(RAID5)
- Since there are no RAID5 block groups on zoned mode, it again
reaches the check code
- The check returns false because of btrfs_can_activate_zone()'s "if
(raid_index != BTRFS_RAID_SINGLE)" part
- That results in returning -ENOSPC without allocating a new chunk
As a result, we end up hitting -ENOSPC too early.
Move the check to the right place in the can_allocate_chunk() hook,
and do the active zone check depending on the allocation flag, not on
the raid index.
CC: stable@vger.kernel.org # 5.16
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2021-12-07 23:35:49 +08:00
|
|
|
u64 flags)
|
2021-08-19 20:19:22 +08:00
|
|
|
{
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2021-08-19 20:19:23 +08:00
|
|
|
/* !CONFIG_BLK_DEV_ZONED stub: no-op. */
static inline void
btrfs_zone_finish_endio(struct btrfs_fs_info *fs_info, u64 logical,
			u64 length)
{
}
|
|
|
|
|
2022-05-04 08:48:53 +08:00
|
|
|
/* !CONFIG_BLK_DEV_ZONED stub: no-op. */
static inline void
btrfs_schedule_zone_finish_bg(struct btrfs_block_group *bg,
			      struct extent_buffer *eb)
{
}
|
|
|
|
|
2021-09-09 00:19:26 +08:00
|
|
|
/* !CONFIG_BLK_DEV_ZONED stub: no-op. */
static inline void btrfs_clear_data_reloc_bg(struct btrfs_block_group *bg)
{
}
|
|
|
|
|
2021-11-11 13:14:38 +08:00
|
|
|
/* !CONFIG_BLK_DEV_ZONED stub: no-op. */
static inline void btrfs_free_zone_cache(struct btrfs_fs_info *fs_info)
{
}
|
2022-03-29 16:56:09 +08:00
|
|
|
|
|
|
|
static inline bool btrfs_zoned_should_reclaim(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
return false;
|
|
|
|
}
|
btrfs: zoned: prevent allocation from previous data relocation BG
After commit 5f0addf7b890 ("btrfs: zoned: use dedicated lock for data
relocation"), we observe IO errors on e.g, btrfs/232 like below.
[09.0][T4038707] WARNING: CPU: 3 PID: 4038707 at fs/btrfs/extent-tree.c:2381 btrfs_cross_ref_exist+0xfc/0x120 [btrfs]
<snip>
[09.9][T4038707] Call Trace:
[09.5][T4038707] <TASK>
[09.3][T4038707] run_delalloc_nocow+0x7f1/0x11a0 [btrfs]
[09.6][T4038707] ? test_range_bit+0x174/0x320 [btrfs]
[09.2][T4038707] ? fallback_to_cow+0x980/0x980 [btrfs]
[09.3][T4038707] ? find_lock_delalloc_range+0x33e/0x3e0 [btrfs]
[09.5][T4038707] btrfs_run_delalloc_range+0x445/0x1320 [btrfs]
[09.2][T4038707] ? test_range_bit+0x320/0x320 [btrfs]
[09.4][T4038707] ? lock_downgrade+0x6a0/0x6a0
[09.2][T4038707] ? orc_find.part.0+0x1ed/0x300
[09.5][T4038707] ? __module_address.part.0+0x25/0x300
[09.0][T4038707] writepage_delalloc+0x159/0x310 [btrfs]
<snip>
[09.4][ C3] sd 10:0:1:0: [sde] tag#2620 FAILED Result: hostbyte=DID_OK driverbyte=DRIVER_OK cmd_age=0s
[09.5][ C3] sd 10:0:1:0: [sde] tag#2620 Sense Key : Illegal Request [current]
[09.9][ C3] sd 10:0:1:0: [sde] tag#2620 Add. Sense: Unaligned write command
[09.5][ C3] sd 10:0:1:0: [sde] tag#2620 CDB: Write(16) 8a 00 00 00 00 00 02 f3 63 87 00 00 00 2c 00 00
[09.4][ C3] critical target error, dev sde, sector 396041272 op 0x1:(WRITE) flags 0x800 phys_seg 3 prio class 0
[09.9][ C3] BTRFS error (device dm-1): bdev /dev/mapper/dml_102_2 errs: wr 1, rd 0, flush 0, corrupt 0, gen 0
The IO errors occur when we allocate a regular extent in previous data
relocation block group.
On zoned btrfs, we use a dedicated block group to relocate a data
extent. Thus, we allocate relocating data extents (pre-alloc) only from
the dedicated block group and vice versa. Once the free space in the
dedicated block group gets tight, a relocating extent may not fit into
the block group. In that case, we need to switch the dedicated block
group to the next one. Then, the previous one is now freed up for
allocating a regular extent. The BG is already not enough to allocate
the relocating extent, but there is still room to allocate a smaller
extent. Now the problem happens. By allocating a regular extent while
nocow IOs for the relocation is still on-going, we will issue WRITE IOs
(for relocation) and ZONE APPEND IOs (for the regular writes) at the
same time. Those mixed IOs confuse the write pointer and give rise to
unaligned write errors.
This commit introduces a new bit 'zoned_data_reloc_ongoing' to the
btrfs_block_group. We set this bit before releasing the dedicated block
group, and no extents are allocated from a block group having this bit
set. This bit is similar to setting block_group->ro, but is different from
it by allowing nocow writes to start.
Once all the nocow IO for relocation is done (hooked from
btrfs_finish_ordered_io), we reset the bit to release the block group for
further allocation.
Fixes: c2707a255623 ("btrfs: zoned: add a dedicated data relocation block group")
CC: stable@vger.kernel.org # 5.16+
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2022-06-07 15:08:29 +08:00
|
|
|
|
|
|
|
/*
 * Empty variant of btrfs_zoned_release_data_reloc_bg(): @fs_info, @logical
 * and @length are all ignored and nothing is done.
 */
static inline void btrfs_zoned_release_data_reloc_bg(struct btrfs_fs_info *fs_info,
						     u64 logical, u64 length)
{
}
|
2022-07-09 07:18:44 +08:00
|
|
|
|
|
|
|
/*
 * Stub variant of btrfs_zone_finish_one_bg(): @fs_info is not examined and
 * the function unconditionally returns 1.
 */
static inline int btrfs_zone_finish_one_bg(struct btrfs_fs_info *fs_info)
{
	return 1;
}
|
|
|
|
|
2022-07-09 07:18:47 +08:00
|
|
|
/*
 * Stub variant of btrfs_zoned_activate_one_bg(): none of the arguments are
 * used and 0 is always returned.
 */
static inline int btrfs_zoned_activate_one_bg(struct btrfs_fs_info *fs_info,
					      struct btrfs_space_info *space_info,
					      bool do_finish)
{
	/* Every block group is treated as already active. */
	return 0;
}
|
|
|
|
|
2020-11-10 19:26:07 +08:00
|
|
|
#endif
|
|
|
|
|
|
|
|
/*
 * Report whether the zone containing byte offset @pos is flagged in the
 * device's seq_zones bitmap.
 *
 * Returns false when @device carries no zone_info.
 */
static inline bool btrfs_dev_is_sequential(struct btrfs_device *device, u64 pos)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;

	/* The bitmap index is @pos shifted down by zone_size_shift. */
	return zinfo && test_bit(pos >> zinfo->zone_size_shift, zinfo->seq_zones);
}
|
|
|
|
|
|
|
|
/*
 * Report whether the zone containing byte offset @pos is flagged in the
 * device's empty_zones bitmap.
 *
 * Returns true when @device carries no zone_info.
 */
static inline bool btrfs_dev_is_empty_zone(struct btrfs_device *device, u64 pos)
{
	struct btrfs_zoned_device_info *zinfo = device->zone_info;

	/* The bitmap index is @pos shifted down by zone_size_shift. */
	return !zinfo || test_bit(pos >> zinfo->zone_size_shift, zinfo->empty_zones);
}
|
|
|
|
|
|
|
|
static inline void btrfs_dev_set_empty_zone_bit(struct btrfs_device *device,
|
|
|
|
u64 pos, bool set)
|
|
|
|
{
|
|
|
|
struct btrfs_zoned_device_info *zone_info = device->zone_info;
|
|
|
|
unsigned int zno;
|
|
|
|
|
|
|
|
if (!zone_info)
|
|
|
|
return;
|
|
|
|
|
|
|
|
zno = pos >> zone_info->zone_size_shift;
|
|
|
|
if (set)
|
|
|
|
set_bit(zno, zone_info->empty_zones);
|
|
|
|
else
|
|
|
|
clear_bit(zno, zone_info->empty_zones);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Flag the zone containing byte offset @pos as empty by setting its bit via
 * btrfs_dev_set_empty_zone_bit().
 */
static inline void btrfs_dev_set_zone_empty(struct btrfs_device *device, u64 pos)
{
	const bool mark_empty = true;

	btrfs_dev_set_empty_zone_bit(device, pos, mark_empty);
}
|
|
|
|
|
|
|
|
/*
 * Drop the empty flag of the zone containing byte offset @pos by clearing
 * its bit via btrfs_dev_set_empty_zone_bit().
 */
static inline void btrfs_dev_clear_zone_empty(struct btrfs_device *device, u64 pos)
{
	const bool mark_empty = false;

	btrfs_dev_set_empty_zone_bit(device, pos, mark_empty);
}
|
|
|
|
|
2020-11-10 19:26:08 +08:00
|
|
|
static inline bool btrfs_check_device_zone_type(const struct btrfs_fs_info *fs_info,
|
|
|
|
struct block_device *bdev)
|
|
|
|
{
|
|
|
|
if (btrfs_is_zoned(fs_info)) {
|
2021-02-04 18:21:47 +08:00
|
|
|
/*
|
|
|
|
* We can allow a regular device on a zoned filesystem, because
|
|
|
|
* we will emulate the zoned capabilities.
|
|
|
|
*/
|
|
|
|
if (!bdev_is_zoned(bdev))
|
|
|
|
return true;
|
|
|
|
|
|
|
|
return fs_info->zone_size ==
|
|
|
|
(bdev_zone_sectors(bdev) << SECTOR_SHIFT);
|
2020-11-10 19:26:08 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Do not allow Host Manged zoned device */
|
|
|
|
return bdev_zoned_model(bdev) != BLK_ZONED_HM;
|
|
|
|
}
|
|
|
|
|
btrfs: implement log-structured superblock for ZONED mode
Superblock (and its copies) is the only data structure in btrfs which
has a fixed location on a device. Since we cannot overwrite in a
sequential write required zone, we cannot place superblock in the zone.
One easy solution is limiting superblock and copies to be placed only in
conventional zones. However, this method has two downsides: one is
reduced number of superblock copies. The location of the second copy of
superblock is 256GB, which is in a sequential write required zone on
typical devices in the market today. So, the number of superblock and
copies is limited to be two. Second downside is that we cannot support
devices which have no conventional zones at all.
To solve these two problems, we employ superblock log writing. It uses
two adjacent zones as a circular buffer to write updated superblocks.
Once the first zone is filled up, start writing into the second one.
Then, when both zones are filled up and before starting to write to the
first zone again, it resets the first zone.
We can determine the position of the latest superblock by reading write
pointer information from a device. One corner case is when both zones
are full. For this situation, we read out the last superblock of each
zone, and compare them to determine which zone is older.
The following zones are reserved as the circular buffer on ZONED btrfs.
- The primary superblock: zones 0 and 1
- The first copy: zones 16 and 17
- The second copy: zones 1024 or zone at 256GB which is minimum, and
next to it
If these reserved zones are conventional, superblock is written fixed at
the start of the zone without logging.
Signed-off-by: Naohiro Aota <naohiro.aota@wdc.com>
Reviewed-by: David Sterba <dsterba@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
2020-11-10 19:26:14 +08:00
|
|
|
/*
 * Check whether a superblock may be placed at byte offset @pos on @device.
 *
 * Any address is fine on a device without zone_info. On a zoned device only
 * zones that are not SEQUENTIAL WRITE REQUIRED qualify.
 */
static inline bool btrfs_check_super_location(struct btrfs_device *device, u64 pos)
{
	if (!device->zone_info)
		return true;

	return !btrfs_dev_is_sequential(device, pos);
}
|
|
|
|
|
2021-02-04 18:21:56 +08:00
|
|
|
/*
 * Check whether the range [@physical, @physical + @length) on @device can be
 * zone-reset.
 *
 * Returns true only when @physical lies in a sequential zone and both
 * @physical and @length are aligned to the device's zone size.
 */
static inline bool btrfs_can_zone_reset(struct btrfs_device *device,
					u64 physical, u64 length)
{
	u64 zsize;

	/*
	 * btrfs_dev_is_sequential() returns false when zone_info is missing,
	 * so zone_info is non-NULL below.
	 */
	if (!btrfs_dev_is_sequential(device, physical))
		return false;

	zsize = device->zone_info->zone_size;

	return IS_ALIGNED(physical, zsize) && IS_ALIGNED(length, zsize);
}
|
|
|
|
|
2021-02-04 18:22:08 +08:00
|
|
|
static inline void btrfs_zoned_meta_io_lock(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
if (!btrfs_is_zoned(fs_info))
|
|
|
|
return;
|
|
|
|
mutex_lock(&fs_info->zoned_meta_io_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void btrfs_zoned_meta_io_unlock(struct btrfs_fs_info *fs_info)
|
|
|
|
{
|
|
|
|
if (!btrfs_is_zoned(fs_info))
|
|
|
|
return;
|
|
|
|
mutex_unlock(&fs_info->zoned_meta_io_lock);
|
|
|
|
}
|
|
|
|
|
2021-02-04 18:22:18 +08:00
|
|
|
static inline void btrfs_clear_treelog_bg(struct btrfs_block_group *bg)
|
|
|
|
{
|
|
|
|
struct btrfs_fs_info *fs_info = bg->fs_info;
|
|
|
|
|
|
|
|
if (!btrfs_is_zoned(fs_info))
|
|
|
|
return;
|
|
|
|
|
|
|
|
spin_lock(&fs_info->treelog_bg_lock);
|
|
|
|
if (fs_info->treelog_bg == bg->start)
|
|
|
|
fs_info->treelog_bg = 0;
|
|
|
|
spin_unlock(&fs_info->treelog_bg_lock);
|
|
|
|
}
|
|
|
|
|
2021-12-07 22:28:34 +08:00
|
|
|
static inline void btrfs_zoned_data_reloc_lock(struct btrfs_inode *inode)
|
|
|
|
{
|
|
|
|
struct btrfs_root *root = inode->root;
|
|
|
|
|
|
|
|
if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info))
|
2022-04-18 15:15:03 +08:00
|
|
|
mutex_lock(&root->fs_info->zoned_data_reloc_io_lock);
|
2021-12-07 22:28:34 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void btrfs_zoned_data_reloc_unlock(struct btrfs_inode *inode)
|
|
|
|
{
|
|
|
|
struct btrfs_root *root = inode->root;
|
|
|
|
|
|
|
|
if (btrfs_is_data_reloc_root(root) && btrfs_is_zoned(root->fs_info))
|
2022-04-18 15:15:03 +08:00
|
|
|
mutex_unlock(&root->fs_info->zoned_data_reloc_io_lock);
|
2021-12-07 22:28:34 +08:00
|
|
|
}
|
|
|
|
|
2022-05-04 08:48:50 +08:00
|
|
|
static inline bool btrfs_zoned_bg_is_full(const struct btrfs_block_group *bg)
|
|
|
|
{
|
|
|
|
ASSERT(btrfs_is_zoned(bg->fs_info));
|
|
|
|
return (bg->alloc_offset == bg->zone_capacity);
|
|
|
|
}
|
|
|
|
|
2020-11-10 19:26:07 +08:00
|
|
|
#endif
|