btrfs: raid56: switch write path to rmw_rbio()
This includes the following changes:

- Implement new raid_unplug() function

  Now we don't need a workqueue to run the plug, as all we need to do is
  queue an rmw_rbio_work() call, which can be done without sleeping.

- Implement an rmw_rbio_work_locked() helper

  This is for unlock_stripe(), which is already holding the full stripe
  lock.

- Remove all the old functions

  This should already show how complex the old write path was, as we end
  up removing the following functions:

  * rmw_work()
  * validate_rbio_for_rmw()
  * raid56_rmw_end_io_work()
  * raid56_rmw_stripe()
  * full_stripe_write()
  * partial_stripe_write()
  * __raid56_parity_write()
  * run_plug()
  * unplug_work()
  * btrfs_raid_unplug()
  * __raid56_parity_recover()
  * raid_recover_end_io_work()

- Unexport rmw_rbio()

Signed-off-by: Qu Wenruo <wqu@suse.com>
Signed-off-by: David Sterba <dsterba@suse.com>
commit 93723095b5 (parent 5eb30ee26f)
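For orientation before the hunks, here is a condensed sketch of the write-path call flow that results from this patch. The function names are the ones appearing in the diff below; the summary itself is a review aid, not text from the kernel sources.

/*
 * Condensed write-path flow after this patch:
 *
 * raid56_parity_write()
 *   - full rbio, or no block plug active:
 *       start_async_work(rbio, rmw_rbio_work);
 *   - plugged partial write:
 *       list_add_tail(&rbio->plug_list, &plug->rbio_list);
 *       raid_unplug() later sorts and merges the list, then queues
 *       start_async_work(rbio, rmw_rbio_work) for each remaining rbio.
 *
 * rmw_rbio_work()         takes the full stripe lock via lock_stripe_add(),
 *                         then runs rmw_rbio().
 * rmw_rbio_work_locked()  queued by unlock_stripe(), which already holds
 *                         the full stripe lock, so it runs rmw_rbio() directly.
 * rmw_rbio()              the single RMW entry point, now static.
 */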
fs/btrfs/raid56.c
@@ -64,9 +64,9 @@ struct sector_ptr {
 	unsigned int uptodate:8;
 };
 
-static int __raid56_parity_recover(struct btrfs_raid_bio *rbio);
 static noinline void finish_rmw(struct btrfs_raid_bio *rbio);
-static void rmw_work(struct work_struct *work);
+static void rmw_rbio_work(struct work_struct *work);
+static void rmw_rbio_work_locked(struct work_struct *work);
 static int fail_bio_stripe(struct btrfs_raid_bio *rbio, struct bio *bio);
 static int fail_rbio_index(struct btrfs_raid_bio *rbio, int failed);
 static void index_rbio_pages(struct btrfs_raid_bio *rbio);
@@ -816,7 +816,7 @@ static noinline void unlock_stripe(struct btrfs_raid_bio *rbio)
 				start_async_work(next, recover_rbio_work_locked);
 			} else if (next->operation == BTRFS_RBIO_WRITE) {
 				steal_rbio(rbio, next);
-				start_async_work(next, rmw_work);
+				start_async_work(next, rmw_rbio_work_locked);
 			} else if (next->operation == BTRFS_RBIO_PARITY_SCRUB) {
 				steal_rbio(rbio, next);
 				start_async_work(next, scrub_parity_work);
@@ -1108,23 +1108,6 @@ static int rbio_add_io_sector(struct btrfs_raid_bio *rbio,
 	return 0;
 }
 
-/*
- * while we're doing the read/modify/write cycle, we could
- * have errors in reading pages off the disk. This checks
- * for errors and if we're not able to read the page it'll
- * trigger parity reconstruction. The rmw will be finished
- * after we've reconstructed the failed stripes
- */
-static void validate_rbio_for_rmw(struct btrfs_raid_bio *rbio)
-{
-	if (rbio->faila >= 0 || rbio->failb >= 0) {
-		BUG_ON(rbio->faila == rbio->real_stripes - 1);
-		__raid56_parity_recover(rbio);
-	} else {
-		finish_rmw(rbio);
-	}
-}
-
 static void index_one_bio(struct btrfs_raid_bio *rbio, struct bio *bio)
 {
 	const u32 sectorsize = rbio->bioc->fs_info->sectorsize;
@@ -1601,31 +1584,6 @@ static void raid56_bio_end_io(struct bio *bio)
 			   &rbio->end_io_work);
 }
 
-/*
- * End io handler for the read phase of the RMW cycle. All the bios here are
- * physical stripe bios we've read from the disk so we can recalculate the
- * parity of the stripe.
- *
- * This will usually kick off finish_rmw once all the bios are read in, but it
- * may trigger parity reconstruction if we had any errors along the way
- */
-static void raid56_rmw_end_io_work(struct work_struct *work)
-{
-	struct btrfs_raid_bio *rbio =
-		container_of(work, struct btrfs_raid_bio, end_io_work);
-
-	if (atomic_read(&rbio->error) > rbio->bioc->max_errors) {
-		rbio_orig_end_io(rbio, BLK_STS_IOERR);
-		return;
-	}
-
-	/*
-	 * This will normally call finish_rmw to start our write but if there
-	 * are any failed stripes we'll reconstruct from parity first.
-	 */
-	validate_rbio_for_rmw(rbio);
-}
-
 static int rmw_assemble_read_bios(struct btrfs_raid_bio *rbio,
 				  struct bio_list *bio_list)
 {
@@ -1686,122 +1644,6 @@ static int alloc_rbio_data_pages(struct btrfs_raid_bio *rbio)
 	return 0;
 }
 
-/*
- * the stripe must be locked by the caller. It will
- * unlock after all the writes are done
- */
-static int raid56_rmw_stripe(struct btrfs_raid_bio *rbio)
-{
-	int bios_to_read = 0;
-	struct bio_list bio_list;
-	int ret;
-	struct bio *bio;
-
-	bio_list_init(&bio_list);
-
-	ret = alloc_rbio_pages(rbio);
-	if (ret)
-		goto cleanup;
-
-	index_rbio_pages(rbio);
-
-	atomic_set(&rbio->error, 0);
-
-	ret = rmw_assemble_read_bios(rbio, &bio_list);
-	if (ret < 0)
-		goto cleanup;
-
-	bios_to_read = bio_list_size(&bio_list);
-	if (!bios_to_read) {
-		/*
-		 * this can happen if others have merged with
-		 * us, it means there is nothing left to read.
-		 * But if there are missing devices it may not be
-		 * safe to do the full stripe write yet.
-		 */
-		goto finish;
-	}
-
-	/*
-	 * The bioc may be freed once we submit the last bio. Make sure not to
-	 * touch it after that.
-	 */
-	atomic_set(&rbio->stripes_pending, bios_to_read);
-	INIT_WORK(&rbio->end_io_work, raid56_rmw_end_io_work);
-	while ((bio = bio_list_pop(&bio_list))) {
-		bio->bi_end_io = raid56_bio_end_io;
-
-		if (trace_raid56_read_partial_enabled()) {
-			struct raid56_bio_trace_info trace_info = { 0 };
-
-			bio_get_trace_info(rbio, bio, &trace_info);
-			trace_raid56_read_partial(rbio, bio, &trace_info);
-		}
-		submit_bio(bio);
-	}
-	/* the actual write will happen once the reads are done */
-	return 0;
-
-cleanup:
-	rbio_orig_end_io(rbio, BLK_STS_IOERR);
-
-	while ((bio = bio_list_pop(&bio_list)))
-		bio_put(bio);
-
-	return -EIO;
-
-finish:
-	validate_rbio_for_rmw(rbio);
-	return 0;
-}
-
-/*
- * if the upper layers pass in a full stripe, we thank them by only allocating
- * enough pages to hold the parity, and sending it all down quickly.
- */
-static int full_stripe_write(struct btrfs_raid_bio *rbio)
-{
-	int ret;
-
-	ret = alloc_rbio_parity_pages(rbio);
-	if (ret)
-		return ret;
-
-	ret = lock_stripe_add(rbio);
-	if (ret == 0)
-		finish_rmw(rbio);
-	return 0;
-}
-
-/*
- * partial stripe writes get handed over to async helpers.
- * We're really hoping to merge a few more writes into this
- * rbio before calculating new parity
- */
-static int partial_stripe_write(struct btrfs_raid_bio *rbio)
-{
-	int ret;
-
-	ret = lock_stripe_add(rbio);
-	if (ret == 0)
-		start_async_work(rbio, rmw_work);
-	return 0;
-}
-
-/*
- * sometimes while we were reading from the drive to
- * recalculate parity, enough new bios come into create
- * a full stripe. So we do a check here to see if we can
- * go directly to finish_rmw
- */
-static int __raid56_parity_write(struct btrfs_raid_bio *rbio)
-{
-	/* head off into rmw land if we don't have a full stripe */
-	if (!rbio_is_full(rbio))
-		return partial_stripe_write(rbio);
-	return full_stripe_write(rbio);
-}
-
 /*
  * We use plugging call backs to collect full stripes.
  * Any time we get a partial stripe write while plugged
@@ -1836,28 +1678,22 @@ static int plug_cmp(void *priv, const struct list_head *a,
 	return 0;
 }
 
-static void run_plug(struct btrfs_plug_cb *plug)
+static void raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
 {
+	struct btrfs_plug_cb *plug = container_of(cb, struct btrfs_plug_cb, cb);
 	struct btrfs_raid_bio *cur;
 	struct btrfs_raid_bio *last = NULL;
 
 	/*
 	 * sort our plug list then try to merge
 	 * everything we can in hopes of creating full
 	 * stripes.
 	 */
 	list_sort(NULL, &plug->rbio_list, plug_cmp);
 
 	while (!list_empty(&plug->rbio_list)) {
 		cur = list_entry(plug->rbio_list.next,
 				 struct btrfs_raid_bio, plug_list);
 		list_del_init(&cur->plug_list);
 
 		if (rbio_is_full(cur)) {
-			int ret;
-
-			/* we have a full stripe, send it down */
-			ret = full_stripe_write(cur);
-			BUG_ON(ret);
+			/* We have a full stripe, queue it down. */
+			start_async_work(cur, rmw_rbio_work);
 			continue;
 		}
 		if (last) {
@@ -1865,42 +1701,16 @@ static void run_plug(struct btrfs_plug_cb *plug)
 				merge_rbio(last, cur);
 				free_raid_bio(cur);
 				continue;
-
 			}
-			__raid56_parity_write(last);
+			start_async_work(last, rmw_rbio_work);
 		}
 		last = cur;
 	}
-	if (last) {
-		__raid56_parity_write(last);
-	}
+	if (last)
+		start_async_work(last, rmw_rbio_work);
 	kfree(plug);
 }
 
-/*
- * if the unplug comes from schedule, we have to push the
- * work off to a helper thread
- */
-static void unplug_work(struct work_struct *work)
-{
-	struct btrfs_plug_cb *plug;
-	plug = container_of(work, struct btrfs_plug_cb, work);
-	run_plug(plug);
-}
-
-static void btrfs_raid_unplug(struct blk_plug_cb *cb, bool from_schedule)
-{
-	struct btrfs_plug_cb *plug;
-	plug = container_of(cb, struct btrfs_plug_cb, cb);
-
-	if (from_schedule) {
-		INIT_WORK(&plug->work, unplug_work);
-		queue_work(plug->info->rmw_workers, &plug->work);
-		return;
-	}
-	run_plug(plug);
-}
-
 /* Add the original bio into rbio->bio_list, and update rbio::dbitmap. */
 static void rbio_add_bio(struct btrfs_raid_bio *rbio, struct bio *orig_bio)
 {
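The from_schedule special case (and with it unplug_work() and the extra workqueue bounce) can go away because queuing the per-rbio work item never sleeps. For reference, start_async_work() in raid56.c is roughly the helper below; the exact body is quoted from memory and is not part of this diff:

static void start_async_work(struct btrfs_raid_bio *rbio, work_func_t work_func)
{
	/* Only initializes and queues the work item; queue_work() does not sleep. */
	INIT_WORK(&rbio->work, work_func);
	queue_work(rbio->bioc->fs_info->rmw_workers, &rbio->work);
}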
@@ -1948,19 +1758,13 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
 	rbio_add_bio(rbio, bio);
 
 	/*
-	 * don't plug on full rbios, just get them out the door
+	 * Don't plug on full rbios, just get them out the door
 	 * as quickly as we can
 	 */
-	if (rbio_is_full(rbio)) {
-		ret = full_stripe_write(rbio);
-		if (ret) {
-			free_raid_bio(rbio);
-			goto fail;
-		}
-		return;
-	}
+	if (rbio_is_full(rbio))
+		goto queue_rbio;
 
-	cb = blk_check_plugged(btrfs_raid_unplug, fs_info, sizeof(*plug));
+	cb = blk_check_plugged(raid_unplug, fs_info, sizeof(*plug));
 	if (cb) {
 		plug = container_of(cb, struct btrfs_plug_cb, cb);
 		if (!plug->info) {
@@ -1968,13 +1772,14 @@ void raid56_parity_write(struct bio *bio, struct btrfs_io_context *bioc)
 			INIT_LIST_HEAD(&plug->rbio_list);
 		}
 		list_add_tail(&rbio->plug_list, &plug->rbio_list);
-	} else {
-		ret = __raid56_parity_write(rbio);
-		if (ret) {
-			free_raid_bio(rbio);
-			goto fail;
-		}
 		return;
 	}
+queue_rbio:
+	/*
+	 * Either we don't have any existing plug, or we're doing a full stripe,
+	 * can queue the rmw work now.
+	 */
+	start_async_work(rbio, rmw_rbio_work);
+
 	return;
 
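With the queue_rbio label, the full-stripe case and the "no plug" case both end up at the same start_async_work(rbio, rmw_rbio_work) call; only plugged partial writes are parked on plug->rbio_list. The behaviour of the block-layer helper is what makes the NULL branch the natural place for immediate queuing; summarized below as a note on its semantics rather than a quote of its implementation:

/*
 * blk_check_plugged(unplug_fn, data, size):
 *   - if the current task has an active blk_plug, return an existing callback
 *     registered for (unplug_fn, data), or allocate a zeroed one of @size
 *     bytes and attach it; unplug_fn runs when the plug is flushed.
 *   - if there is no active plug, return NULL.
 *
 * So the NULL branch queues rmw_rbio_work() immediately, while the plugged
 * branch only parks the rbio on plug->rbio_list for raid_unplug() to handle.
 */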
@@ -2217,21 +2022,6 @@ static void __raid_recover_end_io(struct btrfs_raid_bio *rbio)
 	}
 }
 
-/*
- * This is called only for stripes we've read from disk to reconstruct the
- * parity.
- */
-static void raid_recover_end_io_work(struct work_struct *work)
-{
-	struct btrfs_raid_bio *rbio =
-		container_of(work, struct btrfs_raid_bio, end_io_work);
-
-	if (atomic_read(&rbio->error) > rbio->bioc->max_errors)
-		rbio_orig_end_io(rbio, BLK_STS_IOERR);
-	else
-		__raid_recover_end_io(rbio);
-}
-
 static int recover_assemble_read_bios(struct btrfs_raid_bio *rbio,
 				      struct bio_list *bio_list)
 {
@@ -2348,79 +2138,6 @@ static void recover_rbio_work_locked(struct work_struct *work)
 	rbio_orig_end_io(rbio, errno_to_blk_status(ret));
 }
 
-/*
- * reads everything we need off the disk to reconstruct
- * the parity. endio handlers trigger final reconstruction
- * when the IO is done.
- *
- * This is used both for reads from the higher layers and for
- * parity construction required to finish a rmw cycle.
- */
-static int __raid56_parity_recover(struct btrfs_raid_bio *rbio)
-{
-	int bios_to_read = 0;
-	struct bio_list bio_list;
-	int ret;
-	struct bio *bio;
-
-	bio_list_init(&bio_list);
-
-	ret = alloc_rbio_pages(rbio);
-	if (ret)
-		goto cleanup;
-
-	atomic_set(&rbio->error, 0);
-
-	ret = recover_assemble_read_bios(rbio, &bio_list);
-	if (ret < 0)
-		goto cleanup;
-
-	bios_to_read = bio_list_size(&bio_list);
-	if (!bios_to_read) {
-		/*
-		 * we might have no bios to read just because the pages
-		 * were up to date, or we might have no bios to read because
-		 * the devices were gone.
-		 */
-		if (atomic_read(&rbio->error) <= rbio->bioc->max_errors) {
-			__raid_recover_end_io(rbio);
-			return 0;
-		} else {
-			goto cleanup;
-		}
-	}
-
-	/*
-	 * The bioc may be freed once we submit the last bio. Make sure not to
-	 * touch it after that.
-	 */
-	atomic_set(&rbio->stripes_pending, bios_to_read);
-	INIT_WORK(&rbio->end_io_work, raid_recover_end_io_work);
-	while ((bio = bio_list_pop(&bio_list))) {
-		bio->bi_end_io = raid56_bio_end_io;
-
-		if (trace_raid56_scrub_read_recover_enabled()) {
-			struct raid56_bio_trace_info trace_info = { 0 };
-
-			bio_get_trace_info(rbio, bio, &trace_info);
-			trace_raid56_scrub_read_recover(rbio, bio, &trace_info);
-		}
-		submit_bio(bio);
-	}
-
-	return 0;
-
-cleanup:
-	if (rbio->operation == BTRFS_RBIO_READ_REBUILD ||
-	    rbio->operation == BTRFS_RBIO_REBUILD_MISSING)
-		rbio_orig_end_io(rbio, BLK_STS_IOERR);
-
-	while ((bio = bio_list_pop(&bio_list)))
-		bio_put(bio);
-
-	return -EIO;
-}
-
 /*
  * the main entry point for reads from the higher layers. This
  * is really only called when the normal read path had a failure,
|
|||
}
|
||||
}
|
||||
|
||||
int rmw_rbio(struct btrfs_raid_bio *rbio)
|
||||
static int rmw_rbio(struct btrfs_raid_bio *rbio)
|
||||
{
|
||||
struct bio_list bio_list;
|
||||
int sectornr;
|
||||
|
@ -2615,12 +2332,29 @@ write:
|
|||
return ret;
|
||||
}
|
||||
|
||||
static void rmw_work(struct work_struct *work)
|
||||
static void rmw_rbio_work(struct work_struct *work)
|
||||
{
|
||||
struct btrfs_raid_bio *rbio;
|
||||
int ret;
|
||||
|
||||
rbio = container_of(work, struct btrfs_raid_bio, work);
|
||||
raid56_rmw_stripe(rbio);
|
||||
|
||||
ret = lock_stripe_add(rbio);
|
||||
if (ret == 0) {
|
||||
ret = rmw_rbio(rbio);
|
||||
rbio_orig_end_io(rbio, errno_to_blk_status(ret));
|
||||
}
|
||||
}
|
||||
|
||||
static void rmw_rbio_work_locked(struct work_struct *work)
|
||||
{
|
||||
struct btrfs_raid_bio *rbio;
|
||||
int ret;
|
||||
|
||||
rbio = container_of(work, struct btrfs_raid_bio, work);
|
||||
|
||||
ret = rmw_rbio(rbio);
|
||||
rbio_orig_end_io(rbio, errno_to_blk_status(ret));
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
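Worth spelling out how ownership is handed off between the two new work helpers; the lock_stripe_add() return convention described here comes from the existing code, not from this diff:

/*
 * rmw_rbio_work():
 *   lock_stripe_add(rbio) == 0  -> this rbio owns the full stripe lock;
 *                                  run rmw_rbio() and complete the original
 *                                  bios via rbio_orig_end_io().
 *   lock_stripe_add(rbio) != 0  -> the rbio was merged with, or queued
 *                                  behind, the current lock holder; nothing
 *                                  to do here, unlock_stripe() will later
 *                                  queue rmw_rbio_work_locked() for it.
 *
 * rmw_rbio_work_locked():
 *   the full stripe lock is already held (unlock_stripe() path), so call
 *   rmw_rbio() directly and complete the original bios.
 */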
fs/btrfs/raid56.h
@@ -185,9 +185,4 @@ void raid56_submit_missing_rbio(struct btrfs_raid_bio *rbio);
 int btrfs_alloc_stripe_hash_table(struct btrfs_fs_info *info);
 void btrfs_free_stripe_hash_table(struct btrfs_fs_info *info);
 
-/*
- * Placeholder definition to avoid warning, will be removed when
- * the full write path is migrated.
- */
-int rmw_rbio(struct btrfs_raid_bio *rbio);
 #endif