Merge tag 'md/4.10-rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md
Pull md fixes from Shaohua Li:
 "Basically one fix for raid5 cache which is merged in this cycle,
  others are trivial fixes"

* tag 'md/4.10-rc3' of git://git.kernel.org/pub/scm/linux/kernel/git/shli/md:
  md/raid5: Use correct IS_ERR() variation on pointer check
  md: cleanup mddev flag clear for takeover
  md/r5cache: fix spelling mistake on "recoverying"
  md/r5cache: assign conf->log before r5l_load_log()
  md/r5cache: simplify handling of sh->log_start in recovery
  md/raid5-cache: removes unnecessary write-through mode judgments
  md/raid10: Refactor raid10_make_request
  md/raid1: Refactor raid1_make_request
commit 607ae5f269
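The first fix in the list swaps IS_ERR_VALUE() for IS_ERR() on the pointer returned by setup_conf() in raid5_takeover_raid1() (see the raid5_takeover_raid1() hunk at the end of the diff). The following is a minimal user-space sketch of the ERR_PTR convention, showing why the pointer-flavoured check is the right one for a pointer return; the macros are simplified stand-ins for the kernel's err.h helpers, and setup_conf_stub() is an invented placeholder, not md code.

```c
#include <stdio.h>
#include <errno.h>

/* simplified stand-ins for the kernel's ERR_PTR/IS_ERR machinery */
#define MAX_ERRNO	4095
#define ERR_PTR(err)	((void *)(long)(err))
#define PTR_ERR(ptr)	((long)(ptr))
#define IS_ERR(ptr)	((unsigned long)(ptr) >= (unsigned long)-MAX_ERRNO)

static void *setup_conf_stub(int fail)
{
	static int conf;			/* stands in for a real configuration object */

	return fail ? ERR_PTR(-ENOMEM) : &conf;
}

int main(void)
{
	void *ret = setup_conf_stub(0);

	if (!IS_ERR(ret))			/* correct: pointer-style error check */
		printf("takeover ok, conf=%p\n", ret);

	ret = setup_conf_stub(1);
	if (IS_ERR(ret))			/* error-encoding pointer detected */
		printf("takeover failed: %ld\n", PTR_ERR(ret));
	return 0;
}
```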
@@ -212,6 +212,7 @@ extern int rdev_clear_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
int is_new);
struct md_cluster_info;

/* change UNSUPPORTED_MDDEV_FLAGS for each array type if new flag is added */
enum mddev_flags {
MD_ARRAY_FIRST_USE, /* First use of array, needs initialization */
MD_CLOSING, /* If set, we are closing the array, do not open

@@ -702,4 +703,11 @@ static inline int mddev_is_clustered(struct mddev *mddev)
{
return mddev->cluster_info && mddev->bitmap_info.nodes > 1;
}

/* clear unsupported mddev_flags */
static inline void mddev_clear_unsupported_flags(struct mddev *mddev,
unsigned long unsupported_flags)
{
mddev->flags &= ~unsupported_flags;
}
#endif /* _MD_MD_H */

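The new mddev_clear_unsupported_flags() helper above lets each personality drop a whole mask of flags it does not support in one step during takeover, instead of repeating individual clear_bit() calls. A tiny stand-alone sketch of the same mask-clearing idea; the bit positions below are illustrative only, not the kernel's actual enum mddev_flags values.

```c
#include <stdio.h>

/* illustrative bit positions only */
enum { MD_HAS_JOURNAL, MD_JOURNAL_CLEAN, MD_FAILFAST_SUPPORTED };

#define UNSUPPORTED_FLAGS			\
	((1UL << MD_HAS_JOURNAL) |		\
	 (1UL << MD_JOURNAL_CLEAN) |		\
	 (1UL << MD_FAILFAST_SUPPORTED))

int main(void)
{
	unsigned long flags = (1UL << MD_HAS_JOURNAL) |
			      (1UL << MD_FAILFAST_SUPPORTED);

	flags &= ~UNSUPPORTED_FLAGS;	/* same operation the helper performs */
	printf("flags after takeover: %#lx\n", flags);
	return 0;
}
```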
@@ -26,6 +26,11 @@
#include "raid0.h"
#include "raid5.h"

#define UNSUPPORTED_MDDEV_FLAGS \
((1L << MD_HAS_JOURNAL) | \
(1L << MD_JOURNAL_CLEAN) | \
(1L << MD_FAILFAST_SUPPORTED))

static int raid0_congested(struct mddev *mddev, int bits)
{
struct r0conf *conf = mddev->private;

@@ -539,8 +544,7 @@ static void *raid0_takeover_raid45(struct mddev *mddev)
mddev->delta_disks = -1;
/* make sure it will be not marked as dirty */
mddev->recovery_cp = MaxSector;
clear_bit(MD_HAS_JOURNAL, &mddev->flags);
clear_bit(MD_JOURNAL_CLEAN, &mddev->flags);
mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS);

create_strip_zones(mddev, &priv_conf);

@@ -583,7 +587,7 @@ static void *raid0_takeover_raid10(struct mddev *mddev)
mddev->degraded = 0;
/* make sure it will be not marked as dirty */
mddev->recovery_cp = MaxSector;
clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS);

create_strip_zones(mddev, &priv_conf);
return priv_conf;

@@ -626,7 +630,7 @@ static void *raid0_takeover_raid1(struct mddev *mddev)
mddev->raid_disks = 1;
/* make sure it will be not marked as dirty */
mddev->recovery_cp = MaxSector;
clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
mddev_clear_unsupported_flags(mddev, UNSUPPORTED_MDDEV_FLAGS);

create_strip_zones(mddev, &priv_conf);
return priv_conf;

@@ -42,6 +42,10 @@
#include "raid1.h"
#include "bitmap.h"

#define UNSUPPORTED_MDDEV_FLAGS \
((1L << MD_HAS_JOURNAL) | \
(1L << MD_JOURNAL_CLEAN))

/*
* Number of guaranteed r1bios in case of extreme VM load:
*/

@@ -1066,17 +1070,107 @@ static void raid1_unplug(struct blk_plug_cb *cb, bool from_schedule)
kfree(plug);
}

static void raid1_make_request(struct mddev *mddev, struct bio * bio)
static void raid1_read_request(struct mddev *mddev, struct bio *bio,
struct r1bio *r1_bio)
{
struct r1conf *conf = mddev->private;
struct raid1_info *mirror;
struct r1bio *r1_bio;
struct bio *read_bio;
struct bitmap *bitmap = mddev->bitmap;
const int op = bio_op(bio);
const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
int sectors_handled;
int max_sectors;
int rdisk;

wait_barrier(conf, bio);

read_again:
rdisk = read_balance(conf, r1_bio, &max_sectors);

if (rdisk < 0) {
/* couldn't find anywhere to read from */
raid_end_bio_io(r1_bio);
return;
}
mirror = conf->mirrors + rdisk;

if (test_bit(WriteMostly, &mirror->rdev->flags) &&
bitmap) {
/*
* Reading from a write-mostly device must take care not to
* over-take any writes that are 'behind'
*/
raid1_log(mddev, "wait behind writes");
wait_event(bitmap->behind_wait,
atomic_read(&bitmap->behind_writes) == 0);
}
r1_bio->read_disk = rdisk;
r1_bio->start_next_window = 0;

read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
max_sectors);

r1_bio->bios[rdisk] = read_bio;

read_bio->bi_iter.bi_sector = r1_bio->sector +
mirror->rdev->data_offset;
read_bio->bi_bdev = mirror->rdev->bdev;
read_bio->bi_end_io = raid1_end_read_request;
bio_set_op_attrs(read_bio, op, do_sync);
if (test_bit(FailFast, &mirror->rdev->flags) &&
test_bit(R1BIO_FailFast, &r1_bio->state))
read_bio->bi_opf |= MD_FAILFAST;
read_bio->bi_private = r1_bio;

if (mddev->gendisk)
trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev),
read_bio, disk_devt(mddev->gendisk),
r1_bio->sector);

if (max_sectors < r1_bio->sectors) {
/*
* could not read all from this device, so we will need another
* r1_bio.
*/
sectors_handled = (r1_bio->sector + max_sectors
- bio->bi_iter.bi_sector);
r1_bio->sectors = max_sectors;
spin_lock_irq(&conf->device_lock);
if (bio->bi_phys_segments == 0)
bio->bi_phys_segments = 2;
else
bio->bi_phys_segments++;
spin_unlock_irq(&conf->device_lock);

/*
* Cannot call generic_make_request directly as that will be
* queued in __make_request and subsequent mempool_alloc might
* block waiting for it. So hand bio over to raid1d.
*/
reschedule_retry(r1_bio);

r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);

r1_bio->master_bio = bio;
r1_bio->sectors = bio_sectors(bio) - sectors_handled;
r1_bio->state = 0;
r1_bio->mddev = mddev;
r1_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
goto read_again;
} else
generic_make_request(read_bio);
}

static void raid1_write_request(struct mddev *mddev, struct bio *bio,
struct r1bio *r1_bio)
{
struct r1conf *conf = mddev->private;
int i, disks;
struct bitmap *bitmap;
struct bitmap *bitmap = mddev->bitmap;
unsigned long flags;
const int op = bio_op(bio);
const int rw = bio_data_dir(bio);
const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
const unsigned long do_flush_fua = (bio->bi_opf &
(REQ_PREFLUSH | REQ_FUA));

@@ -1096,15 +1190,15 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)

md_write_start(mddev, bio); /* wait on superblock update early */

if (bio_data_dir(bio) == WRITE &&
((bio_end_sector(bio) > mddev->suspend_lo &&
if ((bio_end_sector(bio) > mddev->suspend_lo &&
bio->bi_iter.bi_sector < mddev->suspend_hi) ||
(mddev_is_clustered(mddev) &&
md_cluster_ops->area_resyncing(mddev, WRITE,
bio->bi_iter.bi_sector, bio_end_sector(bio))))) {
/* As the suspend_* range is controlled by
* userspace, we want an interruptible
* wait.
bio->bi_iter.bi_sector, bio_end_sector(bio)))) {

/*
* As the suspend_* range is controlled by userspace, we want
* an interruptible wait.
*/
DEFINE_WAIT(w);
for (;;) {

@@ -1115,128 +1209,15 @@ static void raid1_make_request(struct mddev *mddev, struct bio * bio)
bio->bi_iter.bi_sector >= mddev->suspend_hi ||
(mddev_is_clustered(mddev) &&
!md_cluster_ops->area_resyncing(mddev, WRITE,
bio->bi_iter.bi_sector, bio_end_sector(bio))))
bio->bi_iter.bi_sector,
bio_end_sector(bio))))
break;
schedule();
}
finish_wait(&conf->wait_barrier, &w);
}

start_next_window = wait_barrier(conf, bio);

bitmap = mddev->bitmap;

/*
* make_request() can abort the operation when read-ahead is being
* used and no empty request is available.
*
*/
r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);

r1_bio->master_bio = bio;
r1_bio->sectors = bio_sectors(bio);
r1_bio->state = 0;
r1_bio->mddev = mddev;
r1_bio->sector = bio->bi_iter.bi_sector;

/* We might need to issue multiple reads to different
* devices if there are bad blocks around, so we keep
* track of the number of reads in bio->bi_phys_segments.
* If this is 0, there is only one r1_bio and no locking
* will be needed when requests complete. If it is
* non-zero, then it is the number of not-completed requests.
*/
bio->bi_phys_segments = 0;
bio_clear_flag(bio, BIO_SEG_VALID);

if (rw == READ) {
/*
* read balancing logic:
*/
int rdisk;

read_again:
rdisk = read_balance(conf, r1_bio, &max_sectors);

if (rdisk < 0) {
/* couldn't find anywhere to read from */
raid_end_bio_io(r1_bio);
return;
}
mirror = conf->mirrors + rdisk;

if (test_bit(WriteMostly, &mirror->rdev->flags) &&
bitmap) {
/* Reading from a write-mostly device must
* take care not to over-take any writes
* that are 'behind'
*/
raid1_log(mddev, "wait behind writes");
wait_event(bitmap->behind_wait,
atomic_read(&bitmap->behind_writes) == 0);
}
r1_bio->read_disk = rdisk;
r1_bio->start_next_window = 0;

read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
bio_trim(read_bio, r1_bio->sector - bio->bi_iter.bi_sector,
max_sectors);

r1_bio->bios[rdisk] = read_bio;

read_bio->bi_iter.bi_sector = r1_bio->sector +
mirror->rdev->data_offset;
read_bio->bi_bdev = mirror->rdev->bdev;
read_bio->bi_end_io = raid1_end_read_request;
bio_set_op_attrs(read_bio, op, do_sync);
if (test_bit(FailFast, &mirror->rdev->flags) &&
test_bit(R1BIO_FailFast, &r1_bio->state))
read_bio->bi_opf |= MD_FAILFAST;
read_bio->bi_private = r1_bio;

if (mddev->gendisk)
trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev),
read_bio, disk_devt(mddev->gendisk),
r1_bio->sector);

if (max_sectors < r1_bio->sectors) {
/* could not read all from this device, so we will
* need another r1_bio.
*/

sectors_handled = (r1_bio->sector + max_sectors
- bio->bi_iter.bi_sector);
r1_bio->sectors = max_sectors;
spin_lock_irq(&conf->device_lock);
if (bio->bi_phys_segments == 0)
bio->bi_phys_segments = 2;
else
bio->bi_phys_segments++;
spin_unlock_irq(&conf->device_lock);
/* Cannot call generic_make_request directly
* as that will be queued in __make_request
* and subsequent mempool_alloc might block waiting
* for it. So hand bio over to raid1d.
*/
reschedule_retry(r1_bio);

r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);

r1_bio->master_bio = bio;
r1_bio->sectors = bio_sectors(bio) - sectors_handled;
r1_bio->state = 0;
r1_bio->mddev = mddev;
r1_bio->sector = bio->bi_iter.bi_sector +
sectors_handled;
goto read_again;
} else
generic_make_request(read_bio);
return;
}

/*
* WRITE:
*/
if (conf->pending_count >= max_queued_requests) {
md_wakeup_thread(mddev->thread);
raid1_log(mddev, "wait queued");

@@ -1280,8 +1261,7 @@ read_again:
int bad_sectors;
int is_bad;

is_bad = is_badblock(rdev, r1_bio->sector,
max_sectors,
is_bad = is_badblock(rdev, r1_bio->sector, max_sectors,
&first_bad, &bad_sectors);
if (is_bad < 0) {
/* mustn't write here until the bad block is

@@ -1370,7 +1350,8 @@ read_again:
continue;

mbio = bio_clone_mddev(bio, GFP_NOIO, mddev);
bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector, max_sectors);
bio_trim(mbio, r1_bio->sector - bio->bi_iter.bi_sector,
max_sectors);

if (first_clone) {
/* do behind I/O ?

@@ -1464,6 +1445,40 @@ read_again:
wake_up(&conf->wait_barrier);
}

static void raid1_make_request(struct mddev *mddev, struct bio *bio)
{
struct r1conf *conf = mddev->private;
struct r1bio *r1_bio;

/*
* make_request() can abort the operation when read-ahead is being
* used and no empty request is available.
*
*/
r1_bio = mempool_alloc(conf->r1bio_pool, GFP_NOIO);

r1_bio->master_bio = bio;
r1_bio->sectors = bio_sectors(bio);
r1_bio->state = 0;
r1_bio->mddev = mddev;
r1_bio->sector = bio->bi_iter.bi_sector;

/*
* We might need to issue multiple reads to different devices if there
* are bad blocks around, so we keep track of the number of reads in
* bio->bi_phys_segments. If this is 0, there is only one r1_bio and
* no locking will be needed when requests complete. If it is
* non-zero, then it is the number of not-completed requests.
*/
bio->bi_phys_segments = 0;
bio_clear_flag(bio, BIO_SEG_VALID);

if (bio_data_dir(bio) == READ)
raid1_read_request(mddev, bio, r1_bio);
else
raid1_write_request(mddev, bio, r1_bio);
}

static void raid1_status(struct seq_file *seq, struct mddev *mddev)
{
struct r1conf *conf = mddev->private;

@@ -3246,8 +3261,8 @@ static void *raid1_takeover(struct mddev *mddev)
if (!IS_ERR(conf)) {
/* Array must appear to be quiesced */
conf->array_frozen = 1;
clear_bit(MD_HAS_JOURNAL, &mddev->flags);
clear_bit(MD_JOURNAL_CLEAN, &mddev->flags);
mddev_clear_unsupported_flags(mddev,
UNSUPPORTED_MDDEV_FLAGS);
}
return conf;
}

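The two refactor commits give raid1 and raid10 the same shape: the make_request entry point only allocates and fills the per-request structure, then dispatches to a dedicated read or write helper. A schematic user-space sketch of that control flow follows; the types and helper names are invented placeholders for illustration, not the md code itself.

```c
#include <stdio.h>
#include <stdlib.h>

enum req_dir { DIR_READ, DIR_WRITE };

/* stand-in for the per-request state (struct bio + struct r1bio/r10bio) */
struct request_ctx {
	enum req_dir dir;
	unsigned long sector;
	unsigned int sectors;
};

static void handle_read(struct request_ctx *ctx)
{
	printf("read  %u sectors at %lu\n", ctx->sectors, ctx->sector);
}

static void handle_write(struct request_ctx *ctx)
{
	printf("write %u sectors at %lu\n", ctx->sectors, ctx->sector);
}

/* mirrors the new entry-point shape: set up common state, then dispatch */
static void make_request(enum req_dir dir, unsigned long sector, unsigned int n)
{
	struct request_ctx *ctx = malloc(sizeof(*ctx));

	if (!ctx)
		return;
	ctx->dir = dir;
	ctx->sector = sector;
	ctx->sectors = n;

	if (ctx->dir == DIR_READ)
		handle_read(ctx);
	else
		handle_write(ctx);
	free(ctx);
}

int main(void)
{
	make_request(DIR_READ, 2048, 8);
	make_request(DIR_WRITE, 4096, 16);
	return 0;
}
```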
@@ -1087,23 +1087,122 @@ static void raid10_unplug(struct blk_plug_cb *cb, bool from_schedule)
kfree(plug);
}

static void __make_request(struct mddev *mddev, struct bio *bio)
static void raid10_read_request(struct mddev *mddev, struct bio *bio,
struct r10bio *r10_bio)
{
struct r10conf *conf = mddev->private;
struct r10bio *r10_bio;
struct bio *read_bio;
const int op = bio_op(bio);
const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
int sectors_handled;
int max_sectors;
sector_t sectors;
struct md_rdev *rdev;
int slot;

/*
* Register the new request and wait if the reconstruction
* thread has put up a bar for new requests.
* Continue immediately if no resync is active currently.
*/
wait_barrier(conf);

sectors = bio_sectors(bio);
while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
bio->bi_iter.bi_sector < conf->reshape_progress &&
bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
/*
* IO spans the reshape position. Need to wait for reshape to
* pass
*/
raid10_log(conf->mddev, "wait reshape");
allow_barrier(conf);
wait_event(conf->wait_barrier,
conf->reshape_progress <= bio->bi_iter.bi_sector ||
conf->reshape_progress >= bio->bi_iter.bi_sector +
sectors);
wait_barrier(conf);
}

read_again:
rdev = read_balance(conf, r10_bio, &max_sectors);
if (!rdev) {
raid_end_bio_io(r10_bio);
return;
}
slot = r10_bio->read_slot;

read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector,
max_sectors);

r10_bio->devs[slot].bio = read_bio;
r10_bio->devs[slot].rdev = rdev;

read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
choose_data_offset(r10_bio, rdev);
read_bio->bi_bdev = rdev->bdev;
read_bio->bi_end_io = raid10_end_read_request;
bio_set_op_attrs(read_bio, op, do_sync);
if (test_bit(FailFast, &rdev->flags) &&
test_bit(R10BIO_FailFast, &r10_bio->state))
read_bio->bi_opf |= MD_FAILFAST;
read_bio->bi_private = r10_bio;

if (mddev->gendisk)
trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev),
read_bio, disk_devt(mddev->gendisk),
r10_bio->sector);
if (max_sectors < r10_bio->sectors) {
/*
* Could not read all from this device, so we will need another
* r10_bio.
*/
sectors_handled = (r10_bio->sector + max_sectors
- bio->bi_iter.bi_sector);
r10_bio->sectors = max_sectors;
spin_lock_irq(&conf->device_lock);
if (bio->bi_phys_segments == 0)
bio->bi_phys_segments = 2;
else
bio->bi_phys_segments++;
spin_unlock_irq(&conf->device_lock);
/*
* Cannot call generic_make_request directly as that will be
* queued in __generic_make_request and subsequent
* mempool_alloc might block waiting for it. so hand bio over
* to raid10d.
*/
reschedule_retry(r10_bio);

r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);

r10_bio->master_bio = bio;
r10_bio->sectors = bio_sectors(bio) - sectors_handled;
r10_bio->state = 0;
r10_bio->mddev = mddev;
r10_bio->sector = bio->bi_iter.bi_sector + sectors_handled;
goto read_again;
} else
generic_make_request(read_bio);
return;
}

static void raid10_write_request(struct mddev *mddev, struct bio *bio,
struct r10bio *r10_bio)
{
struct r10conf *conf = mddev->private;
int i;
const int op = bio_op(bio);
const int rw = bio_data_dir(bio);
const unsigned long do_sync = (bio->bi_opf & REQ_SYNC);
const unsigned long do_fua = (bio->bi_opf & REQ_FUA);
unsigned long flags;
struct md_rdev *blocked_rdev;
struct blk_plug_cb *cb;
struct raid10_plug_cb *plug = NULL;
sector_t sectors;
int sectors_handled;
int max_sectors;
int sectors;

md_write_start(mddev, bio);

@@ -1118,8 +1217,9 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
while (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
bio->bi_iter.bi_sector < conf->reshape_progress &&
bio->bi_iter.bi_sector + sectors > conf->reshape_progress) {
/* IO spans the reshape position. Need to wait for
* reshape to pass
/*
* IO spans the reshape position. Need to wait for reshape to
* pass
*/
raid10_log(conf->mddev, "wait reshape");
allow_barrier(conf);

@@ -1129,8 +1229,8 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
sectors);
wait_barrier(conf);
}

if (test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
bio_data_dir(bio) == WRITE &&
(mddev->reshape_backwards
? (bio->bi_iter.bi_sector < conf->reshape_safe &&
bio->bi_iter.bi_sector + sectors > conf->reshape_progress)

@@ -1148,98 +1248,6 @@ static void __make_request(struct mddev *mddev, struct bio *bio)
conf->reshape_safe = mddev->reshape_position;
}

r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);

r10_bio->master_bio = bio;
r10_bio->sectors = sectors;

r10_bio->mddev = mddev;
r10_bio->sector = bio->bi_iter.bi_sector;
r10_bio->state = 0;

/* We might need to issue multiple reads to different
* devices if there are bad blocks around, so we keep
* track of the number of reads in bio->bi_phys_segments.
* If this is 0, there is only one r10_bio and no locking
* will be needed when the request completes. If it is
* non-zero, then it is the number of not-completed requests.
*/
bio->bi_phys_segments = 0;
bio_clear_flag(bio, BIO_SEG_VALID);

if (rw == READ) {
/*
* read balancing logic:
*/
struct md_rdev *rdev;
int slot;

read_again:
rdev = read_balance(conf, r10_bio, &max_sectors);
if (!rdev) {
raid_end_bio_io(r10_bio);
return;
}
slot = r10_bio->read_slot;

read_bio = bio_clone_mddev(bio, GFP_NOIO, mddev);
bio_trim(read_bio, r10_bio->sector - bio->bi_iter.bi_sector,
max_sectors);

r10_bio->devs[slot].bio = read_bio;
r10_bio->devs[slot].rdev = rdev;

read_bio->bi_iter.bi_sector = r10_bio->devs[slot].addr +
choose_data_offset(r10_bio, rdev);
read_bio->bi_bdev = rdev->bdev;
read_bio->bi_end_io = raid10_end_read_request;
bio_set_op_attrs(read_bio, op, do_sync);
if (test_bit(FailFast, &rdev->flags) &&
test_bit(R10BIO_FailFast, &r10_bio->state))
read_bio->bi_opf |= MD_FAILFAST;
read_bio->bi_private = r10_bio;

if (mddev->gendisk)
trace_block_bio_remap(bdev_get_queue(read_bio->bi_bdev),
read_bio, disk_devt(mddev->gendisk),
r10_bio->sector);
if (max_sectors < r10_bio->sectors) {
/* Could not read all from this device, so we will
* need another r10_bio.
*/
sectors_handled = (r10_bio->sector + max_sectors
- bio->bi_iter.bi_sector);
r10_bio->sectors = max_sectors;
spin_lock_irq(&conf->device_lock);
if (bio->bi_phys_segments == 0)
bio->bi_phys_segments = 2;
else
bio->bi_phys_segments++;
spin_unlock_irq(&conf->device_lock);
/* Cannot call generic_make_request directly
* as that will be queued in __generic_make_request
* and subsequent mempool_alloc might block
* waiting for it. so hand bio over to raid10d.
*/
reschedule_retry(r10_bio);

r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);

r10_bio->master_bio = bio;
r10_bio->sectors = bio_sectors(bio) - sectors_handled;
r10_bio->state = 0;
r10_bio->mddev = mddev;
r10_bio->sector = bio->bi_iter.bi_sector +
sectors_handled;
goto read_again;
} else
generic_make_request(read_bio);
return;
}

/*
* WRITE:
*/
if (conf->pending_count >= max_queued_requests) {
md_wakeup_thread(mddev->thread);
raid10_log(mddev, "wait queued");

@@ -1300,8 +1308,7 @@ retry_write:
int bad_sectors;
int is_bad;

is_bad = is_badblock(rdev, dev_sector,
max_sectors,
is_bad = is_badblock(rdev, dev_sector, max_sectors,
&first_bad, &bad_sectors);
if (is_bad < 0) {
/* Mustn't write here until the bad block

@@ -1405,8 +1412,7 @@ retry_write:
r10_bio->devs[i].bio = mbio;

mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr+
choose_data_offset(r10_bio,
rdev));
choose_data_offset(r10_bio, rdev));
mbio->bi_bdev = rdev->bdev;
mbio->bi_end_io = raid10_end_write_request;
bio_set_op_attrs(mbio, op, do_sync | do_fua);

@@ -1457,8 +1463,7 @@ retry_write:
r10_bio->devs[i].repl_bio = mbio;

mbio->bi_iter.bi_sector = (r10_bio->devs[i].addr +
choose_data_offset(
r10_bio, rdev));
choose_data_offset(r10_bio, rdev));
mbio->bi_bdev = rdev->bdev;
mbio->bi_end_io = raid10_end_write_request;
bio_set_op_attrs(mbio, op, do_sync | do_fua);

@@ -1503,6 +1508,36 @@ retry_write:
one_write_done(r10_bio);
}

static void __make_request(struct mddev *mddev, struct bio *bio)
{
struct r10conf *conf = mddev->private;
struct r10bio *r10_bio;

r10_bio = mempool_alloc(conf->r10bio_pool, GFP_NOIO);

r10_bio->master_bio = bio;
r10_bio->sectors = bio_sectors(bio);

r10_bio->mddev = mddev;
r10_bio->sector = bio->bi_iter.bi_sector;
r10_bio->state = 0;

/*
* We might need to issue multiple reads to different devices if there
* are bad blocks around, so we keep track of the number of reads in
* bio->bi_phys_segments. If this is 0, there is only one r10_bio and
* no locking will be needed when the request completes. If it is
* non-zero, then it is the number of not-completed requests.
*/
bio->bi_phys_segments = 0;
bio_clear_flag(bio, BIO_SEG_VALID);

if (bio_data_dir(bio) == READ)
raid10_read_request(mddev, bio, r10_bio);
else
raid10_write_request(mddev, bio, r10_bio);
}

static void raid10_make_request(struct mddev *mddev, struct bio *bio)
{
struct r10conf *conf = mddev->private;

@@ -1682,8 +1682,7 @@ out:

static struct stripe_head *
r5c_recovery_alloc_stripe(struct r5conf *conf,
sector_t stripe_sect,
sector_t log_start)
sector_t stripe_sect)
{
struct stripe_head *sh;

@@ -1692,7 +1691,6 @@ r5c_recovery_alloc_stripe(struct r5conf *conf,
return NULL; /* no more stripe available */

r5l_recovery_reset_stripe(sh);
sh->log_start = log_start;

return sh;
}

@@ -1862,7 +1860,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
stripe_sect);

if (!sh) {
sh = r5c_recovery_alloc_stripe(conf, stripe_sect, ctx->pos);
sh = r5c_recovery_alloc_stripe(conf, stripe_sect);
/*
* cannot get stripe from raid5_get_active_stripe
* try replay some stripes

@@ -1871,7 +1869,7 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
r5c_recovery_replay_stripes(
cached_stripe_list, ctx);
sh = r5c_recovery_alloc_stripe(
conf, stripe_sect, ctx->pos);
conf, stripe_sect);
}
if (!sh) {
pr_debug("md/raid:%s: Increasing stripe cache size to %d to recovery data on journal.\n",

@@ -1879,8 +1877,8 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
conf->min_nr_stripes * 2);
raid5_set_cache_size(mddev,
conf->min_nr_stripes * 2);
sh = r5c_recovery_alloc_stripe(
conf, stripe_sect, ctx->pos);
sh = r5c_recovery_alloc_stripe(conf,
stripe_sect);
}
if (!sh) {
pr_err("md/raid:%s: Cannot get enough stripes due to memory pressure. Recovery failed.\n",

@@ -1894,7 +1892,6 @@ r5c_recovery_analyze_meta_block(struct r5l_log *log,
if (!test_bit(STRIPE_R5C_CACHING, &sh->state) &&
test_bit(R5_Wantwrite, &sh->dev[sh->pd_idx].flags)) {
r5l_recovery_replay_one_stripe(conf, sh, ctx);
sh->log_start = ctx->pos;
list_move_tail(&sh->lru, cached_stripe_list);
}
r5l_recovery_load_data(log, sh, ctx, payload,

@@ -1933,8 +1930,6 @@ static void r5c_recovery_load_one_stripe(struct r5l_log *log,
set_bit(R5_UPTODATE, &dev->flags);
}
}
list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
atomic_inc(&log->stripe_in_journal_count);
}

/*

@@ -2070,6 +2065,7 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
struct stripe_head *sh, *next;
struct mddev *mddev = log->rdev->mddev;
struct page *page;
sector_t next_checkpoint = MaxSector;

page = alloc_page(GFP_KERNEL);
if (!page) {

@@ -2078,6 +2074,8 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
return -ENOMEM;
}

WARN_ON(list_empty(&ctx->cached_list));

list_for_each_entry_safe(sh, next, &ctx->cached_list, lru) {
struct r5l_meta_block *mb;
int i;

@@ -2123,12 +2121,15 @@ r5c_recovery_rewrite_data_only_stripes(struct r5l_log *log,
sync_page_io(log->rdev, ctx->pos, PAGE_SIZE, page,
REQ_OP_WRITE, REQ_FUA, false);
sh->log_start = ctx->pos;
list_add_tail(&sh->r5c, &log->stripe_in_journal_list);
atomic_inc(&log->stripe_in_journal_count);
ctx->pos = write_pos;
ctx->seq += 1;

next_checkpoint = sh->log_start;
list_del_init(&sh->lru);
raid5_release_stripe(sh);
}
log->next_checkpoint = next_checkpoint;
__free_page(page);
return 0;
}

@@ -2139,7 +2140,6 @@ static int r5l_recovery_log(struct r5l_log *log)
struct r5l_recovery_ctx ctx;
int ret;
sector_t pos;
struct stripe_head *sh;

ctx.pos = log->last_checkpoint;
ctx.seq = log->last_cp_seq;

@@ -2164,16 +2164,13 @@ static int r5l_recovery_log(struct r5l_log *log)
log->next_checkpoint = ctx.pos;
r5l_log_write_empty_meta_block(log, ctx.pos, ctx.seq++);
ctx.pos = r5l_ring_add(log, ctx.pos, BLOCK_SECTORS);
} else {
sh = list_last_entry(&ctx.cached_list, struct stripe_head, lru);
log->next_checkpoint = sh->log_start;
}

if ((ctx.data_only_stripes == 0) && (ctx.data_parity_stripes == 0))
pr_debug("md/raid:%s: starting from clean shutdown\n",
mdname(mddev));
else {
pr_debug("md/raid:%s: recoverying %d data-only stripes and %d data-parity stripes\n",
pr_debug("md/raid:%s: recovering %d data-only stripes and %d data-parity stripes\n",
mdname(mddev), ctx.data_only_stripes,
ctx.data_parity_stripes);

@@ -2418,9 +2415,6 @@ void r5c_finish_stripe_write_out(struct r5conf *conf,
if (do_wakeup)
wake_up(&conf->wait_for_overlap);

if (conf->log->r5c_journal_mode == R5C_JOURNAL_MODE_WRITE_THROUGH)
return;

spin_lock_irq(&conf->log->stripe_in_journal_lock);
list_del_init(&sh->r5c);
spin_unlock_irq(&conf->log->stripe_in_journal_lock);

@@ -2639,14 +2633,16 @@ int r5l_init_log(struct r5conf *conf, struct md_rdev *rdev)
spin_lock_init(&log->stripe_in_journal_lock);
atomic_set(&log->stripe_in_journal_count, 0);

rcu_assign_pointer(conf->log, log);

if (r5l_load_log(log))
goto error;

rcu_assign_pointer(conf->log, log);
set_bit(MD_HAS_JOURNAL, &conf->mddev->flags);
return 0;

error:
rcu_assign_pointer(conf->log, NULL);
md_unregister_thread(&log->reclaim_thread);
reclaim_thread:
mempool_destroy(log->meta_pool);

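The r5l_init_log() hunk above moves rcu_assign_pointer(conf->log, log) ahead of r5l_load_log(), so code reached while the log is being loaded and recovered can already see conf->log, and the error path resets the pointer to NULL. A reduced, user-space sketch of that publish-before-load ordering; all names below are placeholders for illustration, not the md code.

```c
#include <stdio.h>
#include <stddef.h>

struct log;

struct conf {
	struct log *log;	/* stands in for conf->log */
};

struct log {
	struct conf *conf;
	int valid;
};

/* recovery reaches the log through conf->log, so it must already be published */
static int load_log(struct log *log)
{
	if (!log->conf->log)
		return -1;	/* pointer not published yet: the old ordering problem */
	log->valid = 1;
	return 0;
}

static int init_log(struct conf *conf, struct log *log)
{
	log->conf = conf;

	conf->log = log;		/* publish before loading, as the fix does */
	if (load_log(log)) {
		conf->log = NULL;	/* error path: unpublish */
		return -1;
	}
	return 0;
}

int main(void)
{
	struct conf conf = { 0 };
	struct log log = { 0 };

	printf("init_log: %d, conf.log=%p\n", init_log(&conf, &log), (void *)conf.log);
	return 0;
}
```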
@@ -62,6 +62,8 @@
#include "raid0.h"
#include "bitmap.h"

#define UNSUPPORTED_MDDEV_FLAGS (1L << MD_FAILFAST_SUPPORTED)

#define cpu_to_group(cpu) cpu_to_node(cpu)
#define ANY_GROUP NUMA_NO_NODE

@@ -7829,8 +7831,9 @@ static void *raid5_takeover_raid1(struct mddev *mddev)
mddev->new_chunk_sectors = chunksect;

ret = setup_conf(mddev);
if (!IS_ERR_VALUE(ret))
clear_bit(MD_FAILFAST_SUPPORTED, &mddev->flags);
if (!IS_ERR(ret))
mddev_clear_unsupported_flags(mddev,
UNSUPPORTED_MDDEV_FLAGS);
return ret;
}