From 25eafe1a813681849ad3fb9effdfdce3e1b4335a Mon Sep 17 00:00:00 2001
From: Benjamin Randazzo <benjamin@randazzo.fr>
Date: Sat, 25 Jul 2015 16:36:50 +0200
Subject: [PATCH 01/33] md: simplify get_bitmap_file now that "file" is zeroed.

There is no point assigning '\0' to file->pathname[0] as
file is now zeroed out, so remove that branch and
simplify the code.

[Original patch combined this with the change to use
 kzalloc.  I split the two so that the change to kzalloc
 is easier to backport. - neilb]

Signed-off-by: Benjamin Randazzo <benjamin@randazzo.fr>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index e25f00f0138a..4d47a9ab8228 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5765,16 +5765,16 @@ static int get_bitmap_file(struct mddev *mddev, void __user * arg)
 
 	err = 0;
 	spin_lock(&mddev->lock);
-	/* bitmap disabled, zero the first byte and copy out */
-	if (!mddev->bitmap_info.file)
-		file->pathname[0] = '\0';
-	else if ((ptr = file_path(mddev->bitmap_info.file,
-			       file->pathname, sizeof(file->pathname))),
-		 IS_ERR(ptr))
-		err = PTR_ERR(ptr);
-	else
-		memmove(file->pathname, ptr,
-			sizeof(file->pathname)-(ptr-file->pathname));
+	/* bitmap enabled */
+	if (mddev->bitmap_info.file) {
+		ptr = file_path(mddev->bitmap_info.file, file->pathname,
+				sizeof(file->pathname));
+		if (IS_ERR(ptr))
+			err = PTR_ERR(ptr);
+		else
+			memmove(file->pathname, ptr,
+				sizeof(file->pathname)-(ptr-file->pathname));
+	}
 	spin_unlock(&mddev->lock);
 
 	if (err == 0 &&

From 199dc6ed5179251fa6158a461499c24bdd99c836 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Mon, 3 Aug 2015 13:11:47 +1000
Subject: [PATCH 02/33] md/raid0: update queue parameter in a safer location.

When a (e.g.) RAID5 array is reshaped to RAID0, the updating
of queue parameters (e.g. max number of sectors per bio) is
done in the wrong place.
It should be part of ->run, but it is actually part of ->takeover.
This means it happens before level_store() calls:

	blk_set_stacking_limits(&mddev->queue->limits);

and so it ineffective.  This can lead to errors from underlying
devices.

So move all the relevant settings out of create_stripe_zones()
and into raid0_run().

As this can lead to a bug-on it is suitable for any -stable
kernel which supports reshape to RAID0.  So 2.6.35 or later.
As the bug has been present for five years there is no urgency,
so no need to rush into -stable.

Fixes: 9af204cf720c ("md: Add support for Raid5->Raid0 and Raid10->Raid0 takeover")
Cc: stable@vger.kernel.org (v2.6.35+ - please delay until after -final release).
Reported-by: Yi Zhang <yizhan@redhat.com>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid0.c | 75 ++++++++++++++++++++++++----------------------
 1 file changed, 39 insertions(+), 36 deletions(-)

diff --git a/drivers/md/raid0.c b/drivers/md/raid0.c
index efb654eb5399..4a13c3cb940b 100644
--- a/drivers/md/raid0.c
+++ b/drivers/md/raid0.c
@@ -83,7 +83,7 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 	char b[BDEVNAME_SIZE];
 	char b2[BDEVNAME_SIZE];
 	struct r0conf *conf = kzalloc(sizeof(*conf), GFP_KERNEL);
-	bool discard_supported = false;
+	unsigned short blksize = 512;
 
 	if (!conf)
 		return -ENOMEM;
@@ -98,6 +98,9 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 		sector_div(sectors, mddev->chunk_sectors);
 		rdev1->sectors = sectors * mddev->chunk_sectors;
 
+		blksize = max(blksize, queue_logical_block_size(
+				      rdev1->bdev->bd_disk->queue));
+
 		rdev_for_each(rdev2, mddev) {
 			pr_debug("md/raid0:%s:   comparing %s(%llu)"
 				 " with %s(%llu)\n",
@@ -134,6 +137,18 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 	}
 	pr_debug("md/raid0:%s: FINAL %d zones\n",
 		 mdname(mddev), conf->nr_strip_zones);
+	/*
+	 * now since we have the hard sector sizes, we can make sure
+	 * chunk size is a multiple of that sector size
+	 */
+	if ((mddev->chunk_sectors << 9) % blksize) {
+		printk(KERN_ERR "md/raid0:%s: chunk_size of %d not multiple of block size %d\n",
+		       mdname(mddev),
+		       mddev->chunk_sectors << 9, blksize);
+		err = -EINVAL;
+		goto abort;
+	}
+
 	err = -ENOMEM;
 	conf->strip_zone = kzalloc(sizeof(struct strip_zone)*
 				conf->nr_strip_zones, GFP_KERNEL);
@@ -188,19 +203,12 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 		}
 		dev[j] = rdev1;
 
-		if (mddev->queue)
-			disk_stack_limits(mddev->gendisk, rdev1->bdev,
-					  rdev1->data_offset << 9);
-
 		if (rdev1->bdev->bd_disk->queue->merge_bvec_fn)
 			conf->has_merge_bvec = 1;
 
 		if (!smallest || (rdev1->sectors < smallest->sectors))
 			smallest = rdev1;
 		cnt++;
-
-		if (blk_queue_discard(bdev_get_queue(rdev1->bdev)))
-			discard_supported = true;
 	}
 	if (cnt != mddev->raid_disks) {
 		printk(KERN_ERR "md/raid0:%s: too few disks (%d of %d) - "
@@ -261,28 +269,6 @@ static int create_strip_zones(struct mddev *mddev, struct r0conf **private_conf)
 			 (unsigned long long)smallest->sectors);
 	}
 
-	/*
-	 * now since we have the hard sector sizes, we can make sure
-	 * chunk size is a multiple of that sector size
-	 */
-	if ((mddev->chunk_sectors << 9) % queue_logical_block_size(mddev->queue)) {
-		printk(KERN_ERR "md/raid0:%s: chunk_size of %d not valid\n",
-		       mdname(mddev),
-		       mddev->chunk_sectors << 9);
-		goto abort;
-	}
-
-	if (mddev->queue) {
-		blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
-		blk_queue_io_opt(mddev->queue,
-				 (mddev->chunk_sectors << 9) * mddev->raid_disks);
-
-		if (!discard_supported)
-			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
-		else
-			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
-	}
-
 	pr_debug("md/raid0:%s: done.\n", mdname(mddev));
 	*private_conf = conf;
 
@@ -433,12 +419,6 @@ static int raid0_run(struct mddev *mddev)
 	if (md_check_no_bitmap(mddev))
 		return -EINVAL;
 
-	if (mddev->queue) {
-		blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
-		blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
-		blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
-	}
-
 	/* if private is not null, we are here after takeover */
 	if (mddev->private == NULL) {
 		ret = create_strip_zones(mddev, &conf);
@@ -447,6 +427,29 @@ static int raid0_run(struct mddev *mddev)
 		mddev->private = conf;
 	}
 	conf = mddev->private;
+	if (mddev->queue) {
+		struct md_rdev *rdev;
+		bool discard_supported = false;
+
+		rdev_for_each(rdev, mddev) {
+			disk_stack_limits(mddev->gendisk, rdev->bdev,
+					  rdev->data_offset << 9);
+			if (blk_queue_discard(bdev_get_queue(rdev->bdev)))
+				discard_supported = true;
+		}
+		blk_queue_max_hw_sectors(mddev->queue, mddev->chunk_sectors);
+		blk_queue_max_write_same_sectors(mddev->queue, mddev->chunk_sectors);
+		blk_queue_max_discard_sectors(mddev->queue, mddev->chunk_sectors);
+
+		blk_queue_io_min(mddev->queue, mddev->chunk_sectors << 9);
+		blk_queue_io_opt(mddev->queue,
+				 (mddev->chunk_sectors << 9) * mddev->raid_disks);
+
+		if (!discard_supported)
+			queue_flag_clear_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+		else
+			queue_flag_set_unlocked(QUEUE_FLAG_DISCARD, mddev->queue);
+	}
 
 	/* calculate array device size */
 	md_set_array_sectors(mddev, raid0_size(mddev, 0, 0));

From 0e833e697bcf4c2f3f7fb9fce39d08cd4439e5d7 Mon Sep 17 00:00:00 2001
From: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Date: Wed, 1 Jul 2015 12:19:56 +1000
Subject: [PATCH 03/33] md/raid6: delta syndrome for ARM NEON

This implements XOR syndrome calculation using NEON intrinsics.
As before, the module can be built for ARM and arm64 from the
same source.

Relative performance on a Cortex-A57 based system:

  raid6: int64x1  gen()   905 MB/s
  raid6: int64x1  xor()   881 MB/s
  raid6: int64x2  gen()  1343 MB/s
  raid6: int64x2  xor()  1286 MB/s
  raid6: int64x4  gen()  1896 MB/s
  raid6: int64x4  xor()  1321 MB/s
  raid6: int64x8  gen()  1773 MB/s
  raid6: int64x8  xor()  1165 MB/s
  raid6: neonx1   gen()  1834 MB/s
  raid6: neonx1   xor()  1278 MB/s
  raid6: neonx2   gen()  2528 MB/s
  raid6: neonx2   xor()  1942 MB/s
  raid6: neonx4   gen()  2888 MB/s
  raid6: neonx4   xor()  2334 MB/s
  raid6: neonx8   gen()  2957 MB/s
  raid6: neonx8   xor()  2232 MB/s
  raid6: using algorithm neonx8 gen() 2957 MB/s
  raid6: .... xor() 2232 MB/s, rmw enabled

Cc: Markus Stockhausen <stockhausen@collogia.de>
Cc: Neil Brown <neilb@suse.de>
Signed-off-by: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 lib/raid6/neon.c  | 13 ++++++++++++-
 lib/raid6/neon.uc | 46 ++++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 58 insertions(+), 1 deletion(-)

diff --git a/lib/raid6/neon.c b/lib/raid6/neon.c
index d9ad6ee284f4..7076ef1ba3dd 100644
--- a/lib/raid6/neon.c
+++ b/lib/raid6/neon.c
@@ -40,9 +40,20 @@
 					(unsigned long)bytes, ptrs);	\
 		kernel_neon_end();					\
 	}								\
+	static void raid6_neon ## _n ## _xor_syndrome(int disks,	\
+					int start, int stop, 		\
+					size_t bytes, void **ptrs)	\
+	{								\
+		void raid6_neon ## _n  ## _xor_syndrome_real(int,	\
+				int, int, unsigned long, void**);	\
+		kernel_neon_begin();					\
+		raid6_neon ## _n ## _xor_syndrome_real(disks,		\
+			start, stop, (unsigned long)bytes, ptrs);	\
+		kernel_neon_end();					\
+	}								\
 	struct raid6_calls const raid6_neonx ## _n = {			\
 		raid6_neon ## _n ## _gen_syndrome,			\
-		NULL,		/* XOR not yet implemented */		\
+		raid6_neon ## _n ## _xor_syndrome,			\
 		raid6_have_neon,					\
 		"neonx" #_n,						\
 		0							\
diff --git a/lib/raid6/neon.uc b/lib/raid6/neon.uc
index 1b9ed793342d..4fa51b761dd0 100644
--- a/lib/raid6/neon.uc
+++ b/lib/raid6/neon.uc
@@ -3,6 +3,7 @@
  *   neon.uc - RAID-6 syndrome calculation using ARM NEON instructions
  *
  *   Copyright (C) 2012 Rob Herring
+ *   Copyright (C) 2015 Linaro Ltd. <ard.biesheuvel@linaro.org>
  *
  *   Based on altivec.uc:
  *     Copyright 2002-2004 H. Peter Anvin - All Rights Reserved
@@ -78,3 +79,48 @@ void raid6_neon$#_gen_syndrome_real(int disks, unsigned long bytes, void **ptrs)
 		vst1q_u8(&q[d+NSIZE*$$], wq$$);
 	}
 }
+
+void raid6_neon$#_xor_syndrome_real(int disks, int start, int stop,
+				    unsigned long bytes, void **ptrs)
+{
+	uint8_t **dptr = (uint8_t **)ptrs;
+	uint8_t *p, *q;
+	int d, z, z0;
+
+	register unative_t wd$$, wq$$, wp$$, w1$$, w2$$;
+	const unative_t x1d = NBYTES(0x1d);
+
+	z0 = stop;		/* P/Q right side optimization */
+	p = dptr[disks-2];	/* XOR parity */
+	q = dptr[disks-1];	/* RS syndrome */
+
+	for ( d = 0 ; d < bytes ; d += NSIZE*$# ) {
+		wq$$ = vld1q_u8(&dptr[z0][d+$$*NSIZE]);
+		wp$$ = veorq_u8(vld1q_u8(&p[d+$$*NSIZE]), wq$$);
+
+		/* P/Q data pages */
+		for ( z = z0-1 ; z >= start ; z-- ) {
+			wd$$ = vld1q_u8(&dptr[z][d+$$*NSIZE]);
+			wp$$ = veorq_u8(wp$$, wd$$);
+			w2$$ = MASK(wq$$);
+			w1$$ = SHLBYTE(wq$$);
+
+			w2$$ = vandq_u8(w2$$, x1d);
+			w1$$ = veorq_u8(w1$$, w2$$);
+			wq$$ = veorq_u8(w1$$, wd$$);
+		}
+		/* P/Q left side optimization */
+		for ( z = start-1 ; z >= 0 ; z-- ) {
+			w2$$ = MASK(wq$$);
+			w1$$ = SHLBYTE(wq$$);
+
+			w2$$ = vandq_u8(w2$$, x1d);
+			wq$$ = veorq_u8(w1$$, w2$$);
+		}
+		w1$$ = vld1q_u8(&q[d+NSIZE*$$]);
+		wq$$ = veorq_u8(wq$$, w1$$);
+
+		vst1q_u8(&p[d+NSIZE*$$], wp$$);
+		vst1q_u8(&q[d+NSIZE*$$], wq$$);
+	}
+}

From f7851be736d58e7270f05a4ca84b16ce72734a18 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Thu, 2 Jul 2015 17:12:58 +1000
Subject: [PATCH 04/33] md: Keep /proc/mdstat reporting recovery until fully
 DONE.

Currently when a recovery completes, mdstat shows that it has finished
before the new device is marked as a full member.  Because of this it
can appear to a script that the recovery finished but the array isn't
in sync.

So while MD_RECOVERY_DONE is still set, keep mdstat reporting "recovery".
Once md_reap_sync_thread() completes, the spare will be active and then
MD_RECOVERY_DONE will be cleared.

To ensure this is race-free, set MD_RECOVERY_DONE before clearning
curr_resync.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c | 38 ++++++++++++++++++++++++--------------
 1 file changed, 24 insertions(+), 14 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 4d47a9ab8228..689be615d7be 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7093,7 +7093,7 @@ static void status_unused(struct seq_file *seq)
 	seq_printf(seq, "\n");
 }
 
-static void status_resync(struct seq_file *seq, struct mddev *mddev)
+static int status_resync(struct seq_file *seq, struct mddev *mddev)
 {
 	sector_t max_sectors, resync, res;
 	unsigned long dt, db;
@@ -7101,18 +7101,32 @@ static void status_resync(struct seq_file *seq, struct mddev *mddev)
 	int scale;
 	unsigned int per_milli;
 
-	if (mddev->curr_resync <= 3)
-		resync = 0;
-	else
-		resync = mddev->curr_resync
-			- atomic_read(&mddev->recovery_active);
-
 	if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery) ||
 	    test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery))
 		max_sectors = mddev->resync_max_sectors;
 	else
 		max_sectors = mddev->dev_sectors;
 
+	resync = mddev->curr_resync;
+	if (resync <= 3) {
+		if (test_bit(MD_RECOVERY_DONE, &mddev->recovery))
+			/* Still cleaning up */
+			resync = max_sectors;
+	} else
+		resync -= atomic_read(&mddev->recovery_active);
+
+	if (resync == 0) {
+		if (mddev->recovery_cp < MaxSector) {
+			seq_printf(seq, "\tresync=PENDING");
+			return 1;
+		}
+		return 0;
+	}
+	if (resync < 3) {
+		seq_printf(seq, "\tresync=DELAYED");
+		return 1;
+	}
+
 	WARN_ON(max_sectors == 0);
 	/* Pick 'scale' such that (resync>>scale)*1000 will fit
 	 * in a sector_t, and (max_sectors>>scale) will fit in a
@@ -7177,6 +7191,7 @@ static void status_resync(struct seq_file *seq, struct mddev *mddev)
 		   ((unsigned long)rt % 60)/6);
 
 	seq_printf(seq, " speed=%ldK/sec", db/2/dt);
+	return 1;
 }
 
 static void *md_seq_start(struct seq_file *seq, loff_t *pos)
@@ -7322,13 +7337,8 @@ static int md_seq_show(struct seq_file *seq, void *v)
 			mddev->pers->status(seq, mddev);
 			seq_printf(seq, "\n      ");
 			if (mddev->pers->sync_request) {
-				if (mddev->curr_resync > 2) {
-					status_resync(seq, mddev);
+				if (status_resync(seq, mddev))
 					seq_printf(seq, "\n      ");
-				} else if (mddev->curr_resync >= 1)
-					seq_printf(seq, "\tresync=DELAYED\n      ");
-				else if (mddev->recovery_cp < MaxSector)
-					seq_printf(seq, "\tresync=PENDING\n      ");
 			}
 		} else
 			seq_printf(seq, "\n       ");
@@ -7979,11 +7989,11 @@ void md_do_sync(struct md_thread *thread)
 		mddev->resync_max = MaxSector;
 	} else if (test_bit(MD_RECOVERY_REQUESTED, &mddev->recovery))
 		mddev->resync_min = mddev->curr_resync_completed;
+	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
 	mddev->curr_resync = 0;
 	spin_unlock(&mddev->lock);
 
 	wake_up(&resync_wait);
-	set_bit(MD_RECOVERY_DONE, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
 	return;
 }

From 985ca973b68cac0adfa83497db231da7f99c6ed9 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Mon, 6 Jul 2015 12:26:57 +1000
Subject: [PATCH 05/33] md: close some races between setting and checking
 sync_action.

When checking sync_action in a script, we want to be sure it is
as accurate as possible.
As resync/reshape etc doesn't always start immediately (a separate
thread is scheduled to do it), it is best if 'action_show'
checks if MD_RECOVER_NEEDED is set (which it does) and in that
case reports what is likely to start soon (which it only sometimes
does).

So:
 - report 'reshape' if reshape_position suggests one might start.
 - set MD_RECOVERY_RECOVER in raid1_reshape(), because that is very
   likely to happen next.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c    | 2 ++
 drivers/md/raid1.c | 1 +
 2 files changed, 3 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 689be615d7be..324f9df4e429 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -4210,6 +4210,8 @@ action_show(struct mddev *mddev, char *page)
 				type = "repair";
 		} else if (test_bit(MD_RECOVERY_RECOVER, &recovery))
 			type = "recover";
+		else if (mddev->reshape_position != MaxSector)
+			type = "reshape";
 	}
 	return sprintf(page, "%s\n", type);
 }
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 967a4ed73929..742b50794dfd 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -3113,6 +3113,7 @@ static int raid1_reshape(struct mddev *mddev)
 
 	unfreeze_array(conf);
 
+	set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 	md_wakeup_thread(mddev->thread);
 

From 92140480ed59a006b245efd33a195fde62d1845f Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Mon, 6 Jul 2015 12:28:45 +1000
Subject: [PATCH 06/33] md/raid5: consider updating reshape_position at start
 of reshape.

md/raid5 only updates ->reshape_position (which is stored in
metadata and is authoritative) occasionally, but particularly
when getting closed to ->resync_max as it must be correct
when ->resync_max is reached.

When mdadm tries to stop an array which is reshaping it will:
 - freeze the reshape,
 - set resync_max to where the reshape has reached.
 - unfreeze the reshape.
When this happens, the reshape is aborted and then restarted.

The restart doesn't check that resync_max is close, and so doesn't
update ->reshape_position like it should.
This results in the reshape stopping, but ->reshape_position being
incorrect.

So on that first call to reshape_request, make sure ->reshape_position
is updated if needed.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid5.c | 8 ++++++--
 1 file changed, 6 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index f757023fc458..256b05d1117b 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5347,6 +5347,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 	sector_t stripe_addr;
 	int reshape_sectors;
 	struct list_head stripes;
+	sector_t retn;
 
 	if (sector_nr == 0) {
 		/* If restarting in the middle, skip the initial sectors */
@@ -5362,7 +5363,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 			mddev->curr_resync_completed = sector_nr;
 			sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 			*skipped = 1;
-			return sector_nr;
+			retn = sector_nr;
+			goto finish;
 		}
 	}
 
@@ -5535,6 +5537,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 	 * then we need to write out the superblock.
 	 */
 	sector_nr += reshape_sectors;
+	retn = reshape_sectors;
+finish:
 	if ((sector_nr - mddev->curr_resync_completed) * 2
 	    >= mddev->resync_max - mddev->curr_resync_completed) {
 		/* Cannot proceed until we've updated the superblock... */
@@ -5560,7 +5564,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
 	}
 ret:
-	return reshape_sectors;
+	return retn;
 }
 
 static inline sector_t sync_request(struct mddev *mddev, sector_t sector_nr, int *skipped)

From 02ec50265b16493e4a62228727e9f774068123d2 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Mon, 6 Jul 2015 16:33:47 +1000
Subject: [PATCH 07/33] md/raid10: fix a few typos in comments

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid10.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index 38c58e19cfce..dcc0e9b3ee92 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -4215,7 +4215,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 	 * at a time, possibly less if that exceeds RESYNC_PAGES,
 	 * or we hit a bad block or something.
 	 * This might mean we pause for normal IO in the middle of
-	 * a chunk, but that is not a problem was mddev->reshape_position
+	 * a chunk, but that is not a problem as mddev->reshape_position
 	 * can record any location.
 	 *
 	 * If we will want to write to a location that isn't
@@ -4239,7 +4239,7 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr,
 	 *
 	 * In all this the minimum difference in data offsets
 	 * (conf->offset_diff - always positive) allows a bit of slack,
-	 * so next can be after 'safe', but not by more than offset_disk
+	 * so next can be after 'safe', but not by more than offset_diff
 	 *
 	 * We need to prepare all the bios here before we start any IO
 	 * to ensure the size we choose is acceptable to all devices.

From 5cac6bcb9312a18a5091976fc374b4c7b9c4ae2e Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Fri, 17 Jul 2015 12:17:50 +1000
Subject: [PATCH 08/33] md/raid5: always set conf->prev_chunk_sectors and
 ->prev_algo

These aren't really needed when no reshape is happening,
but it is safer to have them always set to a meaningful value.
The next patch will use ->prev_chunk_sectors without checking
if a reshape is happening (because that makes the code simpler),
and this patch makes that safe.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid5.c | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 256b05d1117b..e543bfd7ae66 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6568,6 +6568,9 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	if (conf->reshape_progress != MaxSector) {
 		conf->prev_chunk_sectors = mddev->chunk_sectors;
 		conf->prev_algo = mddev->layout;
+	} else {
+		conf->prev_chunk_sectors = conf->chunk_sectors;
+		conf->prev_algo = conf->algorithm;
 	}
 
 	conf->min_nr_stripes = NR_STRIPES;

From 3cb5edf45457948347b5ae8cc9650c000cef4391 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Wed, 15 Jul 2015 17:24:17 +1000
Subject: [PATCH 09/33] md/raid5: switch to use conf->chunk_sectors in place of
 mddev->chunk_sectors where possible

The chunk_sectors and new_chunk_sectors fields of mddev can be changed
any time (via sysfs) that the reconfig mutex can be taken.  So raid5
keeps internal copies in 'conf' which are stable except for a short
locked moment when reshape stops/starts.

So any access that does not hold reconfig_mutex should use the 'conf'
values, not the 'mddev' values.
Several don't.

This could result in corruption if new values were written at awkward
times.

Also use min() or max() rather than open-coding.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid5.c | 28 ++++++++++++++--------------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e543bfd7ae66..a98162f5d97f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -4676,9 +4676,10 @@ static int raid5_mergeable_bvec(struct mddev *mddev,
 				struct bvec_merge_data *bvm,
 				struct bio_vec *biovec)
 {
+	struct r5conf *conf = mddev->private;
 	sector_t sector = bvm->bi_sector + get_start_sect(bvm->bi_bdev);
 	int max;
-	unsigned int chunk_sectors = mddev->chunk_sectors;
+	unsigned int chunk_sectors;
 	unsigned int bio_sectors = bvm->bi_size >> 9;
 
 	/*
@@ -4688,8 +4689,7 @@ static int raid5_mergeable_bvec(struct mddev *mddev,
 	if ((bvm->bi_rw & 1) == WRITE || mddev->degraded)
 		return biovec->bv_len;
 
-	if (mddev->new_chunk_sectors < mddev->chunk_sectors)
-		chunk_sectors = mddev->new_chunk_sectors;
+	chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors);
 	max =  (chunk_sectors - ((sector & (chunk_sectors - 1)) + bio_sectors)) << 9;
 	if (max < 0) max = 0;
 	if (max <= biovec->bv_len && bio_sectors == 0)
@@ -4700,12 +4700,12 @@ static int raid5_mergeable_bvec(struct mddev *mddev,
 
 static int in_chunk_boundary(struct mddev *mddev, struct bio *bio)
 {
+	struct r5conf *conf = mddev->private;
 	sector_t sector = bio->bi_iter.bi_sector + get_start_sect(bio->bi_bdev);
-	unsigned int chunk_sectors = mddev->chunk_sectors;
+	unsigned int chunk_sectors;
 	unsigned int bio_sectors = bio_sectors(bio);
 
-	if (mddev->new_chunk_sectors < mddev->chunk_sectors)
-		chunk_sectors = mddev->new_chunk_sectors;
+	chunk_sectors = min(conf->chunk_sectors, conf->prev_chunk_sectors);
 	return  chunk_sectors >=
 		((sector & (chunk_sectors - 1)) + bio_sectors);
 }
@@ -5372,10 +5372,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 	 * If old and new chunk sizes differ, we need to process the
 	 * largest of these
 	 */
-	if (mddev->new_chunk_sectors > mddev->chunk_sectors)
-		reshape_sectors = mddev->new_chunk_sectors;
-	else
-		reshape_sectors = mddev->chunk_sectors;
+
+	reshape_sectors = max(conf->chunk_sectors, conf->prev_chunk_sectors);
 
 	/* We update the metadata at least every 10 seconds, or when
 	 * the data about to be copied would over-write the source of
@@ -6260,8 +6258,8 @@ raid5_size(struct mddev *mddev, sector_t sectors, int raid_disks)
 		/* size is defined by the smallest of previous and new size */
 		raid_disks = min(conf->raid_disks, conf->previous_raid_disks);
 
-	sectors &= ~((sector_t)mddev->chunk_sectors - 1);
-	sectors &= ~((sector_t)mddev->new_chunk_sectors - 1);
+	sectors &= ~((sector_t)conf->chunk_sectors - 1);
+	sectors &= ~((sector_t)conf->prev_chunk_sectors - 1);
 	return sectors * (raid_disks - conf->max_degraded);
 }
 
@@ -6996,7 +6994,7 @@ static void status(struct seq_file *seq, struct mddev *mddev)
 	int i;
 
 	seq_printf(seq, " level %d, %dk chunk, algorithm %d", mddev->level,
-		mddev->chunk_sectors / 2, mddev->layout);
+		conf->chunk_sectors / 2, mddev->layout);
 	seq_printf (seq, " [%d/%d] [", conf->raid_disks, conf->raid_disks - mddev->degraded);
 	for (i = 0; i < conf->raid_disks; i++)
 		seq_printf (seq, "%s",
@@ -7202,7 +7200,9 @@ static int raid5_resize(struct mddev *mddev, sector_t sectors)
 	 * worth it.
 	 */
 	sector_t newsize;
-	sectors &= ~((sector_t)mddev->chunk_sectors - 1);
+	struct r5conf *conf = mddev->private;
+
+	sectors &= ~((sector_t)conf->chunk_sectors - 1);
 	newsize = raid5_size(mddev, sectors, mddev->raid_disks);
 	if (mddev->external_size &&
 	    mddev->array_sectors > newsize)

From 05256d9884d3276f61537d3d7f5605dc21bd3477 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Wed, 15 Jul 2015 17:36:21 +1000
Subject: [PATCH 10/33] md/raid5: strengthen check on reshape_position at run.

When reshaping, we work in units of the largest chunk size.
If changing from a larger to a smaller chunk size, that means we
reshape more than one stripe at a time.  So the required alignment
of reshape_position needs to take into account both the old
and new chunk size.

This means that both 'here_new' and 'here_old' are calculated with
respect to the same (maximum) chunk size, so testing if they are the
same when delta_disks is zero becomes pointless.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid5.c | 29 ++++++++++++++---------------
 1 file changed, 14 insertions(+), 15 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index a98162f5d97f..e95219bf8859 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -6688,6 +6688,8 @@ static int run(struct mddev *mddev)
 		sector_t here_new, here_old;
 		int old_disks;
 		int max_degraded = (mddev->level == 6 ? 2 : 1);
+		int chunk_sectors;
+		int new_data_disks;
 
 		if (mddev->new_level != mddev->level) {
 			printk(KERN_ERR "md/raid:%s: unsupported reshape "
@@ -6699,28 +6701,25 @@ static int run(struct mddev *mddev)
 		/* reshape_position must be on a new-stripe boundary, and one
 		 * further up in new geometry must map after here in old
 		 * geometry.
+		 * If the chunk sizes are different, then as we perform reshape
+		 * in units of the largest of the two, reshape_position needs
+		 * be a multiple of the largest chunk size times new data disks.
 		 */
 		here_new = mddev->reshape_position;
-		if (sector_div(here_new, mddev->new_chunk_sectors *
-			       (mddev->raid_disks - max_degraded))) {
+		chunk_sectors = max(mddev->chunk_sectors, mddev->new_chunk_sectors);
+		new_data_disks = mddev->raid_disks - max_degraded;
+		if (sector_div(here_new, chunk_sectors * new_data_disks)) {
 			printk(KERN_ERR "md/raid:%s: reshape_position not "
 			       "on a stripe boundary\n", mdname(mddev));
 			return -EINVAL;
 		}
-		reshape_offset = here_new * mddev->new_chunk_sectors;
+		reshape_offset = here_new * chunk_sectors;
 		/* here_new is the stripe we will write to */
 		here_old = mddev->reshape_position;
-		sector_div(here_old, mddev->chunk_sectors *
-			   (old_disks-max_degraded));
+		sector_div(here_old, chunk_sectors * (old_disks-max_degraded));
 		/* here_old is the first stripe that we might need to read
 		 * from */
 		if (mddev->delta_disks == 0) {
-			if ((here_new * mddev->new_chunk_sectors !=
-			     here_old * mddev->chunk_sectors)) {
-				printk(KERN_ERR "md/raid:%s: reshape position is"
-				       " confused - aborting\n", mdname(mddev));
-				return -EINVAL;
-			}
 			/* We cannot be sure it is safe to start an in-place
 			 * reshape.  It is only safe if user-space is monitoring
 			 * and taking constant backups.
@@ -6739,10 +6738,10 @@ static int run(struct mddev *mddev)
 				return -EINVAL;
 			}
 		} else if (mddev->reshape_backwards
-		    ? (here_new * mddev->new_chunk_sectors + min_offset_diff <=
-		       here_old * mddev->chunk_sectors)
-		    : (here_new * mddev->new_chunk_sectors >=
-		       here_old * mddev->chunk_sectors + (-min_offset_diff))) {
+		    ? (here_new * chunk_sectors + min_offset_diff <=
+		       here_old * chunk_sectors)
+		    : (here_new * chunk_sectors >=
+		       here_old * chunk_sectors + (-min_offset_diff))) {
 			/* Reading from the same stripe as writing to - bad */
 			printk(KERN_ERR "md/raid:%s: reshape_position too early for "
 			       "auto-recovery - aborting.\n",

From c74c0d760e30f56f9699dc180036ca37993d1c58 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Wed, 15 Jul 2015 17:54:15 +1000
Subject: [PATCH 11/33] md/raid5: remove incorrect "min_t()" when calculating
 writepos.

This code is calculating:
  writepos, which is the furthest along address (device-space) that we
     *will* be writing to
  readpos, which is the earliest address that we *could* possible read
     from, and
  safepos, which is the earliest address in the 'old' section that we
     might read from after a crash when the reshape position is
     recovered from metadata.

  The first is a precise calculation, so clipping at zero doesn't
  make sense.  As the reshape position is now guaranteed to always be
  a multiple of reshape_sectors and as we already BUG_ON when
  reshape_progress is zero, there is no point in this min_t() call.

  The readpos and safepos are worst case - actual value depends on
  precise geometry.  That worst case could be negative, which is only
  a problem because we are storing the value in an unsigned.
  So leave the min_t() for those.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid5.c | 7 ++++++-
 1 file changed, 6 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index e95219bf8859..19bbdbe1a52f 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5388,11 +5388,16 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 	safepos = conf->reshape_safe;
 	sector_div(safepos, data_disks);
 	if (mddev->reshape_backwards) {
-		writepos -= min_t(sector_t, reshape_sectors, writepos);
+		BUG_ON(writepos < reshape_sectors);
+		writepos -= reshape_sectors;
 		readpos += reshape_sectors;
 		safepos += reshape_sectors;
 	} else {
 		writepos += reshape_sectors;
+		/* readpos and safepos are worst-case calculations.
+		 * A negative number is overly pessimistic, and causes
+		 * obvious problems for unsigned storage.  So clip to 0.
+		 */
 		readpos -= min_t(sector_t, reshape_sectors, readpos);
 		safepos -= min_t(sector_t, reshape_sectors, safepos);
 	}

From a4a3d26d8757a30ae21724d8b0d79e00e113c38d Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Fri, 17 Jul 2015 11:57:30 +1000
Subject: [PATCH 12/33] md: set MD_RECOVERY_RECOVER when starting a degraded
 array.

This ensures that 'sync_action' will show 'recover' immediately the
array is started.  If there is no spare the status will change to
'idle' once that is detected.

Clear MD_RECOVERY_RECOVER for a read-only array to ensure this change
happens.

This allows scripts which monitor status not to get confused -
particularly my test scripts.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c | 6 ++++++
 1 file changed, 6 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 324f9df4e429..5b62a3d49e12 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -5218,6 +5218,11 @@ int md_run(struct mddev *mddev)
 			if (sysfs_link_rdev(mddev, rdev))
 				/* failure here is OK */;
 
+	if (mddev->degraded && !mddev->ro)
+		/* This ensures that recovering status is reported immediately
+		 * via sysfs - until a lack of spares is confirmed.
+		 */
+		set_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 	set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 
 	if (mddev->flags & MD_UPDATE_SB_FLAGS)
@@ -8164,6 +8169,7 @@ void md_check_recovery(struct mddev *mddev)
 			 */
 			set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 			md_reap_sync_thread(mddev);
+			clear_bit(MD_RECOVERY_RECOVER, &mddev->recovery);
 			clear_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
 			goto unlock;
 		}

From c5e19d906a658f27fa858b09a95d9551b1a69bd0 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Fri, 17 Jul 2015 12:06:02 +1000
Subject: [PATCH 13/33] md: be careful when testing resync_max against
 curr_resync_completed.

While it generally shouldn't happen, it is not impossible for
curr_resync_completed to exceed resync_max.
This can particularly happen when reshaping RAID5 - the current
status isn't copied to curr_resync_completed promptly, so when it
is, it can exceed resync_max.
This happens when the reshape is 'frozen', resync_max is set low,
and reshape is re-enabled.

Taking a difference between two unsigned numbers is always dangerous
anyway, so add a test to behave correctly if
   curr_resync_completed > resync_max

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c    | 3 ++-
 drivers/md/raid5.c | 3 ++-
 2 files changed, 4 insertions(+), 2 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 5b62a3d49e12..b326cd26b027 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7834,7 +7834,8 @@ void md_do_sync(struct md_thread *thread)
 		      > (max_sectors >> 4)) ||
 		     time_after_eq(jiffies, update_time + UPDATE_FREQUENCY) ||
 		     (j - mddev->curr_resync_completed)*2
-		     >= mddev->resync_max - mddev->curr_resync_completed
+		     >= mddev->resync_max - mddev->curr_resync_completed ||
+		     mddev->curr_resync_completed > mddev->resync_max
 			    )) {
 			/* time to update curr_resync_completed */
 			wait_event(mddev->recovery_wait,
diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 19bbdbe1a52f..1c27aa10f89c 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5542,7 +5542,8 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 	sector_nr += reshape_sectors;
 	retn = reshape_sectors;
 finish:
-	if ((sector_nr - mddev->curr_resync_completed) * 2
+	if (mddev->curr_resync_completed > mddev->resync_max ||
+	    (sector_nr - mddev->curr_resync_completed) * 2
 	    >= mddev->resync_max - mddev->curr_resync_completed) {
 		/* Cannot proceed until we've updated the superblock... */
 		wait_event(conf->wait_for_overlap,

From 5ed1df2eacc0ba92c8c7e2499c97594b5ef928a8 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Fri, 24 Jul 2015 13:27:08 +1000
Subject: [PATCH 14/33] md: sync sync_completed has correct value as recovery
 finishes.

There can be a small window between the moment that recovery
actually writes the last block and the time when various sysfs
and /proc/mdstat attributes report that it has finished.
During this time, 'sync_completed' can have the wrong value.
This can confuse monitoring software.

So:
 - don't set curr_resync_completed beyond the end of the devices,
 - set it correctly when resync/recovery has completed.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c | 9 +++++++++
 1 file changed, 9 insertions(+)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index b326cd26b027..84dc5d7a445b 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7880,6 +7880,9 @@ void md_do_sync(struct md_thread *thread)
 			break;
 
 		j += sectors;
+		if (j > max_sectors)
+			/* when skipping, extra large numbers can be returned. */
+			j = max_sectors;
 		if (j > 2)
 			mddev->curr_resync = j;
 		if (mddev_is_clustered(mddev))
@@ -7948,6 +7951,12 @@ void md_do_sync(struct md_thread *thread)
 	blk_finish_plug(&plug);
 	wait_event(mddev->recovery_wait, !atomic_read(&mddev->recovery_active));
 
+	if (!test_bit(MD_RECOVERY_RESHAPE, &mddev->recovery) &&
+	    !test_bit(MD_RECOVERY_INTR, &mddev->recovery) &&
+	    mddev->curr_resync > 2) {
+		mddev->curr_resync_completed = mddev->curr_resync;
+		sysfs_notify(&mddev->kobj, NULL, "sync_completed");
+	}
 	/* tell personality that we are finished */
 	mddev->pers->sync_request(mddev, max_sectors, &skipped);
 

From 6cbd81487f7cfa30e22537bf7cd07f48c4e7164d Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Fri, 24 Jul 2015 13:30:32 +1000
Subject: [PATCH 15/33] md/raid5: handle possible race as reshape completes.

It is possible (though unlikely) for a reshape to be
interrupted between the time that end_reshape is called
and the time when raid5_finish_reshape is called.

This can leave conf->reshape_progress set to MaxSector,
but mddev->reshape_position not.

This combination confused reshape_request() when ->reshape_backwards.
As conf->reshape_progress is so high, it seems the reshape hasn't
really begun.  But assuming MaxSector is a valid address only
leads to sorrow.

So ensure reshape_position and reshape_progress both agree,
and add an extra check in reshape_request() just in case they don't.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid5.c | 5 +++++
 1 file changed, 5 insertions(+)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 1c27aa10f89c..7b0c706f33e7 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -5355,6 +5355,10 @@ static sector_t reshape_request(struct mddev *mddev, sector_t sector_nr, int *sk
 		    conf->reshape_progress < raid5_size(mddev, 0, 0)) {
 			sector_nr = raid5_size(mddev, 0, 0)
 				- conf->reshape_progress;
+		} else if (mddev->reshape_backwards &&
+			   conf->reshape_progress == MaxSector) {
+			/* shouldn't happen, but just in case, finish up.*/
+			sector_nr = MaxSector;
 		} else if (!mddev->reshape_backwards &&
 			   conf->reshape_progress > 0)
 			sector_nr = conf->reshape_progress;
@@ -7446,6 +7450,7 @@ static void end_reshape(struct r5conf *conf)
 			rdev->data_offset = rdev->new_data_offset;
 		smp_wmb();
 		conf->reshape_progress = MaxSector;
+		conf->mddev->reshape_position = MaxSector;
 		spin_unlock_irq(&conf->device_lock);
 		wake_up(&conf->wait_for_overlap);
 

From 25b2edfa3b6940b73180d2735cd19fdf58d0cf91 Mon Sep 17 00:00:00 2001
From: Sasha Levin <sasha.levin@oracle.com>
Date: Fri, 24 Jul 2015 18:19:58 -0400
Subject: [PATCH 16/33] md: setup safemode_timer before it's being used

We used to set up the safemode_timer timer in md_run. If md_run
would fail before the timer was set up we'd end up trying to modify
a timer that doesn't have a callback function when we access safe_delay_store,
which would trigger a BUG.

neilb: delete init_timer() call as setup_timer() does that.

Signed-off-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c | 9 ++++-----
 1 file changed, 4 insertions(+), 5 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 84dc5d7a445b..cdc080bf09d4 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -502,6 +502,8 @@ static void mddev_put(struct mddev *mddev)
 		bioset_free(bs);
 }
 
+static void md_safemode_timeout(unsigned long data);
+
 void mddev_init(struct mddev *mddev)
 {
 	mutex_init(&mddev->open_mutex);
@@ -509,7 +511,8 @@ void mddev_init(struct mddev *mddev)
 	mutex_init(&mddev->bitmap_info.mutex);
 	INIT_LIST_HEAD(&mddev->disks);
 	INIT_LIST_HEAD(&mddev->all_mddevs);
-	init_timer(&mddev->safemode_timer);
+	setup_timer(&mddev->safemode_timer, md_safemode_timeout,
+		    (unsigned long) mddev);
 	atomic_set(&mddev->active, 1);
 	atomic_set(&mddev->openers, 0);
 	atomic_set(&mddev->active_io, 0);
@@ -3276,8 +3279,6 @@ int strict_strtoul_scaled(const char *cp, unsigned long *res, int scale)
 	return 0;
 }
 
-static void md_safemode_timeout(unsigned long data);
-
 static ssize_t
 safe_delay_show(struct mddev *mddev, char *page)
 {
@@ -5204,8 +5205,6 @@ int md_run(struct mddev *mddev)
 	atomic_set(&mddev->max_corr_read_errors,
 		   MD_DEFAULT_MAX_CORRECTED_READ_ERRORS);
 	mddev->safemode = 0;
-	mddev->safemode_timer.function = md_safemode_timeout;
-	mddev->safemode_timer.data = (unsigned long) mddev;
 	mddev->safemode_delay = (200 * HZ)/1000 +1; /* 200 msec delay */
 	mddev->in_sync = 1;
 	smp_wmb();

From b89f704a8ddf7be1a66af9aad350226ae283292f Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Fri, 10 Jul 2015 16:54:02 +0800
Subject: [PATCH 17/33] md-cluster: use %pU to print UUIDs

Reviewed-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md-cluster.c | 16 ++--------------
 1 file changed, 2 insertions(+), 14 deletions(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 0072190515e0..85ef5c5aa350 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -177,18 +177,6 @@ static void lockres_free(struct dlm_lock_resource *res)
 	kfree(res);
 }
 
-static char *pretty_uuid(char *dest, char *src)
-{
-	int i, len = 0;
-
-	for (i = 0; i < 16; i++) {
-		if (i == 4 || i == 6 || i == 8 || i == 10)
-			len += sprintf(dest + len, "-");
-		len += sprintf(dest + len, "%02x", (__u8)src[i]);
-	}
-	return dest;
-}
-
 static void add_resync_info(struct mddev *mddev, struct dlm_lock_resource *lockres,
 		sector_t lo, sector_t hi)
 {
@@ -388,7 +376,7 @@ static void process_add_new_disk(struct mddev *mddev, struct cluster_msg *cmsg)
 	int len;
 
 	len = snprintf(disk_uuid, 64, "DEVICE_UUID=");
-	pretty_uuid(disk_uuid + len, cmsg->uuid);
+	sprintf(disk_uuid + len, "%pU", cmsg->uuid);
 	snprintf(raid_slot, 16, "RAID_DISK=%d", cmsg->raid_slot);
 	pr_info("%s:%d Sending kobject change with %s and %s\n", __func__, __LINE__, disk_uuid, raid_slot);
 	init_completion(&cinfo->newdisk_completion);
@@ -646,7 +634,7 @@ static int join(struct mddev *mddev, int nodes)
 	mddev->cluster_info = cinfo;
 
 	memset(str, 0, 64);
-	pretty_uuid(str, mddev->uuid);
+	sprintf(str, "%pU", mddev->uuid);
 	ret = dlm_new_lockspace(str, mddev->bitmap_info.cluster_name,
 				DLM_LSFL_FS, LVB_SIZE,
 				&md_ls_ops, mddev, &ops_rv, &cinfo->lockspace);

From 05cd0e51769a51f53c68e83976c83653f6ab1595 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Fri, 10 Jul 2015 16:54:03 +0800
Subject: [PATCH 18/33] md-cluster: split recover_slot for future code reuse

Make recover_slot as a wraper to __recover_slot, since the
logic of __recover_slot can be reused for the condition
when other nodes need to take over the resync job.

Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md-cluster.c | 23 ++++++++++++++++-------
 1 file changed, 16 insertions(+), 7 deletions(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 85ef5c5aa350..24caabef10cd 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -269,16 +269,11 @@ static void recover_prep(void *arg)
 	set_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
 }
 
-static void recover_slot(void *arg, struct dlm_slot *slot)
+static void __recover_slot(struct mddev *mddev, int slot)
 {
-	struct mddev *mddev = arg;
 	struct md_cluster_info *cinfo = mddev->cluster_info;
 
-	pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n",
-			mddev->bitmap_info.cluster_name,
-			slot->nodeid, slot->slot,
-			cinfo->slot_number);
-	set_bit(slot->slot - 1, &cinfo->recovery_map);
+	set_bit(slot, &cinfo->recovery_map);
 	if (!cinfo->recovery_thread) {
 		cinfo->recovery_thread = md_register_thread(recover_bitmaps,
 				mddev, "recover");
@@ -290,6 +285,20 @@ static void recover_slot(void *arg, struct dlm_slot *slot)
 	md_wakeup_thread(cinfo->recovery_thread);
 }
 
+static void recover_slot(void *arg, struct dlm_slot *slot)
+{
+	struct mddev *mddev = arg;
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+
+	pr_info("md-cluster: %s Node %d/%d down. My slot: %d. Initiating recovery.\n",
+			mddev->bitmap_info.cluster_name,
+			slot->nodeid, slot->slot,
+			cinfo->slot_number);
+	/* deduct one since dlm slot starts from one while the num of
+	 * cluster-md begins with 0 */
+	__recover_slot(mddev, slot->slot - 1);
+}
+
 static void recover_done(void *arg, struct dlm_slot *slots,
 		int num_slots, int our_slot,
 		uint32_t generation)

From dc737d7c3d62d2cd2b62c7739aaa7604330c3dd8 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Fri, 10 Jul 2015 16:54:04 +0800
Subject: [PATCH 19/33] md-cluster: transfer the resync ownership to another
 node

When node A stops an array while the array is doing a resync, we need
to let another node B take over the resync task.

To achieve the goal, we need the A send an explicit BITMAP_NEEDS_SYNC
message to the cluster. And the node B which received that message will
invoke __recover_slot to do resync.

Reviewed-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md-cluster.c | 15 +++++++++++++++
 drivers/md/md.c         |  6 +++---
 2 files changed, 18 insertions(+), 3 deletions(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 24caabef10cd..47199addae04 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -75,6 +75,7 @@ enum msg_type {
 	NEWDISK,
 	REMOVE,
 	RE_ADD,
+	BITMAP_NEEDS_SYNC,
 };
 
 struct cluster_msg {
@@ -454,6 +455,11 @@ static void process_recvd_msg(struct mddev *mddev, struct cluster_msg *msg)
 			__func__, __LINE__, msg->slot);
 		process_readd_disk(mddev, msg);
 		break;
+	case BITMAP_NEEDS_SYNC:
+		pr_info("%s: %d Received BITMAP_NEEDS_SYNC from %d\n",
+			__func__, __LINE__, msg->slot);
+		__recover_slot(mddev, msg->slot);
+		break;
 	default:
 		pr_warn("%s:%d Received unknown message from %d\n",
 			__func__, __LINE__, msg->slot);
@@ -814,8 +820,17 @@ static int resync_start(struct mddev *mddev, sector_t lo, sector_t hi)
 
 static void resync_finish(struct mddev *mddev)
 {
+	struct md_cluster_info *cinfo = mddev->cluster_info;
+	struct cluster_msg cmsg;
+	int slot = cinfo->slot_number - 1;
+
 	pr_info("%s:%d\n", __func__, __LINE__);
 	resync_send(mddev, RESYNCING, 0, 0);
+	if (test_bit(MD_RECOVERY_INTR, &mddev->recovery)) {
+		cmsg.type = cpu_to_le32(BITMAP_NEEDS_SYNC);
+		cmsg.slot = cpu_to_le32(slot);
+		sendmsg(cinfo, &cmsg);
+	}
 }
 
 static int area_resyncing(struct mddev *mddev, int direction,
diff --git a/drivers/md/md.c b/drivers/md/md.c
index cdc080bf09d4..c0637603a391 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7959,9 +7959,6 @@ void md_do_sync(struct md_thread *thread)
 	/* tell personality that we are finished */
 	mddev->pers->sync_request(mddev, max_sectors, &skipped);
 
-	if (mddev_is_clustered(mddev))
-		md_cluster_ops->resync_finish(mddev);
-
 	if (!test_bit(MD_RECOVERY_CHECK, &mddev->recovery) &&
 	    mddev->curr_resync > 2) {
 		if (test_bit(MD_RECOVERY_SYNC, &mddev->recovery)) {
@@ -7995,6 +7992,9 @@ void md_do_sync(struct md_thread *thread)
 		}
 	}
  skip:
+	if (mddev_is_clustered(mddev))
+		md_cluster_ops->resync_finish(mddev);
+
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
 
 	spin_lock(&mddev->lock);

From 66099bb0ee6c20f91ace3fa5f82202fbceb67d8e Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Fri, 10 Jul 2015 17:01:15 +0800
Subject: [PATCH 20/33] md-cluster: fix deadlock issue on message lock

There is problem with previous communication mechanism, and we got below
deadlock scenario with cluster which has 3 nodes.

	Sender                	    Receiver        		Receiver

	token(EX)
       message(EX)
      writes message
   downconverts message(CR)
      requests ack(EX)
		                  get message(CR)            gets message(CR)
                		  reads message                reads message
		               requests EX on message    requests EX on message

To fix this problem, we do the following changes:

1. the sender downconverts MESSAGE to CW rather than CR.
2. and the receiver request PR lock not EX lock on message.

And in case we failed to down-convert EX to CW on message, it is better to
unlock message otherthan still hold the lock.

Reviewed-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: Lidong Zhong <ldzhong@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 Documentation/md-cluster.txt |  4 ++--
 drivers/md/md-cluster.c      | 14 +++++++-------
 2 files changed, 9 insertions(+), 9 deletions(-)

diff --git a/Documentation/md-cluster.txt b/Documentation/md-cluster.txt
index de1af7db3355..1b794369e03a 100644
--- a/Documentation/md-cluster.txt
+++ b/Documentation/md-cluster.txt
@@ -91,7 +91,7 @@ The algorithm is:
     this message inappropriate or redundant.
 
  3. sender write LVB.
-    sender down-convert MESSAGE from EX to CR
+    sender down-convert MESSAGE from EX to CW
     sender try to get EX of ACK
     [ wait until all receiver has *processed* the MESSAGE ]
 
@@ -112,7 +112,7 @@ The algorithm is:
     sender down-convert ACK from EX to CR
     sender release MESSAGE
     sender release TOKEN
-                               receiver upconvert to EX of MESSAGE
+                               receiver upconvert to PR of MESSAGE
                                receiver get CR of ACK
                                receiver release MESSAGE
 
diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 47199addae04..85b7836fb4b5 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -488,8 +488,8 @@ static void recv_daemon(struct md_thread *thread)
 
 	/*release CR on ack_lockres*/
 	dlm_unlock_sync(ack_lockres);
-	/*up-convert to EX on message_lockres*/
-	dlm_lock_sync(message_lockres, DLM_LOCK_EX);
+	/*up-convert to PR on message_lockres*/
+	dlm_lock_sync(message_lockres, DLM_LOCK_PR);
 	/*get CR on ack_lockres again*/
 	dlm_lock_sync(ack_lockres, DLM_LOCK_CR);
 	/*release CR on message_lockres*/
@@ -522,7 +522,7 @@ static void unlock_comm(struct md_cluster_info *cinfo)
  * The function:
  * 1. Grabs the message lockresource in EX mode
  * 2. Copies the message to the message LVB
- * 3. Downconverts message lockresource to CR
+ * 3. Downconverts message lockresource to CW
  * 4. Upconverts ack lock resource from CR to EX. This forces the BAST on other nodes
  *    and the other nodes read the message. The thread will wait here until all other
  *    nodes have released ack lock resource.
@@ -543,12 +543,12 @@ static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
 
 	memcpy(cinfo->message_lockres->lksb.sb_lvbptr, (void *)cmsg,
 			sizeof(struct cluster_msg));
-	/*down-convert EX to CR on Message*/
-	error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CR);
+	/*down-convert EX to CW on Message*/
+	error = dlm_lock_sync(cinfo->message_lockres, DLM_LOCK_CW);
 	if (error) {
-		pr_err("md-cluster: failed to convert EX to CR on MESSAGE(%d)\n",
+		pr_err("md-cluster: failed to convert EX to CW on MESSAGE(%d)\n",
 				error);
-		goto failed_message;
+		goto failed_ack;
 	}
 
 	/*up-convert CR to EX on Ack*/

From b83d51c0785c34d552111f38fbecbe00cd58b913 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Fri, 10 Jul 2015 17:01:16 +0800
Subject: [PATCH 21/33] md-cluster: init completion within lockres_init

We should init completion within lockres_init, otherwise
completion could be initialized more than one time during
it's life cycle.

Reviewed-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md-cluster.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 85b7836fb4b5..2a57f193b103 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -100,7 +100,6 @@ static int dlm_lock_sync(struct dlm_lock_resource *res, int mode)
 {
 	int ret = 0;
 
-	init_completion(&res->completion);
 	ret = dlm_lock(res->ls, mode, &res->lksb,
 			res->flags, res->name, strlen(res->name),
 			0, sync_ast, res, res->bast);
@@ -125,6 +124,7 @@ static struct dlm_lock_resource *lockres_init(struct mddev *mddev,
 	res = kzalloc(sizeof(struct dlm_lock_resource), GFP_KERNEL);
 	if (!res)
 		return NULL;
+	init_completion(&res->completion);
 	res->ls = cinfo->lockspace;
 	res->mddev = mddev;
 	namelen = strlen(name);
@@ -169,7 +169,6 @@ static void lockres_free(struct dlm_lock_resource *res)
 	if (!res)
 		return;
 
-	init_completion(&res->completion);
 	dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res);
 	wait_for_completion(&res->completion);
 

From b5ef56789b808a57fcd07271ff509911662fd877 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Fri, 10 Jul 2015 17:01:17 +0800
Subject: [PATCH 22/33] md-cluster: add the error check if failed to get dlm
 lock

In complicated cluster environment, it is possible that the
dlm lock couldn't be get/convert on purpose, the related err
info is added for better debug potential issue.

For lockres_free, if the lock is blocking by a lock request or
conversion request, then dlm_unlock just put it back to grant
queue, so need to ensure the lock is free finally.

Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md-cluster.c | 41 +++++++++++++++++++++++++++++++++++------
 1 file changed, 35 insertions(+), 6 deletions(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 2a57f193b103..b80a689aad04 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -166,10 +166,24 @@ out_err:
 
 static void lockres_free(struct dlm_lock_resource *res)
 {
+	int ret;
+
 	if (!res)
 		return;
 
-	dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res);
+	/* cancel a lock request or a conversion request that is blocked */
+	res->flags |= DLM_LKF_CANCEL;
+retry:
+	ret = dlm_unlock(res->ls, res->lksb.sb_lkid, 0, &res->lksb, res);
+	if (unlikely(ret != 0)) {
+		pr_info("%s: failed to unlock %s return %d\n", __func__, res->name, ret);
+
+		/* if a lock conversion is cancelled, then the lock is put
+		 * back to grant queue, need to ensure it is unlocked */
+		if (ret == -DLM_ECANCEL)
+			goto retry;
+	}
+	res->flags &= ~DLM_LKF_CANCEL;
 	wait_for_completion(&res->completion);
 
 	kfree(res->name);
@@ -474,6 +488,7 @@ static void recv_daemon(struct md_thread *thread)
 	struct dlm_lock_resource *ack_lockres = cinfo->ack_lockres;
 	struct dlm_lock_resource *message_lockres = cinfo->message_lockres;
 	struct cluster_msg msg;
+	int ret;
 
 	/*get CR on Message*/
 	if (dlm_lock_sync(message_lockres, DLM_LOCK_CR)) {
@@ -486,13 +501,21 @@ static void recv_daemon(struct md_thread *thread)
 	process_recvd_msg(thread->mddev, &msg);
 
 	/*release CR on ack_lockres*/
-	dlm_unlock_sync(ack_lockres);
+	ret = dlm_unlock_sync(ack_lockres);
+	if (unlikely(ret != 0))
+		pr_info("unlock ack failed return %d\n", ret);
 	/*up-convert to PR on message_lockres*/
-	dlm_lock_sync(message_lockres, DLM_LOCK_PR);
+	ret = dlm_lock_sync(message_lockres, DLM_LOCK_PR);
+	if (unlikely(ret != 0))
+		pr_info("lock PR on msg failed return %d\n", ret);
 	/*get CR on ack_lockres again*/
-	dlm_lock_sync(ack_lockres, DLM_LOCK_CR);
+	ret = dlm_lock_sync(ack_lockres, DLM_LOCK_CR);
+	if (unlikely(ret != 0))
+		pr_info("lock CR on ack failed return %d\n", ret);
 	/*release CR on message_lockres*/
-	dlm_unlock_sync(message_lockres);
+	ret = dlm_unlock_sync(message_lockres);
+	if (unlikely(ret != 0))
+		pr_info("unlock msg failed return %d\n", ret);
 }
 
 /* lock_comm()
@@ -567,7 +590,13 @@ static int __sendmsg(struct md_cluster_info *cinfo, struct cluster_msg *cmsg)
 	}
 
 failed_ack:
-	dlm_unlock_sync(cinfo->message_lockres);
+	error = dlm_unlock_sync(cinfo->message_lockres);
+	if (unlikely(error != 0)) {
+		pr_err("md-cluster: failed convert to NL on MESSAGE(%d)\n",
+			error);
+		/* in case the message can't be released due to some reason */
+		goto failed_ack;
+	}
 failed_message:
 	return error;
 }

From 9e3072e373320b331512e24f6650efa0a09720af Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Fri, 10 Jul 2015 17:01:18 +0800
Subject: [PATCH 23/33] md-cluster: init suspend_list and suspend_lock early in
 join

If the node just join the cluster, and receive the msg from other nodes
before init suspend_list, it will cause kernel crash due to NULL pointer
dereference, so move the initializations early to fix the bug.

md-cluster: Joined cluster 3578507b-e0cb-6d4f-6322-696cd7b1b10c slot 3
BUG: unable to handle kernel NULL pointer dereference at           (null)
... ... ...
Call Trace:
[<ffffffffa0444924>] process_recvd_msg+0x2e4/0x330 [md_cluster]
[<ffffffffa0444a06>] recv_daemon+0x96/0x170 [md_cluster]
[<ffffffffa045189d>] md_thread+0x11d/0x170 [md_mod]
[<ffffffff810768c4>] kthread+0xb4/0xc0
[<ffffffff8151927c>] ret_from_fork+0x7c/0xb0
... ... ...
RIP  [<ffffffffa0443581>] __remove_suspend_info+0x11/0xa0 [md_cluster]

Reviewed-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md-cluster.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index b80a689aad04..6f1ea3c787f2 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -671,6 +671,8 @@ static int join(struct mddev *mddev, int nodes)
 	if (!cinfo)
 		return -ENOMEM;
 
+	INIT_LIST_HEAD(&cinfo->suspend_list);
+	spin_lock_init(&cinfo->suspend_lock);
 	init_completion(&cinfo->completion);
 
 	mutex_init(&cinfo->sb_mutex);
@@ -736,9 +738,6 @@ static int join(struct mddev *mddev, int nodes)
 		goto err;
 	}
 
-	INIT_LIST_HEAD(&cinfo->suspend_list);
-	spin_lock_init(&cinfo->suspend_lock);
-
 	ret = gather_all_resync_info(mddev, nodes);
 	if (ret)
 		goto err;

From b2b9bfff0aa721a04a3924ed451c417d2bd9ed15 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Fri, 10 Jul 2015 17:01:19 +0800
Subject: [PATCH 24/33] md-cluster: remove the unused sb_lock

The sb_lock is not used anywhere, so let's remove it.

Reviewed-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md-cluster.c | 9 ---------
 1 file changed, 9 deletions(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 6f1ea3c787f2..057a9733f748 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -52,7 +52,6 @@ struct md_cluster_info {
 	dlm_lockspace_t *lockspace;
 	int slot_number;
 	struct completion completion;
-	struct dlm_lock_resource *sb_lock;
 	struct mutex sb_mutex;
 	struct dlm_lock_resource *bitmap_lockres;
 	struct list_head suspend_list;
@@ -692,12 +691,6 @@ static int join(struct mddev *mddev, int nodes)
 		ret = -ERANGE;
 		goto err;
 	}
-	cinfo->sb_lock = lockres_init(mddev, "cmd-super",
-					NULL, 0);
-	if (!cinfo->sb_lock) {
-		ret = -ENOMEM;
-		goto err;
-	}
 	/* Initiate the communication resources */
 	ret = -ENOMEM;
 	cinfo->recv_thread = md_register_thread(recv_daemon, mddev, "cluster_recv");
@@ -749,7 +742,6 @@ err:
 	lockres_free(cinfo->ack_lockres);
 	lockres_free(cinfo->no_new_dev_lockres);
 	lockres_free(cinfo->bitmap_lockres);
-	lockres_free(cinfo->sb_lock);
 	if (cinfo->lockspace)
 		dlm_release_lockspace(cinfo->lockspace, 2);
 	mddev->cluster_info = NULL;
@@ -770,7 +762,6 @@ static int leave(struct mddev *mddev)
 	lockres_free(cinfo->token_lockres);
 	lockres_free(cinfo->ack_lockres);
 	lockres_free(cinfo->no_new_dev_lockres);
-	lockres_free(cinfo->sb_lock);
 	lockres_free(cinfo->bitmap_lockres);
 	dlm_release_lockspace(cinfo->lockspace, 2);
 	return 0;

From 6e6d9f2cda47745a3abcb6609b1dee0e831161d8 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Fri, 10 Jul 2015 17:01:20 +0800
Subject: [PATCH 25/33] md-cluster: add missed lockres_free

We also need to free the lock resource before goto out.

Reviewed-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md-cluster.c | 4 +++-
 1 file changed, 3 insertions(+), 1 deletion(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 057a9733f748..411b4306840f 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -647,8 +647,10 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
 			lockres_free(bm_lockres);
 			continue;
 		}
-		if (ret)
+		if (ret) {
+			lockres_free(bm_lockres);
 			goto out;
+		}
 		/* TODO: Read the disk bitmap sb and check if it needs recovery */
 		dlm_unlock_sync(bm_lockres);
 		lockres_free(bm_lockres);

From eece075cda38f55fc5829b5f9ec5fb919c561d81 Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Fri, 10 Jul 2015 17:01:21 +0800
Subject: [PATCH 26/33] md-cluster: only call complete(&cinfo->completion) when
 node join cluster

Introduce MD_CLUSTER_BEGIN_JOIN_CLUSTER flag to make sure
complete(&cinfo->completion) is only be invoked when node
join cluster. Otherwise node failure could also call the
complete, and it doesn't make sense to do it.

Reviewed-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md-cluster.c | 11 ++++++++++-
 1 file changed, 10 insertions(+), 1 deletion(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 411b4306840f..29f65e2be025 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -45,6 +45,7 @@ struct resync_info {
 /* md_cluster_info flags */
 #define		MD_CLUSTER_WAITING_FOR_NEWDISK		1
 #define		MD_CLUSTER_SUSPEND_READ_BALANCING	2
+#define		MD_CLUSTER_BEGIN_JOIN_CLUSTER		3
 
 
 struct md_cluster_info {
@@ -320,10 +321,17 @@ static void recover_done(void *arg, struct dlm_slot *slots,
 	struct md_cluster_info *cinfo = mddev->cluster_info;
 
 	cinfo->slot_number = our_slot;
-	complete(&cinfo->completion);
+	/* completion is only need to be complete when node join cluster,
+	 * it doesn't need to run during another node's failure */
+	if (test_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state)) {
+		complete(&cinfo->completion);
+		clear_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
+	}
 	clear_bit(MD_CLUSTER_SUSPEND_READ_BALANCING, &cinfo->state);
 }
 
+/* the ops is called when node join the cluster, and do lock recovery
+ * if node failure occurs */
 static const struct dlm_lockspace_ops md_ls_ops = {
 	.recover_prep = recover_prep,
 	.recover_slot = recover_slot,
@@ -675,6 +683,7 @@ static int join(struct mddev *mddev, int nodes)
 	INIT_LIST_HEAD(&cinfo->suspend_list);
 	spin_lock_init(&cinfo->suspend_lock);
 	init_completion(&cinfo->completion);
+	set_bit(MD_CLUSTER_BEGIN_JOIN_CLUSTER, &cinfo->state);
 
 	mutex_init(&cinfo->sb_mutex);
 	mddev->cluster_info = cinfo;

From abb9b22ac948000ae156cd2d115c8632ec30a2ce Mon Sep 17 00:00:00 2001
From: Guoqing Jiang <gqjiang@suse.com>
Date: Fri, 10 Jul 2015 17:01:22 +0800
Subject: [PATCH 27/33] md-cluster: Read the disk bitmap sb and check if it
 needs recovery

In gather_all_resync_info, we need to read the disk bitmap sb and
check if it needs recovery.

Reviewed-by: Goldwyn Rodrigues <rgoldwyn@suse.com>
Signed-off-by: Guoqing Jiang <gqjiang@suse.com>
Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md-cluster.c | 16 +++++++++++++++-
 1 file changed, 15 insertions(+), 1 deletion(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index 29f65e2be025..c35a03a7eb11 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -625,6 +625,7 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
 	struct dlm_lock_resource *bm_lockres;
 	struct suspend_info *s;
 	char str[64];
+	sector_t lo, hi;
 
 
 	for (i = 0; i < total_slots; i++) {
@@ -659,7 +660,20 @@ static int gather_all_resync_info(struct mddev *mddev, int total_slots)
 			lockres_free(bm_lockres);
 			goto out;
 		}
-		/* TODO: Read the disk bitmap sb and check if it needs recovery */
+
+		/* Read the disk bitmap sb and check if it needs recovery */
+		ret = bitmap_copy_from_slot(mddev, i, &lo, &hi, false);
+		if (ret) {
+			pr_warn("md-cluster: Could not gather bitmaps from slot %d", i);
+			lockres_free(bm_lockres);
+			continue;
+		}
+		if ((hi > 0) && (lo < mddev->recovery_cp)) {
+			set_bit(MD_RECOVERY_NEEDED, &mddev->recovery);
+			mddev->recovery_cp = lo;
+			md_check_recovery(mddev);
+		}
+
 		dlm_unlock_sync(bm_lockres);
 		lockres_free(bm_lockres);
 	}

From 6022e75bf0686799a6ecca3c33a669e6c70e9d26 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Thu, 13 Aug 2015 12:32:55 +1000
Subject: [PATCH 28/33] md: extend spinlock protection in
 register_md_cluster_operations

This code looks racy.

The only possible race is if two modules try to register at the same
time and that won't happen.  But make the code look safe anyway.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c | 16 ++++++++++------
 1 file changed, 10 insertions(+), 6 deletions(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index c0637603a391..7d5a6cede9b0 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -7427,15 +7427,19 @@ int unregister_md_personality(struct md_personality *p)
 }
 EXPORT_SYMBOL(unregister_md_personality);
 
-int register_md_cluster_operations(struct md_cluster_operations *ops, struct module *module)
+int register_md_cluster_operations(struct md_cluster_operations *ops,
+				   struct module *module)
 {
-	if (md_cluster_ops != NULL)
-		return -EALREADY;
+	int ret = 0;
 	spin_lock(&pers_lock);
-	md_cluster_ops = ops;
-	md_cluster_mod = module;
+	if (md_cluster_ops != NULL)
+		ret = -EALREADY;
+	else {
+		md_cluster_ops = ops;
+		md_cluster_mod = module;
+	}
 	spin_unlock(&pers_lock);
-	return 0;
+	return ret;
 }
 EXPORT_SYMBOL(register_md_cluster_operations);
 

From 18b9f67962eb890da0c053e04c3cf0e91871d4fa Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Fri, 14 Aug 2015 10:22:00 +1000
Subject: [PATCH 29/33] md-cluster: remove inappropriate try_module_get from
 join()

md_setup_cluster already calls try_module_get(), so this
try_module_get isn't needed.
Also, there is no matching module_put (except in error patch),
so this leaves an unbalanced module count.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md-cluster.c | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/drivers/md/md-cluster.c b/drivers/md/md-cluster.c
index c35a03a7eb11..11e3bc9d2a4b 100644
--- a/drivers/md/md-cluster.c
+++ b/drivers/md/md-cluster.c
@@ -687,9 +687,6 @@ static int join(struct mddev *mddev, int nodes)
 	int ret, ops_rv;
 	char str[64];
 
-	if (!try_module_get(THIS_MODULE))
-		return -ENOENT;
-
 	cinfo = kzalloc(sizeof(struct md_cluster_info), GFP_KERNEL);
 	if (!cinfo)
 		return -ENOMEM;
@@ -771,7 +768,6 @@ err:
 		dlm_release_lockspace(cinfo->lockspace, 2);
 	mddev->cluster_info = NULL;
 	kfree(cinfo);
-	module_put(THIS_MODULE);
 	return ret;
 }
 

From 55ce74d4bfe1b9444436264c637f39a152d1e5ac Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Fri, 14 Aug 2015 11:11:10 +1000
Subject: [PATCH 30/33] md/raid1: ensure device failure recorded before write
 request returns.

When a write to one of the legs of a RAID1 fails, the failure is
recorded in the metadata of the other leg(s) so that after a restart
the data on the failed drive wont be trusted even if that drive seems
to be working again  (maybe a cable was unplugged).

Similarly when we record a bad-block in response to a write failure,
we must not let the write complete until the bad-block update is safe.

Currently there is no interlock between the write request completing
and the metadata update.  So it is possible that the write will
complete, the app will confirm success in some way, and then the
machine will crash before the metadata update completes.

This is an extremely small hole for a racy to fit in, but it is
theoretically possible and so should be closed.

So:
 - set MD_CHANGE_PENDING when requesting a metadata update for a
   failed device, so we can know with certainty when it completes
 - queue requests that experienced an error on a new queue which
   is only processed after the metadata update completes
 - call raid_end_bio_io() on bios in that queue when the time comes.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/md.c    |  1 +
 drivers/md/raid1.c | 29 ++++++++++++++++++++++++++++-
 drivers/md/raid1.h |  5 +++++
 3 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/drivers/md/md.c b/drivers/md/md.c
index 7d5a6cede9b0..8644ce76e5f8 100644
--- a/drivers/md/md.c
+++ b/drivers/md/md.c
@@ -8629,6 +8629,7 @@ int rdev_set_badblocks(struct md_rdev *rdev, sector_t s, int sectors,
 		/* Make sure they get written out promptly */
 		sysfs_notify_dirent_safe(rdev->sysfs_state);
 		set_bit(MD_CHANGE_CLEAN, &rdev->mddev->flags);
+		set_bit(MD_CHANGE_PENDING, &rdev->mddev->flags);
 		md_wakeup_thread(rdev->mddev->thread);
 	}
 	return rv;
diff --git a/drivers/md/raid1.c b/drivers/md/raid1.c
index 742b50794dfd..3d9ca836247f 100644
--- a/drivers/md/raid1.c
+++ b/drivers/md/raid1.c
@@ -1508,6 +1508,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
 	 */
 	set_bit(MD_RECOVERY_INTR, &mddev->recovery);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	set_bit(MD_CHANGE_PENDING, &mddev->flags);
 	printk(KERN_ALERT
 	       "md/raid1:%s: Disk failure on %s, disabling device.\n"
 	       "md/raid1:%s: Operation continuing on %d devices.\n",
@@ -2289,6 +2290,7 @@ static void handle_sync_write_finished(struct r1conf *conf, struct r1bio *r1_bio
 static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
 {
 	int m;
+	bool fail = false;
 	for (m = 0; m < conf->raid_disks * 2 ; m++)
 		if (r1_bio->bios[m] == IO_MADE_GOOD) {
 			struct md_rdev *rdev = conf->mirrors[m].rdev;
@@ -2301,6 +2303,7 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
 			 * narrow down and record precise write
 			 * errors.
 			 */
+			fail = true;
 			if (!narrow_write_error(r1_bio, m)) {
 				md_error(conf->mddev,
 					 conf->mirrors[m].rdev);
@@ -2312,7 +2315,13 @@ static void handle_write_finished(struct r1conf *conf, struct r1bio *r1_bio)
 		}
 	if (test_bit(R1BIO_WriteError, &r1_bio->state))
 		close_write(r1_bio);
-	raid_end_bio_io(r1_bio);
+	if (fail) {
+		spin_lock_irq(&conf->device_lock);
+		list_add(&r1_bio->retry_list, &conf->bio_end_io_list);
+		spin_unlock_irq(&conf->device_lock);
+		md_wakeup_thread(conf->mddev->thread);
+	} else
+		raid_end_bio_io(r1_bio);
 }
 
 static void handle_read_error(struct r1conf *conf, struct r1bio *r1_bio)
@@ -2418,6 +2427,23 @@ static void raid1d(struct md_thread *thread)
 
 	md_check_recovery(mddev);
 
+	if (!list_empty_careful(&conf->bio_end_io_list) &&
+	    !test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+		LIST_HEAD(tmp);
+		spin_lock_irqsave(&conf->device_lock, flags);
+		if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+			list_add(&tmp, &conf->bio_end_io_list);
+			list_del_init(&conf->bio_end_io_list);
+		}
+		spin_unlock_irqrestore(&conf->device_lock, flags);
+		while (!list_empty(&tmp)) {
+			r1_bio = list_first_entry(&conf->bio_end_io_list,
+						  struct r1bio, retry_list);
+			list_del(&r1_bio->retry_list);
+			raid_end_bio_io(r1_bio);
+		}
+	}
+
 	blk_start_plug(&plug);
 	for (;;) {
 
@@ -2819,6 +2845,7 @@ static struct r1conf *setup_conf(struct mddev *mddev)
 	conf->raid_disks = mddev->raid_disks;
 	conf->mddev = mddev;
 	INIT_LIST_HEAD(&conf->retry_list);
+	INIT_LIST_HEAD(&conf->bio_end_io_list);
 
 	spin_lock_init(&conf->resync_lock);
 	init_waitqueue_head(&conf->wait_barrier);
diff --git a/drivers/md/raid1.h b/drivers/md/raid1.h
index 14ebb288c1ef..c52d7139c5d7 100644
--- a/drivers/md/raid1.h
+++ b/drivers/md/raid1.h
@@ -61,6 +61,11 @@ struct r1conf {
 	 * block, or anything else.
 	 */
 	struct list_head	retry_list;
+	/* A separate list of r1bio which just need raid_end_bio_io called.
+	 * This mustn't happen for writes which had any errors if the superblock
+	 * needs to be written.
+	 */
+	struct list_head	bio_end_io_list;
 
 	/* queue pending writes to be submitted on unplug */
 	struct bio_list		pending_bio_list;

From 95af587e95aacb9cfda4a9641069a5244a540dc8 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Fri, 14 Aug 2015 11:26:17 +1000
Subject: [PATCH 31/33] md/raid10: ensure device failure recorded before write
 request returns.

When a write to one of the legs of a RAID10 fails, the failure is
recorded in the metadata of the other legs so that after a restart
the data on the failed drive wont be trusted even if that drive seems
to be working again (maybe a cable was unplugged).

Currently there is no interlock between the write request completing
and the metadata update.  So it is possible that the write will
complete, the app will confirm success in some way, and then the
machine will crash before the metadata update completes.

This is an extremely small hole for a racy to fit in, but it is
theoretically possible and so should be closed.

So:
 - set MD_CHANGE_PENDING when requesting a metadata update for a
   failed device, so we can know with certainty when it completes
 - queue requests that experienced an error on a new queue which
   is only processed after the metadata update completes
 - call raid_end_bio_io() on bios in that queue when the time comes.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid10.c | 29 ++++++++++++++++++++++++++++-
 drivers/md/raid10.h |  6 ++++++
 2 files changed, 34 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid10.c b/drivers/md/raid10.c
index dcc0e9b3ee92..a14c304aa751 100644
--- a/drivers/md/raid10.c
+++ b/drivers/md/raid10.c
@@ -1681,6 +1681,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
 	set_bit(Blocked, &rdev->flags);
 	set_bit(Faulty, &rdev->flags);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	set_bit(MD_CHANGE_PENDING, &mddev->flags);
 	spin_unlock_irqrestore(&conf->device_lock, flags);
 	printk(KERN_ALERT
 	       "md/raid10:%s: Disk failure on %s, disabling device.\n"
@@ -2738,6 +2739,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 		}
 		put_buf(r10_bio);
 	} else {
+		bool fail = false;
 		for (m = 0; m < conf->copies; m++) {
 			int dev = r10_bio->devs[m].devnum;
 			struct bio *bio = r10_bio->devs[m].bio;
@@ -2750,6 +2752,7 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 				rdev_dec_pending(rdev, conf->mddev);
 			} else if (bio != NULL &&
 				   !test_bit(BIO_UPTODATE, &bio->bi_flags)) {
+				fail = true;
 				if (!narrow_write_error(r10_bio, m)) {
 					md_error(conf->mddev, rdev);
 					set_bit(R10BIO_Degraded,
@@ -2770,7 +2773,13 @@ static void handle_write_completed(struct r10conf *conf, struct r10bio *r10_bio)
 		if (test_bit(R10BIO_WriteError,
 			     &r10_bio->state))
 			close_write(r10_bio);
-		raid_end_bio_io(r10_bio);
+		if (fail) {
+			spin_lock_irq(&conf->device_lock);
+			list_add(&r10_bio->retry_list, &conf->bio_end_io_list);
+			spin_unlock_irq(&conf->device_lock);
+			md_wakeup_thread(conf->mddev->thread);
+		} else
+			raid_end_bio_io(r10_bio);
 	}
 }
 
@@ -2785,6 +2794,23 @@ static void raid10d(struct md_thread *thread)
 
 	md_check_recovery(mddev);
 
+	if (!list_empty_careful(&conf->bio_end_io_list) &&
+	    !test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+		LIST_HEAD(tmp);
+		spin_lock_irqsave(&conf->device_lock, flags);
+		if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+			list_add(&tmp, &conf->bio_end_io_list);
+			list_del_init(&conf->bio_end_io_list);
+		}
+		spin_unlock_irqrestore(&conf->device_lock, flags);
+		while (!list_empty(&tmp)) {
+			r10_bio = list_first_entry(&conf->bio_end_io_list,
+						  struct r10bio, retry_list);
+			list_del(&r10_bio->retry_list);
+			raid_end_bio_io(r10_bio);
+		}
+	}
+
 	blk_start_plug(&plug);
 	for (;;) {
 
@@ -3559,6 +3585,7 @@ static struct r10conf *setup_conf(struct mddev *mddev)
 	conf->reshape_safe = conf->reshape_progress;
 	spin_lock_init(&conf->device_lock);
 	INIT_LIST_HEAD(&conf->retry_list);
+	INIT_LIST_HEAD(&conf->bio_end_io_list);
 
 	spin_lock_init(&conf->resync_lock);
 	init_waitqueue_head(&conf->wait_barrier);
diff --git a/drivers/md/raid10.h b/drivers/md/raid10.h
index 5ee6473ddc2c..6fc2c75759bf 100644
--- a/drivers/md/raid10.h
+++ b/drivers/md/raid10.h
@@ -53,6 +53,12 @@ struct r10conf {
 	sector_t		offset_diff;
 
 	struct list_head	retry_list;
+	/* A separate list of r1bio which just need raid_end_bio_io called.
+	 * This mustn't happen for writes which had any errors if the superblock
+	 * needs to be written.
+	 */
+	struct list_head	bio_end_io_list;
+
 	/* queue pending writes and submit them on unplug */
 	struct bio_list		pending_bio_list;
 	int			pending_count;

From 34a6f80e1639b124f24b5fadc1d45d69417cbace Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Fri, 14 Aug 2015 12:07:57 +1000
Subject: [PATCH 32/33] md/raid5: use bio_list for the list of bios to return.

This will make it easier to splice two lists together which will
be needed in future patch.

Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid5.c | 41 +++++++++++++++--------------------------
 drivers/md/raid5.h |  2 +-
 2 files changed, 16 insertions(+), 27 deletions(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 7b0c706f33e7..214dcca0d7f8 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -223,18 +223,14 @@ static int raid6_idx_to_slot(int idx, struct stripe_head *sh,
 	return slot;
 }
 
-static void return_io(struct bio *return_bi)
+static void return_io(struct bio_list *return_bi)
 {
-	struct bio *bi = return_bi;
-	while (bi) {
-
-		return_bi = bi->bi_next;
-		bi->bi_next = NULL;
+	struct bio *bi;
+	while ((bi = bio_list_pop(return_bi)) != NULL) {
 		bi->bi_iter.bi_size = 0;
 		trace_block_bio_complete(bdev_get_queue(bi->bi_bdev),
 					 bi, 0);
 		bio_endio(bi, 0);
-		bi = return_bi;
 	}
 }
 
@@ -1177,7 +1173,7 @@ async_copy_data(int frombio, struct bio *bio, struct page **page,
 static void ops_complete_biofill(void *stripe_head_ref)
 {
 	struct stripe_head *sh = stripe_head_ref;
-	struct bio *return_bi = NULL;
+	struct bio_list return_bi = BIO_EMPTY_LIST;
 	int i;
 
 	pr_debug("%s: stripe %llu\n", __func__,
@@ -1201,17 +1197,15 @@ static void ops_complete_biofill(void *stripe_head_ref)
 			while (rbi && rbi->bi_iter.bi_sector <
 				dev->sector + STRIPE_SECTORS) {
 				rbi2 = r5_next_bio(rbi, dev->sector);
-				if (!raid5_dec_bi_active_stripes(rbi)) {
-					rbi->bi_next = return_bi;
-					return_bi = rbi;
-				}
+				if (!raid5_dec_bi_active_stripes(rbi))
+					bio_list_add(&return_bi, rbi);
 				rbi = rbi2;
 			}
 		}
 	}
 	clear_bit(STRIPE_BIOFILL_RUN, &sh->state);
 
-	return_io(return_bi);
+	return_io(&return_bi);
 
 	set_bit(STRIPE_HANDLE, &sh->state);
 	release_stripe(sh);
@@ -3071,7 +3065,7 @@ static void stripe_set_idx(sector_t stripe, struct r5conf *conf, int previous,
 static void
 handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 				struct stripe_head_state *s, int disks,
-				struct bio **return_bi)
+				struct bio_list *return_bi)
 {
 	int i;
 	BUG_ON(sh->batch_head);
@@ -3115,8 +3109,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 			clear_bit(BIO_UPTODATE, &bi->bi_flags);
 			if (!raid5_dec_bi_active_stripes(bi)) {
 				md_write_end(conf->mddev);
-				bi->bi_next = *return_bi;
-				*return_bi = bi;
+				bio_list_add(return_bi, bi);
 			}
 			bi = nextbi;
 		}
@@ -3139,8 +3132,7 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 			clear_bit(BIO_UPTODATE, &bi->bi_flags);
 			if (!raid5_dec_bi_active_stripes(bi)) {
 				md_write_end(conf->mddev);
-				bi->bi_next = *return_bi;
-				*return_bi = bi;
+				bio_list_add(return_bi, bi);
 			}
 			bi = bi2;
 		}
@@ -3162,10 +3154,8 @@ handle_failed_stripe(struct r5conf *conf, struct stripe_head *sh,
 				struct bio *nextbi =
 					r5_next_bio(bi, sh->dev[i].sector);
 				clear_bit(BIO_UPTODATE, &bi->bi_flags);
-				if (!raid5_dec_bi_active_stripes(bi)) {
-					bi->bi_next = *return_bi;
-					*return_bi = bi;
-				}
+				if (!raid5_dec_bi_active_stripes(bi))
+					bio_list_add(return_bi, bi);
 				bi = nextbi;
 			}
 		}
@@ -3444,7 +3434,7 @@ static void break_stripe_batch_list(struct stripe_head *head_sh,
  * never LOCKED, so we don't need to test 'failed' directly.
  */
 static void handle_stripe_clean_event(struct r5conf *conf,
-	struct stripe_head *sh, int disks, struct bio **return_bi)
+	struct stripe_head *sh, int disks, struct bio_list *return_bi)
 {
 	int i;
 	struct r5dev *dev;
@@ -3478,8 +3468,7 @@ returnbi:
 					wbi2 = r5_next_bio(wbi, dev->sector);
 					if (!raid5_dec_bi_active_stripes(wbi)) {
 						md_write_end(conf->mddev);
-						wbi->bi_next = *return_bi;
-						*return_bi = wbi;
+						bio_list_add(return_bi, wbi);
 					}
 					wbi = wbi2;
 				}
@@ -4612,7 +4601,7 @@ finish:
 			md_wakeup_thread(conf->mddev->thread);
 	}
 
-	return_io(s.return_bi);
+	return_io(&s.return_bi);
 
 	clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
 }
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index d05144278690..1de82a6e4c23 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -265,7 +265,7 @@ struct stripe_head_state {
 	int dec_preread_active;
 	unsigned long ops_request;
 
-	struct bio *return_bi;
+	struct bio_list return_bi;
 	struct md_rdev *blocked_rdev;
 	int handle_bad_blocks;
 };

From c3cce6cda162eb2b2960a85d9c8992f4f3be85d0 Mon Sep 17 00:00:00 2001
From: NeilBrown <neilb@suse.com>
Date: Fri, 14 Aug 2015 12:47:33 +1000
Subject: [PATCH 33/33] md/raid5: ensure device failure recorded before write
 request returns.

When a write to one of the devices of a RAID5/6 fails, the failure is
recorded in the metadata of the other devices so that after a restart
the data on the failed drive wont be trusted even if that drive seems
to be working again (maybe a cable was unplugged).

Similarly when we record a bad-block in response to a write failure,
we must not let the write complete until the bad-block update is safe.

Currently there is no interlock between the write request completing
and the metadata update.  So it is possible that the write will
complete, the app will confirm success in some way, and then the
machine will crash before the metadata update completes.

This is an extremely small hole for a racy to fit in, but it is
theoretically possible and so should be closed.

So:
 - set MD_CHANGE_PENDING when requesting a metadata update for a
   failed device, so we can know with certainty when it completes
 - queue requests that completed when MD_CHANGE_PENDING is set to
   only be processed after the metadata update completes
 - call raid_end_bio_io() on bios in that queue when the time comes.


Signed-off-by: NeilBrown <neilb@suse.com>
---
 drivers/md/raid5.c | 24 +++++++++++++++++++++++-
 drivers/md/raid5.h |  3 +++
 2 files changed, 26 insertions(+), 1 deletion(-)

diff --git a/drivers/md/raid5.c b/drivers/md/raid5.c
index 214dcca0d7f8..4195064460d0 100644
--- a/drivers/md/raid5.c
+++ b/drivers/md/raid5.c
@@ -2513,6 +2513,7 @@ static void error(struct mddev *mddev, struct md_rdev *rdev)
 	set_bit(Blocked, &rdev->flags);
 	set_bit(Faulty, &rdev->flags);
 	set_bit(MD_CHANGE_DEVS, &mddev->flags);
+	set_bit(MD_CHANGE_PENDING, &mddev->flags);
 	printk(KERN_ALERT
 	       "md/raid:%s: Disk failure on %s, disabling device.\n"
 	       "md/raid:%s: Operation continuing on %d devices.\n",
@@ -4601,7 +4602,15 @@ finish:
 			md_wakeup_thread(conf->mddev->thread);
 	}
 
-	return_io(&s.return_bi);
+	if (!bio_list_empty(&s.return_bi)) {
+		if (test_bit(MD_CHANGE_PENDING, &conf->mddev->flags)) {
+			spin_lock_irq(&conf->device_lock);
+			bio_list_merge(&conf->return_bi, &s.return_bi);
+			spin_unlock_irq(&conf->device_lock);
+			md_wakeup_thread(conf->mddev->thread);
+		} else
+			return_io(&s.return_bi);
+	}
 
 	clear_bit_unlock(STRIPE_ACTIVE, &sh->state);
 }
@@ -5817,6 +5826,18 @@ static void raid5d(struct md_thread *thread)
 
 	md_check_recovery(mddev);
 
+	if (!bio_list_empty(&conf->return_bi) &&
+	    !test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+		struct bio_list tmp = BIO_EMPTY_LIST;
+		spin_lock_irq(&conf->device_lock);
+		if (!test_bit(MD_CHANGE_PENDING, &mddev->flags)) {
+			bio_list_merge(&tmp, &conf->return_bi);
+			bio_list_init(&conf->return_bi);
+		}
+		spin_unlock_irq(&conf->device_lock);
+		return_io(&tmp);
+	}
+
 	blk_start_plug(&plug);
 	handled = 0;
 	spin_lock_irq(&conf->device_lock);
@@ -6476,6 +6497,7 @@ static struct r5conf *setup_conf(struct mddev *mddev)
 	INIT_LIST_HEAD(&conf->hold_list);
 	INIT_LIST_HEAD(&conf->delayed_list);
 	INIT_LIST_HEAD(&conf->bitmap_list);
+	bio_list_init(&conf->return_bi);
 	init_llist_head(&conf->released_stripes);
 	atomic_set(&conf->active_stripes, 0);
 	atomic_set(&conf->preread_active_stripes, 0);
diff --git a/drivers/md/raid5.h b/drivers/md/raid5.h
index 1de82a6e4c23..828c2925e68f 100644
--- a/drivers/md/raid5.h
+++ b/drivers/md/raid5.h
@@ -476,6 +476,9 @@ struct r5conf {
 	int			skip_copy; /* Don't copy data from bio to stripe cache */
 	struct list_head	*last_hold; /* detect hold_list promotions */
 
+	/* bios to have bi_end_io called after metadata is synced */
+	struct bio_list		return_bi;
+
 	atomic_t		reshape_stripes; /* stripes with pending writes for reshape */
 	/* unfortunately we need two cache names as we temporarily have
 	 * two caches.