From ee376dbdf27728a2f3d30e2ba10fa387cc4c645b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sat, 10 Jan 2015 19:47:10 -0800
Subject: [PATCH 01/56] rcu: Consolidate rcu_synchronize and wakeme_after_rcu()

There are currently duplicate identical definitions of the
rcu_synchronize() structure and the wakeme_after_rcu() function.
Thie commit therefore consolidates them.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcupdate.h |  9 +++++++++
 kernel/rcu/srcu.c        | 17 -----------------
 kernel/rcu/update.c      | 15 ++++++---------
 3 files changed, 15 insertions(+), 26 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 78097491cd99..3e6afed51051 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -195,6 +195,15 @@ void call_rcu_sched(struct rcu_head *head,
 
 void synchronize_sched(void);
 
+/*
+ * Structure allowing asynchronous waiting on RCU.
+ */
+struct rcu_synchronize {
+	struct rcu_head head;
+	struct completion completion;
+};
+void wakeme_after_rcu(struct rcu_head *head);
+
 /**
  * call_rcu_tasks() - Queue an RCU for invocation task-based grace period
  * @head: structure to be used for queueing the RCU updates.
diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index 445bf8ffe3fb..81f53b504c18 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -402,23 +402,6 @@ void call_srcu(struct srcu_struct *sp, struct rcu_head *head,
 }
 EXPORT_SYMBOL_GPL(call_srcu);
 
-struct rcu_synchronize {
-	struct rcu_head head;
-	struct completion completion;
-};
-
-/*
- * Awaken the corresponding synchronize_srcu() instance now that a
- * grace period has elapsed.
- */
-static void wakeme_after_rcu(struct rcu_head *head)
-{
-	struct rcu_synchronize *rcu;
-
-	rcu = container_of(head, struct rcu_synchronize, head);
-	complete(&rcu->completion);
-}
-
 static void srcu_advance_batches(struct srcu_struct *sp, int trycount);
 static void srcu_reschedule(struct srcu_struct *sp);
 
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index e0d31a345ee6..8864ed90f0d7 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -199,16 +199,13 @@ EXPORT_SYMBOL_GPL(rcu_read_lock_bh_held);
 
 #endif /* #ifdef CONFIG_DEBUG_LOCK_ALLOC */
 
-struct rcu_synchronize {
-	struct rcu_head head;
-	struct completion completion;
-};
-
-/*
- * Awaken the corresponding synchronize_rcu() instance now that a
- * grace period has elapsed.
+/**
+ * wakeme_after_rcu() - Callback function to awaken a task after grace period
+ * @head: Pointer to rcu_head member within rcu_synchronize structure
+ *
+ * Awaken the corresponding task now that a grace period has elapsed.
  */
-static void wakeme_after_rcu(struct rcu_head  *head)
+void wakeme_after_rcu(struct rcu_head *head)
 {
 	struct rcu_synchronize *rcu;
 

From 3f47da0f32f5e43e6ae901129d5b9c2600011a2c Mon Sep 17 00:00:00 2001
From: Lai Jiangshan <laijs@cn.fujitsu.com>
Date: Tue, 13 Jan 2015 15:30:34 +0800
Subject: [PATCH 02/56] rcu_tree: Avoid touching rnp->completed when a new GP
 is started

In rcu_gp_init(), rnp->completed equals to rsp->completed in THEORY,
we don't need to touch it normally.  If something goes wrong,
it will complain and fixup rnp->completed and avoid oops.
This commit thus avoids the normal needless store to rnp->completed.

Signed-off-by: Lai Jiangshan <laijs@cn.fujitsu.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 48d640ca1a05..077d0b700f74 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1757,8 +1757,8 @@ static int rcu_gp_init(struct rcu_state *rsp)
 		rcu_preempt_check_blocked_tasks(rnp);
 		rnp->qsmask = rnp->qsmaskinit;
 		ACCESS_ONCE(rnp->gpnum) = rsp->gpnum;
-		WARN_ON_ONCE(rnp->completed != rsp->completed);
-		ACCESS_ONCE(rnp->completed) = rsp->completed;
+		if (WARN_ON_ONCE(rnp->completed != rsp->completed))
+			ACCESS_ONCE(rnp->completed) = rsp->completed;
 		if (rnp == rdp->mynode)
 			(void)__note_gp_changes(rsp, rnp, rdp);
 		rcu_preempt_boost_start_gp(rnp);

From d2af1ad73e7a22ed3e04374896fee0eb300c05c3 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 20 Jan 2015 23:54:59 -0800
Subject: [PATCH 03/56] documentation: Update rcutree.kthread_prio for
 grace-period kthread use

Now that the rcutree.kthread_prio kernel boot parameter also controls
the priority of the grace-period kthreads, update the documentation to
reflect this change.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 Documentation/kernel-parameters.txt | 14 +++++++++-----
 1 file changed, 9 insertions(+), 5 deletions(-)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index bfcb1a62a7b4..d913e3b4bf0d 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2991,11 +2991,15 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			value is one, and maximum value is HZ.
 
 	rcutree.kthread_prio= 	 [KNL,BOOT]
-			Set the SCHED_FIFO priority of the RCU
-			per-CPU kthreads (rcuc/N). This value is also
-			used for the priority of the RCU boost threads
-			(rcub/N). Valid values are 1-99 and the default
-			is 1 (the least-favored priority).
+			Set the SCHED_FIFO priority of the RCU per-CPU
+			kthreads (rcuc/N). This value is also used for
+			the priority of the RCU boost threads (rcub/N)
+			and for the RCU grace-period kthreads (rcu_bh,
+			rcu_preempt, and rcu_sched). If RCU_BOOST is
+			set, valid values are 1-99 and the default is 1
+			(the least-favored priority).  Otherwise, when
+			RCU_BOOST is not set, valid values are 0-99 and
+			the default is zero (non-realtime operation).
 
 	rcutree.rcu_nocb_leader_stride= [KNL]
 			Set the number of NOCB kthread groups, which

From 89bf5d82ed451f02329bbbb06ac365e96b18804d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sat, 24 Jan 2015 22:24:14 -0800
Subject: [PATCH 04/56] documentation: Update based on on-demand vmstat workers

Now that the on-demand vmstat workers commit is in mainline, it is
possible to eliminate vmstat_update()-induced OS jitter.  This commit
updates the documentation accordingly.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 Documentation/kernel-per-CPU-kthreads.txt | 18 ++++++++++--------
 1 file changed, 10 insertions(+), 8 deletions(-)

diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt
index f3cd299fcc41..81fe051c4447 100644
--- a/Documentation/kernel-per-CPU-kthreads.txt
+++ b/Documentation/kernel-per-CPU-kthreads.txt
@@ -190,14 +190,16 @@ To reduce its OS jitter, do any of the following:
 		on each CPU, including cs_dbs_timer() and od_dbs_timer().
 		WARNING:  Please check your CPU specifications to
 		make sure that this is safe on your particular system.
-	d.	It is not possible to entirely get rid of OS jitter
-		from vmstat_update() on CONFIG_SMP=y systems, but you
-		can decrease its frequency by writing a large value
-		to /proc/sys/vm/stat_interval.	The default value is
-		HZ, for an interval of one second.  Of course, larger
-		values will make your virtual-memory statistics update
-		more slowly.  Of course, you can also run your workload
-		at a real-time priority, thus preempting vmstat_update(),
+	d.	As of v3.18, Christoph Lameter's on-demand vmstat workers
+		commit prevents OS jitter due to vmstat_update() on
+		CONFIG_SMP=y systems.  Before v3.18, is not possible
+		to entirely get rid of the OS jitter, but you can
+		decrease its frequency by writing a large value to
+		/proc/sys/vm/stat_interval.  The default value is HZ,
+		for an interval of one second.	Of course, larger values
+		will make your virtual-memory statistics update more
+		slowly.  Of course, you can also run your workload at
+		a real-time priority, thus preempting vmstat_update(),
 		but if your workload is CPU-bound, this is a bad idea.
 		However, there is an RFC patch from Christoph Lameter
 		(based on an earlier one from Gilad Ben-Yossef) that

From c25197841efe53258abb22cfd894a729a272edf9 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 25 Jan 2015 11:28:28 -0800
Subject: [PATCH 05/56] documentation: Update NO_HZ_FULL interaction with POSIX
 timers

POSIX timers are no longer starved on adaptive-ticks CPUs.  Instead, they
prevent affected CPUs from entering adaptive-ticks mode.  This commit
therefore updates the NO_HZ.txt documentation.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 Documentation/timers/NO_HZ.txt | 10 +++-------
 1 file changed, 3 insertions(+), 7 deletions(-)

diff --git a/Documentation/timers/NO_HZ.txt b/Documentation/timers/NO_HZ.txt
index cca122f25120..6eaf576294f3 100644
--- a/Documentation/timers/NO_HZ.txt
+++ b/Documentation/timers/NO_HZ.txt
@@ -158,13 +158,9 @@ not come for free:
 	to the need to inform kernel subsystems (such as RCU) about
 	the change in mode.
 
-3.	POSIX CPU timers on adaptive-tick CPUs may miss their deadlines
-	(perhaps indefinitely) because they currently rely on
-	scheduling-tick interrupts.  This will likely be fixed in
-	one of two ways: (1) Prevent CPUs with POSIX CPU timers from
-	entering adaptive-tick mode, or (2) Use hrtimers or other
-	adaptive-ticks-immune mechanism to cause the POSIX CPU timer to
-	fire properly.
+3.	POSIX CPU timers prevent CPUs from entering adaptive-tick mode.
+	Real-time applications needing to take actions based on CPU time
+	consumption need to use other means of doing so.
 
 4.	If there are more perf events pending than the hardware can
 	accommodate, they are normally round-robined so as to collect

From f1360570f420b8b122e7f1cccf456ff7133a3007 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 25 Jan 2015 11:48:18 -0800
Subject: [PATCH 06/56] documentation: Update per-CPU kthreads documentation

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 Documentation/kernel-per-CPU-kthreads.txt | 16 +++++++++++-----
 1 file changed, 11 insertions(+), 5 deletions(-)

diff --git a/Documentation/kernel-per-CPU-kthreads.txt b/Documentation/kernel-per-CPU-kthreads.txt
index 81fe051c4447..f4cbfe0ba108 100644
--- a/Documentation/kernel-per-CPU-kthreads.txt
+++ b/Documentation/kernel-per-CPU-kthreads.txt
@@ -205,7 +205,9 @@ To reduce its OS jitter, do any of the following:
 		(based on an earlier one from Gilad Ben-Yossef) that
 		reduces or even eliminates vmstat overhead for some
 		workloads at https://lkml.org/lkml/2013/9/4/379.
-	e.	If running on high-end powerpc servers, build with
+	e.	Boot with "elevator=noop" to avoid workqueue use by
+		the block layer.
+	f.	If running on high-end powerpc servers, build with
 		CONFIG_PPC_RTAS_DAEMON=n.  This prevents the RTAS
 		daemon from running on each CPU every second or so.
 		(This will require editing Kconfig files and will defeat
@@ -213,12 +215,12 @@ To reduce its OS jitter, do any of the following:
 		due to the rtas_event_scan() function.
 		WARNING:  Please check your CPU specifications to
 		make sure that this is safe on your particular system.
-	f.	If running on Cell Processor, build your kernel with
+	g.	If running on Cell Processor, build your kernel with
 		CBE_CPUFREQ_SPU_GOVERNOR=n to avoid OS jitter from
 		spu_gov_work().
 		WARNING:  Please check your CPU specifications to
 		make sure that this is safe on your particular system.
-	g.	If running on PowerMAC, build your kernel with
+	h.	If running on PowerMAC, build your kernel with
 		CONFIG_PMAC_RACKMETER=n to disable the CPU-meter,
 		avoiding OS jitter from rackmeter_do_timer().
 
@@ -260,8 +262,12 @@ Purpose: Detect software lockups on each CPU.
 To reduce its OS jitter, do at least one of the following:
 1.	Build with CONFIG_LOCKUP_DETECTOR=n, which will prevent these
 	kthreads from being created in the first place.
-2.	Echo a zero to /proc/sys/kernel/watchdog to disable the
+2.	Boot with "nosoftlockup=0", which will also prevent these kthreads
+	from being created.  Other related watchdog and softlockup boot
+	parameters may be found in Documentation/kernel-parameters.txt
+	and Documentation/watchdog/watchdog-parameters.txt.
+3.	Echo a zero to /proc/sys/kernel/watchdog to disable the
 	watchdog timer.
-3.	Echo a large number of /proc/sys/kernel/watchdog_thresh in
+4.	Echo a large number of /proc/sys/kernel/watchdog_thresh in
 	order to reduce the frequency of OS jitter due to the watchdog
 	timer down to a level that is acceptable for your workload.

From daf1aab9acfaaded09f53fa91dfe6e4e6926ec39 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 2 Feb 2015 08:08:25 -0800
Subject: [PATCH 07/56] documentation: Clarify memory-barrier semantics of
 atomic operations

All value-returning atomic read-modify-write operations must provide full
memory-barrier semantics on both sides of the operation.  This commit
clarifies the documentation to make it clear that these memory-barrier
semantics are provided by the operations themselves, not by their callers.

Reported-by: Peter Hurley <peter@hurleysoftware.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 Documentation/atomic_ops.txt | 45 ++++++++++++++++++------------------
 1 file changed, 23 insertions(+), 22 deletions(-)

diff --git a/Documentation/atomic_ops.txt b/Documentation/atomic_ops.txt
index 183e41bdcb69..dab6da3382d9 100644
--- a/Documentation/atomic_ops.txt
+++ b/Documentation/atomic_ops.txt
@@ -201,11 +201,11 @@ These routines add 1 and subtract 1, respectively, from the given
 atomic_t and return the new counter value after the operation is
 performed.
 
-Unlike the above routines, it is required that explicit memory
-barriers are performed before and after the operation.  It must be
-done such that all memory operations before and after the atomic
-operation calls are strongly ordered with respect to the atomic
-operation itself.
+Unlike the above routines, it is required that these primitives
+include explicit memory barriers that are performed before and after
+the operation.  It must be done such that all memory operations before
+and after the atomic operation calls are strongly ordered with respect
+to the atomic operation itself.
 
 For example, it should behave as if a smp_mb() call existed both
 before and after the atomic operation.
@@ -233,21 +233,21 @@ These two routines increment and decrement by 1, respectively, the
 given atomic counter.  They return a boolean indicating whether the
 resulting counter value was zero or not.
 
-It requires explicit memory barrier semantics around the operation as
-above.
+Again, these primitives provide explicit memory barrier semantics around
+the atomic operation.
 
 	int atomic_sub_and_test(int i, atomic_t *v);
 
 This is identical to atomic_dec_and_test() except that an explicit
-decrement is given instead of the implicit "1".  It requires explicit
-memory barrier semantics around the operation.
+decrement is given instead of the implicit "1".  This primitive must
+provide explicit memory barrier semantics around the operation.
 
 	int atomic_add_negative(int i, atomic_t *v);
 
-The given increment is added to the given atomic counter value.  A
-boolean is return which indicates whether the resulting counter value
-is negative.  It requires explicit memory barrier semantics around the
-operation.
+The given increment is added to the given atomic counter value.  A boolean
+is return which indicates whether the resulting counter value is negative.
+This primitive must provide explicit memory barrier semantics around
+the operation.
 
 Then:
 
@@ -257,7 +257,7 @@ This performs an atomic exchange operation on the atomic variable v, setting
 the given new value.  It returns the old value that the atomic variable v had
 just before the operation.
 
-atomic_xchg requires explicit memory barriers around the operation.
+atomic_xchg must provide explicit memory barriers around the operation.
 
 	int atomic_cmpxchg(atomic_t *v, int old, int new);
 
@@ -266,7 +266,7 @@ with the given old and new values. Like all atomic_xxx operations,
 atomic_cmpxchg will only satisfy its atomicity semantics as long as all
 other accesses of *v are performed through atomic_xxx operations.
 
-atomic_cmpxchg requires explicit memory barriers around the operation.
+atomic_cmpxchg must provide explicit memory barriers around the operation.
 
 The semantics for atomic_cmpxchg are the same as those defined for 'cas'
 below.
@@ -279,8 +279,8 @@ If the atomic value v is not equal to u, this function adds a to v, and
 returns non zero. If v is equal to u then it returns zero. This is done as
 an atomic operation.
 
-atomic_add_unless requires explicit memory barriers around the operation
-unless it fails (returns 0).
+atomic_add_unless must provide explicit memory barriers around the
+operation unless it fails (returns 0).
 
 atomic_inc_not_zero, equivalent to atomic_add_unless(v, 1, 0)
 
@@ -460,9 +460,9 @@ the return value into an int.  There are other places where things
 like this occur as well.
 
 These routines, like the atomic_t counter operations returning values,
-require explicit memory barrier semantics around their execution.  All
-memory operations before the atomic bit operation call must be made
-visible globally before the atomic bit operation is made visible.
+must provide explicit memory barrier semantics around their execution.
+All memory operations before the atomic bit operation call must be
+made visible globally before the atomic bit operation is made visible.
 Likewise, the atomic bit operation must be visible globally before any
 subsequent memory operation is made visible.  For example:
 
@@ -536,8 +536,9 @@ except that two underscores are prefixed to the interface name.
 These non-atomic variants also do not require any special memory
 barrier semantics.
 
-The routines xchg() and cmpxchg() need the same exact memory barriers
-as the atomic and bit operations returning values.
+The routines xchg() and cmpxchg() must provide the same exact
+memory-barrier semantics as the atomic and bit operations returning
+values.
 
 Spinlocks and rwlocks have memory barrier expectations as well.
 The rule to follow is simple:

From ff382810590e7182a1482a225965d6943e61699c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 17 Feb 2015 10:00:06 -0800
Subject: [PATCH 08/56] documentation: Clarify control-dependency pairing

This commit explicitly states that control dependencies pair normally
with other barriers, and gives an example of such pairing.

Reported-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 Documentation/memory-barriers.txt | 42 +++++++++++++++++++++----------
 1 file changed, 29 insertions(+), 13 deletions(-)

diff --git a/Documentation/memory-barriers.txt b/Documentation/memory-barriers.txt
index ca2387ef27ab..6974f1c2b4e1 100644
--- a/Documentation/memory-barriers.txt
+++ b/Documentation/memory-barriers.txt
@@ -592,9 +592,9 @@ See also the subsection on "Cache Coherency" for a more thorough example.
 CONTROL DEPENDENCIES
 --------------------
 
-A control dependency requires a full read memory barrier, not simply a data
-dependency barrier to make it work correctly.  Consider the following bit of
-code:
+A load-load control dependency requires a full read memory barrier, not
+simply a data dependency barrier to make it work correctly.  Consider the
+following bit of code:
 
 	q = ACCESS_ONCE(a);
 	if (q) {
@@ -615,14 +615,15 @@ case what's actually required is:
 	}
 
 However, stores are not speculated.  This means that ordering -is- provided
-in the following example:
+for load-store control dependencies, as in the following example:
 
 	q = ACCESS_ONCE(a);
 	if (q) {
 		ACCESS_ONCE(b) = p;
 	}
 
-Please note that ACCESS_ONCE() is not optional!  Without the
+Control dependencies pair normally with other types of barriers.
+That said, please note that ACCESS_ONCE() is not optional!  Without the
 ACCESS_ONCE(), might combine the load from 'a' with other loads from
 'a', and the store to 'b' with other stores to 'b', with possible highly
 counterintuitive effects on ordering.
@@ -813,6 +814,8 @@ In summary:
       barrier() can help to preserve your control dependency.  Please
       see the Compiler Barrier section for more information.
 
+  (*) Control dependencies pair normally with other types of barriers.
+
   (*) Control dependencies do -not- provide transitivity.  If you
       need transitivity, use smp_mb().
 
@@ -823,14 +826,14 @@ SMP BARRIER PAIRING
 When dealing with CPU-CPU interactions, certain types of memory barrier should
 always be paired.  A lack of appropriate pairing is almost certainly an error.
 
-General barriers pair with each other, though they also pair with
-most other types of barriers, albeit without transitivity.  An acquire
-barrier pairs with a release barrier, but both may also pair with other
-barriers, including of course general barriers.  A write barrier pairs
-with a data dependency barrier, an acquire barrier, a release barrier,
-a read barrier, or a general barrier.  Similarly a read barrier or a
-data dependency barrier pairs with a write barrier, an acquire barrier,
-a release barrier, or a general barrier:
+General barriers pair with each other, though they also pair with most
+other types of barriers, albeit without transitivity.  An acquire barrier
+pairs with a release barrier, but both may also pair with other barriers,
+including of course general barriers.  A write barrier pairs with a data
+dependency barrier, a control dependency, an acquire barrier, a release
+barrier, a read barrier, or a general barrier.  Similarly a read barrier,
+control dependency, or a data dependency barrier pairs with a write
+barrier, an acquire barrier, a release barrier, or a general barrier:
 
 	CPU 1		      CPU 2
 	===============	      ===============
@@ -850,6 +853,19 @@ Or:
 			      <data dependency barrier>
 			      y = *x;
 
+Or even:
+
+	CPU 1		      CPU 2
+	===============	      ===============================
+	r1 = ACCESS_ONCE(y);
+	<general barrier>
+	ACCESS_ONCE(y) = 1;   if (r2 = ACCESS_ONCE(x)) {
+			         <implicit control dependency>
+			         ACCESS_ONCE(y) = 1;
+			      }
+
+	assert(r1 == 0 || r2 == 0);
+
 Basically, the read barrier always has to be there, even though it can be of
 the "weaker" type.
 

From d3f3f3f25b1d4ee152f3f19a812c3a282da4c120 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 18 Jan 2015 18:21:09 -0800
Subject: [PATCH 09/56] rcu: Abstract default callback-list initialization from
 init_callback_list()

In preparation for early-boot posting of callbacks, this commit abstracts
initialization of the default (non-no-CB) callbacks list from the
init_callback_list() function into a new init_default_callback_list()
function.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 48d640ca1a05..f8cdb92da10b 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1327,18 +1327,28 @@ void rcu_cpu_stall_reset(void)
 		ACCESS_ONCE(rsp->jiffies_stall) = jiffies + ULONG_MAX / 2;
 }
 
+/*
+ * Initialize the specified rcu_data structure's default callback list
+ * to empty.  The default callback list is the one that is not used by
+ * no-callbacks CPUs.
+ */
+static void init_default_callback_list(struct rcu_data *rdp)
+{
+	int i;
+
+	rdp->nxtlist = NULL;
+	for (i = 0; i < RCU_NEXT_SIZE; i++)
+		rdp->nxttail[i] = &rdp->nxtlist;
+}
+
 /*
  * Initialize the specified rcu_data structure's callback list to empty.
  */
 static void init_callback_list(struct rcu_data *rdp)
 {
-	int i;
-
 	if (init_nocb_callback_list(rdp))
 		return;
-	rdp->nxtlist = NULL;
-	for (i = 0; i < RCU_NEXT_SIZE; i++)
-		rdp->nxttail[i] = &rdp->nxtlist;
+	init_default_callback_list(rdp);
 }
 
 /*

From 2723249a31a68ccc0ec8ac59a905d7f9430bf8f6 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 20 Jan 2015 22:44:13 -0800
Subject: [PATCH 10/56] rcu: Wire ->rda pointers at compile time

This commit wires up the rcu_state structures' ->rda pointers to the
per-CPU rcu_data structures at compile time, thus ensuring that this
linkage is present at early boot, in turn allowing posting of callbacks
before rcu_init() is executed.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f8cdb92da10b..d2fa95e4a268 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -91,8 +91,10 @@ static const char *tp_##sname##_varname __used __tracepoint_string = sname##_var
 
 #define RCU_STATE_INITIALIZER(sname, sabbr, cr) \
 DEFINE_RCU_TPS(sname) \
+DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data); \
 struct rcu_state sname##_state = { \
 	.level = { &sname##_state.node[0] }, \
+	.rda = &sname##_data, \
 	.call = cr, \
 	.fqs_state = RCU_GP_IDLE, \
 	.gpnum = 0UL - 300UL, \
@@ -104,8 +106,7 @@ struct rcu_state sname##_state = { \
 	.onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
 	.name = RCU_STATE_NAME(sname), \
 	.abbr = sabbr, \
-}; \
-DEFINE_PER_CPU_SHARED_ALIGNED(struct rcu_data, sname##_data)
+}
 
 RCU_STATE_INITIALIZER(rcu_sched, 's', call_rcu_sched);
 RCU_STATE_INITIALIZER(rcu_bh, 'b', call_rcu_bh);
@@ -3843,7 +3844,6 @@ static void __init rcu_init_one(struct rcu_state *rsp,
 		}
 	}
 
-	rsp->rda = rda;
 	init_waitqueue_head(&rsp->gp_wq);
 	rnp = rsp->level[rcu_num_lvls - 1];
 	for_each_possible_cpu(i) {

From 143da9c2fc030a5774674f2ebc2f934fab3dcd9a Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 19 Jan 2015 19:57:32 -0800
Subject: [PATCH 11/56] rcu: Prevent early-boot RCU callbacks from splatting

Currently, a call_rcu() that precedes rcu_init() will splat due to the
callback lists not having yet been initialized.  This commit causes the
first such callback to initialize the boot CPU's RCU callback list.

Note that this commit does not change rcu_init()-time initialization,
which means that the callback will be discarded at rcu_init() time.
Fixing this is the job of later commits.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 20 +++++++++++++++-----
 1 file changed, 15 insertions(+), 5 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index d2fa95e4a268..fcfdbe53bb70 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2838,11 +2838,21 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
 
 		if (cpu != -1)
 			rdp = per_cpu_ptr(rsp->rda, cpu);
-		offline = !__call_rcu_nocb(rdp, head, lazy, flags);
-		WARN_ON_ONCE(offline);
-		/* _call_rcu() is illegal on offline CPU; leak the callback. */
-		local_irq_restore(flags);
-		return;
+		if (likely(rdp->mynode)) {
+			/* Post-boot, so this should be for a no-CBs CPU. */
+			offline = !__call_rcu_nocb(rdp, head, lazy, flags);
+			WARN_ON_ONCE(offline);
+			/* Offline CPU, _call_rcu() illegal, leak callback.  */
+			local_irq_restore(flags);
+			return;
+		}
+		/*
+		 * Very early boot, before rcu_init().  Initialize if needed
+		 * and then drop through to queue the callback.
+		 */
+		BUG_ON(cpu != -1);
+		if (!likely(rdp->nxtlist))
+			init_default_callback_list(rdp);
 	}
 	ACCESS_ONCE(rdp->qlen) = rdp->qlen + 1;
 	if (lazy)

From 59f792d1ef214592ae9b86238fa8fd00f5929b76 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 19 Jan 2015 21:43:40 -0800
Subject: [PATCH 12/56] rcu: Refine diagnostics for lacking kthread for no-CBs
 callbacks

Some diagnostics under CONFIG_PROVE_RCU in rcu_nocb_cpu_needs_barrier()
assume that there can be no early-boot callbacks.  This commit therefore
qualifies the diagnostic with rcu_scheduler_fully_active to permit
early boot callbacks to avoid this splat.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree_plugin.h | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0a571e9a0f1d..75d5f096bcb0 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -1945,7 +1945,8 @@ static bool rcu_nocb_cpu_needs_barrier(struct rcu_state *rsp, int cpu)
 		rhp = ACCESS_ONCE(rdp->nocb_follower_head);
 
 	/* Having no rcuo kthread but CBs after scheduler starts is bad! */
-	if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp) {
+	if (!ACCESS_ONCE(rdp->nocb_kthread) && rhp &&
+	    rcu_scheduler_fully_active) {
 		/* RCU callback enqueued before CPU first came online??? */
 		pr_err("RCU: Never-onlined no-CBs CPU %d has CB %p\n",
 		       cpu, rhp->func);

From 39c8d313c3c546a414cc51b4f6571c2f8cc06407 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 20 Jan 2015 23:42:38 -0800
Subject: [PATCH 13/56] rcu: Avoid clobbering early boot callbacks

When a CPU comes online, it initializes its callback list.  This
is a bad thing if this is the first time that the CPU has come
online and if that CPU has early boot callbacks.  This commit therefore
avoid initializing the callback list if there are callbacks present,
in which case the initial call_rcu() did the initialization for us.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index fcfdbe53bb70..92fd3eab5823 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3583,7 +3583,8 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
 	rdp->qlen_last_fqs_check = 0;
 	rdp->n_force_qs_snap = rsp->n_force_qs;
 	rdp->blimit = blimit;
-	init_callback_list(rdp);  /* Re-enable callbacks on this CPU. */
+	if (!rdp->nxtlist)
+		init_callback_list(rdp);  /* Re-enable callbacks on this CPU. */
 	rdp->dynticks->dynticks_nesting = DYNTICK_TASK_EXIT_IDLE;
 	rcu_sysidle_init_percpu_data(rdp->dynticks);
 	atomic_set(&rdp->dynticks->dynticks,

From 1925d1967c93a1c421271aade7953f6857e9f579 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 18 Jan 2015 17:45:05 -0800
Subject: [PATCH 14/56] rcu: Fix a couple of typos in rcu_all_qs() comment
 header

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 077d0b700f74..4e37c7fd9e29 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -292,10 +292,10 @@ void rcu_note_context_switch(void)
 EXPORT_SYMBOL_GPL(rcu_note_context_switch);
 
 /*
- * Register a quiesecent state for all RCU flavors.  If there is an
+ * Register a quiescent state for all RCU flavors.  If there is an
  * emergency, invoke rcu_momentary_dyntick_idle() to do a heavy-weight
  * dyntick-idle quiescent state visible to other CPUs (but only for those
- * RCU flavors in desparate need of a quiescent state, which will normally
+ * RCU flavors in desperate need of a quiescent state, which will normally
  * be none of them).  Either way, do a lightweight quiescent state for
  * all RCU flavors.
  */

From 9bae6592d7d74dbb409e0dd8004f13af8b8d569e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 18 Jan 2015 18:01:21 -0800
Subject: [PATCH 15/56] rcu: Drive PROVE_RCU directly off of PROVE_LOCKING

In the past, it has been useful to enable PROVE_LOCKING without also
enabling PROVE_RCU.  However, experience with PROVE_RCU over the past
few years has demonstrated its usefulness, so this commit makes
PROVE_LOCKING directly imply PROVE_RCU.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 lib/Kconfig.debug | 11 +----------
 1 file changed, 1 insertion(+), 10 deletions(-)

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index c5cefb3c009c..0766672e4c5f 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1180,16 +1180,7 @@ config DEBUG_CREDENTIALS
 menu "RCU Debugging"
 
 config PROVE_RCU
-	bool "RCU debugging: prove RCU correctness"
-	depends on PROVE_LOCKING
-	default n
-	help
-	 This feature enables lockdep extensions that check for correct
-	 use of RCU APIs.  This is currently under development.  Say Y
-	 if you want to debug RCU usage or help work on the PROVE_RCU
-	 feature.
-
-	 Say N if you are unsure.
+	def_bool PROVE_LOCKING
 
 config PROVE_RCU_REPEATEDLY
 	bool "RCU debugging: don't disable PROVE_RCU on first splat"

From 0d39482c3db13aae1db143d340816108dd53e443 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 18 Feb 2015 12:24:30 -0800
Subject: [PATCH 16/56] rcu: Provide rcu_expedite_gp() and rcu_unexpedite_gp()

Currently, expediting of normal synchronous grace-period primitives
(synchronize_rcu() and friends) is controlled by the rcu_expedited()
boot/sysfs parameter.  This works well, but does not handle nesting.
This commit therefore provides rcu_expedite_gp() to enable expediting
and rcu_unexpedite_gp() to cancel a prior rcu_expedite_gp(), both of
which support nesting.

Reported-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/rcupdate.h | 20 +++++++++++++++++
 kernel/rcu/update.c      | 48 ++++++++++++++++++++++++++++++++++++++++
 2 files changed, 68 insertions(+)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 78097491cd99..57a4d1f73a00 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -48,6 +48,26 @@
 
 extern int rcu_expedited; /* for sysctl */
 
+#ifdef CONFIG_TINY_RCU
+/* Tiny RCU doesn't expedite, as its purpose in life is instead to be tiny. */
+static inline bool rcu_gp_is_expedited(void)  /* Internal RCU use. */
+{
+	return false;
+}
+
+static inline void rcu_expedite_gp(void)
+{
+}
+
+static inline void rcu_unexpedite_gp(void)
+{
+}
+#else /* #ifdef CONFIG_TINY_RCU */
+bool rcu_gp_is_expedited(void);  /* Internal RCU use. */
+void rcu_expedite_gp(void);
+void rcu_unexpedite_gp(void);
+#endif /* #else #ifdef CONFIG_TINY_RCU */
+
 enum rcutorture_type {
 	RCU_FLAVOR,
 	RCU_BH_FLAVOR,
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index e0d31a345ee6..5f850823c187 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -62,6 +62,54 @@ MODULE_ALIAS("rcupdate");
 
 module_param(rcu_expedited, int, 0);
 
+#ifndef CONFIG_TINY_RCU
+
+static atomic_t rcu_expedited_nesting;
+
+/*
+ * Should normal grace-period primitives be expedited?  Intended for
+ * use within RCU.  Note that this function takes the rcu_expedited
+ * sysfs/boot variable into account as well as the rcu_expedite_gp()
+ * nesting.  So looping on rcu_unexpedite_gp() until rcu_gp_is_expedited()
+ * returns false is a -really- bad idea.
+ */
+bool rcu_gp_is_expedited(void)
+{
+	return rcu_expedited || atomic_read(&rcu_expedited_nesting);
+}
+EXPORT_SYMBOL_GPL(rcu_gp_is_expedited);
+
+/**
+ * rcu_expedite_gp - Expedite future RCU grace periods
+ *
+ * After a call to this function, future calls to synchronize_rcu() and
+ * friends act as the corresponding synchronize_rcu_expedited() function
+ * had instead been called.
+ */
+void rcu_expedite_gp(void)
+{
+	atomic_inc(&rcu_expedited_nesting);
+}
+EXPORT_SYMBOL_GPL(rcu_expedite_gp);
+
+/**
+ * rcu_unexpedite_gp - Cancel prior rcu_expedite_gp() invocation
+ *
+ * Undo a prior call to rcu_expedite_gp().  If all prior calls to
+ * rcu_expedite_gp() are undone by a subsequent call to rcu_unexpedite_gp(),
+ * and if the rcu_expedited sysfs/boot parameter is not set, then all
+ * subsequent calls to synchronize_rcu() and friends will return to
+ * their normal non-expedited behavior.
+ */
+void rcu_unexpedite_gp(void)
+{
+	atomic_dec(&rcu_expedited_nesting);
+}
+EXPORT_SYMBOL_GPL(rcu_unexpedite_gp);
+
+#endif /* #ifndef CONFIG_TINY_RCU */
+
+
 #ifdef CONFIG_PREEMPT_RCU
 
 /*

From 4bb3c5f4142a359de46cf14ebab64c4c903d6773 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 18 Feb 2015 16:31:29 -0800
Subject: [PATCH 17/56] rcu: Add rcu_expedite_gp() and rcu_unexpedite_gp() to
 rcutorture

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcutorture.c | 25 +++++++++++++++++++++++++
 1 file changed, 25 insertions(+)

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 30d42aa55d83..3833aa611ae7 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -853,6 +853,8 @@ rcu_torture_fqs(void *arg)
 static int
 rcu_torture_writer(void *arg)
 {
+	bool can_expedite = !rcu_gp_is_expedited();
+	int expediting = 0;
 	unsigned long gp_snap;
 	bool gp_cond1 = gp_cond, gp_exp1 = gp_exp, gp_normal1 = gp_normal;
 	bool gp_sync1 = gp_sync;
@@ -865,6 +867,12 @@ rcu_torture_writer(void *arg)
 	int nsynctypes = 0;
 
 	VERBOSE_TOROUT_STRING("rcu_torture_writer task started");
+	pr_alert("%s" TORTURE_FLAG
+		 " Grace periods expedited from boot/sysfs for %s,\n",
+		 torture_type, cur_ops->name);
+	pr_alert("%s" TORTURE_FLAG
+		 " Testing of dynamic grace-period expediting diabled.\n",
+		 torture_type);
 
 	/* Initialize synctype[] array.  If none set, take default. */
 	if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync)
@@ -949,9 +957,26 @@ rcu_torture_writer(void *arg)
 			}
 		}
 		rcutorture_record_progress(++rcu_torture_current_version);
+		/* Cycle through nesting levels of rcu_expedite_gp() calls. */
+		if (can_expedite &&
+		    !(torture_random(&rand) & 0xff & (!!expediting - 1))) {
+			WARN_ON_ONCE(expediting == 0 && rcu_gp_is_expedited());
+			if (expediting >= 0)
+				rcu_expedite_gp();
+			else
+				rcu_unexpedite_gp();
+			if (++expediting > 3)
+				expediting = -expediting;
+		}
 		rcu_torture_writer_state = RTWS_STUTTER;
 		stutter_wait("rcu_torture_writer");
 	} while (!torture_must_stop());
+	/* Reset expediting back to unexpedited. */
+	if (expediting > 0)
+		expediting = -expediting;
+	while (can_expedite && expediting++ < 0)
+		rcu_unexpedite_gp();
+	WARN_ON_ONCE(can_expedite && rcu_gp_is_expedited());
 	rcu_torture_writer_state = RTWS_STOPPING;
 	torture_kthread_stopping("rcu_torture_writer");
 	return 0;

From 5afff48bdf7481570c9385a8a674a81ffb8f09ee Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 18 Feb 2015 16:39:09 -0800
Subject: [PATCH 18/56] rcu: Update from rcu_expedited variable to
 rcu_gp_is_expedited()

This commit updates open-coded tests of the rcu_expedited variable
to instead use rcu_gp_is_expedited().

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/srcu.c        | 2 +-
 kernel/rcu/tree.c        | 9 +++++----
 kernel/rcu/tree_plugin.h | 2 +-
 3 files changed, 7 insertions(+), 6 deletions(-)

diff --git a/kernel/rcu/srcu.c b/kernel/rcu/srcu.c
index 445bf8ffe3fb..c871f07eff69 100644
--- a/kernel/rcu/srcu.c
+++ b/kernel/rcu/srcu.c
@@ -507,7 +507,7 @@ static void __synchronize_srcu(struct srcu_struct *sp, int trycount)
  */
 void synchronize_srcu(struct srcu_struct *sp)
 {
-	__synchronize_srcu(sp, rcu_expedited
+	__synchronize_srcu(sp, rcu_gp_is_expedited()
 			   ? SYNCHRONIZE_SRCU_EXP_TRYCOUNT
 			   : SYNCHRONIZE_SRCU_TRYCOUNT);
 }
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 48d640ca1a05..4325fbe79d84 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2954,7 +2954,7 @@ void synchronize_sched(void)
 			   "Illegal synchronize_sched() in RCU-sched read-side critical section");
 	if (rcu_blocking_is_gp())
 		return;
-	if (rcu_expedited)
+	if (rcu_gp_is_expedited())
 		synchronize_sched_expedited();
 	else
 		wait_rcu_gp(call_rcu_sched);
@@ -2981,7 +2981,7 @@ void synchronize_rcu_bh(void)
 			   "Illegal synchronize_rcu_bh() in RCU-bh read-side critical section");
 	if (rcu_blocking_is_gp())
 		return;
-	if (rcu_expedited)
+	if (rcu_gp_is_expedited())
 		synchronize_rcu_bh_expedited();
 	else
 		wait_rcu_gp(call_rcu_bh);
@@ -3660,11 +3660,12 @@ static int rcu_pm_notify(struct notifier_block *self,
 	case PM_HIBERNATION_PREPARE:
 	case PM_SUSPEND_PREPARE:
 		if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
-			rcu_expedited = 1;
+			rcu_expedite_gp();
 		break;
 	case PM_POST_HIBERNATION:
 	case PM_POST_SUSPEND:
-		rcu_expedited = 0;
+		if (nr_cpu_ids <= 256) /* Expediting bad for large systems. */
+			rcu_unexpedite_gp();
 		break;
 	default:
 		break;
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0a571e9a0f1d..63726b734d34 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -585,7 +585,7 @@ void synchronize_rcu(void)
 			   "Illegal synchronize_rcu() in RCU read-side critical section");
 	if (!rcu_scheduler_active)
 		return;
-	if (rcu_expedited)
+	if (rcu_gp_is_expedited())
 		synchronize_rcu_expedited();
 	else
 		wait_rcu_gp(call_rcu);

From ee42571f4381f184e2672dd34ab411e5bf5bd5e0 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 19 Feb 2015 10:51:32 -0800
Subject: [PATCH 19/56] rcu: Add Kconfig option to expedite grace periods
 during boot

This commit adds a CONFIG_RCU_EXPEDITE_BOOT Kconfig parameter
that emulates a very early boot rcu_expedite_gp().  A late-boot
call to rcu_end_inkernel_boot() will provide the corresponding
rcu_unexpedite_gp().  The late-boot call to rcu_end_inkernel_boot()
should be made just before init is spawned.

According to Arjan:

> To show the boot time, I'm using the timestamp of the "Write protecting"
> line, that's pretty much the last thing we print prior to ring 3 execution.
>
> A kernel with default RCU behavior (inside KVM, only virtual devices)
> looks like this:
>
> [    0.038724] Write protecting the kernel read-only data: 10240k
>
> a kernel with expedited RCU (using the command line option, so that I
> don't have to recompile between measurements and thus am completely
> oranges-to-oranges)
>
> [    0.031768] Write protecting the kernel read-only data: 10240k
>
> which, in percentage, is an 18% improvement.

Reported-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Tested-by: Arjan van de Ven <arjan@linux.intel.com>
---
 include/linux/rcupdate.h |  1 +
 init/Kconfig             | 13 +++++++++++++
 kernel/rcu/update.c      | 11 ++++++++++-
 3 files changed, 24 insertions(+), 1 deletion(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 57a4d1f73a00..b9f039b11d31 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -278,6 +278,7 @@ static inline int rcu_preempt_depth(void)
 
 /* Internal to kernel */
 void rcu_init(void);
+void rcu_end_inkernel_boot(void);
 void rcu_sched_qs(void);
 void rcu_bh_qs(void);
 void rcu_check_callbacks(int user);
diff --git a/init/Kconfig b/init/Kconfig
index f5dbc6d4261b..9a0592516f48 100644
--- a/init/Kconfig
+++ b/init/Kconfig
@@ -791,6 +791,19 @@ config RCU_NOCB_CPU_ALL
 
 endchoice
 
+config RCU_EXPEDITE_BOOT
+	bool
+	default n
+	help
+	  This option enables expedited grace periods at boot time,
+	  as if rcu_expedite_gp() had been invoked early in boot.
+	  The corresponding rcu_unexpedite_gp() is invoked from
+	  rcu_end_inkernel_boot(), which is intended to be invoked
+	  at the end of the kernel-only boot sequence, just before
+	  init is exec'ed.
+
+	  Accept the default if unsure.
+
 endmenu # "RCU Subsystem"
 
 config BUILD_BIN2C
diff --git a/kernel/rcu/update.c b/kernel/rcu/update.c
index 5f850823c187..7b12466f90bc 100644
--- a/kernel/rcu/update.c
+++ b/kernel/rcu/update.c
@@ -64,7 +64,8 @@ module_param(rcu_expedited, int, 0);
 
 #ifndef CONFIG_TINY_RCU
 
-static atomic_t rcu_expedited_nesting;
+static atomic_t rcu_expedited_nesting =
+	ATOMIC_INIT(IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT) ? 1 : 0);
 
 /*
  * Should normal grace-period primitives be expedited?  Intended for
@@ -109,6 +110,14 @@ EXPORT_SYMBOL_GPL(rcu_unexpedite_gp);
 
 #endif /* #ifndef CONFIG_TINY_RCU */
 
+/*
+ * Inform RCU of the end of the in-kernel boot sequence.
+ */
+void rcu_end_inkernel_boot(void)
+{
+	if (IS_ENABLED(CONFIG_RCU_EXPEDITE_BOOT))
+		rcu_unexpedite_gp();
+}
 
 #ifdef CONFIG_PREEMPT_RCU
 

From c136f991049f51212e3d837a9f41708158591869 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 19 Feb 2015 12:15:19 -0800
Subject: [PATCH 20/56] rcutorture: Make consistent use of variables

The "if" statement at the beginning of rcu_torture_writer() should
use the same set of variables.  In theory, this does not matter because
the corresponding variables (gp_sync and gp_sync1) have the same value
at this point in the code, but in practice such puzzles should be
removed.  This commit therefore makes the use of variables consistent.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/rcutorture.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/rcutorture.c b/kernel/rcu/rcutorture.c
index 3833aa611ae7..8dbe27611ec3 100644
--- a/kernel/rcu/rcutorture.c
+++ b/kernel/rcu/rcutorture.c
@@ -875,7 +875,7 @@ rcu_torture_writer(void *arg)
 		 torture_type);
 
 	/* Initialize synctype[] array.  If none set, take default. */
-	if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync)
+	if (!gp_cond1 && !gp_exp1 && !gp_normal1 && !gp_sync1)
 		gp_cond1 = gp_exp1 = gp_normal1 = gp_sync1 = true;
 	if (gp_cond1 && cur_ops->get_state && cur_ops->cond_sync)
 		synctype[nsynctypes++] = RTWS_COND_GET;

From 675da67f24e2d6d8df0cedf12e59085ed8bbf4e7 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 23 Feb 2015 15:57:07 -0800
Subject: [PATCH 21/56] rcu: Fixes to NO_HZ_FULL sysidle accounting

On second and subsequent passes through quiescent-state forcing, the
isidle variable was initialized to false, which would prevent full sysidle
state from being reached if a grace period needed more than one round
of quiescent-state forcing (which most should not).  However, the check
for offline CPUs in the quiescent-state forcing main loop had the wrong
sense, which could prevent CPUs from ever entering full sysidle state.

This commit fixes both of these bugs.  Given that sysidle is not yet
wired up, this has no effect in old kernels, but might have proven
frustrating had anyone attempted to wire it up.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 6 +++---
 1 file changed, 3 insertions(+), 3 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 48d640ca1a05..735bd7ee749a 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1798,7 +1798,7 @@ static int rcu_gp_fqs(struct rcu_state *rsp, int fqs_state_in)
 		fqs_state = RCU_FORCE_QS;
 	} else {
 		/* Handle dyntick-idle and offline CPUs. */
-		isidle = false;
+		isidle = true;
 		force_qs_rnp(rsp, rcu_implicit_dynticks_qs, &isidle, &maxj);
 	}
 	/* Clear flag to prevent immediate re-entry. */
@@ -2596,8 +2596,8 @@ static void force_qs_rnp(struct rcu_state *rsp,
 		bit = 1;
 		for (; cpu <= rnp->grphi; cpu++, bit <<= 1) {
 			if ((rnp->qsmask & bit) != 0) {
-				if ((rnp->qsmaskinit & bit) != 0)
-					*isidle = false;
+				if ((rnp->qsmaskinit & bit) == 0)
+					*isidle = false; /* Pending hotplug. */
 				if (f(per_cpu_ptr(rsp->rda, cpu), isidle, maxj))
 					mask |= bit;
 			}

From 27153acbe1141ceecf098ca5d24c2ae2714c1a5f Mon Sep 17 00:00:00 2001
From: Alexander Gordeev <agordeev@redhat.com>
Date: Wed, 11 Feb 2015 15:42:37 +0100
Subject: [PATCH 22/56] rcu: Remove unnecessary condition check in
 rcu_qsctr_help()

When the ->curtail and ->donetail pointers differ, ->rcucblist
always points to the beginning of the current list and thus
cannot be NULL. Therefore, the check ->rcucblist != NULL is
redundant and this commit removes it.

Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Alexander Gordeev <agordeev@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tiny.c | 3 +--
 1 file changed, 1 insertion(+), 2 deletions(-)

diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index cc9ceca7bde1..d4e7fe5f3baf 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -103,8 +103,7 @@ EXPORT_SYMBOL(__rcu_is_watching);
 static int rcu_qsctr_help(struct rcu_ctrlblk *rcp)
 {
 	RCU_TRACE(reset_cpu_stall_ticks(rcp));
-	if (rcp->rcucblist != NULL &&
-	    rcp->donetail != rcp->curtail) {
+	if (rcp->donetail != rcp->curtail) {
 		rcp->donetail = rcp->curtail;
 		return 1;
 	}

From 915e8a4fe45eab871a862f6467ec7e59864735b2 Mon Sep 17 00:00:00 2001
From: Alexander Gordeev <agordeev@redhat.com>
Date: Wed, 11 Feb 2015 15:42:38 +0100
Subject: [PATCH 23/56] rcu: Remove fastpath from __rcu_process_callbacks()

The standard code path accommodates a condition when no
RCU callbacks are ready to invoke. Since size of the code
is a priority for tiny RCU, remove the fast path.

Cc: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Signed-off-by: Alexander Gordeev <agordeev@redhat.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tiny.c | 11 -----------
 1 file changed, 11 deletions(-)

diff --git a/kernel/rcu/tiny.c b/kernel/rcu/tiny.c
index d4e7fe5f3baf..069742d61c68 100644
--- a/kernel/rcu/tiny.c
+++ b/kernel/rcu/tiny.c
@@ -168,17 +168,6 @@ static void __rcu_process_callbacks(struct rcu_ctrlblk *rcp)
 	unsigned long flags;
 	RCU_TRACE(int cb_count = 0);
 
-	/* If no RCU callbacks ready to invoke, just return. */
-	if (&rcp->rcucblist == rcp->donetail) {
-		RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, 0, -1));
-		RCU_TRACE(trace_rcu_batch_end(rcp->name, 0,
-					      !!ACCESS_ONCE(rcp->rcucblist),
-					      need_resched(),
-					      is_idle_task(current),
-					      false));
-		return;
-	}
-
 	/* Move the ready-to-invoke callbacks to a local list. */
 	local_irq_save(flags);
 	RCU_TRACE(trace_rcu_batch_start(rcp->name, 0, rcp->qlen, -1));

From 5871968d531f39c23a8e6c69525bb705bca52e04 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 24 Feb 2015 11:05:36 -0800
Subject: [PATCH 24/56] rcu: Tighten up affinity and check for sysidle

If the RCU grace-period kthread invoking rcu_sysidle_check_cpu()
happens to be running on the tick_do_timer_cpu initially,
then rcu_bind_gp_kthread() won't bind it.  This kthread might
then migrate before invoking rcu_gp_fqs(), which will trigger the
WARN_ON_ONCE() in rcu_sysidle_check_cpu().  This commit therefore makes
rcu_bind_gp_kthread() do the binding even if the kthread is currently
on the same CPU.  Because this incurs added overhead, this commit also
causes each RCU grace-period kthread to invoke rcu_bind_gp_kthread()
once at boot rather than at the beginning of each grace period.
And as long as rcu_bind_gp_kthread() is being modified, this commit
eliminates its #ifdef.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c        |  2 +-
 kernel/rcu/tree_plugin.h | 12 ++++++------
 2 files changed, 7 insertions(+), 7 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 735bd7ee749a..a6972c20eaa5 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1707,7 +1707,6 @@ static int rcu_gp_init(struct rcu_state *rsp)
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
 	ACCESS_ONCE(rsp->gp_activity) = jiffies;
-	rcu_bind_gp_kthread();
 	raw_spin_lock_irq(&rnp->lock);
 	smp_mb__after_unlock_lock();
 	if (!ACCESS_ONCE(rsp->gp_flags)) {
@@ -1895,6 +1894,7 @@ static int __noreturn rcu_gp_kthread(void *arg)
 	struct rcu_state *rsp = arg;
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
+	rcu_bind_gp_kthread();
 	for (;;) {
 
 		/* Handle grace-period start. */
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0a571e9a0f1d..b46c92824db1 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2763,7 +2763,8 @@ static void rcu_sysidle_exit(int irq)
 
 /*
  * Check to see if the current CPU is idle.  Note that usermode execution
- * does not count as idle.  The caller must have disabled interrupts.
+ * does not count as idle.  The caller must have disabled interrupts,
+ * and must be running on tick_do_timer_cpu.
  */
 static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
 				  unsigned long *maxj)
@@ -2784,8 +2785,8 @@ static void rcu_sysidle_check_cpu(struct rcu_data *rdp, bool *isidle,
 	if (!*isidle || rdp->rsp != rcu_state_p ||
 	    cpu_is_offline(rdp->cpu) || rdp->cpu == tick_do_timer_cpu)
 		return;
-	if (rcu_gp_in_progress(rdp->rsp))
-		WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
+	/* Verify affinity of current kthread. */
+	WARN_ON_ONCE(smp_processor_id() != tick_do_timer_cpu);
 
 	/* Pick up current idle and NMI-nesting counter and check. */
 	cur = atomic_read(&rdtp->dynticks_idle);
@@ -3068,11 +3069,10 @@ static void rcu_bind_gp_kthread(void)
 		return;
 #ifdef CONFIG_NO_HZ_FULL_SYSIDLE
 	cpu = tick_do_timer_cpu;
-	if (cpu >= 0 && cpu < nr_cpu_ids && raw_smp_processor_id() != cpu)
+	if (cpu >= 0 && cpu < nr_cpu_ids)
 		set_cpus_allowed_ptr(current, cpumask_of(cpu));
 #else /* #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
-	if (!is_housekeeping_cpu(raw_smp_processor_id()))
-		housekeeping_affine(current);
+	housekeeping_affine(current);
 #endif /* #else #ifdef CONFIG_NO_HZ_FULL_SYSIDLE */
 }
 

From 34404ca8fb252ccee662c4368c555ccf774acc3b Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 19 Jan 2015 20:39:20 -0800
Subject: [PATCH 25/56] rcu: Move early-boot callbacks to no-CBs lists for
 no-CBs CPUs

When a CPU is first determined to be a no-CBs CPUs, this commit causes
any early boot callbacks to be moved to the no-CBs callback list,
allowing them to be invoked.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c        |  1 +
 kernel/rcu/tree_plugin.h | 24 ++++++++++++------------
 2 files changed, 13 insertions(+), 12 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 92fd3eab5823..0317bf7d997f 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2851,6 +2851,7 @@ __call_rcu(struct rcu_head *head, void (*func)(struct rcu_head *rcu),
 		 * and then drop through to queue the callback.
 		 */
 		BUG_ON(cpu != -1);
+		WARN_ON_ONCE(!rcu_is_watching());
 		if (!likely(rdp->nxtlist))
 			init_default_callback_list(rdp);
 	}
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 75d5f096bcb0..afddd5641bea 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -2393,18 +2393,8 @@ void __init rcu_init_nohz(void)
 		pr_info("\tPoll for callbacks from no-CBs CPUs.\n");
 
 	for_each_rcu_flavor(rsp) {
-		for_each_cpu(cpu, rcu_nocb_mask) {
-			struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
-
-			/*
-			 * If there are early callbacks, they will need
-			 * to be moved to the nocb lists.
-			 */
-			WARN_ON_ONCE(rdp->nxttail[RCU_NEXT_TAIL] !=
-				     &rdp->nxtlist &&
-				     rdp->nxttail[RCU_NEXT_TAIL] != NULL);
-			init_nocb_callback_list(rdp);
-		}
+		for_each_cpu(cpu, rcu_nocb_mask)
+			init_nocb_callback_list(per_cpu_ptr(rsp->rda, cpu));
 		rcu_organize_nocb_kthreads(rsp);
 	}
 }
@@ -2541,6 +2531,16 @@ static bool init_nocb_callback_list(struct rcu_data *rdp)
 	if (!rcu_is_nocb_cpu(rdp->cpu))
 		return false;
 
+	/* If there are early-boot callbacks, move them to nocb lists. */
+	if (rdp->nxtlist) {
+		rdp->nocb_head = rdp->nxtlist;
+		rdp->nocb_tail = rdp->nxttail[RCU_NEXT_TAIL];
+		atomic_long_set(&rdp->nocb_q_count, rdp->qlen);
+		atomic_long_set(&rdp->nocb_q_count_lazy, rdp->qlen_lazy);
+		rdp->nxtlist = NULL;
+		rdp->qlen = 0;
+		rdp->qlen_lazy = 0;
+	}
 	rdp->nxttail[RCU_NEXT_TAIL] = NULL;
 	return true;
 }

From 476276781095c79580abe27a65988549ac7f5f89 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 19 Jan 2015 21:10:21 -0800
Subject: [PATCH 26/56] rcu: Move early boot callback tests earlier

Because callbacks can now be posted quite early in boot, move the
early boot callback tests to precede RCU initialization.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 0317bf7d997f..c8e6569c5fbd 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3948,6 +3948,8 @@ void __init rcu_init(void)
 {
 	int cpu;
 
+	rcu_early_boot_tests();
+
 	rcu_bootup_announce();
 	rcu_init_geometry();
 	rcu_init_one(&rcu_bh_state, &rcu_bh_data);
@@ -3964,8 +3966,6 @@ void __init rcu_init(void)
 	pm_notifier(rcu_pm_notify, 0);
 	for_each_online_cpu(cpu)
 		rcu_cpu_notify(NULL, CPU_UP_PREPARE, (void *)(long)cpu);
-
-	rcu_early_boot_tests();
 }
 
 #include "tree_plugin.h"

From 6629240575992a6f0d18c46f5160b34527b0e501 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 19 Jan 2015 19:16:38 -0800
Subject: [PATCH 27/56] rcu: Use IS_ENABLED() to CONFIG_RCU_FANOUT_EXACT #ifdef

This commit uses IS_ENABLED() to remove the #ifdef from the
rcu_init_levelspread() functions.  No effect on executable code.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 30 +++++++++++++-----------------
 1 file changed, 13 insertions(+), 17 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 4e37c7fd9e29..35e1604f7e3e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3734,30 +3734,26 @@ void rcu_scheduler_starting(void)
  * Compute the per-level fanout, either using the exact fanout specified
  * or balancing the tree, depending on CONFIG_RCU_FANOUT_EXACT.
  */
-#ifdef CONFIG_RCU_FANOUT_EXACT
 static void __init rcu_init_levelspread(struct rcu_state *rsp)
 {
 	int i;
 
-	rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
-	for (i = rcu_num_lvls - 2; i >= 0; i--)
-		rsp->levelspread[i] = CONFIG_RCU_FANOUT;
-}
-#else /* #ifdef CONFIG_RCU_FANOUT_EXACT */
-static void __init rcu_init_levelspread(struct rcu_state *rsp)
-{
-	int ccur;
-	int cprv;
-	int i;
+	if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT)) {
+		rsp->levelspread[rcu_num_lvls - 1] = rcu_fanout_leaf;
+		for (i = rcu_num_lvls - 2; i >= 0; i--)
+			rsp->levelspread[i] = CONFIG_RCU_FANOUT;
+	} else {
+		int ccur;
+		int cprv;
 
-	cprv = nr_cpu_ids;
-	for (i = rcu_num_lvls - 1; i >= 0; i--) {
-		ccur = rsp->levelcnt[i];
-		rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
-		cprv = ccur;
+		cprv = nr_cpu_ids;
+		for (i = rcu_num_lvls - 1; i >= 0; i--) {
+			ccur = rsp->levelcnt[i];
+			rsp->levelspread[i] = (cprv + ccur - 1) / ccur;
+			cprv = ccur;
+		}
 	}
 }
-#endif /* #else #ifdef CONFIG_RCU_FANOUT_EXACT */
 
 /*
  * Helper function for rcu_init() that initializes one rcu_state structure.

From d24209bb689e2c7f7418faec9b4a948e922d24da Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 21 Jan 2015 15:26:03 -0800
Subject: [PATCH 28/56] rcu: Improve diagnostics for blocked critical sections
 in irq

If an RCU read-side critical section occurs within an interrupt handler
or a softirq handler, it cannot have been preempted.  Therefore, there is
a check in rcu_read_unlock_special() checking for this error.  However,
when this check triggers, it lacks diagnostic information.  This commit
therefore moves rcu_read_unlock()'s lockdep annotation to follow the
call to __rcu_read_unlock() and changes rcu_read_unlock_special()'s
WARN_ON_ONCE() to an lockdep_rcu_suspicious() in order to locate where
the offending RCU read-side critical section began.  In addition, the
value of the ->rcu_read_unlock_special field is printed.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/lockdep.h  | 7 ++++++-
 include/linux/rcupdate.h | 2 +-
 kernel/rcu/tree_plugin.h | 8 +++++++-
 3 files changed, 14 insertions(+), 3 deletions(-)

diff --git a/include/linux/lockdep.h b/include/linux/lockdep.h
index 74ab23176e9b..066ba4157541 100644
--- a/include/linux/lockdep.h
+++ b/include/linux/lockdep.h
@@ -531,8 +531,13 @@ do {									\
 # define might_lock_read(lock) do { } while (0)
 #endif
 
-#ifdef CONFIG_PROVE_RCU
+#ifdef CONFIG_LOCKDEP
 void lockdep_rcu_suspicious(const char *file, const int line, const char *s);
+#else
+static inline void
+lockdep_rcu_suspicious(const char *file, const int line, const char *s)
+{
+}
 #endif
 
 #endif /* __LINUX_LOCKDEP_H */
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 3e6afed51051..70b896e16f19 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -942,9 +942,9 @@ static inline void rcu_read_unlock(void)
 {
 	rcu_lockdep_assert(rcu_is_watching(),
 			   "rcu_read_unlock() used illegally while idle");
-	rcu_lock_release(&rcu_lock_map);
 	__release(RCU);
 	__rcu_read_unlock();
+	rcu_lock_release(&rcu_lock_map); /* Keep acq info for rls diags. */
 }
 
 /**
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0a571e9a0f1d..8a33920b8845 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -334,7 +334,13 @@ void rcu_read_unlock_special(struct task_struct *t)
 	}
 
 	/* Hardware IRQ handlers cannot block, complain if they get here. */
-	if (WARN_ON_ONCE(in_irq() || in_serving_softirq())) {
+	if (in_irq() || in_serving_softirq()) {
+		lockdep_rcu_suspicious(__FILE__, __LINE__,
+				       "rcu_read_unlock() from irq or softirq with blocking in critical section!!!\n");
+		pr_alert("->rcu_read_unlock_special: %#x (b: %d, nq: %d)\n",
+			 t->rcu_read_unlock_special.s,
+			 t->rcu_read_unlock_special.b.blocked,
+			 t->rcu_read_unlock_special.b.need_qs);
 		local_irq_restore(flags);
 		return;
 	}

From ab6f5bd6741af7b157275de299b7b2b96f2df40e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 21 Jan 2015 16:58:06 -0800
Subject: [PATCH 29/56] rcu: Use IS_ENABLED() to simplify
 rcu_bootup_announce_oddness()

This commit gets rid of some inline #ifdefs by replacing them with
IS_ENABLED.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree_plugin.h | 48 +++++++++++++++++-----------------------
 1 file changed, 20 insertions(+), 28 deletions(-)

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 8a33920b8845..81c4d91fa18a 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -58,38 +58,30 @@ static bool __read_mostly rcu_nocb_poll;    /* Offload kthread are to poll. */
  */
 static void __init rcu_bootup_announce_oddness(void)
 {
-#ifdef CONFIG_RCU_TRACE
-	pr_info("\tRCU debugfs-based tracing is enabled.\n");
-#endif
-#if (defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) || (!defined(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32)
-	pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
-	       CONFIG_RCU_FANOUT);
-#endif
-#ifdef CONFIG_RCU_FANOUT_EXACT
-	pr_info("\tHierarchical RCU autobalancing is disabled.\n");
-#endif
-#ifdef CONFIG_RCU_FAST_NO_HZ
-	pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
-#endif
-#ifdef CONFIG_PROVE_RCU
-	pr_info("\tRCU lockdep checking is enabled.\n");
-#endif
-#ifdef CONFIG_RCU_TORTURE_TEST_RUNNABLE
-	pr_info("\tRCU torture testing starts during boot.\n");
-#endif
-#if defined(CONFIG_RCU_CPU_STALL_INFO)
-	pr_info("\tAdditional per-CPU info printed with stalls.\n");
-#endif
-#if NUM_RCU_LVL_4 != 0
-	pr_info("\tFour-level hierarchy is enabled.\n");
-#endif
+	if (IS_ENABLED(CONFIG_RCU_TRACE))
+		pr_info("\tRCU debugfs-based tracing is enabled.\n");
+	if ((IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 64) ||
+	    (!IS_ENABLED(CONFIG_64BIT) && CONFIG_RCU_FANOUT != 32))
+		pr_info("\tCONFIG_RCU_FANOUT set to non-default value of %d\n",
+		       CONFIG_RCU_FANOUT);
+	if (IS_ENABLED(CONFIG_RCU_FANOUT_EXACT))
+		pr_info("\tHierarchical RCU autobalancing is disabled.\n");
+	if (IS_ENABLED(CONFIG_RCU_FAST_NO_HZ))
+		pr_info("\tRCU dyntick-idle grace-period acceleration is enabled.\n");
+	if (IS_ENABLED(CONFIG_PROVE_RCU))
+		pr_info("\tRCU lockdep checking is enabled.\n");
+	if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_RUNNABLE))
+		pr_info("\tRCU torture testing starts during boot.\n");
+	if (IS_ENABLED(CONFIG_RCU_CPU_STALL_INFO))
+		pr_info("\tAdditional per-CPU info printed with stalls.\n");
+	if (NUM_RCU_LVL_4 != 0)
+		pr_info("\tFour-level hierarchy is enabled.\n");
 	if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
 		pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
 	if (nr_cpu_ids != NR_CPUS)
 		pr_info("\tRCU restricting CPUs from NR_CPUS=%d to nr_cpu_ids=%d.\n", NR_CPUS, nr_cpu_ids);
-#ifdef CONFIG_RCU_BOOST
-	pr_info("\tRCU kthread priority: %d.\n", kthread_prio);
-#endif
+	if (IS_ENABLED(CONFIG_RCU_BOOST))
+		pr_info("\tRCU kthread priority: %d.\n", kthread_prio);
 }
 
 #ifdef CONFIG_PREEMPT_RCU

From a3bd2c09adcc80946262fd15e63868de1f0f4963 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 21 Jan 2015 20:58:57 -0800
Subject: [PATCH 30/56] rcu: Add boot-up check for non-default
 CONFIG_RCU_FANOUT_LEAF values

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree_plugin.h | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 81c4d91fa18a..c9225350d3ed 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -76,6 +76,9 @@ static void __init rcu_bootup_announce_oddness(void)
 		pr_info("\tAdditional per-CPU info printed with stalls.\n");
 	if (NUM_RCU_LVL_4 != 0)
 		pr_info("\tFour-level hierarchy is enabled.\n");
+	if (CONFIG_RCU_FANOUT_LEAF != 16)
+		pr_info("\tBuild-time adjustment of leaf fanout to %d.\n",
+			CONFIG_RCU_FANOUT_LEAF);
 	if (rcu_fanout_leaf != CONFIG_RCU_FANOUT_LEAF)
 		pr_info("\tBoot-time adjustment of leaf fanout to %d.\n", rcu_fanout_leaf);
 	if (nr_cpu_ids != NR_CPUS)

From b826565aaf8809df146666c03d1acbb7febbd13e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 2 Feb 2015 11:46:33 -0800
Subject: [PATCH 31/56] rcu: Reverse rcu_dereference_check() conditions

The rcu_dereference_check() family of primitives evaluates the RCU
lockdep expression first, and only then evaluates the expression passed
in.  This works fine normally, but can potentially fail in environments
(such as NMI handlers) where lockdep cannot be invoked.  The problem is
that even if the expression passed in is "1", the compiler would need to
prove that the RCU lockdep expression (rcu_read_lock_held(), for example)
is free of side effects in order to be able to elide it.  Given that
rcu_read_lock_held() is sometimes separately compiled, the compiler cannot
always use this optimization.

This commit therefore reverse the order of evaluation, so that the
expression passed in is evaluated first, and the RCU lockdep expression is
evaluated only if the passed-in expression evaluated to false, courtesy
of the C-language short-circuit boolean evaluation rules.  This compells
the compiler to forego executing the RCU lockdep expression in cases
where the passed-in expression evaluates to "1" at compile time, so that
(for example) rcu_dereference_raw() can be guaranteed to execute safely
within an NMI handler.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra (Intel) <peterz@infradead.org>
---
 include/linux/rcupdate.h | 6 +++---
 include/linux/srcu.h     | 2 +-
 2 files changed, 4 insertions(+), 4 deletions(-)

diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 70b896e16f19..416ae2848c6c 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -729,7 +729,7 @@ static inline void rcu_preempt_sleep_check(void)
  * annotated as __rcu.
  */
 #define rcu_dereference_check(p, c) \
-	__rcu_dereference_check((p), rcu_read_lock_held() || (c), __rcu)
+	__rcu_dereference_check((p), (c) || rcu_read_lock_held(), __rcu)
 
 /**
  * rcu_dereference_bh_check() - rcu_dereference_bh with debug checking
@@ -739,7 +739,7 @@ static inline void rcu_preempt_sleep_check(void)
  * This is the RCU-bh counterpart to rcu_dereference_check().
  */
 #define rcu_dereference_bh_check(p, c) \
-	__rcu_dereference_check((p), rcu_read_lock_bh_held() || (c), __rcu)
+	__rcu_dereference_check((p), (c) || rcu_read_lock_bh_held(), __rcu)
 
 /**
  * rcu_dereference_sched_check() - rcu_dereference_sched with debug checking
@@ -749,7 +749,7 @@ static inline void rcu_preempt_sleep_check(void)
  * This is the RCU-sched counterpart to rcu_dereference_check().
  */
 #define rcu_dereference_sched_check(p, c) \
-	__rcu_dereference_check((p), rcu_read_lock_sched_held() || (c), \
+	__rcu_dereference_check((p), (c) || rcu_read_lock_sched_held(), \
 				__rcu)
 
 #define rcu_dereference_raw(p) rcu_dereference_check(p, 1) /*@@@ needed? @@@*/
diff --git a/include/linux/srcu.h b/include/linux/srcu.h
index 9cfd9623fb03..bdeb4567b71e 100644
--- a/include/linux/srcu.h
+++ b/include/linux/srcu.h
@@ -182,7 +182,7 @@ static inline int srcu_read_lock_held(struct srcu_struct *sp)
  * lockdep_is_held() calls.
  */
 #define srcu_dereference_check(p, sp, c) \
-	__rcu_dereference_check((p), srcu_read_lock_held(sp) || (c), __rcu)
+	__rcu_dereference_check((p), (c) || srcu_read_lock_held(sp), __rcu)
 
 /**
  * srcu_dereference - fetch SRCU-protected pointer for later dereferencing

From 91afa21d5d0d79a02984d37509f1f827ae460f4e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sat, 7 Feb 2015 20:08:51 -0800
Subject: [PATCH 32/56] torture: Avoid script syntax error when insufficient
 CPUs

Parentheses are special to bash, so use an overflow flag that doesn't
use them.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 tools/testing/selftests/rcutorture/bin/kvm.sh | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/tools/testing/selftests/rcutorture/bin/kvm.sh b/tools/testing/selftests/rcutorture/bin/kvm.sh
index 368d64ac779e..dd2812ceb0ba 100755
--- a/tools/testing/selftests/rcutorture/bin/kvm.sh
+++ b/tools/testing/selftests/rcutorture/bin/kvm.sh
@@ -310,7 +310,7 @@ function dump(first, pastlast)
 			cfr[jn] = cf[j] "." cfrep[cf[j]];
 		}
 		if (cpusr[jn] > ncpus && ncpus != 0)
-			ovf = "(!)";
+			ovf = "-ovf";
 		else
 			ovf = "";
 		print "echo ", cfr[jn], cpusr[jn] ovf ": Starting build. `date`";

From e7580f33889299e484a80f42c20611ead42f199e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Tue, 24 Feb 2015 14:23:39 -0800
Subject: [PATCH 33/56] rcu: Get rcu_sched_force_quiescent_state() where it
 belongs

The very similar functions rcu_force_quiescent_state(),
rcu_bh_force_quiescent_state(), and rcu_sched_force_quiescent_state()
are supposed to be together, but have drifted apart.  This commit
restores rcu_sched_force_quiescent_state() to its rightful place.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 18 +++++++++---------
 1 file changed, 9 insertions(+), 9 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 35e1604f7e3e..fbe9dd9ced54 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -409,6 +409,15 @@ void rcu_bh_force_quiescent_state(void)
 }
 EXPORT_SYMBOL_GPL(rcu_bh_force_quiescent_state);
 
+/*
+ * Force a quiescent state for RCU-sched.
+ */
+void rcu_sched_force_quiescent_state(void)
+{
+	force_quiescent_state(&rcu_sched_state);
+}
+EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
+
 /*
  * Show the state of the grace-period kthreads.
  */
@@ -482,15 +491,6 @@ void rcutorture_record_progress(unsigned long vernum)
 }
 EXPORT_SYMBOL_GPL(rcutorture_record_progress);
 
-/*
- * Force a quiescent state for RCU-sched.
- */
-void rcu_sched_force_quiescent_state(void)
-{
-	force_quiescent_state(&rcu_sched_state);
-}
-EXPORT_SYMBOL_GPL(rcu_sched_force_quiescent_state);
-
 /*
  * Does the CPU have callbacks ready to be invoked?
  */

From 9910affa89fe0895153880b115ec243636e70af3 Mon Sep 17 00:00:00 2001
From: Yao Dongdong <yaodongdong@huawei.com>
Date: Wed, 25 Feb 2015 17:09:46 +0800
Subject: [PATCH 34/56] rcu: Remove redundant check of cpu_online()

Because invoke_cpu_core() checks whether the current CPU is online,
there is no need for __call_rcu_core() to redundantly check it.
There should not be any performance degradation because the called
function is visible to the compiler.  This commit therefore removes
the redundant check.

Signed-off-by: Yao Dongdong <yaodongdong@huawei.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index fbe9dd9ced54..23194a77a768 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2741,7 +2741,7 @@ static void __call_rcu_core(struct rcu_state *rsp, struct rcu_data *rdp,
 	 * If called from an extended quiescent state, invoke the RCU
 	 * core in order to force a re-evaluation of RCU's idleness.
 	 */
-	if (!rcu_is_watching() && cpu_online(smp_processor_id()))
+	if (!rcu_is_watching())
 		invoke_rcu_core();
 
 	/* If interrupts were disabled or CPU offline, don't invoke RCU core. */

From 8038dad7e888581266c76df15d70ca457a3c5910 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 25 Feb 2015 10:34:39 -0800
Subject: [PATCH 35/56] smpboot: Add common code for notification from dying
 CPU

RCU ignores offlined CPUs, so they cannot safely run RCU read-side code.
(They -can- use SRCU, but not RCU.)  This means that any use of RCU
during or after the call to arch_cpu_idle_dead().  Unfortunately,
commit 2ed53c0d6cc99 added a complete() call, which will contain RCU
read-side critical sections if there is a task waiting to be awakened.

Which, as it turns out, there almost never is.  In my qemu/KVM testing,
the to-be-awakened task is not yet asleep more than 99.5% of the time.
In current mainline, failure is even harder to reproduce, requiring a
virtualized environment that delays the outgoing CPU by at least three
jiffies between the time it exits its stop_machine() task at CPU_DYING
time and the time it calls arch_cpu_idle_dead() from the idle loop.
However, this problem really can occur, especially in virtualized
environments, and therefore really does need to be fixed

This suggests moving back to the polling loop, but using a much shorter
wait, with gentle exponential backoff instead of the old 100-millisecond
wait.  Most of the time, the loop will exit without waiting at all,
and almost all of the remaining uses will wait only five microseconds.
If the outgoing CPU is preempted, a loop will wait one jiffy, then
increase the wait by a factor of 11/10ths, rounding up.  As before, there
is a five-second timeout.

This commit therefore provides common-code infrastructure to do the
dying-to-surviving CPU handoff in a safe manner.  This code also
provides an indication at CPU-online of whether the CPU to be onlined
previously timed out on offline.  The new cpu_check_up_prepare() function
returns -EBUSY if this CPU previously took more than five seconds to
go offline, or -EAGAIN if it has not yet managed to go offline.  The
rationale for -EAGAIN is that it might still be preempted, so an additional
wait might well find it correctly offlined.  Architecture-specific code
can decide how to handle these conditions.  Systems in which CPUs take
themselves completely offline might respond to an -EBUSY return as if
it was a zero (success) return.  Systems in which the surviving CPU must
take some action might take it at this time, or might simply mark the
other CPU as unusable.

Note that architectures that take the easy way out and simply pass the
-EBUSY and -EAGAIN upwards will change the sysfs API.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: <linux-api@vger.kernel.org>
Cc: <linux-arch@vger.kernel.org>
[ paulmck: Fixed state machine for architectures that don't check earlier
  CPU-hotplug results as suggested by James Hogan. ]
---
 include/linux/cpu.h |  12 ++++
 kernel/smpboot.c    | 156 ++++++++++++++++++++++++++++++++++++++++++++
 2 files changed, 168 insertions(+)

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 4260e8594bd7..4744ef915acd 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -95,6 +95,8 @@ enum {
 					* Called on the new cpu, just before
 					* enabling interrupts. Must not sleep,
 					* must not fail */
+#define CPU_BROKEN		0x000C /* CPU (unsigned)v did not die properly,
+					* perhaps due to preemption. */
 
 /* Used for CPU hotplug events occurring while tasks are frozen due to a suspend
  * operation in progress
@@ -271,4 +273,14 @@ void arch_cpu_idle_enter(void);
 void arch_cpu_idle_exit(void);
 void arch_cpu_idle_dead(void);
 
+DECLARE_PER_CPU(bool, cpu_dead_idle);
+
+int cpu_report_state(int cpu);
+int cpu_check_up_prepare(int cpu);
+void cpu_set_state_online(int cpu);
+#ifdef CONFIG_HOTPLUG_CPU
+bool cpu_wait_death(unsigned int cpu, int seconds);
+bool cpu_report_death(void);
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */
+
 #endif /* _LINUX_CPU_H_ */
diff --git a/kernel/smpboot.c b/kernel/smpboot.c
index 40190f28db35..c697f73d82d6 100644
--- a/kernel/smpboot.c
+++ b/kernel/smpboot.c
@@ -4,6 +4,7 @@
 #include <linux/cpu.h>
 #include <linux/err.h>
 #include <linux/smp.h>
+#include <linux/delay.h>
 #include <linux/init.h>
 #include <linux/list.h>
 #include <linux/slab.h>
@@ -314,3 +315,158 @@ void smpboot_unregister_percpu_thread(struct smp_hotplug_thread *plug_thread)
 	put_online_cpus();
 }
 EXPORT_SYMBOL_GPL(smpboot_unregister_percpu_thread);
+
+static DEFINE_PER_CPU(atomic_t, cpu_hotplug_state) = ATOMIC_INIT(CPU_POST_DEAD);
+
+/*
+ * Called to poll specified CPU's state, for example, when waiting for
+ * a CPU to come online.
+ */
+int cpu_report_state(int cpu)
+{
+	return atomic_read(&per_cpu(cpu_hotplug_state, cpu));
+}
+
+/*
+ * If CPU has died properly, set its state to CPU_UP_PREPARE and
+ * return success.  Otherwise, return -EBUSY if the CPU died after
+ * cpu_wait_death() timed out.  And yet otherwise again, return -EAGAIN
+ * if cpu_wait_death() timed out and the CPU still hasn't gotten around
+ * to dying.  In the latter two cases, the CPU might not be set up
+ * properly, but it is up to the arch-specific code to decide.
+ * Finally, -EIO indicates an unanticipated problem.
+ *
+ * Note that it is permissible to omit this call entirely, as is
+ * done in architectures that do no CPU-hotplug error checking.
+ */
+int cpu_check_up_prepare(int cpu)
+{
+	if (!IS_ENABLED(CONFIG_HOTPLUG_CPU)) {
+		atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE);
+		return 0;
+	}
+
+	switch (atomic_read(&per_cpu(cpu_hotplug_state, cpu))) {
+
+	case CPU_POST_DEAD:
+
+		/* The CPU died properly, so just start it up again. */
+		atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_UP_PREPARE);
+		return 0;
+
+	case CPU_DEAD_FROZEN:
+
+		/*
+		 * Timeout during CPU death, so let caller know.
+		 * The outgoing CPU completed its processing, but after
+		 * cpu_wait_death() timed out and reported the error. The
+		 * caller is free to proceed, in which case the state
+		 * will be reset properly by cpu_set_state_online().
+		 * Proceeding despite this -EBUSY return makes sense
+		 * for systems where the outgoing CPUs take themselves
+		 * offline, with no post-death manipulation required from
+		 * a surviving CPU.
+		 */
+		return -EBUSY;
+
+	case CPU_BROKEN:
+
+		/*
+		 * The most likely reason we got here is that there was
+		 * a timeout during CPU death, and the outgoing CPU never
+		 * did complete its processing.  This could happen on
+		 * a virtualized system if the outgoing VCPU gets preempted
+		 * for more than five seconds, and the user attempts to
+		 * immediately online that same CPU.  Trying again later
+		 * might return -EBUSY above, hence -EAGAIN.
+		 */
+		return -EAGAIN;
+
+	default:
+
+		/* Should not happen.  Famous last words. */
+		return -EIO;
+	}
+}
+
+/*
+ * Mark the specified CPU online.
+ *
+ * Note that it is permissible to omit this call entirely, as is
+ * done in architectures that do no CPU-hotplug error checking.
+ */
+void cpu_set_state_online(int cpu)
+{
+	(void)atomic_xchg(&per_cpu(cpu_hotplug_state, cpu), CPU_ONLINE);
+}
+
+#ifdef CONFIG_HOTPLUG_CPU
+
+/*
+ * Wait for the specified CPU to exit the idle loop and die.
+ */
+bool cpu_wait_death(unsigned int cpu, int seconds)
+{
+	int jf_left = seconds * HZ;
+	int oldstate;
+	bool ret = true;
+	int sleep_jf = 1;
+
+	might_sleep();
+
+	/* The outgoing CPU will normally get done quite quickly. */
+	if (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) == CPU_DEAD)
+		goto update_state;
+	udelay(5);
+
+	/* But if the outgoing CPU dawdles, wait increasingly long times. */
+	while (atomic_read(&per_cpu(cpu_hotplug_state, cpu)) != CPU_DEAD) {
+		schedule_timeout_uninterruptible(sleep_jf);
+		jf_left -= sleep_jf;
+		if (jf_left <= 0)
+			break;
+		sleep_jf = DIV_ROUND_UP(sleep_jf * 11, 10);
+	}
+update_state:
+	oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
+	if (oldstate == CPU_DEAD) {
+		/* Outgoing CPU died normally, update state. */
+		smp_mb(); /* atomic_read() before update. */
+		atomic_set(&per_cpu(cpu_hotplug_state, cpu), CPU_POST_DEAD);
+	} else {
+		/* Outgoing CPU still hasn't died, set state accordingly. */
+		if (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
+				   oldstate, CPU_BROKEN) != oldstate)
+			goto update_state;
+		ret = false;
+	}
+	return ret;
+}
+
+/*
+ * Called by the outgoing CPU to report its successful death.  Return
+ * false if this report follows the surviving CPU's timing out.
+ *
+ * A separate "CPU_DEAD_FROZEN" is used when the surviving CPU
+ * timed out.  This approach allows architectures to omit calls to
+ * cpu_check_up_prepare() and cpu_set_state_online() without defeating
+ * the next cpu_wait_death()'s polling loop.
+ */
+bool cpu_report_death(void)
+{
+	int oldstate;
+	int newstate;
+	int cpu = smp_processor_id();
+
+	do {
+		oldstate = atomic_read(&per_cpu(cpu_hotplug_state, cpu));
+		if (oldstate != CPU_BROKEN)
+			newstate = CPU_DEAD;
+		else
+			newstate = CPU_DEAD_FROZEN;
+	} while (atomic_cmpxchg(&per_cpu(cpu_hotplug_state, cpu),
+				oldstate, newstate) != oldstate);
+	return newstate == CPU_DEAD;
+}
+
+#endif /* #ifdef CONFIG_HOTPLUG_CPU */

From 2a442c9c6453d3d043dfd89f2e03a1deff8a6f06 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 25 Feb 2015 11:42:15 -0800
Subject: [PATCH 36/56] x86: Use common outgoing-CPU-notification code

This commit removes the open-coded CPU-offline notification with new
common code.  Among other things, this change avoids calling scheduler
code using RCU from an offline CPU that RCU is ignoring.  It also allows
Xen to notice at online time that the CPU did not go offline correctly.
Note that Xen has the surviving CPU carry out some cleanup operations,
so if the surviving CPU times out, these cleanup operations might have
been carried out while the outgoing CPU was still running.  It might
therefore be unwise to bring this CPU back online, and this commit
avoids doing so.

Signed-off-by: Boris Ostrovsky <boris.ostrovsky@oracle.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: <x86@kernel.org>
Cc: Konrad Rzeszutek Wilk <konrad.wilk@oracle.com>
Cc: David Vrabel <david.vrabel@citrix.com>
Cc: <xen-devel@lists.xenproject.org>
---
 arch/x86/include/asm/cpu.h |  2 --
 arch/x86/include/asm/smp.h |  2 +-
 arch/x86/kernel/smpboot.c  | 39 +++++++++++++++-----------------
 arch/x86/xen/smp.c         | 46 +++++++++++++++++++++-----------------
 4 files changed, 44 insertions(+), 45 deletions(-)

diff --git a/arch/x86/include/asm/cpu.h b/arch/x86/include/asm/cpu.h
index d2b12988d2ed..bf2caa1dedc5 100644
--- a/arch/x86/include/asm/cpu.h
+++ b/arch/x86/include/asm/cpu.h
@@ -34,8 +34,6 @@ extern int _debug_hotplug_cpu(int cpu, int action);
 #endif
 #endif
 
-DECLARE_PER_CPU(int, cpu_state);
-
 int mwait_usable(const struct cpuinfo_x86 *);
 
 #endif /* _ASM_X86_CPU_H */
diff --git a/arch/x86/include/asm/smp.h b/arch/x86/include/asm/smp.h
index 8cd1cc3bc835..a5cb4f6e9492 100644
--- a/arch/x86/include/asm/smp.h
+++ b/arch/x86/include/asm/smp.h
@@ -150,12 +150,12 @@ static inline void arch_send_call_function_ipi_mask(const struct cpumask *mask)
 }
 
 void cpu_disable_common(void);
-void cpu_die_common(unsigned int cpu);
 void native_smp_prepare_boot_cpu(void);
 void native_smp_prepare_cpus(unsigned int max_cpus);
 void native_smp_cpus_done(unsigned int max_cpus);
 int native_cpu_up(unsigned int cpunum, struct task_struct *tidle);
 int native_cpu_disable(void);
+int common_cpu_die(unsigned int cpu);
 void native_cpu_die(unsigned int cpu);
 void native_play_dead(void);
 void play_dead_common(void);
diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c
index febc6aabc72e..c8fa34963ead 100644
--- a/arch/x86/kernel/smpboot.c
+++ b/arch/x86/kernel/smpboot.c
@@ -77,9 +77,6 @@
 #include <asm/realmode.h>
 #include <asm/misc.h>
 
-/* State of each CPU */
-DEFINE_PER_CPU(int, cpu_state) = { 0 };
-
 /* Number of siblings per CPU package */
 int smp_num_siblings = 1;
 EXPORT_SYMBOL(smp_num_siblings);
@@ -257,7 +254,7 @@ static void notrace start_secondary(void *unused)
 	lock_vector_lock();
 	set_cpu_online(smp_processor_id(), true);
 	unlock_vector_lock();
-	per_cpu(cpu_state, smp_processor_id()) = CPU_ONLINE;
+	cpu_set_state_online(smp_processor_id());
 	x86_platform.nmi_init();
 
 	/* enable local interrupts */
@@ -948,7 +945,10 @@ int native_cpu_up(unsigned int cpu, struct task_struct *tidle)
 	 */
 	mtrr_save_state();
 
-	per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+	/* x86 CPUs take themselves offline, so delayed offline is OK. */
+	err = cpu_check_up_prepare(cpu);
+	if (err && err != -EBUSY)
+		return err;
 
 	/* the FPU context is blank, nobody can own it */
 	__cpu_disable_lazy_restore(cpu);
@@ -1191,7 +1191,7 @@ void __init native_smp_prepare_boot_cpu(void)
 	switch_to_new_gdt(me);
 	/* already set me in cpu_online_mask in boot_cpu_init() */
 	cpumask_set_cpu(me, cpu_callout_mask);
-	per_cpu(cpu_state, me) = CPU_ONLINE;
+	cpu_set_state_online(me);
 }
 
 void __init native_smp_cpus_done(unsigned int max_cpus)
@@ -1318,14 +1318,10 @@ static void __ref remove_cpu_from_maps(int cpu)
 	numa_remove_cpu(cpu);
 }
 
-static DEFINE_PER_CPU(struct completion, die_complete);
-
 void cpu_disable_common(void)
 {
 	int cpu = smp_processor_id();
 
-	init_completion(&per_cpu(die_complete, smp_processor_id()));
-
 	remove_siblinginfo(cpu);
 
 	/* It's now safe to remove this processor from the online map */
@@ -1349,24 +1345,27 @@ int native_cpu_disable(void)
 	return 0;
 }
 
-void cpu_die_common(unsigned int cpu)
+int common_cpu_die(unsigned int cpu)
 {
-	wait_for_completion_timeout(&per_cpu(die_complete, cpu), HZ);
-}
+	int ret = 0;
 
-void native_cpu_die(unsigned int cpu)
-{
 	/* We don't do anything here: idle task is faking death itself. */
 
-	cpu_die_common(cpu);
-
 	/* They ack this in play_dead() by setting CPU_DEAD */
-	if (per_cpu(cpu_state, cpu) == CPU_DEAD) {
+	if (cpu_wait_death(cpu, 5)) {
 		if (system_state == SYSTEM_RUNNING)
 			pr_info("CPU %u is now offline\n", cpu);
 	} else {
 		pr_err("CPU %u didn't die...\n", cpu);
+		ret = -1;
 	}
+
+	return ret;
+}
+
+void native_cpu_die(unsigned int cpu)
+{
+	common_cpu_die(cpu);
 }
 
 void play_dead_common(void)
@@ -1375,10 +1374,8 @@ void play_dead_common(void)
 	reset_lazy_tlbstate();
 	amd_e400_remove_cpu(raw_smp_processor_id());
 
-	mb();
 	/* Ack it */
-	__this_cpu_write(cpu_state, CPU_DEAD);
-	complete(&per_cpu(die_complete, smp_processor_id()));
+	(void)cpu_report_death();
 
 	/*
 	 * With physical CPU hotplug, we should halt the cpu
diff --git a/arch/x86/xen/smp.c b/arch/x86/xen/smp.c
index 08e8489c47f1..1c5e760f34ca 100644
--- a/arch/x86/xen/smp.c
+++ b/arch/x86/xen/smp.c
@@ -90,14 +90,10 @@ static void cpu_bringup(void)
 
 	set_cpu_online(cpu, true);
 
-	this_cpu_write(cpu_state, CPU_ONLINE);
-
-	wmb();
+	cpu_set_state_online(cpu);  /* Implies full memory barrier. */
 
 	/* We can take interrupts now: we're officially "up". */
 	local_irq_enable();
-
-	wmb();			/* make sure everything is out */
 }
 
 /*
@@ -459,7 +455,13 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
 	xen_setup_timer(cpu);
 	xen_init_lock_cpu(cpu);
 
-	per_cpu(cpu_state, cpu) = CPU_UP_PREPARE;
+	/*
+	 * PV VCPUs are always successfully taken down (see 'while' loop
+	 * in xen_cpu_die()), so -EBUSY is an error.
+	 */
+	rc = cpu_check_up_prepare(cpu);
+	if (rc)
+		return rc;
 
 	/* make sure interrupts start blocked */
 	per_cpu(xen_vcpu, cpu)->evtchn_upcall_mask = 1;
@@ -479,10 +481,8 @@ static int xen_cpu_up(unsigned int cpu, struct task_struct *idle)
 	rc = HYPERVISOR_vcpu_op(VCPUOP_up, cpu, NULL);
 	BUG_ON(rc);
 
-	while(per_cpu(cpu_state, cpu) != CPU_ONLINE) {
+	while (cpu_report_state(cpu) != CPU_ONLINE)
 		HYPERVISOR_sched_op(SCHEDOP_yield, NULL);
-		barrier();
-	}
 
 	return 0;
 }
@@ -511,11 +511,11 @@ static void xen_cpu_die(unsigned int cpu)
 		schedule_timeout(HZ/10);
 	}
 
-	cpu_die_common(cpu);
-
-	xen_smp_intr_free(cpu);
-	xen_uninit_lock_cpu(cpu);
-	xen_teardown_timer(cpu);
+	if (common_cpu_die(cpu) == 0) {
+		xen_smp_intr_free(cpu);
+		xen_uninit_lock_cpu(cpu);
+		xen_teardown_timer(cpu);
+	}
 }
 
 static void xen_play_dead(void) /* used only with HOTPLUG_CPU */
@@ -747,6 +747,16 @@ static void __init xen_hvm_smp_prepare_cpus(unsigned int max_cpus)
 static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
 {
 	int rc;
+
+	/*
+	 * This can happen if CPU was offlined earlier and
+	 * offlining timed out in common_cpu_die().
+	 */
+	if (cpu_report_state(cpu) == CPU_DEAD_FROZEN) {
+		xen_smp_intr_free(cpu);
+		xen_uninit_lock_cpu(cpu);
+	}
+
 	/*
 	 * xen_smp_intr_init() needs to run before native_cpu_up()
 	 * so that IPI vectors are set up on the booting CPU before
@@ -768,12 +778,6 @@ static int xen_hvm_cpu_up(unsigned int cpu, struct task_struct *tidle)
 	return rc;
 }
 
-static void xen_hvm_cpu_die(unsigned int cpu)
-{
-	xen_cpu_die(cpu);
-	native_cpu_die(cpu);
-}
-
 void __init xen_hvm_smp_init(void)
 {
 	if (!xen_have_vector_callback)
@@ -781,7 +785,7 @@ void __init xen_hvm_smp_init(void)
 	smp_ops.smp_prepare_cpus = xen_hvm_smp_prepare_cpus;
 	smp_ops.smp_send_reschedule = xen_smp_send_reschedule;
 	smp_ops.cpu_up = xen_hvm_cpu_up;
-	smp_ops.cpu_die = xen_hvm_cpu_die;
+	smp_ops.cpu_die = xen_cpu_die;
 	smp_ops.send_call_func_ipi = xen_smp_send_call_function_ipi;
 	smp_ops.send_call_func_single_ipi = xen_smp_send_call_function_single_ipi;
 	smp_ops.smp_prepare_boot_cpu = xen_smp_prepare_boot_cpu;

From a17b4b7487ebcb2aa6d0b859a0981e280d910622 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 26 Feb 2015 14:28:25 -0800
Subject: [PATCH 37/56] blackfin: Use common outgoing-CPU-notification code

This commit removes the open-coded CPU-offline notification with new
common code.  This change avoids calling scheduler code using RCU from
an offline CPU that RCU is ignoring.  This commit is compatible with
the existing code in not checking for timeout during a prior offline
for a given CPU.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: Steven Miao <realmz6@gmail.com>
Cc: <adi-buildroot-devel@lists.sourceforge.net>
---
 arch/blackfin/mach-common/smp.c | 6 ++----
 1 file changed, 2 insertions(+), 4 deletions(-)

diff --git a/arch/blackfin/mach-common/smp.c b/arch/blackfin/mach-common/smp.c
index 8ad3e90cc8fc..1c7259597395 100644
--- a/arch/blackfin/mach-common/smp.c
+++ b/arch/blackfin/mach-common/smp.c
@@ -413,16 +413,14 @@ int __cpu_disable(void)
 	return 0;
 }
 
-static DECLARE_COMPLETION(cpu_killed);
-
 int __cpu_die(unsigned int cpu)
 {
-	return wait_for_completion_timeout(&cpu_killed, 5000);
+	return cpu_wait_death(cpu, 5);
 }
 
 void cpu_die(void)
 {
-	complete(&cpu_killed);
+	(void)cpu_report_death();
 
 	atomic_dec(&init_mm.mm_users);
 	atomic_dec(&init_mm.mm_count);

From 490ab882e2719f5e809a0cb5af7fda4620b66dca Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 26 Feb 2015 14:57:25 -0800
Subject: [PATCH 38/56] metag: Use common outgoing-CPU-notification code

This commit removes the open-coded CPU-offline notification with new
common code.  This change avoids calling scheduler code using RCU from
an offline CPU that RCU is ignoring.  This commit is compatible with
the existing code in not checking for timeout during a prior offline
for a given CPU.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: James Hogan <james.hogan@imgtec.com>
Cc: <linux-metag@vger.kernel.org>
---
 arch/metag/kernel/smp.c | 5 ++---
 1 file changed, 2 insertions(+), 3 deletions(-)

diff --git a/arch/metag/kernel/smp.c b/arch/metag/kernel/smp.c
index f006d2276f40..ac3a199e33e7 100644
--- a/arch/metag/kernel/smp.c
+++ b/arch/metag/kernel/smp.c
@@ -261,7 +261,6 @@ int __cpu_up(unsigned int cpu, struct task_struct *idle)
 }
 
 #ifdef CONFIG_HOTPLUG_CPU
-static DECLARE_COMPLETION(cpu_killed);
 
 /*
  * __cpu_disable runs on the processor to be shutdown.
@@ -299,7 +298,7 @@ int __cpu_disable(void)
  */
 void __cpu_die(unsigned int cpu)
 {
-	if (!wait_for_completion_timeout(&cpu_killed, msecs_to_jiffies(1)))
+	if (!cpu_wait_death(cpu, 1))
 		pr_err("CPU%u: unable to kill\n", cpu);
 }
 
@@ -314,7 +313,7 @@ void cpu_die(void)
 	local_irq_disable();
 	idle_task_exit();
 
-	complete(&cpu_killed);
+	(void)cpu_report_death();
 
 	asm ("XOR	TXENABLE, D0Re0,D0Re0\n");
 }

From b33078b6098148c3efdacc907249a247c9d5491e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 16 Jan 2015 14:01:21 -0800
Subject: [PATCH 39/56] rcu: Consolidate offline-CPU callback initialization

Currently, both rcu_cleanup_dead_cpu() and rcu_send_cbs_to_orphanage()
initialize the outgoing CPU's callback list.  However, only
rcu_cleanup_dead_cpu() invokes rcu_send_cbs_to_orphanage(), and
it does so unconditionally, which means that only one of these
initializations is required.  This commit therefore consolidates the
callback-list initialization with the rest of the callback handling in
rcu_send_cbs_to_orphanage().

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 9 +++++----
 1 file changed, 5 insertions(+), 4 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 48d640ca1a05..8e020c59ecfd 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2256,8 +2256,12 @@ rcu_send_cbs_to_orphanage(int cpu, struct rcu_state *rsp,
 		rsp->orphan_donetail = rdp->nxttail[RCU_DONE_TAIL];
 	}
 
-	/* Finally, initialize the rcu_data structure's list to empty.  */
+	/*
+	 * Finally, initialize the rcu_data structure's list to empty and
+	 * disallow further callbacks on this CPU.
+	 */
 	init_callback_list(rdp);
+	rdp->nxttail[RCU_NEXT_TAIL] = NULL;
 }
 
 /*
@@ -2398,9 +2402,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 	WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
 		  "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
 		  cpu, rdp->qlen, rdp->nxtlist);
-	init_callback_list(rdp);
-	/* Disallow further callbacks on this CPU. */
-	rdp->nxttail[RCU_NEXT_TAIL] = NULL;
 	mutex_unlock(&rsp->onoff_mutex);
 }
 

From 78043c467a91573cc1d51827fe10d7d15ae79a60 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 18 Jan 2015 17:46:24 -0800
Subject: [PATCH 40/56] rcu: Put all orphan-callback-related code under same
 comment

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 8e020c59ecfd..98da632d1d49 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2385,9 +2385,9 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 
 	/* Exclude any attempts to start a new grace period. */
 	mutex_lock(&rsp->onoff_mutex);
-	raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
 
 	/* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
+	raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
 	rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
 	rcu_adopt_orphan_cbs(rsp, flags);
 	raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);

From c8aead6a9b27fdd94b7bcb74b587ae012d8145f2 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 19 Jan 2015 16:56:46 -0800
Subject: [PATCH 41/56] rcu: Simplify sync_rcu_preempt_exp_init()

This commit eliminates a boolean and associated "if" statement by
rearranging the code.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree_plugin.h | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 0a571e9a0f1d..d37c9fbdba71 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -677,19 +677,16 @@ static void
 sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
 {
 	unsigned long flags;
-	int must_wait = 0;
 
 	raw_spin_lock_irqsave(&rnp->lock, flags);
 	smp_mb__after_unlock_lock();
 	if (!rcu_preempt_has_tasks(rnp)) {
 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
+		rcu_report_exp_rnp(rsp, rnp, false); /* No tasks, report. */
 	} else {
 		rnp->exp_tasks = rnp->blkd_tasks.next;
 		rcu_initiate_boost(rnp, flags);  /* releases rnp->lock */
-		must_wait = 1;
 	}
-	if (!must_wait)
-		rcu_report_exp_rnp(rsp, rnp, false); /* Don't wake self. */
 }
 
 /**

From 18c629eaebf1814ca7f0c27327f75aa93aa4a5de Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 19 Jan 2015 18:59:56 -0800
Subject: [PATCH 42/56] rcu: Eliminate empty HOTPLUG_CPU ifdef

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree_plugin.h | 4 ----
 1 file changed, 4 deletions(-)

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index d37c9fbdba71..79376e2461c9 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -520,10 +520,6 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp)
 	WARN_ON_ONCE(rnp->qsmask);
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
-
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
 /*
  * Check for a quiescent state from the current CPU.  When a task blocks,
  * the task is recorded in the corresponding CPU's rcu_node structure,

From 237a0f2193c6daf9b1edd7fd15d55e680f268952 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 22 Jan 2015 14:32:06 -0800
Subject: [PATCH 43/56] rcu: Detect stalls caused by failure to propagate up
 rcu_node tree

If all CPUs have passed through quiescent states, then stalls might be
due to starvation of the grace-period kthread or to failure to propagate
the quiescent states up the rcu_node combining tree.  The current stall
warning messages do not differentiate, so this commit adds a printout
of the root rcu_node structure's ->qsmask field.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 5 +++--
 1 file changed, 3 insertions(+), 2 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 98da632d1d49..3b7e4133ca99 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1196,9 +1196,10 @@ static void print_other_cpu_stall(struct rcu_state *rsp, unsigned long gpnum)
 		} else {
 			j = jiffies;
 			gpa = ACCESS_ONCE(rsp->gp_activity);
-			pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld\n",
+			pr_err("All QSes seen, last %s kthread activity %ld (%ld-%ld), jiffies_till_next_fqs=%ld, root ->qsmask %#lx\n",
 			       rsp->name, j - gpa, j, gpa,
-			       jiffies_till_next_fqs);
+			       jiffies_till_next_fqs,
+			       rcu_get_root(rsp)->qsmask);
 			/* In this case, the current CPU might be at fault. */
 			sched_show_task(current);
 		}

From 37745d281069682d901f00c0121949a7d224195f Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 22 Jan 2015 18:24:08 -0800
Subject: [PATCH 44/56] rcu: Provide diagnostic option to slow down
 grace-period initialization

Grace-period initialization normally proceeds quite quickly, so
that it is very difficult to reproduce races against grace-period
initialization.  This commit therefore allows grace-period
initialization to be artificially slowed down, increasing
race-reproduction probability.  A pair of new Kconfig parameters are
provided, CONFIG_RCU_TORTURE_TEST_SLOW_INIT to enable the slowdowns, and
CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY to specify the number of jiffies
of slowdown to apply.  A boot-time parameter named rcutree.gp_init_delay
allows boot-time delay to be specified.  By default, no delay will be
applied even if CONFIG_RCU_TORTURE_TEST_SLOW_INIT is set.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 Documentation/kernel-parameters.txt |  6 ++++++
 kernel/rcu/tree.c                   | 10 ++++++++++
 lib/Kconfig.debug                   | 24 ++++++++++++++++++++++++
 3 files changed, 40 insertions(+)

diff --git a/Documentation/kernel-parameters.txt b/Documentation/kernel-parameters.txt
index bfcb1a62a7b4..94de410ec341 100644
--- a/Documentation/kernel-parameters.txt
+++ b/Documentation/kernel-parameters.txt
@@ -2968,6 +2968,12 @@ bytes respectively. Such letter suffixes can also be entirely omitted.
 			Set maximum number of finished RCU callbacks to
 			process in one batch.
 
+	rcutree.gp_init_delay=	[KNL]
+			Set the number of jiffies to delay each step of
+			RCU grace-period initialization.  This only has
+			effect when CONFIG_RCU_TORTURE_TEST_SLOW_INIT is
+			set.
+
 	rcutree.rcu_fanout_leaf= [KNL]
 			Increase the number of CPUs assigned to each
 			leaf rcu_node structure.  Useful for very large
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 3b7e4133ca99..b42001fd55fb 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -160,6 +160,12 @@ static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
 static int kthread_prio = CONFIG_RCU_KTHREAD_PRIO;
 module_param(kthread_prio, int, 0644);
 
+/* Delay in jiffies for grace-period initialization delays. */
+static int gp_init_delay = IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT)
+				? CONFIG_RCU_TORTURE_TEST_SLOW_INIT_DELAY
+				: 0;
+module_param(gp_init_delay, int, 0644);
+
 /*
  * Track the rcutorture test sequence number and the update version
  * number within a given test.  The rcutorture_testseq is incremented
@@ -1769,6 +1775,10 @@ static int rcu_gp_init(struct rcu_state *rsp)
 		raw_spin_unlock_irq(&rnp->lock);
 		cond_resched_rcu_qs();
 		ACCESS_ONCE(rsp->gp_activity) = jiffies;
+		if (IS_ENABLED(CONFIG_RCU_TORTURE_TEST_SLOW_INIT) &&
+		    gp_init_delay > 0 &&
+		    !(rsp->gpnum % (rcu_num_nodes * 10)))
+			schedule_timeout_uninterruptible(gp_init_delay);
 	}
 
 	mutex_unlock(&rsp->onoff_mutex);
diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index c5cefb3c009c..feee8dab441e 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1257,6 +1257,30 @@ config RCU_TORTURE_TEST_RUNNABLE
 	  Say N here if you want the RCU torture tests to start only
 	  after being manually enabled via /proc.
 
+config RCU_TORTURE_TEST_SLOW_INIT
+	bool "Slow down RCU grace-period initialization to expose races"
+	depends on RCU_TORTURE_TEST
+	help
+	  This option makes grace-period initialization block for a
+	  few jiffies between initializing each pair of consecutive
+	  rcu_node structures.	This helps to expose races involving
+	  grace-period initialization, in other words, it makes your
+	  kernel less stable.  It can also greatly increase grace-period
+	  latency, especially on systems with large numbers of CPUs.
+	  This is useful when torture-testing RCU, but in almost no
+	  other circumstance.
+
+	  Say Y here if you want your system to crash and hang more often.
+	  Say N if you want a sane system.
+
+config RCU_TORTURE_TEST_SLOW_INIT_DELAY
+	int "How much to slow down RCU grace-period initialization"
+	range 0 5
+	default 0
+	help
+	  This option specifies the number of jiffies to wait between
+	  each rcu_node structure initialization.
+
 config RCU_CPU_STALL_TIMEOUT
 	int "RCU CPU stall timeout in seconds"
 	depends on RCU_STALL_COMMON

From b6505deafa1397c81c3f268bfe0f349cf0be2b97 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 22 Jan 2015 18:39:24 -0800
Subject: [PATCH 45/56] rcutorture: Enable slow grace-period initializations

This commit sets CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y, but leaves the
default time zero.  This can be overridden by passing the
"--bootargs rcutree.gp_init_delay=1" argument to kvm.sh.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 tools/testing/selftests/rcutorture/configs/rcu/CFcommon | 1 +
 1 file changed, 1 insertion(+)

diff --git a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon
index d2d2a86139db..49701218dc62 100644
--- a/tools/testing/selftests/rcutorture/configs/rcu/CFcommon
+++ b/tools/testing/selftests/rcutorture/configs/rcu/CFcommon
@@ -1,2 +1,3 @@
 CONFIG_RCU_TORTURE_TEST=y
 CONFIG_PRINTK_TIME=y
+CONFIG_RCU_TORTURE_TEST_SLOW_INIT=y

From 999c286347538388170f919146d7cfa58689472e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sat, 31 Jan 2015 21:12:02 -0800
Subject: [PATCH 46/56] rcu: Remove event tracing from rcu_cpu_notify(), used
 by offline CPUs

Offline CPUs cannot safely invoke trace events, but such CPUs do execute
within rcu_cpu_notify().  Therefore, this commit removes the trace events
from rcu_cpu_notify().  These trace events are for utilization, against
which rcu_cpu_notify() execution time should be negligible.

Reported-by: Fengguang Wu <fengguang.wu@intel.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 2 --
 1 file changed, 2 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b42001fd55fb..a7151d26b940 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -3629,7 +3629,6 @@ static int rcu_cpu_notify(struct notifier_block *self,
 	struct rcu_node *rnp = rdp->mynode;
 	struct rcu_state *rsp;
 
-	trace_rcu_utilization(TPS("Start CPU hotplug"));
 	switch (action) {
 	case CPU_UP_PREPARE:
 	case CPU_UP_PREPARE_FROZEN:
@@ -3661,7 +3660,6 @@ static int rcu_cpu_notify(struct notifier_block *self,
 	default:
 		break;
 	}
-	trace_rcu_utilization(TPS("End CPU hotplug"));
 	return NOTIFY_OK;
 }
 

From 8eb74b2b291e7bf6aa59fcb4e59f236382f00bf5 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 13 Feb 2015 10:52:28 -0800
Subject: [PATCH 47/56] rcu: Rework preemptible expedited bitmask handling

Currently, the rcu_node tree ->expmask bitmasks are initially set to
reflect the online CPUs.  This is pointless, because only the CPUs
preempted within RCU read-side critical sections by the preceding
synchronize_sched_expedited() need to be tracked.  This commit therefore
instead sets up these bitmasks based on the state of the ->blkd_tasks
lists.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree_plugin.h | 100 +++++++++++++++++++++++++++++----------
 1 file changed, 76 insertions(+), 24 deletions(-)

diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index 79376e2461c9..a22721547442 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -626,9 +626,6 @@ static int sync_rcu_preempt_exp_done(struct rcu_node *rnp)
  * recursively up the tree.  (Calm down, calm down, we do the recursion
  * iteratively!)
  *
- * Most callers will set the "wake" flag, but the task initiating the
- * expedited grace period need not wake itself.
- *
  * Caller must hold sync_rcu_preempt_exp_mutex.
  */
 static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
@@ -663,26 +660,85 @@ static void rcu_report_exp_rnp(struct rcu_state *rsp, struct rcu_node *rnp,
 
 /*
  * Snapshot the tasks blocking the newly started preemptible-RCU expedited
- * grace period for the specified rcu_node structure.  If there are no such
- * tasks, report it up the rcu_node hierarchy.
+ * grace period for the specified rcu_node structure, phase 1.  If there
+ * are such tasks, set the ->expmask bits up the rcu_node tree and also
+ * set the ->expmask bits on the leaf rcu_node structures to tell phase 2
+ * that work is needed here.
  *
- * Caller must hold sync_rcu_preempt_exp_mutex and must exclude
- * CPU hotplug operations.
+ * Caller must hold sync_rcu_preempt_exp_mutex.
  */
 static void
-sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
+sync_rcu_preempt_exp_init1(struct rcu_state *rsp, struct rcu_node *rnp)
+{
+	unsigned long flags;
+	unsigned long mask;
+	struct rcu_node *rnp_up;
+
+	raw_spin_lock_irqsave(&rnp->lock, flags);
+	smp_mb__after_unlock_lock();
+	WARN_ON_ONCE(rnp->expmask);
+	WARN_ON_ONCE(rnp->exp_tasks);
+	if (!rcu_preempt_has_tasks(rnp)) {
+		/* No blocked tasks, nothing to do. */
+		raw_spin_unlock_irqrestore(&rnp->lock, flags);
+		return;
+	}
+	/* Call for Phase 2 and propagate ->expmask bits up the tree. */
+	rnp->expmask = 1;
+	rnp_up = rnp;
+	while (rnp_up->parent) {
+		mask = rnp_up->grpmask;
+		rnp_up = rnp_up->parent;
+		if (rnp_up->expmask & mask)
+			break;
+		raw_spin_lock(&rnp_up->lock); /* irqs already off */
+		smp_mb__after_unlock_lock();
+		rnp_up->expmask |= mask;
+		raw_spin_unlock(&rnp_up->lock); /* irqs still off */
+	}
+	raw_spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
+/*
+ * Snapshot the tasks blocking the newly started preemptible-RCU expedited
+ * grace period for the specified rcu_node structure, phase 2.  If the
+ * leaf rcu_node structure has its ->expmask field set, check for tasks.
+ * If there are some, clear ->expmask and set ->exp_tasks accordingly,
+ * then initiate RCU priority boosting.  Otherwise, clear ->expmask and
+ * invoke rcu_report_exp_rnp() to clear out the upper-level ->expmask bits,
+ * enabling rcu_read_unlock_special() to do the bit-clearing.
+ *
+ * Caller must hold sync_rcu_preempt_exp_mutex.
+ */
+static void
+sync_rcu_preempt_exp_init2(struct rcu_state *rsp, struct rcu_node *rnp)
 {
 	unsigned long flags;
 
 	raw_spin_lock_irqsave(&rnp->lock, flags);
 	smp_mb__after_unlock_lock();
-	if (!rcu_preempt_has_tasks(rnp)) {
+	if (!rnp->expmask) {
+		/* Phase 1 didn't do anything, so Phase 2 doesn't either. */
 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
-		rcu_report_exp_rnp(rsp, rnp, false); /* No tasks, report. */
-	} else {
+		return;
+	}
+
+	/* Phase 1 is over. */
+	rnp->expmask = 0;
+
+	/*
+	 * If there are still blocked tasks, set up ->exp_tasks so that
+	 * rcu_read_unlock_special() will wake us and then boost them.
+	 */
+	if (rcu_preempt_has_tasks(rnp)) {
 		rnp->exp_tasks = rnp->blkd_tasks.next;
 		rcu_initiate_boost(rnp, flags);  /* releases rnp->lock */
+		return;
 	}
+
+	/* No longer any blocked tasks, so undo bit setting. */
+	raw_spin_unlock_irqrestore(&rnp->lock, flags);
+	rcu_report_exp_rnp(rsp, rnp, false);
 }
 
 /**
@@ -699,7 +755,6 @@ sync_rcu_preempt_exp_init(struct rcu_state *rsp, struct rcu_node *rnp)
  */
 void synchronize_rcu_expedited(void)
 {
-	unsigned long flags;
 	struct rcu_node *rnp;
 	struct rcu_state *rsp = &rcu_preempt_state;
 	unsigned long snap;
@@ -750,19 +805,16 @@ void synchronize_rcu_expedited(void)
 	/* force all RCU readers onto ->blkd_tasks lists. */
 	synchronize_sched_expedited();
 
-	/* Initialize ->expmask for all non-leaf rcu_node structures. */
-	rcu_for_each_nonleaf_node_breadth_first(rsp, rnp) {
-		raw_spin_lock_irqsave(&rnp->lock, flags);
-		smp_mb__after_unlock_lock();
-		rnp->expmask = rnp->qsmaskinit;
-		raw_spin_unlock_irqrestore(&rnp->lock, flags);
-	}
-
-	/* Snapshot current state of ->blkd_tasks lists. */
+	/*
+	 * Snapshot current state of ->blkd_tasks lists into ->expmask.
+	 * Phase 1 sets bits and phase 2 permits rcu_read_unlock_special()
+	 * to start clearing them.  Doing this in one phase leads to
+	 * strange races between setting and clearing bits, so just say "no"!
+	 */
 	rcu_for_each_leaf_node(rsp, rnp)
-		sync_rcu_preempt_exp_init(rsp, rnp);
-	if (NUM_RCU_NODES > 1)
-		sync_rcu_preempt_exp_init(rsp, rcu_get_root(rsp));
+		sync_rcu_preempt_exp_init1(rsp, rnp);
+	rcu_for_each_leaf_node(rsp, rnp)
+		sync_rcu_preempt_exp_init2(rsp, rnp);
 
 	put_online_cpus();
 

From cc99a310caf811aebbd0986f433d824e4a5e7ce5 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 23 Feb 2015 08:59:29 -0800
Subject: [PATCH 48/56] rcu: Move rcu_report_unblock_qs_rnp() to common code

The rcu_report_unblock_qs_rnp() function is invoked when the
last task blocking the current grace period exits its outermost
RCU read-side critical section.  Previously, this was called only
from rcu_read_unlock_special(), and was therefore defined only when
CONFIG_RCU_PREEMPT=y.  However, this function will be invoked even when
CONFIG_RCU_PREEMPT=n once CPU-hotplug operations are processed only at
the beginnings of RCU grace periods.  The reason for this change is that
the last task on a given leaf rcu_node structure's ->blkd_tasks list
might well exit its RCU read-side critical section between the time that
recent CPU-hotplug operations were applied and when the new grace period
was initialized.  This situation could result in RCU waiting forever on
that leaf rcu_node structure, because if all that structure's CPUs were
already offline, there would be no quiescent-state events to drive that
structure's part of the grace period.

This commit therefore moves rcu_report_unblock_qs_rnp() to common code
that is built unconditionally so that the quiescent-state-forcing code
can clean up after this situation, avoiding the grace-period stall.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c        | 39 +++++++++++++++++++++++++++++++++++++++
 kernel/rcu/tree_plugin.h | 40 ++--------------------------------------
 2 files changed, 41 insertions(+), 38 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index a7151d26b940..5b5cb1ff73ed 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2126,6 +2126,45 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
 	rcu_report_qs_rsp(rsp, flags); /* releases rnp->lock. */
 }
 
+/*
+ * Record a quiescent state for all tasks that were previously queued
+ * on the specified rcu_node structure and that were blocking the current
+ * RCU grace period.  The caller must hold the specified rnp->lock with
+ * irqs disabled, and this lock is released upon return, but irqs remain
+ * disabled.
+ */
+static void __maybe_unused rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
+				      struct rcu_node *rnp, unsigned long flags)
+	__releases(rnp->lock)
+{
+	unsigned long mask;
+	struct rcu_node *rnp_p;
+
+	WARN_ON_ONCE(rsp == &rcu_bh_state || rsp == &rcu_sched_state);
+	if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
+		raw_spin_unlock_irqrestore(&rnp->lock, flags);
+		return;  /* Still need more quiescent states! */
+	}
+
+	rnp_p = rnp->parent;
+	if (rnp_p == NULL) {
+		/*
+		 * Either there is only one rcu_node in the tree,
+		 * or tasks were kicked up to root rcu_node due to
+		 * CPUs going offline.
+		 */
+		rcu_report_qs_rsp(rsp, flags);
+		return;
+	}
+
+	/* Report up the rest of the hierarchy. */
+	mask = rnp->grpmask;
+	raw_spin_unlock(&rnp->lock);	/* irqs remain disabled. */
+	raw_spin_lock(&rnp_p->lock);	/* irqs already disabled. */
+	smp_mb__after_unlock_lock();
+	rcu_report_qs_rnp(mask, rsp, rnp_p, flags);
+}
+
 /*
  * Record a quiescent state for the specified CPU to that CPU's rcu_data
  * structure.  This must be either called from the specified CPU, or
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index a22721547442..ec6c2efb28cd 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -232,43 +232,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
 	return rnp->gp_tasks != NULL;
 }
 
-/*
- * Record a quiescent state for all tasks that were previously queued
- * on the specified rcu_node structure and that were blocking the current
- * RCU grace period.  The caller must hold the specified rnp->lock with
- * irqs disabled, and this lock is released upon return, but irqs remain
- * disabled.
- */
-static void rcu_report_unblock_qs_rnp(struct rcu_node *rnp, unsigned long flags)
-	__releases(rnp->lock)
-{
-	unsigned long mask;
-	struct rcu_node *rnp_p;
-
-	if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
-		raw_spin_unlock_irqrestore(&rnp->lock, flags);
-		return;  /* Still need more quiescent states! */
-	}
-
-	rnp_p = rnp->parent;
-	if (rnp_p == NULL) {
-		/*
-		 * Either there is only one rcu_node in the tree,
-		 * or tasks were kicked up to root rcu_node due to
-		 * CPUs going offline.
-		 */
-		rcu_report_qs_rsp(&rcu_preempt_state, flags);
-		return;
-	}
-
-	/* Report up the rest of the hierarchy. */
-	mask = rnp->grpmask;
-	raw_spin_unlock(&rnp->lock);	/* irqs remain disabled. */
-	raw_spin_lock(&rnp_p->lock);	/* irqs already disabled. */
-	smp_mb__after_unlock_lock();
-	rcu_report_qs_rnp(mask, &rcu_preempt_state, rnp_p, flags);
-}
-
 /*
  * Advance a ->blkd_tasks-list pointer to the next entry, instead
  * returning NULL if at the end of the list.
@@ -399,7 +362,8 @@ void rcu_read_unlock_special(struct task_struct *t)
 							 rnp->grplo,
 							 rnp->grphi,
 							 !!rnp->gp_tasks);
-			rcu_report_unblock_qs_rnp(rnp, flags);
+			rcu_report_unblock_qs_rnp(&rcu_preempt_state,
+						  rnp, flags);
 		} else {
 			raw_spin_unlock_irqrestore(&rnp->lock, flags);
 		}

From 0aa04b055e71bd3b8040dd71a126126c66b6f01e Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 23 Jan 2015 21:52:37 -0800
Subject: [PATCH 49/56] rcu: Process offlining and onlining only at
 grace-period start

Races between CPU hotplug and grace periods can be difficult to resolve,
so the ->onoff_mutex is used to exclude the two events.  Unfortunately,
this means that it is impossible for an outgoing CPU to perform the
last bits of its offlining from its last pass through the idle loop,
because sleeplocks cannot be acquired in that context.

This commit avoids these problems by buffering online and offline events
in a new ->qsmaskinitnext field in the leaf rcu_node structures.  When a
grace period starts, the events accumulated in this mask are applied to
the ->qsmaskinit field, and, if needed, up the rcu_node tree.  The special
case of all CPUs corresponding to a given leaf rcu_node structure being
offline while there are still elements in that structure's ->blkd_tasks
list is handled using a new ->wait_blkd_tasks field.  In this case,
propagating the offline bits up the tree is deferred until the beginning
of the grace period after all of the tasks have exited their RCU read-side
critical sections and removed themselves from the list, at which point
the ->wait_blkd_tasks flag is cleared.  If one of that leaf rcu_node
structure's CPUs comes back online before the list empties, then the
->wait_blkd_tasks flag is simply cleared.

This of course means that RCU's notion of which CPUs are offline can be
out of date.  This is OK because RCU need only wait on CPUs that were
online at the time that the grace period started.  In addition, RCU's
force-quiescent-state actions will handle the case where a CPU goes
offline after the grace period starts.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c        | 154 ++++++++++++++++++++++++++++++---------
 kernel/rcu/tree.h        |   9 +++
 kernel/rcu/tree_plugin.h |  22 +-----
 kernel/rcu/tree_trace.c  |   4 +-
 4 files changed, 136 insertions(+), 53 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 5b5cb1ff73ed..f0f4d3510d24 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -152,6 +152,8 @@ EXPORT_SYMBOL_GPL(rcu_scheduler_active);
  */
 static int rcu_scheduler_fully_active __read_mostly;
 
+static void rcu_init_new_rnp(struct rcu_node *rnp_leaf);
+static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf);
 static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu);
 static void invoke_rcu_core(void);
 static void invoke_rcu_callbacks(struct rcu_state *rsp, struct rcu_data *rdp);
@@ -178,6 +180,17 @@ module_param(gp_init_delay, int, 0644);
 unsigned long rcutorture_testseq;
 unsigned long rcutorture_vernum;
 
+/*
+ * Compute the mask of online CPUs for the specified rcu_node structure.
+ * This will not be stable unless the rcu_node structure's ->lock is
+ * held, but the bit corresponding to the current CPU will be stable
+ * in most contexts.
+ */
+unsigned long rcu_rnp_online_cpus(struct rcu_node *rnp)
+{
+	return ACCESS_ONCE(rnp->qsmaskinitnext);
+}
+
 /*
  * Return true if an RCU grace period is in progress.  The ACCESS_ONCE()s
  * permit this function to be invoked without holding the root rcu_node
@@ -960,7 +973,7 @@ bool rcu_lockdep_current_cpu_online(void)
 	preempt_disable();
 	rdp = this_cpu_ptr(&rcu_sched_data);
 	rnp = rdp->mynode;
-	ret = (rdp->grpmask & rnp->qsmaskinit) ||
+	ret = (rdp->grpmask & rcu_rnp_online_cpus(rnp)) ||
 	      !rcu_scheduler_fully_active;
 	preempt_enable();
 	return ret;
@@ -1710,6 +1723,7 @@ static void note_gp_changes(struct rcu_state *rsp, struct rcu_data *rdp)
  */
 static int rcu_gp_init(struct rcu_state *rsp)
 {
+	unsigned long oldmask;
 	struct rcu_data *rdp;
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
@@ -1744,6 +1758,55 @@ static int rcu_gp_init(struct rcu_state *rsp)
 	mutex_lock(&rsp->onoff_mutex);
 	smp_mb__after_unlock_lock(); /* ->gpnum increment before GP! */
 
+	/*
+	 * Apply per-leaf buffered online and offline operations to the
+	 * rcu_node tree.  Note that this new grace period need not wait
+	 * for subsequent online CPUs, and that quiescent-state forcing
+	 * will handle subsequent offline CPUs.
+	 */
+	rcu_for_each_leaf_node(rsp, rnp) {
+		raw_spin_lock_irq(&rnp->lock);
+		smp_mb__after_unlock_lock();
+		if (rnp->qsmaskinit == rnp->qsmaskinitnext &&
+		    !rnp->wait_blkd_tasks) {
+			/* Nothing to do on this leaf rcu_node structure. */
+			raw_spin_unlock_irq(&rnp->lock);
+			continue;
+		}
+
+		/* Record old state, apply changes to ->qsmaskinit field. */
+		oldmask = rnp->qsmaskinit;
+		rnp->qsmaskinit = rnp->qsmaskinitnext;
+
+		/* If zero-ness of ->qsmaskinit changed, propagate up tree. */
+		if (!oldmask != !rnp->qsmaskinit) {
+			if (!oldmask) /* First online CPU for this rcu_node. */
+				rcu_init_new_rnp(rnp);
+			else if (rcu_preempt_has_tasks(rnp)) /* blocked tasks */
+				rnp->wait_blkd_tasks = true;
+			else /* Last offline CPU and can propagate. */
+				rcu_cleanup_dead_rnp(rnp);
+		}
+
+		/*
+		 * If all waited-on tasks from prior grace period are
+		 * done, and if all this rcu_node structure's CPUs are
+		 * still offline, propagate up the rcu_node tree and
+		 * clear ->wait_blkd_tasks.  Otherwise, if one of this
+		 * rcu_node structure's CPUs has since come back online,
+		 * simply clear ->wait_blkd_tasks (but rcu_cleanup_dead_rnp()
+		 * checks for this, so just call it unconditionally).
+		 */
+		if (rnp->wait_blkd_tasks &&
+		    (!rcu_preempt_has_tasks(rnp) ||
+		     rnp->qsmaskinit)) {
+			rnp->wait_blkd_tasks = false;
+			rcu_cleanup_dead_rnp(rnp);
+		}
+
+		raw_spin_unlock_irq(&rnp->lock);
+	}
+
 	/*
 	 * Set the quiescent-state-needed bits in all the rcu_node
 	 * structures for all currently online CPUs in breadth-first order,
@@ -2133,7 +2196,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
  * irqs disabled, and this lock is released upon return, but irqs remain
  * disabled.
  */
-static void __maybe_unused rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
+static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
 				      struct rcu_node *rnp, unsigned long flags)
 	__releases(rnp->lock)
 {
@@ -2409,6 +2472,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
 		raw_spin_lock(&rnp->lock); /* irqs already disabled. */
 		smp_mb__after_unlock_lock(); /* GP memory ordering. */
 		rnp->qsmaskinit &= ~mask;
+		rnp->qsmask &= ~mask;
 		if (rnp->qsmaskinit) {
 			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
 			return;
@@ -2427,6 +2491,7 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
 static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 {
 	unsigned long flags;
+	unsigned long mask;
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
 
@@ -2443,12 +2508,12 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 	raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);
 
 	/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
+	mask = rdp->grpmask;
 	raw_spin_lock_irqsave(&rnp->lock, flags);
 	smp_mb__after_unlock_lock();	/* Enforce GP memory-order guarantee. */
-	rnp->qsmaskinit &= ~rdp->grpmask;
-	if (rnp->qsmaskinit == 0 && !rcu_preempt_has_tasks(rnp))
-		rcu_cleanup_dead_rnp(rnp);
-	rcu_report_qs_rnp(rdp->grpmask, rsp, rnp, flags); /* Rlses rnp->lock. */
+	rnp->qsmaskinitnext &= ~mask;
+	raw_spin_unlock_irqrestore(&rnp->lock, flags);
+
 	WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
 		  "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
 		  cpu, rdp->qlen, rdp->nxtlist);
@@ -2654,12 +2719,21 @@ static void force_qs_rnp(struct rcu_state *rsp,
 			}
 		}
 		if (mask != 0) {
-
-			/* rcu_report_qs_rnp() releases rnp->lock. */
+			/* Idle/offline CPUs, report. */
 			rcu_report_qs_rnp(mask, rsp, rnp, flags);
-			continue;
+		} else if (rnp->parent &&
+			 list_empty(&rnp->blkd_tasks) &&
+			 !rnp->qsmask &&
+			 (rnp->parent->qsmask & rnp->grpmask)) {
+			/*
+			 * Race between grace-period initialization and task
+			 * existing RCU read-side critical section, report.
+			 */
+			rcu_report_unblock_qs_rnp(rsp, rnp, flags);
+		} else {
+			/* Nothing to do here, so just drop the lock. */
+			raw_spin_unlock_irqrestore(&rnp->lock, flags);
 		}
-		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 	}
 }
 
@@ -3568,6 +3642,28 @@ void rcu_barrier_sched(void)
 }
 EXPORT_SYMBOL_GPL(rcu_barrier_sched);
 
+/*
+ * Propagate ->qsinitmask bits up the rcu_node tree to account for the
+ * first CPU in a given leaf rcu_node structure coming online.  The caller
+ * must hold the corresponding leaf rcu_node ->lock with interrrupts
+ * disabled.
+ */
+static void rcu_init_new_rnp(struct rcu_node *rnp_leaf)
+{
+	long mask;
+	struct rcu_node *rnp = rnp_leaf;
+
+	for (;;) {
+		mask = rnp->grpmask;
+		rnp = rnp->parent;
+		if (rnp == NULL)
+			return;
+		raw_spin_lock(&rnp->lock); /* Interrupts already disabled. */
+		rnp->qsmaskinit |= mask;
+		raw_spin_unlock(&rnp->lock); /* Interrupts remain disabled. */
+	}
+}
+
 /*
  * Do boot-time initialization of a CPU's per-CPU RCU data.
  */
@@ -3620,31 +3716,23 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
 		   (atomic_read(&rdp->dynticks->dynticks) & ~0x1) + 1);
 	raw_spin_unlock(&rnp->lock);		/* irqs remain disabled. */
 
-	/* Add CPU to rcu_node bitmasks. */
+	/*
+	 * Add CPU to leaf rcu_node pending-online bitmask.  Any needed
+	 * propagation up the rcu_node tree will happen at the beginning
+	 * of the next grace period.
+	 */
 	rnp = rdp->mynode;
 	mask = rdp->grpmask;
-	do {
-		/* Exclude any attempts to start a new GP on small systems. */
-		raw_spin_lock(&rnp->lock);	/* irqs already disabled. */
-		rnp->qsmaskinit |= mask;
-		mask = rnp->grpmask;
-		if (rnp == rdp->mynode) {
-			/*
-			 * If there is a grace period in progress, we will
-			 * set up to wait for it next time we run the
-			 * RCU core code.
-			 */
-			rdp->gpnum = rnp->completed;
-			rdp->completed = rnp->completed;
-			rdp->passed_quiesce = 0;
-			rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
-			rdp->qs_pending = 0;
-			trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
-		}
-		raw_spin_unlock(&rnp->lock); /* irqs already disabled. */
-		rnp = rnp->parent;
-	} while (rnp != NULL && !(rnp->qsmaskinit & mask));
-	local_irq_restore(flags);
+	raw_spin_lock(&rnp->lock);		/* irqs already disabled. */
+	smp_mb__after_unlock_lock();
+	rnp->qsmaskinitnext |= mask;
+	rdp->gpnum = rnp->completed; /* Make CPU later note any new GP. */
+	rdp->completed = rnp->completed;
+	rdp->passed_quiesce = false;
+	rdp->rcu_qs_ctr_snap = __this_cpu_read(rcu_qs_ctr);
+	rdp->qs_pending = false;
+	trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
+	raw_spin_unlock_irqrestore(&rnp->lock, flags);
 
 	mutex_unlock(&rsp->onoff_mutex);
 }
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index 119de399eb2f..aa42562ff5b2 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -141,12 +141,20 @@ struct rcu_node {
 				/*  complete (only for PREEMPT_RCU). */
 	unsigned long qsmaskinit;
 				/* Per-GP initial value for qsmask & expmask. */
+				/*  Initialized from ->qsmaskinitnext at the */
+				/*  beginning of each grace period. */
+	unsigned long qsmaskinitnext;
+				/* Online CPUs for next grace period. */
 	unsigned long grpmask;	/* Mask to apply to parent qsmask. */
 				/*  Only one bit will be set in this mask. */
 	int	grplo;		/* lowest-numbered CPU or group here. */
 	int	grphi;		/* highest-numbered CPU or group here. */
 	u8	grpnum;		/* CPU/group number for next level up. */
 	u8	level;		/* root is at level 0. */
+	bool	wait_blkd_tasks;/* Necessary to wait for blocked tasks to */
+				/*  exit RCU read-side critical sections */
+				/*  before propagating offline up the */
+				/*  rcu_node tree? */
 	struct rcu_node *parent;
 	struct list_head blkd_tasks;
 				/* Tasks blocked in RCU read-side critical */
@@ -559,6 +567,7 @@ static void rcu_prepare_kthreads(int cpu);
 static void rcu_cleanup_after_idle(void);
 static void rcu_prepare_for_idle(void);
 static void rcu_idle_count_callbacks_posted(void);
+static bool rcu_preempt_has_tasks(struct rcu_node *rnp);
 static void print_cpu_stall_info_begin(void);
 static void print_cpu_stall_info(struct rcu_state *rsp, int cpu);
 static void print_cpu_stall_info_end(void);
diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h
index ec6c2efb28cd..d45e961515c1 100644
--- a/kernel/rcu/tree_plugin.h
+++ b/kernel/rcu/tree_plugin.h
@@ -180,7 +180,7 @@ static void rcu_preempt_note_context_switch(void)
 		 * But first, note that the current CPU must still be
 		 * on line!
 		 */
-		WARN_ON_ONCE((rdp->grpmask & rnp->qsmaskinit) == 0);
+		WARN_ON_ONCE((rdp->grpmask & rcu_rnp_online_cpus(rnp)) == 0);
 		WARN_ON_ONCE(!list_empty(&t->rcu_node_entry));
 		if ((rnp->qsmask & rdp->grpmask) && rnp->gp_tasks != NULL) {
 			list_add(&t->rcu_node_entry, rnp->gp_tasks->prev);
@@ -263,7 +263,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
  */
 void rcu_read_unlock_special(struct task_struct *t)
 {
-	bool empty;
 	bool empty_exp;
 	bool empty_norm;
 	bool empty_exp_now;
@@ -319,7 +318,6 @@ void rcu_read_unlock_special(struct task_struct *t)
 				break;
 			raw_spin_unlock(&rnp->lock); /* irqs remain disabled. */
 		}
-		empty = !rcu_preempt_has_tasks(rnp);
 		empty_norm = !rcu_preempt_blocked_readers_cgp(rnp);
 		empty_exp = !rcu_preempted_readers_exp(rnp);
 		smp_mb(); /* ensure expedited fastpath sees end of RCU c-s. */
@@ -339,14 +337,6 @@ void rcu_read_unlock_special(struct task_struct *t)
 		drop_boost_mutex = rt_mutex_owner(&rnp->boost_mtx) == t;
 #endif /* #ifdef CONFIG_RCU_BOOST */
 
-		/*
-		 * If this was the last task on the list, go see if we
-		 * need to propagate ->qsmaskinit bit clearing up the
-		 * rcu_node tree.
-		 */
-		if (!empty && !rcu_preempt_has_tasks(rnp))
-			rcu_cleanup_dead_rnp(rnp);
-
 		/*
 		 * If this was the last task on the current list, and if
 		 * we aren't waiting on any CPUs, report the quiescent state.
@@ -868,8 +858,6 @@ static int rcu_preempt_blocked_readers_cgp(struct rcu_node *rnp)
 	return 0;
 }
 
-#ifdef CONFIG_HOTPLUG_CPU
-
 /*
  * Because there is no preemptible RCU, there can be no readers blocked.
  */
@@ -878,8 +866,6 @@ static bool rcu_preempt_has_tasks(struct rcu_node *rnp)
 	return false;
 }
 
-#endif /* #ifdef CONFIG_HOTPLUG_CPU */
-
 /*
  * Because preemptible RCU does not exist, we never have to check for
  * tasks blocked within RCU read-side critical sections.
@@ -1179,7 +1165,7 @@ static void rcu_preempt_boost_start_gp(struct rcu_node *rnp)
  * Returns zero if all is well, a negated errno otherwise.
  */
 static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
-						 struct rcu_node *rnp)
+				       struct rcu_node *rnp)
 {
 	int rnp_index = rnp - &rsp->node[0];
 	unsigned long flags;
@@ -1189,7 +1175,7 @@ static int rcu_spawn_one_boost_kthread(struct rcu_state *rsp,
 	if (&rcu_preempt_state != rsp)
 		return 0;
 
-	if (!rcu_scheduler_fully_active || rnp->qsmaskinit == 0)
+	if (!rcu_scheduler_fully_active || rcu_rnp_online_cpus(rnp) == 0)
 		return 0;
 
 	rsp->boost = 1;
@@ -1282,7 +1268,7 @@ static void rcu_cpu_kthread(unsigned int cpu)
 static void rcu_boost_kthread_setaffinity(struct rcu_node *rnp, int outgoingcpu)
 {
 	struct task_struct *t = rnp->boost_kthread_task;
-	unsigned long mask = rnp->qsmaskinit;
+	unsigned long mask = rcu_rnp_online_cpus(rnp);
 	cpumask_var_t cm;
 	int cpu;
 
diff --git a/kernel/rcu/tree_trace.c b/kernel/rcu/tree_trace.c
index fbb6240509ea..f92361efd0f5 100644
--- a/kernel/rcu/tree_trace.c
+++ b/kernel/rcu/tree_trace.c
@@ -283,8 +283,8 @@ static void print_one_rcu_state(struct seq_file *m, struct rcu_state *rsp)
 			seq_puts(m, "\n");
 			level = rnp->level;
 		}
-		seq_printf(m, "%lx/%lx %c%c>%c %d:%d ^%d    ",
-			   rnp->qsmask, rnp->qsmaskinit,
+		seq_printf(m, "%lx/%lx->%lx %c%c>%c %d:%d ^%d    ",
+			   rnp->qsmask, rnp->qsmaskinit, rnp->qsmaskinitnext,
 			   ".G"[rnp->gp_tasks != NULL],
 			   ".E"[rnp->exp_tasks != NULL],
 			   ".T"[!list_empty(&rnp->blkd_tasks)],

From c199068913c9c5cbb5498e289bb387703e087ea8 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Fri, 23 Jan 2015 22:29:37 -0800
Subject: [PATCH 50/56] rcu: Eliminate ->onoff_mutex from rcu_node structure

Because that RCU grace-period initialization need no longer exclude
CPU-hotplug operations, this commit eliminates the ->onoff_mutex and
its uses.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 15 ---------------
 kernel/rcu/tree.h |  2 --
 2 files changed, 17 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index f0f4d3510d24..79d53399247e 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -101,7 +101,6 @@ struct rcu_state sname##_state = { \
 	.orphan_nxttail = &sname##_state.orphan_nxtlist, \
 	.orphan_donetail = &sname##_state.orphan_donelist, \
 	.barrier_mutex = __MUTEX_INITIALIZER(sname##_state.barrier_mutex), \
-	.onoff_mutex = __MUTEX_INITIALIZER(sname##_state.onoff_mutex), \
 	.name = RCU_STATE_NAME(sname), \
 	.abbr = sabbr, \
 }; \
@@ -1754,10 +1753,6 @@ static int rcu_gp_init(struct rcu_state *rsp)
 	trace_rcu_grace_period(rsp->name, rsp->gpnum, TPS("start"));
 	raw_spin_unlock_irq(&rnp->lock);
 
-	/* Exclude any concurrent CPU-hotplug operations. */
-	mutex_lock(&rsp->onoff_mutex);
-	smp_mb__after_unlock_lock(); /* ->gpnum increment before GP! */
-
 	/*
 	 * Apply per-leaf buffered online and offline operations to the
 	 * rcu_node tree.  Note that this new grace period need not wait
@@ -1844,7 +1839,6 @@ static int rcu_gp_init(struct rcu_state *rsp)
 			schedule_timeout_uninterruptible(gp_init_delay);
 	}
 
-	mutex_unlock(&rsp->onoff_mutex);
 	return 1;
 }
 
@@ -2498,9 +2492,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 	/* Adjust any no-longer-needed kthreads. */
 	rcu_boost_kthread_setaffinity(rnp, -1);
 
-	/* Exclude any attempts to start a new grace period. */
-	mutex_lock(&rsp->onoff_mutex);
-
 	/* Orphan the dead CPU's callbacks, and adopt them if appropriate. */
 	raw_spin_lock_irqsave(&rsp->orphan_lock, flags);
 	rcu_send_cbs_to_orphanage(cpu, rsp, rnp, rdp);
@@ -2517,7 +2508,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 	WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
 		  "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
 		  cpu, rdp->qlen, rdp->nxtlist);
-	mutex_unlock(&rsp->onoff_mutex);
 }
 
 #else /* #ifdef CONFIG_HOTPLUG_CPU */
@@ -3700,9 +3690,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp = rcu_get_root(rsp);
 
-	/* Exclude new grace periods. */
-	mutex_lock(&rsp->onoff_mutex);
-
 	/* Set up local state, ensuring consistent view of global state. */
 	raw_spin_lock_irqsave(&rnp->lock, flags);
 	rdp->beenonline = 1;	 /* We have now been online. */
@@ -3733,8 +3720,6 @@ rcu_init_percpu_data(int cpu, struct rcu_state *rsp)
 	rdp->qs_pending = false;
 	trace_rcu_grace_period(rsp->name, rdp->gpnum, TPS("cpuonl"));
 	raw_spin_unlock_irqrestore(&rnp->lock, flags);
-
-	mutex_unlock(&rsp->onoff_mutex);
 }
 
 static void rcu_prepare_cpu(int cpu)
diff --git a/kernel/rcu/tree.h b/kernel/rcu/tree.h
index aa42562ff5b2..a69d3dab2ec4 100644
--- a/kernel/rcu/tree.h
+++ b/kernel/rcu/tree.h
@@ -456,8 +456,6 @@ struct rcu_state {
 	long qlen;				/* Total number of callbacks. */
 	/* End of fields guarded by orphan_lock. */
 
-	struct mutex onoff_mutex;		/* Coordinate hotplug & GPs. */
-
 	struct mutex barrier_mutex;		/* Guards barrier fields. */
 	atomic_t barrier_cpu_count;		/* # CPUs waiting on. */
 	struct completion barrier_completion;	/* Wake at barrier end. */

From 528a25b00e1f84eaba6c98e63f58ee0a8e472102 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 28 Jan 2015 14:09:43 -0800
Subject: [PATCH 51/56] cpu: Make CPU-offline idle-loop transition point more
 precise

This commit uses a per-CPU variable to make the CPU-offline code path
through the idle loop more precise, so that the outgoing CPU is
guaranteed to make it into the idle loop before it is powered off.
This commit is in preparation for putting the RCU offline-handling
code on this code path, which will eliminate the magic one-jiffy
wait that RCU uses as the maximum time for an outgoing CPU to get
all the way through the scheduler.

The magic one-jiffy wait for incoming CPUs remains a separate issue.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/cpu.c        | 4 +++-
 kernel/sched/idle.c | 7 ++++++-
 2 files changed, 9 insertions(+), 2 deletions(-)

diff --git a/kernel/cpu.c b/kernel/cpu.c
index 1972b161c61e..d46b4dae0ca0 100644
--- a/kernel/cpu.c
+++ b/kernel/cpu.c
@@ -408,8 +408,10 @@ static int __ref _cpu_down(unsigned int cpu, int tasks_frozen)
 	 *
 	 * Wait for the stop thread to go away.
 	 */
-	while (!idle_cpu(cpu))
+	while (!per_cpu(cpu_dead_idle, cpu))
 		cpu_relax();
+	smp_mb(); /* Read from cpu_dead_idle before __cpu_die(). */
+	per_cpu(cpu_dead_idle, cpu) = false;
 
 	/* This actually kills the CPU. */
 	__cpu_die(cpu);
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index 94b2d7b88a27..e99e361ade20 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -198,6 +198,8 @@ exit_idle:
 	start_critical_timings();
 }
 
+DEFINE_PER_CPU(bool, cpu_dead_idle);
+
 /*
  * Generic idle loop implementation
  *
@@ -222,8 +224,11 @@ static void cpu_idle_loop(void)
 			check_pgt_cache();
 			rmb();
 
-			if (cpu_is_offline(smp_processor_id()))
+			if (cpu_is_offline(smp_processor_id())) {
+				smp_mb(); /* all activity before dead. */
+				this_cpu_write(cpu_dead_idle, true);
 				arch_cpu_idle_dead();
+			}
 
 			local_irq_disable();
 			arch_cpu_idle_enter();

From 88428cc5c27c63a4313e213813bc39b9899224d5 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Wed, 28 Jan 2015 14:42:09 -0800
Subject: [PATCH 52/56] rcu: Handle outgoing CPUs on exit from idle loop

This commit informs RCU of an outgoing CPU just before that CPU invokes
arch_cpu_idle_dead() during its last pass through the idle loop (via a
new CPU_DYING_IDLE notifier value).  This change means that RCU need not
deal with outgoing CPUs passing through the scheduler after informing
RCU that they are no longer online.  Note that removing the CPU from
the rcu_node ->qsmaskinit bit masks is done at CPU_DYING_IDLE time,
and orphaning callbacks is still done at CPU_DEAD time, the reason being
that at CPU_DEAD time we have another CPU that can adopt them.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 include/linux/cpu.h      |  2 ++
 include/linux/rcupdate.h |  2 ++
 kernel/rcu/tree.c        | 41 ++++++++++++++++++++++++++++++----------
 kernel/sched/idle.c      |  2 ++
 4 files changed, 37 insertions(+), 10 deletions(-)

diff --git a/include/linux/cpu.h b/include/linux/cpu.h
index 4744ef915acd..d028721748d4 100644
--- a/include/linux/cpu.h
+++ b/include/linux/cpu.h
@@ -95,6 +95,8 @@ enum {
 					* Called on the new cpu, just before
 					* enabling interrupts. Must not sleep,
 					* must not fail */
+#define CPU_DYING_IDLE		0x000B /* CPU (unsigned)v dying, reached
+					* idle loop. */
 #define CPU_BROKEN		0x000C /* CPU (unsigned)v did not die properly,
 					* perhaps due to preemption. */
 
diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h
index 78097491cd99..762022f07afd 100644
--- a/include/linux/rcupdate.h
+++ b/include/linux/rcupdate.h
@@ -266,6 +266,8 @@ void rcu_idle_enter(void);
 void rcu_idle_exit(void);
 void rcu_irq_enter(void);
 void rcu_irq_exit(void);
+int rcu_cpu_notify(struct notifier_block *self,
+		   unsigned long action, void *hcpu);
 
 #ifdef CONFIG_RCU_STALL_COMMON
 void rcu_sysrq_start(void);
diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 79d53399247e..d5247ed44004 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2475,6 +2475,26 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
 	}
 }
 
+/*
+ * The CPU is exiting the idle loop into the arch_cpu_idle_dead()
+ * function.  We now remove it from the rcu_node tree's ->qsmaskinit
+ * bit masks.
+ */
+static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
+{
+	unsigned long flags;
+	unsigned long mask;
+	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
+	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
+
+	/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
+	mask = rdp->grpmask;
+	raw_spin_lock_irqsave(&rnp->lock, flags);
+	smp_mb__after_unlock_lock();	/* Enforce GP memory-order guarantee. */
+	rnp->qsmaskinitnext &= ~mask;
+	raw_spin_unlock_irqrestore(&rnp->lock, flags);
+}
+
 /*
  * The CPU has been completely removed, and some other CPU is reporting
  * this fact from process context.  Do the remainder of the cleanup,
@@ -2485,7 +2505,6 @@ static void rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
 static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 {
 	unsigned long flags;
-	unsigned long mask;
 	struct rcu_data *rdp = per_cpu_ptr(rsp->rda, cpu);
 	struct rcu_node *rnp = rdp->mynode;  /* Outgoing CPU's rdp & rnp. */
 
@@ -2498,13 +2517,6 @@ static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 	rcu_adopt_orphan_cbs(rsp, flags);
 	raw_spin_unlock_irqrestore(&rsp->orphan_lock, flags);
 
-	/* Remove outgoing CPU from mask in the leaf rcu_node structure. */
-	mask = rdp->grpmask;
-	raw_spin_lock_irqsave(&rnp->lock, flags);
-	smp_mb__after_unlock_lock();	/* Enforce GP memory-order guarantee. */
-	rnp->qsmaskinitnext &= ~mask;
-	raw_spin_unlock_irqrestore(&rnp->lock, flags);
-
 	WARN_ONCE(rdp->qlen != 0 || rdp->nxtlist != NULL,
 		  "rcu_cleanup_dead_cpu: Callbacks on offline CPU %d: qlen=%lu, nxtlist=%p\n",
 		  cpu, rdp->qlen, rdp->nxtlist);
@@ -2520,6 +2532,10 @@ static void __maybe_unused rcu_cleanup_dead_rnp(struct rcu_node *rnp_leaf)
 {
 }
 
+static void rcu_cleanup_dying_idle_cpu(int cpu, struct rcu_state *rsp)
+{
+}
+
 static void rcu_cleanup_dead_cpu(int cpu, struct rcu_state *rsp)
 {
 }
@@ -3733,8 +3749,8 @@ static void rcu_prepare_cpu(int cpu)
 /*
  * Handle CPU online/offline notification events.
  */
-static int rcu_cpu_notify(struct notifier_block *self,
-				    unsigned long action, void *hcpu)
+int rcu_cpu_notify(struct notifier_block *self,
+		   unsigned long action, void *hcpu)
 {
 	long cpu = (long)hcpu;
 	struct rcu_data *rdp = per_cpu_ptr(rcu_state_p->rda, cpu);
@@ -3760,6 +3776,11 @@ static int rcu_cpu_notify(struct notifier_block *self,
 		for_each_rcu_flavor(rsp)
 			rcu_cleanup_dying_cpu(rsp);
 		break;
+	case CPU_DYING_IDLE:
+		for_each_rcu_flavor(rsp) {
+			rcu_cleanup_dying_idle_cpu(cpu, rsp);
+		}
+		break;
 	case CPU_DEAD:
 	case CPU_DEAD_FROZEN:
 	case CPU_UP_CANCELED:
diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c
index e99e361ade20..b0090accfb5b 100644
--- a/kernel/sched/idle.c
+++ b/kernel/sched/idle.c
@@ -225,6 +225,8 @@ static void cpu_idle_loop(void)
 			rmb();
 
 			if (cpu_is_offline(smp_processor_id())) {
+				rcu_cpu_notify(NULL, CPU_DYING_IDLE,
+					       (void *)(long)smp_processor_id());
 				smp_mb(); /* all activity before dead. */
 				this_cpu_write(cpu_dead_idle, true);
 				arch_cpu_idle_dead();

From 186bea5d35c821d49e70015d0a6eb73fe9f55d8c Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Thu, 29 Jan 2015 16:37:19 -0800
Subject: [PATCH 53/56] rcutorture: Default to grace-period-initialization
 delays

Given that CPU-hotplug events are now applied only at the starts of
grace periods, it makes sense to unconditionally enable slow grace-period
initialization for rcutorture testing.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 lib/Kconfig.debug | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/lib/Kconfig.debug b/lib/Kconfig.debug
index feee8dab441e..1173afc308ad 100644
--- a/lib/Kconfig.debug
+++ b/lib/Kconfig.debug
@@ -1276,7 +1276,7 @@ config RCU_TORTURE_TEST_SLOW_INIT
 config RCU_TORTURE_TEST_SLOW_INIT_DELAY
 	int "How much to slow down RCU grace-period initialization"
 	range 0 5
-	default 0
+	default 3
 	help
 	  This option specifies the number of jiffies to wait between
 	  each rcu_node structure initialization.

From 5c60d25fa1b22fdcf141f8006d31c32b08db7311 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Mon, 9 Feb 2015 05:37:47 -0800
Subject: [PATCH 54/56] rcu: Add diagnostics to grace-period cleanup

At grace-period initialization time, RCU checks that all quiescent
states were really reported for the previous grace period.  Now that
grace-period cleanup has been split out of grace-period initialization,
this commit also performs those checks at grace-period cleanup time.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
---
 kernel/rcu/tree.c | 2 ++
 1 file changed, 2 insertions(+)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index d5247ed44004..17b5abf999ca 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -1920,6 +1920,8 @@ static void rcu_gp_cleanup(struct rcu_state *rsp)
 	rcu_for_each_node_breadth_first(rsp, rnp) {
 		raw_spin_lock_irq(&rnp->lock);
 		smp_mb__after_unlock_lock();
+		WARN_ON_ONCE(rcu_preempt_blocked_readers_cgp(rnp));
+		WARN_ON_ONCE(rnp->qsmask);
 		ACCESS_ONCE(rnp->completed) = rsp->gpnum;
 		rdp = this_cpu_ptr(rsp->rda);
 		if (rnp == rdp->mynode)

From a77da14ce9afb338040b405f6ab8afddc310411d Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 8 Mar 2015 14:52:27 -0700
Subject: [PATCH 55/56] rcu: Yet another fix for preemption and CPU hotplug

As noted earlier, the following sequence of events can occur when
running PREEMPT_RCU and HOTPLUG_CPU on a system with a multi-level
rcu_node combining tree:

1.	A group of tasks block on CPUs corresponding to a given leaf
	rcu_node structure while within RCU read-side critical sections.
2.	All CPUs corrsponding to that rcu_node structure go offline.
3.	The next grace period starts, but because there are still tasks
	blocked, the upper-level bits corresponding to this leaf rcu_node
	structure remain set.
4.	All the tasks exit their RCU read-side critical sections and
	remove themselves from the leaf rcu_node structure's list,
	leaving it empty.
5.	But because there now is code to check for this condition at
	force-quiescent-state time, the upper bits are cleared and the
	grace period completes.

However, there is another complication that can occur following step 4 above:

4a.	The grace period starts, and the leaf rcu_node structure's
	gp_tasks pointer is set to NULL because there are no tasks
	blocked on this structure.
4b.	One of the CPUs corresponding to the leaf rcu_node structure
	comes back online.
4b.	An endless stream of tasks are preempted within RCU read-side
	critical sections on this CPU, such that the ->blkd_tasks
	list is always non-empty.

The grace period will never end.

This commit therefore makes the force-quiescent-state processing check only
for absence of tasks blocking the current grace period rather than absence
of tasks altogether.  This will cause a quiescent state to be reported if
the current leaf rcu_node structure is not blocking the current grace period
and its parent thinks that it is, regardless of how RCU managed to get
itself into this state.

Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: <stable@vger.kernel.org> # 4.0.x
Tested-by: Sasha Levin <sasha.levin@oracle.com>
---
 kernel/rcu/tree.c | 43 +++++++++++++++++++++++++++----------------
 1 file changed, 27 insertions(+), 16 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index 17b5abf999ca..b3684b284677 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2199,8 +2199,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
 	unsigned long mask;
 	struct rcu_node *rnp_p;
 
-	WARN_ON_ONCE(rsp == &rcu_bh_state || rsp == &rcu_sched_state);
-	if (rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
+	if (rcu_state_p == &rcu_sched_state || rsp != rcu_state_p ||
+	    rnp->qsmask != 0 || rcu_preempt_blocked_readers_cgp(rnp)) {
 		raw_spin_unlock_irqrestore(&rnp->lock, flags);
 		return;  /* Still need more quiescent states! */
 	}
@@ -2208,9 +2208,8 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
 	rnp_p = rnp->parent;
 	if (rnp_p == NULL) {
 		/*
-		 * Either there is only one rcu_node in the tree,
-		 * or tasks were kicked up to root rcu_node due to
-		 * CPUs going offline.
+		 * Only one rcu_node structure in the tree, so don't
+		 * try to report up to its nonexistent parent!
 		 */
 		rcu_report_qs_rsp(rsp, flags);
 		return;
@@ -2713,8 +2712,29 @@ static void force_qs_rnp(struct rcu_state *rsp,
 			return;
 		}
 		if (rnp->qsmask == 0) {
-			rcu_initiate_boost(rnp, flags); /* releases rnp->lock */
-			continue;
+			if (rcu_state_p == &rcu_sched_state ||
+			    rsp != rcu_state_p ||
+			    rcu_preempt_blocked_readers_cgp(rnp)) {
+				/*
+				 * No point in scanning bits because they
+				 * are all zero.  But we might need to
+				 * priority-boost blocked readers.
+				 */
+				rcu_initiate_boost(rnp, flags);
+				/* rcu_initiate_boost() releases rnp->lock */
+				continue;
+			}
+			if (rnp->parent &&
+			    (rnp->parent->qsmask & rnp->grpmask)) {
+				/*
+				 * Race between grace-period
+				 * initialization and task exiting RCU
+				 * read-side critical section: Report.
+				 */
+				rcu_report_unblock_qs_rnp(rsp, rnp, flags);
+				/* rcu_report_unblock_qs_rnp() rlses ->lock */
+				continue;
+			}
 		}
 		cpu = rnp->grplo;
 		bit = 1;
@@ -2729,15 +2749,6 @@ static void force_qs_rnp(struct rcu_state *rsp,
 		if (mask != 0) {
 			/* Idle/offline CPUs, report. */
 			rcu_report_qs_rnp(mask, rsp, rnp, flags);
-		} else if (rnp->parent &&
-			 list_empty(&rnp->blkd_tasks) &&
-			 !rnp->qsmask &&
-			 (rnp->parent->qsmask & rnp->grpmask)) {
-			/*
-			 * Race between grace-period initialization and task
-			 * existing RCU read-side critical section, report.
-			 */
-			rcu_report_unblock_qs_rnp(rsp, rnp, flags);
 		} else {
 			/* Nothing to do here, so just drop the lock. */
 			raw_spin_unlock_irqrestore(&rnp->lock, flags);

From 654e953340491e498871321d7e2c9b0a12821933 Mon Sep 17 00:00:00 2001
From: "Paul E. McKenney" <paulmck@linux.vnet.ibm.com>
Date: Sun, 15 Mar 2015 09:19:35 -0700
Subject: [PATCH 56/56] rcu: Associate quiescent-state reports with grace
 period

As noted in earlier commit logs, CPU hotplug operations running
concurrently with grace-period initialization can result in a given
leaf rcu_node structure having all CPUs offline and no blocked readers,
but with this rcu_node structure nevertheless blocking the current
grace period.  Therefore, the quiescent-state forcing code now checks
for this situation and repairs it.

Unfortunately, this checking can result in false positives, for example,
when the last task has just removed itself from this leaf rcu_node
structure, but has not yet started clearing the ->qsmask bits further
up the structure.  This means that the grace-period kthread (which
forces quiescent states) and some other task might be attempting to
concurrently clear these ->qsmask bits.  This is usually not a problem:
One of these tasks will be the first to acquire the upper-level rcu_node
structure's lock and with therefore clear the bit, and the other task,
seeing the bit already cleared, will stop trying to clear bits.

Sadly, this means that the following unusual sequence of events -can-
result in a problem:

1.	The grace-period kthread wins, and clears the ->qsmask bits.

2.	This is the last thing blocking the current grace period, so
	that the grace-period kthread clears ->qsmask bits all the way
	to the root and finds that the root ->qsmask field is now zero.

3.	Another grace period is required, so that the grace period kthread
	initializes it, including setting all the needed qsmask bits.

4.	The leaf rcu_node structure (the one that started this whole
	mess) is blocking this new grace period, either because it
	has at least one online CPU or because there is at least one
	task that had blocked within an RCU read-side critical section
	while running on one of this leaf rcu_node structure's CPUs.
	(And yes, that CPU might well have gone offline before the
	grace period in step (3) above started, which can mean that
	there is a task on the leaf rcu_node structure's ->blkd_tasks
	list, but ->qsmask equal to zero.)

5.	The other kthread didn't get around to trying to clear the upper
	level ->qsmask bits until all the above had happened.  This means
	that it now sees bits set in the upper-level ->qsmask field, so it
	proceeds to clear them.  Too bad that it is doing so on behalf of
	a quiescent state that does not apply to the current grace period!

This sequence of events can result in the new grace period being too
short.  It can also result in the new grace period ending before the
leaf rcu_node structure's ->qsmask bits have been cleared, which will
result in splats during initialization of the next grace period.  In
addition, it can result in tasks blocking the new grace period still
being queued at the start of the next grace period, which will result
in other splats.  Sasha's testing turned up another of these splats,
as did rcutorture testing.  (And yes, rcutorture is being adjusted to
make these splats show up more quickly.  Which probably is having the
undesirable side effect of making other problems show up less quickly.
Can't have everything!)

Reported-by: Sasha Levin <sasha.levin@oracle.com>
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: <stable@vger.kernel.org> # 4.0.x
Tested-by: Sasha Levin <sasha.levin@oracle.com>
---
 kernel/rcu/tree.c | 34 ++++++++++++++++++++++------------
 1 file changed, 22 insertions(+), 12 deletions(-)

diff --git a/kernel/rcu/tree.c b/kernel/rcu/tree.c
index b3684b284677..8fcc64ed858c 100644
--- a/kernel/rcu/tree.c
+++ b/kernel/rcu/tree.c
@@ -2132,25 +2132,32 @@ static void rcu_report_qs_rsp(struct rcu_state *rsp, unsigned long flags)
  * Similar to rcu_report_qs_rdp(), for which it is a helper function.
  * Allows quiescent states for a group of CPUs to be reported at one go
  * to the specified rcu_node structure, though all the CPUs in the group
- * must be represented by the same rcu_node structure (which need not be
- * a leaf rcu_node structure, though it often will be).  That structure's
- * lock must be held upon entry, and it is released before return.
+ * must be represented by the same rcu_node structure (which need not be a
+ * leaf rcu_node structure, though it often will be).  The gps parameter
+ * is the grace-period snapshot, which means that the quiescent states
+ * are valid only if rnp->gpnum is equal to gps.  That structure's lock
+ * must be held upon entry, and it is released before return.
  */
 static void
 rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
-		  struct rcu_node *rnp, unsigned long flags)
+		  struct rcu_node *rnp, unsigned long gps, unsigned long flags)
 	__releases(rnp->lock)
 {
+	unsigned long oldmask = 0;
 	struct rcu_node *rnp_c;
 
 	/* Walk up the rcu_node hierarchy. */
 	for (;;) {
-		if (!(rnp->qsmask & mask)) {
+		if (!(rnp->qsmask & mask) || rnp->gpnum != gps) {
 
-			/* Our bit has already been cleared, so done. */
+			/*
+			 * Our bit has already been cleared, or the
+			 * relevant grace period is already over, so done.
+			 */
 			raw_spin_unlock_irqrestore(&rnp->lock, flags);
 			return;
 		}
+		WARN_ON_ONCE(oldmask); /* Any child must be all zeroed! */
 		rnp->qsmask &= ~mask;
 		trace_rcu_quiescent_state_report(rsp->name, rnp->gpnum,
 						 mask, rnp->qsmask, rnp->level,
@@ -2174,7 +2181,7 @@ rcu_report_qs_rnp(unsigned long mask, struct rcu_state *rsp,
 		rnp = rnp->parent;
 		raw_spin_lock_irqsave(&rnp->lock, flags);
 		smp_mb__after_unlock_lock();
-		WARN_ON_ONCE(rnp_c->qsmask);
+		oldmask = rnp_c->qsmask;
 	}
 
 	/*
@@ -2196,6 +2203,7 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
 				      struct rcu_node *rnp, unsigned long flags)
 	__releases(rnp->lock)
 {
+	unsigned long gps;
 	unsigned long mask;
 	struct rcu_node *rnp_p;
 
@@ -2215,12 +2223,13 @@ static void rcu_report_unblock_qs_rnp(struct rcu_state *rsp,
 		return;
 	}
 
-	/* Report up the rest of the hierarchy. */
+	/* Report up the rest of the hierarchy, tracking current ->gpnum. */
+	gps = rnp->gpnum;
 	mask = rnp->grpmask;
 	raw_spin_unlock(&rnp->lock);	/* irqs remain disabled. */
 	raw_spin_lock(&rnp_p->lock);	/* irqs already disabled. */
 	smp_mb__after_unlock_lock();
-	rcu_report_qs_rnp(mask, rsp, rnp_p, flags);
+	rcu_report_qs_rnp(mask, rsp, rnp_p, gps, flags);
 }
 
 /*
@@ -2271,7 +2280,8 @@ rcu_report_qs_rdp(int cpu, struct rcu_state *rsp, struct rcu_data *rdp)
 		 */
 		needwake = rcu_accelerate_cbs(rsp, rnp, rdp);
 
-		rcu_report_qs_rnp(mask, rsp, rnp, flags); /* rlses rnp->lock */
+		rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags);
+		/* ^^^ Released rnp->lock */
 		if (needwake)
 			rcu_gp_kthread_wake(rsp);
 	}
@@ -2747,8 +2757,8 @@ static void force_qs_rnp(struct rcu_state *rsp,
 			}
 		}
 		if (mask != 0) {
-			/* Idle/offline CPUs, report. */
-			rcu_report_qs_rnp(mask, rsp, rnp, flags);
+			/* Idle/offline CPUs, report (releases rnp->lock. */
+			rcu_report_qs_rnp(mask, rsp, rnp, rnp->gpnum, flags);
 		} else {
 			/* Nothing to do here, so just drop the lock. */
 			raw_spin_unlock_irqrestore(&rnp->lock, flags);