From 3be209a8e22cedafc1b6945608b7bb8d9887ab61 Mon Sep 17 00:00:00 2001
From: Shawn Bohrer <sbohrer@rgmadvisors.com>
Date: Mon, 12 Sep 2011 09:28:04 -0500
Subject: [PATCH 1/6] sched/rt: Migrate equal priority tasks to available CPUs

Commit 43fa5460fe60dea5c610490a1d263415419c60f6 ("sched: Try not to
migrate higher priority RT tasks") also introduced a change in behavior
which keeps RT tasks on the same CPU if there is an equal priority RT
task currently running even if there are empty CPUs available.

This can cause unnecessary wakeup latencies, and can prevent the
scheduler from balancing all RT tasks across available CPUs.

This change causes an RT task to search for a new CPU if an equal
priority RT task is already running on wakeup.  Lower priority tasks
will still have to wait on higher priority tasks, but the system should
still balance out because there is always the possibility that if there
are both a high and low priority RT tasks on a given CPU that the high
priority task could wakeup while the low priority task is running and
force it to search for a better runqueue.

Signed-off-by: Shawn Bohrer <sbohrer@rgmadvisors.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Tested-by: Steven Rostedt <rostedt@goodmis.org>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: stable@kernel.org # 37+
Link: http://lkml.kernel.org/r/1315837684-18733-1-git-send-email-sbohrer@rgmadvisors.com
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched_rt.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/sched_rt.c b/kernel/sched_rt.c
index 97540f0c9e47..af1177858be3 100644
--- a/kernel/sched_rt.c
+++ b/kernel/sched_rt.c
@@ -1050,7 +1050,7 @@ select_task_rq_rt(struct task_struct *p, int sd_flag, int flags)
 	 */
 	if (curr && unlikely(rt_task(curr)) &&
 	    (curr->rt.nr_cpus_allowed < 2 ||
-	     curr->prio < p->prio) &&
+	     curr->prio <= p->prio) &&
 	    (p->rt.nr_cpus_allowed > 1)) {
 		int target = find_lowest_rq(p);
 
@@ -1581,7 +1581,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p)
 	    p->rt.nr_cpus_allowed > 1 &&
 	    rt_task(rq->curr) &&
 	    (rq->curr->rt.nr_cpus_allowed < 2 ||
-	     rq->curr->prio < p->prio))
+	     rq->curr->prio <= p->prio))
 		push_rt_tasks(rq);
 }
 

From 5bd078dda4d4fbdb4bd138a6bd5b6e274c019ed2 Mon Sep 17 00:00:00 2001
From: Rob Herring <robherring2@gmail.com>
Date: Wed, 14 Sep 2011 11:31:36 -0500
Subject: [PATCH 2/6] irq: Add declaration of irq_domain_simple_ops to
 irqdomain.h

irq_domain_simple_ops is exported, but is not declared in irqdomain.h,
so add it.

Signed-off-by: Rob Herring <rob.herring@calxeda.com>
Cc: Grant Likely <grant.likely@secretlab.ca>
Cc: marc.zyngier@arm.com
Cc: thomas.abraham@linaro.org
Cc: jamie@jamieiles.com
Cc: b-cousson@ti.com
Cc: shawn.guo@linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: devicetree-discuss@lists.ozlabs.org
Link: http://lkml.kernel.org/r/1316017900-19918-2-git-send-email-robherring2@gmail.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/irqdomain.h | 1 +
 1 file changed, 1 insertion(+)

diff --git a/include/linux/irqdomain.h b/include/linux/irqdomain.h
index e807ad687a07..3ad553e8eae2 100644
--- a/include/linux/irqdomain.h
+++ b/include/linux/irqdomain.h
@@ -80,6 +80,7 @@ extern void irq_domain_del(struct irq_domain *domain);
 #endif /* CONFIG_IRQ_DOMAIN */
 
 #if defined(CONFIG_IRQ_DOMAIN) && defined(CONFIG_OF_IRQ)
+extern struct irq_domain_ops irq_domain_simple_ops;
 extern void irq_domain_add_simple(struct device_node *controller, int irq_base);
 extern void irq_domain_generate_simple(const struct of_device_id *match,
 					u64 phys_base, unsigned int irq_start);

From eef24afb28561a5a9f4be8f8da97735b7e6a826f Mon Sep 17 00:00:00 2001
From: Rob Herring <robherring2@gmail.com>
Date: Wed, 14 Sep 2011 11:31:37 -0500
Subject: [PATCH 3/6] irq: Fix check for already initialized irq_domain in
 irq_domain_add

The sanity check in irq_domain_add() tests desc->irq_data != NULL or
irq_data->domain != NULL. This prevents adding an irq_domain to a irq
descriptor when irq_data exists, which true when the irq descriptor
exists.

This went unnoticed so far as the simple domain code did not enter
this code path because domain->nr_irqs is always 0 for the simple domains.

Split the check for irq_data == NULL out and have a separate warning
for it.

[ tglx: Made the check for irq_data == NULL separate ]

Signed-off-by: Rob Herring <rob.herring@calxeda.com>
Cc: Grant Likely <grant.likely@secretlab.ca>
Cc: marc.zyngier@arm.com
Cc: thomas.abraham@linaro.org
Cc: jamie@jamieiles.com
Cc: b-cousson@ti.com
Cc: shawn.guo@linaro.org
Cc: linux-arm-kernel@lists.infradead.org
Cc: devicetree-discuss@lists.ozlabs.org
Link: http://lkml.kernel.org/r/1316017900-19918-3-git-send-email-robherring2@gmail.com
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 kernel/irq/irqdomain.c | 6 +++++-
 1 file changed, 5 insertions(+), 1 deletion(-)

diff --git a/kernel/irq/irqdomain.c b/kernel/irq/irqdomain.c
index d5828da3fd38..b57a3776de44 100644
--- a/kernel/irq/irqdomain.c
+++ b/kernel/irq/irqdomain.c
@@ -29,7 +29,11 @@ void irq_domain_add(struct irq_domain *domain)
 	 */
 	for (hwirq = 0; hwirq < domain->nr_irq; hwirq++) {
 		d = irq_get_irq_data(irq_domain_to_irq(domain, hwirq));
-		if (d || d->domain) {
+		if (!d) {
+			WARN(1, "error: assigning domain to non existant irq_desc");
+			return;
+		}
+		if (d->domain) {
 			/* things are broken; just report, don't clean up */
 			WARN(1, "error: irq_desc already assigned to a domain");
 			return;

From 47997d756aa2a84ab577e1b0383cc12d582fc69c Mon Sep 17 00:00:00 2001
From: Matt Fleming <matt.fleming@intel.com>
Date: Wed, 21 Sep 2011 16:08:03 +0200
Subject: [PATCH 4/6] x86/rtc: Don't recursively acquire rtc_lock

A deadlock was introduced on x86 in commit ef68c8f87ed1 ("x86:
Serialize EFI time accesses on rtc_lock") because efi_get_time()
and friends can be called with rtc_lock already held by
read_persistent_time(), e.g.:

 timekeeping_init()
    read_persistent_clock()     <-- acquire rtc_lock
        efi_get_time()
            phys_efi_get_time() <-- acquire rtc_lock <DEADLOCK>

To fix this let's push the locking down into the get_wallclock()
and set_wallclock() implementations.  Only the clock
implementations that access the x86 RTC directly need to acquire
rtc_lock, so it makes sense to push the locking down into the
rtc, vrtc and efi code.

The virtualization implementations don't require rtc_lock to be
held because they provide their own serialization.

Signed-off-by: Matt Fleming <matt.fleming@intel.com>
Acked-by: Jan Beulich <jbeulich@novell.com>
Acked-by: Avi Kivity <avi@redhat.com> [for the virtualization aspect]
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Zhang Rui <rui.zhang@intel.com>
Cc: Josh Boyer <jwboyer@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 arch/x86/kernel/rtc.c         | 23 ++++++++++++-----------
 arch/x86/platform/mrst/vrtc.c |  9 +++++++++
 2 files changed, 21 insertions(+), 11 deletions(-)

diff --git a/arch/x86/kernel/rtc.c b/arch/x86/kernel/rtc.c
index 3f2ad2640d85..ccdbc16b8941 100644
--- a/arch/x86/kernel/rtc.c
+++ b/arch/x86/kernel/rtc.c
@@ -42,8 +42,11 @@ int mach_set_rtc_mmss(unsigned long nowtime)
 {
 	int real_seconds, real_minutes, cmos_minutes;
 	unsigned char save_control, save_freq_select;
+	unsigned long flags;
 	int retval = 0;
 
+	spin_lock_irqsave(&rtc_lock, flags);
+
 	 /* tell the clock it's being set */
 	save_control = CMOS_READ(RTC_CONTROL);
 	CMOS_WRITE((save_control|RTC_SET), RTC_CONTROL);
@@ -93,12 +96,17 @@ int mach_set_rtc_mmss(unsigned long nowtime)
 	CMOS_WRITE(save_control, RTC_CONTROL);
 	CMOS_WRITE(save_freq_select, RTC_FREQ_SELECT);
 
+	spin_unlock_irqrestore(&rtc_lock, flags);
+
 	return retval;
 }
 
 unsigned long mach_get_cmos_time(void)
 {
 	unsigned int status, year, mon, day, hour, min, sec, century = 0;
+	unsigned long flags;
+
+	spin_lock_irqsave(&rtc_lock, flags);
 
 	/*
 	 * If UIP is clear, then we have >= 244 microseconds before
@@ -125,6 +133,8 @@ unsigned long mach_get_cmos_time(void)
 	status = CMOS_READ(RTC_CONTROL);
 	WARN_ON_ONCE(RTC_ALWAYS_BCD && (status & RTC_DM_BINARY));
 
+	spin_unlock_irqrestore(&rtc_lock, flags);
+
 	if (RTC_ALWAYS_BCD || !(status & RTC_DM_BINARY)) {
 		sec = bcd2bin(sec);
 		min = bcd2bin(min);
@@ -169,24 +179,15 @@ EXPORT_SYMBOL(rtc_cmos_write);
 
 int update_persistent_clock(struct timespec now)
 {
-	unsigned long flags;
-	int retval;
-
-	spin_lock_irqsave(&rtc_lock, flags);
-	retval = x86_platform.set_wallclock(now.tv_sec);
-	spin_unlock_irqrestore(&rtc_lock, flags);
-
-	return retval;
+	return x86_platform.set_wallclock(now.tv_sec);
 }
 
 /* not static: needed by APM */
 void read_persistent_clock(struct timespec *ts)
 {
-	unsigned long retval, flags;
+	unsigned long retval;
 
-	spin_lock_irqsave(&rtc_lock, flags);
 	retval = x86_platform.get_wallclock();
-	spin_unlock_irqrestore(&rtc_lock, flags);
 
 	ts->tv_sec = retval;
 	ts->tv_nsec = 0;
diff --git a/arch/x86/platform/mrst/vrtc.c b/arch/x86/platform/mrst/vrtc.c
index 73d70d65e76e..6d5dbcdd444a 100644
--- a/arch/x86/platform/mrst/vrtc.c
+++ b/arch/x86/platform/mrst/vrtc.c
@@ -58,8 +58,11 @@ EXPORT_SYMBOL_GPL(vrtc_cmos_write);
 unsigned long vrtc_get_time(void)
 {
 	u8 sec, min, hour, mday, mon;
+	unsigned long flags;
 	u32 year;
 
+	spin_lock_irqsave(&rtc_lock, flags);
+
 	while ((vrtc_cmos_read(RTC_FREQ_SELECT) & RTC_UIP))
 		cpu_relax();
 
@@ -70,6 +73,8 @@ unsigned long vrtc_get_time(void)
 	mon = vrtc_cmos_read(RTC_MONTH);
 	year = vrtc_cmos_read(RTC_YEAR);
 
+	spin_unlock_irqrestore(&rtc_lock, flags);
+
 	/* vRTC YEAR reg contains the offset to 1960 */
 	year += 1960;
 
@@ -83,8 +88,10 @@ unsigned long vrtc_get_time(void)
 int vrtc_set_mmss(unsigned long nowtime)
 {
 	int real_sec, real_min;
+	unsigned long flags;
 	int vrtc_min;
 
+	spin_lock_irqsave(&rtc_lock, flags);
 	vrtc_min = vrtc_cmos_read(RTC_MINUTES);
 
 	real_sec = nowtime % 60;
@@ -95,6 +102,8 @@ int vrtc_set_mmss(unsigned long nowtime)
 
 	vrtc_cmos_write(real_sec, RTC_SECONDS);
 	vrtc_cmos_write(real_min, RTC_MINUTES);
+	spin_unlock_irqrestore(&rtc_lock, flags);
+
 	return 0;
 }
 

From 6ebbe7a07b3bc40b168d2afc569a6543c020d2e3 Mon Sep 17 00:00:00 2001
From: Simon Kirby <sim@hostway.ca>
Date: Thu, 22 Sep 2011 17:03:46 -0700
Subject: [PATCH 5/6] sched: Fix up wchan borkage

Commit c259e01a1ec ("sched: Separate the scheduler entry for
preemption") contained a boo-boo wrecking wchan output. It forgot to
put the new schedule() function in the __sched section and thereby
doesn't get properly ignored for things like wchan.

Tested-by: Simon Kirby <sim@hostway.ca>
Cc: stable@kernel.org # 2.6.39+
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Link: http://lkml.kernel.org/r/20110923000346.GA25425@hostway.ca
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 kernel/sched.c | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/kernel/sched.c b/kernel/sched.c
index ec5f472bc5b9..d249ea88428c 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -4372,7 +4372,7 @@ static inline void sched_submit_work(struct task_struct *tsk)
 		blk_schedule_flush_plug(tsk);
 }
 
-asmlinkage void schedule(void)
+asmlinkage void __sched schedule(void)
 {
 	struct task_struct *tsk = current;
 

From d670ec13178d0fd8680e6742a2bc6e04f28f87d8 Mon Sep 17 00:00:00 2001
From: Peter Zijlstra <a.p.zijlstra@chello.nl>
Date: Thu, 1 Sep 2011 12:42:04 +0200
Subject: [PATCH 6/6] posix-cpu-timers: Cure SMP wobbles

David reported:

  Attached below is a watered-down version of rt/tst-cpuclock2.c from
  GLIBC.  Just build it with "gcc -o test test.c -lpthread -lrt" or
  similar.

  Run it several times, and you will see cases where the main thread
  will measure a process clock difference before and after the nanosleep
  which is smaller than the cpu-burner thread's individual thread clock
  difference.  This doesn't make any sense since the cpu-burner thread
  is part of the top-level process's thread group.

  I've reproduced this on both x86-64 and sparc64 (using both 32-bit and
  64-bit binaries).

  For example:

  [davem@boricha build-x86_64-linux]$ ./test
  process: before(0.001221967) after(0.498624371) diff(497402404)
  thread:  before(0.000081692) after(0.498316431) diff(498234739)
  self:    before(0.001223521) after(0.001240219) diff(16698)
  [davem@boricha build-x86_64-linux]$

  The diff of 'process' should always be >= the diff of 'thread'.

  I make sure to wrap the 'thread' clock measurements the most tightly
  around the nanosleep() call, and that the 'process' clock measurements
  are the outer-most ones.

  ---
  #include <unistd.h>
  #include <stdio.h>
  #include <stdlib.h>
  #include <time.h>
  #include <fcntl.h>
  #include <string.h>
  #include <errno.h>
  #include <pthread.h>

  static pthread_barrier_t barrier;

  static void *chew_cpu(void *arg)
  {
	  pthread_barrier_wait(&barrier);
	  while (1)
		  __asm__ __volatile__("" : : : "memory");
	  return NULL;
  }

  int main(void)
  {
	  clockid_t process_clock, my_thread_clock, th_clock;
	  struct timespec process_before, process_after;
	  struct timespec me_before, me_after;
	  struct timespec th_before, th_after;
	  struct timespec sleeptime;
	  unsigned long diff;
	  pthread_t th;
	  int err;

	  err = clock_getcpuclockid(0, &process_clock);
	  if (err)
		  return 1;

	  err = pthread_getcpuclockid(pthread_self(), &my_thread_clock);
	  if (err)
		  return 1;

	  pthread_barrier_init(&barrier, NULL, 2);
	  err = pthread_create(&th, NULL, chew_cpu, NULL);
	  if (err)
		  return 1;

	  err = pthread_getcpuclockid(th, &th_clock);
	  if (err)
		  return 1;

	  pthread_barrier_wait(&barrier);

	  err = clock_gettime(process_clock, &process_before);
	  if (err)
		  return 1;

	  err = clock_gettime(my_thread_clock, &me_before);
	  if (err)
		  return 1;

	  err = clock_gettime(th_clock, &th_before);
	  if (err)
		  return 1;

	  sleeptime.tv_sec = 0;
	  sleeptime.tv_nsec = 500000000;
	  nanosleep(&sleeptime, NULL);

	  err = clock_gettime(th_clock, &th_after);
	  if (err)
		  return 1;

	  err = clock_gettime(my_thread_clock, &me_after);
	  if (err)
		  return 1;

	  err = clock_gettime(process_clock, &process_after);
	  if (err)
		  return 1;

	  diff = process_after.tv_nsec - process_before.tv_nsec;
	  printf("process: before(%lu.%.9lu) after(%lu.%.9lu) diff(%lu)\n",
		 process_before.tv_sec, process_before.tv_nsec,
		 process_after.tv_sec, process_after.tv_nsec, diff);
	  diff = th_after.tv_nsec - th_before.tv_nsec;
	  printf("thread:  before(%lu.%.9lu) after(%lu.%.9lu) diff(%lu)\n",
		 th_before.tv_sec, th_before.tv_nsec,
		 th_after.tv_sec, th_after.tv_nsec, diff);
	  diff = me_after.tv_nsec - me_before.tv_nsec;
	  printf("self:    before(%lu.%.9lu) after(%lu.%.9lu) diff(%lu)\n",
		 me_before.tv_sec, me_before.tv_nsec,
		 me_after.tv_sec, me_after.tv_nsec, diff);

	  return 0;
  }

This is due to us using p->se.sum_exec_runtime in
thread_group_cputime() where we iterate the thread group and sum all
data. This does not take time since the last schedule operation (tick
or otherwise) into account. We can cure this by using
task_sched_runtime() at the cost of having to take locks.

This also means we can (and must) do away with
thread_group_sched_runtime() since the modified thread_group_cputime()
is now more accurate and would deadlock when called from
thread_group_sched_runtime().

Aside of that it makes the function safe on 32 bit systems. The old
code added t->se.sum_exec_runtime unprotected. sum_exec_runtime is a
64bit value and could be changed on another cpu at the same time.

Reported-by: David Miller <davem@davemloft.net>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: stable@kernel.org
Link: http://lkml.kernel.org/r/1314874459.7945.22.camel@twins
Tested-by: David Miller <davem@davemloft.net>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
---
 include/linux/sched.h     |  1 -
 kernel/posix-cpu-timers.c |  5 +++--
 kernel/sched.c            | 24 ------------------------
 3 files changed, 3 insertions(+), 27 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 4ac2c0578e0f..41d0237fd449 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -1956,7 +1956,6 @@ static inline void disable_sched_clock_irqtime(void) {}
 
 extern unsigned long long
 task_sched_runtime(struct task_struct *task);
-extern unsigned long long thread_group_sched_runtime(struct task_struct *task);
 
 /* sched_exec is called by processes performing an exec */
 #ifdef CONFIG_SMP
diff --git a/kernel/posix-cpu-timers.c b/kernel/posix-cpu-timers.c
index 58f405b581e7..c8008dd58ef2 100644
--- a/kernel/posix-cpu-timers.c
+++ b/kernel/posix-cpu-timers.c
@@ -250,7 +250,7 @@ void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times)
 	do {
 		times->utime = cputime_add(times->utime, t->utime);
 		times->stime = cputime_add(times->stime, t->stime);
-		times->sum_exec_runtime += t->se.sum_exec_runtime;
+		times->sum_exec_runtime += task_sched_runtime(t);
 	} while_each_thread(tsk, t);
 out:
 	rcu_read_unlock();
@@ -312,7 +312,8 @@ static int cpu_clock_sample_group(const clockid_t which_clock,
 		cpu->cpu = cputime.utime;
 		break;
 	case CPUCLOCK_SCHED:
-		cpu->sched = thread_group_sched_runtime(p);
+		thread_group_cputime(p, &cputime);
+		cpu->sched = cputime.sum_exec_runtime;
 		break;
 	}
 	return 0;
diff --git a/kernel/sched.c b/kernel/sched.c
index d249ea88428c..b50b0f0c9aa9 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -3724,30 +3724,6 @@ unsigned long long task_sched_runtime(struct task_struct *p)
 	return ns;
 }
 
-/*
- * Return sum_exec_runtime for the thread group.
- * In case the task is currently running, return the sum plus current's
- * pending runtime that have not been accounted yet.
- *
- * Note that the thread group might have other running tasks as well,
- * so the return value not includes other pending runtime that other
- * running tasks might have.
- */
-unsigned long long thread_group_sched_runtime(struct task_struct *p)
-{
-	struct task_cputime totals;
-	unsigned long flags;
-	struct rq *rq;
-	u64 ns;
-
-	rq = task_rq_lock(p, &flags);
-	thread_group_cputime(p, &totals);
-	ns = totals.sum_exec_runtime + do_task_delta_exec(p, rq);
-	task_rq_unlock(rq, p, &flags);
-
-	return ns;
-}
-
 /*
  * Account user cpu time to a process.
  * @p: the process that the cpu time gets accounted to