Merge branch 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip

Pull scheduler fixes from Ingo Molnar: "This contains misc fixes: preempt_schedule_common() and io_schedule() recursion fixes, sched/dl fixes, a completion_done() revert, two sched/rt fixes and a comment update patch" * 'sched-urgent-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: sched/rt: Avoid obvious configuration fail sched/autogroup: Fix failure to set cpu.rt_runtime_us sched/dl: Do update_rq_clock() in yield_task_dl() sched: Prevent recursion in io_schedule() sched/completion: Serialize completion_done() with complete() sched: Fix preempt_schedule_common() triggering tracing recursion sched/dl: Prevent enqueue of a sleeping task in dl_task_timer() sched: Make dl_task_time() use task_rq_lock() sched: Clarify ordering between task_rq_lock() and move_queued_task()
2015-02-21 10:40:02 -08:00 · 2015-02-21 10:40:02 -08:00 · e2defd0271
parent b5aeca54d0 2636ed5f8d
commit e2defd0271
6 changed files with 156 additions and 103 deletions
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@ -363,9 +363,6 @@ extern void show_regs(struct pt_regs *);
 */
 extern void show_stack(struct task_struct *task, unsigned long *sp);
 void io_schedule(void);
 long io_schedule_timeout(long timeout);
 extern void cpu_init (void);
 extern void trap_init(void);
 extern void update_process_times(int user);
@ -422,6 +419,13 @@ extern signed long schedule_timeout_uninterruptible(signed long timeout);
 asmlinkage void schedule(void);
 extern void schedule_preempt_disabled(void);
 extern long io_schedule_timeout(long timeout);
 /*
 * io_schedule - call io_schedule_timeout() with MAX_SCHEDULE_TIMEOUT as
 * the timeout.  The long value io_schedule_timeout() returns is discarded.
 */
 static inline void io_schedule(void)
 {
 	io_schedule_timeout(MAX_SCHEDULE_TIMEOUT);
 }
 struct nsproxy;
 struct user_namespace;
--- a/kernel/sched/auto_group.c
+++ b/kernel/sched/auto_group.c
@ -87,8 +87,7 @@ static inline struct autogroup *autogroup_create(void)
 	 * so we don't have to move tasks around upon policy change,
 	 * or flail around trying to allocate bandwidth on the fly.
 	 * A bandwidth exception in __sched_setscheduler() allows
-	 * the policy change to proceed.  Thereafter, task_group()
+	 * the policy change to proceed.
 	 * returns &root_task_group, so zero bandwidth is required.
 	 */
 	free_rt_sched_group(tg);
 	tg->rt_se = root_task_group.rt_se;
@ -115,9 +114,6 @@ bool task_wants_autogroup(struct task_struct *p, struct task_group *tg)
 	if (tg != &root_task_group)
 		return false;
 	if (p->sched_class != &fair_sched_class)
 		return false;
 	/*
 	 * We can only assume the task group can't go away on us if
 	 * autogroup_move_group() can see us on ->thread_group list.
--- a/kernel/sched/completion.c
+++ b/kernel/sched/completion.c
@ -274,7 +274,7 @@ bool try_wait_for_completion(struct completion *x)
 	 * first without taking the lock so we can
 	 * return early in the blocking case.
 	 */
-	if (!ACCESS_ONCE(x->done))
+	if (!READ_ONCE(x->done))
 		return 0;
 	spin_lock_irqsave(&x->wait.lock, flags);
@ -297,6 +297,21 @@ EXPORT_SYMBOL(try_wait_for_completion);
 */
 bool completion_done(struct completion *x)
 {
 	/* Lockless read of ->done; a zero value is reported as false. */
-	return !!ACCESS_ONCE(x->done);
+	if (!READ_ONCE(x->done))
 		return false;
 	/*
 	 * If ->done, we need to wait for complete() to release ->wait.lock
 	 * otherwise we can end up freeing the completion before complete()
 	 * is done referencing it.
 	 *
 	 * The RMB pairs with complete()'s RELEASE of ->wait.lock and orders
 	 * the loads of ->done and ->wait.lock such that we cannot observe
 	 * the lock before complete() acquires it while observing the ->done
 	 * after it's acquired the lock.
 	 */
 	smp_rmb();
 	spin_unlock_wait(&x->wait.lock);
 	/* ->done was non-zero and the wait on ->wait.lock has returned. */
 	return true;
 }
 EXPORT_SYMBOL(completion_done);
--- a/kernel/sched/core.c
+++ b/kernel/sched/core.c
@ -306,66 +306,6 @@ __read_mostly int scheduler_running;
 */
 int sysctl_sched_rt_runtime = 950000;
 /*
 * __task_rq_lock - lock the rq @p resides on.
 *
 * The caller must already hold p->pi_lock (checked with
 * lockdep_assert_held()).  Returns the rq with rq->lock held.
 */
 static inline struct rq *__task_rq_lock(struct task_struct *p)
 	__acquires(rq->lock)
 {
 	struct rq *rq;
 	lockdep_assert_held(&p->pi_lock);
 	for (;;) {
 		/* Sampled before locking; re-validated once the lock is held. */
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
 		/*
 		 * Keep the lock only if @p is still on this rq and is not
 		 * flagged as migrating.
 		 */
 		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
 			return rq;
 		/* Otherwise drop rq->lock and try again. */
 		raw_spin_unlock(&rq->lock);
 		/* Spin, without rq->lock held, while @p is flagged as migrating. */
 		while (unlikely(task_on_rq_migrating(p)))
 			cpu_relax();
 	}
 }
 /*
 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 *
 * The interrupt state is saved into *flags when p->pi_lock is taken.
 * Returns the rq with both p->pi_lock and rq->lock held.
 */
 static struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
 	__acquires(p->pi_lock)
 	__acquires(rq->lock)
 {
 	struct rq *rq;
 	for (;;) {
 		/* p->pi_lock is taken first; rq->lock is taken inside it. */
 		raw_spin_lock_irqsave(&p->pi_lock, *flags);
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
 		/* Keep both locks only if @p is still on this rq and not migrating. */
 		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
 			return rq;
 		/* Back out of both locks (irq flags restored) before retrying. */
 		raw_spin_unlock(&rq->lock);
 		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
 		/* Spin, with neither lock held, while @p is flagged as migrating. */
 		while (unlikely(task_on_rq_migrating(p)))
 			cpu_relax();
 	}
 }
 /*
 * __task_rq_unlock - release rq->lock only; no pi_lock is touched here.
 */
 static void __task_rq_unlock(struct rq *rq)
 	__releases(rq->lock)
 {
 	raw_spin_unlock(&rq->lock);
 }
 /*
 * task_rq_unlock - release rq->lock, then p->pi_lock, restoring the
 * interrupt state from *flags.  This is the reverse of the order in which
 * task_rq_lock() takes the two locks.
 */
 static inline void
 task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
 	__releases(rq->lock)
 	__releases(p->pi_lock)
 {
 	raw_spin_unlock(&rq->lock);
 	raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
 }
 /*
 * this_rq_lock - lock this runqueue and disable interrupts.
 */
@ -2899,7 +2839,7 @@ void __sched schedule_preempt_disabled(void)
 	preempt_disable();
 }
-static void preempt_schedule_common(void)
+static void __sched notrace preempt_schedule_common(void)
 {
 	do {
 		__preempt_count_add(PREEMPT_ACTIVE);
@ -4418,36 +4358,29 @@ EXPORT_SYMBOL_GPL(yield_to);
 * This task is about to go to sleep on IO. Increment rq->nr_iowait so
 * that process accounting knows that this is a task in IO wait state.
 */
 void __sched io_schedule(void)
 {
 	/* Looked up once; the same rq pointer is used for the inc and the dec. */
 	struct rq *rq = raw_rq();
 	delayacct_blkio_start();
 	atomic_inc(&rq->nr_iowait);
 	/*
 	 * Called before schedule(); presumably submits current's plugged
 	 * block I/O -- confirm against the block layer.
 	 */
 	blk_flush_plug(current);
 	/* in_iowait is set only across the schedule() call. */
 	current->in_iowait = 1;
 	schedule();
 	/* Cleared unconditionally, whatever the value was on entry. */
 	current->in_iowait = 0;
 	atomic_dec(&rq->nr_iowait);
 	delayacct_blkio_end();
 }
 EXPORT_SYMBOL(io_schedule);
 /*
 * io_schedule_timeout - schedule_timeout() while flagged as waiting on I/O.
 *
 * Sets current->in_iowait and increments rq->nr_iowait around the
 * schedule_timeout(@timeout) call, whose result is returned.
 *
 * NOTE(review): as displayed, this body mixes the two sides of the change.
 * Only some lines carry a -/+ marker, delayacct_blkio_start(), atomic_inc()
 * and blk_flush_plug() each appear twice, and rq is used before the
 * "rq = raw_rq()" assignment.  The unmarked first occurrences presumably
 * belong to the removed side -- TODO confirm against the original patch.
 */
 long __sched io_schedule_timeout(long timeout)
 {
 	/* The + side remembers in_iowait as it was on entry. */
-	struct rq *rq = raw_rq();
+	int old_iowait = current->in_iowait;
 	struct rq *rq;
 	long ret;
 	delayacct_blkio_start();
 	atomic_inc(&rq->nr_iowait);
 	blk_flush_plug(current);
 	current->in_iowait = 1;
 	/*
 	 * in_iowait was already set on entry: call blk_schedule_flush_plug()
 	 * rather than blk_flush_plug().  Presumably this is the io_schedule()
 	 * recursion fix named in the merge message -- confirm.
 	 */
 	if (old_iowait)
 		blk_schedule_flush_plug(current);
 	else
 		blk_flush_plug(current);
 	delayacct_blkio_start();
 	rq = raw_rq();
 	atomic_inc(&rq->nr_iowait);
 	ret = schedule_timeout(timeout);
 	/* The + side puts back the entry value instead of forcing 0. */
-	current->in_iowait = 0;
+	current->in_iowait = old_iowait;
 	atomic_dec(&rq->nr_iowait);
 	delayacct_blkio_end();
 	return ret;
 }
 EXPORT_SYMBOL(io_schedule_timeout);
 /**
 * sys_sched_get_priority_max - return maximum RT priority.
@ -7642,6 +7575,12 @@ static inline int tg_has_rt_tasks(struct task_group *tg)
 {
 	struct task_struct *g, *p;
 	/*
 	 * Autogroups do not have RT tasks; see autogroup_create().
 	 */
 	if (task_group_is_autogroup(tg))
 		return 0;
 	for_each_process_thread(g, p) {
 		if (rt_task(p) && task_group(p) == tg)
 			return 1;
@ -7734,6 +7673,17 @@ static int tg_set_rt_bandwidth(struct task_group *tg,
 {
 	int i, err = 0;
 	/*
 	 * Disallowing the root group RT runtime is BAD, it would disallow the
 	 * kernel creating (and or operating) RT threads.
 	 */
 	if (tg == &root_task_group && rt_runtime == 0)
 		return -EINVAL;
 	/* No period doesn't make any sense. */
 	if (rt_period == 0)
 		return -EINVAL;
 	mutex_lock(&rt_constraints_mutex);
 	read_lock(&tasklist_lock);
 	err = __rt_schedulable(tg, rt_period, rt_runtime);
@ -7790,9 +7740,6 @@ static int sched_group_set_rt_period(struct task_group *tg, long rt_period_us)
 	rt_period = (u64)rt_period_us * NSEC_PER_USEC;
 	rt_runtime = tg->rt_bandwidth.rt_runtime;
 	if (rt_period == 0)
 		return -EINVAL;
 	return tg_set_rt_bandwidth(tg, rt_period, rt_runtime);
 }
--- a/kernel/sched/deadline.c
+++ b/kernel/sched/deadline.c
@ -511,16 +511,10 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer)
 						     struct sched_dl_entity,
 						     dl_timer);
 	struct task_struct *p = dl_task_of(dl_se);
 	unsigned long flags;
 	struct rq *rq;
 again:
 	rq = task_rq(p);
 	raw_spin_lock(&rq->lock);
-	if (rq != task_rq(p)) {
+	rq = task_rq_lock(current, &flags);
 		/* Task was moved, retrying. */
 		raw_spin_unlock(&rq->lock);
 		goto again;
 	}
 	/*
 	 * We need to take care of several possible races here:
@ -541,6 +535,26 @@ again:
 	sched_clock_tick();
 	update_rq_clock(rq);
 	/*
 	 * If the throttle happened during sched-out; like:
 	 *
 	 *   schedule()
 	 *     deactivate_task()
 	 *       dequeue_task_dl()
 	 *         update_curr_dl()
 	 *           start_dl_timer()
 	 *         __dequeue_task_dl()
 	 *     prev->on_rq = 0;
 	 *
 	 * We can be both throttled and !queued. Replenish the counter
 	 * but do not enqueue -- wait for our wakeup to do that.
 	 */
 	if (!task_on_rq_queued(p)) {
 		replenish_dl_entity(dl_se, dl_se);
 		goto unlock;
 	}
 	enqueue_task_dl(rq, p, ENQUEUE_REPLENISH);
 	if (dl_task(rq->curr))
 		check_preempt_curr_dl(rq, p, 0);
@ -555,7 +569,7 @@ again:
 		push_dl_task(rq);
 #endif
 unlock:
-	raw_spin_unlock(&rq->lock);
+	task_rq_unlock(rq, current, &flags);
 	return HRTIMER_NORESTART;
 }
@ -898,6 +912,7 @@ static void yield_task_dl(struct rq *rq)
 		rq->curr->dl.dl_yielded = 1;
 		p->dl.runtime = 0;
 	}
 	update_rq_clock(rq);
 	update_curr_dl(rq);
 }
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@ -1380,6 +1380,82 @@ static inline void sched_avg_update(struct rq *rq) { }
 extern void start_bandwidth_timer(struct hrtimer *period_timer, ktime_t period);
 /*
 * __task_rq_lock - lock the rq @p resides on.
 *
 * The caller must already hold p->pi_lock (checked with
 * lockdep_assert_held()).  Returns the rq with rq->lock held.
 */
 static inline struct rq *__task_rq_lock(struct task_struct *p)
 	__acquires(rq->lock)
 {
 	struct rq *rq;
 	lockdep_assert_held(&p->pi_lock);
 	for (;;) {
 		/* Sampled before locking; re-validated once the lock is held. */
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
 		/*
 		 * Keep the lock only if @p is still on this rq and is not
 		 * flagged as migrating.
 		 */
 		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
 			return rq;
 		/* Otherwise drop rq->lock and try again. */
 		raw_spin_unlock(&rq->lock);
 		/* Spin, without rq->lock held, while @p is flagged as migrating. */
 		while (unlikely(task_on_rq_migrating(p)))
 			cpu_relax();
 	}
 }
 /*
 * task_rq_lock - lock p->pi_lock and lock the rq @p resides on.
 *
 * The interrupt state is saved into *flags when p->pi_lock is taken.
 * Returns the rq with both p->pi_lock and rq->lock held.
 */
 static inline struct rq *task_rq_lock(struct task_struct *p, unsigned long *flags)
 	__acquires(p->pi_lock)
 	__acquires(rq->lock)
 {
 	struct rq *rq;
 	for (;;) {
 		/* p->pi_lock is taken first; rq->lock is taken inside it. */
 		raw_spin_lock_irqsave(&p->pi_lock, *flags);
 		rq = task_rq(p);
 		raw_spin_lock(&rq->lock);
 		/*
 		 *	move_queued_task()		task_rq_lock()
 		 *
 		 *	ACQUIRE (rq->lock)
 		 *	[S] ->on_rq = MIGRATING		[L] rq = task_rq()
 		 *	WMB (__set_task_cpu())		ACQUIRE (rq->lock);
 		 *	[S] ->cpu = new_cpu		[L] task_rq()
 		 *					[L] ->on_rq
 		 *	RELEASE (rq->lock)
 		 *
 		 * If we observe the old cpu in task_rq_lock, the acquire of
 		 * the old rq->lock will fully serialize against the stores.
 		 *
 		 * If we observe the new cpu in task_rq_lock, the acquire will
 		 * pair with the WMB to ensure we must then also see migrating.
 		 */
 		if (likely(rq == task_rq(p) && !task_on_rq_migrating(p)))
 			return rq;
 		/* Back out of both locks (irq flags restored) before retrying. */
 		raw_spin_unlock(&rq->lock);
 		raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
 		/* Spin, with neither lock held, while @p is flagged as migrating. */
 		while (unlikely(task_on_rq_migrating(p)))
 			cpu_relax();
 	}
 }
 /*
 * __task_rq_unlock - release rq->lock only; no pi_lock is touched here.
 */
 static inline void __task_rq_unlock(struct rq *rq)
 	__releases(rq->lock)
 {
 	raw_spin_unlock(&rq->lock);
 }
 /*
 * task_rq_unlock - release rq->lock, then p->pi_lock, restoring the
 * interrupt state from *flags.  This is the reverse of the order in which
 * task_rq_lock() takes the two locks.
 */
 static inline void
 task_rq_unlock(struct rq *rq, struct task_struct *p, unsigned long *flags)
 	__releases(rq->lock)
 	__releases(p->pi_lock)
 {
 	raw_spin_unlock(&rq->lock);
 	raw_spin_unlock_irqrestore(&p->pi_lock, *flags);
 }
 #ifdef CONFIG_SMP
 #ifdef CONFIG_PREEMPT