From ceb6ba45dc8074d2a1ec1117463dc94a20d4203d Mon Sep 17 00:00:00 2001
From: Vincent Guittot
Date: Thu, 1 Jul 2021 19:18:37 +0200
Subject: [PATCH 1/3] sched/fair: Sync load_sum with load_avg after dequeue

commit 9e077b52d86a ("sched/pelt: Check that *_avg are null when *_sum are")
reported some inconsistencies between *_avg and *_sum.

commit 1c35b07e6d39 ("sched/fair: Ensure _sum and _avg values stay
consistent") fixed some of them, but one remains when dequeuing load.

Sync the cfs_rq's load_sum with its load_avg after dequeuing the load of
a sched_entity.

Fixes: 9e077b52d86a ("sched/pelt: Check that *_avg are null when *_sum are")
Reported-by: Sachin Sant
Signed-off-by: Vincent Guittot
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Odin Ugedal
Tested-by: Sachin Sant
Link: https://lore.kernel.org/r/20210701171837.32156-1-vincent.guittot@linaro.org
---
 kernel/sched/fair.c | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 45edf61eed73..1e263c9cdc13 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -3037,8 +3037,9 @@ enqueue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 static inline void
 dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se)
 {
+	u32 divider = get_pelt_divider(&se->avg);
 	sub_positive(&cfs_rq->avg.load_avg, se->avg.load_avg);
-	sub_positive(&cfs_rq->avg.load_sum, se_weight(se) * se->avg.load_sum);
+	cfs_rq->avg.load_sum = cfs_rq->avg.load_avg * divider;
 }
 #else
 static inline void

From 72d0ad7cb5bad265adb2014dbe46c4ccb11afaba Mon Sep 17 00:00:00 2001
From: Odin Ugedal
Date: Tue, 29 Jun 2021 14:14:52 +0200
Subject: [PATCH 2/3] sched/fair: Fix CFS bandwidth hrtimer expiry type

The time remaining until expiry of the refresh_timer can be negative.
Casting the type to an unsigned 64-bit value will cause integer
underflow, making runtime_refresh_within() return false instead of
true. These situations are rare, but they do happen.

This does not cause user-facing issues or errors, other than possibly
unthrottling cfs_rq's using runtime from the previous period(s), making
CFS bandwidth enforcement less strict in those (special) situations.

Signed-off-by: Odin Ugedal
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Ben Segall
Link: https://lore.kernel.org/r/20210629121452.18429-1-odin@uged.al
---
 kernel/sched/fair.c | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c
index 1e263c9cdc13..1b15a19910a3 100644
--- a/kernel/sched/fair.c
+++ b/kernel/sched/fair.c
@@ -5054,7 +5054,7 @@ static const u64 cfs_bandwidth_slack_period = 5 * NSEC_PER_MSEC;
 static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
 {
 	struct hrtimer *refresh_timer = &cfs_b->period_timer;
-	u64 remaining;
+	s64 remaining;
 
 	/* if the call-back is running a quota refresh is already occurring */
 	if (hrtimer_callback_running(refresh_timer))
@@ -5062,7 +5062,7 @@ static int runtime_refresh_within(struct cfs_bandwidth *cfs_b, u64 min_expire)
 
 	/* is a quota refresh about to occur? */
 	remaining = ktime_to_ns(hrtimer_expires_remaining(refresh_timer));
-	if (remaining < min_expire)
+	if (remaining < (s64)min_expire)
 		return 1;
 
 	return 0;
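
A minimal user-space sketch (not kernel code) of the comparison bug the patch
above fixes: expires_remaining_ns() and the two refresh_within_*() helpers are
made-up stand-ins for the hrtimer plumbing, kept only to show why storing a
negative remaining time in a u64 defeats the "about to expire" check.

#include <stdint.h>
#include <stdio.h>

/* Hypothetical stand-in for ktime_to_ns(hrtimer_expires_remaining(...)). */
static int64_t expires_remaining_ns(void)
{
	return -2000000;			/* timer expired 2ms ago */
}

/* Old behaviour: the negative value wraps to a huge unsigned number. */
static int refresh_within_u64(uint64_t min_expire)
{
	uint64_t remaining = expires_remaining_ns();	/* wraps to ~2^64 */

	return remaining < min_expire;		/* 0: refresh looks far away */
}

/* Patched behaviour: keep the value signed and cast min_expire instead. */
static int refresh_within_s64(uint64_t min_expire)
{
	int64_t remaining = expires_remaining_ns();

	return remaining < (int64_t)min_expire;	/* 1: refresh is imminent */
}

int main(void)
{
	uint64_t min_expire = 2000000ULL;	/* 2ms, for illustration only */

	printf("u64: %d  s64: %d\n",
	       refresh_within_u64(min_expire),
	       refresh_within_s64(min_expire));
	return 0;
}

With the unsigned variable an already-expired timer looks as if it were far
from expiring, so the slack path can unthrottle using runtime from a previous
period, as described in the changelog.
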
From 3e1493f46390618ea78607cb30c58fc19e2a5035 Mon Sep 17 00:00:00 2001
From: Xuewen Yan
Date: Wed, 30 Jun 2021 22:12:04 +0800
Subject: [PATCH 3/3] sched/uclamp: Ignore max aggregation if rq is idle

When a task wakes up on an idle rq, uclamp_rq_util_with() would max
aggregate with the rq value. But since there is no task enqueued yet,
the rq values are stale, based on the last task that was running. When
the newly woken task is actually enqueued, the rq uclamp values should
reflect its effective uclamp values.

This is a problem particularly for uclamp_max because it defaults to
1024. If a task p with uclamp_max = 512 wakes up, then max aggregation
would ignore the capping that should apply when this task is enqueued,
which is wrong.

Fix that by ignoring max aggregation if the rq is idle, since in that
case the effective uclamp values of the rq will be those of the task
that is waking up.

Fixes: 9d20ad7dfc9a ("sched/uclamp: Add uclamp_util_with()")
Signed-off-by: Xuewen Yan
Signed-off-by: Peter Zijlstra (Intel)
Reviewed-by: Valentin Schneider
[qais: Changelog]
Reviewed-by: Qais Yousef
Link: https://lore.kernel.org/r/20210630141204.8197-1-xuewen.yan94@gmail.com
---
 kernel/sched/sched.h | 21 ++++++++++++++-------
 1 file changed, 14 insertions(+), 7 deletions(-)

diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h
index c80d42e9589b..14a41a243f7b 100644
--- a/kernel/sched/sched.h
+++ b/kernel/sched/sched.h
@@ -2818,20 +2818,27 @@ static __always_inline
 unsigned long uclamp_rq_util_with(struct rq *rq, unsigned long util,
 				  struct task_struct *p)
 {
-	unsigned long min_util;
-	unsigned long max_util;
+	unsigned long min_util = 0;
+	unsigned long max_util = 0;
 
 	if (!static_branch_likely(&sched_uclamp_used))
 		return util;
 
-	min_util = READ_ONCE(rq->uclamp[UCLAMP_MIN].value);
-	max_util = READ_ONCE(rq->uclamp[UCLAMP_MAX].value);
-
 	if (p) {
-		min_util = max(min_util, uclamp_eff_value(p, UCLAMP_MIN));
-		max_util = max(max_util, uclamp_eff_value(p, UCLAMP_MAX));
+		min_util = uclamp_eff_value(p, UCLAMP_MIN);
+		max_util = uclamp_eff_value(p, UCLAMP_MAX);
+
+		/*
+		 * Ignore last runnable task's max clamp, as this task will
+		 * reset it. Similarly, no need to read the rq's min clamp.
+		 */
+		if (rq->uclamp_flags & UCLAMP_FLAG_IDLE)
+			goto out;
 	}
 
+	min_util = max_t(unsigned long, min_util, READ_ONCE(rq->uclamp[UCLAMP_MIN].value));
+	max_util = max_t(unsigned long, max_util, READ_ONCE(rq->uclamp[UCLAMP_MAX].value));
+out:
 	/*
 	 * Since CPU's {min,max}_util clamps are MAX aggregated considering
 	 * RUNNABLE tasks with _different_ clamps, we can end up with an
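
A minimal user-space sketch of the clamping behaviour the patch above changes:
struct rq_sketch, rq_util_with() and the helpers below are illustrative
stand-ins, not the kernel's uclamp_rq_util_with() or its data structures; the
numbers follow the uclamp_max = 512 example from the changelog.

#include <stdio.h>

#define UCLAMP_FLAG_IDLE	0x01

struct rq_sketch {
	unsigned long uclamp_min;	/* rq-wide aggregated min clamp */
	unsigned long uclamp_max;	/* rq-wide aggregated max clamp */
	unsigned int flags;
};

static unsigned long max_ul(unsigned long a, unsigned long b)
{
	return a > b ? a : b;
}

static unsigned long clamp_ul(unsigned long v, unsigned long lo, unsigned long hi)
{
	return v < lo ? lo : (v > hi ? hi : v);
}

/* Mirrors the post-patch logic for the "task p is being woken" case. */
static unsigned long rq_util_with(struct rq_sketch *rq, unsigned long util,
				  unsigned long p_min, unsigned long p_max)
{
	unsigned long min_util = p_min;
	unsigned long max_util = p_max;

	/* Idle rq: its stale clamps will be reset by this wakeup, skip them. */
	if (!(rq->flags & UCLAMP_FLAG_IDLE)) {
		min_util = max_ul(min_util, rq->uclamp_min);
		max_util = max_ul(max_util, rq->uclamp_max);
	}

	return clamp_ul(util, min_util, max_util);
}

int main(void)
{
	/* Idle rq whose last runnable task left uclamp_max at 1024. */
	struct rq_sketch rq = {
		.uclamp_min = 0,
		.uclamp_max = 1024,
		.flags = UCLAMP_FLAG_IDLE,
	};

	/* Task p has uclamp_max = 512 and a raw utilization estimate of 800. */
	printf("capped util = %lu\n", rq_util_with(&rq, 800, 0, 512));

	return 0;
}

This prints 512. Without the UCLAMP_FLAG_IDLE check, the stale rq-wide
uclamp_max of 1024 would win the max aggregation and the result would be 800,
i.e. the task's cap would be ignored exactly as the changelog describes.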