From d484b8bfc6fa71a088e4ac85d9ce11aa0385867e Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Mon, 8 Mar 2021 16:31:46 +0200 Subject: [PATCH 1/7] intel_idle: update ICX C6 data Change IceLake Xeon C6 latency from 128 us to 170 us. The latency was measured with the "wult" tool and corresponds to the 99.99th percentile when measuring with the "nic" method. Note, the 128 us figure corresponds to the median latency, but in intel_idle we use the "worst case" latency figure instead. C6 target residency was increased from 384 us to 600 us, which may result in less C6 residency in some workloads. This value was tested and compared to values 384, and 1000. Value 600 is a reasonable tradeoff between power and performance. Signed-off-by: Artem Bityutskiy Acked-by: Zhang Rui Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 3273360f30f7..6cac0b748efa 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -744,8 +744,8 @@ static struct cpuidle_state icx_cstates[] __initdata = { .name = "C6", .desc = "MWAIT 0x20", .flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED, - .exit_latency = 128, - .target_residency = 384, + .exit_latency = 170, + .target_residency = 600, .enter = &intel_idle, .enter_s2idle = intel_idle_s2idle, }, { From 4c81cb7e64436a729cf20cdddaf18a9b4a638430 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 29 Mar 2021 20:13:37 +0200 Subject: [PATCH 2/7] tick/nohz: Improve tick_nohz_get_next_hrtimer() kerneldoc Make the tick_nohz_get_next_hrtimer() kerneldoc comment state clearly that the function may return negative numbers. Signed-off-by: Rafael J. 
Wysocki --- kernel/time/tick-sched.c | 6 +++++- 1 file changed, 5 insertions(+), 1 deletion(-) diff --git a/kernel/time/tick-sched.c b/kernel/time/tick-sched.c index e10a4af88737..ee0032b95a7c 100644 --- a/kernel/time/tick-sched.c +++ b/kernel/time/tick-sched.c @@ -1124,7 +1124,11 @@ ktime_t tick_nohz_get_next_hrtimer(void) * tick_nohz_get_sleep_length - return the expected length of the current sleep * @delta_next: duration until the next event if the tick cannot be stopped * - * Called from power state control code with interrupts disabled + * Called from power state control code with interrupts disabled. + * + * The return value of this function and/or the value returned by it through the + * @delta_next pointer can be negative which must be taken into account by its + * callers. */ ktime_t tick_nohz_get_sleep_length(ktime_t *delta_next) { From 2ab80d46fead0309d7f190d8023c8d64b2ffcbd5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 29 Mar 2021 20:15:19 +0200 Subject: [PATCH 3/7] cpuidle: Use s64 as exit_latency_ns and target_residency_ns data type Subsequent changes will cause the exit_latency_ns and target_residency_ns fields in struct cpuidle_state to be used in computations in which data type conversions to u64 may turn a negative number close to zero into a very large positive number leading to incorrect results. In preparation for that, change the data type of the fields mentioned above to s64, but ensure that they will not be negative themselves. No intentional functional impact. Signed-off-by: Rafael J. 
Wysocki --- drivers/cpuidle/driver.c | 4 ++++ include/linux/cpuidle.h | 4 ++-- 2 files changed, 6 insertions(+), 2 deletions(-) diff --git a/drivers/cpuidle/driver.c b/drivers/cpuidle/driver.c index 4070e573bf43..f70aa17e2a8e 100644 --- a/drivers/cpuidle/driver.c +++ b/drivers/cpuidle/driver.c @@ -181,9 +181,13 @@ static void __cpuidle_driver_init(struct cpuidle_driver *drv) */ if (s->target_residency > 0) s->target_residency_ns = s->target_residency * NSEC_PER_USEC; + else if (s->target_residency_ns < 0) + s->target_residency_ns = 0; if (s->exit_latency > 0) s->exit_latency_ns = s->exit_latency * NSEC_PER_USEC; + else if (s->exit_latency_ns < 0) + s->exit_latency_ns = 0; } } diff --git a/include/linux/cpuidle.h b/include/linux/cpuidle.h index bd605b5585cf..fce476275e16 100644 --- a/include/linux/cpuidle.h +++ b/include/linux/cpuidle.h @@ -49,8 +49,8 @@ struct cpuidle_state { char name[CPUIDLE_NAME_LEN]; char desc[CPUIDLE_DESC_LEN]; - u64 exit_latency_ns; - u64 target_residency_ns; + s64 exit_latency_ns; + s64 target_residency_ns; unsigned int flags; unsigned int exit_latency; /* in US */ int power_usage; /* in mW */ From d3c33be1f350e7fd2b04381b000f3f950bd1ba77 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 29 Mar 2021 20:19:03 +0200 Subject: [PATCH 4/7] cpuidle: teo: Adjust handling of very short idle times If the time till the next timer event is shorter than the target residency of the first idle state (state 0), the TEO governor does not update its metrics for any idle states, but arguably it should record a "hit" for idle state 0 in that case, so modify it to do that. Accordingly, also make it record an "early hit" for idle state 0 if the measured idle duration is less than its target residency, which allows one branch more to be dropped from teo_update(). Signed-off-by: Rafael J. 
Wysocki --- drivers/cpuidle/governors/teo.c | 30 ++++++++++++++---------------- 1 file changed, 14 insertions(+), 16 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 6deaaf5f05b5..7dcfc9499910 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -117,7 +117,8 @@ static DEFINE_PER_CPU(struct teo_cpu, teo_cpus); static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) { struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); - int i, idx_hit = -1, idx_timer = -1; + int i, idx_hit = 0, idx_timer = 0; + unsigned int hits, misses; u64 measured_ns; if (cpu_data->time_span_ns >= cpu_data->sleep_length_ns) { @@ -174,25 +175,22 @@ static void teo_update(struct cpuidle_driver *drv, struct cpuidle_device *dev) * also increase the "early hits" metric for the state that actually * matches the measured idle duration. */ - if (idx_timer >= 0) { - unsigned int hits = cpu_data->states[idx_timer].hits; - unsigned int misses = cpu_data->states[idx_timer].misses; + hits = cpu_data->states[idx_timer].hits; + hits -= hits >> DECAY_SHIFT; - hits -= hits >> DECAY_SHIFT; - misses -= misses >> DECAY_SHIFT; + misses = cpu_data->states[idx_timer].misses; + misses -= misses >> DECAY_SHIFT; - if (idx_timer > idx_hit) { - misses += PULSE; - if (idx_hit >= 0) - cpu_data->states[idx_hit].early_hits += PULSE; - } else { - hits += PULSE; - } - - cpu_data->states[idx_timer].misses = misses; - cpu_data->states[idx_timer].hits = hits; + if (idx_timer == idx_hit) { + hits += PULSE; + } else { + misses += PULSE; + cpu_data->states[idx_hit].early_hits += PULSE; } + cpu_data->states[idx_timer].misses = misses; + cpu_data->states[idx_timer].hits = hits; + /* * Save idle duration values corresponding to non-timer wakeups for * pattern detection. From 030adec9f68e30cbbc24c57296a141943177c148 Mon Sep 17 00:00:00 2001 From: "Rafael J. 
Wysocki" Date: Mon, 29 Mar 2021 20:21:43 +0200 Subject: [PATCH 5/7] cpuidle: teo: Take negative "sleep length" values into account Modify the TEO governor to take possible negative return values of tick_nohz_get_next_hrtimer() into account by changing the data type of some variables used by it to s64 which allows it to carry out computations without potentially problematic data type conversions into u64. Also change the computations in teo_select() so that the negative values themselves are handled in a natural way to avoid adding extra negative value checks to that function. Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/governors/teo.c | 22 +++++++++++++++------- 1 file changed, 15 insertions(+), 7 deletions(-) diff --git a/drivers/cpuidle/governors/teo.c b/drivers/cpuidle/governors/teo.c index 7dcfc9499910..ac4bb27d69b0 100644 --- a/drivers/cpuidle/governors/teo.c +++ b/drivers/cpuidle/governors/teo.c @@ -100,8 +100,8 @@ struct teo_idle_state { * @intervals: Saved idle duration values. 
*/ struct teo_cpu { - u64 time_span_ns; - u64 sleep_length_ns; + s64 time_span_ns; + s64 sleep_length_ns; struct teo_idle_state states[CPUIDLE_STATE_MAX]; int interval_idx; u64 intervals[INTERVALS]; @@ -214,7 +214,7 @@ static bool teo_time_ok(u64 interval_ns) */ static int teo_find_shallower_state(struct cpuidle_driver *drv, struct cpuidle_device *dev, int state_idx, - u64 duration_ns) + s64 duration_ns) { int i; @@ -240,10 +240,10 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, { struct teo_cpu *cpu_data = per_cpu_ptr(&teo_cpus, dev->cpu); s64 latency_req = cpuidle_governor_latency_req(dev->cpu); - u64 duration_ns; + int max_early_idx, prev_max_early_idx, constraint_idx, idx0, idx, i; unsigned int hits, misses, early_hits; - int max_early_idx, prev_max_early_idx, constraint_idx, idx, i; ktime_t delta_tick; + s64 duration_ns; if (dev->last_state_idx >= 0) { teo_update(drv, dev); @@ -262,6 +262,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, prev_max_early_idx = -1; constraint_idx = drv->state_count; idx = -1; + idx0 = idx; for (i = 0; i < drv->state_count; i++) { struct cpuidle_state *s = &drv->states[i]; @@ -322,6 +323,7 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, idx = i; /* first enabled state */ hits = cpu_data->states[i].hits; misses = cpu_data->states[i].misses; + idx0 = i; } if (s->target_residency_ns > duration_ns) @@ -374,11 +376,16 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, if (idx < 0) { idx = 0; /* No states enabled. Must use 0. */ - } else if (idx > 0) { + } else if (idx > idx0) { unsigned int count = 0; u64 sum = 0; /* + * The target residencies of at least two different enabled idle + * states are less than or equal to the current expected idle + * duration. Try to refine the selection using the most recent + * measured idle duration values. 
+ * * Count and sum the most recent idle duration values less than * the current expected idle duration value. */ @@ -426,7 +433,8 @@ static int teo_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * till the closest timer including the tick, try to correct * that. */ - if (idx > 0 && drv->states[idx].target_residency_ns > delta_tick) + if (idx > idx0 && + drv->states[idx].target_residency_ns > delta_tick) idx = teo_find_shallower_state(drv, dev, idx, delta_tick); } From 060e3535adf5c961b01421b9fdaddab8dd43ba85 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 29 Mar 2021 20:37:12 +0200 Subject: [PATCH 6/7] cpuidle: menu: Take negative "sleep length" values into account Make the menu governor check the tick_nohz_get_next_hrtimer() return value so as to avoid dealing with negative "sleep length" values and make it use that value directly when the tick is stopped. While at it, rename local variable delta_next in menu_select() to delta_tick which better reflects its purpose. Signed-off-by: Rafael J. 
Wysocki --- drivers/cpuidle/governors/menu.c | 17 +++++++++++------ 1 file changed, 11 insertions(+), 6 deletions(-) diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index b0a7ad566081..c3aa8d6ccee3 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -271,7 +271,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, u64 predicted_ns; u64 interactivity_req; unsigned long nr_iowaiters; - ktime_t delta_next; + ktime_t delta, delta_tick; int i, idx; if (data->needs_update) { @@ -280,7 +280,12 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, } /* determine the expected residency time, round up */ - data->next_timer_ns = tick_nohz_get_sleep_length(&delta_next); + delta = tick_nohz_get_sleep_length(&delta_tick); + if (unlikely(delta < 0)) { + delta = 0; + delta_tick = 0; + } + data->next_timer_ns = delta; nr_iowaiters = nr_iowait_cpu(dev->cpu); data->bucket = which_bucket(data->next_timer_ns, nr_iowaiters); @@ -318,7 +323,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * state selection. */ if (predicted_ns < TICK_NSEC) - predicted_ns = delta_next; + predicted_ns = data->next_timer_ns; } else { /* * Use the performance multiplier and the user-configurable @@ -377,7 +382,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, * stuck in the shallow one for too long. 
*/ if (drv->states[idx].target_residency_ns < TICK_NSEC && - s->target_residency_ns <= delta_next) + s->target_residency_ns <= delta_tick) idx = i; return idx; @@ -399,7 +404,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, predicted_ns < TICK_NSEC) && !tick_nohz_tick_stopped()) { *stop_tick = false; - if (idx > 0 && drv->states[idx].target_residency_ns > delta_next) { + if (idx > 0 && drv->states[idx].target_residency_ns > delta_tick) { /* * The tick is not going to be stopped and the target * residency of the state to be returned is not within @@ -411,7 +416,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev, continue; idx = i; - if (drv->states[i].target_residency_ns <= delta_next) + if (drv->states[i].target_residency_ns <= delta_tick) break; } } From 22141d5f411895bb1b0df2a6b05f702e11e63918 Mon Sep 17 00:00:00 2001 From: Artem Bityutskiy Date: Wed, 7 Apr 2021 09:10:28 +0300 Subject: [PATCH 7/7] intel_idle: add Icelake-D support This patch adds Icelake Xeon D support to the intel_idle driver. Since Icelake D and Icelake SP C-state characteristics are the same, we use Icelake SP C-states table for Icelake D as well. Signed-off-by: Artem Bityutskiy Acked-by: Chen Yu Signed-off-by: Rafael J. Wysocki --- drivers/idle/intel_idle.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 6cac0b748efa..ec1b9d306ba6 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -1156,6 +1156,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { X86_MATCH_INTEL_FAM6_MODEL(KABYLAKE, &idle_cpu_skl), X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &idle_cpu_skx), X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &idle_cpu_icx), + X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &idle_cpu_icx), X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &idle_cpu_knl), X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &idle_cpu_knl), X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT, &idle_cpu_bxt),