From 027b6934aaa3a7eff90598980aca992b5f1f1e2e Mon Sep 17 00:00:00 2001 From: Geert Uytterhoeven Date: Mon, 14 Mar 2016 16:29:02 +0100 Subject: [PATCH 01/23] PM / devfreq: Spelling s/frequnecy/frequency/ Signed-off-by: Geert Uytterhoeven Reviewed-by: Chanwoo Choi Signed-off-by: Rafael J. Wysocki --- drivers/devfreq/Kconfig | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/devfreq/Kconfig b/drivers/devfreq/Kconfig index 64281bb2f650..4de78c552251 100644 --- a/drivers/devfreq/Kconfig +++ b/drivers/devfreq/Kconfig @@ -61,7 +61,7 @@ config DEVFREQ_GOV_USERSPACE Sets the frequency at the user specified one. This governor returns the user configured frequency if there has been an input to /sys/devices/.../power/devfreq_set_freq. - Otherwise, the governor does not change the frequnecy + Otherwise, the governor does not change the frequency given at the initialization. comment "DEVFREQ Drivers" From 02113ba93ea414b7113d64e42b87dc0720e3e578 Mon Sep 17 00:00:00 2001 From: Jon Hunter Date: Tue, 15 Mar 2016 11:33:40 +0000 Subject: [PATCH 02/23] PM / clk: Add support for obtaining clocks from device-tree The PM clocks framework requires clients to pass either a con-id or a valid clk pointer in order to add a clock to a device. Add a new function of_pm_clk_add_clks() to allows device clocks to be retrieved from device-tree and populated for a given device. Note that it is not necessary to make the compilation of this new function dependent upon CONFIG_OF because there are stubs functions for the device-tree APIs used. In order to handle errors encountered when adding clocks from device-tree, add a function pm_clk_remove_clk() to remove any clocks (using a pointer to the clk structure) that have been added successfully before the error occurred. Signed-off-by: Jon Hunter Acked-by: Geert Uytterhoeven Signed-off-by: Rafael J. Wysocki --- drivers/base/power/clock_ops.c | 89 ++++++++++++++++++++++++++++++++++ include/linux/pm_clock.h | 9 ++++ 2 files changed, 98 insertions(+) diff --git a/drivers/base/power/clock_ops.c b/drivers/base/power/clock_ops.c index 272a52ebafc0..0e64a1b5e62a 100644 --- a/drivers/base/power/clock_ops.c +++ b/drivers/base/power/clock_ops.c @@ -137,6 +137,62 @@ int pm_clk_add_clk(struct device *dev, struct clk *clk) return __pm_clk_add(dev, NULL, clk); } + +/** + * of_pm_clk_add_clks - Start using device clock(s) for power management. + * @dev: Device whose clock(s) is going to be used for power management. + * + * Add a series of clocks described in the 'clocks' device-tree node for + * a device to the list of clocks used for the power management of @dev. + * On success, returns the number of clocks added. Returns a negative + * error code if there are no clocks in the device node for the device + * or if adding a clock fails. + */ +int of_pm_clk_add_clks(struct device *dev) +{ + struct clk **clks; + unsigned int i, count; + int ret; + + if (!dev || !dev->of_node) + return -EINVAL; + + count = of_count_phandle_with_args(dev->of_node, "clocks", + "#clock-cells"); + if (count == 0) + return -ENODEV; + + clks = kcalloc(count, sizeof(*clks), GFP_KERNEL); + if (!clks) + return -ENOMEM; + + for (i = 0; i < count; i++) { + clks[i] = of_clk_get(dev->of_node, i); + if (IS_ERR(clks[i])) { + ret = PTR_ERR(clks[i]); + goto error; + } + + ret = pm_clk_add_clk(dev, clks[i]); + if (ret) { + clk_put(clks[i]); + goto error; + } + } + + kfree(clks); + + return i; + +error: + while (i--) + pm_clk_remove_clk(dev, clks[i]); + + kfree(clks); + + return ret; +} + /** * __pm_clk_remove - Destroy PM clock entry. * @ce: PM clock entry to destroy. @@ -197,6 +253,39 @@ void pm_clk_remove(struct device *dev, const char *con_id) __pm_clk_remove(ce); } +/** + * pm_clk_remove_clk - Stop using a device clock for power management. + * @dev: Device whose clock should not be used for PM any more. + * @clk: Clock pointer + * + * Remove the clock pointed to by @clk from the list of clocks used for + * the power management of @dev. + */ +void pm_clk_remove_clk(struct device *dev, struct clk *clk) +{ + struct pm_subsys_data *psd = dev_to_psd(dev); + struct pm_clock_entry *ce; + + if (!psd || !clk) + return; + + spin_lock_irq(&psd->lock); + + list_for_each_entry(ce, &psd->clock_list, node) { + if (clk == ce->clk) + goto remove; + } + + spin_unlock_irq(&psd->lock); + return; + + remove: + list_del(&ce->node); + spin_unlock_irq(&psd->lock); + + __pm_clk_remove(ce); +} + /** * pm_clk_init - Initialize a device's list of power management clocks. * @dev: Device to initialize the list of PM clocks for. diff --git a/include/linux/pm_clock.h b/include/linux/pm_clock.h index 25266c600021..308d6044f153 100644 --- a/include/linux/pm_clock.h +++ b/include/linux/pm_clock.h @@ -42,7 +42,9 @@ extern int pm_clk_create(struct device *dev); extern void pm_clk_destroy(struct device *dev); extern int pm_clk_add(struct device *dev, const char *con_id); extern int pm_clk_add_clk(struct device *dev, struct clk *clk); +extern int of_pm_clk_add_clks(struct device *dev); extern void pm_clk_remove(struct device *dev, const char *con_id); +extern void pm_clk_remove_clk(struct device *dev, struct clk *clk); extern int pm_clk_suspend(struct device *dev); extern int pm_clk_resume(struct device *dev); #else @@ -69,11 +71,18 @@ static inline int pm_clk_add_clk(struct device *dev, struct clk *clk) { return -EINVAL; } +static inline int of_pm_clk_add_clks(struct device *dev) +{ + return -EINVAL; +} static inline void pm_clk_remove(struct device *dev, const char *con_id) { } #define pm_clk_suspend NULL #define pm_clk_resume NULL +static inline void pm_clk_remove_clk(struct device *dev, struct clk *clk) +{ +} #endif #ifdef CONFIG_HAVE_CLK From e132b9b3bc7f19e9b158e42b323881d5dee5ecf3 Mon Sep 17 00:00:00 2001 From: Rik van Riel Date: Wed, 16 Mar 2016 12:14:00 -0400 Subject: [PATCH 03/23] cpuidle: menu: use high confidence factors only when considering polling The menu governor uses five different factors to pick the idle state: - the user configured latency_req - the time until the next timer (next_timer_us) - the typical sleep interval, as measured recently - an estimate of sleep time by dividing next_timer_us by an observed factor - a load corrected version of the above, divided again by load Only the first three items are known with enough confidence that we can use them to consider polling, instead of an actual CPU idle state, because the cost of being wrong about polling can be excessive power use. The latter two are used in the menu governor's main selection loop, and can result in choosing a shallower idle state when the system is expected to be busy again soon. This pushes a busy system in the "performance" direction of the performance<>power tradeoff, when choosing between idle states, but stays more strictly on the "power" state when deciding between polling and C1. Signed-off-by: Rik van Riel Signed-off-by: Rafael J. Wysocki --- drivers/cpuidle/governors/menu.c | 42 ++++++++++++++++++-------------- 1 file changed, 24 insertions(+), 18 deletions(-) diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 27fc733cb5b9..00c5f891e352 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -196,7 +196,7 @@ static void menu_update(struct cpuidle_driver *drv, struct cpuidle_device *dev); * of points is below a threshold. If it is... then use the * average of these 8 points as the estimated value. */ -static void get_typical_interval(struct menu_device *data) +static unsigned int get_typical_interval(struct menu_device *data) { int i, divisor; unsigned int max, thresh, avg; @@ -253,9 +253,7 @@ again: if (likely(variance <= U64_MAX/36)) { if ((((u64)avg*avg > variance*36) && (divisor * 4 >= INTERVALS * 3)) || variance <= 400) { - if (data->next_timer_us > avg) - data->predicted_us = avg; - return; + return avg; } } @@ -269,7 +267,7 @@ again: * with sporadic activity with a bunch of short pauses. */ if ((divisor * 4) <= INTERVALS * 3) - return; + return UINT_MAX; thresh = max - 1; goto again; @@ -286,6 +284,7 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) int latency_req = pm_qos_request(PM_QOS_CPU_DMA_LATENCY); int i; unsigned int interactivity_req; + unsigned int expected_interval; unsigned long nr_iowaiters, cpu_load; if (data->needs_update) { @@ -312,31 +311,38 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) data->correction_factor[data->bucket], RESOLUTION * DECAY); - get_typical_interval(data); - - /* - * Performance multiplier defines a minimum predicted idle - * duration / latency ratio. Adjust the latency limit if - * necessary. - */ - interactivity_req = data->predicted_us / performance_multiplier(nr_iowaiters, cpu_load); - if (latency_req > interactivity_req) - latency_req = interactivity_req; + expected_interval = get_typical_interval(data); + expected_interval = min(expected_interval, data->next_timer_us); if (CPUIDLE_DRIVER_STATE_START > 0) { data->last_state_idx = CPUIDLE_DRIVER_STATE_START - 1; /* * We want to default to C1 (hlt), not to busy polling - * unless the timer is happening really really soon. + * unless the timer is happening really really soon, or + * C1's exit latency exceeds the user configured limit. */ - if (interactivity_req > 20 && + if (expected_interval > drv->states[CPUIDLE_DRIVER_STATE_START].target_residency && + latency_req > drv->states[CPUIDLE_DRIVER_STATE_START].exit_latency && !drv->states[CPUIDLE_DRIVER_STATE_START].disabled && - dev->states_usage[CPUIDLE_DRIVER_STATE_START].disable == 0) + !dev->states_usage[CPUIDLE_DRIVER_STATE_START].disable) data->last_state_idx = CPUIDLE_DRIVER_STATE_START; } else { data->last_state_idx = CPUIDLE_DRIVER_STATE_START; } + /* + * Use the lowest expected idle interval to pick the idle state. + */ + data->predicted_us = min(data->predicted_us, expected_interval); + + /* + * Use the performance multiplier and the user-configurable + * latency_req to determine the maximum exit latency. + */ + interactivity_req = data->predicted_us / performance_multiplier(nr_iowaiters, cpu_load); + if (latency_req > interactivity_req) + latency_req = interactivity_req; + /* * Find the idle state with the lowest power while satisfying * our constraints. From f5eda99ee6c0c3e85be9d6f20685151514d5102f Mon Sep 17 00:00:00 2001 From: Wang Hongcheng Date: Fri, 11 Mar 2016 17:28:23 +0800 Subject: [PATCH 04/23] ACPI / APD: Add device HID for future AMD UART controller Add device HID AMDI0020 to match the AMD ACPI Vendor ID (AMDI) as registered in http://www.uefi.org/acpi_id_list, and the UART controller on future AMD paltform will use the HID instead of AMD0020. Signed-off-by: Wang Hongcheng Acked-by: Ken Xue Signed-off-by: Rafael J. Wysocki --- drivers/acpi/acpi_apd.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/acpi/acpi_apd.c b/drivers/acpi/acpi_apd.c index d0aad06b3872..f245bf35bedb 100644 --- a/drivers/acpi/acpi_apd.c +++ b/drivers/acpi/acpi_apd.c @@ -145,6 +145,7 @@ static const struct acpi_device_id acpi_apd_device_ids[] = { { "AMD0010", APD_ADDR(cz_i2c_desc) }, { "AMDI0010", APD_ADDR(cz_i2c_desc) }, { "AMD0020", APD_ADDR(cz_uart_desc) }, + { "AMDI0020", APD_ADDR(cz_uart_desc) }, { "AMD0030", }, #endif #ifdef CONFIG_ARM64 From bf4703fdd166fffd5b2e4c42d2ebbf708b94748e Mon Sep 17 00:00:00 2001 From: Irina Tirdea Date: Sun, 13 Mar 2016 02:33:30 +0200 Subject: [PATCH 05/23] ACPI / property: fix data node parsing in acpi_get_next_subnode() When an ACPI node has both ACPI device nodes and ACPI data nodes, acpi_get_next_subnode() will return the ACPI data nodes of its last parsed child. To avoid that, make acpi_get_next_subnode() go back to the original ACPI device object when all of the device node children of it have been found already. Signed-off-by: Irina Tirdea [ rjw: Changelog ] Signed-off-by: Rafael J. Wysocki --- drivers/acpi/property.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/acpi/property.c b/drivers/acpi/property.c index 2aee41655ce9..f2fd3fee588a 100644 --- a/drivers/acpi/property.c +++ b/drivers/acpi/property.c @@ -816,6 +816,7 @@ struct fwnode_handle *acpi_get_next_subnode(struct device *dev, next = adev->node.next; if (next == head) { child = NULL; + adev = ACPI_COMPANION(dev); goto nondev; } adev = list_entry(next, struct acpi_device, node); From c75361c0b08a59d3c9863a5a673ae039d5118c35 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Fri, 11 Mar 2016 09:43:07 +0100 Subject: [PATCH 06/23] cpufreq: Make cpufreq_quick_get() safe to call The function, cpufreq_quick_get, accesses the global 'cpufreq_driver' and its fields without taking the associated lock, cpufreq_driver_lock. Without the locking, nothing guarantees that 'cpufreq_driver' remains consistent during the call. This patch fixes the issue by taking the lock before accessing the data structure. Signed-off-by: Richard Cochran Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq.c | 12 ++++++++++-- 1 file changed, 10 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 4c7825856eab..f870399f00b1 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1401,9 +1401,17 @@ unsigned int cpufreq_quick_get(unsigned int cpu) { struct cpufreq_policy *policy; unsigned int ret_freq = 0; + unsigned long flags; - if (cpufreq_driver && cpufreq_driver->setpolicy && cpufreq_driver->get) - return cpufreq_driver->get(cpu); + read_lock_irqsave(&cpufreq_driver_lock, flags); + + if (cpufreq_driver && cpufreq_driver->setpolicy && cpufreq_driver->get) { + ret_freq = cpufreq_driver->get(cpu); + read_unlock_irqrestore(&cpufreq_driver_lock, flags); + return ret_freq; + } + + read_unlock_irqrestore(&cpufreq_driver_lock, flags); policy = cpufreq_cpu_get(cpu); if (policy) { From fdfdb2b1301670a69195ba1e5666df4a7f02eb46 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Fri, 18 Mar 2016 23:20:02 +0100 Subject: [PATCH 07/23] intel_pstate: Do not call wrmsrl_on_cpu() with disabled interrupts After commit a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) wrmsrl_on_cpu() cannot be called in the intel_pstate_adjust_busy_pstate() path as that is executed with disabled interrupts. However, atom_set_pstate() called from there via intel_pstate_set_pstate() uses wrmsrl_on_cpu() to update the IA32_PERF_CTL MSR which triggers the WARN_ON_ONCE() in smp_call_function_single(). The reason why wrmsrl_on_cpu() is used by atom_set_pstate() is because intel_pstate_set_pstate() calling it is also invoked during the initialization and cleanup of the driver and in those cases it is not guaranteed to be run on the CPU that is being updated. However, in the case when intel_pstate_set_pstate() is called by intel_pstate_adjust_busy_pstate(), wrmsrl() can be used to update the register safely. Moreover, intel_pstate_set_pstate() already contains code that only is executed if the function is called by intel_pstate_adjust_busy_pstate() and there is a special argument passed to it because of that. To fix the problem at hand, rearrange the code taking the above observations into account. First, replace the ->set() callback in struct pstate_funcs with a ->get_val() one that will return the value to be written to the IA32_PERF_CTL MSR without updating the register. Second, split intel_pstate_set_pstate() into two functions, intel_pstate_update_pstate() to be called by intel_pstate_adjust_busy_pstate() that will contain all of the intel_pstate_set_pstate() code which only needs to be executed in that case and will use wrmsrl() to update the MSR (after obtaining the value to write to it from the ->get_val() callback), and intel_pstate_set_min_pstate() to be invoked during the initialization and cleanup that will set the P-state to the minimum one and will update the MSR using wrmsrl_on_cpu(). Finally, move the code shared between intel_pstate_update_pstate() and intel_pstate_set_min_pstate() to a new static inline function intel_pstate_record_pstate() and make them both call it. Of course, that unifies the handling of the IA32_PERF_CTL MSR writes between Atom and Core. Fixes: a4675fbc4a7a (cpufreq: intel_pstate: Replace timers with utilization update callbacks) Reported-and-tested-by: Josh Boyer Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/intel_pstate.c | 73 ++++++++++++++++++++-------------- 1 file changed, 43 insertions(+), 30 deletions(-) diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c index cb5607495816..4b644526fd59 100644 --- a/drivers/cpufreq/intel_pstate.c +++ b/drivers/cpufreq/intel_pstate.c @@ -134,7 +134,7 @@ struct pstate_funcs { int (*get_min)(void); int (*get_turbo)(void); int (*get_scaling)(void); - void (*set)(struct cpudata*, int pstate); + u64 (*get_val)(struct cpudata*, int pstate); void (*get_vid)(struct cpudata *); int32_t (*get_target_pstate)(struct cpudata *); }; @@ -565,7 +565,7 @@ static int atom_get_turbo_pstate(void) return value & 0x7F; } -static void atom_set_pstate(struct cpudata *cpudata, int pstate) +static u64 atom_get_val(struct cpudata *cpudata, int pstate) { u64 val; int32_t vid_fp; @@ -585,9 +585,7 @@ static void atom_set_pstate(struct cpudata *cpudata, int pstate) if (pstate > cpudata->pstate.max_pstate) vid = cpudata->vid.turbo; - val |= vid; - - wrmsrl_on_cpu(cpudata->cpu, MSR_IA32_PERF_CTL, val); + return val | vid; } static int silvermont_get_scaling(void) @@ -711,7 +709,7 @@ static inline int core_get_scaling(void) return 100000; } -static void core_set_pstate(struct cpudata *cpudata, int pstate) +static u64 core_get_val(struct cpudata *cpudata, int pstate) { u64 val; @@ -719,7 +717,7 @@ static void core_set_pstate(struct cpudata *cpudata, int pstate) if (limits->no_turbo && !limits->turbo_disabled) val |= (u64)1 << 32; - wrmsrl(MSR_IA32_PERF_CTL, val); + return val; } static int knl_get_turbo_pstate(void) @@ -750,7 +748,7 @@ static struct cpu_defaults core_params = { .get_min = core_get_min_pstate, .get_turbo = core_get_turbo_pstate, .get_scaling = core_get_scaling, - .set = core_set_pstate, + .get_val = core_get_val, .get_target_pstate = get_target_pstate_use_performance, }, }; @@ -769,7 +767,7 @@ static struct cpu_defaults silvermont_params = { .get_max_physical = atom_get_max_pstate, .get_min = atom_get_min_pstate, .get_turbo = atom_get_turbo_pstate, - .set = atom_set_pstate, + .get_val = atom_get_val, .get_scaling = silvermont_get_scaling, .get_vid = atom_get_vid, .get_target_pstate = get_target_pstate_use_cpu_load, @@ -790,7 +788,7 @@ static struct cpu_defaults airmont_params = { .get_max_physical = atom_get_max_pstate, .get_min = atom_get_min_pstate, .get_turbo = atom_get_turbo_pstate, - .set = atom_set_pstate, + .get_val = atom_get_val, .get_scaling = airmont_get_scaling, .get_vid = atom_get_vid, .get_target_pstate = get_target_pstate_use_cpu_load, @@ -812,7 +810,7 @@ static struct cpu_defaults knl_params = { .get_min = core_get_min_pstate, .get_turbo = knl_get_turbo_pstate, .get_scaling = core_get_scaling, - .set = core_set_pstate, + .get_val = core_get_val, .get_target_pstate = get_target_pstate_use_performance, }, }; @@ -839,25 +837,24 @@ static void intel_pstate_get_min_max(struct cpudata *cpu, int *min, int *max) *min = clamp_t(int, min_perf, cpu->pstate.min_pstate, max_perf); } -static void intel_pstate_set_pstate(struct cpudata *cpu, int pstate, bool force) +static inline void intel_pstate_record_pstate(struct cpudata *cpu, int pstate) { - int max_perf, min_perf; - - if (force) { - update_turbo_state(); - - intel_pstate_get_min_max(cpu, &min_perf, &max_perf); - - pstate = clamp_t(int, pstate, min_perf, max_perf); - - if (pstate == cpu->pstate.current_pstate) - return; - } trace_cpu_frequency(pstate * cpu->pstate.scaling, cpu->cpu); - cpu->pstate.current_pstate = pstate; +} - pstate_funcs.set(cpu, pstate); +static void intel_pstate_set_min_pstate(struct cpudata *cpu) +{ + int pstate = cpu->pstate.min_pstate; + + intel_pstate_record_pstate(cpu, pstate); + /* + * Generally, there is no guarantee that this code will always run on + * the CPU being updated, so force the register update to run on the + * right CPU. + */ + wrmsrl_on_cpu(cpu->cpu, MSR_IA32_PERF_CTL, + pstate_funcs.get_val(cpu, pstate)); } static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) @@ -870,7 +867,8 @@ static void intel_pstate_get_cpu_pstates(struct cpudata *cpu) if (pstate_funcs.get_vid) pstate_funcs.get_vid(cpu); - intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate, false); + + intel_pstate_set_min_pstate(cpu); } static inline void intel_pstate_calc_busy(struct cpudata *cpu) @@ -997,6 +995,21 @@ static inline int32_t get_target_pstate_use_performance(struct cpudata *cpu) return cpu->pstate.current_pstate - pid_calc(&cpu->pid, core_busy); } +static inline void intel_pstate_update_pstate(struct cpudata *cpu, int pstate) +{ + int max_perf, min_perf; + + update_turbo_state(); + + intel_pstate_get_min_max(cpu, &min_perf, &max_perf); + pstate = clamp_t(int, pstate, min_perf, max_perf); + if (pstate == cpu->pstate.current_pstate) + return; + + intel_pstate_record_pstate(cpu, pstate); + wrmsrl(MSR_IA32_PERF_CTL, pstate_funcs.get_val(cpu, pstate)); +} + static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu) { int from, target_pstate; @@ -1006,7 +1019,7 @@ static inline void intel_pstate_adjust_busy_pstate(struct cpudata *cpu) target_pstate = pstate_funcs.get_target_pstate(cpu); - intel_pstate_set_pstate(cpu, target_pstate, true); + intel_pstate_update_pstate(cpu, target_pstate); sample = &cpu->sample; trace_pstate_sample(fp_toint(sample->core_pct_busy), @@ -1180,7 +1193,7 @@ static void intel_pstate_stop_cpu(struct cpufreq_policy *policy) if (hwp_active) return; - intel_pstate_set_pstate(cpu, cpu->pstate.min_pstate, false); + intel_pstate_set_min_pstate(cpu); } static int intel_pstate_cpu_init(struct cpufreq_policy *policy) @@ -1255,7 +1268,7 @@ static void copy_cpu_funcs(struct pstate_funcs *funcs) pstate_funcs.get_min = funcs->get_min; pstate_funcs.get_turbo = funcs->get_turbo; pstate_funcs.get_scaling = funcs->get_scaling; - pstate_funcs.set = funcs->set; + pstate_funcs.get_val = funcs->get_val; pstate_funcs.get_vid = funcs->get_vid; pstate_funcs.get_target_pstate = funcs->get_target_pstate; From ed72662a84771fec1337dd8b187e070af6fd3890 Mon Sep 17 00:00:00 2001 From: Richard Cochran Date: Fri, 18 Mar 2016 22:26:11 +0100 Subject: [PATCH 08/23] cpufreq: acpi-cpufreq: Clean up hot plug notifier callback This driver has two issues. First, it tries to fiddle with the hot plugged CPU's MSR on the UP_PREPARE event, at a time when the CPU is not yet online. Second, the driver sets the "boost-disable" bit for a CPU when going down, but does not clear the bit again if the CPU comes up again due to DOWN_FAILED. This patch fixes the issues by changing the driver to react to the ONLINE/DOWN_FAILED events instead of UP_PREPARE. As an added benefit, the driver also becomes symmetric with respect to the hot plug mechanism. Signed-off-by: Richard Cochran Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/acpi-cpufreq.c | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index 59a7b380fbe2..e7801e0112bb 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -514,8 +514,10 @@ static int boost_notify(struct notifier_block *nb, unsigned long action, */ switch (action) { - case CPU_UP_PREPARE: - case CPU_UP_PREPARE_FROZEN: + case CPU_DOWN_FAILED: + case CPU_DOWN_FAILED_FROZEN: + case CPU_ONLINE: + case CPU_ONLINE_FROZEN: boost_set_msrs(acpi_cpufreq_driver.boost_enabled, cpumask); break; From 0c313cb207326f759a58f486214288411b25d4cf Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Sun, 20 Mar 2016 01:33:35 +0100 Subject: [PATCH 09/23] cpuidle: menu: Fall back to polling if next timer event is near Commit a9ceb78bc75c (cpuidle,menu: use interactivity_req to disable polling) changed the behavior of the fallback state selection part of menu_select() so it looks at interactivity_req instead of data->next_timer_us when it makes its decision. That effectively caused polling to be used more often as fallback idle which led to significant increases of energy consumption in some cases. Commit e132b9b3bc7f (cpuidle: menu: use high confidence factors only when considering polling) changed that logic again to be more predictable, but that didn't help with the increased energy consumption problem. For this reason, go back to making decisions on which state to fall back to based on data->next_timer_us which is the time we know for sure something will happen rather than a prediction (which may be inaccurate and turns out to be so often enough to be problematic). However, take the target residency of the first proper idle state (C1) into account, so that state is not used as the fallback one if its target residency is greater than data->next_timer_us. Fixes: a9ceb78bc75c (cpuidle,menu: use interactivity_req to disable polling) Signed-off-by: Rafael J. Wysocki Reported-and-tested-by: Doug Smythies --- drivers/cpuidle/governors/menu.c | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/drivers/cpuidle/governors/menu.c b/drivers/cpuidle/governors/menu.c index 00c5f891e352..03d38c291de6 100644 --- a/drivers/cpuidle/governors/menu.c +++ b/drivers/cpuidle/governors/menu.c @@ -315,17 +315,21 @@ static int menu_select(struct cpuidle_driver *drv, struct cpuidle_device *dev) expected_interval = min(expected_interval, data->next_timer_us); if (CPUIDLE_DRIVER_STATE_START > 0) { - data->last_state_idx = CPUIDLE_DRIVER_STATE_START - 1; + struct cpuidle_state *s = &drv->states[CPUIDLE_DRIVER_STATE_START]; + unsigned int polling_threshold; + /* * We want to default to C1 (hlt), not to busy polling * unless the timer is happening really really soon, or * C1's exit latency exceeds the user configured limit. */ - if (expected_interval > drv->states[CPUIDLE_DRIVER_STATE_START].target_residency && - latency_req > drv->states[CPUIDLE_DRIVER_STATE_START].exit_latency && - !drv->states[CPUIDLE_DRIVER_STATE_START].disabled && + polling_threshold = max_t(unsigned int, 20, s->target_residency); + if (data->next_timer_us > polling_threshold && + latency_req > s->exit_latency && !s->disabled && !dev->states_usage[CPUIDLE_DRIVER_STATE_START].disable) data->last_state_idx = CPUIDLE_DRIVER_STATE_START; + else + data->last_state_idx = CPUIDLE_DRIVER_STATE_START - 1; } else { data->last_state_idx = CPUIDLE_DRIVER_STATE_START; } From 3e5963bc343b3fb4ca045e9d1c14cb9ce89234b8 Mon Sep 17 00:00:00 2001 From: Michael Neuling Date: Mon, 21 Mar 2016 22:24:52 +0530 Subject: [PATCH 10/23] cpufreq: powernv: Define per_cpu chip pointer to optimize hot-path Commit 96c4726f01cd "cpufreq: powernv: Remove cpu_to_chip_id() from hot-path" introduced a 'core_to_chip_map' array to cache the chip-ids of all cores. Replace this with a per-CPU variable that stores the pointer to the chip-array. This removes the linear lookup and provides a neater and simpler solution. Signed-off-by: Michael Neuling Signed-off-by: Shilpasri G Bhat Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/powernv-cpufreq.c | 50 +++++++++++-------------------- 1 file changed, 17 insertions(+), 33 deletions(-) diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index 50bf12033bbc..a00bcc2cef09 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -44,7 +44,6 @@ static struct cpufreq_frequency_table powernv_freqs[POWERNV_MAX_PSTATES+1]; static bool rebooting, throttled, occ_reset; -static unsigned int *core_to_chip_map; static const char * const throttle_reason[] = { "No throttling", @@ -65,6 +64,7 @@ static struct chip { } *chips; static int nr_chips; +static DEFINE_PER_CPU(struct chip *, chip_info); /* * Note: The set of pstates consists of contiguous integers, the @@ -324,34 +324,31 @@ static inline unsigned int get_nominal_index(void) static void powernv_cpufreq_throttle_check(void *data) { + struct chip *chip; unsigned int cpu = smp_processor_id(); - unsigned int chip_id = core_to_chip_map[cpu_core_index_of_thread(cpu)]; unsigned long pmsr; - int pmsr_pmax, i; + int pmsr_pmax; pmsr = get_pmspr(SPRN_PMSR); - - for (i = 0; i < nr_chips; i++) - if (chips[i].id == chip_id) - break; + chip = this_cpu_read(chip_info); /* Check for Pmax Capping */ pmsr_pmax = (s8)PMSR_MAX(pmsr); if (pmsr_pmax != powernv_pstate_info.max) { - if (chips[i].throttled) + if (chip->throttled) goto next; - chips[i].throttled = true; + chip->throttled = true; if (pmsr_pmax < powernv_pstate_info.nominal) pr_warn_once("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n", - cpu, chips[i].id, pmsr_pmax, + cpu, chip->id, pmsr_pmax, powernv_pstate_info.nominal); - trace_powernv_throttle(chips[i].id, - throttle_reason[chips[i].throttle_reason], + trace_powernv_throttle(chip->id, + throttle_reason[chip->throttle_reason], pmsr_pmax); - } else if (chips[i].throttled) { - chips[i].throttled = false; - trace_powernv_throttle(chips[i].id, - throttle_reason[chips[i].throttle_reason], + } else if (chip->throttled) { + chip->throttled = false; + trace_powernv_throttle(chip->id, + throttle_reason[chip->throttle_reason], pmsr_pmax); } @@ -558,47 +555,34 @@ static int init_chip_info(void) unsigned int chip[256]; unsigned int cpu, i; unsigned int prev_chip_id = UINT_MAX; - cpumask_t cpu_mask; - int ret = -ENOMEM; - core_to_chip_map = kcalloc(cpu_nr_cores(), sizeof(unsigned int), - GFP_KERNEL); - if (!core_to_chip_map) - goto out; - - cpumask_copy(&cpu_mask, cpu_possible_mask); - for_each_cpu(cpu, &cpu_mask) { + for_each_possible_cpu(cpu) { unsigned int id = cpu_to_chip_id(cpu); if (prev_chip_id != id) { prev_chip_id = id; chip[nr_chips++] = id; } - core_to_chip_map[cpu_core_index_of_thread(cpu)] = id; - cpumask_andnot(&cpu_mask, &cpu_mask, cpu_sibling_mask(cpu)); } chips = kcalloc(nr_chips, sizeof(struct chip), GFP_KERNEL); if (!chips) - goto free_chip_map; + return -ENOMEM; for (i = 0; i < nr_chips; i++) { chips[i].id = chip[i]; cpumask_copy(&chips[i].mask, cpumask_of_node(chip[i])); INIT_WORK(&chips[i].throttle, powernv_cpufreq_work_fn); + for_each_cpu(cpu, &chips[i].mask) + per_cpu(chip_info, cpu) = &chips[i]; } return 0; -free_chip_map: - kfree(core_to_chip_map); -out: - return ret; } static inline void clean_chip_info(void) { kfree(chips); - kfree(core_to_chip_map); } static inline void unregister_all_notifiers(void) From 4a798f508f5d0809f45c939e329e17b3eb0ffbc1 Mon Sep 17 00:00:00 2001 From: Colin Ian King Date: Fri, 18 Mar 2016 15:08:01 +0000 Subject: [PATCH 11/23] ACPI / util: cast data to u64 before shifting to fix sign extension obj->buffer.pointer[i] should be cast to u64 to prevent an unintentional sign extension. For example, if pointer[7] is 0x80, then the value 0xffffffffff000000 is or'd into mask rather than the intended value 0xff00000000000000 Detected with static analysis by CoverityScan Signed-off-by: Colin Ian King Signed-off-by: Rafael J. Wysocki --- drivers/acpi/utils.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/acpi/utils.c b/drivers/acpi/utils.c index f12a72428aac..050673f0c0b3 100644 --- a/drivers/acpi/utils.c +++ b/drivers/acpi/utils.c @@ -692,7 +692,7 @@ bool acpi_check_dsm(acpi_handle handle, const u8 *uuid, int rev, u64 funcs) mask = obj->integer.value; else if (obj->type == ACPI_TYPE_BUFFER) for (i = 0; i < obj->buffer.length && i < 8; i++) - mask |= (((u8)obj->buffer.pointer[i]) << (i * 8)); + mask |= (((u64)obj->buffer.pointer[i]) << (i * 8)); ACPI_FREE(obj); /* From 4a2e7aab4ffce1e0e79b303dc2f9a03aa9f3a332 Mon Sep 17 00:00:00 2001 From: Lorenzo Pieralisi Date: Mon, 21 Mar 2016 11:12:55 +0000 Subject: [PATCH 12/23] PCI: ACPI: IA64: fix IO port generic range check The [0 - 64k] ACPI PCI IO port resource boundary check in: acpi_dev_ioresource_flags() is currently applied blindly in the ACPI resource parsing to all architectures, but only x86 suffers from that IO space limitation. On arches (ie IA64 and ARM64) where IO space is memory mapped, the PCI root bridges IO resource windows are firstly initialized from the _CRS (in acpi_decode_space()) and contain the CPU physical address at which a root bridge decodes IO space in the CPU physical address space with the offset value representing the offset required to translate the PCI bus address into the CPU physical address. The IO resource windows are then parsed and updated in arch code before creating and enumerating PCI buses (eg IA64 add_io_space()) to map in an arch specific way the obtained CPU physical address range to a slice of virtual address space reserved to map PCI IO space, ending up with PCI bridges resource windows containing IO resources like the following on a working IA64 configuration: PCI host bridge to bus 0000:00 pci_bus 0000:00: root bus resource [io 0x1000000-0x100ffff window] (bus address [0x0000-0xffff]) pci_bus 0000:00: root bus resource [mem 0x000a0000-0x000fffff window] pci_bus 0000:00: root bus resource [mem 0x80000000-0x8fffffff window] pci_bus 0000:00: root bus resource [mem 0x80004000000-0x800ffffffff window] pci_bus 0000:00: root bus resource [bus 00] This implies that the [0 - 64K] check in acpi_dev_ioresource_flags() leaves platforms with memory mapped IO space (ie IA64) broken (ie kernel can't claim IO resources since the host bridge IO resource is disabled and discarded by ACPI core code, see log on IA64 with missing root bridge IO resource, silently filtered by current [0 - 64k] check in acpi_dev_ioresource_flags()): PCI host bridge to bus 0000:00 pci_bus 0000:00: root bus resource [mem 0x000a0000-0x000fffff window] pci_bus 0000:00: root bus resource [mem 0x80000000-0x8fffffff window] pci_bus 0000:00: root bus resource [mem 0x80004000000-0x800ffffffff window] pci_bus 0000:00: root bus resource [bus 00] [...] pci 0000:00:03.0: [1002:515e] type 00 class 0x030000 pci 0000:00:03.0: reg 0x10: [mem 0x80000000-0x87ffffff pref] pci 0000:00:03.0: reg 0x14: [io 0x1000-0x10ff] pci 0000:00:03.0: reg 0x18: [mem 0x88020000-0x8802ffff] pci 0000:00:03.0: reg 0x30: [mem 0x88000000-0x8801ffff pref] pci 0000:00:03.0: supports D1 D2 pci 0000:00:03.0: can't claim BAR 1 [io 0x1000-0x10ff]: no compatible bridge window For this reason, the IO port resources boundaries check in generic ACPI parsing code should be guarded with a CONFIG_X86 guard so that more arches (ie ARM64) can benefit from the generic ACPI resources parsing interface without incurring in unexpected resource filtering, fixing at the same time current breakage on IA64. This patch factors out IO ports boundary [0 - 64k] check in generic ACPI code and makes the IO space check X86 specific to make sure that IO space resources are usable on other arches too. Fixes: 3772aea7d6f3 (ia64/PCI/ACPI: Use common ACPI resource parsing interface for host bridge) Signed-off-by: Lorenzo Pieralisi Cc: 4.4+ # 4.4+ Signed-off-by: Rafael J. Wysocki --- drivers/acpi/resource.c | 14 +++++++++++++- 1 file changed, 13 insertions(+), 1 deletion(-) diff --git a/drivers/acpi/resource.c b/drivers/acpi/resource.c index d02fd53042a5..56241eb341f4 100644 --- a/drivers/acpi/resource.c +++ b/drivers/acpi/resource.c @@ -27,8 +27,20 @@ #ifdef CONFIG_X86 #define valid_IRQ(i) (((i) != 0) && ((i) != 2)) +static inline bool acpi_iospace_resource_valid(struct resource *res) +{ + /* On X86 IO space is limited to the [0 - 64K] IO port range */ + return res->end < 0x10003; +} #else #define valid_IRQ(i) (true) +/* + * ACPI IO descriptors on arches other than X86 contain MMIO CPU physical + * addresses mapping IO space in CPU physical address space, IO space + * resources can be placed anywhere in the 64-bit physical address space. + */ +static inline bool +acpi_iospace_resource_valid(struct resource *res) { return true; } #endif static bool acpi_dev_resource_len_valid(u64 start, u64 end, u64 len, bool io) @@ -127,7 +139,7 @@ static void acpi_dev_ioresource_flags(struct resource *res, u64 len, if (!acpi_dev_resource_len_valid(res->start, res->end, len, true)) res->flags |= IORESOURCE_DISABLED | IORESOURCE_UNSET; - if (res->end >= 0x10003) + if (!acpi_iospace_resource_valid(res)) res->flags |= IORESOURCE_DISABLED | IORESOURCE_UNSET; if (io_decode == ACPI_DECODE_16) From ac13b9967de7eeb7746f6533673e4dec94933bc9 Mon Sep 17 00:00:00 2001 From: Jisheng Zhang Date: Tue, 22 Mar 2016 22:34:30 +0800 Subject: [PATCH 13/23] cpufreq: acpi-cpufreq: make Intel/AMD MSR access, io port access static These frequency register read/write operations' implementations for the given processor (Intel/AMD MSR access or I/O port access) are only used internally in acpi-cpufreq, so make them static. Signed-off-by: Jisheng Zhang Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/acpi-cpufreq.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/drivers/cpufreq/acpi-cpufreq.c b/drivers/cpufreq/acpi-cpufreq.c index e7801e0112bb..fb5712141040 100644 --- a/drivers/cpufreq/acpi-cpufreq.c +++ b/drivers/cpufreq/acpi-cpufreq.c @@ -245,7 +245,7 @@ static unsigned extract_freq(u32 val, struct acpi_cpufreq_data *data) } } -u32 cpu_freq_read_intel(struct acpi_pct_register *not_used) +static u32 cpu_freq_read_intel(struct acpi_pct_register *not_used) { u32 val, dummy; @@ -253,7 +253,7 @@ u32 cpu_freq_read_intel(struct acpi_pct_register *not_used) return val; } -void cpu_freq_write_intel(struct acpi_pct_register *not_used, u32 val) +static void cpu_freq_write_intel(struct acpi_pct_register *not_used, u32 val) { u32 lo, hi; @@ -262,7 +262,7 @@ void cpu_freq_write_intel(struct acpi_pct_register *not_used, u32 val) wrmsr(MSR_IA32_PERF_CTL, lo, hi); } -u32 cpu_freq_read_amd(struct acpi_pct_register *not_used) +static u32 cpu_freq_read_amd(struct acpi_pct_register *not_used) { u32 val, dummy; @@ -270,12 +270,12 @@ u32 cpu_freq_read_amd(struct acpi_pct_register *not_used) return val; } -void cpu_freq_write_amd(struct acpi_pct_register *not_used, u32 val) +static void cpu_freq_write_amd(struct acpi_pct_register *not_used, u32 val) { wrmsr(MSR_AMD_PERF_CTL, val, 0); } -u32 cpu_freq_read_io(struct acpi_pct_register *reg) +static u32 cpu_freq_read_io(struct acpi_pct_register *reg) { u32 val; @@ -283,7 +283,7 @@ u32 cpu_freq_read_io(struct acpi_pct_register *reg) return val; } -void cpu_freq_write_io(struct acpi_pct_register *reg, u32 val) +static void cpu_freq_write_io(struct acpi_pct_register *reg, u32 val) { acpi_os_write_port(reg->address, val, reg->bit_width); } From 1b0289848d5dcea74a6e5115d6c9892b0dbe9c8f Mon Sep 17 00:00:00 2001 From: Shilpasri G Bhat Date: Tue, 22 Mar 2016 18:57:09 +0530 Subject: [PATCH 14/23] cpufreq: powernv: Add sysfs attributes to show throttle stats Create sysfs attributes to export throttle information in /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats directory. The newly added sysfs files are as follows: 1)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/turbo_stat 2)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/sub-turbo_stat 3)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/unthrottle 4)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/powercap 5)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/overtemp 6)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/supply_fault 7)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/overcurrent 8)/sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/occ_reset Detailed explanation of each attribute is added to Documentation/ABI/testing/sysfs-devices-system-cpu Signed-off-by: Shilpasri G Bhat Acked-by: Viresh Kumar Signed-off-by: Rafael J. Wysocki --- .../ABI/testing/sysfs-devices-system-cpu | 69 +++++++++++++++++ drivers/cpufreq/powernv-cpufreq.c | 74 ++++++++++++++++++- 2 files changed, 141 insertions(+), 2 deletions(-) diff --git a/Documentation/ABI/testing/sysfs-devices-system-cpu b/Documentation/ABI/testing/sysfs-devices-system-cpu index b683e8ee69ec..16501334b99f 100644 --- a/Documentation/ABI/testing/sysfs-devices-system-cpu +++ b/Documentation/ABI/testing/sysfs-devices-system-cpu @@ -271,3 +271,72 @@ Description: Parameters for the CPU cache attributes - WriteBack: data is written only to the cache line and the modified cache line is written to main memory only when it is replaced + +What: /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/turbo_stat + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/sub_turbo_stat + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/unthrottle + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/powercap + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/overtemp + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/supply_fault + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/overcurrent + /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats/occ_reset +Date: March 2016 +Contact: Linux kernel mailing list + Linux for PowerPC mailing list +Description: POWERNV CPUFreq driver's frequency throttle stats directory and + attributes + + 'cpuX/cpufreq/throttle_stats' directory contains the CPU frequency + throttle stat attributes for the chip. The throttle stats of a cpu + is common across all the cpus belonging to a chip. Below are the + throttle attributes exported in the 'throttle_stats' directory: + + - turbo_stat : This file gives the total number of times the max + frequency is throttled to lower frequency in turbo (at and above + nominal frequency) range of frequencies. + + - sub_turbo_stat : This file gives the total number of times the + max frequency is throttled to lower frequency in sub-turbo(below + nominal frequency) range of frequencies. + + - unthrottle : This file gives the total number of times the max + frequency is unthrottled after being throttled. + + - powercap : This file gives the total number of times the max + frequency is throttled due to 'Power Capping'. + + - overtemp : This file gives the total number of times the max + frequency is throttled due to 'CPU Over Temperature'. + + - supply_fault : This file gives the total number of times the + max frequency is throttled due to 'Power Supply Failure'. + + - overcurrent : This file gives the total number of times the + max frequency is throttled due to 'Overcurrent'. + + - occ_reset : This file gives the total number of times the max + frequency is throttled due to 'OCC Reset'. + + The sysfs attributes representing different throttle reasons like + powercap, overtemp, supply_fault, overcurrent and occ_reset map to + the reasons provided by OCC firmware for throttling the frequency. + +What: /sys/devices/system/cpu/cpufreq/policyX/throttle_stats + /sys/devices/system/cpu/cpufreq/policyX/throttle_stats/turbo_stat + /sys/devices/system/cpu/cpufreq/policyX/throttle_stats/sub_turbo_stat + /sys/devices/system/cpu/cpufreq/policyX/throttle_stats/unthrottle + /sys/devices/system/cpu/cpufreq/policyX/throttle_stats/powercap + /sys/devices/system/cpu/cpufreq/policyX/throttle_stats/overtemp + /sys/devices/system/cpu/cpufreq/policyX/throttle_stats/supply_fault + /sys/devices/system/cpu/cpufreq/policyX/throttle_stats/overcurrent + /sys/devices/system/cpu/cpufreq/policyX/throttle_stats/occ_reset +Date: March 2016 +Contact: Linux kernel mailing list + Linux for PowerPC mailing list +Description: POWERNV CPUFreq driver's frequency throttle stats directory and + attributes + + 'policyX/throttle_stats' directory and all the attributes are same as + the /sys/devices/system/cpu/cpuX/cpufreq/throttle_stats directory and + attributes which give the frequency throttle information of the chip. diff --git a/drivers/cpufreq/powernv-cpufreq.c b/drivers/cpufreq/powernv-cpufreq.c index a00bcc2cef09..39ac78c94be0 100644 --- a/drivers/cpufreq/powernv-cpufreq.c +++ b/drivers/cpufreq/powernv-cpufreq.c @@ -54,6 +54,16 @@ static const char * const throttle_reason[] = { "OCC Reset" }; +enum throttle_reason_type { + NO_THROTTLE = 0, + POWERCAP, + CPU_OVERTEMP, + POWER_SUPPLY_FAILURE, + OVERCURRENT, + OCC_RESET_THROTTLE, + OCC_MAX_REASON +}; + static struct chip { unsigned int id; bool throttled; @@ -61,6 +71,9 @@ static struct chip { u8 throttle_reason; cpumask_t mask; struct work_struct throttle; + int throttle_turbo; + int throttle_sub_turbo; + int reason[OCC_MAX_REASON]; } *chips; static int nr_chips; @@ -196,6 +209,42 @@ static struct freq_attr *powernv_cpu_freq_attr[] = { NULL, }; +#define throttle_attr(name, member) \ +static ssize_t name##_show(struct cpufreq_policy *policy, char *buf) \ +{ \ + struct chip *chip = per_cpu(chip_info, policy->cpu); \ + \ + return sprintf(buf, "%u\n", chip->member); \ +} \ + \ +static struct freq_attr throttle_attr_##name = __ATTR_RO(name) \ + +throttle_attr(unthrottle, reason[NO_THROTTLE]); +throttle_attr(powercap, reason[POWERCAP]); +throttle_attr(overtemp, reason[CPU_OVERTEMP]); +throttle_attr(supply_fault, reason[POWER_SUPPLY_FAILURE]); +throttle_attr(overcurrent, reason[OVERCURRENT]); +throttle_attr(occ_reset, reason[OCC_RESET_THROTTLE]); +throttle_attr(turbo_stat, throttle_turbo); +throttle_attr(sub_turbo_stat, throttle_sub_turbo); + +static struct attribute *throttle_attrs[] = { + &throttle_attr_unthrottle.attr, + &throttle_attr_powercap.attr, + &throttle_attr_overtemp.attr, + &throttle_attr_supply_fault.attr, + &throttle_attr_overcurrent.attr, + &throttle_attr_occ_reset.attr, + &throttle_attr_turbo_stat.attr, + &throttle_attr_sub_turbo_stat.attr, + NULL, +}; + +static const struct attribute_group throttle_attr_grp = { + .name = "throttle_stats", + .attrs = throttle_attrs, +}; + /* Helper routines */ /* Access helpers to power mgt SPR */ @@ -338,10 +387,14 @@ static void powernv_cpufreq_throttle_check(void *data) if (chip->throttled) goto next; chip->throttled = true; - if (pmsr_pmax < powernv_pstate_info.nominal) + if (pmsr_pmax < powernv_pstate_info.nominal) { pr_warn_once("CPU %d on Chip %u has Pmax reduced below nominal frequency (%d < %d)\n", cpu, chip->id, pmsr_pmax, powernv_pstate_info.nominal); + chip->throttle_sub_turbo++; + } else { + chip->throttle_turbo++; + } trace_powernv_throttle(chip->id, throttle_reason[chip->throttle_reason], pmsr_pmax); @@ -408,6 +461,21 @@ static int powernv_cpufreq_cpu_init(struct cpufreq_policy *policy) for (i = 0; i < threads_per_core; i++) cpumask_set_cpu(base + i, policy->cpus); + if (!policy->driver_data) { + int ret; + + ret = sysfs_create_group(&policy->kobj, &throttle_attr_grp); + if (ret) { + pr_info("Failed to create throttle stats directory for cpu %d\n", + policy->cpu); + return ret; + } + /* + * policy->driver_data is used as a flag for one-time + * creation of throttle sysfs files. + */ + policy->driver_data = policy; + } return cpufreq_table_validate_and_show(policy, powernv_freqs); } @@ -514,8 +582,10 @@ static int powernv_cpufreq_occ_msg(struct notifier_block *nb, break; if (omsg.throttle_status >= 0 && - omsg.throttle_status <= OCC_MAX_THROTTLE_STATUS) + omsg.throttle_status <= OCC_MAX_THROTTLE_STATUS) { chips[i].throttle_reason = omsg.throttle_status; + chips[i].reason[omsg.throttle_status]++; + } if (!omsg.throttle_status) chips[i].restore = true; From 0a300767e5882ad5b687966c80f9620aa0871be5 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 21 Mar 2016 15:45:24 +0100 Subject: [PATCH 15/23] cpufreq: Introduce cpufreq_start_governor() Starting a governor in cpufreq always follows the same pattern involving two calls to cpufreq_governor(), one with the event argument set to CPUFREQ_GOV_START and one with that argument set to CPUFREQ_GOV_LIMITS. Introduce cpufreq_start_governor() that will carry out those two operations and make all places where governors are started use it. That slightly modifies the behavior of cpufreq_set_policy() which now also will go back to the old governor if the second call to cpufreq_governor() (the one with event equal to CPUFREQ_GOV_LIMITS) fails, but that really is how it should work in the first place. Also cpufreq_resume() will now pring an error message if the CPUFREQ_GOV_LIMITS call to cpufreq_governor() fails, but that makes it follow cpufreq_add_policy_cpu() and cpufreq_offline() in that respect. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- drivers/cpufreq/cpufreq.c | 44 +++++++++++++++++++-------------------- 1 file changed, 22 insertions(+), 22 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index f870399f00b1..43f3912a2ac8 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -76,6 +76,7 @@ static inline bool has_target(void) /* internal prototypes */ static int cpufreq_governor(struct cpufreq_policy *policy, unsigned int event); static unsigned int __cpufreq_get(struct cpufreq_policy *policy); +static int cpufreq_start_governor(struct cpufreq_policy *policy); /** * Two notifier lists: the "policy" list is involved in the @@ -964,10 +965,7 @@ static int cpufreq_add_policy_cpu(struct cpufreq_policy *policy, unsigned int cp cpumask_set_cpu(cpu, policy->cpus); if (has_target()) { - ret = cpufreq_governor(policy, CPUFREQ_GOV_START); - if (!ret) - ret = cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); - + ret = cpufreq_start_governor(policy); if (ret) pr_err("%s: Failed to start governor\n", __func__); } @@ -1308,10 +1306,7 @@ static void cpufreq_offline(unsigned int cpu) /* Start governor again for active policy */ if (!policy_is_inactive(policy)) { if (has_target()) { - ret = cpufreq_governor(policy, CPUFREQ_GOV_START); - if (!ret) - ret = cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); - + ret = cpufreq_start_governor(policy); if (ret) pr_err("%s: Failed to start governor\n", __func__); } @@ -1591,9 +1586,7 @@ void cpufreq_resume(void) policy); } else { down_write(&policy->rwsem); - ret = cpufreq_governor(policy, CPUFREQ_GOV_START); - if (!ret) - cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); + ret = cpufreq_start_governor(policy); up_write(&policy->rwsem); if (ret) @@ -1935,6 +1928,14 @@ static int cpufreq_governor(struct cpufreq_policy *policy, unsigned int event) return ret; } +static int cpufreq_start_governor(struct cpufreq_policy *policy) +{ + int ret; + + ret = cpufreq_governor(policy, CPUFREQ_GOV_START); + return ret ? ret : cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); +} + int cpufreq_register_governor(struct cpufreq_governor *governor) { int err; @@ -2071,8 +2072,10 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, return cpufreq_driver->setpolicy(new_policy); } - if (new_policy->governor == policy->governor) - goto out; + if (new_policy->governor == policy->governor) { + pr_debug("cpufreq: governor limits update\n"); + return cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); + } pr_debug("governor switch\n"); @@ -2100,10 +2103,11 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, policy->governor = new_policy->governor; ret = cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT); if (!ret) { - ret = cpufreq_governor(policy, CPUFREQ_GOV_START); - if (!ret) - goto out; - + ret = cpufreq_start_governor(policy); + if (!ret) { + pr_debug("cpufreq: governor change\n"); + return 0; + } cpufreq_governor(policy, CPUFREQ_GOV_POLICY_EXIT); } @@ -2114,14 +2118,10 @@ static int cpufreq_set_policy(struct cpufreq_policy *policy, if (cpufreq_governor(policy, CPUFREQ_GOV_POLICY_INIT)) policy->governor = NULL; else - cpufreq_governor(policy, CPUFREQ_GOV_START); + cpufreq_start_governor(policy); } return ret; - - out: - pr_debug("governor: change or update limits\n"); - return cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); } /** From 999f572983de5e1a4ee9f42daa51b5448f12dd64 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 21 Mar 2016 15:46:25 +0100 Subject: [PATCH 16/23] cpufreq: Introduce cpufreq_update_current_freq() Move the part of cpufreq_update_policy() that obtains the current frequency from the driver and updates policy->cur if necessary to a separate function, cpufreq_get_current_freq(). That should not introduce functional changes and subsequent change set will need it. Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- drivers/cpufreq/cpufreq.c | 28 +++++++++++++++++++--------- 1 file changed, 19 insertions(+), 9 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index 43f3912a2ac8..ffdb7fca5bb7 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1487,6 +1487,24 @@ unsigned int cpufreq_get(unsigned int cpu) } EXPORT_SYMBOL(cpufreq_get); +static unsigned int cpufreq_update_current_freq(struct cpufreq_policy *policy) +{ + unsigned int new_freq; + + new_freq = cpufreq_driver->get(policy->cpu); + if (!new_freq) + return 0; + + if (!policy->cur) { + pr_debug("cpufreq: Driver did not initialize current freq\n"); + policy->cur = new_freq; + } else if (policy->cur != new_freq && has_target()) { + cpufreq_out_of_sync(policy, new_freq); + } + + return new_freq; +} + static struct subsys_interface cpufreq_interface = { .name = "cpufreq", .subsys = &cpu_subsys, @@ -2152,19 +2170,11 @@ int cpufreq_update_policy(unsigned int cpu) * -> ask driver for current freq and notify governors about a change */ if (cpufreq_driver->get && !cpufreq_driver->setpolicy) { - new_policy.cur = cpufreq_driver->get(cpu); + new_policy.cur = cpufreq_update_current_freq(policy); if (WARN_ON(!new_policy.cur)) { ret = -EIO; goto unlock; } - - if (!policy->cur) { - pr_debug("Driver did not initialize current freq\n"); - policy->cur = new_policy.cur; - } else { - if (policy->cur != new_policy.cur && has_target()) - cpufreq_out_of_sync(policy, new_policy.cur); - } } ret = cpufreq_set_policy(policy, &new_policy); From 3bbf8fe3ae0845af9192118753c2399efe531832 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Mon, 21 Mar 2016 15:47:48 +0100 Subject: [PATCH 17/23] cpufreq: Always update current frequency before startig governor Make policy->cur match the current frequency returned by the driver's ->get() callback before starting the governor in case they went out of sync in the meantime and drop the piece of code attempting to resync policy->cur with the real frequency of the boot CPU from cpufreq_resume() as it serves no purpose any more (and it's racy and super-ugly anyway). Signed-off-by: Rafael J. Wysocki Acked-by: Viresh Kumar --- drivers/cpufreq/cpufreq.c | 14 +++----------- 1 file changed, 3 insertions(+), 11 deletions(-) diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c index ffdb7fca5bb7..b87596b591b3 100644 --- a/drivers/cpufreq/cpufreq.c +++ b/drivers/cpufreq/cpufreq.c @@ -1612,17 +1612,6 @@ void cpufreq_resume(void) __func__, policy); } } - - /* - * schedule call cpufreq_update_policy() for first-online CPU, as that - * wouldn't be hotplugged-out on suspend. It will verify that the - * current freq is in sync with what we believe it to be. - */ - policy = cpufreq_cpu_get_raw(cpumask_first(cpu_online_mask)); - if (WARN_ON(!policy)) - return; - - schedule_work(&policy->update); } /** @@ -1950,6 +1939,9 @@ static int cpufreq_start_governor(struct cpufreq_policy *policy) { int ret; + if (cpufreq_driver->get && !cpufreq_driver->setpolicy) + cpufreq_update_current_freq(policy); + ret = cpufreq_governor(policy, CPUFREQ_GOV_START); return ret ? ret : cpufreq_governor(policy, CPUFREQ_GOV_LIMITS); } From 539a4c4247c2697d291a8fb02c485ccd7cf34f05 Mon Sep 17 00:00:00 2001 From: "Rafael J. Wysocki" Date: Tue, 22 Mar 2016 01:17:43 +0100 Subject: [PATCH 18/23] cpufreq: governor: Always schedule work on the CPU running update Modify dbs_irq_work() to always schedule the process-context work on the current CPU which also ran the dbs_update_util_handler() that the irq_work being handled came from. This causes the entire frequency update handling (involving the "ondemand" or "conservative" governors) to be carried out by the CPU whose frequency is to be updated and reduces the overall amount of inter-CPU noise related to cpufreq. Signed-off-by: Rafael J. Wysocki --- drivers/cpufreq/cpufreq_governor.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/drivers/cpufreq/cpufreq_governor.c b/drivers/cpufreq/cpufreq_governor.c index 1c25ef405616..10a5cfeae8c5 100644 --- a/drivers/cpufreq/cpufreq_governor.c +++ b/drivers/cpufreq/cpufreq_governor.c @@ -329,7 +329,7 @@ static void dbs_irq_work(struct irq_work *irq_work) struct policy_dbs_info *policy_dbs; policy_dbs = container_of(irq_work, struct policy_dbs_info, irq_work); - schedule_work(&policy_dbs->work); + schedule_work_on(smp_processor_id(), &policy_dbs->work); } static void dbs_update_util_handler(struct update_util_data *data, u64 time, From 276142730c39c9839465a36a90e5674a8c34e839 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Wed, 23 Mar 2016 00:11:20 +0100 Subject: [PATCH 19/23] PM / sleep: Clear pm_suspend_global_flags upon hibernate When suspending to RAM, waking up and later suspending to disk, we gratuitously runtime resume devices after the thaw phase. This does not occur if we always suspend to RAM or always to disk. pm_complete_with_resume_check(), which gets called from pci_pm_complete() among others, schedules a runtime resume if PM_SUSPEND_FLAG_FW_RESUME is set. The flag is set during a suspend-to-RAM cycle. It is cleared at the beginning of the suspend-to-RAM cycle but not afterwards and it is not cleared during a suspend-to-disk cycle at all. Fix it. Fixes: ef25ba047601 (PM / sleep: Add flags to indicate platform firmware involvement) Signed-off-by: Lukas Wunner Cc: 4.4+ # 4.4+ Signed-off-by: Rafael J. Wysocki --- kernel/power/hibernate.c | 1 + 1 file changed, 1 insertion(+) diff --git a/kernel/power/hibernate.c b/kernel/power/hibernate.c index b7342a24f559..b7dd5718836e 100644 --- a/kernel/power/hibernate.c +++ b/kernel/power/hibernate.c @@ -339,6 +339,7 @@ int hibernation_snapshot(int platform_mode) pm_message_t msg; int error; + pm_suspend_clear_flags(); error = platform_begin(platform_mode); if (error) goto Close; From fbda4b38fa3995aa0777fe9cbbdcb223c6292083 Mon Sep 17 00:00:00 2001 From: Lukas Wunner Date: Wed, 23 Mar 2016 00:11:20 +0100 Subject: [PATCH 20/23] ACPI / PM: Runtime resume devices when waking from hibernate Commit 58a1fbbb2ee8 ("PM / PCI / ACPI: Kick devices that might have been reset by firmware") added a runtime resume for devices that were runtime suspended when the system entered suspend-to-RAM. Briefly, the motivation was to ensure that devices did not remain in a reset-power-on state after resume, potentially preventing deep SoC-wide low-power states from being entered on idle. Currently we're not doing the same when leaving suspend-to-disk and this asymmetry is a problem if drivers rely on the automatic resume triggered by pm_complete_with_resume_check(). Fix it. Fixes: 58a1fbbb2ee8 (PM / PCI / ACPI: Kick devices that might have been reset by firmware) Signed-off-by: Lukas Wunner Cc: 4.4+ # 4.4+ Signed-off-by: Rafael J. Wysocki --- drivers/acpi/sleep.c | 1 + 1 file changed, 1 insertion(+) diff --git a/drivers/acpi/sleep.c b/drivers/acpi/sleep.c index 9cb975200cac..f054cadf30d8 100644 --- a/drivers/acpi/sleep.c +++ b/drivers/acpi/sleep.c @@ -714,6 +714,7 @@ static int acpi_hibernation_enter(void) static void acpi_hibernation_leave(void) { + pm_set_resume_via_firmware(); /* * If ACPI is not enabled by the BIOS and the boot kernel, we need to * enable it here. From d70e28f57e14a481977436695b0c9ba165472431 Mon Sep 17 00:00:00 2001 From: Len Brown Date: Sun, 13 Mar 2016 00:33:48 -0500 Subject: [PATCH 21/23] intel_idle: prevent SKL-H boot failure when C8+C9+C10 enabled Some SKL-H configurations require "intel_idle.max_cstate=7" to boot. While that is an effective workaround, it disables C10. This patch detects the problematic configuration, and disables C8 and C9, keeping C10 enabled. Note that enabling SGX in BIOS SETUP can also prevent this issue, if the system BIOS provides that option. https://bugzilla.kernel.org/show_bug.cgi?id=109081 "Freezes with Intel i7 6700HQ (Skylake), unless intel_idle.max_cstate=7" Signed-off-by: Len Brown Cc: stable@vger.kernel.org --- drivers/idle/intel_idle.c | 112 ++++++++++++++++++++++++++++++-------- 1 file changed, 88 insertions(+), 24 deletions(-) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index cd4510a63375..146eed70bdf4 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -65,7 +65,7 @@ #include #include -#define INTEL_IDLE_VERSION "0.4" +#define INTEL_IDLE_VERSION "0.4.1" #define PREFIX "intel_idle: " static struct cpuidle_driver intel_idle_driver = { @@ -993,37 +993,93 @@ static void intel_idle_cpuidle_devices_uninit(void) return; } +/* + * ivt_idle_state_table_update(void) + * + * Tune IVT multi-socket targets + * Assumption: num_sockets == (max_package_num + 1) + */ +static void ivt_idle_state_table_update(void) +{ + /* IVT uses a different table for 1-2, 3-4, and > 4 sockets */ + int cpu, package_num, num_sockets = 1; + + for_each_online_cpu(cpu) { + package_num = topology_physical_package_id(cpu); + if (package_num + 1 > num_sockets) { + num_sockets = package_num + 1; + + if (num_sockets > 4) { + cpuidle_state_table = ivt_cstates_8s; + return; + } + } + } + + if (num_sockets > 2) + cpuidle_state_table = ivt_cstates_4s; + + /* else, 1 and 2 socket systems use default ivt_cstates */ +} +/* + * sklh_idle_state_table_update(void) + * + * On SKL-H (model 0x5e) disable C8 and C9 if: + * C10 is enabled and SGX disabled + */ +static void sklh_idle_state_table_update(void) +{ + unsigned long long msr; + unsigned int eax, ebx, ecx, edx; + + + /* if PC10 disabled via cmdline intel_idle.max_cstate=7 or shallower */ + if (max_cstate <= 7) + return; + + /* if PC10 not present in CPUID.MWAIT.EDX */ + if ((mwait_substates & (0xF << 28)) == 0) + return; + + rdmsrl(MSR_NHM_SNB_PKG_CST_CFG_CTL, msr); + + /* PC10 is not enabled in PKG C-state limit */ + if ((msr & 0xF) != 8) + return; + + ecx = 0; + cpuid(7, &eax, &ebx, &ecx, &edx); + + /* if SGX is present */ + if (ebx & (1 << 2)) { + + rdmsrl(MSR_IA32_FEATURE_CONTROL, msr); + + /* if SGX is enabled */ + if (msr & (1 << 18)) + return; + } + + skl_cstates[5].disabled = 1; /* C8-SKL */ + skl_cstates[6].disabled = 1; /* C9-SKL */ +} /* * intel_idle_state_table_update() * * Update the default state_table for this CPU-id - * - * Currently used to access tuned IVT multi-socket targets - * Assumption: num_sockets == (max_package_num + 1) */ -void intel_idle_state_table_update(void) + +static void intel_idle_state_table_update(void) { - /* IVT uses a different table for 1-2, 3-4, and > 4 sockets */ - if (boot_cpu_data.x86_model == 0x3e) { /* IVT */ - int cpu, package_num, num_sockets = 1; + switch (boot_cpu_data.x86_model) { - for_each_online_cpu(cpu) { - package_num = topology_physical_package_id(cpu); - if (package_num + 1 > num_sockets) { - num_sockets = package_num + 1; - - if (num_sockets > 4) { - cpuidle_state_table = ivt_cstates_8s; - return; - } - } - } - - if (num_sockets > 2) - cpuidle_state_table = ivt_cstates_4s; - /* else, 1 and 2 socket systems use default ivt_cstates */ + case 0x3e: /* IVT */ + ivt_idle_state_table_update(); + break; + case 0x5e: /* SKL-H */ + sklh_idle_state_table_update(); + break; } - return; } /* @@ -1063,6 +1119,14 @@ static int __init intel_idle_cpuidle_driver_init(void) if (num_substates == 0) continue; + /* if state marked as disabled, skip it */ + if (cpuidle_state_table[cstate].disabled != 0) { + pr_debug(PREFIX "state %s is disabled", + cpuidle_state_table[cstate].name); + continue; + } + + if (((mwait_cstate + 1) > 2) && !boot_cpu_has(X86_FEATURE_NONSTOP_TSC)) mark_tsc_unstable("TSC halts in idle" From 281baf7a702693deaa45c98ef0c5161006b48257 Mon Sep 17 00:00:00 2001 From: Dasaratharaman Chandramouli Date: Thu, 4 Sep 2014 17:22:54 -0700 Subject: [PATCH 22/23] intel_idle: Support for Intel Xeon Phi Processor x200 Product Family Enables "Intel(R) Xeon Phi(TM) Processor x200 Product Family" support, formerly code-named KNL. It is based on modified Intel Atom Silvermont microarchitecture. Signed-off-by: Dasaratharaman Chandramouli [micah.barany@intel.com: adjusted values of residency and latency] Signed-off-by: Micah Barany [hubert.chrzaniuk@intel.com: removed deprecated CPUIDLE_FLAG_TIME_VALID flag] Signed-off-by: Hubert Chrzaniuk Signed-off-by: Pawel Karczewski Signed-off-by: Len Brown --- drivers/idle/intel_idle.c | 25 +++++++++++++++++++++++++ 1 file changed, 25 insertions(+) diff --git a/drivers/idle/intel_idle.c b/drivers/idle/intel_idle.c index 146eed70bdf4..ba947df5a8c7 100644 --- a/drivers/idle/intel_idle.c +++ b/drivers/idle/intel_idle.c @@ -716,6 +716,26 @@ static struct cpuidle_state avn_cstates[] = { { .enter = NULL } }; +static struct cpuidle_state knl_cstates[] = { + { + .name = "C1-KNL", + .desc = "MWAIT 0x00", + .flags = MWAIT2flg(0x00), + .exit_latency = 1, + .target_residency = 2, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze }, + { + .name = "C6-KNL", + .desc = "MWAIT 0x10", + .flags = MWAIT2flg(0x10) | CPUIDLE_FLAG_TLB_FLUSHED, + .exit_latency = 120, + .target_residency = 500, + .enter = &intel_idle, + .enter_freeze = intel_idle_freeze }, + { + .enter = NULL } +}; /** * intel_idle @@ -890,6 +910,10 @@ static const struct idle_cpu idle_cpu_avn = { .disable_promotion_to_c1e = true, }; +static const struct idle_cpu idle_cpu_knl = { + .state_table = knl_cstates, +}; + #define ICPU(model, cpu) \ { X86_VENDOR_INTEL, 6, model, X86_FEATURE_MWAIT, (unsigned long)&cpu } @@ -921,6 +945,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = { ICPU(0x56, idle_cpu_bdw), ICPU(0x4e, idle_cpu_skl), ICPU(0x5e, idle_cpu_skl), + ICPU(0x57, idle_cpu_knl), {} }; MODULE_DEVICE_TABLE(x86cpu, intel_idle_ids); From f447671b9e4fa4b0c9faf39455269d62d94218ae Mon Sep 17 00:00:00 2001 From: David Wu Date: Wed, 16 Mar 2016 02:45:26 +0800 Subject: [PATCH 23/23] PM / AVS: rockchip-io: add io selectors and supplies for rk3399 This adds the necessary data for handling io voltage domains on the rk3399. As interesting tidbit, the rk3399 contains two separate iodomain areas. One in the regular General Register Files (GRF) and one in PMUGRF in the pmu power domain. Signed-off-by: David Wu Reviewed-by: Heiko Stuebner Acked-by: Kevin Hilman Signed-off-by: Rafael J. Wysocki --- .../bindings/power/rockchip-io-domain.txt | 11 ++++ drivers/power/avs/rockchip-io-domain.c | 58 +++++++++++++++++++ 2 files changed, 69 insertions(+) mode change 100644 => 100755 drivers/power/avs/rockchip-io-domain.c diff --git a/Documentation/devicetree/bindings/power/rockchip-io-domain.txt b/Documentation/devicetree/bindings/power/rockchip-io-domain.txt index b8627e763dba..c84fb47265eb 100644 --- a/Documentation/devicetree/bindings/power/rockchip-io-domain.txt +++ b/Documentation/devicetree/bindings/power/rockchip-io-domain.txt @@ -35,6 +35,8 @@ Required properties: - "rockchip,rk3288-io-voltage-domain" for rk3288 - "rockchip,rk3368-io-voltage-domain" for rk3368 - "rockchip,rk3368-pmu-io-voltage-domain" for rk3368 pmu-domains + - "rockchip,rk3399-io-voltage-domain" for rk3399 + - "rockchip,rk3399-pmu-io-voltage-domain" for rk3399 pmu-domains - rockchip,grf: phandle to the syscon managing the "general register files" @@ -79,6 +81,15 @@ Possible supplies for rk3368 pmu-domains: - pmu-supply: The supply connected to PMUIO_VDD. - vop-supply: The supply connected to LCDC_VDD. +Possible supplies for rk3399: +- bt656-supply: The supply connected to APIO2_VDD. +- audio-supply: The supply connected to APIO5_VDD. +- sdmmc-supply: The supply connected to SDMMC0_VDD. +- gpio1830 The supply connected to APIO4_VDD. + +Possible supplies for rk3399 pmu-domains: +- pmu1830-supply:The supply connected to PMUIO2_VDD. + Example: io-domains { diff --git a/drivers/power/avs/rockchip-io-domain.c b/drivers/power/avs/rockchip-io-domain.c old mode 100644 new mode 100755 index 80994566a1c8..8986382718dd --- a/drivers/power/avs/rockchip-io-domain.c +++ b/drivers/power/avs/rockchip-io-domain.c @@ -47,6 +47,10 @@ #define RK3368_SOC_CON15_FLASH0 BIT(14) #define RK3368_SOC_FLASH_SUPPLY_NUM 2 +#define RK3399_PMUGRF_CON0 0x180 +#define RK3399_PMUGRF_CON0_VSEL BIT(8) +#define RK3399_PMUGRF_VSEL_SUPPLY_NUM 9 + struct rockchip_iodomain; /** @@ -181,6 +185,25 @@ static void rk3368_iodomain_init(struct rockchip_iodomain *iod) dev_warn(iod->dev, "couldn't update flash0 ctrl\n"); } +static void rk3399_pmu_iodomain_init(struct rockchip_iodomain *iod) +{ + int ret; + u32 val; + + /* if no pmu io supply we should leave things alone */ + if (!iod->supplies[RK3399_PMUGRF_VSEL_SUPPLY_NUM].reg) + return; + + /* + * set pmu io iodomain to also use this framework + * instead of a special gpio. + */ + val = RK3399_PMUGRF_CON0_VSEL | (RK3399_PMUGRF_CON0_VSEL << 16); + ret = regmap_write(iod->grf, RK3399_PMUGRF_CON0, val); + if (ret < 0) + dev_warn(iod->dev, "couldn't update pmu io iodomain ctrl\n"); +} + /* * On the rk3188 the io-domains are handled by a shared register with the * lower 8 bits being still being continuing drive-strength settings. @@ -252,6 +275,33 @@ static const struct rockchip_iodomain_soc_data soc_data_rk3368_pmu = { }, }; +static const struct rockchip_iodomain_soc_data soc_data_rk3399 = { + .grf_offset = 0xe640, + .supply_names = { + "bt656", /* APIO2_VDD */ + "audio", /* APIO5_VDD */ + "sdmmc", /* SDMMC0_VDD */ + "gpio1830", /* APIO4_VDD */ + }, +}; + +static const struct rockchip_iodomain_soc_data soc_data_rk3399_pmu = { + .grf_offset = 0x180, + .supply_names = { + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + NULL, + "pmu1830", /* PMUIO2_VDD */ + }, + .init = rk3399_pmu_iodomain_init, +}; + static const struct of_device_id rockchip_iodomain_match[] = { { .compatible = "rockchip,rk3188-io-voltage-domain", @@ -269,6 +319,14 @@ static const struct of_device_id rockchip_iodomain_match[] = { .compatible = "rockchip,rk3368-pmu-io-voltage-domain", .data = (void *)&soc_data_rk3368_pmu }, + { + .compatible = "rockchip,rk3399-io-voltage-domain", + .data = (void *)&soc_data_rk3399 + }, + { + .compatible = "rockchip,rk3399-pmu-io-voltage-domain", + .data = (void *)&soc_data_rk3399_pmu + }, { /* sentinel */ }, }; MODULE_DEVICE_TABLE(of, rockchip_iodomain_match);