Merge branches 'pm-cpufreq' and 'pm-cpuidle'

Merge cpufreq and cpuidle changes for 5.18-rc1:

 - Make the schedutil cpufreq governor use to_gov_attr_set() instead
   of open coding it (Kevin Hao).

 - Replace acpi_bus_get_device() with acpi_fetch_acpi_dev() in the
   cpufreq longhaul driver (Rafael Wysocki).

 - Unify show() and store() naming in cpufreq and make it use
   __ATTR_XX (Lianjie Zhang).

 - Make the intel_pstate driver use the EPP value set by the firmware
   by default (Srinivas Pandruvada).

 - Re-order the init checks in the powernow-k8 cpufreq driver (Mario
   Limonciello).

 - Make the ACPI processor idle driver check for architectural
   support for LPI to avoid using it on x86 by mistake (Mario
   Limonciello).

 - Add Sapphire Rapids Xeon support to the intel_idle driver (Artem
   Bityutskiy).

 - Add 'preferred_cstates' module argument to the intel_idle driver
   to work around C1 and C1E handling issue on Sapphire Rapids (Artem
   Bityutskiy).

 - Add core C6 optimization on Sapphire Rapids to the intel_idle
   driver (Artem Bityutskiy).

 - Optimize the haltpoll cpuidle driver a bit (Li RongQing).

 - Remove leftover text from intel_idle() kerneldoc comment and fix
   up white space in intel_idle (Rafael Wysocki).

* pm-cpufreq:
  cpufreq: powernow-k8: Re-order the init checks
  cpufreq: intel_pstate: Use firmware default EPP
  cpufreq: unify show() and store() naming and use __ATTR_XX
  cpufreq: longhaul: Replace acpi_bus_get_device()
  cpufreq: schedutil: Use to_gov_attr_set() to get the gov_attr_set
  cpufreq: Move to_gov_attr_set() to cpufreq.h

* pm-cpuidle:
  cpuidle: intel_idle: Drop redundant backslash at line end
  cpuidle: intel_idle: Update intel_idle() kerneldoc comment
  cpuidle: haltpoll: Call cpuidle_poll_state_init() later
  intel_idle: add core C6 optimization for SPR
  intel_idle: add 'preferred_cstates' module argument
  intel_idle: add SPR support
  ACPI: processor idle: Check for architectural support for LPI
  cpuidle: PSCI: Move the `has_lpi` check to the beginning of the function
This commit is contained in:
Rafael J. Wysocki 2022-03-18 18:14:55 +01:00
commit 86c17c40d2
14 changed files with 184 additions and 50 deletions

View File

@ -54,6 +54,9 @@ static int psci_acpi_cpu_init_idle(unsigned int cpu)
struct acpi_lpi_state *lpi;
struct acpi_processor *pr = per_cpu(processors, cpu);
if (unlikely(!pr || !pr->flags.has_lpi))
return -EINVAL;
/*
* If the PSCI cpu_suspend function hook has not been initialized
* idle states must not be enabled, so bail out
@ -61,9 +64,6 @@ static int psci_acpi_cpu_init_idle(unsigned int cpu)
if (!psci_ops.cpu_suspend)
return -EOPNOTSUPP;
if (unlikely(!pr || !pr->flags.has_lpi))
return -EINVAL;
count = pr->power.count - 1;
if (count <= 0)
return -ENODEV;

View File

@ -1080,6 +1080,11 @@ static int flatten_lpi_states(struct acpi_processor *pr,
return 0;
}
int __weak acpi_processor_ffh_lpi_probe(unsigned int cpu)
{
return -EOPNOTSUPP;
}
static int acpi_processor_get_lpi_info(struct acpi_processor *pr)
{
int ret, i;
@ -1088,6 +1093,11 @@ static int acpi_processor_get_lpi_info(struct acpi_processor *pr)
struct acpi_device *d = NULL;
struct acpi_lpi_states_array info[2], *tmp, *prev, *curr;
/* make sure our architecture has support */
ret = acpi_processor_ffh_lpi_probe(pr->id);
if (ret == -EOPNOTSUPP)
return ret;
if (!osc_pc_lpi_support_confirmed)
return -EOPNOTSUPP;
@ -1139,11 +1149,6 @@ static int acpi_processor_get_lpi_info(struct acpi_processor *pr)
return 0;
}
int __weak acpi_processor_ffh_lpi_probe(unsigned int cpu)
{
return -ENODEV;
}
int __weak acpi_processor_ffh_lpi_enter(struct acpi_lpi_state *lpi)
{
return -ENODEV;

View File

@ -146,7 +146,7 @@ static unsigned int cs_dbs_update(struct cpufreq_policy *policy)
/************************** sysfs interface ************************/
static ssize_t store_sampling_down_factor(struct gov_attr_set *attr_set,
static ssize_t sampling_down_factor_store(struct gov_attr_set *attr_set,
const char *buf, size_t count)
{
struct dbs_data *dbs_data = to_dbs_data(attr_set);
@ -161,7 +161,7 @@ static ssize_t store_sampling_down_factor(struct gov_attr_set *attr_set,
return count;
}
static ssize_t store_up_threshold(struct gov_attr_set *attr_set,
static ssize_t up_threshold_store(struct gov_attr_set *attr_set,
const char *buf, size_t count)
{
struct dbs_data *dbs_data = to_dbs_data(attr_set);
@ -177,7 +177,7 @@ static ssize_t store_up_threshold(struct gov_attr_set *attr_set,
return count;
}
static ssize_t store_down_threshold(struct gov_attr_set *attr_set,
static ssize_t down_threshold_store(struct gov_attr_set *attr_set,
const char *buf, size_t count)
{
struct dbs_data *dbs_data = to_dbs_data(attr_set);
@ -195,7 +195,7 @@ static ssize_t store_down_threshold(struct gov_attr_set *attr_set,
return count;
}
static ssize_t store_ignore_nice_load(struct gov_attr_set *attr_set,
static ssize_t ignore_nice_load_store(struct gov_attr_set *attr_set,
const char *buf, size_t count)
{
struct dbs_data *dbs_data = to_dbs_data(attr_set);
@ -220,7 +220,7 @@ static ssize_t store_ignore_nice_load(struct gov_attr_set *attr_set,
return count;
}
static ssize_t store_freq_step(struct gov_attr_set *attr_set, const char *buf,
static ssize_t freq_step_store(struct gov_attr_set *attr_set, const char *buf,
size_t count)
{
struct dbs_data *dbs_data = to_dbs_data(attr_set);

View File

@ -27,7 +27,7 @@ static DEFINE_MUTEX(gov_dbs_data_mutex);
/* Common sysfs tunables */
/*
* store_sampling_rate - update sampling rate effective immediately if needed.
* sampling_rate_store - update sampling rate effective immediately if needed.
*
* If new rate is smaller than the old, simply updating
* dbs.sampling_rate might not be appropriate. For example, if the
@ -41,7 +41,7 @@ static DEFINE_MUTEX(gov_dbs_data_mutex);
* This must be called with dbs_data->mutex held, otherwise traversing
* policy_dbs_list isn't safe.
*/
ssize_t store_sampling_rate(struct gov_attr_set *attr_set, const char *buf,
ssize_t sampling_rate_store(struct gov_attr_set *attr_set, const char *buf,
size_t count)
{
struct dbs_data *dbs_data = to_dbs_data(attr_set);
@ -80,7 +80,7 @@ ssize_t store_sampling_rate(struct gov_attr_set *attr_set, const char *buf,
return count;
}
EXPORT_SYMBOL_GPL(store_sampling_rate);
EXPORT_SYMBOL_GPL(sampling_rate_store);
/**
* gov_update_cpu_data - Update CPU load data.

View File

@ -51,7 +51,7 @@ static inline struct dbs_data *to_dbs_data(struct gov_attr_set *attr_set)
}
#define gov_show_one(_gov, file_name) \
static ssize_t show_##file_name \
static ssize_t file_name##_show \
(struct gov_attr_set *attr_set, char *buf) \
{ \
struct dbs_data *dbs_data = to_dbs_data(attr_set); \
@ -60,7 +60,7 @@ static ssize_t show_##file_name \
}
#define gov_show_one_common(file_name) \
static ssize_t show_##file_name \
static ssize_t file_name##_show \
(struct gov_attr_set *attr_set, char *buf) \
{ \
struct dbs_data *dbs_data = to_dbs_data(attr_set); \
@ -68,12 +68,10 @@ static ssize_t show_##file_name \
}
#define gov_attr_ro(_name) \
static struct governor_attr _name = \
__ATTR(_name, 0444, show_##_name, NULL)
static struct governor_attr _name = __ATTR_RO(_name)
#define gov_attr_rw(_name) \
static struct governor_attr _name = \
__ATTR(_name, 0644, show_##_name, store_##_name)
static struct governor_attr _name = __ATTR_RW(_name)
/* Common to all CPUs of a policy */
struct policy_dbs_info {
@ -176,7 +174,7 @@ void od_register_powersave_bias_handler(unsigned int (*f)
(struct cpufreq_policy *, unsigned int, unsigned int),
unsigned int powersave_bias);
void od_unregister_powersave_bias_handler(void);
ssize_t store_sampling_rate(struct gov_attr_set *attr_set, const char *buf,
ssize_t sampling_rate_store(struct gov_attr_set *attr_set, const char *buf,
size_t count);
void gov_update_cpu_data(struct dbs_data *dbs_data);
#endif /* _CPUFREQ_GOVERNOR_H */

View File

@ -8,11 +8,6 @@
#include "cpufreq_governor.h"
static inline struct gov_attr_set *to_gov_attr_set(struct kobject *kobj)
{
return container_of(kobj, struct gov_attr_set, kobj);
}
static inline struct governor_attr *to_gov_attr(struct attribute *attr)
{
return container_of(attr, struct governor_attr, attr);

View File

@ -202,7 +202,7 @@ static unsigned int od_dbs_update(struct cpufreq_policy *policy)
/************************** sysfs interface ************************/
static struct dbs_governor od_dbs_gov;
static ssize_t store_io_is_busy(struct gov_attr_set *attr_set, const char *buf,
static ssize_t io_is_busy_store(struct gov_attr_set *attr_set, const char *buf,
size_t count)
{
struct dbs_data *dbs_data = to_dbs_data(attr_set);
@ -220,7 +220,7 @@ static ssize_t store_io_is_busy(struct gov_attr_set *attr_set, const char *buf,
return count;
}
static ssize_t store_up_threshold(struct gov_attr_set *attr_set,
static ssize_t up_threshold_store(struct gov_attr_set *attr_set,
const char *buf, size_t count)
{
struct dbs_data *dbs_data = to_dbs_data(attr_set);
@ -237,7 +237,7 @@ static ssize_t store_up_threshold(struct gov_attr_set *attr_set,
return count;
}
static ssize_t store_sampling_down_factor(struct gov_attr_set *attr_set,
static ssize_t sampling_down_factor_store(struct gov_attr_set *attr_set,
const char *buf, size_t count)
{
struct dbs_data *dbs_data = to_dbs_data(attr_set);
@ -265,7 +265,7 @@ static ssize_t store_sampling_down_factor(struct gov_attr_set *attr_set,
return count;
}
static ssize_t store_ignore_nice_load(struct gov_attr_set *attr_set,
static ssize_t ignore_nice_load_store(struct gov_attr_set *attr_set,
const char *buf, size_t count)
{
struct dbs_data *dbs_data = to_dbs_data(attr_set);
@ -290,7 +290,7 @@ static ssize_t store_ignore_nice_load(struct gov_attr_set *attr_set,
return count;
}
static ssize_t store_powersave_bias(struct gov_attr_set *attr_set,
static ssize_t powersave_bias_store(struct gov_attr_set *attr_set,
const char *buf, size_t count)
{
struct dbs_data *dbs_data = to_dbs_data(attr_set);

View File

@ -1692,6 +1692,37 @@ static void intel_pstate_enable_hwp_interrupt(struct cpudata *cpudata)
}
}
static void intel_pstate_update_epp_defaults(struct cpudata *cpudata)
{
cpudata->epp_default = intel_pstate_get_epp(cpudata, 0);
/*
* If this CPU gen doesn't call for change in balance_perf
* EPP return.
*/
if (epp_values[EPP_INDEX_BALANCE_PERFORMANCE] == HWP_EPP_BALANCE_PERFORMANCE)
return;
/*
* If powerup EPP is something other than chipset default 0x80 and
* - is more performance oriented than 0x80 (default balance_perf EPP)
* - But less performance oriented than performance EPP
* then use this as new balance_perf EPP.
*/
if (cpudata->epp_default < HWP_EPP_BALANCE_PERFORMANCE &&
cpudata->epp_default > HWP_EPP_PERFORMANCE) {
epp_values[EPP_INDEX_BALANCE_PERFORMANCE] = cpudata->epp_default;
return;
}
/*
* Use hard coded value per gen to update the balance_perf
* and default EPP.
*/
cpudata->epp_default = epp_values[EPP_INDEX_BALANCE_PERFORMANCE];
intel_pstate_set_epp(cpudata, cpudata->epp_default);
}
static void intel_pstate_hwp_enable(struct cpudata *cpudata)
{
/* First disable HWP notification interrupt till we activate again */
@ -1705,12 +1736,7 @@ static void intel_pstate_hwp_enable(struct cpudata *cpudata)
if (cpudata->epp_default >= 0)
return;
if (epp_values[EPP_INDEX_BALANCE_PERFORMANCE] == HWP_EPP_BALANCE_PERFORMANCE) {
cpudata->epp_default = intel_pstate_get_epp(cpudata, 0);
} else {
cpudata->epp_default = epp_values[EPP_INDEX_BALANCE_PERFORMANCE];
intel_pstate_set_epp(cpudata, cpudata->epp_default);
}
intel_pstate_update_epp_defaults(cpudata);
}
static int atom_get_min_pstate(void)

View File

@ -668,9 +668,9 @@ static acpi_status longhaul_walk_callback(acpi_handle obj_handle,
u32 nesting_level,
void *context, void **return_value)
{
struct acpi_device *d;
struct acpi_device *d = acpi_fetch_acpi_dev(obj_handle);
if (acpi_bus_get_device(obj_handle, &d))
if (!d)
return 0;
*return_value = acpi_driver_data(d);

View File

@ -1172,14 +1172,14 @@ static int powernowk8_init(void)
unsigned int i, supported_cpus = 0;
int ret;
if (!x86_match_cpu(powernow_k8_ids))
return -ENODEV;
if (boot_cpu_has(X86_FEATURE_HW_PSTATE)) {
__request_acpi_cpufreq();
return -ENODEV;
}
if (!x86_match_cpu(powernow_k8_ids))
return -ENODEV;
cpus_read_lock();
for_each_online_cpu(i) {
smp_call_function_single(i, check_supported_cpu, &ret, 1);

View File

@ -108,11 +108,11 @@ static int __init haltpoll_init(void)
if (boot_option_idle_override != IDLE_NO_OVERRIDE)
return -ENODEV;
cpuidle_poll_state_init(drv);
if (!kvm_para_available() || !haltpoll_want())
return -ENODEV;
cpuidle_poll_state_init(drv);
ret = cpuidle_register_driver(drv);
if (ret < 0)
return ret;

View File

@ -64,6 +64,7 @@ static struct cpuidle_driver intel_idle_driver = {
/* intel_idle.max_cstate=0 disables driver */
static int max_cstate = CPUIDLE_STATE_MAX - 1;
static unsigned int disabled_states_mask;
static unsigned int preferred_states_mask;
static struct cpuidle_device __percpu *intel_idle_cpuidle_devices;
@ -121,9 +122,6 @@ static unsigned int mwait_substates __initdata;
* If the local APIC timer is not known to be reliable in the target idle state,
* enable one-shot tick broadcasting for the target CPU before executing MWAIT.
*
* Optionally call leave_mm() for the target CPU upfront to avoid wakeups due to
* flushing user TLBs.
*
* Must be called under local_irq_disable().
*/
static __cpuidle int intel_idle(struct cpuidle_device *dev,
@ -761,6 +759,46 @@ static struct cpuidle_state icx_cstates[] __initdata = {
.enter = NULL }
};
/*
* On Sapphire Rapids Xeon C1 has to be disabled if C1E is enabled, and vice
* versa. On SPR C1E is enabled only if "C1E promotion" bit is set in
* MSR_IA32_POWER_CTL. But in this case there effectively no C1, because C1
* requests are promoted to C1E. If the "C1E promotion" bit is cleared, then
* both C1 and C1E requests end up with C1, so there is effectively no C1E.
*
* By default we enable C1 and disable C1E by marking it with
* 'CPUIDLE_FLAG_UNUSABLE'.
*/
static struct cpuidle_state spr_cstates[] __initdata = {
{
.name = "C1",
.desc = "MWAIT 0x00",
.flags = MWAIT2flg(0x00),
.exit_latency = 1,
.target_residency = 1,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
.name = "C1E",
.desc = "MWAIT 0x01",
.flags = MWAIT2flg(0x01) | CPUIDLE_FLAG_ALWAYS_ENABLE |
CPUIDLE_FLAG_UNUSABLE,
.exit_latency = 2,
.target_residency = 4,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
.name = "C6",
.desc = "MWAIT 0x20",
.flags = MWAIT2flg(0x20) | CPUIDLE_FLAG_TLB_FLUSHED,
.exit_latency = 290,
.target_residency = 800,
.enter = &intel_idle,
.enter_s2idle = intel_idle_s2idle, },
{
.enter = NULL }
};
static struct cpuidle_state atom_cstates[] __initdata = {
{
.name = "C1E",
@ -1104,6 +1142,12 @@ static const struct idle_cpu idle_cpu_icx __initconst = {
.use_acpi = true,
};
static const struct idle_cpu idle_cpu_spr __initconst = {
.state_table = spr_cstates,
.disable_promotion_to_c1e = true,
.use_acpi = true,
};
static const struct idle_cpu idle_cpu_avn __initconst = {
.state_table = avn_cstates,
.disable_promotion_to_c1e = true,
@ -1166,6 +1210,7 @@ static const struct x86_cpu_id intel_idle_ids[] __initconst = {
X86_MATCH_INTEL_FAM6_MODEL(SKYLAKE_X, &idle_cpu_skx),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_X, &idle_cpu_icx),
X86_MATCH_INTEL_FAM6_MODEL(ICELAKE_D, &idle_cpu_icx),
X86_MATCH_INTEL_FAM6_MODEL(SAPPHIRERAPIDS_X, &idle_cpu_spr),
X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNL, &idle_cpu_knl),
X86_MATCH_INTEL_FAM6_MODEL(XEON_PHI_KNM, &idle_cpu_knl),
X86_MATCH_INTEL_FAM6_MODEL(ATOM_GOLDMONT, &idle_cpu_bxt),
@ -1353,6 +1398,8 @@ static inline void intel_idle_init_cstates_acpi(struct cpuidle_driver *drv) { }
static inline bool intel_idle_off_by_default(u32 mwait_hint) { return false; }
#endif /* !CONFIG_ACPI_PROCESSOR_CSTATE */
static void c1e_promotion_enable(void);
/**
* ivt_idle_state_table_update - Tune the idle states table for Ivy Town.
*
@ -1523,6 +1570,41 @@ static void __init skx_idle_state_table_update(void)
}
}
/**
* spr_idle_state_table_update - Adjust Sapphire Rapids idle states table.
*/
static void __init spr_idle_state_table_update(void)
{
unsigned long long msr;
/* Check if user prefers C1E over C1. */
if (preferred_states_mask & BIT(2)) {
if (preferred_states_mask & BIT(1))
/* Both can't be enabled, stick to the defaults. */
return;
spr_cstates[0].flags |= CPUIDLE_FLAG_UNUSABLE;
spr_cstates[1].flags &= ~CPUIDLE_FLAG_UNUSABLE;
/* Enable C1E using the "C1E promotion" bit. */
c1e_promotion_enable();
disable_promotion_to_c1e = false;
}
/*
* By default, the C6 state assumes the worst-case scenario of package
* C6. However, if PC6 is disabled, we update the numbers to match
* core C6.
*/
rdmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr);
/* Limit value 2 and above allow for PC6. */
if ((msr & 0x7) < 2) {
spr_cstates[2].exit_latency = 190;
spr_cstates[2].target_residency = 600;
}
}
static bool __init intel_idle_verify_cstate(unsigned int mwait_hint)
{
unsigned int mwait_cstate = MWAIT_HINT2CSTATE(mwait_hint) + 1;
@ -1557,6 +1639,9 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
case INTEL_FAM6_SKYLAKE_X:
skx_idle_state_table_update();
break;
case INTEL_FAM6_SAPPHIRERAPIDS_X:
spr_idle_state_table_update();
break;
}
for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) {
@ -1629,6 +1714,15 @@ static void auto_demotion_disable(void)
wrmsrl(MSR_PKG_CST_CONFIG_CONTROL, msr_bits);
}
static void c1e_promotion_enable(void)
{
unsigned long long msr_bits;
rdmsrl(MSR_IA32_POWER_CTL, msr_bits);
msr_bits |= 0x2;
wrmsrl(MSR_IA32_POWER_CTL, msr_bits);
}
static void c1e_promotion_disable(void)
{
unsigned long long msr_bits;
@ -1798,3 +1892,14 @@ module_param(max_cstate, int, 0444);
*/
module_param_named(states_off, disabled_states_mask, uint, 0444);
MODULE_PARM_DESC(states_off, "Mask of disabled idle states");
/*
* Some platforms come with mutually exclusive C-states, so that if one is
* enabled, the other C-states must not be used. Example: C1 and C1E on
* Sapphire Rapids platform. This parameter allows for selecting the
* preferred C-states among the groups of mutually exclusive C-states - the
* selected C-states will be registered, the other C-states from the mutually
* exclusive group won't be registered. If the platform has no mutually
* exclusive C-states, this parameter has no effect.
*/
module_param_named(preferred_cstates, preferred_states_mask, uint, 0444);
MODULE_PARM_DESC(preferred_cstates, "Mask of preferred idle states");

View File

@ -661,6 +661,11 @@ struct gov_attr_set {
/* sysfs ops for cpufreq governors */
extern const struct sysfs_ops governor_sysfs_ops;
static inline struct gov_attr_set *to_gov_attr_set(struct kobject *kobj)
{
return container_of(kobj, struct gov_attr_set, kobj);
}
void gov_attr_set_init(struct gov_attr_set *attr_set, struct list_head *list_node);
void gov_attr_set_get(struct gov_attr_set *attr_set, struct list_head *list_node);
unsigned int gov_attr_set_put(struct gov_attr_set *attr_set, struct list_head *list_node);

View File

@ -539,7 +539,7 @@ ATTRIBUTE_GROUPS(sugov);
static void sugov_tunables_free(struct kobject *kobj)
{
struct gov_attr_set *attr_set = container_of(kobj, struct gov_attr_set, kobj);
struct gov_attr_set *attr_set = to_gov_attr_set(kobj);
kfree(to_sugov_tunables(attr_set));
}