Merge branches 'pm-cpufreq' and 'pm-cpuidle'

Merge cpufreq and cpuidle updates for 6.5-rc1:

 - Prevent cpufreq drivers that provide the ->adjust_perf() callback
   without a ->fast_switch() one which is used as a fallback from the
   former in some cases (Wyes Karny).

 - Fix some issues related to the AMD P-state cpufreq driver (Mario
   Limonciello, Wyes Karny).

 - Fix the energy_performance_preference attribute handling in the
   intel_pstate driver in passive mode (Tero Kristo).

 - Clean up the intel_idle driver, make it work with VM guests that
   cannot use the MWAIT instruction and address the case in which the
   host may enter a deep idle state when the guest is idle (Arjan van
   de Ven).

* pm-cpufreq:
  cpufreq: intel_pstate: Fix energy_performance_preference for passive
  cpufreq: amd-pstate: Add a kernel config option to set default mode
  cpufreq: amd-pstate: Set a fallback policy based on preferred_profile
  ACPI: CPPC: Add definition for undefined FADT preferred PM profile value
  cpufreq: amd-pstate: Set default governor to schedutil
  cpufreq: amd-pstate: Make amd-pstate EPP driver name hyphenated
  cpufreq: amd-pstate: Write CPPC enable bit per-socket
  cpufreq: Fail driver register if it has adjust_perf without fast_switch

* pm-cpuidle:
  intel_idle: Add a "Long HLT" C1 state for the VM guest mode
  intel_idle: Add support for using intel_idle in a VM guest using just hlt
  intel_idle: clean up the (new) state_update_enter_method function
  intel_idle: refactor state->enter manipulation into its own function
This commit is contained in:
Rafael J. Wysocki 2023-06-26 17:34:01 +02:00
commit 4af191d60d
9 changed files with 341 additions and 57 deletions

View File

@ -38,7 +38,7 @@ choice
prompt "Default CPUFreq governor"
default CPU_FREQ_DEFAULT_GOV_USERSPACE if ARM_SA1110_CPUFREQ
default CPU_FREQ_DEFAULT_GOV_SCHEDUTIL if ARM64 || ARM
default CPU_FREQ_DEFAULT_GOV_SCHEDUTIL if X86_INTEL_PSTATE && SMP
default CPU_FREQ_DEFAULT_GOV_SCHEDUTIL if (X86_INTEL_PSTATE || X86_AMD_PSTATE) && SMP
default CPU_FREQ_DEFAULT_GOV_PERFORMANCE
help
This option sets which CPUFreq governor shall be loaded at

View File

@ -51,6 +51,23 @@ config X86_AMD_PSTATE
If in doubt, say N.
config X86_AMD_PSTATE_DEFAULT_MODE
int "AMD Processor P-State default mode"
depends on X86_AMD_PSTATE
default 3 if X86_AMD_PSTATE
range 1 4
help
Select the default mode the amd-pstate driver will use on
supported hardware.
The value set has the following meanings:
1 -> Disabled
2 -> Passive
3 -> Active (EPP)
4 -> Guided
For details, take a look at:
<file:Documentation/admin-guide/pm/amd-pstate.rst>.
config X86_AMD_PSTATE_UT
tristate "selftest for AMD Processor P-State driver"
depends on X86 && ACPI_PROCESSOR

View File

@ -62,7 +62,8 @@
static struct cpufreq_driver *current_pstate_driver;
static struct cpufreq_driver amd_pstate_driver;
static struct cpufreq_driver amd_pstate_epp_driver;
static int cppc_state = AMD_PSTATE_DISABLE;
static int cppc_state = AMD_PSTATE_UNDEFINED;
static bool cppc_enabled;
/*
* AMD Energy Preference Performance (EPP)
@ -228,7 +229,28 @@ static int amd_pstate_set_energy_pref_index(struct amd_cpudata *cpudata,
static inline int pstate_enable(bool enable)
{
return wrmsrl_safe(MSR_AMD_CPPC_ENABLE, enable);
int ret, cpu;
unsigned long logical_proc_id_mask = 0;
if (enable == cppc_enabled)
return 0;
for_each_present_cpu(cpu) {
unsigned long logical_id = topology_logical_die_id(cpu);
if (test_bit(logical_id, &logical_proc_id_mask))
continue;
set_bit(logical_id, &logical_proc_id_mask);
ret = wrmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_ENABLE,
enable);
if (ret)
return ret;
}
cppc_enabled = enable;
return 0;
}
static int cppc_enable(bool enable)
@ -236,6 +258,9 @@ static int cppc_enable(bool enable)
int cpu, ret = 0;
struct cppc_perf_ctrls perf_ctrls;
if (enable == cppc_enabled)
return 0;
for_each_present_cpu(cpu) {
ret = cppc_set_enable(cpu, enable);
if (ret)
@ -251,6 +276,7 @@ static int cppc_enable(bool enable)
}
}
cppc_enabled = enable;
return ret;
}
@ -1045,6 +1071,26 @@ static const struct attribute_group amd_pstate_global_attr_group = {
.attrs = pstate_global_attributes,
};
static bool amd_pstate_acpi_pm_profile_server(void)
{
switch (acpi_gbl_FADT.preferred_profile) {
case PM_ENTERPRISE_SERVER:
case PM_SOHO_SERVER:
case PM_PERFORMANCE_SERVER:
return true;
}
return false;
}
static bool amd_pstate_acpi_pm_profile_undefined(void)
{
if (acpi_gbl_FADT.preferred_profile == PM_UNSPECIFIED)
return true;
if (acpi_gbl_FADT.preferred_profile >= NR_PM_PROFILES)
return true;
return false;
}
static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy)
{
int min_freq, max_freq, nominal_freq, lowest_nonlinear_freq, ret;
@ -1102,10 +1148,14 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy)
policy->max = policy->cpuinfo.max_freq;
/*
* Set the policy to powersave to provide a valid fallback value in case
* Set the policy to provide a valid fallback value in case
* the default cpufreq governor is neither powersave nor performance.
*/
policy->policy = CPUFREQ_POLICY_POWERSAVE;
if (amd_pstate_acpi_pm_profile_server() ||
amd_pstate_acpi_pm_profile_undefined())
policy->policy = CPUFREQ_POLICY_PERFORMANCE;
else
policy->policy = CPUFREQ_POLICY_POWERSAVE;
if (boot_cpu_has(X86_FEATURE_CPPC)) {
ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &value);
@ -1356,10 +1406,29 @@ static struct cpufreq_driver amd_pstate_epp_driver = {
.online = amd_pstate_epp_cpu_online,
.suspend = amd_pstate_epp_suspend,
.resume = amd_pstate_epp_resume,
.name = "amd_pstate_epp",
.name = "amd-pstate-epp",
.attr = amd_pstate_epp_attr,
};
static int __init amd_pstate_set_driver(int mode_idx)
{
if (mode_idx >= AMD_PSTATE_DISABLE && mode_idx < AMD_PSTATE_MAX) {
cppc_state = mode_idx;
if (cppc_state == AMD_PSTATE_DISABLE)
pr_info("driver is explicitly disabled\n");
if (cppc_state == AMD_PSTATE_ACTIVE)
current_pstate_driver = &amd_pstate_epp_driver;
if (cppc_state == AMD_PSTATE_PASSIVE || cppc_state == AMD_PSTATE_GUIDED)
current_pstate_driver = &amd_pstate_driver;
return 0;
}
return -EINVAL;
}
static int __init amd_pstate_init(void)
{
struct device *dev_root;
@ -1367,15 +1436,6 @@ static int __init amd_pstate_init(void)
if (boot_cpu_data.x86_vendor != X86_VENDOR_AMD)
return -ENODEV;
/*
* by default the pstate driver is disabled to load
* enable the amd_pstate passive mode driver explicitly
* with amd_pstate=passive or other modes in kernel command line
*/
if (cppc_state == AMD_PSTATE_DISABLE) {
pr_info("driver load is disabled, boot with specific mode to enable this\n");
return -ENODEV;
}
if (!acpi_cpc_valid()) {
pr_warn_once("the _CPC object is not present in SBIOS or ACPI disabled\n");
@ -1386,6 +1446,33 @@ static int __init amd_pstate_init(void)
if (cpufreq_get_current_driver())
return -EEXIST;
switch (cppc_state) {
case AMD_PSTATE_UNDEFINED:
/* Disable on the following configs by default:
* 1. Undefined platforms
* 2. Server platforms
* 3. Shared memory designs
*/
if (amd_pstate_acpi_pm_profile_undefined() ||
amd_pstate_acpi_pm_profile_server() ||
!boot_cpu_has(X86_FEATURE_CPPC)) {
pr_info("driver load is disabled, boot with specific mode to enable this\n");
return -ENODEV;
}
ret = amd_pstate_set_driver(CONFIG_X86_AMD_PSTATE_DEFAULT_MODE);
if (ret)
return ret;
break;
case AMD_PSTATE_DISABLE:
return -ENODEV;
case AMD_PSTATE_PASSIVE:
case AMD_PSTATE_ACTIVE:
case AMD_PSTATE_GUIDED:
break;
default:
return -EINVAL;
}
/* capability check */
if (boot_cpu_has(X86_FEATURE_CPPC)) {
pr_debug("AMD CPPC MSR based functionality is supported\n");
@ -1438,21 +1525,7 @@ static int __init amd_pstate_param(char *str)
size = strlen(str);
mode_idx = get_mode_idx_from_str(str, size);
if (mode_idx >= AMD_PSTATE_DISABLE && mode_idx < AMD_PSTATE_MAX) {
cppc_state = mode_idx;
if (cppc_state == AMD_PSTATE_DISABLE)
pr_info("driver is explicitly disabled\n");
if (cppc_state == AMD_PSTATE_ACTIVE)
current_pstate_driver = &amd_pstate_epp_driver;
if (cppc_state == AMD_PSTATE_PASSIVE || cppc_state == AMD_PSTATE_GUIDED)
current_pstate_driver = &amd_pstate_driver;
return 0;
}
return -EINVAL;
return amd_pstate_set_driver(mode_idx);
}
early_param("amd_pstate", amd_pstate_param);

View File

@ -2828,7 +2828,8 @@ int cpufreq_register_driver(struct cpufreq_driver *driver_data)
(driver_data->setpolicy && (driver_data->target_index ||
driver_data->target)) ||
(!driver_data->get_intermediate != !driver_data->target_intermediate) ||
(!driver_data->online != !driver_data->offline))
(!driver_data->online != !driver_data->offline) ||
(driver_data->adjust_perf && !driver_data->fast_switch))
return -EINVAL;
pr_debug("trying to register driver %s\n", driver_data->name);

View File

@ -824,6 +824,8 @@ static ssize_t store_energy_performance_preference(
err = cpufreq_start_governor(policy);
if (!ret)
ret = err;
} else {
ret = 0;
}
}

View File

@ -199,6 +199,43 @@ static __cpuidle int intel_idle_xstate(struct cpuidle_device *dev,
return __intel_idle(dev, drv, index);
}
static __always_inline int __intel_idle_hlt(struct cpuidle_device *dev,
struct cpuidle_driver *drv, int index)
{
raw_safe_halt();
raw_local_irq_disable();
return index;
}
/**
* intel_idle_hlt - Ask the processor to enter the given idle state using hlt.
* @dev: cpuidle device of the target CPU.
* @drv: cpuidle driver (assumed to point to intel_idle_driver).
* @index: Target idle state index.
*
* Use the HLT instruction to notify the processor that the CPU represented by
* @dev is idle and it can try to enter the idle state corresponding to @index.
*
* Must be called under local_irq_disable().
*/
static __cpuidle int intel_idle_hlt(struct cpuidle_device *dev,
struct cpuidle_driver *drv, int index)
{
return __intel_idle_hlt(dev, drv, index);
}
static __cpuidle int intel_idle_hlt_irq_on(struct cpuidle_device *dev,
struct cpuidle_driver *drv, int index)
{
int ret;
raw_local_irq_enable();
ret = __intel_idle_hlt(dev, drv, index);
raw_local_irq_disable();
return ret;
}
/**
* intel_idle_s2idle - Ask the processor to enter the given idle state.
* @dev: cpuidle device of the target CPU.
@ -1242,6 +1279,25 @@ static struct cpuidle_state snr_cstates[] __initdata = {
.enter = NULL }
};
static struct cpuidle_state vmguest_cstates[] __initdata = {
{
.name = "C1",
.desc = "HLT",
.flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_IRQ_ENABLE,
.exit_latency = 5,
.target_residency = 10,
.enter = &intel_idle_hlt, },
{
.name = "C1L",
.desc = "Long HLT",
.flags = MWAIT2flg(0x00) | CPUIDLE_FLAG_TLB_FLUSHED,
.exit_latency = 5,
.target_residency = 200,
.enter = &intel_idle_hlt, },
{
.enter = NULL }
};
static const struct idle_cpu idle_cpu_nehalem __initconst = {
.state_table = nehalem_cstates,
.auto_demotion_disable_flags = NHM_C1_AUTO_DEMOTE | NHM_C3_AUTO_DEMOTE,
@ -1839,6 +1895,66 @@ static bool __init intel_idle_verify_cstate(unsigned int mwait_hint)
return true;
}
static void state_update_enter_method(struct cpuidle_state *state, int cstate)
{
if (state->enter == intel_idle_hlt) {
if (force_irq_on) {
pr_info("forced intel_idle_irq for state %d\n", cstate);
state->enter = intel_idle_hlt_irq_on;
}
return;
}
if (state->enter == intel_idle_hlt_irq_on)
return; /* no update scenarios */
if (state->flags & CPUIDLE_FLAG_INIT_XSTATE) {
/*
* Combining with XSTATE with IBRS or IRQ_ENABLE flags
* is not currently supported but this driver.
*/
WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IBRS);
WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IRQ_ENABLE);
state->enter = intel_idle_xstate;
return;
}
if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) &&
state->flags & CPUIDLE_FLAG_IBRS) {
/*
* IBRS mitigation requires that C-states are entered
* with interrupts disabled.
*/
WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IRQ_ENABLE);
state->enter = intel_idle_ibrs;
return;
}
if (state->flags & CPUIDLE_FLAG_IRQ_ENABLE) {
state->enter = intel_idle_irq;
return;
}
if (force_irq_on) {
pr_info("forced intel_idle_irq for state %d\n", cstate);
state->enter = intel_idle_irq;
}
}
/*
* For mwait based states, we want to verify the cpuid data to see if the state
* is actually supported by this specific CPU.
* For non-mwait based states, this check should be skipped.
*/
static bool should_verify_mwait(struct cpuidle_state *state)
{
if (state->enter == intel_idle_hlt)
return false;
if (state->enter == intel_idle_hlt_irq_on)
return false;
return true;
}
static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
{
int cstate;
@ -1887,35 +2003,15 @@ static void __init intel_idle_init_cstates_icpu(struct cpuidle_driver *drv)
}
mwait_hint = flg2MWAIT(cpuidle_state_table[cstate].flags);
if (!intel_idle_verify_cstate(mwait_hint))
if (should_verify_mwait(&cpuidle_state_table[cstate]) && !intel_idle_verify_cstate(mwait_hint))
continue;
/* Structure copy. */
drv->states[drv->state_count] = cpuidle_state_table[cstate];
state = &drv->states[drv->state_count];
if (state->flags & CPUIDLE_FLAG_INIT_XSTATE) {
/*
* Combining with XSTATE with IBRS or IRQ_ENABLE flags
* is not currently supported but this driver.
*/
WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IBRS);
WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IRQ_ENABLE);
state->enter = intel_idle_xstate;
} else if (cpu_feature_enabled(X86_FEATURE_KERNEL_IBRS) &&
state->flags & CPUIDLE_FLAG_IBRS) {
/*
* IBRS mitigation requires that C-states are entered
* with interrupts disabled.
*/
WARN_ON_ONCE(state->flags & CPUIDLE_FLAG_IRQ_ENABLE);
state->enter = intel_idle_ibrs;
} else if (state->flags & CPUIDLE_FLAG_IRQ_ENABLE) {
state->enter = intel_idle_irq;
} else if (force_irq_on) {
pr_info("forced intel_idle_irq for state %d\n", cstate);
state->enter = intel_idle_irq;
}
state_update_enter_method(state, cstate);
if ((disabled_states_mask & BIT(drv->state_count)) ||
((icpu->use_acpi || force_use_acpi) &&
@ -2041,6 +2137,93 @@ static void __init intel_idle_cpuidle_devices_uninit(void)
cpuidle_unregister_device(per_cpu_ptr(intel_idle_cpuidle_devices, i));
}
/*
* Match up the latency and break even point of the bare metal (cpu based)
* states with the deepest VM available state.
*
* We only want to do this for the deepest state, the ones that has
* the TLB_FLUSHED flag set on the .
*
* All our short idle states are dominated by vmexit/vmenter latencies,
* not the underlying hardware latencies so we keep our values for these.
*/
static void matchup_vm_state_with_baremetal(void)
{
int cstate;
for (cstate = 0; cstate < CPUIDLE_STATE_MAX; ++cstate) {
int matching_cstate;
if (intel_idle_max_cstate_reached(cstate))
break;
if (!cpuidle_state_table[cstate].enter)
break;
if (!(cpuidle_state_table[cstate].flags & CPUIDLE_FLAG_TLB_FLUSHED))
continue;
for (matching_cstate = 0; matching_cstate < CPUIDLE_STATE_MAX; ++matching_cstate) {
if (!icpu->state_table[matching_cstate].enter)
break;
if (icpu->state_table[matching_cstate].exit_latency > cpuidle_state_table[cstate].exit_latency) {
cpuidle_state_table[cstate].exit_latency = icpu->state_table[matching_cstate].exit_latency;
cpuidle_state_table[cstate].target_residency = icpu->state_table[matching_cstate].target_residency;
}
}
}
}
static int __init intel_idle_vminit(const struct x86_cpu_id *id)
{
int retval;
cpuidle_state_table = vmguest_cstates;
icpu = (const struct idle_cpu *)id->driver_data;
pr_debug("v" INTEL_IDLE_VERSION " model 0x%X\n",
boot_cpu_data.x86_model);
intel_idle_cpuidle_devices = alloc_percpu(struct cpuidle_device);
if (!intel_idle_cpuidle_devices)
return -ENOMEM;
/*
* We don't know exactly what the host will do when we go idle, but as a worst estimate
* we can assume that the exit latency of the deepest host state will be hit for our
* deep (long duration) guest idle state.
* The same logic applies to the break even point for the long duration guest idle state.
* So lets copy these two properties from the table we found for the host CPU type.
*/
matchup_vm_state_with_baremetal();
intel_idle_cpuidle_driver_init(&intel_idle_driver);
retval = cpuidle_register_driver(&intel_idle_driver);
if (retval) {
struct cpuidle_driver *drv = cpuidle_get_driver();
printk(KERN_DEBUG pr_fmt("intel_idle yielding to %s\n"),
drv ? drv->name : "none");
goto init_driver_fail;
}
retval = cpuhp_setup_state(CPUHP_AP_ONLINE_DYN, "idle/intel:online",
intel_idle_cpu_online, NULL);
if (retval < 0)
goto hp_setup_fail;
return 0;
hp_setup_fail:
intel_idle_cpuidle_devices_uninit();
cpuidle_unregister_driver(&intel_idle_driver);
init_driver_fail:
free_percpu(intel_idle_cpuidle_devices);
return retval;
}
static int __init intel_idle_init(void)
{
const struct x86_cpu_id *id;
@ -2059,6 +2242,8 @@ static int __init intel_idle_init(void)
id = x86_match_cpu(intel_idle_ids);
if (id) {
if (!boot_cpu_has(X86_FEATURE_MWAIT)) {
if (boot_cpu_has(X86_FEATURE_HYPERVISOR))
return intel_idle_vminit(id);
pr_debug("Please enable MWAIT in BIOS SETUP\n");
return -ENODEV;
}

View File

@ -307,7 +307,8 @@ enum acpi_preferred_pm_profiles {
PM_SOHO_SERVER = 5,
PM_APPLIANCE_PC = 6,
PM_PERFORMANCE_SERVER = 7,
PM_TABLET = 8
PM_TABLET = 8,
NR_PM_PROFILES = 9
};
/* Values for sleep_status and sleep_control registers (V5+ FADT) */

View File

@ -94,7 +94,8 @@ struct amd_cpudata {
* enum amd_pstate_mode - driver working mode of amd pstate
*/
enum amd_pstate_mode {
AMD_PSTATE_DISABLE = 0,
AMD_PSTATE_UNDEFINED = 0,
AMD_PSTATE_DISABLE,
AMD_PSTATE_PASSIVE,
AMD_PSTATE_ACTIVE,
AMD_PSTATE_GUIDED,
@ -102,6 +103,7 @@ enum amd_pstate_mode {
};
static const char * const amd_pstate_mode_string[] = {
[AMD_PSTATE_UNDEFINED] = "undefined",
[AMD_PSTATE_DISABLE] = "disable",
[AMD_PSTATE_PASSIVE] = "passive",
[AMD_PSTATE_ACTIVE] = "active",

View File

@ -340,7 +340,10 @@ struct cpufreq_driver {
/*
* ->fast_switch() replacement for drivers that use an internal
* representation of performance levels and can pass hints other than
* the target performance level to the hardware.
* the target performance level to the hardware. This can only be set
* if ->fast_switch is set too, because in those cases (under specific
* conditions) scale invariance can be disabled, which causes the
* schedutil governor to fall back to the latter.
*/
void (*adjust_perf)(unsigned int cpu,
unsigned long min_perf,