Merge branches 'pm-sleep', 'pm-domains' and 'pm-docs'

Merge changes related to system sleep, PM domains changes and power
management documentation changes for 5.18-rc1:

 - Fix load_image_and_restore() error path (Ye Bin).

 - Fix typos in comments in the system wakeup handling code (Tom Rix).

 - Clean up non-kernel-doc comments in hibernation code (Jiapeng
   Chong).

 - Fix __setup handler error handling in system-wide suspend and
   hibernation core code (Randy Dunlap).

 - Add device name to suspend_report_result() (Youngjin Jang).

 - Make virtual guests honour ACPI S4 hardware signature by
   default (David Woodhouse).

 - Block power off of a parent PM domain unless child is in deepest
   state (Ulf Hansson).

 - Use dev_err_probe() to simplify error handling for generic PM
   domains (Ahmad Fatoum).

 - Fix sleep-in-atomic bug caused by genpd_debug_remove() (Shawn Guo).

 - Document Intel uncore frequency scaling (Srinivas Pandruvada).

* pm-sleep:
  PM: hibernate: Honour ACPI hardware signature by default for virtual guests
  PM: sleep: Add device name to suspend_report_result()
  PM: suspend: fix return value of __setup handler
  PM: hibernate: fix __setup handler error handling
  PM: hibernate: Clean up non-kernel-doc comments
  PM: sleep: wakeup: Fix typos in comments
  PM: hibernate: fix load_image_and_restore() error path

* pm-domains:
  PM: domains: Fix sleep-in-atomic bug caused by genpd_debug_remove()
  PM: domains: use dev_err_probe() to simplify error handling
  PM: domains: Prevent power off for parent unless child is in deepest state

* pm-docs:
  Documentation: admin-guide: pm: Document uncore frequency scaling
This commit is contained in:
Rafael J. Wysocki 2022-03-18 18:29:21 +01:00
commit dfad78e07e
16 changed files with 146 additions and 59 deletions

View File

@ -0,0 +1,60 @@
.. SPDX-License-Identifier: GPL-2.0
.. include:: <isonum.txt>
==============================
Intel Uncore Frequency Scaling
==============================
:Copyright: |copy| 2022 Intel Corporation
:Author: Srinivas Pandruvada <srinivas.pandruvada@linux.intel.com>
Introduction
------------
The uncore can consume a significant amount of power in Intel's Xeon servers
based on the workload characteristics. To optimize the total power and improve
overall performance, SoCs have internal algorithms for scaling uncore frequency.
These algorithms monitor workload usage of uncore and set a desirable frequency.
It is possible that users have different expectations of uncore performance and
want to have control over it. The objective is similar to allowing users to set
the scaling min/max frequencies via cpufreq sysfs to improve CPU performance.
Users may have some latency sensitive workloads where they do not want any
change to uncore frequency. Also, users may have workloads which require
different core and uncore performance at distinct phases and they may want to
use both cpufreq and the uncore scaling interface to distribute power and
improve overall performance.
Sysfs Interface
---------------
To control uncore frequency, a sysfs interface is provided in the directory:
`/sys/devices/system/cpu/intel_uncore_frequency/`.
There is one directory for each package and die combination as the scope of
uncore scaling control is per die in multiple die/package SoCs or per
package for single die per package SoCs. The name represents the
scope of control. For example: 'package_00_die_00' is for package id 0 and
die 0.
Each package_*_die_* contains the following attributes:
``initial_max_freq_khz``
Out of reset, this attribute represents the maximum possible frequency.
This is a read-only attribute. If users adjust max_freq_khz,
they can always go back to maximum using the value from this attribute.
``initial_min_freq_khz``
Out of reset, this attribute represents the minimum possible frequency.
This is a read-only attribute. If users adjust min_freq_khz,
they can always go back to minimum using the value from this attribute.
``max_freq_khz``
This attribute is used to set the maximum uncore frequency.
``min_freq_khz``
This attribute is used to set the minimum uncore frequency.
``current_freq_khz``
This attribute is used to get the current uncore frequency.

View File

@ -15,3 +15,4 @@ Working-State Power Management
cpufreq_drivers
intel_epb
intel-speed-select
intel_uncore_frequency_scaling

View File

@ -15,6 +15,7 @@
#include <asm/desc.h>
#include <asm/cacheflush.h>
#include <asm/realmode.h>
#include <asm/hypervisor.h>
#include <linux/ftrace.h>
#include "../../realmode/rm/wakeup.h"
@ -140,9 +141,9 @@ static int __init acpi_sleep_setup(char *str)
acpi_realmode_flags |= 4;
#ifdef CONFIG_HIBERNATION
if (strncmp(str, "s4_hwsig", 8) == 0)
acpi_check_s4_hw_signature(1);
acpi_check_s4_hw_signature = 1;
if (strncmp(str, "s4_nohwsig", 10) == 0)
acpi_check_s4_hw_signature(0);
acpi_check_s4_hw_signature = 0;
#endif
if (strncmp(str, "nonvs", 5) == 0)
acpi_nvs_nosave();
@ -160,3 +161,21 @@ static int __init acpi_sleep_setup(char *str)
}
__setup("acpi_sleep=", acpi_sleep_setup);
#if defined(CONFIG_HIBERNATION) && defined(CONFIG_HYPERVISOR_GUEST)
static int __init init_s4_sigcheck(void)
{
	/*
	 * A value other than the -1 default means the setting was already
	 * chosen (acpi_sleep=s4_hwsig or acpi_sleep=s4_nohwsig); keep it.
	 */
	if (acpi_check_s4_hw_signature != -1)
		return 0;

	/* Not running on a hypervisor: keep the default behaviour. */
	if (hypervisor_is_type(X86_HYPER_NATIVE))
		return 0;

	/*
	 * Running on a hypervisor: honour the ACPI specification by
	 * default and trigger a clean reboot when the hardware
	 * signature in FACS is changed after hibernation.
	 */
	acpi_check_s4_hw_signature = 1;
	return 0;
}
/* This must happen before acpi_init() which is a subsys initcall */
arch_initcall(init_s4_sigcheck);
#endif

View File

@ -869,12 +869,7 @@ static inline void acpi_sleep_syscore_init(void) {}
#ifdef CONFIG_HIBERNATION
static unsigned long s4_hardware_signature;
static struct acpi_table_facs *facs;
static int sigcheck = -1; /* Default behaviour is just to warn */
void __init acpi_check_s4_hw_signature(int check)
{
sigcheck = check;
}
int acpi_check_s4_hw_signature = -1; /* Default behaviour is just to warn */
static int acpi_hibernation_begin(pm_message_t stage)
{
@ -999,7 +994,7 @@ static void acpi_sleep_hibernate_setup(void)
hibernation_set_ops(old_suspend_ordering ?
&acpi_hibernation_ops_old : &acpi_hibernation_ops);
sleep_states[ACPI_STATE_S4] = 1;
if (!sigcheck)
if (!acpi_check_s4_hw_signature)
return;
acpi_get_table(ACPI_SIG_FACS, 1, (struct acpi_table_header **)&facs);
@ -1011,7 +1006,7 @@ static void acpi_sleep_hibernate_setup(void)
*/
s4_hardware_signature = facs->hardware_signature;
if (sigcheck > 0) {
if (acpi_check_s4_hw_signature > 0) {
/*
* If we're actually obeying the ACPI specification
* then the signature is written out as part of the

View File

@ -636,6 +636,18 @@ static int genpd_power_off(struct generic_pm_domain *genpd, bool one_dev_on,
atomic_read(&genpd->sd_count) > 0)
return -EBUSY;
/*
* The children must be in their deepest (powered-off) states to allow
* the parent to be powered off. Note that there's no need for
* additional locking, as powering on a child requires the parent's
* lock to be acquired first.
*/
list_for_each_entry(link, &genpd->parent_links, parent_node) {
struct generic_pm_domain *child = link->child;
if (child->state_idx < child->state_count - 1)
return -EBUSY;
}
list_for_each_entry(pdd, &genpd->dev_list, list_node) {
enum pm_qos_flags_status stat;
@ -1073,6 +1085,13 @@ static void genpd_sync_power_off(struct generic_pm_domain *genpd, bool use_lock,
|| atomic_read(&genpd->sd_count) > 0)
return;
/* Check that the children are in their deepest (powered-off) state. */
list_for_each_entry(link, &genpd->parent_links, parent_node) {
struct generic_pm_domain *child = link->child;
if (child->state_idx < child->state_count - 1)
return;
}
/* Choose the deepest state when suspending */
genpd->state_idx = genpd->state_count - 1;
if (_genpd_power_off(genpd, false))
@ -2058,9 +2077,9 @@ static int genpd_remove(struct generic_pm_domain *genpd)
kfree(link);
}
genpd_debug_remove(genpd);
list_del(&genpd->gpd_list_node);
genpd_unlock(genpd);
genpd_debug_remove(genpd);
cancel_work_sync(&genpd->power_off_work);
if (genpd_is_cpu_domain(genpd))
free_cpumask_var(genpd->cpus);
@ -2248,12 +2267,8 @@ int of_genpd_add_provider_simple(struct device_node *np,
/* Parse genpd OPP table */
if (genpd->set_performance_state) {
ret = dev_pm_opp_of_add_table(&genpd->dev);
if (ret) {
if (ret != -EPROBE_DEFER)
dev_err(&genpd->dev, "Failed to add OPP table: %d\n",
ret);
return ret;
}
if (ret)
return dev_err_probe(&genpd->dev, ret, "Failed to add OPP table\n");
/*
* Save table for faster processing while setting performance
@ -2312,9 +2327,8 @@ int of_genpd_add_provider_onecell(struct device_node *np,
if (genpd->set_performance_state) {
ret = dev_pm_opp_of_add_table_indexed(&genpd->dev, i);
if (ret) {
if (ret != -EPROBE_DEFER)
dev_err(&genpd->dev, "Failed to add OPP table for index %d: %d\n",
i, ret);
dev_err_probe(&genpd->dev, ret,
"Failed to add OPP table for index %d\n", i);
goto error;
}
@ -2672,12 +2686,8 @@ static int __genpd_dev_pm_attach(struct device *dev, struct device *base_dev,
ret = genpd_add_device(pd, dev, base_dev);
mutex_unlock(&gpd_list_lock);
if (ret < 0) {
if (ret != -EPROBE_DEFER)
dev_err(dev, "failed to add to PM domain %s: %d",
pd->name, ret);
return ret;
}
if (ret < 0)
return dev_err_probe(dev, ret, "failed to add to PM domain %s\n", pd->name);
dev->pm_domain->detach = genpd_dev_pm_detach;
dev->pm_domain->sync = genpd_dev_pm_sync;

View File

@ -485,7 +485,7 @@ static int dpm_run_callback(pm_callback_t cb, struct device *dev,
trace_device_pm_callback_start(dev, info, state.event);
error = cb(dev);
trace_device_pm_callback_end(dev, error);
suspend_report_result(cb, error);
suspend_report_result(dev, cb, error);
initcall_debug_report(dev, calltime, cb, error);
@ -1568,7 +1568,7 @@ static int legacy_suspend(struct device *dev, pm_message_t state,
trace_device_pm_callback_start(dev, info, state.event);
error = cb(dev, state);
trace_device_pm_callback_end(dev, error);
suspend_report_result(cb, error);
suspend_report_result(dev, cb, error);
initcall_debug_report(dev, calltime, cb, error);
@ -1855,7 +1855,7 @@ unlock:
device_unlock(dev);
if (ret < 0) {
suspend_report_result(callback, ret);
suspend_report_result(dev, callback, ret);
pm_runtime_put(dev);
return ret;
}
@ -1960,10 +1960,10 @@ int dpm_suspend_start(pm_message_t state)
}
EXPORT_SYMBOL_GPL(dpm_suspend_start);
void __suspend_report_result(const char *function, void *fn, int ret)
void __suspend_report_result(const char *function, struct device *dev, void *fn, int ret)
{
if (ret)
pr_err("%s(): %pS returns %d\n", function, fn, ret);
dev_err(dev, "%s(): %pS returns %d\n", function, fn, ret);
}
EXPORT_SYMBOL_GPL(__suspend_report_result);

View File

@ -289,7 +289,7 @@ EXPORT_SYMBOL_GPL(dev_pm_disable_wake_irq);
*
* Enables wakeirq conditionally. We need to enable wake-up interrupt
* lazily on the first rpm_suspend(). This is needed as the consumer device
* starts in RPM_SUSPENDED state, and the the first pm_runtime_get() would
* starts in RPM_SUSPENDED state, and the first pm_runtime_get() would
* otherwise try to disable already disabled wakeirq. The wake-up interrupt
* starts disabled with IRQ_NOAUTOEN set.
*

View File

@ -587,7 +587,7 @@ static bool wakeup_source_not_registered(struct wakeup_source *ws)
* @ws: Wakeup source to handle.
*
* Update the @ws' statistics and, if @ws has just been activated, notify the PM
* core of the event by incrementing the counter of of wakeup events being
* core of the event by incrementing the counter of the wakeup events being
* processed.
*/
static void wakeup_source_activate(struct wakeup_source *ws)
@ -733,7 +733,7 @@ static void wakeup_source_deactivate(struct wakeup_source *ws)
/*
* Increment the counter of registered wakeup events and decrement the
* couter of wakeup events in progress simultaneously.
* counter of wakeup events in progress simultaneously.
*/
cec = atomic_add_return(MAX_IN_PROGRESS, &combined_event_count);
trace_wakeup_source_deactivate(ws->name, cec);

View File

@ -596,7 +596,7 @@ static int pci_legacy_suspend(struct device *dev, pm_message_t state)
int error;
error = drv->suspend(pci_dev, state);
suspend_report_result(drv->suspend, error);
suspend_report_result(dev, drv->suspend, error);
if (error)
return error;
@ -775,7 +775,7 @@ static int pci_pm_suspend(struct device *dev)
int error;
error = pm->suspend(dev);
suspend_report_result(pm->suspend, error);
suspend_report_result(dev, pm->suspend, error);
if (error)
return error;
@ -821,7 +821,7 @@ static int pci_pm_suspend_noirq(struct device *dev)
int error;
error = pm->suspend_noirq(dev);
suspend_report_result(pm->suspend_noirq, error);
suspend_report_result(dev, pm->suspend_noirq, error);
if (error)
return error;
@ -1010,7 +1010,7 @@ static int pci_pm_freeze(struct device *dev)
int error;
error = pm->freeze(dev);
suspend_report_result(pm->freeze, error);
suspend_report_result(dev, pm->freeze, error);
if (error)
return error;
}
@ -1030,7 +1030,7 @@ static int pci_pm_freeze_noirq(struct device *dev)
int error;
error = pm->freeze_noirq(dev);
suspend_report_result(pm->freeze_noirq, error);
suspend_report_result(dev, pm->freeze_noirq, error);
if (error)
return error;
}
@ -1116,7 +1116,7 @@ static int pci_pm_poweroff(struct device *dev)
int error;
error = pm->poweroff(dev);
suspend_report_result(pm->poweroff, error);
suspend_report_result(dev, pm->poweroff, error);
if (error)
return error;
}
@ -1154,7 +1154,7 @@ static int pci_pm_poweroff_noirq(struct device *dev)
int error;
error = pm->poweroff_noirq(dev);
suspend_report_result(pm->poweroff_noirq, error);
suspend_report_result(dev, pm->poweroff_noirq, error);
if (error)
return error;
}

View File

@ -171,7 +171,7 @@ static int __pnp_bus_suspend(struct device *dev, pm_message_t state)
if (pnp_drv->driver.pm && pnp_drv->driver.pm->suspend) {
error = pnp_drv->driver.pm->suspend(dev);
suspend_report_result(pnp_drv->driver.pm->suspend, error);
suspend_report_result(dev, pnp_drv->driver.pm->suspend, error);
if (error)
return error;
}

View File

@ -446,7 +446,7 @@ static int suspend_common(struct device *dev, bool do_wakeup)
HCD_WAKEUP_PENDING(hcd->shared_hcd))
return -EBUSY;
retval = hcd->driver->pci_suspend(hcd, do_wakeup);
suspend_report_result(hcd->driver->pci_suspend, retval);
suspend_report_result(dev, hcd->driver->pci_suspend, retval);
/* Check again in case wakeup raced with pci_suspend */
if ((retval == 0 && do_wakeup && HCD_WAKEUP_PENDING(hcd)) ||
@ -556,7 +556,7 @@ static int hcd_pci_suspend_noirq(struct device *dev)
dev_dbg(dev, "--> PCI %s\n",
pci_power_name(pci_dev->current_state));
} else {
suspend_report_result(pci_prepare_to_sleep, retval);
suspend_report_result(dev, pci_prepare_to_sleep, retval);
return retval;
}

View File

@ -526,7 +526,7 @@ acpi_status acpi_release_memory(acpi_handle handle, struct resource *res,
int acpi_resources_are_enforced(void);
#ifdef CONFIG_HIBERNATION
void __init acpi_check_s4_hw_signature(int check);
extern int acpi_check_s4_hw_signature;
#endif
#ifdef CONFIG_PM_SLEEP

View File

@ -770,11 +770,11 @@ extern int dpm_suspend_late(pm_message_t state);
extern int dpm_suspend(pm_message_t state);
extern int dpm_prepare(pm_message_t state);
extern void __suspend_report_result(const char *function, void *fn, int ret);
extern void __suspend_report_result(const char *function, struct device *dev, void *fn, int ret);
#define suspend_report_result(fn, ret) \
#define suspend_report_result(dev, fn, ret) \
do { \
__suspend_report_result(__func__, fn, ret); \
__suspend_report_result(__func__, dev, fn, ret); \
} while (0)
extern int device_pm_wait_for_dev(struct device *sub, struct device *dev);
@ -814,7 +814,7 @@ static inline int dpm_suspend_start(pm_message_t state)
return 0;
}
#define suspend_report_result(fn, ret) do {} while (0)
#define suspend_report_result(dev, fn, ret) do {} while (0)
static inline int device_pm_wait_for_dev(struct device *a, struct device *b)
{

View File

@ -689,8 +689,10 @@ static int load_image_and_restore(void)
lock_device_hotplug();
error = create_basic_memory_bitmaps();
if (error)
if (error) {
swsusp_close(FMODE_READ | FMODE_EXCL);
goto Unlock;
}
error = swsusp_read(&flags);
swsusp_close(FMODE_READ | FMODE_EXCL);
@ -1328,7 +1330,7 @@ static int __init resumedelay_setup(char *str)
int rc = kstrtouint(str, 0, &resume_delay);
if (rc)
return rc;
pr_warn("resumedelay: bad option string '%s'\n", str);
return 1;
}

View File

@ -157,22 +157,22 @@ static int __init setup_test_suspend(char *value)
value++;
suspend_type = strsep(&value, ",");
if (!suspend_type)
return 0;
return 1;
repeat = strsep(&value, ",");
if (repeat) {
if (kstrtou32(repeat, 0, &test_repeat_count_max))
return 0;
return 1;
}
for (i = PM_SUSPEND_MIN; i < PM_SUSPEND_MAX; i++)
if (!strcmp(pm_labels[i], suspend_type)) {
test_state_label = pm_labels[i];
return 0;
return 1;
}
printk(warn_bad_state, suspend_type);
return 0;
return 1;
}
__setup("test_suspend", setup_test_suspend);

View File

@ -89,7 +89,7 @@ struct swap_map_page_list {
struct swap_map_page_list *next;
};
/**
/*
* The swap_map_handle structure is used for handling swap in
* a file-alike way
*/
@ -117,7 +117,7 @@ struct swsusp_header {
static struct swsusp_header *swsusp_header;
/**
/*
* The following functions are used for tracing the allocated
* swap pages, so that they can be freed in case of an error.
*/
@ -171,7 +171,7 @@ static int swsusp_extents_insert(unsigned long swap_offset)
return 0;
}
/**
/*
* alloc_swapdev_block - allocate a swap page and register that it has
* been allocated, so that it can be freed in case of an error.
*/
@ -190,7 +190,7 @@ sector_t alloc_swapdev_block(int swap)
return 0;
}
/**
/*
* free_all_swap_pages - free swap pages allocated for saving image data.
* It also frees the extents used to register which swap entries had been
* allocated.