From e23feb16685a8d1c62aa5bba7ebcddf4ba57ffcb Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Fri, 11 Oct 2013 16:54:55 -0700 Subject: [PATCH 1/7] PowerCap: Documentation Added power cap framework documentation. This explains the use of power capping framework, sysfs and programming interface. There are two documents: - Documentation/power/powercap/powercap.txt : Explains use case and APIs. - Documentation/ABI/testing/sysfs-class-powercap: Explains ABIs. Signed-off-by: Srinivas Pandruvada Signed-off-by: Jacob Pan Signed-off-by: Arjan van de Ven Reviewed-by: Rafael J. Wysocki Reviewed-by: Len Brown Signed-off-by: Rafael J. Wysocki --- .../ABI/testing/sysfs-class-powercap | 152 +++++++++++ Documentation/power/powercap/powercap.txt | 236 ++++++++++++++++++ 2 files changed, 388 insertions(+) create mode 100644 Documentation/ABI/testing/sysfs-class-powercap create mode 100644 Documentation/power/powercap/powercap.txt diff --git a/Documentation/ABI/testing/sysfs-class-powercap b/Documentation/ABI/testing/sysfs-class-powercap new file mode 100644 index 000000000000..db3b3ff70d84 --- /dev/null +++ b/Documentation/ABI/testing/sysfs-class-powercap @@ -0,0 +1,152 @@ +What: /sys/class/powercap/ +Date: September 2013 +KernelVersion: 3.13 +Contact: linux-pm@vger.kernel.org +Description: + The powercap/ class sub directory belongs to the power cap + subsystem. Refer to + Documentation/power/powercap/powercap.txt for details. + +What: /sys/class/powercap/<control type> +Date: September 2013 +KernelVersion: 3.13 +Contact: linux-pm@vger.kernel.org +Description: + A <control type> is a unique name under /sys/class/powercap. + Here <control type> determines how the power is going to be + controlled. A <control type> can contain multiple power zones. + +What: /sys/class/powercap/<control type>/enabled +Date: September 2013 +KernelVersion: 3.13 +Contact: linux-pm@vger.kernel.org +Description: + This allows enabling or disabling power capping for a "control type". + This status affects every power zone using this "control type". 
+ +What: /sys/class/powercap/<control type>/<power zone> +Date: September 2013 +KernelVersion: 3.13 +Contact: linux-pm@vger.kernel.org +Description: + A power zone is a single device or a collection of devices, which can + be independently monitored and controlled. A power zone sysfs + entry is qualified with the name of the <control type>. + E.g. intel-rapl:0:1:1. + +What: /sys/class/powercap/<control type>/<power zone>/<child power zone> +Date: September 2013 +KernelVersion: 3.13 +Contact: linux-pm@vger.kernel.org +Description: + Power zones may be organized in a hierarchy in which child + power zones provide monitoring and control for a subset of + devices under the parent. For example, if there is a parent + power zone for a whole CPU package, each CPU core in it can + be a child power zone. + +What: /sys/class/powercap/.../<power zone>/name +Date: September 2013 +KernelVersion: 3.13 +Contact: linux-pm@vger.kernel.org +Description: + Specifies the name of this power zone. + +What: /sys/class/powercap/.../<power zone>/energy_uj +Date: September 2013 +KernelVersion: 3.13 +Contact: linux-pm@vger.kernel.org +Description: + Current energy counter in micro-joules. Write "0" to reset. + If the counter can not be reset, then this attribute is + read-only. + +What: /sys/class/powercap/.../<power zone>/max_energy_range_uj +Date: September 2013 +KernelVersion: 3.13 +Contact: linux-pm@vger.kernel.org +Description: + Range of the above energy counter in micro-joules. + + +What: /sys/class/powercap/.../<power zone>/power_uw +Date: September 2013 +KernelVersion: 3.13 +Contact: linux-pm@vger.kernel.org +Description: + Current power in micro-watts. + +What: /sys/class/powercap/.../<power zone>/max_power_range_uw +Date: September 2013 +KernelVersion: 3.13 +Contact: linux-pm@vger.kernel.org +Description: + Range of the above power value in micro-watts. + +What: /sys/class/powercap/.../<power zone>/constraint_X_name +Date: September 2013 +KernelVersion: 3.13 +Contact: linux-pm@vger.kernel.org +Description: + Each power zone can define one or more constraints. Each + constraint can have an optional name. 
Here "X" can have values + from 0 to max integer. + +What: /sys/class/powercap/.../<power zone>/constraint_X_power_limit_uw +Date: September 2013 +KernelVersion: 3.13 +Contact: linux-pm@vger.kernel.org +Description: + Power limit in micro-watts, which should be applicable for + the time window specified by "constraint_X_time_window_us". + Here "X" can have values from 0 to max integer. + +What: /sys/class/powercap/.../<power zone>/constraint_X_time_window_us +Date: September 2013 +KernelVersion: 3.13 +Contact: linux-pm@vger.kernel.org +Description: + Time window in micro-seconds. This is used along with + constraint_X_power_limit_uw to define a power constraint. + Here "X" can have values from 0 to max integer. + + +What: /sys/class/powercap/<control type>/.../constraint_X_max_power_uw +Date: September 2013 +KernelVersion: 3.13 +Contact: linux-pm@vger.kernel.org +Description: + Maximum allowed power in micro-watts for this constraint. + Here "X" can have values from 0 to max integer. + +What: /sys/class/powercap/<control type>/.../constraint_X_min_power_uw +Date: September 2013 +KernelVersion: 3.13 +Contact: linux-pm@vger.kernel.org +Description: + Minimum allowed power in micro-watts for this constraint. + Here "X" can have values from 0 to max integer. + +What: /sys/class/powercap/.../<power zone>/constraint_X_max_time_window_us +Date: September 2013 +KernelVersion: 3.13 +Contact: linux-pm@vger.kernel.org +Description: + Maximum allowed time window in micro-seconds for this + constraint. Here "X" can have values from 0 to max integer. + +What: /sys/class/powercap/.../<power zone>/constraint_X_min_time_window_us +Date: September 2013 +KernelVersion: 3.13 +Contact: linux-pm@vger.kernel.org +Description: + Minimum allowed time window in micro-seconds for this + constraint. Here "X" can have values from 0 to max integer. + +What: /sys/class/powercap/.../<power zone>/enabled +Date: September 2013 +KernelVersion: 3.13 +Contact: linux-pm@vger.kernel.org +Description: + This allows enabling or disabling power capping at power zone level. 
+ This applies to current power zone and its children. diff --git a/Documentation/power/powercap/powercap.txt b/Documentation/power/powercap/powercap.txt new file mode 100644 index 000000000000..1e6ef164e07a --- /dev/null +++ b/Documentation/power/powercap/powercap.txt @@ -0,0 +1,236 @@ +Power Capping Framework +================================== + +The power capping framework provides a consistent interface between the kernel +and the user space that allows power capping drivers to expose the settings to +user space in a uniform way. + +Terminology +========================= +The framework exposes power capping devices to user space via sysfs in the +form of a tree of objects. The objects at the root level of the tree represent +'control types', which correspond to different methods of power capping. For +example, the intel-rapl control type represents the Intel "Running Average +Power Limit" (RAPL) technology, whereas the 'idle-injection' control type +corresponds to the use of idle injection for controlling power. + +Power zones represent different parts of the system, which can be controlled and +monitored using the power capping method determined by the control type the +given zone belongs to. They each contain attributes for monitoring power, as +well as controls represented in the form of power constraints. If the parts of +the system represented by different power zones are hierarchical (that is, one +bigger part consists of multiple smaller parts that each have their own power +controls), those power zones may also be organized in a hierarchy with one +parent power zone containing multiple subzones and so on to reflect the power +control topology of the system. In that case, it is possible to apply power +capping to a set of devices together using the parent power zone and if more +fine grained control is required, it can be applied through the subzones. + + +Example sysfs interface tree: + +/sys/devices/virtual/powercap +??? intel-rapl + ??? 
intel-rapl:0 + ?   ??? constraint_0_name + ?   ??? constraint_0_power_limit_uw + ?   ??? constraint_0_time_window_us + ?   ??? constraint_1_name + ?   ??? constraint_1_power_limit_uw + ?   ??? constraint_1_time_window_us + ?   ??? device -> ../../intel-rapl + ?   ??? energy_uj + ?   ??? intel-rapl:0:0 + ?   ?   ??? constraint_0_name + ?   ?   ??? constraint_0_power_limit_uw + ?   ?   ??? constraint_0_time_window_us + ?   ?   ??? constraint_1_name + ?   ?   ??? constraint_1_power_limit_uw + ?   ?   ??? constraint_1_time_window_us + ?   ?   ??? device -> ../../intel-rapl:0 + ?   ?   ??? energy_uj + ?   ?   ??? max_energy_range_uj + ?   ?   ??? name + ?   ?   ??? enabled + ?   ?   ??? power + ?   ?   ?   ??? async + ?   ?   ?   [] + ?   ?   ??? subsystem -> ../../../../../../class/power_cap + ?   ?   ??? uevent + ?   ??? intel-rapl:0:1 + ?   ?   ??? constraint_0_name + ?   ?   ??? constraint_0_power_limit_uw + ?   ?   ??? constraint_0_time_window_us + ?   ?   ??? constraint_1_name + ?   ?   ??? constraint_1_power_limit_uw + ?   ?   ??? constraint_1_time_window_us + ?   ?   ??? device -> ../../intel-rapl:0 + ?   ?   ??? energy_uj + ?   ?   ??? max_energy_range_uj + ?   ?   ??? name + ?   ?   ??? enabled + ?   ?   ??? power + ?   ?   ?   ??? async + ?   ?   ?   [] + ?   ?   ??? subsystem -> ../../../../../../class/power_cap + ?   ?   ??? uevent + ?   ??? max_energy_range_uj + ?   ??? max_power_range_uw + ?   ??? name + ?   ??? enabled + ?   ??? power + ?   ?   ??? async + ?   ?   [] + ?   ??? subsystem -> ../../../../../class/power_cap + ?   ??? enabled + ?   ??? uevent + ??? intel-rapl:1 + ?   ??? constraint_0_name + ?   ??? constraint_0_power_limit_uw + ?   ??? constraint_0_time_window_us + ?   ??? constraint_1_name + ?   ??? constraint_1_power_limit_uw + ?   ??? constraint_1_time_window_us + ?   ??? device -> ../../intel-rapl + ?   ??? energy_uj + ?   ??? intel-rapl:1:0 + ?   ?   ??? constraint_0_name + ?   ?   ??? constraint_0_power_limit_uw + ?   ?   ??? 
constraint_0_time_window_us + ?   ?   ??? constraint_1_name + ?   ?   ??? constraint_1_power_limit_uw + ?   ?   ??? constraint_1_time_window_us + ?   ?   ??? device -> ../../intel-rapl:1 + ?   ?   ??? energy_uj + ?   ?   ??? max_energy_range_uj + ?   ?   ??? name + ?   ?   ??? enabled + ?   ?   ??? power + ?   ?   ?   ??? async + ?   ?   ?   [] + ?   ?   ??? subsystem -> ../../../../../../class/power_cap + ?   ?   ??? uevent + ?   ??? intel-rapl:1:1 + ?   ?   ??? constraint_0_name + ?   ?   ??? constraint_0_power_limit_uw + ?   ?   ??? constraint_0_time_window_us + ?   ?   ??? constraint_1_name + ?   ?   ??? constraint_1_power_limit_uw + ?   ?   ??? constraint_1_time_window_us + ?   ?   ??? device -> ../../intel-rapl:1 + ?   ?   ??? energy_uj + ?   ?   ??? max_energy_range_uj + ?   ?   ??? name + ?   ?   ??? enabled + ?   ?   ??? power + ?   ?   ?   ??? async + ?   ?   ?   [] + ?   ?   ??? subsystem -> ../../../../../../class/power_cap + ?   ?   ??? uevent + ?   ??? max_energy_range_uj + ?   ??? max_power_range_uw + ?   ??? name + ?   ??? enabled + ?   ??? power + ?   ?   ??? async + ?   ?   [] + ?   ??? subsystem -> ../../../../../class/power_cap + ?   ??? uevent + ??? power + ?   ??? async + ?   [] + ??? subsystem -> ../../../../class/power_cap + ??? enabled + ??? uevent + +The above example illustrates a case in which the Intel RAPL technology, +available in Intel® IA-64 and IA-32 Processor Architectures, is used. There is one +control type called intel-rapl which contains two power zones, intel-rapl:0 and +intel-rapl:1, representing CPU packages. Each of these power zones contains +two subzones, intel-rapl:j:0 and intel-rapl:j:1 (j = 0, 1), representing the +"core" and the "uncore" parts of the given CPU package, respectively. 
All of +the zones and subzones contain energy monitoring attributes (energy_uj, +max_energy_range_uj) and constraint attributes (constraint_*) allowing controls +to be applied (the constraints in the 'package' power zones apply to the whole +CPU packages and the subzone constraints only apply to the respective parts of +the given package individually). Since Intel RAPL doesn't provide an instantaneous +power value, there is no power_uw attribute. + +In addition to that, each power zone contains a name attribute, allowing the +part of the system represented by that zone to be identified. +For example: + +cat /sys/class/powercap/intel-rapl/intel-rapl:0/name +package-0 + +The Intel RAPL technology allows two constraints, short term and long term, +with two different time windows to be applied to each power zone. Thus for +each zone there are 2 attributes representing the constraint names, 2 power +limits and 2 attributes representing the sizes of the time windows. The +constraint_j_* attributes correspond to the jth constraint (j = 0, 1). + +For example: + constraint_0_name + constraint_0_power_limit_uw + constraint_0_time_window_us + constraint_1_name + constraint_1_power_limit_uw + constraint_1_time_window_us + +Power Zone Attributes +================================= +Monitoring attributes +---------------------- + +energy_uj (rw): Current energy counter in micro-joules. Write "0" to reset. +If the counter cannot be reset, then this attribute is read-only. + +max_energy_range_uj (ro): Range of the above energy counter in micro-joules. + +power_uw (ro): Current power in micro-watts. + +max_power_range_uw (ro): Range of the above power value in micro-watts. + +name (ro): Name of this power zone. + +It is possible that some domains have both power ranges and energy counter ranges; +however, only one is mandatory. 
+ +Constraints +---------------- +constraint_X_power_limit_uw (rw): Power limit in micro-watts, which should be +applicable for the time window specified by "constraint_X_time_window_us". + +constraint_X_time_window_us (rw): Time window in micro-seconds. + +constraint_X_name (ro): An optional name of the constraint. + +constraint_X_max_power_uw (ro): Maximum allowed power in micro-watts. + +constraint_X_min_power_uw (ro): Minimum allowed power in micro-watts. + +constraint_X_max_time_window_us (ro): Maximum allowed time window in micro-seconds. + +constraint_X_min_time_window_us (ro): Minimum allowed time window in micro-seconds. + +Except for power_limit_uw and time_window_us, all other fields are optional. + +Common zone and control type attributes +---------------------------------------- +enabled (rw): Enable/Disable controls at zone level or for all zones using +a control type. + +Power Cap Client Driver Interface +================================== +The API summary: + +Call powercap_register_control_type() to register a control type object. +Call powercap_register_zone() to register a power zone (under a given +control type), either as a top-level power zone or as a subzone of another +power zone registered earlier. +The number of constraints in a power zone and the corresponding callbacks have +to be defined prior to calling powercap_register_zone() to register that zone. + +To free a power zone, call powercap_unregister_zone(). +To free a control type object, call powercap_unregister_control_type(). +Detailed API documentation can be generated using kernel-doc on include/linux/powercap.h. From 75d2364ea0cab3a95be3f8d1f8dabd20ac4b1b2a Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Fri, 11 Oct 2013 16:54:56 -0700 Subject: [PATCH 2/7] PowerCap: Add class driver The power capping framework provides a consistent interface between the kernel and user space that allows power capping drivers to expose their settings to user space in a uniform way. 
The overall design of the framework is described in the documentation added by the previous patch in this series. Signed-off-by: Srinivas Pandruvada Signed-off-by: Jacob Pan Reviewed-by: Rafael J. Wysocki Signed-off-by: Rafael J. Wysocki --- drivers/powercap/Kconfig | 19 + drivers/powercap/Makefile | 1 + drivers/powercap/powercap_sys.c | 683 ++++++++++++++++++++++++++++++++ include/linux/powercap.h | 325 +++++++++++++++ 4 files changed, 1028 insertions(+) create mode 100644 drivers/powercap/Kconfig create mode 100644 drivers/powercap/Makefile create mode 100644 drivers/powercap/powercap_sys.c create mode 100644 include/linux/powercap.h diff --git a/drivers/powercap/Kconfig b/drivers/powercap/Kconfig new file mode 100644 index 000000000000..a37055eb5ebc --- /dev/null +++ b/drivers/powercap/Kconfig @@ -0,0 +1,19 @@ +# +# Generic power capping sysfs interface configuration +# + +menuconfig POWERCAP + bool "Generic powercap sysfs driver" + help + The power capping sysfs interface allows kernel subsystems to expose power + capping settings to user space in a consistent way. Usually, it consists + of multiple control types that determine which settings may be exposed and + power zones representing parts of the system that can be subject to power + capping. + + If you want this code to be compiled in, say Y here. + +if POWERCAP +# Client driver configurations go here. + +endif diff --git a/drivers/powercap/Makefile b/drivers/powercap/Makefile new file mode 100644 index 000000000000..6defbc8dc4bf --- /dev/null +++ b/drivers/powercap/Makefile @@ -0,0 +1 @@ +obj-$(CONFIG_POWERCAP) += powercap_sys.o diff --git a/drivers/powercap/powercap_sys.c b/drivers/powercap/powercap_sys.c new file mode 100644 index 000000000000..c22fa4c78eaa --- /dev/null +++ b/drivers/powercap/powercap_sys.c @@ -0,0 +1,683 @@ +/* + * Power capping class + * Copyright (c) 2013, Intel Corporation. 
+ * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc. + * + */ + +#include +#include +#include +#include +#include + +#define to_powercap_zone(n) container_of(n, struct powercap_zone, dev) +#define to_powercap_control_type(n) \ + container_of(n, struct powercap_control_type, dev) + +/* Power zone show function */ +#define define_power_zone_show(_attr) \ +static ssize_t _attr##_show(struct device *dev, \ + struct device_attribute *dev_attr,\ + char *buf) \ +{ \ + u64 value; \ + ssize_t len = -EINVAL; \ + struct powercap_zone *power_zone = to_powercap_zone(dev); \ + \ + if (power_zone->ops->get_##_attr) { \ + if (!power_zone->ops->get_##_attr(power_zone, &value)) \ + len = sprintf(buf, "%lld\n", value); \ + } \ + \ + return len; \ +} + +/* The only meaningful input is 0 (reset), others are silently ignored */ +#define define_power_zone_store(_attr) \ +static ssize_t _attr##_store(struct device *dev,\ + struct device_attribute *dev_attr, \ + const char *buf, size_t count) \ +{ \ + int err; \ + struct powercap_zone *power_zone = to_powercap_zone(dev); \ + u64 value; \ + \ + err = kstrtoull(buf, 10, &value); \ + if (err) \ + return -EINVAL; \ + if (value) \ + return count; \ + if (power_zone->ops->reset_##_attr) { \ + if (!power_zone->ops->reset_##_attr(power_zone)) \ + return count; \ + } \ + \ + return -EINVAL; \ +} + +/* Power zone constraint show function */ +#define define_power_zone_constraint_show(_attr) \ +static 
ssize_t show_constraint_##_attr(struct device *dev, \ + struct device_attribute *dev_attr,\ + char *buf) \ +{ \ + u64 value; \ + ssize_t len = -ENODATA; \ + struct powercap_zone *power_zone = to_powercap_zone(dev); \ + int id; \ + struct powercap_zone_constraint *pconst;\ + \ + if (!sscanf(dev_attr->attr.name, "constraint_%d_", &id)) \ + return -EINVAL; \ + if (id >= power_zone->const_id_cnt) \ + return -EINVAL; \ + pconst = &power_zone->constraints[id]; \ + if (pconst && pconst->ops && pconst->ops->get_##_attr) { \ + if (!pconst->ops->get_##_attr(power_zone, id, &value)) \ + len = sprintf(buf, "%lld\n", value); \ + } \ + \ + return len; \ +} + +/* Power zone constraint store function */ +#define define_power_zone_constraint_store(_attr) \ +static ssize_t store_constraint_##_attr(struct device *dev,\ + struct device_attribute *dev_attr, \ + const char *buf, size_t count) \ +{ \ + int err; \ + u64 value; \ + struct powercap_zone *power_zone = to_powercap_zone(dev); \ + int id; \ + struct powercap_zone_constraint *pconst;\ + \ + if (!sscanf(dev_attr->attr.name, "constraint_%d_", &id)) \ + return -EINVAL; \ + if (id >= power_zone->const_id_cnt) \ + return -EINVAL; \ + pconst = &power_zone->constraints[id]; \ + err = kstrtoull(buf, 10, &value); \ + if (err) \ + return -EINVAL; \ + if (pconst && pconst->ops && pconst->ops->set_##_attr) { \ + if (!pconst->ops->set_##_attr(power_zone, id, value)) \ + return count; \ + } \ + \ + return -ENODATA; \ +} + +/* Power zone information callbacks */ +define_power_zone_show(power_uw); +define_power_zone_show(max_power_range_uw); +define_power_zone_show(energy_uj); +define_power_zone_store(energy_uj); +define_power_zone_show(max_energy_range_uj); + +/* Power zone attributes */ +static DEVICE_ATTR_RO(max_power_range_uw); +static DEVICE_ATTR_RO(power_uw); +static DEVICE_ATTR_RO(max_energy_range_uj); +static DEVICE_ATTR_RW(energy_uj); + +/* Power zone constraint attributes callbacks */ 
+define_power_zone_constraint_show(power_limit_uw); +define_power_zone_constraint_store(power_limit_uw); +define_power_zone_constraint_show(time_window_us); +define_power_zone_constraint_store(time_window_us); +define_power_zone_constraint_show(max_power_uw); +define_power_zone_constraint_show(min_power_uw); +define_power_zone_constraint_show(max_time_window_us); +define_power_zone_constraint_show(min_time_window_us); + +/* For one time seeding of constraint device attributes */ +struct powercap_constraint_attr { + struct device_attribute power_limit_attr; + struct device_attribute time_window_attr; + struct device_attribute max_power_attr; + struct device_attribute min_power_attr; + struct device_attribute max_time_window_attr; + struct device_attribute min_time_window_attr; + struct device_attribute name_attr; +}; + +static struct powercap_constraint_attr + constraint_attrs[MAX_CONSTRAINTS_PER_ZONE]; + +/* A list of powercap control_types */ +static LIST_HEAD(powercap_cntrl_list); +/* Mutex to protect list of powercap control_types */ +static DEFINE_MUTEX(powercap_cntrl_list_lock); + +#define POWERCAP_CONSTRAINT_NAME_LEN 30 /* Some limit to avoid overflow */ +static ssize_t show_constraint_name(struct device *dev, + struct device_attribute *dev_attr, + char *buf) +{ + const char *name; + struct powercap_zone *power_zone = to_powercap_zone(dev); + int id; + ssize_t len = -ENODATA; + struct powercap_zone_constraint *pconst; + + if (!sscanf(dev_attr->attr.name, "constraint_%d_", &id)) + return -EINVAL; + if (id >= power_zone->const_id_cnt) + return -EINVAL; + pconst = &power_zone->constraints[id]; + + if (pconst && pconst->ops && pconst->ops->get_name) { + name = pconst->ops->get_name(power_zone, id); + if (name) { + snprintf(buf, POWERCAP_CONSTRAINT_NAME_LEN, + "%s\n", name); + buf[POWERCAP_CONSTRAINT_NAME_LEN] = '\0'; + len = strlen(buf); + } + } + + return len; +} + +static int create_constraint_attribute(int id, const char *name, + int mode, + struct 
device_attribute *dev_attr, + ssize_t (*show)(struct device *, + struct device_attribute *, char *), + ssize_t (*store)(struct device *, + struct device_attribute *, + const char *, size_t) + ) +{ + + dev_attr->attr.name = kasprintf(GFP_KERNEL, "constraint_%d_%s", + id, name); + if (!dev_attr->attr.name) + return -ENOMEM; + dev_attr->attr.mode = mode; + dev_attr->show = show; + dev_attr->store = store; + + return 0; +} + +static void free_constraint_attributes(void) +{ + int i; + + for (i = 0; i < MAX_CONSTRAINTS_PER_ZONE; ++i) { + kfree(constraint_attrs[i].power_limit_attr.attr.name); + kfree(constraint_attrs[i].time_window_attr.attr.name); + kfree(constraint_attrs[i].name_attr.attr.name); + kfree(constraint_attrs[i].max_power_attr.attr.name); + kfree(constraint_attrs[i].min_power_attr.attr.name); + kfree(constraint_attrs[i].max_time_window_attr.attr.name); + kfree(constraint_attrs[i].min_time_window_attr.attr.name); + } +} + +static int seed_constraint_attributes(void) +{ + int i; + int ret; + + for (i = 0; i < MAX_CONSTRAINTS_PER_ZONE; ++i) { + ret = create_constraint_attribute(i, "power_limit_uw", + S_IWUSR | S_IRUGO, + &constraint_attrs[i].power_limit_attr, + show_constraint_power_limit_uw, + store_constraint_power_limit_uw); + if (ret) + goto err_alloc; + ret = create_constraint_attribute(i, "time_window_us", + S_IWUSR | S_IRUGO, + &constraint_attrs[i].time_window_attr, + show_constraint_time_window_us, + store_constraint_time_window_us); + if (ret) + goto err_alloc; + ret = create_constraint_attribute(i, "name", S_IRUGO, + &constraint_attrs[i].name_attr, + show_constraint_name, + NULL); + if (ret) + goto err_alloc; + ret = create_constraint_attribute(i, "max_power_uw", S_IRUGO, + &constraint_attrs[i].max_power_attr, + show_constraint_max_power_uw, + NULL); + if (ret) + goto err_alloc; + ret = create_constraint_attribute(i, "min_power_uw", S_IRUGO, + &constraint_attrs[i].min_power_attr, + show_constraint_min_power_uw, + NULL); + if (ret) + goto err_alloc; + 
ret = create_constraint_attribute(i, "max_time_window_us", + S_IRUGO, + &constraint_attrs[i].max_time_window_attr, + show_constraint_max_time_window_us, + NULL); + if (ret) + goto err_alloc; + ret = create_constraint_attribute(i, "min_time_window_us", + S_IRUGO, + &constraint_attrs[i].min_time_window_attr, + show_constraint_min_time_window_us, + NULL); + if (ret) + goto err_alloc; + + } + + return 0; + +err_alloc: + free_constraint_attributes(); + + return ret; +} + +static int create_constraints(struct powercap_zone *power_zone, + int nr_constraints, + struct powercap_zone_constraint_ops *const_ops) +{ + int i; + int ret = 0; + int count; + struct powercap_zone_constraint *pconst; + + if (!power_zone || !const_ops || !const_ops->get_power_limit_uw || + !const_ops->set_power_limit_uw || + !const_ops->get_time_window_us || + !const_ops->set_time_window_us) + return -EINVAL; + + count = power_zone->zone_attr_count; + for (i = 0; i < nr_constraints; ++i) { + pconst = &power_zone->constraints[i]; + pconst->ops = const_ops; + pconst->id = power_zone->const_id_cnt; + power_zone->const_id_cnt++; + power_zone->zone_dev_attrs[count++] = + &constraint_attrs[i].power_limit_attr.attr; + power_zone->zone_dev_attrs[count++] = + &constraint_attrs[i].time_window_attr.attr; + if (pconst->ops->get_name) + power_zone->zone_dev_attrs[count++] = + &constraint_attrs[i].name_attr.attr; + if (pconst->ops->get_max_power_uw) + power_zone->zone_dev_attrs[count++] = + &constraint_attrs[i].max_power_attr.attr; + if (pconst->ops->get_min_power_uw) + power_zone->zone_dev_attrs[count++] = + &constraint_attrs[i].min_power_attr.attr; + if (pconst->ops->get_max_time_window_us) + power_zone->zone_dev_attrs[count++] = + &constraint_attrs[i].max_time_window_attr.attr; + if (pconst->ops->get_min_time_window_us) + power_zone->zone_dev_attrs[count++] = + &constraint_attrs[i].min_time_window_attr.attr; + } + power_zone->zone_attr_count = count; + + return ret; +} + +static bool control_type_valid(void 
*control_type) +{ + struct powercap_control_type *pos = NULL; + bool found = false; + + mutex_lock(&powercap_cntrl_list_lock); + + list_for_each_entry(pos, &powercap_cntrl_list, node) { + if (pos == control_type) { + found = true; + break; + } + } + mutex_unlock(&powercap_cntrl_list_lock); + + return found; +} + +static ssize_t name_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + struct powercap_zone *power_zone = to_powercap_zone(dev); + + return sprintf(buf, "%s\n", power_zone->name); +} + +static DEVICE_ATTR_RO(name); + +/* Create zone and attributes in sysfs */ +static void create_power_zone_common_attributes( + struct powercap_zone *power_zone) +{ + int count = 0; + + power_zone->zone_dev_attrs[count++] = &dev_attr_name.attr; + if (power_zone->ops->get_max_energy_range_uj) + power_zone->zone_dev_attrs[count++] = + &dev_attr_max_energy_range_uj.attr; + if (power_zone->ops->get_energy_uj) + power_zone->zone_dev_attrs[count++] = + &dev_attr_energy_uj.attr; + if (power_zone->ops->get_power_uw) + power_zone->zone_dev_attrs[count++] = + &dev_attr_power_uw.attr; + if (power_zone->ops->get_max_power_range_uw) + power_zone->zone_dev_attrs[count++] = + &dev_attr_max_power_range_uw.attr; + power_zone->zone_dev_attrs[count] = NULL; + power_zone->zone_attr_count = count; +} + +static void powercap_release(struct device *dev) +{ + bool allocated; + + if (dev->parent) { + struct powercap_zone *power_zone = to_powercap_zone(dev); + + /* Store flag as the release() may free memory */ + allocated = power_zone->allocated; + /* Remove id from parent idr struct */ + idr_remove(power_zone->parent_idr, power_zone->id); + /* Destroy idrs allocated for this zone */ + idr_destroy(&power_zone->idr); + kfree(power_zone->name); + kfree(power_zone->zone_dev_attrs); + kfree(power_zone->constraints); + if (power_zone->ops->release) + power_zone->ops->release(power_zone); + if (allocated) + kfree(power_zone); + } else { + struct powercap_control_type *control_type 
= + to_powercap_control_type(dev); + + /* Store flag as the release() may free memory */ + allocated = control_type->allocated; + idr_destroy(&control_type->idr); + mutex_destroy(&control_type->lock); + if (control_type->ops && control_type->ops->release) + control_type->ops->release(control_type); + if (allocated) + kfree(control_type); + } +} + +static ssize_t enabled_show(struct device *dev, + struct device_attribute *attr, + char *buf) +{ + bool mode = true; + + /* Default is enabled */ + if (dev->parent) { + struct powercap_zone *power_zone = to_powercap_zone(dev); + if (power_zone->ops->get_enable) + if (power_zone->ops->get_enable(power_zone, &mode)) + mode = false; + } else { + struct powercap_control_type *control_type = + to_powercap_control_type(dev); + if (control_type->ops && control_type->ops->get_enable) + if (control_type->ops->get_enable(control_type, &mode)) + mode = false; + } + + return sprintf(buf, "%d\n", mode); +} + +static ssize_t enabled_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t len) +{ + bool mode; + + if (strtobool(buf, &mode)) + return -EINVAL; + if (dev->parent) { + struct powercap_zone *power_zone = to_powercap_zone(dev); + if (power_zone->ops->set_enable) + if (!power_zone->ops->set_enable(power_zone, mode)) + return len; + } else { + struct powercap_control_type *control_type = + to_powercap_control_type(dev); + if (control_type->ops && control_type->ops->set_enable) + if (!control_type->ops->set_enable(control_type, mode)) + return len; + } + + return -ENOSYS; +} + +static struct device_attribute powercap_def_attrs[] = { + __ATTR(enabled, S_IWUSR | S_IRUGO, enabled_show, + enabled_store), + __ATTR_NULL +}; + +static struct class powercap_class = { + .name = "powercap", + .dev_release = powercap_release, + .dev_attrs = powercap_def_attrs, +}; + +struct powercap_zone *powercap_register_zone( + struct powercap_zone *power_zone, + struct powercap_control_type *control_type, + const char *name, 
+				struct powercap_zone *parent,
+				const struct powercap_zone_ops *ops,
+				int nr_constraints,
+				struct powercap_zone_constraint_ops *const_ops)
+{
+	int result;
+	int nr_attrs;
+
+	/* At least one of the energy/power readers is mandatory */
+	if (!name || !control_type || !ops ||
+			nr_constraints < 0 ||
+			nr_constraints > MAX_CONSTRAINTS_PER_ZONE ||
+			(!ops->get_energy_uj && !ops->get_power_uw) ||
+			!control_type_valid(control_type))
+		return ERR_PTR(-EINVAL);
+
+	if (power_zone) {
+		/* Caller-owned memory: the caller must supply release() */
+		if (!ops->release)
+			return ERR_PTR(-EINVAL);
+		memset(power_zone, 0, sizeof(*power_zone));
+	} else {
+		power_zone = kzalloc(sizeof(*power_zone), GFP_KERNEL);
+		if (!power_zone)
+			return ERR_PTR(-ENOMEM);
+		power_zone->allocated = true;
+	}
+	power_zone->ops = ops;
+	power_zone->control_type_inst = control_type;
+	/* Top-level zones hang off the control type, subzones off the parent */
+	if (!parent) {
+		power_zone->dev.parent = &control_type->dev;
+		power_zone->parent_idr = &control_type->idr;
+	} else {
+		power_zone->dev.parent = &parent->dev;
+		power_zone->parent_idr = &parent->idr;
+	}
+	power_zone->dev.class = &powercap_class;
+
+	mutex_lock(&control_type->lock);
+	/* Using idr to get the unique id */
+	result = idr_alloc(power_zone->parent_idr, NULL, 0, 0, GFP_KERNEL);
+	if (result < 0)
+		goto err_idr_alloc;
+
+	power_zone->id = result;
+	idr_init(&power_zone->idr);
+	/*
+	 * result still holds the (non-negative) id here.  Preset the error
+	 * code so that the allocation failures below do not end up in
+	 * ERR_PTR(id), which callers would take for a valid pointer.
+	 */
+	result = -ENOMEM;
+	power_zone->name = kstrdup(name, GFP_KERNEL);
+	if (!power_zone->name)
+		goto err_name_alloc;
+	dev_set_name(&power_zone->dev, "%s:%x",
+					dev_name(power_zone->dev.parent),
+					power_zone->id);
+	power_zone->constraints = kzalloc(sizeof(*power_zone->constraints) *
+					 nr_constraints, GFP_KERNEL);
+	if (!power_zone->constraints)
+		goto err_const_alloc;
+
+	/* One extra slot for the NULL terminator of the attribute array */
+	nr_attrs = nr_constraints * POWERCAP_CONSTRAINTS_ATTRS +
+						POWERCAP_ZONE_MAX_ATTRS + 1;
+	power_zone->zone_dev_attrs = kzalloc(sizeof(void *) *
+						nr_attrs, GFP_KERNEL);
+	if (!power_zone->zone_dev_attrs)
+		goto err_attr_alloc;
+	create_power_zone_common_attributes(power_zone);
+	result = create_constraints(power_zone, nr_constraints, const_ops);
+	if (result)
+		goto err_dev_ret;
+
+	
power_zone->zone_dev_attrs[power_zone->zone_attr_count] = NULL; + power_zone->dev_zone_attr_group.attrs = power_zone->zone_dev_attrs; + power_zone->dev_attr_groups[0] = &power_zone->dev_zone_attr_group; + power_zone->dev_attr_groups[1] = NULL; + power_zone->dev.groups = power_zone->dev_attr_groups; + result = device_register(&power_zone->dev); + if (result) + goto err_dev_ret; + + control_type->nr_zones++; + mutex_unlock(&control_type->lock); + + return power_zone; + +err_dev_ret: + kfree(power_zone->zone_dev_attrs); +err_attr_alloc: + kfree(power_zone->constraints); +err_const_alloc: + kfree(power_zone->name); +err_name_alloc: + idr_remove(power_zone->parent_idr, power_zone->id); +err_idr_alloc: + if (power_zone->allocated) + kfree(power_zone); + mutex_unlock(&control_type->lock); + /* alloc failures jump here with result == zone id (>= 0) */ + return ERR_PTR(result < 0 ? result : -ENOMEM); +} +EXPORT_SYMBOL_GPL(powercap_register_zone); + +int powercap_unregister_zone(struct powercap_control_type *control_type, + struct powercap_zone *power_zone) +{ + if (!power_zone || !control_type) + return -EINVAL; + + mutex_lock(&control_type->lock); + control_type->nr_zones--; + mutex_unlock(&control_type->lock); + + device_unregister(&power_zone->dev); + + return 0; +} +EXPORT_SYMBOL_GPL(powercap_unregister_zone); + +struct powercap_control_type *powercap_register_control_type( + struct powercap_control_type *control_type, + const char *name, + const struct powercap_control_type_ops *ops) +{ + int result; + + if (!name) + return ERR_PTR(-EINVAL); + if (control_type) { + if (!ops || !ops->release) + return ERR_PTR(-EINVAL); + memset(control_type, 0, sizeof(*control_type)); + } else { + control_type = kzalloc(sizeof(*control_type), GFP_KERNEL); + if (!control_type) + return ERR_PTR(-ENOMEM); + control_type->allocated = true; + } + mutex_init(&control_type->lock); + control_type->ops = ops; + INIT_LIST_HEAD(&control_type->node); + control_type->dev.class = &powercap_class; + dev_set_name(&control_type->dev, "%s", name); + result =
device_register(&control_type->dev); + if (result) { + if (control_type->allocated) + kfree(control_type); + return ERR_PTR(result); + } + idr_init(&control_type->idr); + + mutex_lock(&powercap_cntrl_list_lock); + list_add_tail(&control_type->node, &powercap_cntrl_list); + mutex_unlock(&powercap_cntrl_list_lock); + + return control_type; +} +EXPORT_SYMBOL_GPL(powercap_register_control_type); + +int powercap_unregister_control_type(struct powercap_control_type *control_type) +{ + struct powercap_control_type *pos = NULL; + + if (control_type->nr_zones) { + dev_err(&control_type->dev, "Zones of this type still not freed\n"); + return -EINVAL; + } + mutex_lock(&powercap_cntrl_list_lock); + list_for_each_entry(pos, &powercap_cntrl_list, node) { + if (pos == control_type) { + list_del(&control_type->node); + mutex_unlock(&powercap_cntrl_list_lock); + device_unregister(&control_type->dev); + return 0; + } + } + mutex_unlock(&powercap_cntrl_list_lock); + + return -ENODEV; +} +EXPORT_SYMBOL_GPL(powercap_unregister_control_type); + +static int __init powercap_init(void) +{ + int result = 0; + + result = seed_constraint_attributes(); + if (result) + return result; + + result = class_register(&powercap_class); + + return result; +} + +device_initcall(powercap_init); + +MODULE_DESCRIPTION("PowerCap sysfs Driver"); +MODULE_AUTHOR("Srinivas Pandruvada "); +MODULE_LICENSE("GPL v2"); diff --git a/include/linux/powercap.h b/include/linux/powercap.h new file mode 100644 index 000000000000..4e250417ee30 --- /dev/null +++ b/include/linux/powercap.h @@ -0,0 +1,325 @@ +/* + * powercap.h: Data types and headers for sysfs power capping interface + * Copyright (c) 2013, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. 
+ * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc. + * + */ + +#ifndef __POWERCAP_H__ +#define __POWERCAP_H__ + +#include +#include + +/* + * A power cap class device can contain multiple powercap control_types. + * Each control_type can have multiple power zones, which can be independently + * controlled. Each power zone can have one or more constraints. + */ + +struct powercap_control_type; +struct powercap_zone; +struct powercap_zone_constraint; + +/** + * struct powercap_control_type_ops - Define control type callbacks + * @set_enable: Enable/Disable whole control type. + * Default is enabled. But this callback allows all zones + * to be in disable state and remove any applied power + * limits. If disabled power zone can only be monitored + * not controlled. + * @get_enable: get Enable/Disable status. + * @release: Callback to inform that last reference to this + * control type is closed. So it is safe to free data + * structure associated with this control type. + * This callback is mandatory if the client own memory + * for the control type. 
+ * + * This structure defines control type callbacks to be implemented by client + * drivers + */ +struct powercap_control_type_ops { + int (*set_enable) (struct powercap_control_type *, bool mode); + int (*get_enable) (struct powercap_control_type *, bool *mode); + int (*release) (struct powercap_control_type *); +}; + +/** + * struct powercap_control_type- Defines a powercap control_type + * @dev: device for this control_type + * @idr: idr to have unique id for its child + * @nr_zones: Number of power zones currently registered under + * this control_type + * @ops: Pointer to callback struct + * @lock: mutex for control type + * @allocated: This is possible that client owns the memory + * used by this structure. In this case + * this flag is set to false by framework to + * prevent deallocation during release process. + * Otherwise this flag is set to true. + * @node: link to the control_type list + * + * Defines powercap control_type. This acts as a container for power + * zones, which use same method to control power. E.g. RAPL, RAPL-PCI etc. + * All fields are private and should not be used by client drivers. + */ +struct powercap_control_type { + struct device dev; + struct idr idr; + int nr_zones; + const struct powercap_control_type_ops *ops; + struct mutex lock; + bool allocated; + struct list_head node; +}; + +/** + * struct powercap_zone_ops - Define power zone callbacks + * @get_max_energy_range_uj: Get maximum range of energy counter in + * micro-joules. + * @get_energy_uj: Get current energy counter in micro-joules. + * @reset_energy_uj: Reset micro-joules energy counter. + * @get_max_power_range_uw: Get maximum range of power counter in + * micro-watts. + * @get_power_uw: Get current power counter in micro-watts. + * @set_enable: Enable/Disable power zone controls. + * Default is enabled. + * @get_enable: get Enable/Disable status. + * @release: Callback to inform that last reference to this + * power zone is closed.
So it is safe to free + * data structure associated with this + * power zone. Mandatory, if client driver owns + * the power_zone memory. + * + * This structure defines zone callbacks to be implemented by client drivers. + * Client drivers can define both energy and power related callbacks. But at + * the least one type (either power or energy) is mandatory. Client drivers + * should handle mutual exclusion, if required in callbacks. + */ +struct powercap_zone_ops { + int (*get_max_energy_range_uj) (struct powercap_zone *, u64 *); + int (*get_energy_uj) (struct powercap_zone *, u64 *); + int (*reset_energy_uj) (struct powercap_zone *); + int (*get_max_power_range_uw) (struct powercap_zone *, u64 *); + int (*get_power_uw) (struct powercap_zone *, u64 *); + int (*set_enable) (struct powercap_zone *, bool mode); + int (*get_enable) (struct powercap_zone *, bool *mode); + int (*release) (struct powercap_zone *); +}; + +#define POWERCAP_ZONE_MAX_ATTRS 6 +#define POWERCAP_CONSTRAINTS_ATTRS 8 +#define MAX_CONSTRAINTS_PER_ZONE 10 +/** + * struct powercap_zone- Defines instance of a power cap zone + * @id: Unique id + * @name: Power zone name. + * @control_type_inst: Control type instance for this zone. + * @ops: Pointer to the zone operation structure. + * @dev: Instance of a device. + * @const_id_cnt: Number of constraints defined. + * @idr: Instance to an idr entry for children zones. + * @parent_idr: To remove reference from the parent idr. + * @private_data: Private data pointer if any for this zone. + * @zone_dev_attrs: Attributes associated with this device. + * @zone_attr_count: Attribute count. + * @dev_zone_attr_group: Attribute group for attributes. + * @dev_attr_groups: Attribute group store to register with device. + * @allocated: This is possible that client owns the memory + * used by this structure. In this case + * this flag is set to false by framework to + * prevent deallocation during release process. + * Otherwise this flag is set to true.
+ * @constraints: List of constraints for this zone. + * + * This defines a power zone instance. The fields of this structure are + * private, and should not be used by client drivers. + */ +struct powercap_zone { + int id; + char *name; + void *control_type_inst; + const struct powercap_zone_ops *ops; + struct device dev; + int const_id_cnt; + struct idr idr; + struct idr *parent_idr; + void *private_data; + struct attribute **zone_dev_attrs; + int zone_attr_count; + struct attribute_group dev_zone_attr_group; + const struct attribute_group *dev_attr_groups[2]; /* 1 group + NULL */ + bool allocated; + struct powercap_zone_constraint *constraints; +}; + +/** + * struct powercap_zone_constraint_ops - Define constraint callbacks + * @set_power_limit_uw: Set power limit in micro-watts. + * @get_power_limit_uw: Get power limit in micro-watts. + * @set_time_window_us: Set time window in micro-seconds. + * @get_time_window_us: Get time window in micro-seconds. + * @get_max_power_uw: Get max power allowed in micro-watts. + * @get_min_power_uw: Get min power allowed in micro-watts. + * @get_max_time_window_us: Get max time window allowed in micro-seconds. + * @get_min_time_window_us: Get min time window allowed in micro-seconds. + * @get_name: Get the name of constraint + * + * This structure is used to define the constraint callbacks for the client + * drivers. The following callbacks are mandatory and can't be NULL: + * set_power_limit_uw + * get_power_limit_uw + * set_time_window_us + * get_time_window_us + * get_name + * Client drivers should handle mutual exclusion, if required in callbacks.
+ */ +struct powercap_zone_constraint_ops { + int (*set_power_limit_uw) (struct powercap_zone *, int, u64); + int (*get_power_limit_uw) (struct powercap_zone *, int, u64 *); + int (*set_time_window_us) (struct powercap_zone *, int, u64); + int (*get_time_window_us) (struct powercap_zone *, int, u64 *); + int (*get_max_power_uw) (struct powercap_zone *, int, u64 *); + int (*get_min_power_uw) (struct powercap_zone *, int, u64 *); + int (*get_max_time_window_us) (struct powercap_zone *, int, u64 *); + int (*get_min_time_window_us) (struct powercap_zone *, int, u64 *); + const char *(*get_name) (struct powercap_zone *, int); +}; + +/** + * struct powercap_zone_constraint- Defines instance of a constraint + * @id: Instance Id of this constraint. + * @power_zone: Pointer to the power zone for this constraint. + * @ops: Pointer to the constraint callbacks. + * + * This defines a constraint instance. + */ +struct powercap_zone_constraint { + int id; + struct powercap_zone *power_zone; + struct powercap_zone_constraint_ops *ops; +}; + + +/* For clients to get their device pointer, may be used for dev_dbgs */ +#define POWERCAP_GET_DEV(power_zone) (&power_zone->dev) + +/** +* powercap_set_zone_data() - Set private data for a zone +* @power_zone: A pointer to the valid zone instance. +* @pdata: A pointer to the user private data. +* +* Allows client drivers to associate some private data to zone instance. +*/ +static inline void powercap_set_zone_data(struct powercap_zone *power_zone, + void *pdata) +{ + if (power_zone) + power_zone->private_data = pdata; +} + +/** +* powercap_get_zone_data() - Get private data for a zone +* @power_zone: A pointer to the valid zone instance. +* +* Allows client drivers to get private data associate with a zone, +* using call to powercap_set_zone_data. 
+*/ +static inline void *powercap_get_zone_data(struct powercap_zone *power_zone) +{ + if (power_zone) + return power_zone->private_data; + return NULL; +} + +/** +* powercap_register_control_type() - Register a control_type with framework +* @control_type: Pointer to client allocated memory for the control type +* structure storage. If this is NULL, powercap framework +* will allocate memory and own it. +* Advantage of this parameter is that client can embed +* this data in its data structures and allocate in a +* single call, preventing multiple allocations. +* @name: The Name of this control_type, which will be shown +* in the sysfs Interface. +* @ops: Callbacks for control type. Optional only if @control_type is NULL. +* +* Used to create a control_type with the power capping class. Here control_type +* can represent a type of technology, which can control a range of power zones. +* For example a control_type can be RAPL (Running Average Power Limit) on +* Intel® 64 and IA-32 Processor Architectures. The name can be any string +* which must be unique, otherwise this function returns an ERR_PTR() error. +* A pointer to the control_type instance is returned on success. +*/ +struct powercap_control_type *powercap_register_control_type( + struct powercap_control_type *control_type, + const char *name, + const struct powercap_control_type_ops *ops); + +/** +* powercap_unregister_control_type() - Unregister a control_type from framework +* @instance: A pointer to the valid control_type instance. +* +* Used to unregister a control_type with the power capping class. +* All power zones registered under this control type have to be unregistered +* before calling this function, or it will fail with an error code. +*/ +int powercap_unregister_control_type(struct powercap_control_type *instance); + +/* Zone register/unregister API */ + +/** +* powercap_register_zone() - Register a power zone +* @power_zone: Pointer to client allocated memory for the power zone structure +* storage.
If this is NULL, powercap framework will allocate +* memory and own it. Advantage of this parameter is that client +* can embed this data in its data structures and allocate in a +* single call, preventing multiple allocations. +* @control_type: A control_type instance under which this zone operates. +* @name: A name for this zone. +* @parent: A pointer to the parent power zone instance if any or NULL +* @ops: Pointer to zone operation callback structure. +* @nr_constraints: Number of constraints for this zone +* @const_ops: Pointer to constraint callback structure +* +* Register a power zone under a given control type. A power zone must register +* a pointer to a structure representing zone callbacks. +* A power zone can be located under a parent power zone, in which case @parent +* should point to it. Otherwise, if @parent is NULL, the new power zone will +* be located directly under the given control type. +* For each power zone there may be a number of constraints that appear in the +* sysfs under that zone as attributes with unique numeric IDs. +* Returns pointer to the power_zone on success, ERR_PTR() on failure. +*/ +struct powercap_zone *powercap_register_zone( + struct powercap_zone *power_zone, + struct powercap_control_type *control_type, + const char *name, + struct powercap_zone *parent, + const struct powercap_zone_ops *ops, + int nr_constraints, + struct powercap_zone_constraint_ops *const_ops); + +/** +* powercap_unregister_zone() - Unregister a zone device +* @control_type: A pointer to the valid instance of a control_type. +* @power_zone: A pointer to the valid zone instance for a control_type +* +* Used to unregister a zone device for a control_type. Caller should +* make sure that children for this zone are unregistered first.
+*/ +int powercap_unregister_zone(struct powercap_control_type *control_type, + struct powercap_zone *power_zone); + +#endif From 12cc4b3827f8cc5973f86330ccc9d9656a31bfa8 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Fri, 11 Oct 2013 16:54:57 -0700 Subject: [PATCH 3/7] PowerCap: Add to drivers Kconfig and Makefile Added changes to Makefile and Kconfig to include in driver build. Signed-off-by: Srinivas Pandruvada Signed-off-by: Jacob Pan Signed-off-by: Rafael J. Wysocki --- drivers/Kconfig | 2 ++ drivers/Makefile | 1 + 2 files changed, 3 insertions(+) diff --git a/drivers/Kconfig b/drivers/Kconfig index aa43b911ccef..969e9871785c 100644 --- a/drivers/Kconfig +++ b/drivers/Kconfig @@ -166,4 +166,6 @@ source "drivers/reset/Kconfig" source "drivers/fmc/Kconfig" +source "drivers/powercap/Kconfig" + endmenu diff --git a/drivers/Makefile b/drivers/Makefile index ab93de8297f1..34c1d554f69b 100644 --- a/drivers/Makefile +++ b/drivers/Makefile @@ -152,3 +152,4 @@ obj-$(CONFIG_VME_BUS) += vme/ obj-$(CONFIG_IPACK_BUS) += ipack/ obj-$(CONFIG_NTB) += ntb/ obj-$(CONFIG_FMC) += fmc/ +obj-$(CONFIG_POWERCAP) += powercap/ From 1a6b991a9875a4c4811c7baf4058fa17aa1a9d9b Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Fri, 11 Oct 2013 16:54:58 -0700 Subject: [PATCH 4/7] x86 / msr: add 64bit _on_cpu access functions Having 64-bit MSR access methods on given CPU can avoid shifting and simplify MSR content manipulation. We already have other combinations of rdmsrl_xxx and wrmsrl_xxx but missing the _on_cpu version. Signed-off-by: Srinivas Pandruvada Signed-off-by: Jacob Pan Reviewed-by: H. Peter Anvin Signed-off-by: Rafael J. 
Wysocki --- arch/x86/include/asm/msr.h | 22 ++++++++++++++ arch/x86/lib/msr-smp.c | 62 ++++++++++++++++++++++++++++++++++++++ 2 files changed, 84 insertions(+) diff --git a/arch/x86/include/asm/msr.h b/arch/x86/include/asm/msr.h index cb7502852acb..e139b13f2a33 100644 --- a/arch/x86/include/asm/msr.h +++ b/arch/x86/include/asm/msr.h @@ -218,10 +218,14 @@ void msrs_free(struct msr *msrs); #ifdef CONFIG_SMP int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); +int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q); +int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q); void rdmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs); void wrmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs); int rdmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h); int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h); +int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q); +int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q); int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]); int wrmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]); #else /* CONFIG_SMP */ @@ -235,6 +239,16 @@ static inline int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) wrmsr(msr_no, l, h); return 0; } +static inline int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) +{ + rdmsrl(msr_no, *q); + return 0; +} +static inline int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q) +{ + wrmsrl(msr_no, q); + return 0; +} static inline void rdmsr_on_cpus(const struct cpumask *m, u32 msr_no, struct msr *msrs) { @@ -254,6 +268,14 @@ static inline int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) { return wrmsr_safe(msr_no, l, h); } +static inline int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) +{ + return rdmsrl_safe(msr_no, q); +} +static inline int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q) +{ + return 
wrmsrl_safe(msr_no, q); +} static inline int rdmsr_safe_regs_on_cpu(unsigned int cpu, u32 regs[8]) { return rdmsr_safe_regs(regs); diff --git a/arch/x86/lib/msr-smp.c b/arch/x86/lib/msr-smp.c index a6b1b86d2253..518532e6a3fa 100644 --- a/arch/x86/lib/msr-smp.c +++ b/arch/x86/lib/msr-smp.c @@ -47,6 +47,21 @@ int rdmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 *l, u32 *h) } EXPORT_SYMBOL(rdmsr_on_cpu); +int rdmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) +{ + int err; + struct msr_info rv; + + memset(&rv, 0, sizeof(rv)); + + rv.msr_no = msr_no; + err = smp_call_function_single(cpu, __rdmsr_on_cpu, &rv, 1); + *q = rv.reg.q; + + return err; +} +EXPORT_SYMBOL(rdmsrl_on_cpu); + int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) { int err; @@ -63,6 +78,22 @@ int wrmsr_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) } EXPORT_SYMBOL(wrmsr_on_cpu); +int wrmsrl_on_cpu(unsigned int cpu, u32 msr_no, u64 q) +{ + int err; + struct msr_info rv; + + memset(&rv, 0, sizeof(rv)); + + rv.msr_no = msr_no; + rv.reg.q = q; + + err = smp_call_function_single(cpu, __wrmsr_on_cpu, &rv, 1); + + return err; +} +EXPORT_SYMBOL(wrmsrl_on_cpu); + static void __rwmsr_on_cpus(const struct cpumask *mask, u32 msr_no, struct msr *msrs, void (*msr_func) (void *info)) @@ -159,6 +190,37 @@ int wrmsr_safe_on_cpu(unsigned int cpu, u32 msr_no, u32 l, u32 h) } EXPORT_SYMBOL(wrmsr_safe_on_cpu); +int wrmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 q) +{ + int err; + struct msr_info rv; + + memset(&rv, 0, sizeof(rv)); + + rv.msr_no = msr_no; + rv.reg.q = q; + + err = smp_call_function_single(cpu, __wrmsr_safe_on_cpu, &rv, 1); + + return err ? err : rv.err; +} +EXPORT_SYMBOL(wrmsrl_safe_on_cpu); + +int rdmsrl_safe_on_cpu(unsigned int cpu, u32 msr_no, u64 *q) +{ + int err; + struct msr_info rv; + + memset(&rv, 0, sizeof(rv)); + + rv.msr_no = msr_no; + err = smp_call_function_single(cpu, __rdmsr_safe_on_cpu, &rv, 1); + *q = rv.reg.q; + + return err ? 
err : rv.err; +} +EXPORT_SYMBOL(rdmsrl_safe_on_cpu); + /* * These variants are significantly slower, but allows control over * the entire 32-bit GPR set. From bfd1ff6375c82930bfb3b401eee2c96720fa8e84 Mon Sep 17 00:00:00 2001 From: Srinivas Pandruvada Date: Fri, 11 Oct 2013 16:54:59 -0700 Subject: [PATCH 5/7] bitops: Introduce BIT_ULL Adding BIT(x) equivalent for unsigned long long type, BIT_ULL(x). Also added BIT_ULL_MASK and BIT_ULL_WORD. Suggested-by: Joe Perches Signed-off-by: Srinivas Pandruvada Signed-off-by: Jacob Pan Signed-off-by: Rafael J. Wysocki --- include/linux/bitops.h | 3 +++ 1 file changed, 3 insertions(+) diff --git a/include/linux/bitops.h b/include/linux/bitops.h index a3b6b82108b9..5a1c8b71ccd8 100644 --- a/include/linux/bitops.h +++ b/include/linux/bitops.h @@ -4,8 +4,11 @@ #ifdef __KERNEL__ #define BIT(nr) (1UL << (nr)) +#define BIT_ULL(nr) (1ULL << (nr)) #define BIT_MASK(nr) (1UL << ((nr) % BITS_PER_LONG)) #define BIT_WORD(nr) ((nr) / BITS_PER_LONG) +#define BIT_ULL_MASK(nr) (1ULL << ((nr) % BITS_PER_LONG_LONG)) +#define BIT_ULL_WORD(nr) ((nr) / BITS_PER_LONG_LONG) #define BITS_PER_BYTE 8 #define BITS_TO_LONGS(nr) DIV_ROUND_UP(nr, BITS_PER_BYTE * sizeof(long)) #endif From 2d281d8196e38dd3a4ee9af26621ddde8329f269 Mon Sep 17 00:00:00 2001 From: Jacob Pan Date: Thu, 17 Oct 2013 10:28:35 -0700 Subject: [PATCH 6/7] PowerCap: Introduce Intel RAPL power capping driver The Intel Running Average Power Limit (RAPL) technology provides platform software with the ability to monitor, control, and get notifications on power usage. This feature is present in all Sandy Bridge and later Intel processors. Newer models allow more fine grained controls to be applied. In RAPL, power control is divided into domains, which include package, DRAM controller, CPU core (Power Plane 0), graphics uncore (power plane 1), etc. The purpose of this driver is to expose the RAPL settings to userspace. 
Overall, RAPL fits in the new powercap class driver in that platform level power capping controls are exposed via this generic interface. This driver is based on an earlier patch from Zhang Rui. However, while the previous work was mainly focused on thermal monitoring the focus here is on the usability from user space perspective. References: https://lkml.org/lkml/2011/5/26/93 Signed-off-by: Srinivas Pandruvada Signed-off-by: Jacob Pan Reviewed-by: Rafael J. Wysocki Signed-off-by: Rafael J. Wysocki --- drivers/powercap/Kconfig | 13 + drivers/powercap/Makefile | 1 + drivers/powercap/intel_rapl.c | 1395 +++++++++++++++++++++++++++++++++ 3 files changed, 1409 insertions(+) create mode 100644 drivers/powercap/intel_rapl.c diff --git a/drivers/powercap/Kconfig b/drivers/powercap/Kconfig index a37055eb5ebc..a7c81b53d88a 100644 --- a/drivers/powercap/Kconfig +++ b/drivers/powercap/Kconfig @@ -15,5 +15,18 @@ menuconfig POWERCAP if POWERCAP # Client driver configurations go here. +config INTEL_RAPL + tristate "Intel RAPL Support" + depends on X86 + default n + ---help--- + This enables support for the Intel Running Average Power Limit (RAPL) + technology which allows power limits to be enforced and monitored on + modern Intel processors (Sandy Bridge and later). + + In RAPL, the platform level settings are divided into domains for + fine grained control. These domains include processor package, DRAM + controller, CPU core (Power Plane 0), graphics uncore (Power Plane + 1), etc.
endif diff --git a/drivers/powercap/Makefile b/drivers/powercap/Makefile index 6defbc8dc4bf..0a21ef31372b 100644 --- a/drivers/powercap/Makefile +++ b/drivers/powercap/Makefile @@ -1 +1,2 @@ obj-$(CONFIG_POWERCAP) += powercap_sys.o +obj-$(CONFIG_INTEL_RAPL) += intel_rapl.o diff --git a/drivers/powercap/intel_rapl.c b/drivers/powercap/intel_rapl.c new file mode 100644 index 000000000000..2a786c504460 --- /dev/null +++ b/drivers/powercap/intel_rapl.c @@ -0,0 +1,1395 @@ +/* + * Intel Running Average Power Limit (RAPL) Driver + * Copyright (c) 2013, Intel Corporation. + * + * This program is free software; you can redistribute it and/or modify it + * under the terms and conditions of the GNU General Public License, + * version 2, as published by the Free Software Foundation. + * + * This program is distributed in the hope it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for + * more details. + * + * You should have received a copy of the GNU General Public License along with + * this program; if not, write to the Free Software Foundation, Inc. 
+ * + */ +#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include +#include + +/* bitmasks for RAPL MSRs, used by primitive access functions */ +#define ENERGY_STATUS_MASK 0xffffffff + +#define POWER_LIMIT1_MASK 0x7FFF +#define POWER_LIMIT1_ENABLE BIT(15) +#define POWER_LIMIT1_CLAMP BIT(16) + +#define POWER_LIMIT2_MASK (0x7FFFULL<<32) +#define POWER_LIMIT2_ENABLE BIT_ULL(47) +#define POWER_LIMIT2_CLAMP BIT_ULL(48) +#define POWER_PACKAGE_LOCK BIT_ULL(63) +#define POWER_PP_LOCK BIT(31) + +#define TIME_WINDOW1_MASK (0x7FULL<<17) +#define TIME_WINDOW2_MASK (0x7FULL<<49) + +#define POWER_UNIT_OFFSET 0 +#define POWER_UNIT_MASK 0x0F + +#define ENERGY_UNIT_OFFSET 0x08 +#define ENERGY_UNIT_MASK 0x1F00 + +#define TIME_UNIT_OFFSET 0x10 +#define TIME_UNIT_MASK 0xF0000 + +#define POWER_INFO_MAX_MASK (0x7fffULL<<32) +#define POWER_INFO_MIN_MASK (0x7fffULL<<16) +#define POWER_INFO_MAX_TIME_WIN_MASK (0x3fULL<<48) +#define POWER_INFO_THERMAL_SPEC_MASK 0x7fff + +#define PERF_STATUS_THROTTLE_TIME_MASK 0xffffffff +#define PP_POLICY_MASK 0x1F + +/* Non HW constants */ +#define RAPL_PRIMITIVE_DERIVED BIT(1) /* not from raw data */ +#define RAPL_PRIMITIVE_DUMMY BIT(2) + +/* scale RAPL units to avoid floating point math inside kernel */ +#define POWER_UNIT_SCALE (1000000) +#define ENERGY_UNIT_SCALE (1000000) +#define TIME_UNIT_SCALE (1000000) + +#define TIME_WINDOW_MAX_MSEC 40000 +#define TIME_WINDOW_MIN_MSEC 250 + +enum unit_type { + ARBITRARY_UNIT, /* no translation */ + POWER_UNIT, + ENERGY_UNIT, + TIME_UNIT, +}; + +enum rapl_domain_type { + RAPL_DOMAIN_PACKAGE, /* entire package/socket */ + RAPL_DOMAIN_PP0, /* core power plane */ + RAPL_DOMAIN_PP1, /* graphics uncore */ + RAPL_DOMAIN_DRAM,/* DRAM control_type */ + RAPL_DOMAIN_MAX, +}; + +enum rapl_domain_msr_id { + RAPL_DOMAIN_MSR_LIMIT, + RAPL_DOMAIN_MSR_STATUS, + RAPL_DOMAIN_MSR_PERF, + RAPL_DOMAIN_MSR_POLICY, + 
RAPL_DOMAIN_MSR_INFO, + RAPL_DOMAIN_MSR_MAX, +}; + +/* per domain data, some are optional */ +enum rapl_primitives { + ENERGY_COUNTER, + POWER_LIMIT1, + POWER_LIMIT2, + FW_LOCK, + + PL1_ENABLE, /* power limit 1, aka long term */ + PL1_CLAMP, /* allow frequency to go below OS request */ + PL2_ENABLE, /* power limit 2, aka short term, instantaneous */ + PL2_CLAMP, + + TIME_WINDOW1, /* long term */ + TIME_WINDOW2, /* short term */ + THERMAL_SPEC_POWER, + MAX_POWER, + + MIN_POWER, + MAX_TIME_WINDOW, + THROTTLED_TIME, + PRIORITY_LEVEL, + + /* below are not raw primitive data */ + AVERAGE_POWER, + NR_RAPL_PRIMITIVES, +}; + +#define NR_RAW_PRIMITIVES (NR_RAPL_PRIMITIVES - 2) + +/* Can be expanded to include events, etc.*/ +struct rapl_domain_data { + u64 primitives[NR_RAPL_PRIMITIVES]; + unsigned long timestamp; +}; + + +#define DOMAIN_STATE_INACTIVE BIT(0) +#define DOMAIN_STATE_POWER_LIMIT_SET BIT(1) +#define DOMAIN_STATE_BIOS_LOCKED BIT(2) + +#define NR_POWER_LIMITS (2) +struct rapl_power_limit { + struct powercap_zone_constraint *constraint; + int prim_id; /* primitive ID used to enable */ + struct rapl_domain *domain; + const char *name; +}; + +static const char pl1_name[] = "long_term"; +static const char pl2_name[] = "short_term"; + +struct rapl_domain { + const char *name; + enum rapl_domain_type id; + int msrs[RAPL_DOMAIN_MSR_MAX]; + struct powercap_zone power_zone; + struct rapl_domain_data rdd; + struct rapl_power_limit rpl[NR_POWER_LIMITS]; + u64 attr_map; /* track capabilities */ + unsigned int state; + int package_id; +}; +#define power_zone_to_rapl_domain(_zone) \ + container_of(_zone, struct rapl_domain, power_zone) + + +/* Each physical package contains multiple domains, these are the common + * data across RAPL domains within a package. 
+ */ +struct rapl_package { + unsigned int id; /* physical package/socket id */ + unsigned int nr_domains; + unsigned long domain_map; /* bit map of active domains */ + unsigned int power_unit_divisor; + unsigned int energy_unit_divisor; + unsigned int time_unit_divisor; + struct rapl_domain *domains; /* array of domains, sized at runtime */ + struct powercap_zone *power_zone; /* keep track of parent zone */ + int nr_cpus; /* active cpus on the package, topology info is lost during + * cpu hotplug. so we have to track ourselves. + */ + unsigned long power_limit_irq; /* keep track of package power limit + * notify interrupt enable status. + */ + struct list_head plist; +}; +#define PACKAGE_PLN_INT_SAVED BIT(0) +#define MAX_PRIM_NAME (32) + +/* per domain data. used to describe individual knobs such that access function + * can be consolidated into one instead of many inline functions. + */ +struct rapl_primitive_info { + const char *name; + u64 mask; + int shift; + enum rapl_domain_msr_id id; + enum unit_type unit; + u32 flag; +}; + +#define PRIMITIVE_INFO_INIT(p, m, s, i, u, f) { \ + .name = #p, \ + .mask = m, \ + .shift = s, \ + .id = i, \ + .unit = u, \ + .flag = f \ + } + +static void rapl_init_domains(struct rapl_package *rp); +static int rapl_read_data_raw(struct rapl_domain *rd, + enum rapl_primitives prim, + bool xlate, u64 *data); +static int rapl_write_data_raw(struct rapl_domain *rd, + enum rapl_primitives prim, + unsigned long long value); +static u64 rapl_unit_xlate(int package, enum unit_type type, u64 value, + int to_raw); +static void package_power_limit_irq_save(int package_id); + +static LIST_HEAD(rapl_packages); /* guarded by CPU hotplug lock */ + +static const char * const rapl_domain_names[] = { + "package", + "core", + "uncore", + "dram", +}; + +static struct powercap_control_type *control_type; /* PowerCap Controller */ + +/* caller to ensure CPU hotplug lock is held */ +static struct rapl_package *find_package_by_id(int id) +{ + struct 
rapl_package *rp; + + list_for_each_entry(rp, &rapl_packages, plist) { + if (rp->id == id) + return rp; + } + + return NULL; +} + +/* caller to ensure CPU hotplug lock is held */ +static int find_active_cpu_on_package(int package_id) +{ + int i; + + for_each_online_cpu(i) { + if (topology_physical_package_id(i) == package_id) + return i; + } + /* all CPUs on this package are offline */ + + return -ENODEV; +} + +/* caller must hold cpu hotplug lock */ +static void rapl_cleanup_data(void) +{ + struct rapl_package *p, *tmp; + + list_for_each_entry_safe(p, tmp, &rapl_packages, plist) { + kfree(p->domains); + list_del(&p->plist); + kfree(p); + } +} + +static int get_energy_counter(struct powercap_zone *power_zone, u64 *energy_raw) +{ + struct rapl_domain *rd; + u64 energy_now; + + /* prevent CPU hotplug, make sure the RAPL domain does not go + * away while reading the counter. + */ + get_online_cpus(); + rd = power_zone_to_rapl_domain(power_zone); + + if (!rapl_read_data_raw(rd, ENERGY_COUNTER, true, &energy_now)) { + *energy_raw = energy_now; + put_online_cpus(); + + return 0; + } + put_online_cpus(); + + return -EIO; +} + +static int get_max_energy_counter(struct powercap_zone *pcd_dev, u64 *energy) +{ + *energy = rapl_unit_xlate(0, ENERGY_UNIT, ENERGY_STATUS_MASK, 0); + return 0; +} + +static int release_zone(struct powercap_zone *power_zone) +{ + struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone); + struct rapl_package *rp; + + /* package zone is the last zone of a package, we can free + * memory here since all children has been unregistered. 
+ */ + if (rd->id == RAPL_DOMAIN_PACKAGE) { + rp = find_package_by_id(rd->package_id); + if (!rp) { + dev_warn(&power_zone->dev, "no package id %s\n", + rd->name); + return -ENODEV; + } + kfree(rd); + rp->domains = NULL; + } + + return 0; + +} + +static int find_nr_power_limit(struct rapl_domain *rd) +{ + int i; + + for (i = 0; i < NR_POWER_LIMITS; i++) { + if (rd->rpl[i].name == NULL) + break; + } + + return i; +} + +static int set_domain_enable(struct powercap_zone *power_zone, bool mode) +{ + struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone); + int nr_powerlimit; + + if (rd->state & DOMAIN_STATE_BIOS_LOCKED) + return -EACCES; + get_online_cpus(); + nr_powerlimit = find_nr_power_limit(rd); + /* here we activate/deactivate the hardware for power limiting */ + rapl_write_data_raw(rd, PL1_ENABLE, mode); + /* always enable clamp such that p-state can go below OS requested + * range. power capping priority over guranteed frequency. + */ + rapl_write_data_raw(rd, PL1_CLAMP, mode); + /* some domains have pl2 */ + if (nr_powerlimit > 1) { + rapl_write_data_raw(rd, PL2_ENABLE, mode); + rapl_write_data_raw(rd, PL2_CLAMP, mode); + } + put_online_cpus(); + + return 0; +} + +static int get_domain_enable(struct powercap_zone *power_zone, bool *mode) +{ + struct rapl_domain *rd = power_zone_to_rapl_domain(power_zone); + u64 val; + + if (rd->state & DOMAIN_STATE_BIOS_LOCKED) { + *mode = false; + return 0; + } + get_online_cpus(); + if (rapl_read_data_raw(rd, PL1_ENABLE, true, &val)) { + put_online_cpus(); + return -EIO; + } + *mode = val; + put_online_cpus(); + + return 0; +} + +/* per RAPL domain ops, in the order of rapl_domain_type */ +static struct powercap_zone_ops zone_ops[] = { + /* RAPL_DOMAIN_PACKAGE */ + { + .get_energy_uj = get_energy_counter, + .get_max_energy_range_uj = get_max_energy_counter, + .release = release_zone, + .set_enable = set_domain_enable, + .get_enable = get_domain_enable, + }, + /* RAPL_DOMAIN_PP0 */ + { + .get_energy_uj = 
get_energy_counter, + .get_max_energy_range_uj = get_max_energy_counter, + .release = release_zone, + .set_enable = set_domain_enable, + .get_enable = get_domain_enable, + }, + /* RAPL_DOMAIN_PP1 */ + { + .get_energy_uj = get_energy_counter, + .get_max_energy_range_uj = get_max_energy_counter, + .release = release_zone, + .set_enable = set_domain_enable, + .get_enable = get_domain_enable, + }, + /* RAPL_DOMAIN_DRAM */ + { + .get_energy_uj = get_energy_counter, + .get_max_energy_range_uj = get_max_energy_counter, + .release = release_zone, + .set_enable = set_domain_enable, + .get_enable = get_domain_enable, + }, +}; + +static int set_power_limit(struct powercap_zone *power_zone, int id, + u64 power_limit) +{ + struct rapl_domain *rd; + struct rapl_package *rp; + int ret = 0; + + get_online_cpus(); + rd = power_zone_to_rapl_domain(power_zone); + rp = find_package_by_id(rd->package_id); + if (!rp) { + ret = -ENODEV; + goto set_exit; + } + + if (rd->state & DOMAIN_STATE_BIOS_LOCKED) { + dev_warn(&power_zone->dev, "%s locked by BIOS, monitoring only\n", + rd->name); + ret = -EACCES; + goto set_exit; + } + + switch (rd->rpl[id].prim_id) { + case PL1_ENABLE: + rapl_write_data_raw(rd, POWER_LIMIT1, power_limit); + break; + case PL2_ENABLE: + rapl_write_data_raw(rd, POWER_LIMIT2, power_limit); + break; + default: + ret = -EINVAL; + } + if (!ret) + package_power_limit_irq_save(rd->package_id); +set_exit: + put_online_cpus(); + return ret; +} + +static int get_current_power_limit(struct powercap_zone *power_zone, int id, + u64 *data) +{ + struct rapl_domain *rd; + u64 val; + int prim; + int ret = 0; + + get_online_cpus(); + rd = power_zone_to_rapl_domain(power_zone); + switch (rd->rpl[id].prim_id) { + case PL1_ENABLE: + prim = POWER_LIMIT1; + break; + case PL2_ENABLE: + prim = POWER_LIMIT2; + break; + default: + put_online_cpus(); + return -EINVAL; + } + if (rapl_read_data_raw(rd, prim, true, &val)) + ret = -EIO; + else + *data = val; + + put_online_cpus(); + + return ret; 
+} + +static int set_time_window(struct powercap_zone *power_zone, int id, + u64 window) +{ + struct rapl_domain *rd; + int ret = 0; + + get_online_cpus(); + rd = power_zone_to_rapl_domain(power_zone); + switch (rd->rpl[id].prim_id) { + case PL1_ENABLE: + rapl_write_data_raw(rd, TIME_WINDOW1, window); + break; + case PL2_ENABLE: + rapl_write_data_raw(rd, TIME_WINDOW2, window); + break; + default: + ret = -EINVAL; + } + put_online_cpus(); + return ret; +} + +static int get_time_window(struct powercap_zone *power_zone, int id, u64 *data) +{ + struct rapl_domain *rd; + u64 val; + int ret = 0; + + get_online_cpus(); + rd = power_zone_to_rapl_domain(power_zone); + switch (rd->rpl[id].prim_id) { + case PL1_ENABLE: + ret = rapl_read_data_raw(rd, TIME_WINDOW1, true, &val); + break; + case PL2_ENABLE: + ret = rapl_read_data_raw(rd, TIME_WINDOW2, true, &val); + break; + default: + put_online_cpus(); + return -EINVAL; + } + if (!ret) + *data = val; + put_online_cpus(); + + return ret; +} + +static const char *get_constraint_name(struct powercap_zone *power_zone, int id) +{ + struct rapl_power_limit *rpl; + struct rapl_domain *rd; + + rd = power_zone_to_rapl_domain(power_zone); + rpl = (struct rapl_power_limit *) &rd->rpl[id]; + + return rpl->name; +} + + +static int get_max_power(struct powercap_zone *power_zone, int id, + u64 *data) +{ + struct rapl_domain *rd; + u64 val; + int prim; + int ret = 0; + + get_online_cpus(); + rd = power_zone_to_rapl_domain(power_zone); + switch (rd->rpl[id].prim_id) { + case PL1_ENABLE: + prim = THERMAL_SPEC_POWER; + break; + case PL2_ENABLE: + prim = MAX_POWER; + break; + default: + put_online_cpus(); + return -EINVAL; + } + if (rapl_read_data_raw(rd, prim, true, &val)) + ret = -EIO; + else + *data = val; + + put_online_cpus(); + + return ret; +} + +static struct powercap_zone_constraint_ops constraint_ops = { + .set_power_limit_uw = set_power_limit, + .get_power_limit_uw = get_current_power_limit, + .set_time_window_us = set_time_window, + 
.get_time_window_us = get_time_window, + .get_max_power_uw = get_max_power, + .get_name = get_constraint_name, +}; + +/* called after domain detection and package level data are set */ +static void rapl_init_domains(struct rapl_package *rp) +{ + int i; + struct rapl_domain *rd = rp->domains; + + for (i = 0; i < RAPL_DOMAIN_MAX; i++) { + unsigned int mask = rp->domain_map & (1 << i); + switch (mask) { + case BIT(RAPL_DOMAIN_PACKAGE): + rd->name = rapl_domain_names[RAPL_DOMAIN_PACKAGE]; + rd->id = RAPL_DOMAIN_PACKAGE; + rd->msrs[0] = MSR_PKG_POWER_LIMIT; + rd->msrs[1] = MSR_PKG_ENERGY_STATUS; + rd->msrs[2] = MSR_PKG_PERF_STATUS; + rd->msrs[3] = 0; + rd->msrs[4] = MSR_PKG_POWER_INFO; + rd->rpl[0].prim_id = PL1_ENABLE; + rd->rpl[0].name = pl1_name; + rd->rpl[1].prim_id = PL2_ENABLE; + rd->rpl[1].name = pl2_name; + break; + case BIT(RAPL_DOMAIN_PP0): + rd->name = rapl_domain_names[RAPL_DOMAIN_PP0]; + rd->id = RAPL_DOMAIN_PP0; + rd->msrs[0] = MSR_PP0_POWER_LIMIT; + rd->msrs[1] = MSR_PP0_ENERGY_STATUS; + rd->msrs[2] = 0; + rd->msrs[3] = MSR_PP0_POLICY; + rd->msrs[4] = 0; + rd->rpl[0].prim_id = PL1_ENABLE; + rd->rpl[0].name = pl1_name; + break; + case BIT(RAPL_DOMAIN_PP1): + rd->name = rapl_domain_names[RAPL_DOMAIN_PP1]; + rd->id = RAPL_DOMAIN_PP1; + rd->msrs[0] = MSR_PP1_POWER_LIMIT; + rd->msrs[1] = MSR_PP1_ENERGY_STATUS; + rd->msrs[2] = 0; + rd->msrs[3] = MSR_PP1_POLICY; + rd->msrs[4] = 0; + rd->rpl[0].prim_id = PL1_ENABLE; + rd->rpl[0].name = pl1_name; + break; + case BIT(RAPL_DOMAIN_DRAM): + rd->name = rapl_domain_names[RAPL_DOMAIN_DRAM]; + rd->id = RAPL_DOMAIN_DRAM; + rd->msrs[0] = MSR_DRAM_POWER_LIMIT; + rd->msrs[1] = MSR_DRAM_ENERGY_STATUS; + rd->msrs[2] = MSR_DRAM_PERF_STATUS; + rd->msrs[3] = 0; + rd->msrs[4] = MSR_DRAM_POWER_INFO; + rd->rpl[0].prim_id = PL1_ENABLE; + rd->rpl[0].name = pl1_name; + break; + } + if (mask) { + rd->package_id = rp->id; + rd++; + } + } +} + +static u64 rapl_unit_xlate(int package, enum unit_type type, u64 value, + int to_raw) +{ + u64 
divisor = 1; + int scale = 1; /* scale to user friendly data without floating point */ + u64 f, y; /* fraction and exp. used for time unit */ + struct rapl_package *rp; + + rp = find_package_by_id(package); + if (!rp) + return value; + + switch (type) { + case POWER_UNIT: + divisor = rp->power_unit_divisor; + scale = POWER_UNIT_SCALE; + break; + case ENERGY_UNIT: + scale = ENERGY_UNIT_SCALE; + divisor = rp->energy_unit_divisor; + break; + case TIME_UNIT: + divisor = rp->time_unit_divisor; + scale = TIME_UNIT_SCALE; + /* special processing based on 2^Y*(1+F)/4 = val/divisor, refer + * to Intel Software Developer's manual Vol. 3a, CH 14.7.4. + */ + if (!to_raw) { + f = (value & 0x60) >> 5; + y = value & 0x1f; + value = (1 << y) * (4 + f) * scale / 4; + return div64_u64(value, divisor); + } else { + do_div(value, scale); + value *= divisor; + y = ilog2(value); + f = div64_u64(4 * (value - (1 << y)), 1 << y); + value = (y & 0x1f) | ((f & 0x3) << 5); + return value; + } + break; + case ARBITRARY_UNIT: + default: + return value; + }; + + if (to_raw) + return div64_u64(value * divisor, scale); + else + return div64_u64(value * scale, divisor); +} + +/* in the order of enum rapl_primitives */ +static struct rapl_primitive_info rpi[] = { + /* name, mask, shift, msr index, unit divisor */ + PRIMITIVE_INFO_INIT(ENERGY_COUNTER, ENERGY_STATUS_MASK, 0, + RAPL_DOMAIN_MSR_STATUS, ENERGY_UNIT, 0), + PRIMITIVE_INFO_INIT(POWER_LIMIT1, POWER_LIMIT1_MASK, 0, + RAPL_DOMAIN_MSR_LIMIT, POWER_UNIT, 0), + PRIMITIVE_INFO_INIT(POWER_LIMIT2, POWER_LIMIT2_MASK, 32, + RAPL_DOMAIN_MSR_LIMIT, POWER_UNIT, 0), + PRIMITIVE_INFO_INIT(FW_LOCK, POWER_PP_LOCK, 31, + RAPL_DOMAIN_MSR_LIMIT, ARBITRARY_UNIT, 0), + PRIMITIVE_INFO_INIT(PL1_ENABLE, POWER_LIMIT1_ENABLE, 15, + RAPL_DOMAIN_MSR_LIMIT, ARBITRARY_UNIT, 0), + PRIMITIVE_INFO_INIT(PL1_CLAMP, POWER_LIMIT1_CLAMP, 16, + RAPL_DOMAIN_MSR_LIMIT, ARBITRARY_UNIT, 0), + PRIMITIVE_INFO_INIT(PL2_ENABLE, POWER_LIMIT2_ENABLE, 47, + RAPL_DOMAIN_MSR_LIMIT, 
ARBITRARY_UNIT, 0), + PRIMITIVE_INFO_INIT(PL2_CLAMP, POWER_LIMIT2_CLAMP, 48, + RAPL_DOMAIN_MSR_LIMIT, ARBITRARY_UNIT, 0), + PRIMITIVE_INFO_INIT(TIME_WINDOW1, TIME_WINDOW1_MASK, 17, + RAPL_DOMAIN_MSR_LIMIT, TIME_UNIT, 0), + PRIMITIVE_INFO_INIT(TIME_WINDOW2, TIME_WINDOW2_MASK, 49, + RAPL_DOMAIN_MSR_LIMIT, TIME_UNIT, 0), + PRIMITIVE_INFO_INIT(THERMAL_SPEC_POWER, POWER_INFO_THERMAL_SPEC_MASK, + 0, RAPL_DOMAIN_MSR_INFO, POWER_UNIT, 0), + PRIMITIVE_INFO_INIT(MAX_POWER, POWER_INFO_MAX_MASK, 32, + RAPL_DOMAIN_MSR_INFO, POWER_UNIT, 0), + PRIMITIVE_INFO_INIT(MIN_POWER, POWER_INFO_MIN_MASK, 16, + RAPL_DOMAIN_MSR_INFO, POWER_UNIT, 0), + PRIMITIVE_INFO_INIT(MAX_TIME_WINDOW, POWER_INFO_MAX_TIME_WIN_MASK, 48, + RAPL_DOMAIN_MSR_INFO, TIME_UNIT, 0), + PRIMITIVE_INFO_INIT(THROTTLED_TIME, PERF_STATUS_THROTTLE_TIME_MASK, 0, + RAPL_DOMAIN_MSR_PERF, TIME_UNIT, 0), + PRIMITIVE_INFO_INIT(PRIORITY_LEVEL, PP_POLICY_MASK, 0, + RAPL_DOMAIN_MSR_POLICY, ARBITRARY_UNIT, 0), + /* non-hardware */ + PRIMITIVE_INFO_INIT(AVERAGE_POWER, 0, 0, 0, POWER_UNIT, + RAPL_PRIMITIVE_DERIVED), + {NULL, 0, 0, 0}, +}; + +/* Read primitive data based on its related struct rapl_primitive_info. + * if xlate flag is set, return translated data based on data units, i.e. + * time, energy, and power. + * RAPL MSRs are non-architectual and are laid out not consistently across + * domains. Here we use primitive info to allow writing consolidated access + * functions. + * For a given primitive, it is processed by MSR mask and shift. Unit conversion + * is pre-assigned based on RAPL unit MSRs read at init time. 
+ * 63-------------------------- 31--------------------------- 0 + * | xxxxx (mask) | + * | |<- shift ----------------| + * 63-------------------------- 31--------------------------- 0 + */ +static int rapl_read_data_raw(struct rapl_domain *rd, + enum rapl_primitives prim, + bool xlate, u64 *data) +{ + u64 value, final; + u32 msr; + struct rapl_primitive_info *rp = &rpi[prim]; + int cpu; + + if (!rp->name || rp->flag & RAPL_PRIMITIVE_DUMMY) + return -EINVAL; + + msr = rd->msrs[rp->id]; + if (!msr) + return -EINVAL; + /* use physical package id to look up active cpus */ + cpu = find_active_cpu_on_package(rd->package_id); + if (cpu < 0) + return cpu; + + /* special-case package domain, which uses a different bit*/ + if (prim == FW_LOCK && rd->id == RAPL_DOMAIN_PACKAGE) { + rp->mask = POWER_PACKAGE_LOCK; + rp->shift = 63; + } + /* non-hardware data are collected by the polling thread */ + if (rp->flag & RAPL_PRIMITIVE_DERIVED) { + *data = rd->rdd.primitives[prim]; + return 0; + } + + if (rdmsrl_safe_on_cpu(cpu, msr, &value)) { + pr_debug("failed to read msr 0x%x on cpu %d\n", msr, cpu); + return -EIO; + } + + final = value & rp->mask; + final = final >> rp->shift; + if (xlate) + *data = rapl_unit_xlate(rd->package_id, rp->unit, final, 0); + else + *data = final; + + return 0; +} + +/* Similar use of primitive info in the read counterpart */ +static int rapl_write_data_raw(struct rapl_domain *rd, + enum rapl_primitives prim, + unsigned long long value) +{ + u64 msr_val; + u32 msr; + struct rapl_primitive_info *rp = &rpi[prim]; + int cpu; + + cpu = find_active_cpu_on_package(rd->package_id); + if (cpu < 0) + return cpu; + msr = rd->msrs[rp->id]; + if (rdmsrl_safe_on_cpu(cpu, msr, &msr_val)) { + dev_dbg(&rd->power_zone.dev, + "failed to read msr 0x%x on cpu %d\n", msr, cpu); + return -EIO; + } + value = rapl_unit_xlate(rd->package_id, rp->unit, value, 1); + msr_val &= ~rp->mask; + msr_val |= value << rp->shift; + if (wrmsrl_safe_on_cpu(cpu, msr, msr_val)) { + 
dev_dbg(&rd->power_zone.dev, + "failed to write msr 0x%x on cpu %d\n", msr, cpu); + return -EIO; + } + + return 0; +} + +static int rapl_check_unit(struct rapl_package *rp, int cpu) +{ + u64 msr_val; + u32 value; + + if (rdmsrl_safe_on_cpu(cpu, MSR_RAPL_POWER_UNIT, &msr_val)) { + pr_err("Failed to read power unit MSR 0x%x on CPU %d, exit.\n", + MSR_RAPL_POWER_UNIT, cpu); + return -ENODEV; + } + + /* Raw RAPL data stored in MSRs are in certain scales. We need to + * convert them into standard units based on the divisors reported in + * the RAPL unit MSRs. + * i.e. + * energy unit: 1/enery_unit_divisor Joules + * power unit: 1/power_unit_divisor Watts + * time unit: 1/time_unit_divisor Seconds + */ + value = (msr_val & ENERGY_UNIT_MASK) >> ENERGY_UNIT_OFFSET; + rp->energy_unit_divisor = 1 << value; + + + value = (msr_val & POWER_UNIT_MASK) >> POWER_UNIT_OFFSET; + rp->power_unit_divisor = 1 << value; + + value = (msr_val & TIME_UNIT_MASK) >> TIME_UNIT_OFFSET; + rp->time_unit_divisor = 1 << value; + + pr_debug("Physical package %d units: energy=%d, time=%d, power=%d\n", + rp->id, + rp->energy_unit_divisor, + rp->time_unit_divisor, + rp->power_unit_divisor); + + return 0; +} + +/* REVISIT: + * When package power limit is set artificially low by RAPL, LVT + * thermal interrupt for package power limit should be ignored + * since we are not really exceeding the real limit. The intention + * is to avoid excessive interrupts while we are trying to save power. + * A useful feature might be routing the package_power_limit interrupt + * to userspace via eventfd. once we have a usecase, this is simple + * to do by adding an atomic notifier. 
+ */ + +static void package_power_limit_irq_save(int package_id) +{ + u32 l, h = 0; + int cpu; + struct rapl_package *rp; + + rp = find_package_by_id(package_id); + if (!rp) + return; + + if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN)) + return; + + cpu = find_active_cpu_on_package(package_id); + if (cpu < 0) + return; + /* save the state of PLN irq mask bit before disabling it */ + rdmsr_safe_on_cpu(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h); + if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) { + rp->power_limit_irq = l & PACKAGE_THERM_INT_PLN_ENABLE; + rp->power_limit_irq |= PACKAGE_PLN_INT_SAVED; + } + l &= ~PACKAGE_THERM_INT_PLN_ENABLE; + wrmsr_on_cpu(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); +} + +/* restore per package power limit interrupt enable state */ +static void package_power_limit_irq_restore(int package_id) +{ + u32 l, h; + int cpu; + struct rapl_package *rp; + + rp = find_package_by_id(package_id); + if (!rp) + return; + + if (!boot_cpu_has(X86_FEATURE_PTS) || !boot_cpu_has(X86_FEATURE_PLN)) + return; + + cpu = find_active_cpu_on_package(package_id); + if (cpu < 0) + return; + + /* irq enable state not saved, nothing to restore */ + if (!(rp->power_limit_irq & PACKAGE_PLN_INT_SAVED)) + return; + rdmsr_safe_on_cpu(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, &l, &h); + + if (rp->power_limit_irq & PACKAGE_THERM_INT_PLN_ENABLE) + l |= PACKAGE_THERM_INT_PLN_ENABLE; + else + l &= ~PACKAGE_THERM_INT_PLN_ENABLE; + + wrmsr_on_cpu(cpu, MSR_IA32_PACKAGE_THERM_INTERRUPT, l, h); +} + +static const struct x86_cpu_id rapl_ids[] = { + { X86_VENDOR_INTEL, 6, 0x2a},/* SNB */ + { X86_VENDOR_INTEL, 6, 0x2d},/* SNB EP */ + { X86_VENDOR_INTEL, 6, 0x3a},/* IVB */ + { X86_VENDOR_INTEL, 6, 0x45},/* HSW */ + /* TODO: Add more CPU IDs after testing */ + {} +}; +MODULE_DEVICE_TABLE(x86cpu, rapl_ids); + +/* read once for all raw primitive data for all packages, domains */ +static void rapl_update_domain_data(void) +{ + int dmn, prim; + u64 val; + 
struct rapl_package *rp; + + list_for_each_entry(rp, &rapl_packages, plist) { + for (dmn = 0; dmn < rp->nr_domains; dmn++) { + pr_debug("update package %d domain %s data\n", rp->id, + rp->domains[dmn].name); + /* exclude non-raw primitives */ + for (prim = 0; prim < NR_RAW_PRIMITIVES; prim++) + if (!rapl_read_data_raw(&rp->domains[dmn], prim, + rpi[prim].unit, + &val)) + rp->domains[dmn].rdd.primitives[prim] = + val; + } + } + +} + +static int rapl_unregister_powercap(void) +{ + struct rapl_package *rp; + struct rapl_domain *rd, *rd_package = NULL; + + /* unregister all active rapl packages from the powercap layer, + * hotplug lock held + */ + list_for_each_entry(rp, &rapl_packages, plist) { + package_power_limit_irq_restore(rp->id); + + for (rd = rp->domains; rd < rp->domains + rp->nr_domains; + rd++) { + pr_debug("remove package, undo power limit on %d: %s\n", + rp->id, rd->name); + rapl_write_data_raw(rd, PL1_ENABLE, 0); + rapl_write_data_raw(rd, PL2_ENABLE, 0); + rapl_write_data_raw(rd, PL1_CLAMP, 0); + rapl_write_data_raw(rd, PL2_CLAMP, 0); + if (rd->id == RAPL_DOMAIN_PACKAGE) { + rd_package = rd; + continue; + } + powercap_unregister_zone(control_type, &rd->power_zone); + } + /* do the package zone last */ + if (rd_package) + powercap_unregister_zone(control_type, + &rd_package->power_zone); + } + powercap_unregister_control_type(control_type); + + return 0; +} + +static int rapl_package_register_powercap(struct rapl_package *rp) +{ + struct rapl_domain *rd; + int ret = 0; + char dev_name[17]; /* max domain name = 7 + 1 + 8 for int + 1 for null*/ + struct powercap_zone *power_zone = NULL; + int nr_pl; + + /* first we register package domain as the parent zone*/ + for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) { + if (rd->id == RAPL_DOMAIN_PACKAGE) { + nr_pl = find_nr_power_limit(rd); + pr_debug("register socket %d package domain %s\n", + rp->id, rd->name); + memset(dev_name, 0, sizeof(dev_name)); + snprintf(dev_name, sizeof(dev_name), 
"%s-%d", + rd->name, rp->id); + power_zone = powercap_register_zone(&rd->power_zone, + control_type, + dev_name, NULL, + &zone_ops[rd->id], + nr_pl, + &constraint_ops); + if (IS_ERR(power_zone)) { + pr_debug("failed to register package, %d\n", + rp->id); + ret = PTR_ERR(power_zone); + goto exit_package; + } + /* track parent zone in per package/socket data */ + rp->power_zone = power_zone; + /* done, only one package domain per socket */ + break; + } + } + if (!power_zone) { + pr_err("no package domain found, unknown topology!\n"); + ret = -ENODEV; + goto exit_package; + } + /* now register domains as children of the socket/package*/ + for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) { + if (rd->id == RAPL_DOMAIN_PACKAGE) + continue; + /* number of power limits per domain varies */ + nr_pl = find_nr_power_limit(rd); + power_zone = powercap_register_zone(&rd->power_zone, + control_type, rd->name, + rp->power_zone, + &zone_ops[rd->id], nr_pl, + &constraint_ops); + + if (IS_ERR(power_zone)) { + pr_debug("failed to register power_zone, %d:%s:%s\n", + rp->id, rd->name, dev_name); + ret = PTR_ERR(power_zone); + goto err_cleanup; + } + } + +exit_package: + return ret; +err_cleanup: + /* clean up previously initialized domains within the package if we + * failed after the first domain setup. 
+ */ + while (--rd >= rp->domains) { + pr_debug("unregister package %d domain %s\n", rp->id, rd->name); + powercap_unregister_zone(control_type, &rd->power_zone); + } + + return ret; +} + +static int rapl_register_powercap(void) +{ + struct rapl_domain *rd; + struct rapl_package *rp; + int ret = 0; + + control_type = powercap_register_control_type(NULL, "intel-rapl", NULL); + if (IS_ERR(control_type)) { + pr_debug("failed to register powercap control_type.\n"); + return PTR_ERR(control_type); + } + /* read the initial data */ + rapl_update_domain_data(); + list_for_each_entry(rp, &rapl_packages, plist) + if (rapl_package_register_powercap(rp)) + goto err_cleanup_package; + return ret; + +err_cleanup_package: + /* clean up previously initialized packages */ + list_for_each_entry_continue_reverse(rp, &rapl_packages, plist) { + for (rd = rp->domains; rd < rp->domains + rp->nr_domains; + rd++) { + pr_debug("unregister zone/package %d, %s domain\n", + rp->id, rd->name); + powercap_unregister_zone(control_type, &rd->power_zone); + } + } + + return ret; +} + +static int rapl_check_domain(int cpu, int domain) +{ + unsigned msr; + u64 val1, val2 = 0; + int retry = 0; + + switch (domain) { + case RAPL_DOMAIN_PACKAGE: + msr = MSR_PKG_ENERGY_STATUS; + break; + case RAPL_DOMAIN_PP0: + msr = MSR_PP0_ENERGY_STATUS; + break; + case RAPL_DOMAIN_PP1: + msr = MSR_PP1_ENERGY_STATUS; + break; + case RAPL_DOMAIN_DRAM: + msr = MSR_DRAM_ENERGY_STATUS; + break; + default: + pr_err("invalid domain id %d\n", domain); + return -EINVAL; + } + if (rdmsrl_safe_on_cpu(cpu, msr, &val1)) + return -ENODEV; + + /* energy counters roll slowly on some domains */ + while (++retry < 10) { + usleep_range(10000, 15000); + rdmsrl_safe_on_cpu(cpu, msr, &val2); + if ((val1 & ENERGY_STATUS_MASK) != (val2 & ENERGY_STATUS_MASK)) + return 0; + } + /* if energy counter does not change, report as bad domain */ + pr_info("domain %s energy ctr %llu:%llu not working, skip\n", + rapl_domain_names[domain], val1, val2); 
+ + return -ENODEV; +} + +/* Detect active and valid domains for the given CPU, caller must + * ensure the CPU belongs to the targeted package and CPU hotlug is disabled. + */ +static int rapl_detect_domains(struct rapl_package *rp, int cpu) +{ + int i; + int ret = 0; + struct rapl_domain *rd; + u64 locked; + + for (i = 0; i < RAPL_DOMAIN_MAX; i++) { + /* use physical package id to read counters */ + if (!rapl_check_domain(cpu, i)) + rp->domain_map |= 1 << i; + } + rp->nr_domains = bitmap_weight(&rp->domain_map, RAPL_DOMAIN_MAX); + if (!rp->nr_domains) { + pr_err("no valid rapl domains found in package %d\n", rp->id); + ret = -ENODEV; + goto done; + } + pr_debug("found %d domains on package %d\n", rp->nr_domains, rp->id); + + rp->domains = kcalloc(rp->nr_domains + 1, sizeof(struct rapl_domain), + GFP_KERNEL); + if (!rp->domains) { + ret = -ENOMEM; + goto done; + } + rapl_init_domains(rp); + + for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) { + /* check if the domain is locked by BIOS */ + if (rapl_read_data_raw(rd, FW_LOCK, false, &locked)) { + pr_info("RAPL package %d domain %s locked by BIOS\n", + rp->id, rd->name); + rd->state |= DOMAIN_STATE_BIOS_LOCKED; + } + } + + +done: + return ret; +} + +static bool is_package_new(int package) +{ + struct rapl_package *rp; + + /* caller prevents cpu hotplug, there will be no new packages added + * or deleted while traversing the package list, no need for locking. + */ + list_for_each_entry(rp, &rapl_packages, plist) + if (package == rp->id) + return false; + + return true; +} + +/* RAPL interface can be made of a two-level hierarchy: package level and domain + * level. We first detect the number of packages then domains of each package. + * We have to consider the possiblity of CPU online/offline due to hotplug and + * other scenarios. 
+ */
+static int rapl_detect_topology(void)
+{
+	int i;
+	int ret;
+	int phy_package_id;
+	struct rapl_package *new_package, *rp;
+
+	for_each_online_cpu(i) {
+		phy_package_id = topology_physical_package_id(i);
+		if (is_package_new(phy_package_id)) {
+			new_package = kzalloc(sizeof(*new_package), GFP_KERNEL);
+			if (!new_package) {
+				rapl_cleanup_data();
+				return -ENOMEM;
+			}
+			/* add the new package to the list */
+			new_package->id = phy_package_id;
+			new_package->nr_cpus = 1;
+
+			/* check if the package contains valid domains */
+			ret = rapl_detect_domains(new_package, i);
+			if (!ret)
+				ret = rapl_check_unit(new_package, i);
+			if (ret) {
+				kfree(new_package->domains);
+				kfree(new_package);
+				/* free up the packages already initialized */
+				rapl_cleanup_data();
+				return ret;
+			}
+			INIT_LIST_HEAD(&new_package->plist);
+			list_add(&new_package->plist, &rapl_packages);
+		} else {
+			rp = find_package_by_id(phy_package_id);
+			if (rp)
+				++rp->nr_cpus;
+		}
+	}
+
+	return 0;
+}
+
+/* called from CPU hotplug notifier, hotplug lock held */
+static void rapl_remove_package(struct rapl_package *rp)
+{
+	struct rapl_domain *rd, *rd_package = NULL;
+
+	for (rd = rp->domains; rd < rp->domains + rp->nr_domains; rd++) {
+		if (rd->id == RAPL_DOMAIN_PACKAGE) {
+			rd_package = rd;
+			continue;
+		}
+		pr_debug("remove package %d, %s domain\n", rp->id, rd->name);
+		powercap_unregister_zone(control_type, &rd->power_zone);
+	}
+	/* Do parent zone last. The domain array is freed by release_zone()
+	 * of the package zone; without a package zone nobody else frees it.
+	 */
+	if (rd_package)
+		powercap_unregister_zone(control_type, &rd_package->power_zone);
+	else
+		kfree(rp->domains);
+	list_del(&rp->plist);
+	kfree(rp);
+}
+
+/* called from CPU hotplug notifier, hotplug lock held
+ *
+ * Set up and register the package that @cpu belongs to. Returns 0 when the
+ * package was added to rapl_packages, otherwise a negative error code and
+ * nothing is left allocated.
+ */
+static int rapl_add_package(int cpu)
+{
+	int ret;
+	int phy_package_id;
+	struct rapl_package *rp;
+
+	phy_package_id = topology_physical_package_id(cpu);
+	rp = kzalloc(sizeof(struct rapl_package), GFP_KERNEL);
+	if (!rp)
+		return -ENOMEM;
+
+	/* add the new package to the list */
+	rp->id = phy_package_id;
+	rp->nr_cpus = 1;
+	/* check if the package contains valid domains */
+	ret = rapl_detect_domains(rp, cpu);
+	if (!ret)
+		ret = rapl_check_unit(rp, cpu);
+	if (ret)
+		goto err_free_package;
+
+	/* a registration failure must be reported, not returned as 0 */
+	ret = rapl_package_register_powercap(rp);
+	if (ret)
+		goto err_free_package;
+
+	INIT_LIST_HEAD(&rp->plist);
+	list_add(&rp->plist, &rapl_packages);
+
+	return 0;
+
+err_free_package:
+	kfree(rp->domains);
+	kfree(rp);
+
+	return ret;
+}
+
+/* Handles CPU hotplug on multi-socket systems.
+ * If a CPU goes online as the first CPU of the physical package
+ * we add the RAPL package to the system. Similarly, when the last
+ * CPU of the package is removed, we remove the RAPL package and its
+ * associated domains. Cooling devices are handled accordingly at
+ * per-domain level.
+ */
+static int rapl_cpu_callback(struct notifier_block *nfb,
+				unsigned long action, void *hcpu)
+{
+	unsigned long cpu = (unsigned long)hcpu;
+	int phy_package_id;
+	struct rapl_package *rp;
+
+	phy_package_id = topology_physical_package_id(cpu);
+	switch (action) {
+	case CPU_ONLINE:
+	case CPU_ONLINE_FROZEN:
+	case CPU_DOWN_FAILED:
+	case CPU_DOWN_FAILED_FROZEN:
+		rp = find_package_by_id(phy_package_id);
+		if (rp)
+			++rp->nr_cpus;
+		else if (rapl_add_package(cpu))
+			pr_debug("failed to add package %d\n", phy_package_id);
+		break;
+	case CPU_DOWN_PREPARE:
+	case CPU_DOWN_PREPARE_FROZEN:
+		rp = find_package_by_id(phy_package_id);
+		if (!rp)
+			break;
+		if (--rp->nr_cpus == 0)
+			rapl_remove_package(rp);
+		break;
+	default:
+		break;
+	}
+
+	return NOTIFY_OK;
+}
+
+static struct notifier_block rapl_cpu_notifier = {
+	.notifier_call = rapl_cpu_callback,
+};
+
+/* Module entry point: detect packages and domains, register them with the
+ * powercap layer and start listening for CPU hotplug events.
+ */
+static int __init rapl_init(void)
+{
+	int ret = 0;
+
+	if (!x86_match_cpu(rapl_ids)) {
+		pr_err("driver does not support CPU family %d model %d\n",
+			boot_cpu_data.x86, boot_cpu_data.x86_model);
+
+		return -ENODEV;
+	}
+	/* prevent CPU hotplug during detection */
+	get_online_cpus();
+	ret = rapl_detect_topology();
+	if (ret)
+		goto done;
+
+	if (rapl_register_powercap()) {
+		rapl_cleanup_data();
+		ret = -ENODEV;
+		goto done;
+	}
+	register_hotcpu_notifier(&rapl_cpu_notifier);
+done:
+	put_online_cpus();
+
+	return ret;
+}
+
+static void __exit rapl_exit(void)
+{
+	get_online_cpus();
+ unregister_hotcpu_notifier(&rapl_cpu_notifier); + rapl_unregister_powercap(); + rapl_cleanup_data(); + put_online_cpus(); +} + +module_init(rapl_init); +module_exit(rapl_exit); + +MODULE_DESCRIPTION("Driver for Intel RAPL (Running Average Power Limit)"); +MODULE_AUTHOR("Jacob Pan "); +MODULE_LICENSE("GPL v2"); From 9e3410b764b79670a59d6c1ccdcad483b92c058c Mon Sep 17 00:00:00 2001 From: Thierry Reding Date: Wed, 23 Oct 2013 13:37:35 +0200 Subject: [PATCH 7/7] PowerCap: Convert class code to use dev_groups The newly added power capping framework uses the obsolete .dev_attrs field of struct class. However this field will be removed in 3.13, so convert the code to use the .dev_groups field instead. Signed-off-by: Thierry Reding Acked-by: Greg Kroah-Hartman Signed-off-by: Rafael J. Wysocki --- drivers/powercap/powercap_sys.c | 12 +++++++----- 1 file changed, 7 insertions(+), 5 deletions(-) diff --git a/drivers/powercap/powercap_sys.c b/drivers/powercap/powercap_sys.c index c22fa4c78eaa..21814f90a44b 100644 --- a/drivers/powercap/powercap_sys.c +++ b/drivers/powercap/powercap_sys.c @@ -472,16 +472,18 @@ static ssize_t enabled_store(struct device *dev, return -ENOSYS; } -static struct device_attribute powercap_def_attrs[] = { - __ATTR(enabled, S_IWUSR | S_IRUGO, enabled_show, - enabled_store), - __ATTR_NULL +static DEVICE_ATTR_RW(enabled); + +static struct attribute *powercap_attrs[] = { + &dev_attr_enabled.attr, + NULL, }; +ATTRIBUTE_GROUPS(powercap); static struct class powercap_class = { .name = "powercap", .dev_release = powercap_release, - .dev_attrs = powercap_def_attrs, + .dev_groups = powercap_groups, }; struct powercap_zone *powercap_register_zone(