From afb8a9b70b86866a60e08b2956ae4e1406390336 Mon Sep 17 00:00:00 2001
From: Gautham R Shenoy <ego@in.ibm.com>
Date: Thu, 18 Dec 2008 23:26:09 +0530
Subject: [PATCH] sched: framework for sched_mc/smt_power_savings=N

Impact: extend range of /sys/devices/system/cpu/sched_mc_power_savings

Currently the sched_mc/smt_power_savings variable is a boolean,
which either enables or disables topology based power savings.
This patch extends the behaviour of the variable from boolean to
multivalued, such that based on the value, we decide how
aggressively do we want to perform powersavings balance at
appropriate sched domain based on topology.

Variable levels of power saving tunable would benefit end user to
match the required level of power savings vs performance
trade-off depending on the system configuration and workloads.

This version makes the sched_mc_power_savings global variable to
take more values (0,1,2).  Later versions can have a single
tunable called sched_power_savings instead of
sched_{mc,smt}_power_savings.

Signed-off-by: Gautham R Shenoy <ego@in.ibm.com>
Signed-off-by: Vaidyanathan Srinivasan <svaidy@linux.vnet.ibm.com>
Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
---
 include/linux/sched.h | 11 +++++++++++
 kernel/sched.c        | 17 ++++++++++++++---
 2 files changed, 25 insertions(+), 3 deletions(-)

diff --git a/include/linux/sched.h b/include/linux/sched.h
index 1210fb0e45ff..a96726658eca 100644
--- a/include/linux/sched.h
+++ b/include/linux/sched.h
@@ -763,6 +763,17 @@ enum cpu_idle_type {
 #define SD_SERIALIZE		1024	/* Only a single load balancing instance */
 #define SD_WAKE_IDLE_FAR	2048	/* Gain latency sacrificing cache hit */
 
+enum powersavings_balance_level {
+	POWERSAVINGS_BALANCE_NONE = 0,  /* No power saving load balance */
+	POWERSAVINGS_BALANCE_BASIC,	/* Fill one thread/core/package
+					 * first for long running threads
+					 */
+	POWERSAVINGS_BALANCE_WAKEUP,	/* Also bias task wakeups to semi-idle
+					 * cpu package for power savings
+					 */
+	MAX_POWERSAVINGS_BALANCE_LEVELS
+};
+
 extern int sched_mc_power_savings, sched_smt_power_savings;
 
 static inline int sd_balance_for_mc_power(void)
diff --git a/kernel/sched.c b/kernel/sched.c
index b309027bf9e8..56b285cd5350 100644
--- a/kernel/sched.c
+++ b/kernel/sched.c
@@ -7906,14 +7906,25 @@ int arch_reinit_sched_domains(void)
 static ssize_t sched_power_savings_store(const char *buf, size_t count, int smt)
 {
 	int ret;
+	unsigned int level = 0;
 
-	if (buf[0] != '0' && buf[0] != '1')
+	if (sscanf(buf, "%u", &level) != 1)
+		return -EINVAL;
+
+	/*
+	 * level is always be positive so don't check for
+	 * level < POWERSAVINGS_BALANCE_NONE which is 0
+	 * What happens on 0 or 1 byte write,
+	 * need to check for count as well?
+	 */
+
+	if (level >= MAX_POWERSAVINGS_BALANCE_LEVELS)
 		return -EINVAL;
 
 	if (smt)
-		sched_smt_power_savings = (buf[0] == '1');
+		sched_smt_power_savings = level;
 	else
-		sched_mc_power_savings = (buf[0] == '1');
+		sched_mc_power_savings = level;
 
 	ret = arch_reinit_sched_domains();