Merge branch 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull scheduler updates from Ingo Molnar:
 "The main changes are:

   - Migrate CPU-intense 'misfit' tasks on asymmetric capacity systems,
     to better utilize (much) faster 'big core' CPUs. (Morten Rasmussen,
     Valentin Schneider)

   - Topology handling improvements, in particular when CPU capacity
     changes and related load-balancing fixes/improvements (Morten
     Rasmussen)

   - ... plus misc other improvements, fixes and updates"

* 'sched-core-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip: (28 commits)
  sched/completions/Documentation: Add recommendation for dynamic and ONSTACK completions
  sched/completions/Documentation: Clean up the document some more
  sched/completions/Documentation: Fix a couple of punctuation nits
  cpu/SMT: State SMT is disabled even with nosmt and without "=force"
  sched/core: Fix comment regarding nr_iowait_cpu() and get_iowait_load()
  sched/fair: Remove setting task's se->runnable_weight during PELT update
  sched/fair: Disable LB_BIAS by default
  sched/pelt: Fix warning and clean up IRQ PELT config
  sched/topology: Make local variables static
  sched/debug: Use symbolic names for task state constants
  sched/numa: Remove unused numa_stats::nr_running field
  sched/numa: Remove unused code from update_numa_stats()
  sched/debug: Explicitly cast sched_feat() to bool
  sched/core: Disable SD_PREFER_SIBLING on asymmetric CPU capacity domains
  sched/fair: Don't move tasks to lower capacity CPUs unless necessary
  sched/fair: Set rq->rd->overload when misfit
  sched/fair: Wrap rq->rd->overload accesses with READ/WRITE_ONCE()
  sched/core: Change root_domain->overload type to int
  sched/fair: Change 'prefer_sibling' type to bool
  sched/fair: Kick nohz balance if rq->misfit_task_load
  ...
commit 42f52e1c59

Documentation/scheduler/completion.txt
@@ -1,146 +1,187 @@
completions - wait for completion handling
|
||||
==========================================
|
||||
|
||||
This document was originally written based on 3.18.0 (linux-next)
|
||||
Completions - "wait for completion" barrier APIs
|
||||
================================================
|
||||
|
||||
Introduction:
|
||||
-------------
|
||||
|
||||
If you have one or more threads of execution that must wait for some process
|
||||
If you have one or more threads that must wait for some kernel activity
|
||||
to have reached a point or a specific state, completions can provide a
|
||||
race-free solution to this problem. Semantically they are somewhat like a
|
||||
pthread_barrier and have similar use-cases.
|
||||
pthread_barrier() and have similar use-cases.
|
||||
|
||||
Completions are a code synchronization mechanism which is preferable to any
|
||||
misuse of locks. Any time you think of using yield() or some quirky
|
||||
msleep(1) loop to allow something else to proceed, you probably want to
|
||||
look into using one of the wait_for_completion*() calls instead. The
|
||||
advantage of using completions is clear intent of the code, but also more
|
||||
efficient code as both threads can continue until the result is actually
|
||||
needed.
|
||||
misuse of locks/semaphores and busy-loops. Any time you think of using
|
||||
yield() or some quirky msleep(1) loop to allow something else to proceed,
|
||||
you probably want to look into using one of the wait_for_completion*()
|
||||
calls and complete() instead.
|
||||
|
||||
Completions are built on top of the generic event infrastructure in Linux,
|
||||
with the event reduced to a simple flag (appropriately called "done") in
|
||||
struct completion that tells the waiting threads of execution if they
|
||||
can continue safely.
|
||||
The advantage of using completions is that they have a well defined, focused
|
||||
purpose which makes it very easy to see the intent of the code, but they
|
||||
also result in more efficient code as all threads can continue execution
|
||||
until the result is actually needed, and both the waiting and the signalling
|
||||
is highly efficient using low level scheduler sleep/wakeup facilities.
|
||||
|
||||
As completions are scheduling related, the code is found in
|
||||
Completions are built on top of the waitqueue and wakeup infrastructure of
|
||||
the Linux scheduler. The event the threads on the waitqueue are waiting for
|
||||
is reduced to a simple flag in 'struct completion', appropriately called "done".
|
||||
|
||||
As completions are scheduling related, the code can be found in
|
||||
kernel/sched/completion.c.
|
||||
|
||||
|
||||
Usage:
|
||||
------
|
||||
|
||||
There are three parts to using completions, the initialization of the
|
||||
struct completion, the waiting part through a call to one of the variants of
|
||||
wait_for_completion() and the signaling side through a call to complete()
|
||||
or complete_all(). Further there are some helper functions for checking the
|
||||
state of completions.
|
||||
There are three main parts to using completions:
|
||||
|
||||
To use completions one needs to include <linux/completion.h> and
|
||||
create a variable of type struct completion. The structure used for
|
||||
handling of completions is:
|
||||
- the initialization of the 'struct completion' synchronization object
|
||||
- the waiting part through a call to one of the variants of wait_for_completion(),
|
||||
- the signaling side through a call to complete() or complete_all().
|
||||
|
||||
There are also some helper functions for checking the state of completions.
|
||||
Note that while initialization must happen first, the waiting and signaling
|
||||
part can happen in any order. I.e. it's entirely normal for a thread
|
||||
to have marked a completion as 'done' before another thread checks whether
|
||||
it has to wait for it.
|
||||
|
||||
To use completions you need to #include <linux/completion.h> and
|
||||
create a static or dynamic variable of type 'struct completion',
|
||||
which has only two fields:
|
||||
|
||||
struct completion {
|
||||
unsigned int done;
|
||||
wait_queue_head_t wait;
|
||||
};
|
||||
|
||||
providing the wait queue to place tasks on for waiting and the flag for
|
||||
indicating the state of affairs.
|
||||
This provides the ->wait waitqueue to place tasks on for waiting (if any), and
|
||||
the ->done completion flag for indicating whether it's completed or not.
|
||||
|
||||
Completions should be named to convey the intent of the waiter. A good
|
||||
example is:
|
||||
Completions should be named to refer to the event that is being synchronized on.
|
||||
A good example is:
|
||||
|
||||
wait_for_completion(&early_console_added);
|
||||
|
||||
complete(&early_console_added);
|
||||
|
||||
Good naming (as always) helps code readability.
|
||||
Good, intuitive naming (as always) helps code readability. Naming a completion
|
||||
'complete' is not helpful unless the purpose is super obvious...
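
To make the three parts above concrete, here is a minimal sketch of the whole
pattern. It is not taken from the kernel tree - the frob_request / frob_workfn /
frob_sync names are invented for illustration; only the completion and
workqueue calls are real APIs:

  #include <linux/completion.h>
  #include <linux/workqueue.h>

  struct frob_request {
          struct work_struct work;
          struct completion frob_done;    /* event: frobbing has finished */
          int status;
  };

  static void frob_workfn(struct work_struct *work)
  {
          struct frob_request *req = container_of(work, struct frob_request, work);

          req->status = 0;                /* ... do the actual work ... */
          complete(&req->frob_done);      /* 3) signal the waiter */
  }

  static int frob_sync(struct frob_request *req)
  {
          init_completion(&req->frob_done);       /* 1) initialize */
          INIT_WORK(&req->work, frob_workfn);
          schedule_work(&req->work);

          wait_for_completion(&req->frob_done);   /* 2) wait for the work item */
          return req->status;
  }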
|
||||
|
||||
Initializing completions:
|
||||
-------------------------
|
||||
|
||||
Initialization of dynamically allocated completions, often embedded in
|
||||
other structures, is done with:
|
||||
Dynamically allocated completion objects should preferably be embedded in data
|
||||
structures that are assured to be alive for the life-time of the function/driver,
|
||||
to prevent races with asynchronous complete() calls from occurring.
|
||||
|
||||
void init_completion(&done);
|
||||
Particular care should be taken when using the _timeout() or _killable()/_interruptible()
|
||||
variants of wait_for_completion(), as it must be assured that memory de-allocation
|
||||
does not happen until all related activities (complete() or reinit_completion())
|
||||
have taken place, even if these wait functions return prematurely due to a timeout
|
||||
or a signal triggering.
|
||||
|
||||
Initialization is accomplished by initializing the wait queue and setting
|
||||
the default state to "not available", that is, "done" is set to 0.
|
||||
Initializing of dynamically allocated completion objects is done via a call to
|
||||
init_completion():
|
||||
|
||||
init_completion(&dynamic_object->done);
|
||||
|
||||
In this call we initialize the waitqueue and set ->done to 0, i.e. "not completed"
|
||||
or "not done".
|
||||
The re-initialization function, reinit_completion(), simply resets the
|
||||
done element to "not available", thus again to 0, without touching the
|
||||
wait queue. Calling init_completion() twice on the same completion object is
|
||||
->done field to 0 ("not done"), without touching the waitqueue.
|
||||
Callers of this function must make sure that there are no racy
|
||||
wait_for_completion() calls going on in parallel.
|
||||
|
||||
Calling init_completion() on the same completion object twice is
|
||||
most likely a bug as it re-initializes the queue to an empty queue and
|
||||
enqueued tasks could get "lost" - use reinit_completion() in that case.
|
||||
enqueued tasks could get "lost" - use reinit_completion() in that case,
|
||||
but be aware of other races.
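
For example, a context that handles one command at a time can reset and reuse
its completion between commands. This is a sketch with invented names
(cmd_ctx, submit_cmd); it assumes the caller serializes commands so that no
wait_for_completion() can race with the reinitialization:

  #include <linux/completion.h>

  struct cmd_ctx {
          struct completion cmd_done;
          int status;
  };

  /* Placeholder: kicks off the hardware; its interrupt/callback path
   * ends in complete(&ctx->cmd_done). */
  void submit_cmd(struct cmd_ctx *ctx);

  static int do_one_cmd(struct cmd_ctx *ctx)
  {
          reinit_completion(&ctx->cmd_done);  /* ->done back to 0, waitqueue untouched */
          submit_cmd(ctx);
          wait_for_completion(&ctx->cmd_done);
          return ctx->status;
  }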
|
||||
For static declaration and initialization, macros are available. These are:
|
||||
For static declaration and initialization, macros are available.
|
||||
|
||||
static DECLARE_COMPLETION(setup_done)
|
||||
For static (or global) declarations in file scope you can use DECLARE_COMPLETION():
|
||||
|
||||
used for static declarations in file scope. Within functions the static
|
||||
initialization should always use:
|
||||
static DECLARE_COMPLETION(setup_done);
|
||||
DECLARE_COMPLETION(setup_done);
|
||||
|
||||
Note that in this case the completion is boot time (or module load time)
|
||||
initialized to 'not done' and doesn't require an init_completion() call.
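
A sketch of how such a file-scope completion might be used (the setup and
caller functions are hypothetical): once setup has run, every past and future
waiter proceeds immediately.

  #include <linux/completion.h>

  static DECLARE_COMPLETION(setup_done);      /* 'not done' at module load time */

  static void do_setup(void)
  {
          /* ... bring the hardware up ... */
          complete_all(&setup_done);          /* release current and future waiters */
  }

  static int wait_until_ready(void)           /* may be called from any process context */
  {
          wait_for_completion(&setup_done);   /* returns immediately once setup ran */
          return 0;
  }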
|
||||
When a completion is declared as a local variable within a function,
|
||||
then the initialization should always use DECLARE_COMPLETION_ONSTACK()
|
||||
explicitly, not just to make lockdep happy, but also to make it clear
|
||||
that limited scope had been considered and is intentional:
|
||||
|
||||
DECLARE_COMPLETION_ONSTACK(setup_done)
|
||||
|
||||
suitable for automatic/local variables on the stack and will make lockdep
|
||||
happy. Note also that one needs to make *sure* the completion passed to
|
||||
work threads remains in-scope, and no references remain to on-stack data
|
||||
when the initiating function returns.
|
||||
Note that when using completion objects as local variables you must be
|
||||
acutely aware of the short life time of the function stack: the function
|
||||
must not return to a calling context until all activities (such as waiting
|
||||
threads) have ceased and the completion object is completely unused.
|
||||
|
||||
Using on-stack completions for code that calls any of the _timeout or
|
||||
_interruptible/_killable variants is not advisable as they will require
|
||||
additional synchronization to prevent the on-stack completion object in
|
||||
the timeout/signal cases from going out of scope. Consider using dynamically
|
||||
allocated completions when intending to use the _interruptible/_killable
|
||||
or _timeout variants of wait_for_completion().
|
||||
To emphasise this again: in particular when using some of the waiting API variants
|
||||
with more complex outcomes, such as the timeout or signalling (_timeout(),
|
||||
_killable() and _interruptible()) variants, the wait might complete
|
||||
prematurely while the object might still be in use by another thread - and a return
|
||||
from the wait_on_completion*() caller function will deallocate the function
|
||||
stack and cause subtle data corruption if a complete() is done in some
|
||||
other thread. Simple testing might not trigger these kinds of races.
|
||||
|
||||
If unsure, use dynamically allocated completion objects, preferably embedded
|
||||
in some other long lived object that has a boringly long life time which
|
||||
exceeds the life time of any helper threads using the completion object,
|
||||
or has a lock or other synchronization mechanism to make sure complete()
|
||||
is not called on a freed object.
|
||||
|
||||
A naive DECLARE_COMPLETION() on the stack triggers a lockdep warning.
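
A sketch of the safe on-stack pattern (the flush_* names are invented):
because the plain, non-timeout wait only returns after complete() has run,
nothing can still reference the on-stack objects when the function exits.

  #include <linux/completion.h>
  #include <linux/workqueue.h>

  struct flush_work {
          struct work_struct work;
          struct completion *done;      /* points into the caller's stack frame */
  };

  static void flush_workfn(struct work_struct *work)
  {
          struct flush_work *fw = container_of(work, struct flush_work, work);

          /* ... perform the flush ... */
          complete(fw->done);           /* last access to the on-stack completion */
  }

  static void flush_sync(struct workqueue_struct *wq)
  {
          DECLARE_COMPLETION_ONSTACK(flush_done);
          struct flush_work fw = { .done = &flush_done };

          INIT_WORK_ONSTACK(&fw.work, flush_workfn);
          queue_work(wq, &fw.work);

          wait_for_completion(&flush_done);   /* no timeout: cannot return early */
          destroy_work_on_stack(&fw.work);
  }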
|
||||
Waiting for completions:
|
||||
------------------------
|
||||
|
||||
For a thread of execution to wait for some concurrent work to finish, it
|
||||
calls wait_for_completion() on the initialized completion structure.
|
||||
For a thread to wait for some concurrent activity to finish, it
|
||||
calls wait_for_completion() on the initialized completion structure:
|
||||
|
||||
void wait_for_completion(struct completion *done)
|
||||
|
||||
A typical usage scenario is:
|
||||
|
||||
CPU#1 CPU#2
|
||||
|
||||
struct completion setup_done;
|
||||
|
||||
init_completion(&setup_done);
|
||||
initialize_work(...,&setup_done,...)
|
||||
initialize_work(...,&setup_done,...);
|
||||
|
||||
/* run non-dependent code */ /* do setup */
|
||||
/* run non-dependent code */ /* do setup */
|
||||
|
||||
wait_for_completion(&setup_done); complete(setup_done)
|
||||
wait_for_completion(&setup_done); complete(setup_done);
|
||||
|
||||
This is not implying any temporal order on wait_for_completion() and the
|
||||
call to complete() - if the call to complete() happened before the call
|
||||
This is not implying any particular order between wait_for_completion() and
|
||||
the call to complete() - if the call to complete() happened before the call
|
||||
to wait_for_completion() then the waiting side simply will continue
|
||||
immediately as all dependencies are satisfied if not it will block until
|
||||
immediately as all dependencies are satisfied; if not, it will block until
|
||||
completion is signaled by complete().
|
||||
|
||||
Note that wait_for_completion() is calling spin_lock_irq()/spin_unlock_irq(),
|
||||
so it can only be called safely when you know that interrupts are enabled.
|
||||
Calling it from hard-irq or irqs-off atomic contexts will result in
|
||||
hard-to-detect spurious enabling of interrupts.
|
||||
|
||||
wait_for_completion():
|
||||
|
||||
void wait_for_completion(struct completion *done):
|
||||
Calling it from IRQs-off atomic contexts will result in hard-to-detect
|
||||
spurious enabling of interrupts.
|
||||
|
||||
The default behavior is to wait without a timeout and to mark the task as
|
||||
uninterruptible. wait_for_completion() and its variants are only safe
|
||||
in process context (as they can sleep) but not in atomic context,
|
||||
interrupt context, with disabled irqs. or preemption is disabled - see also
|
||||
interrupt context, with disabled IRQs, or preemption is disabled - see also
|
||||
try_wait_for_completion() below for handling completion in atomic/interrupt
|
||||
context.
|
||||
|
||||
As all variants of wait_for_completion() can (obviously) block for a long
|
||||
time, you probably don't want to call this with held mutexes.
|
||||
time depending on the nature of the activity they are waiting for, so in
|
||||
most cases you probably don't want to call this with held mutexes.
|
||||
|
||||
|
||||
Variants available:
|
||||
-------------------
|
||||
wait_for_completion*() variants available:
|
||||
------------------------------------------
|
||||
|
||||
The below variants all return status and this status should be checked in
|
||||
most(/all) cases - in cases where the status is deliberately not checked you
|
||||
|
@@ -148,51 +189,53 @@ probably want to make a note explaining this (e.g. see
|
|||
arch/arm/kernel/smp.c:__cpu_up()).
|
||||
|
||||
A common problem that occurs is to have unclean assignment of return types,
|
||||
so care should be taken with assigning return-values to variables of proper
|
||||
type. Checking for the specific meaning of return values also has been found
|
||||
to be quite inaccurate e.g. constructs like
|
||||
if (!wait_for_completion_interruptible_timeout(...)) would execute the same
|
||||
code path for successful completion and for the interrupted case - which is
|
||||
probably not what you want.
|
||||
so take care to assign return-values to variables of the proper type.
|
||||
|
||||
Checking for the specific meaning of return values also has been found
|
||||
to be quite inaccurate, e.g. constructs like:
|
||||
|
||||
if (!wait_for_completion_interruptible_timeout(...))
|
||||
|
||||
... would execute the same code path for successful completion and for the
|
||||
interrupted case - which is probably not what you want.
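
A sketch of handling the three outcomes separately, assuming some other
context will eventually complete() the passed-in completion:

  #include <linux/completion.h>
  #include <linux/errno.h>
  #include <linux/jiffies.h>

  static int wait_for_event(struct completion *done)
  {
          long ret = wait_for_completion_interruptible_timeout(done,
                                          msecs_to_jiffies(500));

          if (ret < 0)            /* -ERESTARTSYS: interrupted by a signal */
                  return ret;
          if (!ret)               /* timed out, the event never arrived */
                  return -ETIMEDOUT;

          return 0;               /* ret > 0: completed, ret = remaining jiffies */
  }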
|
||||
int wait_for_completion_interruptible(struct completion *done)
|
||||
|
||||
This function marks the task TASK_INTERRUPTIBLE. If a signal was received
|
||||
while waiting it will return -ERESTARTSYS; 0 otherwise.
|
||||
This function marks the task TASK_INTERRUPTIBLE while it is waiting.
|
||||
If a signal was received while waiting it will return -ERESTARTSYS; 0 otherwise.
|
||||
|
||||
unsigned long wait_for_completion_timeout(struct completion *done,
|
||||
unsigned long timeout)
|
||||
unsigned long wait_for_completion_timeout(struct completion *done, unsigned long timeout)
|
||||
|
||||
The task is marked as TASK_UNINTERRUPTIBLE and will wait at most 'timeout'
|
||||
(in jiffies). If timeout occurs it returns 0 else the remaining time in
|
||||
jiffies (but at least 1). Timeouts are preferably calculated with
|
||||
msecs_to_jiffies() or usecs_to_jiffies(). If the returned timeout value is
|
||||
deliberately ignored a comment should probably explain why (e.g. see
|
||||
drivers/mfd/wm8350-core.c wm8350_read_auxadc())
|
||||
jiffies. If a timeout occurs it returns 0, else the remaining time in
|
||||
jiffies (but at least 1).
|
||||
|
||||
long wait_for_completion_interruptible_timeout(
|
||||
struct completion *done, unsigned long timeout)
|
||||
Timeouts are preferably calculated with msecs_to_jiffies() or usecs_to_jiffies(),
|
||||
to make the code largely HZ-invariant.
|
||||
|
||||
If the returned timeout value is deliberately ignored a comment should probably explain
|
||||
why (e.g. see drivers/mfd/wm8350-core.c wm8350_read_auxadc()).
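
For instance, a hypothetical reset helper with the timeout expressed in
milliseconds, so that it does not depend on the HZ configuration:

  #include <linux/completion.h>
  #include <linux/errno.h>
  #include <linux/jiffies.h>

  static int reset_wait(struct completion *reset_done)
  {
          unsigned long left;

          left = wait_for_completion_timeout(reset_done, msecs_to_jiffies(200));
          if (!left)
                  return -ETIMEDOUT;  /* 0: the reset never signaled */

          return 0;                   /* left (>= 1) is the unused part of the timeout */
  }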
|
||||
long wait_for_completion_interruptible_timeout(struct completion *done, unsigned long timeout)
|
||||
|
||||
This function passes a timeout in jiffies and marks the task as
|
||||
TASK_INTERRUPTIBLE. If a signal was received it will return -ERESTARTSYS;
|
||||
otherwise it returns 0 if the completion timed out or the remaining time in
|
||||
otherwise it returns 0 if the completion timed out, or the remaining time in
|
||||
jiffies if completion occurred.
|
||||
|
||||
Further variants include _killable which uses TASK_KILLABLE as the
|
||||
designated tasks state and will return -ERESTARTSYS if it is interrupted or
|
||||
else 0 if completion was achieved. There is a _timeout variant as well:
|
||||
designated tasks state and will return -ERESTARTSYS if it is interrupted,
|
||||
or 0 if completion was achieved. There is a _timeout variant as well:
|
||||
|
||||
long wait_for_completion_killable(struct completion *done)
|
||||
long wait_for_completion_killable_timeout(struct completion *done,
|
||||
unsigned long timeout)
|
||||
long wait_for_completion_killable_timeout(struct completion *done, unsigned long timeout)
|
||||
|
||||
The _io variants wait_for_completion_io() behave the same as the non-_io
|
||||
variants, except for accounting waiting time as waiting on IO, which has
|
||||
an impact on how the task is accounted in scheduling stats.
|
||||
variants, except for accounting waiting time as 'waiting on IO', which has
|
||||
an impact on how the task is accounted in scheduling/IO stats:
|
||||
|
||||
void wait_for_completion_io(struct completion *done)
|
||||
unsigned long wait_for_completion_io_timeout(struct completion *done
|
||||
unsigned long timeout)
|
||||
unsigned long wait_for_completion_io_timeout(struct completion *done, unsigned long timeout)
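
For instance, waiting for block I/O where the sleep should be charged as I/O
wait. This sketch is modelled loosely on how submit_bio_wait() works; the
submit_and_wait() and my_bio_end_io() names are invented:

  #include <linux/bio.h>
  #include <linux/completion.h>
  #include <linux/errno.h>

  static void my_bio_end_io(struct bio *bio)
  {
          complete(bio->bi_private);          /* wake up the submitter */
  }

  static int submit_and_wait(struct bio *bio)
  {
          DECLARE_COMPLETION_ONSTACK(done);

          bio->bi_private = &done;
          bio->bi_end_io = my_bio_end_io;
          submit_bio(bio);

          wait_for_completion_io(&done);      /* accounted as 'waiting on IO' */
          return bio->bi_status ? -EIO : 0;
  }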
|
||||
|
||||
Signaling completions:
|
||||
|
@@ -200,31 +243,31 @@ Signaling completions:
|
|||
|
||||
A thread that wants to signal that the conditions for continuation have been
|
||||
achieved calls complete() to signal exactly one of the waiters that it can
|
||||
continue.
|
||||
continue:
|
||||
|
||||
void complete(struct completion *done)
|
||||
|
||||
or calls complete_all() to signal all current and future waiters.
|
||||
... or calls complete_all() to signal all current and future waiters:
|
||||
|
||||
void complete_all(struct completion *done)
|
||||
|
||||
The signaling will work as expected even if completions are signaled before
|
||||
a thread starts waiting. This is achieved by the waiter "consuming"
|
||||
(decrementing) the done element of struct completion. Waiting threads
|
||||
(decrementing) the done field of 'struct completion'. Waiting threads
|
||||
wakeup order is the same in which they were enqueued (FIFO order).
|
||||
|
||||
If complete() is called multiple times then this will allow for that number
|
||||
of waiters to continue - each call to complete() will simply increment the
|
||||
done element. Calling complete_all() multiple times is a bug though. Both
|
||||
complete() and complete_all() can be called in hard-irq/atomic context safely.
|
||||
done field. Calling complete_all() multiple times is a bug though. Both
|
||||
complete() and complete_all() can be called in IRQ/atomic context safely.
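
A common shape of this, as a sketch with invented names (mydev, mydev_irq):
the interrupt handler signals a thread that sleeps in process context.
Registering the handler with request_irq() is assumed to happen elsewhere.

  #include <linux/completion.h>
  #include <linux/interrupt.h>

  struct mydev {
          struct completion irq_seen;
  };

  static irqreturn_t mydev_irq(int irq, void *data)
  {
          struct mydev *dev = data;

          /* ... acknowledge the interrupt in the hardware ... */
          complete(&dev->irq_seen);             /* safe in hard-IRQ context */
          return IRQ_HANDLED;
  }

  static void mydev_wait_for_irq(struct mydev *dev)
  {
          wait_for_completion(&dev->irq_seen);  /* process context only */
  }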
|
||||
There only can be one thread calling complete() or complete_all() on a
|
||||
particular struct completion at any time - serialized through the wait
|
||||
There can only be one thread calling complete() or complete_all() on a
|
||||
particular 'struct completion' at any time - serialized through the wait
|
||||
queue spinlock. Any such concurrent calls to complete() or complete_all()
|
||||
probably are a design bug.
|
||||
|
||||
Signaling completion from hard-irq context is fine as it will appropriately
|
||||
lock with spin_lock_irqsave/spin_unlock_irqrestore and it will never sleep.
|
||||
Signaling completion from IRQ context is fine as it will appropriately
|
||||
lock with spin_lock_irqsave()/spin_unlock_irqrestore() and it will never sleep.
|
||||
|
||||
|
||||
try_wait_for_completion()/completion_done():
|
||||
|
@@ -236,7 +279,7 @@ else it consumes one posted completion and returns true.
|
|||
|
||||
bool try_wait_for_completion(struct completion *done)
|
||||
|
||||
Finally, to check the state of a completion without changing it in any way,
|
||||
Finally, to check the state of a completion without changing it in any way,
|
||||
call completion_done(), which returns false if there are no posted
|
||||
completions that were not yet consumed by waiters (implying that there are
|
||||
waiters) and true otherwise;
|
||||
|
@@ -244,4 +287,4 @@ waiters) and true otherwise;
|
|||
bool completion_done(struct completion *done)
|
||||
|
||||
Both try_wait_for_completion() and completion_done() are safe to be called in
|
||||
hard-irq or atomic context.
|
||||
IRQ or atomic context.
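
For example, a hypothetical watchdog timer callback (invented names,
timer_setup() assumed to have been done elsewhere) runs in atomic context and
therefore may only use these non-blocking helpers; try_wait_for_completion()
could likewise be used there to consume a posted completion without sleeping:

  #include <linux/completion.h>
  #include <linux/jiffies.h>
  #include <linux/timer.h>

  static DECLARE_COMPLETION(io_done);
  static struct timer_list io_watchdog;

  static void io_watchdog_fn(struct timer_list *t)
  {
          if (completion_done(&io_done))      /* already signaled, nothing to do */
                  return;

          /* Not signaled yet: extend the deadline and check again later. */
          mod_timer(&io_watchdog, jiffies + HZ);
  }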
|
|
|
@ -33,6 +33,9 @@ const struct cpumask *cpu_coregroup_mask(int cpu);
|
|||
/* Replace task scheduler's default cpu-invariant accounting */
|
||||
#define arch_scale_cpu_capacity topology_get_cpu_scale
|
||||
|
||||
/* Enable topology flag updates */
|
||||
#define arch_update_cpu_topology topology_update_cpu_topology
|
||||
|
||||
#else
|
||||
|
||||
static inline void init_cpu_topology(void) { }
|
||||
|
|
|
@ -45,6 +45,9 @@ int pcibus_to_node(struct pci_bus *bus);
|
|||
/* Replace task scheduler's default cpu-invariant accounting */
|
||||
#define arch_scale_cpu_capacity topology_get_cpu_scale
|
||||
|
||||
/* Enable topology flag updates */
|
||||
#define arch_update_cpu_topology topology_update_cpu_topology
|
||||
|
||||
#include <asm-generic/topology.h>
|
||||
|
||||
#endif /* _ASM_ARM_TOPOLOGY_H */
|
||||
|
|
|
@ -15,6 +15,7 @@
|
|||
#include <linux/slab.h>
|
||||
#include <linux/string.h>
|
||||
#include <linux/sched/topology.h>
|
||||
#include <linux/cpuset.h>
|
||||
|
||||
DEFINE_PER_CPU(unsigned long, freq_scale) = SCHED_CAPACITY_SCALE;
|
||||
|
||||
|
@ -47,6 +48,9 @@ static ssize_t cpu_capacity_show(struct device *dev,
|
|||
return sprintf(buf, "%lu\n", topology_get_cpu_scale(NULL, cpu->dev.id));
|
||||
}
|
||||
|
||||
static void update_topology_flags_workfn(struct work_struct *work);
|
||||
static DECLARE_WORK(update_topology_flags_work, update_topology_flags_workfn);
|
||||
|
||||
static ssize_t cpu_capacity_store(struct device *dev,
|
||||
struct device_attribute *attr,
|
||||
const char *buf,
|
||||
|
@ -72,6 +76,8 @@ static ssize_t cpu_capacity_store(struct device *dev,
|
|||
topology_set_cpu_scale(i, new_capacity);
|
||||
mutex_unlock(&cpu_scale_mutex);
|
||||
|
||||
schedule_work(&update_topology_flags_work);
|
||||
|
||||
return count;
|
||||
}
|
||||
|
||||
|
@ -96,6 +102,25 @@ static int register_cpu_capacity_sysctl(void)
|
|||
}
|
||||
subsys_initcall(register_cpu_capacity_sysctl);
|
||||
|
||||
static int update_topology;
|
||||
|
||||
int topology_update_cpu_topology(void)
|
||||
{
|
||||
return update_topology;
|
||||
}
|
||||
|
||||
/*
|
||||
* Updating the sched_domains can't be done directly from cpufreq callbacks
|
||||
* due to locking, so queue the work for later.
|
||||
*/
|
||||
static void update_topology_flags_workfn(struct work_struct *work)
|
||||
{
|
||||
update_topology = 1;
|
||||
rebuild_sched_domains();
|
||||
pr_debug("sched_domain hierarchy rebuilt, flags updated\n");
|
||||
update_topology = 0;
|
||||
}
|
||||
|
||||
static u32 capacity_scale;
|
||||
static u32 *raw_capacity;
|
||||
|
||||
|
@ -201,6 +226,7 @@ init_cpu_capacity_callback(struct notifier_block *nb,
|
|||
|
||||
if (cpumask_empty(cpus_to_visit)) {
|
||||
topology_normalize_cpu_scale();
|
||||
schedule_work(&update_topology_flags_work);
|
||||
free_raw_capacity();
|
||||
pr_debug("cpu_capacity: parsing done\n");
|
||||
schedule_work(&parsing_done_work);
|
||||
|
|
|
@ -9,6 +9,7 @@
|
|||
#include <linux/percpu.h>
|
||||
|
||||
void topology_normalize_cpu_scale(void);
|
||||
int topology_update_cpu_topology(void);
|
||||
|
||||
struct device_node;
|
||||
bool topology_parse_cpu_capacity(struct device_node *cpu_node, int cpu);
|
||||
|
|
|
@ -23,10 +23,10 @@
|
|||
#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */
|
||||
#define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */
|
||||
#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
|
||||
#define SD_ASYM_CPUCAPACITY 0x0040 /* Groups have different max cpu capacities */
|
||||
#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu capacity */
|
||||
#define SD_ASYM_CPUCAPACITY 0x0040 /* Domain members have different CPU capacities */
|
||||
#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share CPU capacity */
|
||||
#define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */
|
||||
#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
|
||||
#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share CPU pkg resources */
|
||||
#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
|
||||
#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */
|
||||
#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
|
||||
|
|
|
@ -159,9 +159,14 @@ TRACE_EVENT(sched_switch,
|
|||
|
||||
(__entry->prev_state & (TASK_REPORT_MAX - 1)) ?
|
||||
__print_flags(__entry->prev_state & (TASK_REPORT_MAX - 1), "|",
|
||||
{ 0x01, "S" }, { 0x02, "D" }, { 0x04, "T" },
|
||||
{ 0x08, "t" }, { 0x10, "X" }, { 0x20, "Z" },
|
||||
{ 0x40, "P" }, { 0x80, "I" }) :
|
||||
{ TASK_INTERRUPTIBLE, "S" },
|
||||
{ TASK_UNINTERRUPTIBLE, "D" },
|
||||
{ __TASK_STOPPED, "T" },
|
||||
{ __TASK_TRACED, "t" },
|
||||
{ EXIT_DEAD, "X" },
|
||||
{ EXIT_ZOMBIE, "Z" },
|
||||
{ TASK_PARKED, "P" },
|
||||
{ TASK_DEAD, "I" }) :
|
||||
"R",
|
||||
|
||||
__entry->prev_state & TASK_REPORT_MAX ? "+" : "",
|
||||
|
|
|
@ -415,6 +415,11 @@ config IRQ_TIME_ACCOUNTING
|
|||
|
||||
If in doubt, say N here.
|
||||
|
||||
config HAVE_SCHED_AVG_IRQ
|
||||
def_bool y
|
||||
depends on IRQ_TIME_ACCOUNTING || PARAVIRT_TIME_ACCOUNTING
|
||||
depends on SMP
|
||||
|
||||
config BSD_PROCESS_ACCT
|
||||
bool "BSD Process Accounting"
|
||||
depends on MULTIUSER
|
||||
|
|
|
@ -383,6 +383,7 @@ void __init cpu_smt_disable(bool force)
|
|||
pr_info("SMT: Force disabled\n");
|
||||
cpu_smt_control = CPU_SMT_FORCE_DISABLED;
|
||||
} else {
|
||||
pr_info("SMT: disabled\n");
|
||||
cpu_smt_control = CPU_SMT_DISABLED;
|
||||
}
|
||||
}
|
||||
|
|
|
@ -135,9 +135,8 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
|
|||
* In theory, the compile should just see 0 here, and optimize out the call
|
||||
* to sched_rt_avg_update. But I don't trust it...
|
||||
*/
|
||||
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
|
||||
s64 steal = 0, irq_delta = 0;
|
||||
#endif
|
||||
s64 __maybe_unused steal = 0, irq_delta = 0;
|
||||
|
||||
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
|
||||
irq_delta = irq_time_read(cpu_of(rq)) - rq->prev_irq_time;
|
||||
|
||||
|
@ -177,7 +176,7 @@ static void update_rq_clock_task(struct rq *rq, s64 delta)
|
|||
|
||||
rq->clock_task += delta;
|
||||
|
||||
#ifdef HAVE_SCHED_AVG_IRQ
|
||||
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
|
||||
if ((irq_delta + steal) && sched_feat(NONTASK_CAPACITY))
|
||||
update_irq_load_avg(rq, irq_delta + steal);
|
||||
#endif
|
||||
|
@ -701,6 +700,7 @@ static void set_load_weight(struct task_struct *p, bool update_load)
|
|||
if (idle_policy(p->policy)) {
|
||||
load->weight = scale_load(WEIGHT_IDLEPRIO);
|
||||
load->inv_weight = WMULT_IDLEPRIO;
|
||||
p->se.runnable_weight = load->weight;
|
||||
return;
|
||||
}
|
||||
|
||||
|
@ -713,6 +713,7 @@ static void set_load_weight(struct task_struct *p, bool update_load)
|
|||
} else {
|
||||
load->weight = scale_load(sched_prio_to_weight[prio]);
|
||||
load->inv_weight = sched_prio_to_wmult[prio];
|
||||
p->se.runnable_weight = load->weight;
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -2915,10 +2916,10 @@ unsigned long nr_iowait(void)
|
|||
}
|
||||
|
||||
/*
|
||||
* Consumers of these two interfaces, like for example the cpufreq menu
|
||||
* governor are using nonsensical data. Boosting frequency for a CPU that has
|
||||
* IO-wait which might not even end up running the task when it does become
|
||||
* runnable.
|
||||
* Consumers of these two interfaces, like for example the cpuidle menu
|
||||
* governor, are using nonsensical data. Preferring shallow idle state selection
|
||||
* for a CPU that has IO-wait which might not even end up running the task when
|
||||
* it does become runnable.
|
||||
*/
|
||||
|
||||
unsigned long nr_iowait_cpu(int cpu)
|
||||
|
|
|
@ -693,6 +693,7 @@ static u64 sched_vslice(struct cfs_rq *cfs_rq, struct sched_entity *se)
|
|||
|
||||
static int select_idle_sibling(struct task_struct *p, int prev_cpu, int cpu);
|
||||
static unsigned long task_h_load(struct task_struct *p);
|
||||
static unsigned long capacity_of(int cpu);
|
||||
|
||||
/* Give new sched_entity start runnable values to heavy its load in infant time */
|
||||
void init_entity_runnable_average(struct sched_entity *se)
|
||||
|
@ -1456,7 +1457,6 @@ bool should_numa_migrate_memory(struct task_struct *p, struct page * page,
|
|||
static unsigned long weighted_cpuload(struct rq *rq);
|
||||
static unsigned long source_load(int cpu, int type);
|
||||
static unsigned long target_load(int cpu, int type);
|
||||
static unsigned long capacity_of(int cpu);
|
||||
|
||||
/* Cached statistics for all CPUs within a node */
|
||||
struct numa_stats {
|
||||
|
@ -1464,8 +1464,6 @@ struct numa_stats {
|
|||
|
||||
/* Total compute capacity of CPUs on a node */
|
||||
unsigned long compute_capacity;
|
||||
|
||||
unsigned int nr_running;
|
||||
};
|
||||
|
||||
/*
|
||||
|
@ -1473,36 +1471,16 @@ struct numa_stats {
|
|||
*/
|
||||
static void update_numa_stats(struct numa_stats *ns, int nid)
|
||||
{
|
||||
int smt, cpu, cpus = 0;
|
||||
unsigned long capacity;
|
||||
int cpu;
|
||||
|
||||
memset(ns, 0, sizeof(*ns));
|
||||
for_each_cpu(cpu, cpumask_of_node(nid)) {
|
||||
struct rq *rq = cpu_rq(cpu);
|
||||
|
||||
ns->nr_running += rq->nr_running;
|
||||
ns->load += weighted_cpuload(rq);
|
||||
ns->compute_capacity += capacity_of(cpu);
|
||||
|
||||
cpus++;
|
||||
}
|
||||
|
||||
/*
|
||||
* If we raced with hotplug and there are no CPUs left in our mask
|
||||
* the @ns structure is NULL'ed and task_numa_compare() will
|
||||
* not find this node attractive.
|
||||
*
|
||||
* We'll detect a huge imbalance and bail there.
|
||||
*/
|
||||
if (!cpus)
|
||||
return;
|
||||
|
||||
/* smt := ceil(cpus / capacity), assumes: 1 < smt_power < 2 */
|
||||
smt = DIV_ROUND_UP(SCHED_CAPACITY_SCALE * cpus, ns->compute_capacity);
|
||||
capacity = cpus / smt; /* cores */
|
||||
|
||||
capacity = min_t(unsigned, capacity,
|
||||
DIV_ROUND_CLOSEST(ns->compute_capacity, SCHED_CAPACITY_SCALE));
|
||||
}
|
||||
|
||||
struct task_numa_env {
|
||||
|
@ -3723,6 +3701,29 @@ util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p, bool task_sleep)
|
|||
WRITE_ONCE(p->se.avg.util_est, ue);
|
||||
}
|
||||
|
||||
static inline int task_fits_capacity(struct task_struct *p, long capacity)
|
||||
{
|
||||
return capacity * 1024 > task_util_est(p) * capacity_margin;
|
||||
}
|
||||
|
||||
static inline void update_misfit_status(struct task_struct *p, struct rq *rq)
|
||||
{
|
||||
if (!static_branch_unlikely(&sched_asym_cpucapacity))
|
||||
return;
|
||||
|
||||
if (!p) {
|
||||
rq->misfit_task_load = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
if (task_fits_capacity(p, capacity_of(cpu_of(rq)))) {
|
||||
rq->misfit_task_load = 0;
|
||||
return;
|
||||
}
|
||||
|
||||
rq->misfit_task_load = task_h_load(p);
|
||||
}
|
||||
|
||||
#else /* CONFIG_SMP */
|
||||
|
||||
#define UPDATE_TG 0x0
|
||||
|
@ -3752,6 +3753,7 @@ util_est_enqueue(struct cfs_rq *cfs_rq, struct task_struct *p) {}
|
|||
static inline void
|
||||
util_est_dequeue(struct cfs_rq *cfs_rq, struct task_struct *p,
|
||||
bool task_sleep) {}
|
||||
static inline void update_misfit_status(struct task_struct *p, struct rq *rq) {}
|
||||
|
||||
#endif /* CONFIG_SMP */
|
||||
|
||||
|
@ -6280,6 +6282,9 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
|
|||
{
|
||||
long min_cap, max_cap;
|
||||
|
||||
if (!static_branch_unlikely(&sched_asym_cpucapacity))
|
||||
return 0;
|
||||
|
||||
min_cap = min(capacity_orig_of(prev_cpu), capacity_orig_of(cpu));
|
||||
max_cap = cpu_rq(cpu)->rd->max_cpu_capacity;
|
||||
|
||||
|
@ -6290,7 +6295,7 @@ static int wake_cap(struct task_struct *p, int cpu, int prev_cpu)
|
|||
/* Bring task utilization in sync with prev_cpu */
|
||||
sync_entity_load_avg(&p->se);
|
||||
|
||||
return min_cap * 1024 < task_util(p) * capacity_margin;
|
||||
return !task_fits_capacity(p, min_cap);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -6709,9 +6714,12 @@ done: __maybe_unused;
|
|||
if (hrtick_enabled(rq))
|
||||
hrtick_start_fair(rq, p);
|
||||
|
||||
update_misfit_status(p, rq);
|
||||
|
||||
return p;
|
||||
|
||||
idle:
|
||||
update_misfit_status(NULL, rq);
|
||||
new_tasks = idle_balance(rq, rf);
|
||||
|
||||
/*
|
||||
|
@ -6917,6 +6925,13 @@ static unsigned long __read_mostly max_load_balance_interval = HZ/10;
|
|||
|
||||
enum fbq_type { regular, remote, all };
|
||||
|
||||
enum group_type {
|
||||
group_other = 0,
|
||||
group_misfit_task,
|
||||
group_imbalanced,
|
||||
group_overloaded,
|
||||
};
|
||||
|
||||
#define LBF_ALL_PINNED 0x01
|
||||
#define LBF_NEED_BREAK 0x02
|
||||
#define LBF_DST_PINNED 0x04
|
||||
|
@ -6947,6 +6962,7 @@ struct lb_env {
|
|||
unsigned int loop_max;
|
||||
|
||||
enum fbq_type fbq_type;
|
||||
enum group_type src_grp_type;
|
||||
struct list_head tasks;
|
||||
};
|
||||
|
||||
|
@ -7327,7 +7343,7 @@ static inline bool others_have_blocked(struct rq *rq)
|
|||
if (READ_ONCE(rq->avg_dl.util_avg))
|
||||
return true;
|
||||
|
||||
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
|
||||
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
|
||||
if (READ_ONCE(rq->avg_irq.util_avg))
|
||||
return true;
|
||||
#endif
|
||||
|
@ -7490,12 +7506,6 @@ static unsigned long task_h_load(struct task_struct *p)
|
|||
|
||||
/********** Helpers for find_busiest_group ************************/
|
||||
|
||||
enum group_type {
|
||||
group_other = 0,
|
||||
group_imbalanced,
|
||||
group_overloaded,
|
||||
};
|
||||
|
||||
/*
|
||||
* sg_lb_stats - stats of a sched_group required for load_balancing
|
||||
*/
|
||||
|
@ -7511,6 +7521,7 @@ struct sg_lb_stats {
|
|||
unsigned int group_weight;
|
||||
enum group_type group_type;
|
||||
int group_no_capacity;
|
||||
unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */
|
||||
#ifdef CONFIG_NUMA_BALANCING
|
||||
unsigned int nr_numa_running;
|
||||
unsigned int nr_preferred_running;
|
||||
|
@ -7619,13 +7630,14 @@ static void update_cpu_capacity(struct sched_domain *sd, int cpu)
|
|||
cpu_rq(cpu)->cpu_capacity = capacity;
|
||||
sdg->sgc->capacity = capacity;
|
||||
sdg->sgc->min_capacity = capacity;
|
||||
sdg->sgc->max_capacity = capacity;
|
||||
}
|
||||
|
||||
void update_group_capacity(struct sched_domain *sd, int cpu)
|
||||
{
|
||||
struct sched_domain *child = sd->child;
|
||||
struct sched_group *group, *sdg = sd->groups;
|
||||
unsigned long capacity, min_capacity;
|
||||
unsigned long capacity, min_capacity, max_capacity;
|
||||
unsigned long interval;
|
||||
|
||||
interval = msecs_to_jiffies(sd->balance_interval);
|
||||
|
@ -7639,6 +7651,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
|
|||
|
||||
capacity = 0;
|
||||
min_capacity = ULONG_MAX;
|
||||
max_capacity = 0;
|
||||
|
||||
if (child->flags & SD_OVERLAP) {
|
||||
/*
|
||||
|
@ -7669,6 +7682,7 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
|
|||
}
|
||||
|
||||
min_capacity = min(capacity, min_capacity);
|
||||
max_capacity = max(capacity, max_capacity);
|
||||
}
|
||||
} else {
|
||||
/*
|
||||
|
@ -7682,12 +7696,14 @@ void update_group_capacity(struct sched_domain *sd, int cpu)
|
|||
|
||||
capacity += sgc->capacity;
|
||||
min_capacity = min(sgc->min_capacity, min_capacity);
|
||||
max_capacity = max(sgc->max_capacity, max_capacity);
|
||||
group = group->next;
|
||||
} while (group != child->groups);
|
||||
}
|
||||
|
||||
sdg->sgc->capacity = capacity;
|
||||
sdg->sgc->min_capacity = min_capacity;
|
||||
sdg->sgc->max_capacity = max_capacity;
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -7783,16 +7799,27 @@ group_is_overloaded(struct lb_env *env, struct sg_lb_stats *sgs)
|
|||
}
|
||||
|
||||
/*
|
||||
* group_smaller_cpu_capacity: Returns true if sched_group sg has smaller
|
||||
* group_smaller_min_cpu_capacity: Returns true if sched_group sg has smaller
|
||||
* per-CPU capacity than sched_group ref.
|
||||
*/
|
||||
static inline bool
|
||||
group_smaller_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
|
||||
group_smaller_min_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
|
||||
{
|
||||
return sg->sgc->min_capacity * capacity_margin <
|
||||
ref->sgc->min_capacity * 1024;
|
||||
}
|
||||
|
||||
/*
|
||||
* group_smaller_max_cpu_capacity: Returns true if sched_group sg has smaller
|
||||
* per-CPU capacity_orig than sched_group ref.
|
||||
*/
|
||||
static inline bool
|
||||
group_smaller_max_cpu_capacity(struct sched_group *sg, struct sched_group *ref)
|
||||
{
|
||||
return sg->sgc->max_capacity * capacity_margin <
|
||||
ref->sgc->max_capacity * 1024;
|
||||
}
|
||||
|
||||
static inline enum
|
||||
group_type group_classify(struct sched_group *group,
|
||||
struct sg_lb_stats *sgs)
|
||||
|
@ -7803,6 +7830,9 @@ group_type group_classify(struct sched_group *group,
|
|||
if (sg_imbalanced(group))
|
||||
return group_imbalanced;
|
||||
|
||||
if (sgs->group_misfit_task_load)
|
||||
return group_misfit_task;
|
||||
|
||||
return group_other;
|
||||
}
|
||||
|
||||
|
@ -7835,7 +7865,7 @@ static bool update_nohz_stats(struct rq *rq, bool force)
|
|||
* @load_idx: Load index of sched_domain of this_cpu for load calc.
|
||||
* @local_group: Does group contain this_cpu.
|
||||
* @sgs: variable to hold the statistics for this group.
|
||||
* @overload: Indicate more than one runnable task for any CPU.
|
||||
* @overload: Indicate pullable load (e.g. >1 runnable task).
|
||||
*/
|
||||
static inline void update_sg_lb_stats(struct lb_env *env,
|
||||
struct sched_group *group, int load_idx,
|
||||
|
@ -7877,6 +7907,12 @@ static inline void update_sg_lb_stats(struct lb_env *env,
|
|||
*/
|
||||
if (!nr_running && idle_cpu(i))
|
||||
sgs->idle_cpus++;
|
||||
|
||||
if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
|
||||
sgs->group_misfit_task_load < rq->misfit_task_load) {
|
||||
sgs->group_misfit_task_load = rq->misfit_task_load;
|
||||
*overload = 1;
|
||||
}
|
||||
}
|
||||
|
||||
/* Adjust by relative CPU capacity of the group */
|
||||
|
@ -7912,6 +7948,17 @@ static bool update_sd_pick_busiest(struct lb_env *env,
|
|||
{
|
||||
struct sg_lb_stats *busiest = &sds->busiest_stat;
|
||||
|
||||
/*
|
||||
* Don't try to pull misfit tasks we can't help.
|
||||
* We can use max_capacity here as reduction in capacity on some
|
||||
* CPUs in the group should either be possible to resolve
|
||||
* internally or be covered by avg_load imbalance (eventually).
|
||||
*/
|
||||
if (sgs->group_type == group_misfit_task &&
|
||||
(!group_smaller_max_cpu_capacity(sg, sds->local) ||
|
||||
!group_has_capacity(env, &sds->local_stat)))
|
||||
return false;
|
||||
|
||||
if (sgs->group_type > busiest->group_type)
|
||||
return true;
|
||||
|
||||
|
@ -7931,7 +7978,14 @@ static bool update_sd_pick_busiest(struct lb_env *env,
|
|||
* power/energy consequences are not considered.
|
||||
*/
|
||||
if (sgs->sum_nr_running <= sgs->group_weight &&
|
||||
group_smaller_cpu_capacity(sds->local, sg))
|
||||
group_smaller_min_cpu_capacity(sds->local, sg))
|
||||
return false;
|
||||
|
||||
/*
|
||||
* If we have more than one misfit sg go with the biggest misfit.
|
||||
*/
|
||||
if (sgs->group_type == group_misfit_task &&
|
||||
sgs->group_misfit_task_load < busiest->group_misfit_task_load)
|
||||
return false;
|
||||
|
||||
asym_packing:
|
||||
|
@ -8002,11 +8056,9 @@ static inline void update_sd_lb_stats(struct lb_env *env, struct sd_lb_stats *sd
|
|||
struct sched_group *sg = env->sd->groups;
|
||||
struct sg_lb_stats *local = &sds->local_stat;
|
||||
struct sg_lb_stats tmp_sgs;
|
||||
int load_idx, prefer_sibling = 0;
|
||||
int load_idx;
|
||||
bool overload = false;
|
||||
|
||||
if (child && child->flags & SD_PREFER_SIBLING)
|
||||
prefer_sibling = 1;
|
||||
bool prefer_sibling = child && child->flags & SD_PREFER_SIBLING;
|
||||
|
||||
#ifdef CONFIG_NO_HZ_COMMON
|
||||
if (env->idle == CPU_NEWLY_IDLE && READ_ONCE(nohz.has_blocked))
|
||||
|
@ -8080,8 +8132,8 @@ next_group:
|
|||
|
||||
if (!env->sd->parent) {
|
||||
/* update overload indicator if we are at root domain */
|
||||
if (env->dst_rq->rd->overload != overload)
|
||||
env->dst_rq->rd->overload = overload;
|
||||
if (READ_ONCE(env->dst_rq->rd->overload) != overload)
|
||||
WRITE_ONCE(env->dst_rq->rd->overload, overload);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -8231,8 +8283,9 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
|
|||
* factors in sg capacity and sgs with smaller group_type are
|
||||
* skipped when updating the busiest sg:
|
||||
*/
|
||||
if (busiest->avg_load <= sds->avg_load ||
|
||||
local->avg_load >= sds->avg_load) {
|
||||
if (busiest->group_type != group_misfit_task &&
|
||||
(busiest->avg_load <= sds->avg_load ||
|
||||
local->avg_load >= sds->avg_load)) {
|
||||
env->imbalance = 0;
|
||||
return fix_small_imbalance(env, sds);
|
||||
}
|
||||
|
@ -8266,6 +8319,12 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s
|
|||
(sds->avg_load - local->avg_load) * local->group_capacity
|
||||
) / SCHED_CAPACITY_SCALE;
|
||||
|
||||
/* Boost imbalance to allow misfit task to be balanced. */
|
||||
if (busiest->group_type == group_misfit_task) {
|
||||
env->imbalance = max_t(long, env->imbalance,
|
||||
busiest->group_misfit_task_load);
|
||||
}
|
||||
|
||||
/*
|
||||
* if *imbalance is less than the average load per runnable task
|
||||
* there is no guarantee that any tasks will be moved so we'll have
|
||||
|
@ -8332,6 +8391,10 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
|
|||
busiest->group_no_capacity)
|
||||
goto force_balance;
|
||||
|
||||
/* Misfit tasks should be dealt with regardless of the avg load */
|
||||
if (busiest->group_type == group_misfit_task)
|
||||
goto force_balance;
|
||||
|
||||
/*
|
||||
* If the local group is busier than the selected busiest group
|
||||
* don't try and pull any tasks.
|
||||
|
@ -8369,6 +8432,7 @@ static struct sched_group *find_busiest_group(struct lb_env *env)
|
|||
|
||||
force_balance:
|
||||
/* Looks like there is an imbalance. Compute it */
|
||||
env->src_grp_type = busiest->group_type;
|
||||
calculate_imbalance(env, &sds);
|
||||
return env->imbalance ? sds.busiest : NULL;
|
||||
|
||||
|
@ -8416,8 +8480,32 @@ static struct rq *find_busiest_queue(struct lb_env *env,
|
|||
if (rt > env->fbq_type)
|
||||
continue;
|
||||
|
||||
/*
|
||||
* For ASYM_CPUCAPACITY domains with misfit tasks we simply
|
||||
* seek the "biggest" misfit task.
|
||||
*/
|
||||
if (env->src_grp_type == group_misfit_task) {
|
||||
if (rq->misfit_task_load > busiest_load) {
|
||||
busiest_load = rq->misfit_task_load;
|
||||
busiest = rq;
|
||||
}
|
||||
|
||||
continue;
|
||||
}
|
||||
|
||||
capacity = capacity_of(i);
|
||||
|
||||
/*
|
||||
* For ASYM_CPUCAPACITY domains, don't pick a CPU that could
|
||||
* eventually lead to active_balancing high->low capacity.
|
||||
* Higher per-CPU capacity is considered better than balancing
|
||||
* average load.
|
||||
*/
|
||||
if (env->sd->flags & SD_ASYM_CPUCAPACITY &&
|
||||
capacity_of(env->dst_cpu) < capacity &&
|
||||
rq->nr_running == 1)
|
||||
continue;
|
||||
|
||||
wl = weighted_cpuload(rq);
|
||||
|
||||
/*
|
||||
|
@ -8485,6 +8573,9 @@ static int need_active_balance(struct lb_env *env)
|
|||
return 1;
|
||||
}
|
||||
|
||||
if (env->src_grp_type == group_misfit_task)
|
||||
return 1;
|
||||
|
||||
return unlikely(sd->nr_balance_failed > sd->cache_nice_tries+2);
|
||||
}
|
||||
|
||||
|
@ -9127,7 +9218,7 @@ static void nohz_balancer_kick(struct rq *rq)
|
|||
if (time_before(now, nohz.next_balance))
|
||||
goto out;
|
||||
|
||||
if (rq->nr_running >= 2) {
|
||||
if (rq->nr_running >= 2 || rq->misfit_task_load) {
|
||||
flags = NOHZ_KICK_MASK;
|
||||
goto out;
|
||||
}
|
||||
|
@ -9496,7 +9587,7 @@ static int idle_balance(struct rq *this_rq, struct rq_flags *rf)
|
|||
rq_unpin_lock(this_rq, rf);
|
||||
|
||||
if (this_rq->avg_idle < sysctl_sched_migration_cost ||
|
||||
!this_rq->rd->overload) {
|
||||
!READ_ONCE(this_rq->rd->overload)) {
|
||||
|
||||
rcu_read_lock();
|
||||
sd = rcu_dereference_check_sched_domain(this_rq->sd);
|
||||
|
@ -9658,6 +9749,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued)
|
|||
|
||||
if (static_branch_unlikely(&sched_numa_balancing))
|
||||
task_tick_numa(rq, curr);
|
||||
|
||||
update_misfit_status(curr, rq);
|
||||
}
|
||||
|
||||
/*
|
||||
|
|
|
@ -39,7 +39,7 @@ SCHED_FEAT(WAKEUP_PREEMPTION, true)
|
|||
|
||||
SCHED_FEAT(HRTICK, false)
|
||||
SCHED_FEAT(DOUBLE_TICK, false)
|
||||
SCHED_FEAT(LB_BIAS, true)
|
||||
SCHED_FEAT(LB_BIAS, false)
|
||||
|
||||
/*
|
||||
* Decrement CPU capacity based on time not spent running tasks
|
||||
|
|
|
@ -269,9 +269,6 @@ ___update_load_avg(struct sched_avg *sa, unsigned long load, unsigned long runna
|
|||
|
||||
int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
|
||||
{
|
||||
if (entity_is_task(se))
|
||||
se->runnable_weight = se->load.weight;
|
||||
|
||||
if (___update_load_sum(now, cpu, &se->avg, 0, 0, 0)) {
|
||||
___update_load_avg(&se->avg, se_weight(se), se_runnable(se));
|
||||
return 1;
|
||||
|
@ -282,9 +279,6 @@ int __update_load_avg_blocked_se(u64 now, int cpu, struct sched_entity *se)
|
|||
|
||||
int __update_load_avg_se(u64 now, int cpu, struct cfs_rq *cfs_rq, struct sched_entity *se)
|
||||
{
|
||||
if (entity_is_task(se))
|
||||
se->runnable_weight = se->load.weight;
|
||||
|
||||
if (___update_load_sum(now, cpu, &se->avg, !!se->on_rq, !!se->on_rq,
|
||||
cfs_rq->curr == se)) {
|
||||
|
||||
|
@ -358,7 +352,7 @@ int update_dl_rq_load_avg(u64 now, struct rq *rq, int running)
|
|||
return 0;
|
||||
}
|
||||
|
||||
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
|
||||
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
|
||||
/*
|
||||
* irq:
|
||||
*
|
||||
|
|
|
@ -6,7 +6,7 @@ int __update_load_avg_cfs_rq(u64 now, int cpu, struct cfs_rq *cfs_rq);
|
|||
int update_rt_rq_load_avg(u64 now, struct rq *rq, int running);
|
||||
int update_dl_rq_load_avg(u64 now, struct rq *rq, int running);
|
||||
|
||||
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
|
||||
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
|
||||
int update_irq_load_avg(struct rq *rq, u64 running);
|
||||
#else
|
||||
static inline int
|
||||
|
|
|
@ -717,8 +717,12 @@ struct root_domain {
|
|||
cpumask_var_t span;
|
||||
cpumask_var_t online;
|
||||
|
||||
/* Indicate more than one runnable task for any CPU */
|
||||
bool overload;
|
||||
/*
|
||||
* Indicate pullable load on at least one CPU, e.g:
|
||||
* - More than one runnable task
|
||||
* - Running task is misfit
|
||||
*/
|
||||
int overload;
|
||||
|
||||
/*
|
||||
* The bit corresponding to a CPU gets set here if such CPU has more
|
||||
|
@ -845,6 +849,8 @@ struct rq {
|
|||
|
||||
unsigned char idle_balance;
|
||||
|
||||
unsigned long misfit_task_load;
|
||||
|
||||
/* For active balancing */
|
||||
int active_balance;
|
||||
int push_cpu;
|
||||
|
@ -858,8 +864,7 @@ struct rq {
|
|||
|
||||
struct sched_avg avg_rt;
|
||||
struct sched_avg avg_dl;
|
||||
#if defined(CONFIG_IRQ_TIME_ACCOUNTING) || defined(CONFIG_PARAVIRT_TIME_ACCOUNTING)
|
||||
#define HAVE_SCHED_AVG_IRQ
|
||||
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
|
||||
struct sched_avg avg_irq;
|
||||
#endif
|
||||
u64 idle_stamp;
|
||||
|
@ -1188,6 +1193,7 @@ DECLARE_PER_CPU(int, sd_llc_id);
|
|||
DECLARE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
|
||||
DECLARE_PER_CPU(struct sched_domain *, sd_numa);
|
||||
DECLARE_PER_CPU(struct sched_domain *, sd_asym);
|
||||
extern struct static_key_false sched_asym_cpucapacity;
|
||||
|
||||
struct sched_group_capacity {
|
||||
atomic_t ref;
|
||||
|
@ -1197,6 +1203,7 @@ struct sched_group_capacity {
|
|||
*/
|
||||
unsigned long capacity;
|
||||
unsigned long min_capacity; /* Min per-CPU capacity in group */
|
||||
unsigned long max_capacity; /* Max per-CPU capacity in group */
|
||||
unsigned long next_update;
|
||||
int imbalance; /* XXX unrelated to capacity but shared group state */
|
||||
|
||||
|
@ -1396,7 +1403,7 @@ static const_debug __maybe_unused unsigned int sysctl_sched_features =
|
|||
0;
|
||||
#undef SCHED_FEAT
|
||||
|
||||
#define sched_feat(x) (sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
|
||||
#define sched_feat(x) !!(sysctl_sched_features & (1UL << __SCHED_FEAT_##x))
|
||||
|
||||
#endif /* SCHED_DEBUG && HAVE_JUMP_LABEL */
|
||||
|
||||
|
@ -1696,8 +1703,8 @@ static inline void add_nr_running(struct rq *rq, unsigned count)
|
|||
|
||||
if (prev_nr < 2 && rq->nr_running >= 2) {
|
||||
#ifdef CONFIG_SMP
|
||||
if (!rq->rd->overload)
|
||||
rq->rd->overload = true;
|
||||
if (!READ_ONCE(rq->rd->overload))
|
||||
WRITE_ONCE(rq->rd->overload, 1);
|
||||
#endif
|
||||
}
|
||||
|
||||
|
@ -2217,7 +2224,7 @@ static inline unsigned long cpu_util_rt(struct rq *rq)
|
|||
}
|
||||
#endif
|
||||
|
||||
#ifdef HAVE_SCHED_AVG_IRQ
|
||||
#ifdef CONFIG_HAVE_SCHED_AVG_IRQ
|
||||
static inline unsigned long cpu_util_irq(struct rq *rq)
|
||||
{
|
||||
return rq->avg_irq.util_avg;
|
||||
|
|
|
@ -7,8 +7,8 @@
|
|||
DEFINE_MUTEX(sched_domains_mutex);
|
||||
|
||||
/* Protected by sched_domains_mutex: */
|
||||
cpumask_var_t sched_domains_tmpmask;
|
||||
cpumask_var_t sched_domains_tmpmask2;
|
||||
static cpumask_var_t sched_domains_tmpmask;
|
||||
static cpumask_var_t sched_domains_tmpmask2;
|
||||
|
||||
#ifdef CONFIG_SCHED_DEBUG
|
||||
|
||||
|
@ -398,6 +398,7 @@ DEFINE_PER_CPU(int, sd_llc_id);
|
|||
DEFINE_PER_CPU(struct sched_domain_shared *, sd_llc_shared);
|
||||
DEFINE_PER_CPU(struct sched_domain *, sd_numa);
|
||||
DEFINE_PER_CPU(struct sched_domain *, sd_asym);
|
||||
DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity);
|
||||
|
||||
static void update_top_cache_domain(int cpu)
|
||||
{
|
||||
|
@ -692,6 +693,7 @@ static void init_overlap_sched_group(struct sched_domain *sd,
|
|||
sg_span = sched_group_span(sg);
|
||||
sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sg_span);
|
||||
sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
|
||||
sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
|
||||
}
|
||||
|
||||
static int
|
||||
|
@ -851,6 +853,7 @@ static struct sched_group *get_group(int cpu, struct sd_data *sdd)
|
|||
|
||||
sg->sgc->capacity = SCHED_CAPACITY_SCALE * cpumask_weight(sched_group_span(sg));
|
||||
sg->sgc->min_capacity = SCHED_CAPACITY_SCALE;
|
||||
sg->sgc->max_capacity = SCHED_CAPACITY_SCALE;
|
||||
|
||||
return sg;
|
||||
}
|
||||
|
@ -1061,7 +1064,6 @@ static struct cpumask ***sched_domains_numa_masks;
|
|||
* SD_SHARE_PKG_RESOURCES - describes shared caches
|
||||
* SD_NUMA - describes NUMA topologies
|
||||
* SD_SHARE_POWERDOMAIN - describes shared power domain
|
||||
* SD_ASYM_CPUCAPACITY - describes mixed capacity topologies
|
||||
*
|
||||
* Odd one out, which beside describing the topology has a quirk also
|
||||
* prescribes the desired behaviour that goes along with it:
|
||||
|
@ -1073,13 +1075,12 @@ static struct cpumask ***sched_domains_numa_masks;
|
|||
SD_SHARE_PKG_RESOURCES | \
|
||||
SD_NUMA | \
|
||||
SD_ASYM_PACKING | \
|
||||
SD_ASYM_CPUCAPACITY | \
|
||||
SD_SHARE_POWERDOMAIN)
|
||||
|
||||
static struct sched_domain *
|
||||
sd_init(struct sched_domain_topology_level *tl,
|
||||
const struct cpumask *cpu_map,
|
||||
struct sched_domain *child, int cpu)
|
||||
struct sched_domain *child, int dflags, int cpu)
|
||||
{
|
||||
struct sd_data *sdd = &tl->data;
|
||||
struct sched_domain *sd = *per_cpu_ptr(sdd->sd, cpu);
|
||||
|
@ -1100,6 +1101,9 @@ sd_init(struct sched_domain_topology_level *tl,
|
|||
"wrong sd_flags in topology description\n"))
|
||||
sd_flags &= ~TOPOLOGY_SD_FLAGS;
|
||||
|
||||
/* Apply detected topology flags */
|
||||
sd_flags |= dflags;
|
||||
|
||||
*sd = (struct sched_domain){
|
||||
.min_interval = sd_weight,
|
||||
.max_interval = 2*sd_weight,
|
||||
|
@ -1122,7 +1126,7 @@ sd_init(struct sched_domain_topology_level *tl,
|
|||
| 0*SD_SHARE_CPUCAPACITY
|
||||
| 0*SD_SHARE_PKG_RESOURCES
|
||||
| 0*SD_SERIALIZE
|
||||
| 0*SD_PREFER_SIBLING
|
||||
| 1*SD_PREFER_SIBLING
|
||||
| 0*SD_NUMA
|
||||
| sd_flags
|
||||
,
|
||||
|
@ -1148,17 +1152,21 @@ sd_init(struct sched_domain_topology_level *tl,
|
|||
if (sd->flags & SD_ASYM_CPUCAPACITY) {
|
||||
struct sched_domain *t = sd;
|
||||
|
||||
/*
|
||||
* Don't attempt to spread across CPUs of different capacities.
|
||||
*/
|
||||
if (sd->child)
|
||||
sd->child->flags &= ~SD_PREFER_SIBLING;
|
||||
|
||||
for_each_lower_domain(t)
|
||||
t->flags |= SD_BALANCE_WAKE;
|
||||
}
|
||||
|
||||
if (sd->flags & SD_SHARE_CPUCAPACITY) {
|
||||
sd->flags |= SD_PREFER_SIBLING;
|
||||
sd->imbalance_pct = 110;
|
||||
sd->smt_gain = 1178; /* ~15% */
|
||||
|
||||
} else if (sd->flags & SD_SHARE_PKG_RESOURCES) {
|
||||
sd->flags |= SD_PREFER_SIBLING;
|
||||
sd->imbalance_pct = 117;
|
||||
sd->cache_nice_tries = 1;
|
||||
sd->busy_idx = 2;
|
||||
|
@ -1169,6 +1177,7 @@ sd_init(struct sched_domain_topology_level *tl,
|
|||
sd->busy_idx = 3;
|
||||
sd->idle_idx = 2;
|
||||
|
||||
sd->flags &= ~SD_PREFER_SIBLING;
|
||||
sd->flags |= SD_SERIALIZE;
|
||||
if (sched_domains_numa_distance[tl->numa_level] > RECLAIM_DISTANCE) {
|
||||
sd->flags &= ~(SD_BALANCE_EXEC |
|
||||
|
@ -1178,7 +1187,6 @@ sd_init(struct sched_domain_topology_level *tl,
|
|||
|
||||
#endif
|
||||
} else {
|
||||
sd->flags |= SD_PREFER_SIBLING;
|
||||
sd->cache_nice_tries = 1;
|
||||
sd->busy_idx = 2;
|
||||
sd->idle_idx = 1;
|
||||
|
@ -1604,9 +1612,9 @@ static void __sdt_free(const struct cpumask *cpu_map)
|
|||
|
||||
static struct sched_domain *build_sched_domain(struct sched_domain_topology_level *tl,
|
||||
const struct cpumask *cpu_map, struct sched_domain_attr *attr,
|
||||
struct sched_domain *child, int cpu)
|
||||
struct sched_domain *child, int dflags, int cpu)
|
||||
{
|
||||
struct sched_domain *sd = sd_init(tl, cpu_map, child, cpu);
|
||||
struct sched_domain *sd = sd_init(tl, cpu_map, child, dflags, cpu);
|
||||
|
||||
if (child) {
|
||||
sd->level = child->level + 1;
|
||||
|
@ -1632,6 +1640,65 @@ static struct sched_domain *build_sched_domain(struct sched_domain_topology_leve
|
|||
return sd;
|
||||
}
|
||||
|
||||
/*
|
||||
* Find the sched_domain_topology_level where all CPU capacities are visible
|
||||
* for all CPUs.
|
||||
*/
|
||||
static struct sched_domain_topology_level
|
||||
*asym_cpu_capacity_level(const struct cpumask *cpu_map)
|
||||
{
|
||||
int i, j, asym_level = 0;
|
||||
bool asym = false;
|
||||
struct sched_domain_topology_level *tl, *asym_tl = NULL;
|
||||
unsigned long cap;
|
||||
|
||||
/* Is there any asymmetry? */
|
||||
cap = arch_scale_cpu_capacity(NULL, cpumask_first(cpu_map));
|
||||
|
||||
for_each_cpu(i, cpu_map) {
|
||||
if (arch_scale_cpu_capacity(NULL, i) != cap) {
|
||||
asym = true;
|
||||
break;
|
||||
}
|
||||
}
|
||||
|
||||
if (!asym)
|
||||
return NULL;
|
||||
|
||||
/*
|
||||
* Examine topology from all CPU's point of views to detect the lowest
|
||||
* sched_domain_topology_level where a highest capacity CPU is visible
|
||||
* to everyone.
|
||||
*/
|
||||
for_each_cpu(i, cpu_map) {
|
||||
unsigned long max_capacity = arch_scale_cpu_capacity(NULL, i);
|
||||
int tl_id = 0;
|
||||
|
||||
for_each_sd_topology(tl) {
|
||||
if (tl_id < asym_level)
|
||||
goto next_level;
|
||||
|
||||
for_each_cpu_and(j, tl->mask(i), cpu_map) {
|
||||
unsigned long capacity;
|
||||
|
||||
capacity = arch_scale_cpu_capacity(NULL, j);
|
||||
|
||||
if (capacity <= max_capacity)
|
||||
continue;
|
||||
|
||||
max_capacity = capacity;
|
||||
asym_level = tl_id;
|
||||
asym_tl = tl;
|
||||
}
|
||||
next_level:
|
||||
tl_id++;
|
||||
}
|
||||
}
|
||||
|
||||
return asym_tl;
|
||||
}
|
||||
|
||||
|
||||
/*
|
||||
* Build sched domains for a given set of CPUs and attach the sched domains
|
||||
* to the individual CPUs
|
||||
|
@ -1644,18 +1711,30 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
|
|||
struct s_data d;
|
||||
struct rq *rq = NULL;
|
||||
int i, ret = -ENOMEM;
|
||||
struct sched_domain_topology_level *tl_asym;
|
||||
bool has_asym = false;
|
||||
|
||||
alloc_state = __visit_domain_allocation_hell(&d, cpu_map);
|
||||
if (alloc_state != sa_rootdomain)
|
||||
goto error;
|
||||
|
||||
tl_asym = asym_cpu_capacity_level(cpu_map);
|
||||
|
||||
/* Set up domains for CPUs specified by the cpu_map: */
|
||||
for_each_cpu(i, cpu_map) {
|
||||
struct sched_domain_topology_level *tl;
|
||||
|
||||
sd = NULL;
|
||||
for_each_sd_topology(tl) {
|
||||
sd = build_sched_domain(tl, cpu_map, attr, sd, i);
|
||||
int dflags = 0;
|
||||
|
||||
if (tl == tl_asym) {
|
||||
dflags |= SD_ASYM_CPUCAPACITY;
|
||||
has_asym = true;
|
||||
}
|
||||
|
||||
sd = build_sched_domain(tl, cpu_map, attr, sd, dflags, i);
|
||||
|
||||
if (tl == sched_domain_topology)
|
||||
*per_cpu_ptr(d.sd, i) = sd;
|
||||
if (tl->flags & SDTL_OVERLAP)
|
||||
|
@ -1704,6 +1783,9 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att
|
|||
}
|
||||
rcu_read_unlock();
|
||||
|
||||
if (has_asym)
|
||||
static_branch_enable_cpuslocked(&sched_asym_cpucapacity);
|
||||
|
||||
if (rq && sched_debug_enabled) {
|
||||
pr_info("root domain span: %*pbl (max cpu_capacity = %lu)\n",
|
||||
cpumask_pr_args(cpu_map), rq->rd->max_cpu_capacity);
|