2005-04-17 06:20:36 +08:00
|
|
|
#ifndef _LINUX_SCHED_H
|
|
|
|
#define _LINUX_SCHED_H
|
|
|
|
|
2012-10-13 17:46:48 +08:00
|
|
|
#include <uapi/linux/sched.h>
|
2006-04-27 07:12:56 +08:00
|
|
|
|
2014-01-28 06:15:37 +08:00
|
|
|
#include <linux/sched/prio.h>
|
|
|
|
|
2006-04-27 07:12:56 +08:00
|
|
|
|
|
|
|
struct sched_param {
|
|
|
|
int sched_priority;
|
|
|
|
};
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <asm/param.h> /* for HZ */
|
|
|
|
|
|
|
|
#include <linux/capability.h>
|
|
|
|
#include <linux/threads.h>
|
|
|
|
#include <linux/kernel.h>
|
|
|
|
#include <linux/types.h>
|
|
|
|
#include <linux/timex.h>
|
|
|
|
#include <linux/jiffies.h>
|
rtmutex: Turn the plist into an rb-tree
Turn the pi-chains from plist to rb-tree, in the rt_mutex code,
and provide a proper comparison function for -deadline and
-priority tasks.
This is done mainly because:
- classical prio field of the plist is just an int, which might
not be enough for representing a deadline;
- manipulating such a list would become O(nr_deadline_tasks),
which might be to much, as the number of -deadline task increases.
Therefore, an rb-tree is used, and tasks are queued in it according
to the following logic:
- among two -priority (i.e., SCHED_BATCH/OTHER/RR/FIFO) tasks, the
one with the higher (lower, actually!) prio wins;
- among a -priority and a -deadline task, the latter always wins;
- among two -deadline tasks, the one with the earliest deadline
wins.
Queueing and dequeueing functions are changed accordingly, for both
the list of a task's pi-waiters and the list of tasks blocked on
a pi-lock.
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Dario Faggioli <raistlin@linux.it>
Signed-off-by: Juri Lelli <juri.lelli@gmail.com>
Signed-off-again-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1383831828-15501-10-git-send-email-juri.lelli@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-11-07 21:43:43 +08:00
|
|
|
#include <linux/plist.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/rbtree.h>
|
|
|
|
#include <linux/thread_info.h>
|
|
|
|
#include <linux/cpumask.h>
|
|
|
|
#include <linux/errno.h>
|
|
|
|
#include <linux/nodemask.h>
|
2007-10-16 16:24:43 +08:00
|
|
|
#include <linux/mm_types.h>
|
sched/preempt: Merge preempt_mask.h into preempt.h
preempt_mask.h defines all the preempt_count semantics and related
symbols: preempt, softirq, hardirq, nmi, preempt active, need resched,
etc...
preempt.h defines the accessors and mutators of preempt_count.
But there is a messy dependency game around those two header files:
* preempt_mask.h includes preempt.h in order to access preempt_count()
* preempt_mask.h defines all preempt_count semantic and symbols
except PREEMPT_NEED_RESCHED that is needed by asm/preempt.h
Thus we need to define it from preempt.h, right before including
asm/preempt.h, instead of defining it to preempt_mask.h with the
other preempt_count symbols. Therefore the preempt_count semantics
happen to be spread out.
* We plan to introduce preempt_active_[enter,exit]() to consolidate
preempt_schedule*() code. But we'll need to access both preempt_count
mutators (preempt_count_add()) and preempt_count symbols
(PREEMPT_ACTIVE, PREEMPT_OFFSET). The usual place to define preempt
operations is in preempt.h but then we'll need symbols in
preempt_mask.h which already includes preempt.h. So we end up with
a ressource circle dependency.
Lets merge preempt_mask.h into preempt.h to solve these dependency issues.
This way we gather semantic symbols and operation definition of
preempt_count in a single file.
This is a dumb copy-paste merge. Further merge re-arrangments are
performed in a subsequent patch to ease review.
Signed-off-by: Frederic Weisbecker <fweisbec@gmail.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1431441711-29753-2-git-send-email-fweisbec@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-05-12 22:41:46 +08:00
|
|
|
#include <linux/preempt.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#include <asm/page.h>
|
|
|
|
#include <asm/ptrace.h>
|
2014-03-05 23:33:42 +08:00
|
|
|
#include <linux/cputime.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#include <linux/smp.h>
|
|
|
|
#include <linux/sem.h>
|
shm: make exit_shm work proportional to task activity
This is small set of patches our team has had kicking around for a few
versions internally that fixes tasks getting hung on shm_exit when there
are many threads hammering it at once.
Anton wrote a simple test to cause the issue:
http://ozlabs.org/~anton/junkcode/bust_shm_exit.c
Before applying this patchset, this test code will cause either hanging
tracebacks or pthread out of memory errors.
After this patchset, it will still produce output like:
root@somehost:~# ./bust_shm_exit 1024 160
...
INFO: rcu_sched detected stalls on CPUs/tasks: {} (detected by 116, t=2111 jiffies, g=241, c=240, q=7113)
INFO: Stall ended before state dump start
...
But the task will continue to run along happily, so we consider this an
improvement over hanging, even if it's a bit noisy.
This patch (of 3):
exit_shm obtains the ipc_ns shm rwsem for write and holds it while it
walks every shared memory segment in the namespace. Thus the amount of
work is related to the number of shm segments in the namespace not the
number of segments that might need to be cleaned.
In addition, this occurs after the task has been notified the thread has
exited, so the number of tasks waiting for the ns shm rwsem can grow
without bound until memory is exausted.
Add a list to the task struct of all shmids allocated by this task. Init
the list head in copy_process. Use the ns->rwsem for locking. Add
segments after id is added, remove before removing from id.
On unshare of NEW_IPCNS orphan any ids as if the task had exited, similar
to handling of semaphore undo.
I chose a define for the init sequence since its a simple list init,
otherwise it would require a function call to avoid include loops between
the semaphore code and the task struct. Converting the list_del to
list_del_init for the unshare cases would remove the exit followed by
init, but I left it blow up if not inited.
Signed-off-by: Milton Miller <miltonm@bga.com>
Signed-off-by: Jack Miller <millerjo@us.ibm.com>
Cc: Davidlohr Bueso <davidlohr@hp.com>
Cc: Manfred Spraul <manfred@colorfullife.com>
Cc: Anton Blanchard <anton@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 05:23:19 +08:00
|
|
|
#include <linux/shm.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/signal.h>
|
|
|
|
#include <linux/compiler.h>
|
|
|
|
#include <linux/completion.h>
|
|
|
|
#include <linux/pid.h>
|
|
|
|
#include <linux/percpu.h>
|
|
|
|
#include <linux/topology.h>
|
2007-10-17 14:25:50 +08:00
|
|
|
#include <linux/proportions.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/seccomp.h>
|
2006-01-08 17:01:37 +08:00
|
|
|
#include <linux/rcupdate.h>
|
2009-04-15 02:17:16 +08:00
|
|
|
#include <linux/rculist.h>
|
2006-06-27 17:54:53 +08:00
|
|
|
#include <linux/rtmutex.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-04-25 21:54:40 +08:00
|
|
|
#include <linux/time.h>
|
|
|
|
#include <linux/param.h>
|
|
|
|
#include <linux/resource.h>
|
|
|
|
#include <linux/timer.h>
|
|
|
|
#include <linux/hrtimer.h>
|
kernel: add kcov code coverage
kcov provides code coverage collection for coverage-guided fuzzing
(randomized testing). Coverage-guided fuzzing is a testing technique
that uses coverage feedback to determine new interesting inputs to a
system. A notable user-space example is AFL
(http://lcamtuf.coredump.cx/afl/). However, this technique is not
widely used for kernel testing due to missing compiler and kernel
support.
kcov does not aim to collect as much coverage as possible. It aims to
collect more or less stable coverage that is function of syscall inputs.
To achieve this goal it does not collect coverage in soft/hard
interrupts and instrumentation of some inherently non-deterministic or
non-interesting parts of kernel is disbled (e.g. scheduler, locking).
Currently there is a single coverage collection mode (tracing), but the
API anticipates additional collection modes. Initially I also
implemented a second mode which exposes coverage in a fixed-size hash
table of counters (what Quentin used in his original patch). I've
dropped the second mode for simplicity.
This patch adds the necessary support on kernel side. The complimentary
compiler support was added in gcc revision 231296.
We've used this support to build syzkaller system call fuzzer, which has
found 90 kernel bugs in just 2 months:
https://github.com/google/syzkaller/wiki/Found-Bugs
We've also found 30+ bugs in our internal systems with syzkaller.
Another (yet unexplored) direction where kcov coverage would greatly
help is more traditional "blob mutation". For example, mounting a
random blob as a filesystem, or receiving a random blob over wire.
Why not gcov. Typical fuzzing loop looks as follows: (1) reset
coverage, (2) execute a bit of code, (3) collect coverage, repeat. A
typical coverage can be just a dozen of basic blocks (e.g. an invalid
input). In such context gcov becomes prohibitively expensive as
reset/collect coverage steps depend on total number of basic
blocks/edges in program (in case of kernel it is about 2M). Cost of
kcov depends only on number of executed basic blocks/edges. On top of
that, kernel requires per-thread coverage because there are always
background threads and unrelated processes that also produce coverage.
With inlined gcov instrumentation per-thread coverage is not possible.
kcov exposes kernel PCs and control flow to user-space which is
insecure. But debugfs should not be mapped as user accessible.
Based on a patch by Quentin Casasnovas.
[akpm@linux-foundation.org: make task_struct.kcov_mode have type `enum kcov_mode']
[akpm@linux-foundation.org: unbreak allmodconfig]
[akpm@linux-foundation.org: follow x86 Makefile layout standards]
Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: syzkaller <syzkaller@googlegroups.com>
Cc: Vegard Nossum <vegard.nossum@oracle.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Tavis Ormandy <taviso@google.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Kostya Serebryany <kcc@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Kees Cook <keescook@google.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: David Drysdale <drysdale@google.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-23 05:27:30 +08:00
|
|
|
#include <linux/kcov.h>
|
2006-12-10 18:19:19 +08:00
|
|
|
#include <linux/task_io_accounting.h>
|
2008-01-26 04:08:34 +08:00
|
|
|
#include <linux/latencytop.h>
|
2008-08-13 23:20:04 +08:00
|
|
|
#include <linux/cred.h>
|
2011-09-12 19:06:17 +08:00
|
|
|
#include <linux/llist.h>
|
2011-11-17 15:20:58 +08:00
|
|
|
#include <linux/uidgid.h>
|
mm: teach mm by current context info to not do I/O during memory allocation
This patch introduces PF_MEMALLOC_NOIO on process flag('flags' field of
'struct task_struct'), so that the flag can be set by one task to avoid
doing I/O inside memory allocation in the task's context.
The patch trys to solve one deadlock problem caused by block device, and
the problem may happen at least in the below situations:
- during block device runtime resume, if memory allocation with
GFP_KERNEL is called inside runtime resume callback of any one of its
ancestors(or the block device itself), the deadlock may be triggered
inside the memory allocation since it might not complete until the block
device becomes active and the involed page I/O finishes. The situation
is pointed out first by Alan Stern. It is not a good approach to
convert all GFP_KERNEL[1] in the path into GFP_NOIO because several
subsystems may be involved(for example, PCI, USB and SCSI may be
involved for usb mass stoarage device, network devices involved too in
the iSCSI case)
- during block device runtime suspend, because runtime resume need to
wait for completion of concurrent runtime suspend.
- during error handling of usb mass storage deivce, USB bus reset will
be put on the device, so there shouldn't have any memory allocation with
GFP_KERNEL during USB bus reset, otherwise the deadlock similar with
above may be triggered. Unfortunately, any usb device may include one
mass storage interface in theory, so it requires all usb interface
drivers to handle the situation. In fact, most usb drivers don't know
how to handle bus reset on the device and don't provide .pre_set() and
.post_reset() callback at all, so USB core has to unbind and bind driver
for these devices. So it is still not practical to resort to GFP_NOIO
for solving the problem.
Also the introduced solution can be used by block subsystem or block
drivers too, for example, set the PF_MEMALLOC_NOIO flag before doing
actual I/O transfer.
It is not a good idea to convert all these GFP_KERNEL in the affected
path into GFP_NOIO because these functions doing that may be implemented
as library and will be called in many other contexts.
In fact, memalloc_noio_flags() can convert some of current static
GFP_NOIO allocation into GFP_KERNEL back in other non-affected contexts,
at least almost all GFP_NOIO in USB subsystem can be converted into
GFP_KERNEL after applying the approach and make allocation with GFP_NOIO
only happen in runtime resume/bus reset/block I/O transfer contexts
generally.
[1], several GFP_KERNEL allocation examples in runtime resume path
- pci subsystem
acpi_os_allocate
<-acpi_ut_allocate
<-ACPI_ALLOCATE_ZEROED
<-acpi_evaluate_object
<-__acpi_bus_set_power
<-acpi_bus_set_power
<-acpi_pci_set_power_state
<-platform_pci_set_power_state
<-pci_platform_power_transition
<-__pci_complete_power_transition
<-pci_set_power_state
<-pci_restore_standard_config
<-pci_pm_runtime_resume
- usb subsystem
usb_get_status
<-finish_port_resume
<-usb_port_resume
<-generic_resume
<-usb_resume_device
<-usb_resume_both
<-usb_runtime_resume
- some individual usb drivers
usblp, uvc, gspca, most of dvb-usb-v2 media drivers, cpia2, az6007, ....
That is just what I have found. Unfortunately, this allocation can only
be found by human being now, and there should be many not found since
any function in the resume path(call tree) may allocate memory with
GFP_KERNEL.
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Oliver Neukum <oneukum@suse.de>
Cc: Jiri Kosina <jiri.kosina@suse.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Greg KH <greg@kroah.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: David Decotigny <david.decotigny@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-23 08:34:08 +08:00
|
|
|
#include <linux/gfp.h>
|
2014-09-12 21:16:17 +08:00
|
|
|
#include <linux/magic.h>
|
2015-05-14 04:35:16 +08:00
|
|
|
#include <linux/cgroup-defs.h>
|
2006-04-25 21:54:40 +08:00
|
|
|
|
|
|
|
#include <asm/processor.h>
|
2005-09-07 06:16:49 +08:00
|
|
|
|
sched: Add new scheduler syscalls to support an extended scheduling parameters ABI
Add the syscalls needed for supporting scheduling algorithms
with extended scheduling parameters (e.g., SCHED_DEADLINE).
In general, it makes possible to specify a periodic/sporadic task,
that executes for a given amount of runtime at each instance, and is
scheduled according to the urgency of their own timing constraints,
i.e.:
- a (maximum/typical) instance execution time,
- a minimum interval between consecutive instances,
- a time constraint by which each instance must be completed.
Thus, both the data structure that holds the scheduling parameters of
the tasks and the system calls dealing with it must be extended.
Unfortunately, modifying the existing struct sched_param would break
the ABI and result in potentially serious compatibility issues with
legacy binaries.
For these reasons, this patch:
- defines the new struct sched_attr, containing all the fields
that are necessary for specifying a task in the computational
model described above;
- defines and implements the new scheduling related syscalls that
manipulate it, i.e., sched_setattr() and sched_getattr().
Syscalls are introduced for x86 (32 and 64 bits) and ARM only, as a
proof of concept and for developing and testing purposes. Making them
available on other architectures is straightforward.
Since no "user" for these new parameters is introduced in this patch,
the implementation of the new system calls is just identical to their
already existing counterpart. Future patches that implement scheduling
policies able to exploit the new data structure must also take care of
modifying the sched_*attr() calls accordingly with their own purposes.
Signed-off-by: Dario Faggioli <raistlin@linux.it>
[ Rewrote to use sched_attr. ]
Signed-off-by: Juri Lelli <juri.lelli@gmail.com>
[ Removed sched_setscheduler2() for now. ]
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1383831828-15501-3-git-send-email-juri.lelli@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-11-07 21:43:36 +08:00
|
|
|
#define SCHED_ATTR_SIZE_VER0 48 /* sizeof first published struct */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Extended scheduling parameters data structure.
|
|
|
|
*
|
|
|
|
* This is needed because the original struct sched_param can not be
|
|
|
|
* altered without introducing ABI issues with legacy applications
|
|
|
|
* (e.g., in sched_getparam()).
|
|
|
|
*
|
|
|
|
* However, the possibility of specifying more than just a priority for
|
|
|
|
* the tasks may be useful for a wide variety of application fields, e.g.,
|
|
|
|
* multimedia, streaming, automation and control, and many others.
|
|
|
|
*
|
|
|
|
* This variant (sched_attr) is meant at describing a so-called
|
|
|
|
* sporadic time-constrained task. In such model a task is specified by:
|
|
|
|
* - the activation period or minimum instance inter-arrival time;
|
|
|
|
* - the maximum (or average, depending on the actual scheduling
|
|
|
|
* discipline) computation time of all instances, a.k.a. runtime;
|
|
|
|
* - the deadline (relative to the actual activation time) of each
|
|
|
|
* instance.
|
|
|
|
* Very briefly, a periodic (sporadic) task asks for the execution of
|
|
|
|
* some specific computation --which is typically called an instance--
|
|
|
|
* (at most) every period. Moreover, each instance typically lasts no more
|
|
|
|
* than the runtime and must be completed by time instant t equal to
|
|
|
|
* the instance activation time + the deadline.
|
|
|
|
*
|
|
|
|
* This is reflected by the actual fields of the sched_attr structure:
|
|
|
|
*
|
|
|
|
* @size size of the structure, for fwd/bwd compat.
|
|
|
|
*
|
|
|
|
* @sched_policy task's scheduling policy
|
|
|
|
* @sched_flags for customizing the scheduler behaviour
|
|
|
|
* @sched_nice task's nice value (SCHED_NORMAL/BATCH)
|
|
|
|
* @sched_priority task's static priority (SCHED_FIFO/RR)
|
|
|
|
* @sched_deadline representative of the task's deadline
|
|
|
|
* @sched_runtime representative of the task's runtime
|
|
|
|
* @sched_period representative of the task's period
|
|
|
|
*
|
|
|
|
* Given this task model, there are a multiplicity of scheduling algorithms
|
|
|
|
* and policies, that can be used to ensure all the tasks will make their
|
|
|
|
* timing constraints.
|
sched/deadline: Add SCHED_DEADLINE structures & implementation
Introduces the data structures, constants and symbols needed for
SCHED_DEADLINE implementation.
Core data structure of SCHED_DEADLINE are defined, along with their
initializers. Hooks for checking if a task belong to the new policy
are also added where they are needed.
Adds a scheduling class, in sched/dl.c and a new policy called
SCHED_DEADLINE. It is an implementation of the Earliest Deadline
First (EDF) scheduling algorithm, augmented with a mechanism (called
Constant Bandwidth Server, CBS) that makes it possible to isolate
the behaviour of tasks between each other.
The typical -deadline task will be made up of a computation phase
(instance) which is activated on a periodic or sporadic fashion. The
expected (maximum) duration of such computation is called the task's
runtime; the time interval by which each instance need to be completed
is called the task's relative deadline. The task's absolute deadline
is dynamically calculated as the time instant a task (better, an
instance) activates plus the relative deadline.
The EDF algorithms selects the task with the smallest absolute
deadline as the one to be executed first, while the CBS ensures each
task to run for at most its runtime every (relative) deadline
length time interval, avoiding any interference between different
tasks (bandwidth isolation).
Thanks to this feature, also tasks that do not strictly comply with
the computational model sketched above can effectively use the new
policy.
To summarize, this patch:
- introduces the data structures, constants and symbols needed;
- implements the core logic of the scheduling algorithm in the new
scheduling class file;
- provides all the glue code between the new scheduling class and
the core scheduler and refines the interactions between sched/dl
and the other existing scheduling classes.
Signed-off-by: Dario Faggioli <raistlin@linux.it>
Signed-off-by: Michael Trimarchi <michael@amarulasolutions.com>
Signed-off-by: Fabio Checconi <fchecconi@gmail.com>
Signed-off-by: Juri Lelli <juri.lelli@gmail.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1383831828-15501-4-git-send-email-juri.lelli@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-11-28 18:14:43 +08:00
|
|
|
*
|
|
|
|
* As of now, the SCHED_DEADLINE policy (sched_dl scheduling class) is the
|
|
|
|
* only user of this new interface. More information about the algorithm
|
|
|
|
* available in the scheduling class file or in Documentation/.
|
sched: Add new scheduler syscalls to support an extended scheduling parameters ABI
Add the syscalls needed for supporting scheduling algorithms
with extended scheduling parameters (e.g., SCHED_DEADLINE).
In general, it makes possible to specify a periodic/sporadic task,
that executes for a given amount of runtime at each instance, and is
scheduled according to the urgency of their own timing constraints,
i.e.:
- a (maximum/typical) instance execution time,
- a minimum interval between consecutive instances,
- a time constraint by which each instance must be completed.
Thus, both the data structure that holds the scheduling parameters of
the tasks and the system calls dealing with it must be extended.
Unfortunately, modifying the existing struct sched_param would break
the ABI and result in potentially serious compatibility issues with
legacy binaries.
For these reasons, this patch:
- defines the new struct sched_attr, containing all the fields
that are necessary for specifying a task in the computational
model described above;
- defines and implements the new scheduling related syscalls that
manipulate it, i.e., sched_setattr() and sched_getattr().
Syscalls are introduced for x86 (32 and 64 bits) and ARM only, as a
proof of concept and for developing and testing purposes. Making them
available on other architectures is straightforward.
Since no "user" for these new parameters is introduced in this patch,
the implementation of the new system calls is just identical to their
already existing counterpart. Future patches that implement scheduling
policies able to exploit the new data structure must also take care of
modifying the sched_*attr() calls accordingly with their own purposes.
Signed-off-by: Dario Faggioli <raistlin@linux.it>
[ Rewrote to use sched_attr. ]
Signed-off-by: Juri Lelli <juri.lelli@gmail.com>
[ Removed sched_setscheduler2() for now. ]
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1383831828-15501-3-git-send-email-juri.lelli@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-11-07 21:43:36 +08:00
|
|
|
*/
|
|
|
|
struct sched_attr {
|
|
|
|
u32 size;
|
|
|
|
|
|
|
|
u32 sched_policy;
|
|
|
|
u64 sched_flags;
|
|
|
|
|
|
|
|
/* SCHED_NORMAL, SCHED_BATCH */
|
|
|
|
s32 sched_nice;
|
|
|
|
|
|
|
|
/* SCHED_FIFO, SCHED_RR */
|
|
|
|
u32 sched_priority;
|
|
|
|
|
|
|
|
/* SCHED_DEADLINE */
|
|
|
|
u64 sched_runtime;
|
|
|
|
u64 sched_deadline;
|
|
|
|
u64 sched_period;
|
|
|
|
};
|
|
|
|
|
2006-06-27 17:54:58 +08:00
|
|
|
struct futex_pi_state;
|
2008-01-26 04:08:34 +08:00
|
|
|
struct robust_list_head;
|
2010-02-23 15:55:42 +08:00
|
|
|
struct bio_list;
|
2009-03-30 07:50:06 +08:00
|
|
|
struct fs_struct;
|
perf: Do the big rename: Performance Counters -> Performance Events
Bye-bye Performance Counters, welcome Performance Events!
In the past few months the perfcounters subsystem has grown out its
initial role of counting hardware events, and has become (and is
becoming) a much broader generic event enumeration, reporting, logging,
monitoring, analysis facility.
Naming its core object 'perf_counter' and naming the subsystem
'perfcounters' has become more and more of a misnomer. With pending
code like hw-breakpoints support the 'counter' name is less and
less appropriate.
All in one, we've decided to rename the subsystem to 'performance
events' and to propagate this rename through all fields, variables
and API names. (in an ABI compatible fashion)
The word 'event' is also a bit shorter than 'counter' - which makes
it slightly more convenient to write/handle as well.
Thanks goes to Stephane Eranian who first observed this misnomer and
suggested a rename.
User-space tooling and ABI compatibility is not affected - this patch
should be function-invariant. (Also, defconfigs were not touched to
keep the size down.)
This patch has been generated via the following script:
FILES=$(find * -type f | grep -vE 'oprofile|[^K]config')
sed -i \
-e 's/PERF_EVENT_/PERF_RECORD_/g' \
-e 's/PERF_COUNTER/PERF_EVENT/g' \
-e 's/perf_counter/perf_event/g' \
-e 's/nb_counters/nb_events/g' \
-e 's/swcounter/swevent/g' \
-e 's/tpcounter_event/tp_event/g' \
$FILES
for N in $(find . -name perf_counter.[ch]); do
M=$(echo $N | sed 's/perf_counter/perf_event/g')
mv $N $M
done
FILES=$(find . -name perf_event.*)
sed -i \
-e 's/COUNTER_MASK/REG_MASK/g' \
-e 's/COUNTER/EVENT/g' \
-e 's/\<event\>/event_id/g' \
-e 's/counter/event/g' \
-e 's/Counter/Event/g' \
$FILES
... to keep it as correct as possible. This script can also be
used by anyone who has pending perfcounters patches - it converts
a Linux kernel tree over to the new naming. We tried to time this
change to the point in time where the amount of pending patches
is the smallest: the end of the merge window.
Namespace clashes were fixed up in a preparatory patch - and some
stylistic fallout will be fixed up in a subsequent patch.
( NOTE: 'counters' are still the proper terminology when we deal
with hardware registers - and these sed scripts are a bit
over-eager in renaming them. I've undone some of that, but
in case there's something left where 'counter' would be
better than 'event' we can undo that on an individual basis
instead of touching an otherwise nicely automated patch. )
Suggested-by: Stephane Eranian <eranian@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Reviewed-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <linux-arch@vger.kernel.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-21 18:02:48 +08:00
|
|
|
struct perf_event_context;
|
2011-03-08 20:19:51 +08:00
|
|
|
struct blk_plug;
|
2014-02-06 04:54:53 +08:00
|
|
|
struct filename;
|
2015-05-12 20:29:38 +08:00
|
|
|
struct nameidata;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
mm: per-thread vma caching
This patch is a continuation of efforts trying to optimize find_vma(),
avoiding potentially expensive rbtree walks to locate a vma upon faults.
The original approach (https://lkml.org/lkml/2013/11/1/410), where the
largest vma was also cached, ended up being too specific and random,
thus further comparison with other approaches were needed. There are
two things to consider when dealing with this, the cache hit rate and
the latency of find_vma(). Improving the hit-rate does not necessarily
translate in finding the vma any faster, as the overhead of any fancy
caching schemes can be too high to consider.
We currently cache the last used vma for the whole address space, which
provides a nice optimization, reducing the total cycles in find_vma() by
up to 250%, for workloads with good locality. On the other hand, this
simple scheme is pretty much useless for workloads with poor locality.
Analyzing ebizzy runs shows that, no matter how many threads are
running, the mmap_cache hit rate is less than 2%, and in many situations
below 1%.
The proposed approach is to replace this scheme with a small per-thread
cache, maximizing hit rates at a very low maintenance cost.
Invalidations are performed by simply bumping up a 32-bit sequence
number. The only expensive operation is in the rare case of a seq
number overflow, where all caches that share the same address space are
flushed. Upon a miss, the proposed replacement policy is based on the
page number that contains the virtual address in question. Concretely,
the following results are seen on an 80 core, 8 socket x86-64 box:
1) System bootup: Most programs are single threaded, so the per-thread
scheme does improve ~50% hit rate by just adding a few more slots to
the cache.
+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline | 50.61% | 19.90 |
| patched | 73.45% | 13.58 |
+----------------+----------+------------------+
2) Kernel build: This one is already pretty good with the current
approach as we're dealing with good locality.
+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline | 75.28% | 11.03 |
| patched | 88.09% | 9.31 |
+----------------+----------+------------------+
3) Oracle 11g Data Mining (4k pages): Similar to the kernel build workload.
+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline | 70.66% | 17.14 |
| patched | 91.15% | 12.57 |
+----------------+----------+------------------+
4) Ebizzy: There's a fair amount of variation from run to run, but this
approach always shows nearly perfect hit rates, while baseline is just
about non-existent. The amounts of cycles can fluctuate between
anywhere from ~60 to ~116 for the baseline scheme, but this approach
reduces it considerably. For instance, with 80 threads:
+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline | 1.06% | 91.54 |
| patched | 99.97% | 14.18 |
+----------------+----------+------------------+
[akpm@linux-foundation.org: fix nommu build, per Davidlohr]
[akpm@linux-foundation.org: document vmacache_valid() logic]
[akpm@linux-foundation.org: attempt to untangle header files]
[akpm@linux-foundation.org: add vmacache_find() BUG_ON]
[hughd@google.com: add vmacache_valid_mm() (from Oleg)]
[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: adjust and enhance comments]
Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Reviewed-by: Michel Lespinasse <walken@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Tested-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-04-08 06:37:25 +08:00
|
|
|
#define VMACACHE_BITS 2
|
|
|
|
#define VMACACHE_SIZE (1U << VMACACHE_BITS)
|
|
|
|
#define VMACACHE_MASK (VMACACHE_SIZE - 1)
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* These are the constant used to fake the fixed-point load-average
|
|
|
|
* counting. Some notes:
|
|
|
|
* - 11 bit fractions expand to 22 bits by the multiplies: this gives
|
|
|
|
* a load-average precision of 10 bits integer + 11 bits fractional
|
|
|
|
* - if you want to count load-averages more often, you need more
|
|
|
|
* precision, or rounding will get you. With 2-second counting freq,
|
|
|
|
* the EXP_n values would be 1981, 2034 and 2043 if still using only
|
|
|
|
* 11 bit fractions.
|
|
|
|
*/
|
|
|
|
extern unsigned long avenrun[]; /* Load averages */
|
2009-05-03 02:08:52 +08:00
|
|
|
extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#define FSHIFT 11 /* nr of bits of precision */
|
|
|
|
#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
|
2007-10-08 07:17:38 +08:00
|
|
|
#define LOAD_FREQ (5*HZ+1) /* 5 sec intervals */
|
2005-04-17 06:20:36 +08:00
|
|
|
#define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
|
|
|
|
#define EXP_5 2014 /* 1/exp(5sec/5min) */
|
|
|
|
#define EXP_15 2037 /* 1/exp(5sec/15min) */
|
|
|
|
|
|
|
|
#define CALC_LOAD(load,exp,n) \
|
|
|
|
load *= exp; \
|
|
|
|
load += n*(FIXED_1-exp); \
|
|
|
|
load >>= FSHIFT;
|
|
|
|
|
|
|
|
extern unsigned long total_forks;
|
|
|
|
extern int nr_threads;
|
|
|
|
DECLARE_PER_CPU(unsigned long, process_counts);
|
|
|
|
extern int nr_processes(void);
|
|
|
|
extern unsigned long nr_running(void);
|
2014-08-01 01:29:48 +08:00
|
|
|
extern bool single_task_running(void);
|
2005-04-17 06:20:36 +08:00
|
|
|
extern unsigned long nr_iowait(void);
|
2010-07-01 15:07:17 +08:00
|
|
|
extern unsigned long nr_iowait_cpu(int cpu);
|
2014-08-06 21:19:21 +08:00
|
|
|
extern void get_iowait_load(unsigned long *nr_waiters, unsigned long *load);
|
2009-09-22 08:04:08 +08:00
|
|
|
|
2010-12-01 02:48:45 +08:00
|
|
|
extern void calc_global_load(unsigned long ticks);
|
2015-04-14 19:19:42 +08:00
|
|
|
|
|
|
|
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
|
2016-04-13 21:56:51 +08:00
|
|
|
extern void cpu_load_update_nohz_start(void);
|
|
|
|
extern void cpu_load_update_nohz_stop(void);
|
2015-04-14 19:19:42 +08:00
|
|
|
#else
|
2016-04-13 21:56:51 +08:00
|
|
|
static inline void cpu_load_update_nohz_start(void) { }
|
|
|
|
static inline void cpu_load_update_nohz_stop(void) { }
|
2015-04-14 19:19:42 +08:00
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2012-09-20 07:58:38 +08:00
|
|
|
extern void dump_cpu_task(int cpu);
|
|
|
|
|
2007-07-10 00:52:00 +08:00
|
|
|
struct seq_file;
|
|
|
|
struct cfs_rq;
|
2007-10-15 23:00:14 +08:00
|
|
|
struct task_group;
|
2007-07-10 00:52:00 +08:00
|
|
|
#ifdef CONFIG_SCHED_DEBUG
|
|
|
|
extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
|
|
|
|
extern void proc_sched_set_task(struct task_struct *p);
|
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-09-30 06:18:21 +08:00
|
|
|
/*
|
|
|
|
* Task state bitmask. NOTE! These bits are also
|
|
|
|
* encoded in fs/proc/array.c: get_task_state().
|
|
|
|
*
|
|
|
|
* We have two separate sets of flags: task->state
|
|
|
|
* is about runnability, while task->exit_state are
|
|
|
|
* about the task exiting. Confusing, but this way
|
|
|
|
* modifying one set can't modify the other one by
|
|
|
|
* mistake.
|
|
|
|
*/
|
2005-04-17 06:20:36 +08:00
|
|
|
#define TASK_RUNNING 0
|
|
|
|
#define TASK_INTERRUPTIBLE 1
|
|
|
|
#define TASK_UNINTERRUPTIBLE 2
|
2007-12-07 00:13:16 +08:00
|
|
|
#define __TASK_STOPPED 4
|
|
|
|
#define __TASK_TRACED 8
|
2005-09-30 06:18:21 +08:00
|
|
|
/* in tsk->exit_state */
|
2014-04-08 06:38:46 +08:00
|
|
|
#define EXIT_DEAD 16
|
|
|
|
#define EXIT_ZOMBIE 32
|
wait: introduce EXIT_TRACE to avoid the racy EXIT_DEAD->EXIT_ZOMBIE transition
wait_task_zombie() first does EXIT_ZOMBIE->EXIT_DEAD transition and
drops tasklist_lock. If this task is not the natural child and it is
traced, we change its state back to EXIT_ZOMBIE for ->real_parent.
The last transition is racy, this is even documented in 50b8d257486a
"ptrace: partially fix the do_wait(WEXITED) vs EXIT_DEAD->EXIT_ZOMBIE
race". wait_consider_task() tries to detect this transition and clear
->notask_error but we can't rely on ptrace_reparented(), debugger can
exit and do ptrace_unlink() before its sub-thread sets EXIT_ZOMBIE.
And there is another problem which were missed before: this transition
can also race with reparent_leader() which doesn't reset >exit_signal if
EXIT_DEAD, assuming that this task must be reaped by someone else. So
the tracee can be re-parented with ->exit_signal != SIGCHLD, and if
/sbin/init doesn't use __WALL it becomes unreapable. This was fixed by
the previous commit, but it was the temporary hack.
1. Add the new exit_state, EXIT_TRACE. It means that the task is the
traced zombie, debugger is going to detach and notify its natural
parent.
This new state is actually EXIT_ZOMBIE | EXIT_DEAD. This way we
can avoid the changes in proc/kgdb code, get_task_state() still
reports "X (dead)" in this case.
Note: with or without this change userspace can see Z -> X -> Z
transition. Not really bad, but probably makes sense to fix.
2. Change wait_task_zombie() to use EXIT_TRACE instead of EXIT_DEAD
if we need to notify the ->real_parent.
3. Revert the previous hack in reparent_leader(), now that EXIT_DEAD
is always the final state we can safely ignore such a task.
4. Change wait_consider_task() to check EXIT_TRACE separately and kill
the racy and no longer needed ptrace_reparented() case.
If ptrace == T an EXIT_TRACE thread should be simply ignored, the
owner of this state is going to ptrace_unlink() this task. We can
pretend that it was already removed from ->ptraced list.
Otherwise we should skip this thread too but clear ->notask_error,
we must be the natural parent and debugger is going to untrace and
notify us. IOW, this doesn't differ from "EXIT_ZOMBIE && p->ptrace"
even if the task was already untraced.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reported-by: Jan Kratochvil <jan.kratochvil@redhat.com>
Reported-by: Michal Schmidt <mschmidt@redhat.com>
Tested-by: Michal Schmidt <mschmidt@redhat.com>
Cc: Al Viro <viro@ZenIV.linux.org.uk>
Cc: Lennart Poettering <lpoetter@redhat.com>
Cc: Roland McGrath <roland@hack.frob.com>
Cc: Tejun Heo <tj@kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-04-08 06:38:42 +08:00
|
|
|
#define EXIT_TRACE (EXIT_ZOMBIE | EXIT_DEAD)
|
2005-09-30 06:18:21 +08:00
|
|
|
/* in tsk->state again */
|
2007-10-15 23:00:13 +08:00
|
|
|
#define TASK_DEAD 64
|
2007-12-07 00:13:16 +08:00
|
|
|
#define TASK_WAKEKILL 128
|
2009-09-15 20:43:03 +08:00
|
|
|
#define TASK_WAKING 256
|
2013-04-09 15:33:34 +08:00
|
|
|
#define TASK_PARKED 512
|
2015-05-08 20:23:45 +08:00
|
|
|
#define TASK_NOLOAD 1024
|
|
|
|
#define TASK_STATE_MAX 2048
|
2007-12-07 00:13:16 +08:00
|
|
|
|
2015-05-08 20:23:45 +08:00
|
|
|
#define TASK_STATE_TO_CHAR_STR "RSDTtXZxKWPN"
|
2009-12-17 20:16:27 +08:00
|
|
|
|
2009-12-17 20:16:30 +08:00
|
|
|
extern char ___assert_task_state[1 - 2*!!(
|
|
|
|
sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
|
2007-12-07 00:13:16 +08:00
|
|
|
|
|
|
|
/* Convenience macros for the sake of set_task_state */
|
|
|
|
#define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
|
|
|
|
#define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED)
|
|
|
|
#define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED)
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2015-05-08 20:23:45 +08:00
|
|
|
#define TASK_IDLE (TASK_UNINTERRUPTIBLE | TASK_NOLOAD)
|
|
|
|
|
2007-12-06 23:55:25 +08:00
|
|
|
/* Convenience macros for the sake of wake_up */
|
|
|
|
#define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
|
2007-12-07 00:13:16 +08:00
|
|
|
#define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
|
2007-12-06 23:55:25 +08:00
|
|
|
|
|
|
|
/* get_task_state() */
|
|
|
|
#define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \
|
2007-12-07 00:13:16 +08:00
|
|
|
TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
|
2014-01-24 07:55:35 +08:00
|
|
|
__TASK_TRACED | EXIT_ZOMBIE | EXIT_DEAD)
|
2007-12-06 23:55:25 +08:00
|
|
|
|
2007-12-07 00:13:16 +08:00
|
|
|
#define task_is_traced(task) ((task->state & __TASK_TRACED) != 0)
|
|
|
|
#define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0)
|
2007-12-06 23:55:25 +08:00
|
|
|
#define task_is_stopped_or_traced(task) \
|
2007-12-07 00:13:16 +08:00
|
|
|
((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
|
2007-12-06 23:55:25 +08:00
|
|
|
#define task_contributes_to_load(task) \
|
2009-04-09 08:45:12 +08:00
|
|
|
((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
|
2015-05-08 20:23:45 +08:00
|
|
|
(task->flags & PF_FROZEN) == 0 && \
|
|
|
|
(task->state & TASK_NOLOAD) == 0)
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2014-09-24 16:18:55 +08:00
|
|
|
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
|
|
|
|
|
|
|
|
#define __set_task_state(tsk, state_value) \
|
|
|
|
do { \
|
|
|
|
(tsk)->task_state_change = _THIS_IP_; \
|
|
|
|
(tsk)->state = (state_value); \
|
|
|
|
} while (0)
|
|
|
|
#define set_task_state(tsk, state_value) \
|
|
|
|
do { \
|
|
|
|
(tsk)->task_state_change = _THIS_IP_; \
|
2015-05-12 16:51:55 +08:00
|
|
|
smp_store_mb((tsk)->state, (state_value)); \
|
2014-09-24 16:18:55 +08:00
|
|
|
} while (0)
|
|
|
|
|
|
|
|
/*
|
|
|
|
* set_current_state() includes a barrier so that the write of current->state
|
|
|
|
* is correctly serialised wrt the caller's subsequent test of whether to
|
|
|
|
* actually sleep:
|
|
|
|
*
|
|
|
|
* set_current_state(TASK_UNINTERRUPTIBLE);
|
|
|
|
* if (do_i_need_to_sleep())
|
|
|
|
* schedule();
|
|
|
|
*
|
|
|
|
* If the caller does not need such serialisation then use __set_current_state()
|
|
|
|
*/
|
|
|
|
#define __set_current_state(state_value) \
|
|
|
|
do { \
|
|
|
|
current->task_state_change = _THIS_IP_; \
|
|
|
|
current->state = (state_value); \
|
|
|
|
} while (0)
|
|
|
|
#define set_current_state(state_value) \
|
|
|
|
do { \
|
|
|
|
current->task_state_change = _THIS_IP_; \
|
2015-05-12 16:51:55 +08:00
|
|
|
smp_store_mb(current->state, (state_value)); \
|
2014-09-24 16:18:55 +08:00
|
|
|
} while (0)
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#define __set_task_state(tsk, state_value) \
|
|
|
|
do { (tsk)->state = (state_value); } while (0)
|
|
|
|
#define set_task_state(tsk, state_value) \
|
2015-05-12 16:51:55 +08:00
|
|
|
smp_store_mb((tsk)->state, (state_value))
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-09-13 16:25:14 +08:00
|
|
|
/*
|
|
|
|
* set_current_state() includes a barrier so that the write of current->state
|
|
|
|
* is correctly serialised wrt the caller's subsequent test of whether to
|
|
|
|
* actually sleep:
|
|
|
|
*
|
|
|
|
* set_current_state(TASK_UNINTERRUPTIBLE);
|
|
|
|
* if (do_i_need_to_sleep())
|
|
|
|
* schedule();
|
|
|
|
*
|
|
|
|
* If the caller does not need such serialisation then use __set_current_state()
|
|
|
|
*/
|
2014-09-24 16:18:55 +08:00
|
|
|
#define __set_current_state(state_value) \
|
2005-04-17 06:20:36 +08:00
|
|
|
do { current->state = (state_value); } while (0)
|
2014-09-24 16:18:55 +08:00
|
|
|
#define set_current_state(state_value) \
|
2015-05-12 16:51:55 +08:00
|
|
|
smp_store_mb(current->state, (state_value))
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2014-09-24 16:18:55 +08:00
|
|
|
#endif
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Task command name length */
|
|
|
|
#define TASK_COMM_LEN 16
|
|
|
|
|
|
|
|
#include <linux/spinlock.h>
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This serializes "schedule()" and also protects
|
|
|
|
* the run-queue from deletions/modifications (but
|
|
|
|
* _adding_ to the beginning of the run-queue has
|
|
|
|
* a separate lock).
|
|
|
|
*/
|
|
|
|
extern rwlock_t tasklist_lock;
|
|
|
|
extern spinlock_t mmlist_lock;
|
|
|
|
|
2006-07-03 15:25:41 +08:00
|
|
|
struct task_struct;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2010-03-03 23:46:56 +08:00
|
|
|
#ifdef CONFIG_PROVE_RCU
|
|
|
|
extern int lockdep_tasklist_lock_is_held(void);
|
|
|
|
#endif /* #ifdef CONFIG_PROVE_RCU */
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
extern void sched_init(void);
|
|
|
|
extern void sched_init_smp(void);
|
2008-02-16 01:56:34 +08:00
|
|
|
extern asmlinkage void schedule_tail(struct task_struct *prev);
|
2006-07-03 15:25:41 +08:00
|
|
|
extern void init_idle(struct task_struct *idle, int cpu);
|
2007-07-10 00:51:58 +08:00
|
|
|
extern void init_idle_bootup_task(struct task_struct *idle);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2015-03-10 00:12:07 +08:00
|
|
|
extern cpumask_var_t cpu_isolated_map;
|
|
|
|
|
2009-09-20 02:55:44 +08:00
|
|
|
extern int runqueue_is_locked(int cpu);
|
2008-05-13 03:20:52 +08:00
|
|
|
|
2011-08-11 05:21:01 +08:00
|
|
|
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ_COMMON)
|
2012-09-10 15:10:58 +08:00
|
|
|
extern void nohz_balance_enter_idle(int cpu);
|
2011-12-02 09:07:33 +08:00
|
|
|
extern void set_cpu_sd_state_idle(void);
|
2015-05-27 06:50:33 +08:00
|
|
|
extern int get_nohz_timer_target(void);
|
2007-05-08 15:32:51 +08:00
|
|
|
#else
|
2012-09-10 15:10:58 +08:00
|
|
|
static inline void nohz_balance_enter_idle(int cpu) { }
|
2011-12-06 19:47:55 +08:00
|
|
|
static inline void set_cpu_sd_state_idle(void) { }
|
2007-05-08 15:32:51 +08:00
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-12-07 12:35:59 +08:00
|
|
|
/*
|
2007-04-26 11:50:03 +08:00
|
|
|
* Only dump TASK_* tasks. (0 for all tasks)
|
2006-12-07 12:35:59 +08:00
|
|
|
*/
|
|
|
|
extern void show_state_filter(unsigned long state_filter);
|
|
|
|
|
|
|
|
static inline void show_state(void)
|
|
|
|
{
|
2007-04-26 11:50:03 +08:00
|
|
|
show_state_filter(0);
|
2006-12-07 12:35:59 +08:00
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
extern void show_regs(struct pt_regs *);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* TASK is a pointer to the task whose backtrace we want to see (or NULL for current
|
|
|
|
* task), SP is the stack pointer of the first frame that should be shown in the back
|
|
|
|
* trace (or NULL if the entire call-chain of the task should be shown).
|
|
|
|
*/
|
|
|
|
extern void show_stack(struct task_struct *task, unsigned long *sp);
|
|
|
|
|
|
|
|
extern void cpu_init (void);
|
|
|
|
extern void trap_init(void);
|
|
|
|
extern void update_process_times(int user);
|
|
|
|
extern void scheduler_tick(void);
|
2016-03-10 19:54:09 +08:00
|
|
|
extern int sched_cpu_starting(unsigned int cpu);
|
2016-03-10 19:54:13 +08:00
|
|
|
extern int sched_cpu_activate(unsigned int cpu);
|
|
|
|
extern int sched_cpu_deactivate(unsigned int cpu);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2016-03-10 19:54:18 +08:00
|
|
|
#ifdef CONFIG_HOTPLUG_CPU
|
|
|
|
extern int sched_cpu_dying(unsigned int cpu);
|
|
|
|
#else
|
|
|
|
# define sched_cpu_dying NULL
|
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-01-26 04:08:02 +08:00
|
|
|
extern void sched_show_task(struct task_struct *p);
|
|
|
|
|
2010-05-13 08:30:49 +08:00
|
|
|
#ifdef CONFIG_LOCKUP_DETECTOR
|
2015-12-09 00:28:04 +08:00
|
|
|
extern void touch_softlockup_watchdog_sched(void);
|
2005-09-07 06:16:27 +08:00
|
|
|
extern void touch_softlockup_watchdog(void);
|
2010-01-28 06:25:22 +08:00
|
|
|
extern void touch_softlockup_watchdog_sync(void);
|
2007-05-08 15:28:05 +08:00
|
|
|
extern void touch_all_softlockup_watchdogs(void);
|
2010-05-08 05:11:45 +08:00
|
|
|
extern int proc_dowatchdog_thresh(struct ctl_table *table, int write,
|
|
|
|
void __user *buffer,
|
|
|
|
size_t *lenp, loff_t *ppos);
|
2008-05-13 03:21:04 +08:00
|
|
|
extern unsigned int softlockup_panic;
|
2015-11-06 10:44:44 +08:00
|
|
|
extern unsigned int hardlockup_panic;
|
2010-11-26 01:38:29 +08:00
|
|
|
void lockup_detector_init(void);
|
2005-09-07 06:16:27 +08:00
|
|
|
#else
|
2015-12-09 00:28:04 +08:00
|
|
|
static inline void touch_softlockup_watchdog_sched(void)
|
|
|
|
{
|
|
|
|
}
|
2005-09-07 06:16:27 +08:00
|
|
|
static inline void touch_softlockup_watchdog(void)
|
|
|
|
{
|
|
|
|
}
|
2010-01-28 06:25:22 +08:00
|
|
|
static inline void touch_softlockup_watchdog_sync(void)
|
|
|
|
{
|
|
|
|
}
|
2007-05-08 15:28:05 +08:00
|
|
|
static inline void touch_all_softlockup_watchdogs(void)
|
|
|
|
{
|
|
|
|
}
|
2010-11-26 01:38:29 +08:00
|
|
|
static inline void lockup_detector_init(void)
|
|
|
|
{
|
|
|
|
}
|
2005-09-07 06:16:27 +08:00
|
|
|
#endif
|
|
|
|
|
2013-10-12 08:39:26 +08:00
|
|
|
#ifdef CONFIG_DETECT_HUNG_TASK
|
|
|
|
void reset_hung_task_detector(void);
|
|
|
|
#else
|
|
|
|
static inline void reset_hung_task_detector(void)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Attach to any functions which should be ignored in wchan output. */
|
|
|
|
#define __sched __attribute__((__section__(".sched.text")))
|
2007-11-28 22:52:56 +08:00
|
|
|
|
|
|
|
/* Linker adds these: start and end of __sched functions */
|
|
|
|
extern char __sched_text_start[], __sched_text_end[];
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Is this address in the __sched functions? */
|
|
|
|
extern int in_sched_functions(unsigned long addr);
|
|
|
|
|
|
|
|
#define MAX_SCHEDULE_TIMEOUT LONG_MAX
|
2008-02-14 07:03:15 +08:00
|
|
|
extern signed long schedule_timeout(signed long timeout);
|
2005-09-10 15:27:21 +08:00
|
|
|
extern signed long schedule_timeout_interruptible(signed long timeout);
|
2007-12-07 00:59:46 +08:00
|
|
|
extern signed long schedule_timeout_killable(signed long timeout);
|
2005-09-10 15:27:21 +08:00
|
|
|
extern signed long schedule_timeout_uninterruptible(signed long timeout);
|
2016-03-26 05:20:21 +08:00
|
|
|
extern signed long schedule_timeout_idle(signed long timeout);
|
2005-04-17 06:20:36 +08:00
|
|
|
asmlinkage void schedule(void);
|
2011-03-21 19:09:35 +08:00
|
|
|
extern void schedule_preempt_disabled(void);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
sched: Prevent recursion in io_schedule()
io_schedule() calls blk_flush_plug() which, depending on the
contents of current->plug, can initiate arbitrary blk-io requests.
Note that this contrasts with blk_schedule_flush_plug() which requires
all non-trivial work to be handed off to a separate thread.
This makes it possible for io_schedule() to recurse, and initiating
block requests could possibly call mempool_alloc() which, in times of
memory pressure, uses io_schedule().
Apart from any stack usage issues, io_schedule() will not behave
correctly when called recursively as delayacct_blkio_start() does
not allow for repeated calls.
So:
- use ->in_iowait to detect recursion. Set it earlier, and restore
it to the old value.
- move the call to "raw_rq" after the call to blk_flush_plug().
As this is some sort of per-cpu thing, we want some chance that
we are on the right CPU
- When io_schedule() is called recurively, use blk_schedule_flush_plug()
which cannot further recurse.
- as this makes io_schedule() a lot more complex and as io_schedule()
must match io_schedule_timeout(), but all the changes in io_schedule_timeout()
and make io_schedule a simple wrapper for that.
Signed-off-by: NeilBrown <neilb@suse.de>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
[ Moved the now rudimentary io_schedule() into sched.h. ]
Cc: Jens Axboe <axboe@kernel.dk>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Tony Battersby <tonyb@cybernetics.com>
Link: http://lkml.kernel.org/r/20150213162600.059fffb2@notabene.brown
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-02-13 12:49:17 +08:00
|
|
|
extern long io_schedule_timeout(long timeout);
|
|
|
|
|
|
|
|
static inline void io_schedule(void)
|
|
|
|
{
|
|
|
|
io_schedule_timeout(MAX_SCHEDULE_TIMEOUT);
|
|
|
|
}
|
|
|
|
|
2006-10-02 17:18:06 +08:00
|
|
|
struct nsproxy;
|
2007-07-16 14:40:59 +08:00
|
|
|
struct user_namespace;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2010-01-16 09:01:35 +08:00
|
|
|
#ifdef CONFIG_MMU
|
|
|
|
extern void arch_pick_mmap_layout(struct mm_struct *mm);
|
2005-04-17 06:20:36 +08:00
|
|
|
extern unsigned long
|
|
|
|
arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
|
|
|
|
unsigned long, unsigned long);
|
|
|
|
extern unsigned long
|
|
|
|
arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
|
|
|
|
unsigned long len, unsigned long pgoff,
|
|
|
|
unsigned long flags);
|
2010-01-16 09:01:35 +08:00
|
|
|
#else
|
|
|
|
static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
|
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2013-11-13 07:11:17 +08:00
|
|
|
#define SUID_DUMP_DISABLE 0 /* No setuid dumping */
|
|
|
|
#define SUID_DUMP_USER 1 /* Dump as user of process */
|
|
|
|
#define SUID_DUMP_ROOT 2 /* Dump as root */
|
|
|
|
|
2007-07-19 16:48:27 +08:00
|
|
|
/* mm flags */
|
2009-09-22 08:01:57 +08:00
|
|
|
|
2014-01-24 07:55:32 +08:00
|
|
|
/* for SUID_DUMP_* above */
|
2007-07-19 16:48:28 +08:00
|
|
|
#define MMF_DUMPABLE_BITS 2
|
2009-09-22 08:01:57 +08:00
|
|
|
#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1)
|
2007-07-19 16:48:28 +08:00
|
|
|
|
2014-01-24 07:55:34 +08:00
|
|
|
extern void set_dumpable(struct mm_struct *mm, int value);
|
|
|
|
/*
|
|
|
|
* This returns the actual value of the suid_dumpable flag. For things
|
|
|
|
* that are using this for checking for privilege transitions, it must
|
|
|
|
* test against SUID_DUMP_USER rather than treating it as a boolean
|
|
|
|
* value.
|
|
|
|
*/
|
|
|
|
static inline int __get_dumpable(unsigned long mm_flags)
|
|
|
|
{
|
|
|
|
return mm_flags & MMF_DUMPABLE_MASK;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline int get_dumpable(struct mm_struct *mm)
|
|
|
|
{
|
|
|
|
return __get_dumpable(mm->flags);
|
|
|
|
}
|
|
|
|
|
2007-07-19 16:48:28 +08:00
|
|
|
/* coredump filter bits */
|
|
|
|
#define MMF_DUMP_ANON_PRIVATE 2
|
|
|
|
#define MMF_DUMP_ANON_SHARED 3
|
|
|
|
#define MMF_DUMP_MAPPED_PRIVATE 4
|
|
|
|
#define MMF_DUMP_MAPPED_SHARED 5
|
2007-10-17 14:27:02 +08:00
|
|
|
#define MMF_DUMP_ELF_HEADERS 6
|
coredump_filter: add hugepage dumping
Presently hugepage's vma has a VM_RESERVED flag in order not to be
swapped. But a VM_RESERVED vma isn't core dumped because this flag is
often used for some kernel vmas (e.g. vmalloc, sound related).
Thus hugepages are never dumped and it can't be debugged easily. Many
developers want hugepages to be included into core-dump.
However, We can't read generic VM_RESERVED area because this area is often
IO mapping area. then these area reading may change device state. it is
definitly undesiable side-effect.
So adding a hugepage specific bit to the coredump filter is better. It
will be able to hugepage core dumping and doesn't cause any side-effect to
any i/o devices.
In additional, libhugetlb use hugetlb private mapping pages as anonymous
page. Then, hugepage private mapping pages should be core dumped by
default.
Then, /proc/[pid]/core_dump_filter has two new bits.
- bit 5 mean hugetlb private mapping pages are dumped or not. (default: yes)
- bit 6 mean hugetlb shared mapping pages are dumped or not. (default: no)
I tested by following method.
% ulimit -c unlimited
% ./crash_hugepage 50
% ./crash_hugepage 50 -p
% ls -lh
% gdb ./crash_hugepage core
%
% echo 0x43 > /proc/self/coredump_filter
% ./crash_hugepage 50
% ./crash_hugepage 50 -p
% ls -lh
% gdb ./crash_hugepage core
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
#include <string.h>
#include "hugetlbfs.h"
int main(int argc, char** argv){
char* p;
int ch;
int mmap_flags = MAP_SHARED;
int fd;
int nr_pages;
while((ch = getopt(argc, argv, "p")) != -1) {
switch (ch) {
case 'p':
mmap_flags &= ~MAP_SHARED;
mmap_flags |= MAP_PRIVATE;
break;
default:
/* nothing*/
break;
}
}
argc -= optind;
argv += optind;
if (argc == 0){
printf("need # of pages\n");
exit(1);
}
nr_pages = atoi(argv[0]);
if (nr_pages < 2) {
printf("nr_pages must >2\n");
exit(1);
}
fd = hugetlbfs_unlinked_fd();
p = mmap(NULL, nr_pages * gethugepagesize(),
PROT_READ|PROT_WRITE, mmap_flags, fd, 0);
sleep(2);
*(p + gethugepagesize()) = 1; /* COW */
sleep(2);
/* crash! */
*(int*)0 = 1;
return 0;
}
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Kawai Hidehiro <hidehiro.kawai.ez@hitachi.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: William Irwin <wli@holomorphy.com>
Cc: Adam Litke <agl@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 11:27:08 +08:00
|
|
|
#define MMF_DUMP_HUGETLB_PRIVATE 7
|
|
|
|
#define MMF_DUMP_HUGETLB_SHARED 8
|
2015-10-06 06:33:36 +08:00
|
|
|
#define MMF_DUMP_DAX_PRIVATE 9
|
|
|
|
#define MMF_DUMP_DAX_SHARED 10
|
2009-09-22 08:01:57 +08:00
|
|
|
|
2007-07-19 16:48:28 +08:00
|
|
|
#define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS
|
2015-10-06 06:33:36 +08:00
|
|
|
#define MMF_DUMP_FILTER_BITS 9
|
2007-07-19 16:48:28 +08:00
|
|
|
#define MMF_DUMP_FILTER_MASK \
|
|
|
|
(((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
|
|
|
|
#define MMF_DUMP_FILTER_DEFAULT \
|
coredump_filter: add hugepage dumping
Presently hugepage's vma has a VM_RESERVED flag in order not to be
swapped. But a VM_RESERVED vma isn't core dumped because this flag is
often used for some kernel vmas (e.g. vmalloc, sound related).
Thus hugepages are never dumped and it can't be debugged easily. Many
developers want hugepages to be included into core-dump.
However, We can't read generic VM_RESERVED area because this area is often
IO mapping area. then these area reading may change device state. it is
definitly undesiable side-effect.
So adding a hugepage specific bit to the coredump filter is better. It
will be able to hugepage core dumping and doesn't cause any side-effect to
any i/o devices.
In additional, libhugetlb use hugetlb private mapping pages as anonymous
page. Then, hugepage private mapping pages should be core dumped by
default.
Then, /proc/[pid]/core_dump_filter has two new bits.
- bit 5 mean hugetlb private mapping pages are dumped or not. (default: yes)
- bit 6 mean hugetlb shared mapping pages are dumped or not. (default: no)
I tested by following method.
% ulimit -c unlimited
% ./crash_hugepage 50
% ./crash_hugepage 50 -p
% ls -lh
% gdb ./crash_hugepage core
%
% echo 0x43 > /proc/self/coredump_filter
% ./crash_hugepage 50
% ./crash_hugepage 50 -p
% ls -lh
% gdb ./crash_hugepage core
#include <stdlib.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/mman.h>
#include <string.h>
#include "hugetlbfs.h"
int main(int argc, char** argv){
char* p;
int ch;
int mmap_flags = MAP_SHARED;
int fd;
int nr_pages;
while((ch = getopt(argc, argv, "p")) != -1) {
switch (ch) {
case 'p':
mmap_flags &= ~MAP_SHARED;
mmap_flags |= MAP_PRIVATE;
break;
default:
/* nothing*/
break;
}
}
argc -= optind;
argv += optind;
if (argc == 0){
printf("need # of pages\n");
exit(1);
}
nr_pages = atoi(argv[0]);
if (nr_pages < 2) {
printf("nr_pages must >2\n");
exit(1);
}
fd = hugetlbfs_unlinked_fd();
p = mmap(NULL, nr_pages * gethugepagesize(),
PROT_READ|PROT_WRITE, mmap_flags, fd, 0);
sleep(2);
*(p + gethugepagesize()) = 1; /* COW */
sleep(2);
/* crash! */
*(int*)0 = 1;
return 0;
}
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Reviewed-by: Kawai Hidehiro <hidehiro.kawai.ez@hitachi.com>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: William Irwin <wli@holomorphy.com>
Cc: Adam Litke <agl@us.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 11:27:08 +08:00
|
|
|
((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\
|
2008-10-19 11:28:23 +08:00
|
|
|
(1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF)
|
|
|
|
|
|
|
|
#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS
|
|
|
|
# define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS)
|
|
|
|
#else
|
|
|
|
# define MMF_DUMP_MASK_DEFAULT_ELF 0
|
|
|
|
#endif
|
2009-09-22 08:01:57 +08:00
|
|
|
/* leave room for more dump flags */
|
|
|
|
#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */
|
2011-01-14 07:46:58 +08:00
|
|
|
#define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */
|
2012-06-08 05:21:11 +08:00
|
|
|
#define MMF_EXE_FILE_CHANGED 18 /* see prctl_set_mm_exe_file() */
|
2009-09-22 08:01:57 +08:00
|
|
|
|
2012-08-19 22:15:09 +08:00
|
|
|
#define MMF_HAS_UPROBES 19 /* has uprobes */
|
|
|
|
#define MMF_RECALC_UPROBES 20 /* MMF_HAS_UPROBES can be wrong */
|
2012-08-08 23:11:42 +08:00
|
|
|
|
2009-09-22 08:01:57 +08:00
|
|
|
#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
|
2007-07-19 16:48:27 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
struct sighand_struct {
|
|
|
|
atomic_t count;
|
|
|
|
struct k_sigaction action[_NSIG];
|
|
|
|
spinlock_t siglock;
|
2007-09-21 03:40:16 +08:00
|
|
|
wait_queue_head_t signalfd_wqh;
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
|
|
|
|
2006-06-25 20:49:24 +08:00
|
|
|
struct pacct_struct {
|
2006-06-25 20:49:25 +08:00
|
|
|
int ac_flag;
|
|
|
|
long ac_exitcode;
|
2006-06-25 20:49:24 +08:00
|
|
|
unsigned long ac_mem;
|
2006-06-25 20:49:26 +08:00
|
|
|
cputime_t ac_utime, ac_stime;
|
|
|
|
unsigned long ac_minflt, ac_majflt;
|
2006-06-25 20:49:24 +08:00
|
|
|
};
|
|
|
|
|
2009-07-29 18:15:26 +08:00
|
|
|
struct cpu_itimer {
|
|
|
|
cputime_t expires;
|
|
|
|
cputime_t incr;
|
2009-07-29 18:15:27 +08:00
|
|
|
u32 error;
|
|
|
|
u32 incr_error;
|
2009-07-29 18:15:26 +08:00
|
|
|
};
|
|
|
|
|
2012-11-22 07:58:35 +08:00
|
|
|
/**
|
2015-06-30 17:30:54 +08:00
|
|
|
* struct prev_cputime - snaphsot of system and user cputime
|
2012-11-22 07:58:35 +08:00
|
|
|
* @utime: time spent in user mode
|
|
|
|
* @stime: time spent in system mode
|
2015-06-30 17:30:54 +08:00
|
|
|
* @lock: protects the above two fields
|
2012-11-22 07:58:35 +08:00
|
|
|
*
|
2015-06-30 17:30:54 +08:00
|
|
|
* Stores previous user/system time values such that we can guarantee
|
|
|
|
* monotonicity.
|
2012-11-22 07:58:35 +08:00
|
|
|
*/
|
2015-06-30 17:30:54 +08:00
|
|
|
struct prev_cputime {
|
|
|
|
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
|
2012-11-22 07:58:35 +08:00
|
|
|
cputime_t utime;
|
|
|
|
cputime_t stime;
|
2015-06-30 17:30:54 +08:00
|
|
|
raw_spinlock_t lock;
|
|
|
|
#endif
|
2012-11-22 07:58:35 +08:00
|
|
|
};
|
|
|
|
|
2015-06-30 17:30:54 +08:00
|
|
|
static inline void prev_cputime_init(struct prev_cputime *prev)
|
|
|
|
{
|
|
|
|
#ifndef CONFIG_VIRT_CPU_ACCOUNTING_NATIVE
|
|
|
|
prev->utime = prev->stime = 0;
|
|
|
|
raw_spin_lock_init(&prev->lock);
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
struct task_cputime totals;
};
struct thread_group_cputime {
struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values from
from all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
|
|
|
/**
|
|
|
|
* struct task_cputime - collected CPU time counts
|
|
|
|
* @utime: time spent in user mode, in &cputime_t units
|
|
|
|
* @stime: time spent in kernel mode, in &cputime_t units
|
|
|
|
* @sum_exec_runtime: total time spent on the CPU, in nanoseconds
|
2008-09-14 23:11:46 +08:00
|
|
|
*
|
2015-06-30 17:30:54 +08:00
|
|
|
* This structure groups together three kinds of CPU time that are tracked for
|
|
|
|
* threads and thread groups. Most things considering CPU time want to group
|
|
|
|
* these counts together and treat all three of them in parallel.
|
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
struct task_cputime totals;
};
struct thread_group_cputime {
struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values from
from all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
|
|
|
*/
|
|
|
|
struct task_cputime {
|
|
|
|
cputime_t utime;
|
|
|
|
cputime_t stime;
|
|
|
|
unsigned long long sum_exec_runtime;
|
|
|
|
};
|
2015-06-30 17:30:54 +08:00
|
|
|
|
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
struct task_cputime totals;
};
struct thread_group_cputime {
struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values from
from all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
|
|
|
/* Alternate field names when used to cache expirations. */
|
|
|
|
#define virt_exp utime
|
2015-06-30 17:30:54 +08:00
|
|
|
#define prof_exp stime
|
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
struct task_cputime totals;
};
struct thread_group_cputime {
struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values from
from all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
|
|
|
#define sched_exp sum_exec_runtime
|
|
|
|
|
2009-02-05 19:24:16 +08:00
|
|
|
#define INIT_CPUTIME \
|
|
|
|
(struct task_cputime) { \
|
2011-12-15 21:56:09 +08:00
|
|
|
.utime = 0, \
|
|
|
|
.stime = 0, \
|
2009-02-05 19:24:16 +08:00
|
|
|
.sum_exec_runtime = 0, \
|
|
|
|
}
|
|
|
|
|
2015-04-29 04:00:23 +08:00
|
|
|
/*
|
|
|
|
* This is the atomic variant of task_cputime, which can be used for
|
|
|
|
* storing and updating task_cputime statistics without locking.
|
|
|
|
*/
|
|
|
|
struct task_cputime_atomic {
|
|
|
|
atomic64_t utime;
|
|
|
|
atomic64_t stime;
|
|
|
|
atomic64_t sum_exec_runtime;
|
|
|
|
};
|
|
|
|
|
|
|
|
#define INIT_CPUTIME_ATOMIC \
|
|
|
|
(struct task_cputime_atomic) { \
|
|
|
|
.utime = ATOMIC64_INIT(0), \
|
|
|
|
.stime = ATOMIC64_INIT(0), \
|
|
|
|
.sum_exec_runtime = ATOMIC64_INIT(0), \
|
|
|
|
}
|
|
|
|
|
2015-09-28 23:52:18 +08:00
|
|
|
#define PREEMPT_DISABLED (PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
|
2013-09-24 01:04:26 +08:00
|
|
|
|
2009-07-10 20:57:56 +08:00
|
|
|
/*
|
2015-09-28 23:45:40 +08:00
|
|
|
* Disable preemption until the scheduler is running -- use an unconditional
|
|
|
|
* value so that it also works on !PREEMPT_COUNT kernels.
|
2009-07-10 20:57:57 +08:00
|
|
|
*
|
2015-09-28 23:45:40 +08:00
|
|
|
* Reset by start_kernel()->sched_init()->init_idle()->init_idle_preempt_count().
|
2009-07-10 20:57:56 +08:00
|
|
|
*/
|
2015-09-28 23:45:40 +08:00
|
|
|
#define INIT_PREEMPT_COUNT PREEMPT_OFFSET
|
2013-09-24 01:04:26 +08:00
|
|
|
|
2009-07-10 20:57:56 +08:00
|
|
|
/*
|
2015-09-28 23:52:18 +08:00
|
|
|
* Initial preempt_count value; reflects the preempt_count schedule invariant
|
|
|
|
* which states that during context switches:
|
2009-07-10 20:57:57 +08:00
|
|
|
*
|
2015-09-28 23:52:18 +08:00
|
|
|
* preempt_count() == 2*PREEMPT_DISABLE_OFFSET
|
|
|
|
*
|
|
|
|
* Note: PREEMPT_DISABLE_OFFSET is 0 for !PREEMPT_COUNT kernels.
|
|
|
|
* Note: See finish_task_switch().
|
2009-07-10 20:57:56 +08:00
|
|
|
*/
|
2015-09-28 23:52:18 +08:00
|
|
|
#define FORK_PREEMPT_COUNT (2*PREEMPT_DISABLE_OFFSET + PREEMPT_ENABLED)
|
2009-07-10 20:57:56 +08:00
|
|
|
|
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
struct task_cputime totals;
};
struct thread_group_cputime {
struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values from
from all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
|
|
|
/**
|
2009-02-05 19:24:16 +08:00
|
|
|
* struct thread_group_cputimer - thread group interval timer counts
|
2015-05-09 05:31:50 +08:00
|
|
|
* @cputime_atomic: atomic thread group interval timers.
|
2015-10-15 03:07:55 +08:00
|
|
|
* @running: true when there are timers running and
|
|
|
|
* @cputime_atomic receives updates.
|
2015-10-15 03:07:56 +08:00
|
|
|
* @checking_timer: true when a thread in the group is in the
|
|
|
|
* process of checking for thread group timers.
|
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
struct task_cputime totals;
};
struct thread_group_cputime {
struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values from
from all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
|
|
|
*
|
|
|
|
* This structure contains the version of task_cputime, above, that is
|
2009-02-05 19:24:16 +08:00
|
|
|
* used for thread group CPU timer calculations.
|
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
struct task_cputime totals;
};
struct thread_group_cputime {
struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values from
from all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
|
|
|
*/
|
2009-02-05 19:24:16 +08:00
|
|
|
struct thread_group_cputimer {
|
2015-04-29 04:00:24 +08:00
|
|
|
struct task_cputime_atomic cputime_atomic;
|
2015-10-15 03:07:55 +08:00
|
|
|
bool running;
|
2015-10-15 03:07:56 +08:00
|
|
|
bool checking_timer;
|
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
struct task_cputime totals;
};
struct thread_group_cputime {
struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values from
from all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
|
|
|
};
|
|
|
|
|
2011-05-27 07:25:18 +08:00
|
|
|
#include <linux/rwsem.h>
|
sched: Add 'autogroup' scheduling feature: automated per session task groups
A recurring complaint from CFS users is that parallel kbuild has
a negative impact on desktop interactivity. This patch
implements an idea from Linus, to automatically create task
groups. Currently, only per session autogroups are implemented,
but the patch leaves the way open for enhancement.
Implementation: each task's signal struct contains an inherited
pointer to a refcounted autogroup struct containing a task group
pointer, the default for all tasks pointing to the
init_task_group. When a task calls setsid(), a new task group
is created, the process is moved into the new task group, and a
reference to the preveious task group is dropped. Child
processes inherit this task group thereafter, and increase it's
refcount. When the last thread of a process exits, the
process's reference is dropped, such that when the last process
referencing an autogroup exits, the autogroup is destroyed.
At runqueue selection time, IFF a task has no cgroup assignment,
its current autogroup is used.
Autogroup bandwidth is controllable via setting it's nice level
through the proc filesystem:
cat /proc/<pid>/autogroup
Displays the task's group and the group's nice level.
echo <nice level> > /proc/<pid>/autogroup
Sets the task group's shares to the weight of nice <level> task.
Setting nice level is rate limited for !admin users due to the
abuse risk of task group locking.
The feature is enabled from boot by default if
CONFIG_SCHED_AUTOGROUP=y is selected, but can be disabled via
the boot option noautogroup, and can also be turned on/off on
the fly via:
echo [01] > /proc/sys/kernel/sched_autogroup_enabled
... which will automatically move tasks to/from the root task group.
Signed-off-by: Mike Galbraith <efault@gmx.de>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Markus Trippelsdorf <markus@trippelsdorf.de>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Paul Turner <pjt@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
[ Removed the task_group_path() debug code, and fixed !EVENTFD build failure. ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <1290281700.28711.9.camel@maggy.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-11-30 21:18:03 +08:00
|
|
|
struct autogroup;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
2011-03-22 03:24:47 +08:00
|
|
|
* NOTE! "signal_struct" does not have its own
|
2005-04-17 06:20:36 +08:00
|
|
|
* locking, because a shared signal_struct always
|
|
|
|
* implies a shared sighand_struct, so locking
|
|
|
|
* sighand_struct is always a proper superset of
|
|
|
|
* the locking of signal_struct.
|
|
|
|
*/
|
|
|
|
struct signal_struct {
|
signals: make task_struct->signal immutable/refcountable
We have a lot of problems with accessing task_struct->signal, it can
"disappear" at any moment. Even current can't use its ->signal safely
after exit_notify(). ->siglock helps, but it is not convenient, not
always possible, and sometimes it makes sense to use task->signal even
after this task has already dead.
This patch adds the reference counter, sigcnt, into signal_struct. This
reference is owned by task_struct and it is dropped in
__put_task_struct(). Perhaps it makes sense to export
get/put_signal_struct() later, but currently I don't see the immediate
reason.
Rename __cleanup_signal() to free_signal_struct() and unexport it. With
the previous changes it does nothing except kmem_cache_free().
Change __exit_signal() to not clear/free ->signal, it will be freed when
the last reference to any thread in the thread group goes away.
Note:
- when the last thead exits signal->tty can point to nowhere, see
the next patch.
- with or without this patch signal_struct->count should go away,
or at least it should be "int nr_threads" for fs/proc. This will
be addressed later.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Alan Cox <alan@linux.intel.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <peterz@infradead.org>
Acked-by: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-05-27 05:43:16 +08:00
|
|
|
atomic_t sigcnt;
|
2005-04-17 06:20:36 +08:00
|
|
|
atomic_t live;
|
2010-05-27 05:43:24 +08:00
|
|
|
int nr_threads;
|
introduce for_each_thread() to replace the buggy while_each_thread()
while_each_thread() and next_thread() should die, almost every lockless
usage is wrong.
1. Unless g == current, the lockless while_each_thread() is not safe.
while_each_thread(g, t) can loop forever if g exits, next_thread()
can't reach the unhashed thread in this case. Note that this can
happen even if g is the group leader, it can exec.
2. Even if while_each_thread() itself was correct, people often use
it wrongly.
It was never safe to just take rcu_read_lock() and loop unless
you verify that pid_alive(g) == T, even the first next_thread()
can point to the already freed/reused memory.
This patch adds signal_struct->thread_head and task->thread_node to
create the normal rcu-safe list with the stable head. The new
for_each_thread(g, t) helper is always safe under rcu_read_lock() as
long as this task_struct can't go away.
Note: of course it is ugly to have both task_struct->thread_node and the
old task_struct->thread_group, we will kill it later, after we change
the users of while_each_thread() to use for_each_thread().
Perhaps we can kill it even before we convert all users, we can
reimplement next_thread(t) using the new thread_head/thread_node. But
we can't do this right now because this will lead to subtle behavioural
changes. For example, do/while_each_thread() always sees at least one
task, while for_each_thread() can do nothing if the whole thread group
has died. Or thread_group_empty(), currently its semantics is not clear
unless thread_group_leader(p) and we need to audit the callers before we
can change it.
So this patch adds the new interface which has to coexist with the old
one for some time, hopefully the next changes will be more or less
straightforward and the old one will go away soon.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Sergey Dyasly <dserrg@gmail.com>
Tested-by: Sergey Dyasly <dserrg@gmail.com>
Reviewed-by: Sameer Nanda <snanda@chromium.org>
Acked-by: David Rientjes <rientjes@google.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Mandeep Singh Baines <msb@chromium.org>
Cc: "Ma, Xindong" <xindong.ma@intel.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: "Tu, Xiaobing" <xiaobing.tu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-01-22 07:49:56 +08:00
|
|
|
struct list_head thread_head;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
wait_queue_head_t wait_chldexit; /* for wait4() */
|
|
|
|
|
|
|
|
/* current thread group signal load-balancing target: */
|
2006-07-03 15:25:41 +08:00
|
|
|
struct task_struct *curr_target;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* shared signal handling: */
|
|
|
|
struct sigpending shared_pending;
|
|
|
|
|
|
|
|
/* thread group exit support */
|
|
|
|
int group_exit_code;
|
|
|
|
/* overloaded:
|
|
|
|
* - notify group_exit_task when ->count is equal to notify_count
|
|
|
|
* - everyone except group_exit_task is stopped during signal delivery
|
|
|
|
* of fatal signals, group_exit_task processes the signal.
|
|
|
|
*/
|
|
|
|
int notify_count;
|
2008-08-01 20:18:04 +08:00
|
|
|
struct task_struct *group_exit_task;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* thread group stop support, overloads group_exit_code too */
|
|
|
|
int group_stop_count;
|
|
|
|
unsigned int flags; /* see SIGNAL_* flags below */
|
|
|
|
|
prctl: add PR_{SET,GET}_CHILD_SUBREAPER to allow simple process supervision
Userspace service managers/supervisors need to track their started
services. Many services daemonize by double-forking and get implicitly
re-parented to PID 1. The service manager will no longer be able to
receive the SIGCHLD signals for them, and is no longer in charge of
reaping the children with wait(). All information about the children is
lost at the moment PID 1 cleans up the re-parented processes.
With this prctl, a service manager process can mark itself as a sort of
'sub-init', able to stay as the parent for all orphaned processes
created by the started services. All SIGCHLD signals will be delivered
to the service manager.
Receiving SIGCHLD and doing wait() is in cases of a service-manager much
preferred over any possible asynchronous notification about specific
PIDs, because the service manager has full access to the child process
data in /proc and the PID can not be re-used until the wait(), the
service-manager itself is in charge of, has happened.
As a side effect, the relevant parent PID information does not get lost
by a double-fork, which results in a more elaborate process tree and
'ps' output:
before:
# ps afx
253 ? Ss 0:00 /bin/dbus-daemon --system --nofork
294 ? Sl 0:00 /usr/libexec/polkit-1/polkitd
328 ? S 0:00 /usr/sbin/modem-manager
608 ? Sl 0:00 /usr/libexec/colord
658 ? Sl 0:00 /usr/libexec/upowerd
819 ? Sl 0:00 /usr/libexec/imsettings-daemon
916 ? Sl 0:00 /usr/libexec/udisks-daemon
917 ? S 0:00 \_ udisks-daemon: not polling any devices
after:
# ps afx
294 ? Ss 0:00 /bin/dbus-daemon --system --nofork
426 ? Sl 0:00 \_ /usr/libexec/polkit-1/polkitd
449 ? S 0:00 \_ /usr/sbin/modem-manager
635 ? Sl 0:00 \_ /usr/libexec/colord
705 ? Sl 0:00 \_ /usr/libexec/upowerd
959 ? Sl 0:00 \_ /usr/libexec/udisks-daemon
960 ? S 0:00 | \_ udisks-daemon: not polling any devices
977 ? Sl 0:00 \_ /usr/libexec/packagekitd
This prctl is orthogonal to PID namespaces. PID namespaces are isolated
from each other, while a service management process usually requires the
services to live in the same namespace, to be able to talk to each
other.
Users of this will be the systemd per-user instance, which provides
init-like functionality for the user's login session and D-Bus, which
activates bus services on-demand. Both need init-like capabilities to
be able to properly keep track of the services they start.
Many thanks to Oleg for several rounds of review and insights.
[akpm@linux-foundation.org: fix comment layout and spelling]
[akpm@linux-foundation.org: add lengthy code comment from Oleg]
Reviewed-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Lennart Poettering <lennart@poettering.net>
Signed-off-by: Kay Sievers <kay.sievers@vrfy.org>
Acked-by: Valdis Kletnieks <Valdis.Kletnieks@vt.edu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-24 06:01:54 +08:00
|
|
|
/*
|
|
|
|
* PR_SET_CHILD_SUBREAPER marks a process, like a service
|
|
|
|
* manager, to re-parent orphan (double-forking) child processes
|
|
|
|
* to this process instead of 'init'. The service manager is
|
|
|
|
* able to receive SIGCHLD signals and is able to investigate
|
|
|
|
* the process until it calls wait(). All children of this
|
|
|
|
* process will inherit a flag if they should look for a
|
|
|
|
* child_subreaper process at exit.
|
|
|
|
*/
|
|
|
|
unsigned int is_child_subreaper:1;
|
|
|
|
unsigned int has_child_subreaper:1;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* POSIX.1b Interval Timers */
|
2013-03-11 17:12:21 +08:00
|
|
|
int posix_timer_id;
|
|
|
|
struct list_head posix_timers;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* ITIMER_REAL timer for the process */
|
2006-01-10 12:52:34 +08:00
|
|
|
struct hrtimer real_timer;
|
2008-02-08 20:19:19 +08:00
|
|
|
struct pid *leader_pid;
|
2006-01-10 12:52:34 +08:00
|
|
|
ktime_t it_real_incr;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-07-29 18:15:26 +08:00
|
|
|
/*
|
|
|
|
* ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use
|
|
|
|
* CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these
|
|
|
|
* values are defined to 0 and 1 respectively
|
|
|
|
*/
|
|
|
|
struct cpu_itimer it[2];
|
2005-04-17 06:20:36 +08:00
|
|
|
|
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
struct task_cputime totals;
};
struct thread_group_cputime {
struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values from
from all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
|
|
|
/*
|
2009-02-05 19:24:16 +08:00
|
|
|
* Thread group totals for process CPU timers.
|
|
|
|
* See thread_group_cputimer(), et al, for details.
|
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
struct task_cputime totals;
};
struct thread_group_cputime {
struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values from
from all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
|
|
|
*/
|
2009-02-05 19:24:16 +08:00
|
|
|
struct thread_group_cputimer cputimer;
|
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
struct task_cputime totals;
};
struct thread_group_cputime {
struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values from
from all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
|
|
|
|
|
|
|
/* Earliest-expiration cache. */
|
|
|
|
struct task_cputime cputime_expires;
|
|
|
|
|
2015-06-07 21:54:30 +08:00
|
|
|
#ifdef CONFIG_NO_HZ_FULL
|
2016-03-24 22:38:00 +08:00
|
|
|
atomic_t tick_dep_mask;
|
2015-06-07 21:54:30 +08:00
|
|
|
#endif
|
|
|
|
|
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
struct task_cputime totals;
};
struct thread_group_cputime {
struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values from
from all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
|
|
|
struct list_head cpu_timers[3];
|
|
|
|
|
2007-02-12 16:53:00 +08:00
|
|
|
struct pid *tty_old_pgrp;
|
2006-12-08 18:37:55 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* boolean value for session group leader */
|
|
|
|
int leader;
|
|
|
|
|
|
|
|
struct tty_struct *tty; /* NULL if no tty */
|
|
|
|
|
sched: Add 'autogroup' scheduling feature: automated per session task groups
A recurring complaint from CFS users is that parallel kbuild has
a negative impact on desktop interactivity. This patch
implements an idea from Linus, to automatically create task
groups. Currently, only per session autogroups are implemented,
but the patch leaves the way open for enhancement.
Implementation: each task's signal struct contains an inherited
pointer to a refcounted autogroup struct containing a task group
pointer, the default for all tasks pointing to the
init_task_group. When a task calls setsid(), a new task group
is created, the process is moved into the new task group, and a
reference to the preveious task group is dropped. Child
processes inherit this task group thereafter, and increase it's
refcount. When the last thread of a process exits, the
process's reference is dropped, such that when the last process
referencing an autogroup exits, the autogroup is destroyed.
At runqueue selection time, IFF a task has no cgroup assignment,
its current autogroup is used.
Autogroup bandwidth is controllable via setting it's nice level
through the proc filesystem:
cat /proc/<pid>/autogroup
Displays the task's group and the group's nice level.
echo <nice level> > /proc/<pid>/autogroup
Sets the task group's shares to the weight of nice <level> task.
Setting nice level is rate limited for !admin users due to the
abuse risk of task group locking.
The feature is enabled from boot by default if
CONFIG_SCHED_AUTOGROUP=y is selected, but can be disabled via
the boot option noautogroup, and can also be turned on/off on
the fly via:
echo [01] > /proc/sys/kernel/sched_autogroup_enabled
... which will automatically move tasks to/from the root task group.
Signed-off-by: Mike Galbraith <efault@gmx.de>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Markus Trippelsdorf <markus@trippelsdorf.de>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Paul Turner <pjt@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
[ Removed the task_group_path() debug code, and fixed !EVENTFD build failure. ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <1290281700.28711.9.camel@maggy.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-11-30 21:18:03 +08:00
|
|
|
#ifdef CONFIG_SCHED_AUTOGROUP
|
|
|
|
struct autogroup *autogroup;
|
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Cumulative resource counters for dead threads in the group,
|
|
|
|
* and for reaped dead child processes forked by this group.
|
|
|
|
* Live threads maintain their own counters and add to these
|
|
|
|
* in __exit_signal, except for the group leader.
|
|
|
|
*/
|
2014-08-17 01:40:10 +08:00
|
|
|
seqlock_t stats_lock;
|
2009-02-05 19:24:15 +08:00
|
|
|
cputime_t utime, stime, cutime, cstime;
|
2007-10-15 23:00:19 +08:00
|
|
|
cputime_t gtime;
|
|
|
|
cputime_t cgtime;
|
2015-06-30 17:30:54 +08:00
|
|
|
struct prev_cputime prev_cputime;
|
2005-04-17 06:20:36 +08:00
|
|
|
unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
|
|
|
|
unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
|
2007-05-11 13:22:37 +08:00
|
|
|
unsigned long inblock, oublock, cinblock, coublock;
|
getrusage: fill ru_maxrss value
Make ->ru_maxrss value in struct rusage filled accordingly to rss hiwater
mark. This struct is filled as a parameter to getrusage syscall.
->ru_maxrss value is set to KBs which is the way it is done in BSD
systems. /usr/bin/time (gnu time) application converts ->ru_maxrss to KBs
which seems to be incorrect behavior. Maintainer of this util was
notified by me with the patch which corrects it and cc'ed.
To make this happen we extend struct signal_struct by two fields. The
first one is ->maxrss which we use to store rss hiwater of the task. The
second one is ->cmaxrss which we use to store highest rss hiwater of all
task childs. These values are used in k_getrusage() to actually fill
->ru_maxrss. k_getrusage() uses current rss hiwater value directly if mm
struct exists.
Note:
exec() clear mm->hiwater_rss, but doesn't clear sig->maxrss.
it is intetionally behavior. *BSD getrusage have exec() inheriting.
test programs
========================================================
getrusage.c
===========
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <signal.h>
#include <sys/mman.h>
#include "common.h"
#define err(str) perror(str), exit(1)
int main(int argc, char** argv)
{
int status;
printf("allocate 100MB\n");
consume(100);
printf("testcase1: fork inherit? \n");
printf(" expect: initial.self ~= child.self\n");
show_rusage("initial");
if (__fork()) {
wait(&status);
} else {
show_rusage("fork child");
_exit(0);
}
printf("\n");
printf("testcase2: fork inherit? (cont.) \n");
printf(" expect: initial.children ~= 100MB, but child.children = 0\n");
show_rusage("initial");
if (__fork()) {
wait(&status);
} else {
show_rusage("child");
_exit(0);
}
printf("\n");
printf("testcase3: fork + malloc \n");
printf(" expect: child.self ~= initial.self + 50MB\n");
show_rusage("initial");
if (__fork()) {
wait(&status);
} else {
printf("allocate +50MB\n");
consume(50);
show_rusage("fork child");
_exit(0);
}
printf("\n");
printf("testcase4: grandchild maxrss\n");
printf(" expect: post_wait.children ~= 300MB\n");
show_rusage("initial");
if (__fork()) {
wait(&status);
show_rusage("post_wait");
} else {
system("./child -n 0 -g 300");
_exit(0);
}
printf("\n");
printf("testcase5: zombie\n");
printf(" expect: pre_wait ~= initial, IOW the zombie process is not accounted.\n");
printf(" post_wait ~= 400MB, IOW wait() collect child's max_rss. \n");
show_rusage("initial");
if (__fork()) {
sleep(1); /* children become zombie */
show_rusage("pre_wait");
wait(&status);
show_rusage("post_wait");
} else {
system("./child -n 400");
_exit(0);
}
printf("\n");
printf("testcase6: SIG_IGN\n");
printf(" expect: initial ~= after_zombie (child's 500MB alloc should be ignored).\n");
show_rusage("initial");
signal(SIGCHLD, SIG_IGN);
if (__fork()) {
sleep(1); /* children become zombie */
show_rusage("after_zombie");
} else {
system("./child -n 500");
_exit(0);
}
printf("\n");
signal(SIGCHLD, SIG_DFL);
printf("testcase7: exec (without fork) \n");
printf(" expect: initial ~= exec \n");
show_rusage("initial");
execl("./child", "child", "-v", NULL);
return 0;
}
child.c
=======
#include <sys/types.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/resource.h>
#include "common.h"
int main(int argc, char** argv)
{
int status;
int c;
long consume_size = 0;
long grandchild_consume_size = 0;
int show = 0;
while ((c = getopt(argc, argv, "n:g:v")) != -1) {
switch (c) {
case 'n':
consume_size = atol(optarg);
break;
case 'v':
show = 1;
break;
case 'g':
grandchild_consume_size = atol(optarg);
break;
default:
break;
}
}
if (show)
show_rusage("exec");
if (consume_size) {
printf("child alloc %ldMB\n", consume_size);
consume(consume_size);
}
if (grandchild_consume_size) {
if (fork()) {
wait(&status);
} else {
printf("grandchild alloc %ldMB\n", grandchild_consume_size);
consume(grandchild_consume_size);
exit(0);
}
}
return 0;
}
common.c
========
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <signal.h>
#include <sys/mman.h>
#include "common.h"
#define err(str) perror(str), exit(1)
void show_rusage(char *prefix)
{
int err, err2;
struct rusage rusage_self;
struct rusage rusage_children;
printf("%s: ", prefix);
err = getrusage(RUSAGE_SELF, &rusage_self);
if (!err)
printf("self %ld ", rusage_self.ru_maxrss);
err2 = getrusage(RUSAGE_CHILDREN, &rusage_children);
if (!err2)
printf("children %ld ", rusage_children.ru_maxrss);
printf("\n");
}
/* Some buggy OS need this worthless CPU waste. */
void make_pagefault(void)
{
void *addr;
int size = getpagesize();
int i;
for (i=0; i<1000; i++) {
addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
if (addr == MAP_FAILED)
err("make_pagefault");
memset(addr, 0, size);
munmap(addr, size);
}
}
void consume(int mega)
{
size_t sz = mega * 1024 * 1024;
void *ptr;
ptr = malloc(sz);
memset(ptr, 0, sz);
make_pagefault();
}
pid_t __fork(void)
{
pid_t pid;
pid = fork();
make_pagefault();
return pid;
}
common.h
========
void show_rusage(char *prefix);
void make_pagefault(void);
void consume(int mega);
pid_t __fork(void);
FreeBSD result (expected result)
========================================================
allocate 100MB
testcase1: fork inherit?
expect: initial.self ~= child.self
initial: self 103492 children 0
fork child: self 103540 children 0
testcase2: fork inherit? (cont.)
expect: initial.children ~= 100MB, but child.children = 0
initial: self 103540 children 103540
child: self 103564 children 0
testcase3: fork + malloc
expect: child.self ~= initial.self + 50MB
initial: self 103564 children 103564
allocate +50MB
fork child: self 154860 children 0
testcase4: grandchild maxrss
expect: post_wait.children ~= 300MB
initial: self 103564 children 154860
grandchild alloc 300MB
post_wait: self 103564 children 308720
testcase5: zombie
expect: pre_wait ~= initial, IOW the zombie process is not accounted.
post_wait ~= 400MB, IOW wait() collect child's max_rss.
initial: self 103564 children 308720
child alloc 400MB
pre_wait: self 103564 children 308720
post_wait: self 103564 children 411312
testcase6: SIG_IGN
expect: initial ~= after_zombie (child's 500MB alloc should be ignored).
initial: self 103564 children 411312
child alloc 500MB
after_zombie: self 103624 children 411312
testcase7: exec (without fork)
expect: initial ~= exec
initial: self 103624 children 411312
exec: self 103624 children 411312
Linux result (actual test result)
========================================================
allocate 100MB
testcase1: fork inherit?
expect: initial.self ~= child.self
initial: self 102848 children 0
fork child: self 102572 children 0
testcase2: fork inherit? (cont.)
expect: initial.children ~= 100MB, but child.children = 0
initial: self 102876 children 102644
child: self 102572 children 0
testcase3: fork + malloc
expect: child.self ~= initial.self + 50MB
initial: self 102876 children 102644
allocate +50MB
fork child: self 153804 children 0
testcase4: grandchild maxrss
expect: post_wait.children ~= 300MB
initial: self 102876 children 153864
grandchild alloc 300MB
post_wait: self 102876 children 307536
testcase5: zombie
expect: pre_wait ~= initial, IOW the zombie process is not accounted.
post_wait ~= 400MB, IOW wait() collect child's max_rss.
initial: self 102876 children 307536
child alloc 400MB
pre_wait: self 102876 children 307536
post_wait: self 102876 children 410076
testcase6: SIG_IGN
expect: initial ~= after_zombie (child's 500MB alloc should be ignored).
initial: self 102876 children 410076
child alloc 500MB
after_zombie: self 102880 children 410076
testcase7: exec (without fork)
expect: initial ~= exec
initial: self 102880 children 410076
exec: self 102880 children 410076
Signed-off-by: Jiri Pirko <jpirko@redhat.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-23 07:44:10 +08:00
|
|
|
unsigned long maxrss, cmaxrss;
|
2008-07-28 06:48:12 +08:00
|
|
|
struct task_io_accounting ioac;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-02-05 19:24:15 +08:00
|
|
|
/*
|
|
|
|
* Cumulative ns of schedule CPU time fo dead threads in the
|
|
|
|
* group, not including a zombie group leader, (This only differs
|
|
|
|
* from jiffies_to_ns(utime + stime) if sched_clock uses something
|
|
|
|
* other than jiffies.)
|
|
|
|
*/
|
|
|
|
unsigned long long sum_sched_runtime;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* We don't bother to synchronize most readers of this at all,
|
|
|
|
* because there is no reader checking a limit that actually needs
|
|
|
|
* to get both rlim_cur and rlim_max atomically, and either one
|
|
|
|
* alone is a single word that can safely be read normally.
|
|
|
|
* getrlimit/setrlimit use task_lock(current->group_leader) to
|
|
|
|
* protect this instead of the siglock, because they really
|
|
|
|
* have no need to disable irqs.
|
|
|
|
*/
|
|
|
|
struct rlimit rlim[RLIM_NLIMITS];
|
|
|
|
|
2006-06-25 20:49:24 +08:00
|
|
|
#ifdef CONFIG_BSD_PROCESS_ACCT
|
|
|
|
struct pacct_struct pacct; /* per-process accounting information */
|
|
|
|
#endif
|
2006-07-14 15:24:44 +08:00
|
|
|
#ifdef CONFIG_TASKSTATS
|
|
|
|
struct taskstats *stats;
|
|
|
|
#endif
|
Audit: add TTY input auditing
Add TTY input auditing, used to audit system administrator's actions. This is
required by various security standards such as DCID 6/3 and PCI to provide
non-repudiation of administrator's actions and to allow a review of past
actions if the administrator seems to overstep their duties or if the system
becomes misconfigured for unknown reasons. These requirements do not make it
necessary to audit TTY output as well.
Compared to an user-space keylogger, this approach records TTY input using the
audit subsystem, correlated with other audit events, and it is completely
transparent to the user-space application (e.g. the console ioctls still
work).
TTY input auditing works on a higher level than auditing all system calls
within the session, which would produce an overwhelming amount of mostly
useless audit events.
Add an "audit_tty" attribute, inherited across fork (). Data read from TTYs
by process with the attribute is sent to the audit subsystem by the kernel.
The audit netlink interface is extended to allow modifying the audit_tty
attribute, and to allow sending explanatory audit events from user-space (for
example, a shell might send an event containing the final command, after the
interactive command-line editing and history expansion is performed, which
might be difficult to decipher from the TTY input alone).
Because the "audit_tty" attribute is inherited across fork (), it would be set
e.g. for sshd restarted within an audited session. To prevent this, the
audit_tty attribute is cleared when a process with no open TTY file
descriptors (e.g. after daemon startup) opens a TTY.
See https://www.redhat.com/archives/linux-audit/2007-June/msg00000.html for a
more detailed rationale document for an older version of this patch.
[akpm@linux-foundation.org: build fix]
Signed-off-by: Miloslav Trmac <mitr@redhat.com>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Alan Cox <alan@lxorguk.ukuu.org.uk>
Cc: Paul Fulghum <paulkf@microgate.com>
Cc: Casey Schaufler <casey@schaufler-ca.com>
Cc: Steve Grubb <sgrubb@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-07-16 14:40:56 +08:00
|
|
|
#ifdef CONFIG_AUDIT
|
|
|
|
unsigned audit_tty;
|
|
|
|
struct tty_audit_buf *tty_audit_buf;
|
|
|
|
#endif
|
oom: move oom_adj value from task_struct to signal_struct
Currently, OOM logic callflow is here.
__out_of_memory()
select_bad_process() for each task
badness() calculate badness of one task
oom_kill_process() search child
oom_kill_task() kill target task and mm shared tasks with it
example, process-A have two thread, thread-A and thread-B and it have very
fat memory and each thread have following oom_adj and oom_score.
thread-A: oom_adj = OOM_DISABLE, oom_score = 0
thread-B: oom_adj = 0, oom_score = very-high
Then, select_bad_process() select thread-B, but oom_kill_task() refuse
kill the task because thread-A have OOM_DISABLE. Thus __out_of_memory()
call select_bad_process() again. but select_bad_process() select the same
task. It mean kernel fall in livelock.
The fact is, select_bad_process() must select killable task. otherwise
OOM logic go into livelock.
And root cause is, oom_adj shouldn't be per-thread value. it should be
per-process value because OOM-killer kill a process, not thread. Thus
This patch moves oomkilladj (now more appropriately named oom_adj) from
struct task_struct to struct signal_struct. it naturally prevent
select_bad_process() choose wrong task.
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Cc: David Rientjes <rientjes@google.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Rik van Riel <riel@redhat.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-22 08:03:13 +08:00
|
|
|
|
2012-12-12 08:02:56 +08:00
|
|
|
oom_flags_t oom_flags;
|
2012-12-12 08:02:54 +08:00
|
|
|
short oom_score_adj; /* OOM kill score adjustment */
|
|
|
|
short oom_score_adj_min; /* OOM kill score adjustment min value.
|
|
|
|
* Only settable by CAP_SYS_RESOURCE. */
|
2010-10-28 06:34:08 +08:00
|
|
|
|
|
|
|
struct mutex cred_guard_mutex; /* guard against foreign influences on
|
|
|
|
* credential calculations
|
|
|
|
* (notably. ptrace) */
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Bits in flags field of signal_struct.
|
|
|
|
*/
|
|
|
|
#define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */
|
signal: Turn SIGNAL_STOP_DEQUEUED into GROUP_STOP_DEQUEUED
This patch moves SIGNAL_STOP_DEQUEUED from signal_struct->flags to
task_struct->group_stop, and thus makes it per-thread.
Like SIGNAL_STOP_DEQUEUED, GROUP_STOP_DEQUEUED can be false-positive
after return from get_signal_to_deliver(), this is fine. The only
purpose of this bit is: we can drop ->siglock after __dequeue_signal()
returns the sig_kernel_stop() signal and before we call
do_signal_stop(), in this case we must not miss SIGCONT if it comes in
between.
But, unlike SIGNAL_STOP_DEQUEUED, GROUP_STOP_DEQUEUED can not be
false-positive in do_signal_stop() if multiple threads dequeue the
sig_kernel_stop() signal at the same time.
Consider two threads T1 and T2, SIGTTIN has a hanlder.
- T1 dequeues SIGTSTP and sets SIGNAL_STOP_DEQUEUED, then
it drops ->siglock
- SIGCONT comes and clears SIGNAL_STOP_DEQUEUED, SIGTSTP
should be cancelled.
- T2 dequeues SIGTTIN and sets SIGNAL_STOP_DEQUEUED again.
Since we have a handler we should not stop, T2 returns
to usermode to run the handler.
- T1 continues, calls do_signal_stop() and wrongly starts
the group stop because SIGNAL_STOP_DEQUEUED was restored
in between.
With or without this change:
- we need to do something with ptrace_signal() which can
return SIGSTOP, but this needs another discussion
- SIGSTOP can be lost if it races with the mt exec, will
be fixed later.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Signed-off-by: Tejun Heo <tj@kernel.org>
2011-04-02 02:12:38 +08:00
|
|
|
#define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */
|
|
|
|
#define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */
|
2013-05-01 06:28:10 +08:00
|
|
|
#define SIGNAL_GROUP_COREDUMP 0x00000008 /* coredump in progress */
|
2008-04-30 15:52:44 +08:00
|
|
|
/*
|
|
|
|
* Pending notifications to parent.
|
|
|
|
*/
|
|
|
|
#define SIGNAL_CLD_STOPPED 0x00000010
|
|
|
|
#define SIGNAL_CLD_CONTINUED 0x00000020
|
|
|
|
#define SIGNAL_CLD_MASK (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED)
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-04-30 15:53:03 +08:00
|
|
|
#define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */
|
|
|
|
|
2008-02-05 14:27:24 +08:00
|
|
|
/* If true, all threads except ->group_exit_task have pending SIGKILL */
|
|
|
|
static inline int signal_group_exit(const struct signal_struct *sig)
|
|
|
|
{
|
|
|
|
return (sig->flags & SIGNAL_GROUP_EXIT) ||
|
|
|
|
(sig->group_exit_task != NULL);
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Some day this will be a full-fledged user tracking system..
|
|
|
|
*/
|
|
|
|
struct user_struct {
|
|
|
|
atomic_t __count; /* reference count */
|
|
|
|
atomic_t processes; /* How many processes does this user have? */
|
|
|
|
atomic_t sigpending; /* How many pending signals does this user have? */
|
2006-06-02 04:10:59 +08:00
|
|
|
#ifdef CONFIG_INOTIFY_USER
|
[PATCH] inotify
inotify is intended to correct the deficiencies of dnotify, particularly
its inability to scale and its terrible user interface:
* dnotify requires the opening of one fd per each directory
that you intend to watch. This quickly results in too many
open files and pins removable media, preventing unmount.
* dnotify is directory-based. You only learn about changes to
directories. Sure, a change to a file in a directory affects
the directory, but you are then forced to keep a cache of
stat structures.
* dnotify's interface to user-space is awful. Signals?
inotify provides a more usable, simple, powerful solution to file change
notification:
* inotify's interface is a system call that returns a fd, not SIGIO.
You get a single fd, which is select()-able.
* inotify has an event that says "the filesystem that the item
you were watching is on was unmounted."
* inotify can watch directories or files.
Inotify is currently used by Beagle (a desktop search infrastructure),
Gamin (a FAM replacement), and other projects.
See Documentation/filesystems/inotify.txt.
Signed-off-by: Robert Love <rml@novell.com>
Cc: John McCutchan <ttb@tentacle.dhs.org>
Cc: Christoph Hellwig <hch@lst.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-07-13 05:06:03 +08:00
|
|
|
atomic_t inotify_watches; /* How many inotify watches does this user have? */
|
|
|
|
atomic_t inotify_devs; /* How many inotify devs does this user have opened? */
|
|
|
|
#endif
|
2010-10-29 05:21:58 +08:00
|
|
|
#ifdef CONFIG_FANOTIFY
|
|
|
|
atomic_t fanotify_listeners;
|
|
|
|
#endif
|
epoll: introduce resource usage limits
It has been thought that the per-user file descriptors limit would also
limit the resources that a normal user can request via the epoll
interface. Vegard Nossum reported a very simple program (a modified
version attached) that can make a normal user to request a pretty large
amount of kernel memory, well within the its maximum number of fds. To
solve such problem, default limits are now imposed, and /proc based
configuration has been introduced. A new directory has been created,
named /proc/sys/fs/epoll/ and inside there, there are two configuration
points:
max_user_instances = Maximum number of devices - per user
max_user_watches = Maximum number of "watched" fds - per user
The current default for "max_user_watches" limits the memory used by epoll
to store "watches", to 1/32 of the amount of the low RAM. As example, a
256MB 32bit machine, will have "max_user_watches" set to roughly 90000.
That should be enough to not break existing heavy epoll users. The
default value for "max_user_instances" is set to 128, that should be
enough too.
This also changes the userspace, because a new error code can now come out
from EPOLL_CTL_ADD (-ENOSPC). The EMFILE from epoll_create() was already
listed, so that should be ok.
[akpm@linux-foundation.org: use get_current_user()]
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: <stable@kernel.org>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Reported-by: Vegard Nossum <vegardno@ifi.uio.no>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-12-02 05:13:55 +08:00
|
|
|
#ifdef CONFIG_EPOLL
|
2011-01-13 09:00:01 +08:00
|
|
|
atomic_long_t epoll_watches; /* The number of file descriptors currently watched */
|
epoll: introduce resource usage limits
It has been thought that the per-user file descriptors limit would also
limit the resources that a normal user can request via the epoll
interface. Vegard Nossum reported a very simple program (a modified
version attached) that can make a normal user to request a pretty large
amount of kernel memory, well within the its maximum number of fds. To
solve such problem, default limits are now imposed, and /proc based
configuration has been introduced. A new directory has been created,
named /proc/sys/fs/epoll/ and inside there, there are two configuration
points:
max_user_instances = Maximum number of devices - per user
max_user_watches = Maximum number of "watched" fds - per user
The current default for "max_user_watches" limits the memory used by epoll
to store "watches", to 1/32 of the amount of the low RAM. As example, a
256MB 32bit machine, will have "max_user_watches" set to roughly 90000.
That should be enough to not break existing heavy epoll users. The
default value for "max_user_instances" is set to 128, that should be
enough too.
This also changes the userspace, because a new error code can now come out
from EPOLL_CTL_ADD (-ENOSPC). The EMFILE from epoll_create() was already
listed, so that should be ok.
[akpm@linux-foundation.org: use get_current_user()]
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: <stable@kernel.org>
Cc: Cyrill Gorcunov <gorcunov@gmail.com>
Reported-by: Vegard Nossum <vegardno@ifi.uio.no>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-12-02 05:13:55 +08:00
|
|
|
#endif
|
2007-10-17 14:30:09 +08:00
|
|
|
#ifdef CONFIG_POSIX_MQUEUE
|
2005-04-17 06:20:36 +08:00
|
|
|
/* protected by mq_lock */
|
|
|
|
unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */
|
2007-10-17 14:30:09 +08:00
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
unsigned long locked_shm; /* How many pages of mlocked shm ? */
|
2016-01-10 14:54:56 +08:00
|
|
|
unsigned long unix_inflight; /* How many files in flight in unix sockets */
|
2016-01-18 23:36:09 +08:00
|
|
|
atomic_long_t pipe_bufs; /* how many pages are allocated in pipe buffers */
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#ifdef CONFIG_KEYS
|
|
|
|
struct key *uid_keyring; /* UID specific keyring */
|
|
|
|
struct key *session_keyring; /* UID's default session keyring */
|
|
|
|
#endif
|
|
|
|
|
|
|
|
/* Hash table maintenance information */
|
2007-09-19 13:46:44 +08:00
|
|
|
struct hlist_node uidhash_node;
|
2011-11-17 15:20:58 +08:00
|
|
|
kuid_t uid;
|
2007-10-15 23:00:09 +08:00
|
|
|
|
2015-10-08 13:23:22 +08:00
|
|
|
#if defined(CONFIG_PERF_EVENTS) || defined(CONFIG_BPF_SYSCALL)
|
2009-05-15 21:19:27 +08:00
|
|
|
atomic_long_t locked_vm;
|
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
|
|
|
|
2007-11-02 20:47:53 +08:00
|
|
|
extern int uids_sysfs_init(void);
|
2007-10-15 23:00:14 +08:00
|
|
|
|
2011-11-17 15:20:58 +08:00
|
|
|
extern struct user_struct *find_user(kuid_t);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
extern struct user_struct root_user;
|
|
|
|
#define INIT_USER (&root_user)
|
|
|
|
|
2008-11-14 07:39:16 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
struct backing_dev_info;
|
|
|
|
struct reclaim_state;
|
|
|
|
|
2015-06-26 02:23:37 +08:00
|
|
|
#ifdef CONFIG_SCHED_INFO
|
2005-04-17 06:20:36 +08:00
|
|
|
struct sched_info {
|
|
|
|
/* cumulative counters */
|
2007-10-15 23:00:12 +08:00
|
|
|
unsigned long pcount; /* # of times run on this cpu */
|
2008-12-17 15:41:22 +08:00
|
|
|
unsigned long long run_delay; /* time spent waiting on a runqueue */
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* timestamps */
|
2007-07-10 00:52:00 +08:00
|
|
|
unsigned long long last_arrival,/* when we last ran on a cpu */
|
|
|
|
last_queued; /* when we were last queued to run */
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
2015-06-26 02:23:37 +08:00
|
|
|
#endif /* CONFIG_SCHED_INFO */
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-07-14 15:24:36 +08:00
|
|
|
#ifdef CONFIG_TASK_DELAY_ACCT
|
|
|
|
struct task_delay_info {
|
|
|
|
spinlock_t lock;
|
|
|
|
unsigned int flags; /* Private per-task flags */
|
|
|
|
|
|
|
|
/* For each stat XXX, add following, aligned appropriately
|
|
|
|
*
|
|
|
|
* struct timespec XXX_start, XXX_end;
|
|
|
|
* u64 XXX_delay;
|
|
|
|
* u32 XXX_count;
|
|
|
|
*
|
|
|
|
* Atomicity of updates to XXX_delay, XXX_count protected by
|
|
|
|
* single lock above (split into XXX_lock if contention is an issue).
|
|
|
|
*/
|
2006-07-14 15:24:37 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* XXX_count is incremented on every XXX operation, the delay
|
|
|
|
* associated with the operation is added to XXX_delay.
|
|
|
|
* XXX_delay contains the accumulated delay time in nanoseconds.
|
|
|
|
*/
|
2014-07-17 05:04:35 +08:00
|
|
|
u64 blkio_start; /* Shared by blkio, swapin */
|
2006-07-14 15:24:37 +08:00
|
|
|
u64 blkio_delay; /* wait for sync block io completion */
|
|
|
|
u64 swapin_delay; /* wait for swapin block io completion */
|
|
|
|
u32 blkio_count; /* total count of the number of sync block */
|
|
|
|
/* io operations performed */
|
|
|
|
u32 swapin_count; /* total count of the number of swapin block */
|
|
|
|
/* io operations performed */
|
2008-07-25 16:48:52 +08:00
|
|
|
|
2014-07-17 05:04:35 +08:00
|
|
|
u64 freepages_start;
|
2008-07-25 16:48:52 +08:00
|
|
|
u64 freepages_delay; /* wait for memory reclaim */
|
|
|
|
u32 freepages_count; /* total count of memory reclaim */
|
2006-07-14 15:24:36 +08:00
|
|
|
};
|
2006-07-14 15:24:38 +08:00
|
|
|
#endif /* CONFIG_TASK_DELAY_ACCT */
|
|
|
|
|
|
|
|
static inline int sched_info_on(void)
|
|
|
|
{
|
|
|
|
#ifdef CONFIG_SCHEDSTATS
|
|
|
|
return 1;
|
|
|
|
#elif defined(CONFIG_TASK_DELAY_ACCT)
|
|
|
|
extern int delayacct_on;
|
|
|
|
return delayacct_on;
|
|
|
|
#else
|
|
|
|
return 0;
|
2006-07-14 15:24:36 +08:00
|
|
|
#endif
|
2006-07-14 15:24:38 +08:00
|
|
|
}
|
2006-07-14 15:24:36 +08:00
|
|
|
|
2016-02-05 17:08:36 +08:00
|
|
|
#ifdef CONFIG_SCHEDSTATS
|
|
|
|
void force_schedstat_enabled(void);
|
|
|
|
#endif
|
|
|
|
|
2007-07-10 00:51:57 +08:00
|
|
|
enum cpu_idle_type {
|
|
|
|
CPU_IDLE,
|
|
|
|
CPU_NOT_IDLE,
|
|
|
|
CPU_NEWLY_IDLE,
|
|
|
|
CPU_MAX_IDLE_TYPES
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
|
|
|
|
sched/fair: Generalize the load/util averages resolution definition
Integer metric needs fixed point arithmetic. In sched/fair, a few
metrics, e.g., weight, load, load_avg, util_avg, freq, and capacity,
may have different fixed point ranges, which makes their update and
usage error-prone.
In order to avoid the errors relating to the fixed point range, we
definie a basic fixed point range, and then formalize all metrics to
base on the basic range.
The basic range is 1024 or (1 << 10). Further, one can recursively
apply the basic range to have larger range.
Pointed out by Ben Segall, weight (visible to user, e.g., NICE-0 has
1024) and load (e.g., NICE_0_LOAD) have independent ranges, but they
must be well calibrated.
Signed-off-by: Yuyang Du <yuyang.du@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bsegall@google.com
Cc: dietmar.eggemann@arm.com
Cc: lizefan@huawei.com
Cc: morten.rasmussen@arm.com
Cc: pjt@google.com
Cc: umgwanakikbuti@gmail.com
Cc: vincent.guittot@linaro.org
Link: http://lkml.kernel.org/r/1459829551-21625-2-git-send-email-yuyang.du@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-04-05 12:12:26 +08:00
|
|
|
/*
|
|
|
|
* Integer metrics need fixed point arithmetic, e.g., sched/fair
|
|
|
|
* has a few: load, load_avg, util_avg, freq, and capacity.
|
|
|
|
*
|
|
|
|
* We define a basic fixed point arithmetic range, and then formalize
|
|
|
|
* all these metrics based on that basic range.
|
|
|
|
*/
|
|
|
|
# define SCHED_FIXEDPOINT_SHIFT 10
|
|
|
|
# define SCHED_FIXEDPOINT_SCALE (1L << SCHED_FIXEDPOINT_SHIFT)
|
|
|
|
|
2011-05-19 01:09:39 +08:00
|
|
|
/*
|
2014-05-27 06:19:39 +08:00
|
|
|
* Increase resolution of cpu_capacity calculations
|
2011-05-19 01:09:39 +08:00
|
|
|
*/
|
sched/fair: Generalize the load/util averages resolution definition
Integer metric needs fixed point arithmetic. In sched/fair, a few
metrics, e.g., weight, load, load_avg, util_avg, freq, and capacity,
may have different fixed point ranges, which makes their update and
usage error-prone.
In order to avoid the errors relating to the fixed point range, we
definie a basic fixed point range, and then formalize all metrics to
base on the basic range.
The basic range is 1024 or (1 << 10). Further, one can recursively
apply the basic range to have larger range.
Pointed out by Ben Segall, weight (visible to user, e.g., NICE-0 has
1024) and load (e.g., NICE_0_LOAD) have independent ranges, but they
must be well calibrated.
Signed-off-by: Yuyang Du <yuyang.du@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: bsegall@google.com
Cc: dietmar.eggemann@arm.com
Cc: lizefan@huawei.com
Cc: morten.rasmussen@arm.com
Cc: pjt@google.com
Cc: umgwanakikbuti@gmail.com
Cc: vincent.guittot@linaro.org
Link: http://lkml.kernel.org/r/1459829551-21625-2-git-send-email-yuyang.du@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2016-04-05 12:12:26 +08:00
|
|
|
#define SCHED_CAPACITY_SHIFT SCHED_FIXEDPOINT_SHIFT
|
2014-05-27 06:19:39 +08:00
|
|
|
#define SCHED_CAPACITY_SCALE (1L << SCHED_CAPACITY_SHIFT)
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2015-05-01 23:27:50 +08:00
|
|
|
/*
|
|
|
|
* Wake-queues are lists of tasks with a pending wakeup, whose
|
|
|
|
* callers have already marked the task as woken internally,
|
|
|
|
* and can thus carry on. A common use case is being able to
|
|
|
|
* do the wakeups once the corresponding user lock as been
|
|
|
|
* released.
|
|
|
|
*
|
|
|
|
* We hold reference to each task in the list across the wakeup,
|
|
|
|
* thus guaranteeing that the memory is still valid by the time
|
|
|
|
* the actual wakeups are performed in wake_up_q().
|
|
|
|
*
|
|
|
|
* One per task suffices, because there's never a need for a task to be
|
|
|
|
* in two wake queues simultaneously; it is forbidden to abandon a task
|
|
|
|
* in a wake queue (a call to wake_up_q() _must_ follow), so if a task is
|
|
|
|
* already in a wake queue, the wakeup will happen soon and the second
|
|
|
|
* waker can just skip it.
|
|
|
|
*
|
|
|
|
* The WAKE_Q macro declares and initializes the list head.
|
|
|
|
* wake_up_q() does NOT reinitialize the list; it's expected to be
|
|
|
|
* called near the end of a function, where the fact that the queue is
|
|
|
|
* not used again will be easy to see by inspection.
|
|
|
|
*
|
|
|
|
* Note that this can cause spurious wakeups. schedule() callers
|
|
|
|
* must ensure the call is done inside a loop, confirming that the
|
|
|
|
* wakeup condition has in fact occurred.
|
|
|
|
*/
|
|
|
|
struct wake_q_node {
|
|
|
|
struct wake_q_node *next;
|
|
|
|
};
|
|
|
|
|
|
|
|
struct wake_q_head {
|
|
|
|
struct wake_q_node *first;
|
|
|
|
struct wake_q_node **lastp;
|
|
|
|
};
|
|
|
|
|
|
|
|
#define WAKE_Q_TAIL ((struct wake_q_node *) 0x01)
|
|
|
|
|
|
|
|
#define WAKE_Q(name) \
|
|
|
|
struct wake_q_head name = { WAKE_Q_TAIL, &name.first }
|
|
|
|
|
|
|
|
extern void wake_q_add(struct wake_q_head *head,
|
|
|
|
struct task_struct *task);
|
|
|
|
extern void wake_up_q(struct wake_q_head *head);
|
|
|
|
|
2011-05-19 01:09:39 +08:00
|
|
|
/*
|
|
|
|
* sched-domains (multiprocessor balancing) declarations:
|
|
|
|
*/
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it find ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_BALANCE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be a valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when it's nice
value and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups with in the same domain. This patch fixes some
of the listed down scenarios by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calcuations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalanace as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it need
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 17:54:34 +08:00
|
|
|
#ifdef CONFIG_SMP
|
2009-09-01 16:34:33 +08:00
|
|
|
#define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */
|
|
|
|
#define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */
|
|
|
|
#define SD_BALANCE_EXEC 0x0004 /* Balance on exec */
|
|
|
|
#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */
|
sched: Merge select_task_rq_fair() and sched_balance_self()
The problem with wake_idle() is that is doesn't respect things like
cpu_power, which means it doesn't deal well with SMT nor the recent
RT interaction.
To cure this, it needs to do what sched_balance_self() does, which
leads to the possibility of merging select_task_rq_fair() and
sched_balance_self().
Modify sched_balance_self() to:
- update_shares() when walking up the domain tree,
(it only called it for the top domain, but it should
have done this anyway), which allows us to remove
this ugly bit from try_to_wake_up().
- do wake_affine() on the smallest domain that contains
both this (the waking) and the prev (the wakee) cpu for
WAKE invocations.
Then use the top-down balance steps it had to replace wake_idle().
This leads to the dissapearance of SD_WAKE_BALANCE and
SD_WAKE_IDLE_FAR, with SD_WAKE_IDLE replaced with SD_BALANCE_WAKE.
SD_WAKE_AFFINE needs SD_BALANCE_WAKE to be effective.
Touch all topology bits to replace the old with new SD flags --
platforms might need re-tuning, enabling SD_BALANCE_WAKE
conditionally on a NUMA distance seems like a good additional
feature, magny-core and small nehalem systems would want this
enabled, systems with slow interconnects would not.
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-10 19:50:02 +08:00
|
|
|
#define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */
|
2009-09-01 16:34:33 +08:00
|
|
|
#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
|
2014-05-28 01:50:41 +08:00
|
|
|
#define SD_SHARE_CPUCAPACITY 0x0080 /* Domain members share cpu power */
|
2014-04-11 17:44:40 +08:00
|
|
|
#define SD_SHARE_POWERDOMAIN 0x0100 /* Domain members share power domain */
|
2009-09-01 16:34:33 +08:00
|
|
|
#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
|
|
|
|
#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
|
2010-06-08 12:57:02 +08:00
|
|
|
#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */
|
2009-09-01 16:34:33 +08:00
|
|
|
#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
|
2011-07-15 16:35:52 +08:00
|
|
|
#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */
|
2013-10-07 18:29:00 +08:00
|
|
|
#define SD_NUMA 0x4000 /* cross-node balancing */
|
2006-06-27 17:54:42 +08:00
|
|
|
|
2014-04-11 17:44:37 +08:00
|
|
|
#ifdef CONFIG_SCHED_SMT
|
2014-06-25 09:05:29 +08:00
|
|
|
static inline int cpu_smt_flags(void)
|
2014-04-11 17:44:37 +08:00
|
|
|
{
|
2014-05-28 01:50:41 +08:00
|
|
|
return SD_SHARE_CPUCAPACITY | SD_SHARE_PKG_RESOURCES;
|
2014-04-11 17:44:37 +08:00
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef CONFIG_SCHED_MC
|
2014-06-25 09:05:29 +08:00
|
|
|
static inline int cpu_core_flags(void)
|
2014-04-11 17:44:37 +08:00
|
|
|
{
|
|
|
|
return SD_SHARE_PKG_RESOURCES;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef CONFIG_NUMA
|
2014-06-25 09:05:29 +08:00
|
|
|
static inline int cpu_numa_flags(void)
|
2014-04-11 17:44:37 +08:00
|
|
|
{
|
|
|
|
return SD_NUMA;
|
|
|
|
}
|
|
|
|
#endif
|
2010-06-08 12:57:02 +08:00
|
|
|
|
2008-04-15 13:04:23 +08:00
|
|
|
struct sched_domain_attr {
|
|
|
|
int relax_domain_level;
|
|
|
|
};
|
|
|
|
|
|
|
|
#define SD_ATTR_INIT (struct sched_domain_attr) { \
|
|
|
|
.relax_domain_level = -1, \
|
|
|
|
}
|
|
|
|
|
2011-04-07 20:10:04 +08:00
|
|
|
extern int sched_domain_level_max;
|
|
|
|
|
2013-03-05 16:06:23 +08:00
|
|
|
struct sched_group;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
struct sched_domain {
|
|
|
|
/* These fields must be setup */
|
|
|
|
struct sched_domain *parent; /* top domain must be null terminated */
|
2006-10-03 16:14:08 +08:00
|
|
|
struct sched_domain *child; /* bottom domain must be null terminated */
|
2005-04-17 06:20:36 +08:00
|
|
|
struct sched_group *groups; /* the balancing groups of the domain */
|
|
|
|
unsigned long min_interval; /* Minimum balance interval ms */
|
|
|
|
unsigned long max_interval; /* Maximum balance interval ms */
|
|
|
|
unsigned int busy_factor; /* less balancing by factor if busy */
|
|
|
|
unsigned int imbalance_pct; /* No balance until over watermark */
|
|
|
|
unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */
|
2005-06-26 05:57:13 +08:00
|
|
|
unsigned int busy_idx;
|
|
|
|
unsigned int idle_idx;
|
|
|
|
unsigned int newidle_idx;
|
|
|
|
unsigned int wake_idx;
|
2005-06-26 05:57:19 +08:00
|
|
|
unsigned int forkexec_idx;
|
2009-09-01 16:34:35 +08:00
|
|
|
unsigned int smt_gain;
|
2013-04-23 22:59:02 +08:00
|
|
|
|
|
|
|
int nohz_idle; /* NOHZ IDLE status */
|
2005-04-17 06:20:36 +08:00
|
|
|
int flags; /* See SD_* */
|
2011-04-07 20:10:04 +08:00
|
|
|
int level;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Runtime fields. */
|
|
|
|
unsigned long last_balance; /* init to jiffies. units in jiffies */
|
|
|
|
unsigned int balance_interval; /* initialise to 1. units in ms. */
|
|
|
|
unsigned int nr_balance_failed; /* initialise to 0 */
|
|
|
|
|
2013-09-14 02:26:53 +08:00
|
|
|
/* idle_balance() stats */
|
2013-09-14 02:26:52 +08:00
|
|
|
u64 max_newidle_lb_cost;
|
2013-09-14 02:26:53 +08:00
|
|
|
unsigned long next_decay_max_lb_cost;
|
2008-06-27 19:41:35 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#ifdef CONFIG_SCHEDSTATS
|
|
|
|
/* load_balance() stats */
|
2007-10-19 03:32:56 +08:00
|
|
|
unsigned int lb_count[CPU_MAX_IDLE_TYPES];
|
|
|
|
unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
|
|
|
|
unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
|
|
|
|
unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];
|
|
|
|
unsigned int lb_gained[CPU_MAX_IDLE_TYPES];
|
|
|
|
unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];
|
|
|
|
unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];
|
|
|
|
unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES];
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Active load balancing */
|
2007-10-19 03:32:56 +08:00
|
|
|
unsigned int alb_count;
|
|
|
|
unsigned int alb_failed;
|
|
|
|
unsigned int alb_pushed;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-06-26 05:57:20 +08:00
|
|
|
/* SD_BALANCE_EXEC stats */
|
2007-10-19 03:32:56 +08:00
|
|
|
unsigned int sbe_count;
|
|
|
|
unsigned int sbe_balanced;
|
|
|
|
unsigned int sbe_pushed;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2005-06-26 05:57:20 +08:00
|
|
|
/* SD_BALANCE_FORK stats */
|
2007-10-19 03:32:56 +08:00
|
|
|
unsigned int sbf_count;
|
|
|
|
unsigned int sbf_balanced;
|
|
|
|
unsigned int sbf_pushed;
|
2005-06-26 05:57:20 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* try_to_wake_up() stats */
|
2007-10-19 03:32:56 +08:00
|
|
|
unsigned int ttwu_wake_remote;
|
|
|
|
unsigned int ttwu_move_affine;
|
|
|
|
unsigned int ttwu_move_balance;
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif
|
2008-10-09 17:35:51 +08:00
|
|
|
#ifdef CONFIG_SCHED_DEBUG
|
|
|
|
char *name;
|
|
|
|
#endif
|
2011-04-07 20:09:50 +08:00
|
|
|
union {
|
|
|
|
void *private; /* used during construction */
|
|
|
|
struct rcu_head rcu; /* used during destruction */
|
|
|
|
};
|
2008-11-25 00:05:04 +08:00
|
|
|
|
2010-04-16 20:59:29 +08:00
|
|
|
unsigned int span_weight;
|
2009-05-19 15:22:19 +08:00
|
|
|
/*
|
|
|
|
* Span of all CPUs in this domain.
|
|
|
|
*
|
|
|
|
* NOTE: this field is variable length. (Allocated dynamically
|
|
|
|
* by attaching extra space to the end of the structure,
|
|
|
|
* depending on how many CPUs the kernel has booted up with)
|
|
|
|
*/
|
|
|
|
unsigned long span[0];
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
|
|
|
|
2008-11-25 00:05:04 +08:00
|
|
|
static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
|
|
|
|
{
|
2008-11-25 00:05:04 +08:00
|
|
|
return to_cpumask(sd->span);
|
2008-11-25 00:05:04 +08:00
|
|
|
}
|
|
|
|
|
2009-11-03 12:23:40 +08:00
|
|
|
extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
|
2008-04-15 13:04:23 +08:00
|
|
|
struct sched_domain_attr *dattr_new);
|
2007-10-19 14:40:20 +08:00
|
|
|
|
2009-11-03 12:23:40 +08:00
|
|
|
/* Allocate an array of sched domains, for partition_sched_domains(). */
|
|
|
|
cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
|
|
|
|
void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
|
|
|
|
|
2012-01-26 19:44:34 +08:00
|
|
|
bool cpus_share_cache(int this_cpu, int that_cpu);
|
|
|
|
|
2014-04-11 17:44:37 +08:00
|
|
|
typedef const struct cpumask *(*sched_domain_mask_f)(int cpu);
|
2014-06-25 09:05:29 +08:00
|
|
|
typedef int (*sched_domain_flags_f)(void);
|
2014-04-11 17:44:37 +08:00
|
|
|
|
|
|
|
#define SDTL_OVERLAP 0x01
|
|
|
|
|
|
|
|
struct sd_data {
|
|
|
|
struct sched_domain **__percpu sd;
|
|
|
|
struct sched_group **__percpu sg;
|
2014-05-27 06:19:37 +08:00
|
|
|
struct sched_group_capacity **__percpu sgc;
|
2014-04-11 17:44:37 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
struct sched_domain_topology_level {
|
|
|
|
sched_domain_mask_f mask;
|
|
|
|
sched_domain_flags_f sd_flags;
|
|
|
|
int flags;
|
|
|
|
int numa_level;
|
|
|
|
struct sd_data data;
|
|
|
|
#ifdef CONFIG_SCHED_DEBUG
|
|
|
|
char *name;
|
|
|
|
#endif
|
|
|
|
};
|
|
|
|
|
|
|
|
extern void set_sched_topology(struct sched_domain_topology_level *tl);
|
2014-09-04 15:17:53 +08:00
|
|
|
extern void wake_up_if_idle(int cpu);
|
2014-04-11 17:44:37 +08:00
|
|
|
|
|
|
|
#ifdef CONFIG_SCHED_DEBUG
|
|
|
|
# define SD_INIT_NAME(type) .name = #type
|
|
|
|
#else
|
|
|
|
# define SD_INIT_NAME(type)
|
|
|
|
#endif
|
|
|
|
|
2008-07-18 20:01:39 +08:00
|
|
|
#else /* CONFIG_SMP */
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-07-18 20:01:39 +08:00
|
|
|
struct sched_domain_attr;
|
2007-07-26 19:40:43 +08:00
|
|
|
|
2008-07-18 20:01:39 +08:00
|
|
|
static inline void
|
2009-11-03 12:23:40 +08:00
|
|
|
partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
|
2008-07-18 20:01:39 +08:00
|
|
|
struct sched_domain_attr *dattr_new)
|
|
|
|
{
|
2007-07-26 19:40:43 +08:00
|
|
|
}
|
2012-01-26 19:44:34 +08:00
|
|
|
|
|
|
|
static inline bool cpus_share_cache(int this_cpu, int that_cpu)
|
|
|
|
{
|
|
|
|
return true;
|
|
|
|
}
|
|
|
|
|
2008-07-18 20:01:39 +08:00
|
|
|
#endif /* !CONFIG_SMP */
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-09-02 19:49:18 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
struct io_context; /* See blkdev.h */
|
|
|
|
|
|
|
|
|
2005-09-10 04:02:02 +08:00
|
|
|
#ifdef ARCH_HAS_PREFETCH_SWITCH_STACK
|
2006-07-03 15:25:41 +08:00
|
|
|
extern void prefetch_stack(struct task_struct *t);
|
2005-09-10 04:02:02 +08:00
|
|
|
#else
|
|
|
|
static inline void prefetch_stack(struct task_struct *t) { }
|
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
struct audit_context; /* See audit.c */
|
|
|
|
struct mempolicy;
|
2006-04-11 19:52:07 +08:00
|
|
|
struct pipe_inode_info;
|
2006-10-02 17:18:14 +08:00
|
|
|
struct uts_namespace;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-07-10 00:51:58 +08:00
|
|
|
struct load_weight {
|
2013-11-19 01:27:06 +08:00
|
|
|
unsigned long weight;
|
|
|
|
u32 inv_weight;
|
2007-07-10 00:51:58 +08:00
|
|
|
};
|
|
|
|
|
sched/fair: Rewrite runnable load and utilization average tracking
The idea of runnable load average (let runnable time contribute to weight)
was proposed by Paul Turner and Ben Segall, and it is still followed by
this rewrite. This rewrite aims to solve the following issues:
1. cfs_rq's load average (namely runnable_load_avg and blocked_load_avg) is
updated at the granularity of an entity at a time, which results in the
cfs_rq's load average is stale or partially updated: at any time, only
one entity is up to date, all other entities are effectively lagging
behind. This is undesirable.
To illustrate, if we have n runnable entities in the cfs_rq, as time
elapses, they certainly become outdated:
t0: cfs_rq { e1_old, e2_old, ..., en_old }
and when we update:
t1: update e1, then we have cfs_rq { e1_new, e2_old, ..., en_old }
t2: update e2, then we have cfs_rq { e1_old, e2_new, ..., en_old }
...
We solve this by combining all runnable entities' load averages together
in cfs_rq's avg, and update the cfs_rq's avg as a whole. This is based
on the fact that if we regard the update as a function, then:
w * update(e) = update(w * e) and
update(e1) + update(e2) = update(e1 + e2), then
w1 * update(e1) + w2 * update(e2) = update(w1 * e1 + w2 * e2)
therefore, by this rewrite, we have an entirely updated cfs_rq at the
time we update it:
t1: update cfs_rq { e1_new, e2_new, ..., en_new }
t2: update cfs_rq { e1_new, e2_new, ..., en_new }
...
2. cfs_rq's load average is different between top rq->cfs_rq and other
task_group's per CPU cfs_rqs in whether or not blocked_load_average
contributes to the load.
The basic idea behind runnable load average (the same for utilization)
is that the blocked state is taken into account as opposed to only
accounting for the currently runnable state. Therefore, the average
should include both the runnable/running and blocked load averages.
This rewrite does that.
In addition, we also combine runnable/running and blocked averages
of all entities into the cfs_rq's average, and update it together at
once. This is based on the fact that:
update(runnable) + update(blocked) = update(runnable + blocked)
This significantly reduces the code as we don't need to separately
maintain/update runnable/running load and blocked load.
3. How task_group entities' share is calculated is complex and imprecise.
We reduce the complexity in this rewrite to allow a very simple rule:
the task_group's load_avg is aggregated from its per CPU cfs_rqs's
load_avgs. Then group entity's weight is simply proportional to its
own cfs_rq's load_avg / task_group's load_avg. To illustrate,
if a task_group has { cfs_rq1, cfs_rq2, ..., cfs_rqn }, then,
task_group_avg = cfs_rq1_avg + cfs_rq2_avg + ... + cfs_rqn_avg, then
cfs_rqx's entity's share = cfs_rqx_avg / task_group_avg * task_group's share
To sum up, this rewrite in principle is equivalent to the current one, but
fixes the issues described above. Turns out, it significantly reduces the
code complexity and hence increases clarity and efficiency. In addition,
the new averages are more smooth/continuous (no spurious spikes and valleys)
and updated more consistently and quickly to reflect the load dynamics.
As a result, we have less load tracking overhead, better performance,
and especially better power efficiency due to more balanced load.
Signed-off-by: Yuyang Du <yuyang.du@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: arjan@linux.intel.com
Cc: bsegall@google.com
Cc: dietmar.eggemann@arm.com
Cc: fengguang.wu@intel.com
Cc: len.brown@intel.com
Cc: morten.rasmussen@arm.com
Cc: pjt@google.com
Cc: rafael.j.wysocki@intel.com
Cc: umgwanakikbuti@gmail.com
Cc: vincent.guittot@linaro.org
Link: http://lkml.kernel.org/r/1436918682-4971-3-git-send-email-yuyang.du@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-07-15 08:04:37 +08:00
|
|
|
/*
|
2016-04-05 12:12:28 +08:00
|
|
|
* The load_avg/util_avg accumulates an infinite geometric series
|
|
|
|
* (see __update_load_avg() in kernel/sched/fair.c).
|
|
|
|
*
|
|
|
|
* [load_avg definition]
|
|
|
|
*
|
|
|
|
* load_avg = runnable% * scale_load_down(load)
|
|
|
|
*
|
|
|
|
* where runnable% is the time ratio that a sched_entity is runnable.
|
|
|
|
* For cfs_rq, it is the aggregated load_avg of all runnable and
|
sched/fair: Rewrite runnable load and utilization average tracking
The idea of runnable load average (let runnable time contribute to weight)
was proposed by Paul Turner and Ben Segall, and it is still followed by
this rewrite. This rewrite aims to solve the following issues:
1. cfs_rq's load average (namely runnable_load_avg and blocked_load_avg) is
updated at the granularity of an entity at a time, which results in the
cfs_rq's load average is stale or partially updated: at any time, only
one entity is up to date, all other entities are effectively lagging
behind. This is undesirable.
To illustrate, if we have n runnable entities in the cfs_rq, as time
elapses, they certainly become outdated:
t0: cfs_rq { e1_old, e2_old, ..., en_old }
and when we update:
t1: update e1, then we have cfs_rq { e1_new, e2_old, ..., en_old }
t2: update e2, then we have cfs_rq { e1_old, e2_new, ..., en_old }
...
We solve this by combining all runnable entities' load averages together
in cfs_rq's avg, and update the cfs_rq's avg as a whole. This is based
on the fact that if we regard the update as a function, then:
w * update(e) = update(w * e) and
update(e1) + update(e2) = update(e1 + e2), then
w1 * update(e1) + w2 * update(e2) = update(w1 * e1 + w2 * e2)
therefore, by this rewrite, we have an entirely updated cfs_rq at the
time we update it:
t1: update cfs_rq { e1_new, e2_new, ..., en_new }
t2: update cfs_rq { e1_new, e2_new, ..., en_new }
...
2. cfs_rq's load average is different between top rq->cfs_rq and other
task_group's per CPU cfs_rqs in whether or not blocked_load_average
contributes to the load.
The basic idea behind runnable load average (the same for utilization)
is that the blocked state is taken into account as opposed to only
accounting for the currently runnable state. Therefore, the average
should include both the runnable/running and blocked load averages.
This rewrite does that.
In addition, we also combine runnable/running and blocked averages
of all entities into the cfs_rq's average, and update it together at
once. This is based on the fact that:
update(runnable) + update(blocked) = update(runnable + blocked)
This significantly reduces the code as we don't need to separately
maintain/update runnable/running load and blocked load.
3. How task_group entities' share is calculated is complex and imprecise.
We reduce the complexity in this rewrite to allow a very simple rule:
the task_group's load_avg is aggregated from its per CPU cfs_rqs's
load_avgs. Then group entity's weight is simply proportional to its
own cfs_rq's load_avg / task_group's load_avg. To illustrate,
if a task_group has { cfs_rq1, cfs_rq2, ..., cfs_rqn }, then,
task_group_avg = cfs_rq1_avg + cfs_rq2_avg + ... + cfs_rqn_avg, then
cfs_rqx's entity's share = cfs_rqx_avg / task_group_avg * task_group's share
To sum up, this rewrite in principle is equivalent to the current one, but
fixes the issues described above. Turns out, it significantly reduces the
code complexity and hence increases clarity and efficiency. In addition,
the new averages are more smooth/continuous (no spurious spikes and valleys)
and updated more consistently and quickly to reflect the load dynamics.
As a result, we have less load tracking overhead, better performance,
and especially better power efficiency due to more balanced load.
Signed-off-by: Yuyang Du <yuyang.du@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: arjan@linux.intel.com
Cc: bsegall@google.com
Cc: dietmar.eggemann@arm.com
Cc: fengguang.wu@intel.com
Cc: len.brown@intel.com
Cc: morten.rasmussen@arm.com
Cc: pjt@google.com
Cc: rafael.j.wysocki@intel.com
Cc: umgwanakikbuti@gmail.com
Cc: vincent.guittot@linaro.org
Link: http://lkml.kernel.org/r/1436918682-4971-3-git-send-email-yuyang.du@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-07-15 08:04:37 +08:00
|
|
|
* blocked sched_entities.
|
2016-04-05 12:12:28 +08:00
|
|
|
*
|
|
|
|
* load_avg may also take frequency scaling into account:
|
|
|
|
*
|
|
|
|
* load_avg = runnable% * scale_load_down(load) * freq%
|
|
|
|
*
|
|
|
|
* where freq% is the CPU frequency normalized to the highest frequency.
|
|
|
|
*
|
|
|
|
* [util_avg definition]
|
|
|
|
*
|
|
|
|
* util_avg = running% * SCHED_CAPACITY_SCALE
|
|
|
|
*
|
|
|
|
* where running% is the time ratio that a sched_entity is running on
|
|
|
|
* a CPU. For cfs_rq, it is the aggregated util_avg of all runnable
|
|
|
|
* and blocked sched_entities.
|
|
|
|
*
|
|
|
|
* util_avg may also factor frequency scaling and CPU capacity scaling:
|
|
|
|
*
|
|
|
|
* util_avg = running% * SCHED_CAPACITY_SCALE * freq% * capacity%
|
|
|
|
*
|
|
|
|
* where freq% is the same as above, and capacity% is the CPU capacity
|
|
|
|
* normalized to the greatest capacity (due to uarch differences, etc).
|
|
|
|
*
|
|
|
|
* N.B., the above ratios (runnable%, running%, freq%, and capacity%)
|
|
|
|
* themselves are in the range of [0, 1]. To do fixed point arithmetics,
|
|
|
|
* we therefore scale them to as large a range as necessary. This is for
|
|
|
|
* example reflected by util_avg's SCHED_CAPACITY_SCALE.
|
|
|
|
*
|
|
|
|
* [Overflow issue]
|
|
|
|
*
|
|
|
|
* The 64-bit load_sum can have 4353082796 (=2^64/47742/88761) entities
|
|
|
|
* with the highest load (=88761), always runnable on a single cfs_rq,
|
|
|
|
* and should not overflow as the number already hits PID_MAX_LIMIT.
|
|
|
|
*
|
|
|
|
* For all other cases (including 32-bit kernels), struct load_weight's
|
|
|
|
* weight will overflow first before we do, because:
|
|
|
|
*
|
|
|
|
* Max(load_avg) <= Max(load.weight)
|
|
|
|
*
|
|
|
|
* Then it is the load_weight's responsibility to consider overflow
|
|
|
|
* issues.
|
sched/fair: Rewrite runnable load and utilization average tracking
The idea of runnable load average (let runnable time contribute to weight)
was proposed by Paul Turner and Ben Segall, and it is still followed by
this rewrite. This rewrite aims to solve the following issues:
1. cfs_rq's load average (namely runnable_load_avg and blocked_load_avg) is
updated at the granularity of an entity at a time, which results in the
cfs_rq's load average is stale or partially updated: at any time, only
one entity is up to date, all other entities are effectively lagging
behind. This is undesirable.
To illustrate, if we have n runnable entities in the cfs_rq, as time
elapses, they certainly become outdated:
t0: cfs_rq { e1_old, e2_old, ..., en_old }
and when we update:
t1: update e1, then we have cfs_rq { e1_new, e2_old, ..., en_old }
t2: update e2, then we have cfs_rq { e1_old, e2_new, ..., en_old }
...
We solve this by combining all runnable entities' load averages together
in cfs_rq's avg, and update the cfs_rq's avg as a whole. This is based
on the fact that if we regard the update as a function, then:
w * update(e) = update(w * e) and
update(e1) + update(e2) = update(e1 + e2), then
w1 * update(e1) + w2 * update(e2) = update(w1 * e1 + w2 * e2)
therefore, by this rewrite, we have an entirely updated cfs_rq at the
time we update it:
t1: update cfs_rq { e1_new, e2_new, ..., en_new }
t2: update cfs_rq { e1_new, e2_new, ..., en_new }
...
2. cfs_rq's load average is different between top rq->cfs_rq and other
task_group's per CPU cfs_rqs in whether or not blocked_load_average
contributes to the load.
The basic idea behind runnable load average (the same for utilization)
is that the blocked state is taken into account as opposed to only
accounting for the currently runnable state. Therefore, the average
should include both the runnable/running and blocked load averages.
This rewrite does that.
In addition, we also combine runnable/running and blocked averages
of all entities into the cfs_rq's average, and update it together at
once. This is based on the fact that:
update(runnable) + update(blocked) = update(runnable + blocked)
This significantly reduces the code as we don't need to separately
maintain/update runnable/running load and blocked load.
3. How task_group entities' share is calculated is complex and imprecise.
We reduce the complexity in this rewrite to allow a very simple rule:
the task_group's load_avg is aggregated from its per CPU cfs_rqs's
load_avgs. Then group entity's weight is simply proportional to its
own cfs_rq's load_avg / task_group's load_avg. To illustrate,
if a task_group has { cfs_rq1, cfs_rq2, ..., cfs_rqn }, then,
task_group_avg = cfs_rq1_avg + cfs_rq2_avg + ... + cfs_rqn_avg, then
cfs_rqx's entity's share = cfs_rqx_avg / task_group_avg * task_group's share
To sum up, this rewrite in principle is equivalent to the current one, but
fixes the issues described above. Turns out, it significantly reduces the
code complexity and hence increases clarity and efficiency. In addition,
the new averages are more smooth/continuous (no spurious spikes and valleys)
and updated more consistently and quickly to reflect the load dynamics.
As a result, we have less load tracking overhead, better performance,
and especially better power efficiency due to more balanced load.
Signed-off-by: Yuyang Du <yuyang.du@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: arjan@linux.intel.com
Cc: bsegall@google.com
Cc: dietmar.eggemann@arm.com
Cc: fengguang.wu@intel.com
Cc: len.brown@intel.com
Cc: morten.rasmussen@arm.com
Cc: pjt@google.com
Cc: rafael.j.wysocki@intel.com
Cc: umgwanakikbuti@gmail.com
Cc: vincent.guittot@linaro.org
Link: http://lkml.kernel.org/r/1436918682-4971-3-git-send-email-yuyang.du@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-07-15 08:04:37 +08:00
|
|
|
*/
|
2012-10-04 19:18:29 +08:00
|
|
|
struct sched_avg {
|
sched/fair: Rewrite runnable load and utilization average tracking
The idea of runnable load average (let runnable time contribute to weight)
was proposed by Paul Turner and Ben Segall, and it is still followed by
this rewrite. This rewrite aims to solve the following issues:
1. cfs_rq's load average (namely runnable_load_avg and blocked_load_avg) is
updated at the granularity of an entity at a time, which results in the
cfs_rq's load average is stale or partially updated: at any time, only
one entity is up to date, all other entities are effectively lagging
behind. This is undesirable.
To illustrate, if we have n runnable entities in the cfs_rq, as time
elapses, they certainly become outdated:
t0: cfs_rq { e1_old, e2_old, ..., en_old }
and when we update:
t1: update e1, then we have cfs_rq { e1_new, e2_old, ..., en_old }
t2: update e2, then we have cfs_rq { e1_old, e2_new, ..., en_old }
...
We solve this by combining all runnable entities' load averages together
in cfs_rq's avg, and update the cfs_rq's avg as a whole. This is based
on the fact that if we regard the update as a function, then:
w * update(e) = update(w * e) and
update(e1) + update(e2) = update(e1 + e2), then
w1 * update(e1) + w2 * update(e2) = update(w1 * e1 + w2 * e2)
therefore, by this rewrite, we have an entirely updated cfs_rq at the
time we update it:
t1: update cfs_rq { e1_new, e2_new, ..., en_new }
t2: update cfs_rq { e1_new, e2_new, ..., en_new }
...
2. cfs_rq's load average is different between top rq->cfs_rq and other
task_group's per CPU cfs_rqs in whether or not blocked_load_average
contributes to the load.
The basic idea behind runnable load average (the same for utilization)
is that the blocked state is taken into account as opposed to only
accounting for the currently runnable state. Therefore, the average
should include both the runnable/running and blocked load averages.
This rewrite does that.
In addition, we also combine runnable/running and blocked averages
of all entities into the cfs_rq's average, and update it together at
once. This is based on the fact that:
update(runnable) + update(blocked) = update(runnable + blocked)
This significantly reduces the code as we don't need to separately
maintain/update runnable/running load and blocked load.
3. How task_group entities' share is calculated is complex and imprecise.
We reduce the complexity in this rewrite to allow a very simple rule:
the task_group's load_avg is aggregated from its per CPU cfs_rqs's
load_avgs. Then group entity's weight is simply proportional to its
own cfs_rq's load_avg / task_group's load_avg. To illustrate,
if a task_group has { cfs_rq1, cfs_rq2, ..., cfs_rqn }, then,
task_group_avg = cfs_rq1_avg + cfs_rq2_avg + ... + cfs_rqn_avg, then
cfs_rqx's entity's share = cfs_rqx_avg / task_group_avg * task_group's share
To sum up, this rewrite in principle is equivalent to the current one, but
fixes the issues described above. Turns out, it significantly reduces the
code complexity and hence increases clarity and efficiency. In addition,
the new averages are more smooth/continuous (no spurious spikes and valleys)
and updated more consistently and quickly to reflect the load dynamics.
As a result, we have less load tracking overhead, better performance,
and especially better power efficiency due to more balanced load.
Signed-off-by: Yuyang Du <yuyang.du@intel.com>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: arjan@linux.intel.com
Cc: bsegall@google.com
Cc: dietmar.eggemann@arm.com
Cc: fengguang.wu@intel.com
Cc: len.brown@intel.com
Cc: morten.rasmussen@arm.com
Cc: pjt@google.com
Cc: rafael.j.wysocki@intel.com
Cc: umgwanakikbuti@gmail.com
Cc: vincent.guittot@linaro.org
Link: http://lkml.kernel.org/r/1436918682-4971-3-git-send-email-yuyang.du@intel.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-07-15 08:04:37 +08:00
|
|
|
u64 last_update_time, load_sum;
|
|
|
|
u32 util_sum, period_contrib;
|
|
|
|
unsigned long load_avg, util_avg;
|
2012-10-04 19:18:29 +08:00
|
|
|
};
|
|
|
|
|
2007-08-02 23:41:40 +08:00
|
|
|
#ifdef CONFIG_SCHEDSTATS
|
2010-03-11 10:37:45 +08:00
|
|
|
struct sched_statistics {
|
2007-07-10 00:51:58 +08:00
|
|
|
u64 wait_start;
|
2007-08-02 23:41:40 +08:00
|
|
|
u64 wait_max;
|
2008-01-26 04:08:35 +08:00
|
|
|
u64 wait_count;
|
|
|
|
u64 wait_sum;
|
2009-07-21 02:26:58 +08:00
|
|
|
u64 iowait_count;
|
|
|
|
u64 iowait_sum;
|
2007-08-02 23:41:40 +08:00
|
|
|
|
2007-07-10 00:51:58 +08:00
|
|
|
u64 sleep_start;
|
|
|
|
u64 sleep_max;
|
2007-08-02 23:41:40 +08:00
|
|
|
s64 sum_sleep_runtime;
|
|
|
|
|
|
|
|
u64 block_start;
|
2007-07-10 00:51:58 +08:00
|
|
|
u64 block_max;
|
|
|
|
u64 exec_max;
|
2007-10-15 23:00:02 +08:00
|
|
|
u64 slice_max;
|
2007-10-15 23:00:18 +08:00
|
|
|
|
|
|
|
u64 nr_migrations_cold;
|
|
|
|
u64 nr_failed_migrations_affine;
|
|
|
|
u64 nr_failed_migrations_running;
|
|
|
|
u64 nr_failed_migrations_hot;
|
|
|
|
u64 nr_forced_migrations;
|
|
|
|
|
|
|
|
u64 nr_wakeups;
|
|
|
|
u64 nr_wakeups_sync;
|
|
|
|
u64 nr_wakeups_migrate;
|
|
|
|
u64 nr_wakeups_local;
|
|
|
|
u64 nr_wakeups_remote;
|
|
|
|
u64 nr_wakeups_affine;
|
|
|
|
u64 nr_wakeups_affine_attempts;
|
|
|
|
u64 nr_wakeups_passive;
|
|
|
|
u64 nr_wakeups_idle;
|
2010-03-11 10:37:45 +08:00
|
|
|
};
|
|
|
|
#endif
|
|
|
|
|
|
|
|
struct sched_entity {
|
|
|
|
struct load_weight load; /* for load-balancing */
|
|
|
|
struct rb_node run_node;
|
|
|
|
struct list_head group_node;
|
|
|
|
unsigned int on_rq;
|
|
|
|
|
|
|
|
u64 exec_start;
|
|
|
|
u64 sum_exec_runtime;
|
|
|
|
u64 vruntime;
|
|
|
|
u64 prev_sum_exec_runtime;
|
|
|
|
|
|
|
|
u64 nr_migrations;
|
|
|
|
|
|
|
|
#ifdef CONFIG_SCHEDSTATS
|
|
|
|
struct sched_statistics statistics;
|
2007-08-02 23:41:40 +08:00
|
|
|
#endif
|
|
|
|
|
2007-07-10 00:51:58 +08:00
|
|
|
#ifdef CONFIG_FAIR_GROUP_SCHED
|
2012-02-11 13:05:00 +08:00
|
|
|
int depth;
|
2007-07-10 00:51:58 +08:00
|
|
|
struct sched_entity *parent;
|
|
|
|
/* rq on which this entity is (to be) queued: */
|
|
|
|
struct cfs_rq *cfs_rq;
|
|
|
|
/* rq "owned" by this entity/group: */
|
|
|
|
struct cfs_rq *my_q;
|
|
|
|
#endif
|
2013-02-07 23:47:07 +08:00
|
|
|
|
2013-06-26 13:05:39 +08:00
|
|
|
#ifdef CONFIG_SMP
|
sched/core: Move sched_entity::avg into separate cache line
The sched_entity::avg collides with read-mostly sched_entity data.
The perf c2c tool showed many read HITM accesses across
many CPUs for sched_entity's cfs_rq and my_q, while having
at the same time tons of stores for avg.
After placing sched_entity::avg into separate cache line,
the perf bench sched pipe showed around 20 seconds speedup.
NOTE I cut out all perf events except for cycles and
instructions from following output.
Before:
$ perf stat -r 5 perf bench sched pipe -l 10000000
# Running 'sched/pipe' benchmark:
# Executed 10000000 pipe operations between two processes
Total time: 270.348 [sec]
27.034805 usecs/op
36989 ops/sec
...
245,537,074,035 cycles # 1.433 GHz
187,264,548,519 instructions # 0.77 insns per cycle
272.653840535 seconds time elapsed ( +- 1.31% )
After:
$ perf stat -r 5 perf bench sched pipe -l 10000000
# Running 'sched/pipe' benchmark:
# Executed 10000000 pipe operations between two processes
Total time: 251.076 [sec]
25.107678 usecs/op
39828 ops/sec
...
244,573,513,928 cycles # 1.572 GHz
187,409,641,157 instructions # 0.76 insns per cycle
251.679315188 seconds time elapsed ( +- 0.31% )
Signed-off-by: Jiri Olsa <jolsa@kernel.org>
Signed-off-by: Peter Zijlstra (Intel) <peterz@infradead.org>
Cc: Arnaldo Carvalho de Melo <acme@kernel.org>
Cc: Don Zickus <dzickus@redhat.com>
Cc: Joe Mario <jmario@redhat.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Link: http://lkml.kernel.org/r/1449606239-28602-1-git-send-email-jolsa@kernel.org
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2015-12-09 04:23:59 +08:00
|
|
|
/*
|
|
|
|
* Per entity load average tracking.
|
|
|
|
*
|
|
|
|
* Put into separate cache line so it does not
|
|
|
|
* collide with read-mostly values above.
|
|
|
|
*/
|
|
|
|
struct sched_avg avg ____cacheline_aligned_in_smp;
|
2012-10-04 19:18:29 +08:00
|
|
|
#endif
|
2007-07-10 00:51:58 +08:00
|
|
|
};
|
2006-07-03 15:25:42 +08:00
|
|
|
|
2008-01-26 04:08:27 +08:00
|
|
|
struct sched_rt_entity {
|
|
|
|
struct list_head run_list;
|
2008-01-26 04:08:27 +08:00
|
|
|
unsigned long timeout;
|
sched/rt: Avoid updating RT entry timeout twice within one tick period
The issue below was found in 2.6.34-rt rather than mainline rt
kernel, but the issue still exists upstream as well.
So please let me describe how it was noticed on 2.6.34-rt:
On this version, each softirq has its own thread, it means there
is at least one RT FIFO task per cpu. The priority of these
tasks is set to 49 by default. If user launches an RT FIFO task
with priority lower than 49 of softirq RT tasks, it's possible
there are two RT FIFO tasks enqueued one cpu runqueue at one
moment. By current strategy of balancing RT tasks, when it comes
to RT tasks, we really need to put them off to a CPU that they
can run on as soon as possible. Even if it means a bit of cache
line flushing, we want RT tasks to be run with the least latency.
When the user RT FIFO task which just launched before is
running, the sched timer tick of the current cpu happens. In this
tick period, the timeout value of the user RT task will be
updated once. Subsequently, we try to wake up one softirq RT
task on its local cpu. As the priority of current user RT task
is lower than the softirq RT task, the current task will be
preempted by the higher priority softirq RT task. Before
preemption, we check to see if current can readily move to a
different cpu. If so, we will reschedule to allow the RT push logic
to try to move current somewhere else. Whenever the woken
softirq RT task runs, it first tries to migrate the user FIFO RT
task over to a cpu that is running a task of lesser priority. If
migration is done, it will send a reschedule request to the found
cpu by IPI interrupt. Once the target cpu responds the IPI
interrupt, it will pick the migrated user RT task to preempt its
current task. When the user RT task is running on the new cpu,
the sched timer tick of the cpu fires. So it will tick the user
RT task again. This also means the RT task timeout value will be
updated again. As the migration may be done in one tick period,
it means the user RT task timeout value will be updated twice
within one tick.
If we set a limit on the amount of cpu time for the user RT task
by setrlimit(RLIMIT_RTTIME), the SIGXCPU signal should be posted
upon reaching the soft limit.
But exactly when the SIGXCPU signal should be sent depends on the
RT task timeout value. In fact the timeout mechanism of sending
the SIGXCPU signal assumes the RT task timeout is increased once
every tick.
However, currently the timeout value may be added twice per
tick. So it results in the SIGXCPU signal being sent earlier
than expected.
To solve this issue, we prevent the timeout value from increasing
twice within one tick time by remembering the jiffies value of
last updating the timeout. As long as the RT task's jiffies is
different with the global jiffies value, we allow its timeout to
be updated.
Signed-off-by: Ying Xue <ying.xue@windriver.com>
Signed-off-by: Fan Du <fan.du@windriver.com>
Reviewed-by: Yong Zhang <yong.zhang0@gmail.com>
Acked-by: Steven Rostedt <rostedt@goodmis.org>
Cc: <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1342508623-2887-1-git-send-email-ying.xue@windriver.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2012-07-17 15:03:43 +08:00
|
|
|
unsigned long watchdog_stamp;
|
2008-08-01 20:24:08 +08:00
|
|
|
unsigned int time_slice;
|
2016-01-18 22:27:07 +08:00
|
|
|
unsigned short on_rq;
|
|
|
|
unsigned short on_list;
|
2008-01-26 04:08:30 +08:00
|
|
|
|
2008-04-20 01:45:00 +08:00
|
|
|
struct sched_rt_entity *back;
|
2008-02-13 22:45:40 +08:00
|
|
|
#ifdef CONFIG_RT_GROUP_SCHED
|
2008-01-26 04:08:30 +08:00
|
|
|
struct sched_rt_entity *parent;
|
|
|
|
/* rq on which this entity is (to be) queued: */
|
|
|
|
struct rt_rq *rt_rq;
|
|
|
|
/* rq "owned" by this entity/group: */
|
|
|
|
struct rt_rq *my_q;
|
|
|
|
#endif
|
2008-01-26 04:08:27 +08:00
|
|
|
};
|
|
|
|
|
sched/deadline: Add SCHED_DEADLINE structures & implementation
Introduces the data structures, constants and symbols needed for
SCHED_DEADLINE implementation.
Core data structure of SCHED_DEADLINE are defined, along with their
initializers. Hooks for checking if a task belong to the new policy
are also added where they are needed.
Adds a scheduling class, in sched/dl.c and a new policy called
SCHED_DEADLINE. It is an implementation of the Earliest Deadline
First (EDF) scheduling algorithm, augmented with a mechanism (called
Constant Bandwidth Server, CBS) that makes it possible to isolate
the behaviour of tasks between each other.
The typical -deadline task will be made up of a computation phase
(instance) which is activated on a periodic or sporadic fashion. The
expected (maximum) duration of such computation is called the task's
runtime; the time interval by which each instance need to be completed
is called the task's relative deadline. The task's absolute deadline
is dynamically calculated as the time instant a task (better, an
instance) activates plus the relative deadline.
The EDF algorithms selects the task with the smallest absolute
deadline as the one to be executed first, while the CBS ensures each
task to run for at most its runtime every (relative) deadline
length time interval, avoiding any interference between different
tasks (bandwidth isolation).
Thanks to this feature, also tasks that do not strictly comply with
the computational model sketched above can effectively use the new
policy.
To summarize, this patch:
- introduces the data structures, constants and symbols needed;
- implements the core logic of the scheduling algorithm in the new
scheduling class file;
- provides all the glue code between the new scheduling class and
the core scheduler and refines the interactions between sched/dl
and the other existing scheduling classes.
Signed-off-by: Dario Faggioli <raistlin@linux.it>
Signed-off-by: Michael Trimarchi <michael@amarulasolutions.com>
Signed-off-by: Fabio Checconi <fchecconi@gmail.com>
Signed-off-by: Juri Lelli <juri.lelli@gmail.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1383831828-15501-4-git-send-email-juri.lelli@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-11-28 18:14:43 +08:00
|
|
|
struct sched_dl_entity {
|
|
|
|
struct rb_node rb_node;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Original scheduling parameters. Copied here from sched_attr
|
2014-05-09 11:21:27 +08:00
|
|
|
* during sched_setattr(), they will remain the same until
|
|
|
|
* the next sched_setattr().
|
sched/deadline: Add SCHED_DEADLINE structures & implementation
Introduces the data structures, constants and symbols needed for
SCHED_DEADLINE implementation.
Core data structure of SCHED_DEADLINE are defined, along with their
initializers. Hooks for checking if a task belong to the new policy
are also added where they are needed.
Adds a scheduling class, in sched/dl.c and a new policy called
SCHED_DEADLINE. It is an implementation of the Earliest Deadline
First (EDF) scheduling algorithm, augmented with a mechanism (called
Constant Bandwidth Server, CBS) that makes it possible to isolate
the behaviour of tasks between each other.
The typical -deadline task will be made up of a computation phase
(instance) which is activated on a periodic or sporadic fashion. The
expected (maximum) duration of such computation is called the task's
runtime; the time interval by which each instance need to be completed
is called the task's relative deadline. The task's absolute deadline
is dynamically calculated as the time instant a task (better, an
instance) activates plus the relative deadline.
The EDF algorithms selects the task with the smallest absolute
deadline as the one to be executed first, while the CBS ensures each
task to run for at most its runtime every (relative) deadline
length time interval, avoiding any interference between different
tasks (bandwidth isolation).
Thanks to this feature, also tasks that do not strictly comply with
the computational model sketched above can effectively use the new
policy.
To summarize, this patch:
- introduces the data structures, constants and symbols needed;
- implements the core logic of the scheduling algorithm in the new
scheduling class file;
- provides all the glue code between the new scheduling class and
the core scheduler and refines the interactions between sched/dl
and the other existing scheduling classes.
Signed-off-by: Dario Faggioli <raistlin@linux.it>
Signed-off-by: Michael Trimarchi <michael@amarulasolutions.com>
Signed-off-by: Fabio Checconi <fchecconi@gmail.com>
Signed-off-by: Juri Lelli <juri.lelli@gmail.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1383831828-15501-4-git-send-email-juri.lelli@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-11-28 18:14:43 +08:00
|
|
|
*/
|
|
|
|
u64 dl_runtime; /* maximum runtime for each instance */
|
|
|
|
u64 dl_deadline; /* relative deadline of each instance */
|
2013-11-07 21:43:40 +08:00
|
|
|
u64 dl_period; /* separation of two instances (period) */
|
sched/deadline: Add bandwidth management for SCHED_DEADLINE tasks
In order of deadline scheduling to be effective and useful, it is
important that some method of having the allocation of the available
CPU bandwidth to tasks and task groups under control.
This is usually called "admission control" and if it is not performed
at all, no guarantee can be given on the actual scheduling of the
-deadline tasks.
Since when RT-throttling has been introduced each task group have a
bandwidth associated to itself, calculated as a certain amount of
runtime over a period. Moreover, to make it possible to manipulate
such bandwidth, readable/writable controls have been added to both
procfs (for system wide settings) and cgroupfs (for per-group
settings).
Therefore, the same interface is being used for controlling the
bandwidth distrubution to -deadline tasks and task groups, i.e.,
new controls but with similar names, equivalent meaning and with
the same usage paradigm are added.
However, more discussion is needed in order to figure out how
we want to manage SCHED_DEADLINE bandwidth at the task group level.
Therefore, this patch adds a less sophisticated, but actually
very sensible, mechanism to ensure that a certain utilization
cap is not overcome per each root_domain (the single rq for !SMP
configurations).
Another main difference between deadline bandwidth management and
RT-throttling is that -deadline tasks have bandwidth on their own
(while -rt ones doesn't!), and thus we don't need an higher level
throttling mechanism to enforce the desired bandwidth.
This patch, therefore:
- adds system wide deadline bandwidth management by means of:
* /proc/sys/kernel/sched_dl_runtime_us,
* /proc/sys/kernel/sched_dl_period_us,
that determine (i.e., runtime / period) the total bandwidth
available on each CPU of each root_domain for -deadline tasks;
- couples the RT and deadline bandwidth management, i.e., enforces
that the sum of how much bandwidth is being devoted to -rt
-deadline tasks to stay below 100%.
This means that, for a root_domain comprising M CPUs, -deadline tasks
can be created until the sum of their bandwidths stay below:
M * (sched_dl_runtime_us / sched_dl_period_us)
It is also possible to disable this bandwidth management logic, and
be thus free of oversubscribing the system up to any arbitrary level.
Signed-off-by: Dario Faggioli <raistlin@linux.it>
Signed-off-by: Juri Lelli <juri.lelli@gmail.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1383831828-15501-12-git-send-email-juri.lelli@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-11-07 21:43:45 +08:00
|
|
|
u64 dl_bw; /* dl_runtime / dl_deadline */
|
sched/deadline: Add SCHED_DEADLINE structures & implementation
Introduces the data structures, constants and symbols needed for
SCHED_DEADLINE implementation.
Core data structure of SCHED_DEADLINE are defined, along with their
initializers. Hooks for checking if a task belong to the new policy
are also added where they are needed.
Adds a scheduling class, in sched/dl.c and a new policy called
SCHED_DEADLINE. It is an implementation of the Earliest Deadline
First (EDF) scheduling algorithm, augmented with a mechanism (called
Constant Bandwidth Server, CBS) that makes it possible to isolate
the behaviour of tasks between each other.
The typical -deadline task will be made up of a computation phase
(instance) which is activated on a periodic or sporadic fashion. The
expected (maximum) duration of such computation is called the task's
runtime; the time interval by which each instance need to be completed
is called the task's relative deadline. The task's absolute deadline
is dynamically calculated as the time instant a task (better, an
instance) activates plus the relative deadline.
The EDF algorithms selects the task with the smallest absolute
deadline as the one to be executed first, while the CBS ensures each
task to run for at most its runtime every (relative) deadline
length time interval, avoiding any interference between different
tasks (bandwidth isolation).
Thanks to this feature, also tasks that do not strictly comply with
the computational model sketched above can effectively use the new
policy.
To summarize, this patch:
- introduces the data structures, constants and symbols needed;
- implements the core logic of the scheduling algorithm in the new
scheduling class file;
- provides all the glue code between the new scheduling class and
the core scheduler and refines the interactions between sched/dl
and the other existing scheduling classes.
Signed-off-by: Dario Faggioli <raistlin@linux.it>
Signed-off-by: Michael Trimarchi <michael@amarulasolutions.com>
Signed-off-by: Fabio Checconi <fchecconi@gmail.com>
Signed-off-by: Juri Lelli <juri.lelli@gmail.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1383831828-15501-4-git-send-email-juri.lelli@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-11-28 18:14:43 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Actual scheduling parameters. Initialized with the values above,
|
|
|
|
* they are continously updated during task execution. Note that
|
|
|
|
* the remaining runtime could be < 0 in case we are in overrun.
|
|
|
|
*/
|
|
|
|
s64 runtime; /* remaining runtime for this instance */
|
|
|
|
u64 deadline; /* absolute deadline for this instance */
|
|
|
|
unsigned int flags; /* specifying the scheduler behaviour */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Some bool flags:
|
|
|
|
*
|
|
|
|
* @dl_throttled tells if we exhausted the runtime. If so, the
|
|
|
|
* task has to wait for a replenishment to be performed at the
|
|
|
|
* next firing of dl_timer.
|
|
|
|
*
|
sched/deadline: Add SCHED_DEADLINE inheritance logic
Some method to deal with rt-mutexes and make sched_dl interact with
the current PI-coded is needed, raising all but trivial issues, that
needs (according to us) to be solved with some restructuring of
the pi-code (i.e., going toward a proxy execution-ish implementation).
This is under development, in the meanwhile, as a temporary solution,
what this commits does is:
- ensure a pi-lock owner with waiters is never throttled down. Instead,
when it runs out of runtime, it immediately gets replenished and it's
deadline is postponed;
- the scheduling parameters (relative deadline and default runtime)
used for that replenishments --during the whole period it holds the
pi-lock-- are the ones of the waiting task with earliest deadline.
Acting this way, we provide some kind of boosting to the lock-owner,
still by using the existing (actually, slightly modified by the previous
commit) pi-architecture.
We would stress the fact that this is only a surely needed, all but
clean solution to the problem. In the end it's only a way to re-start
discussion within the community. So, as always, comments, ideas, rants,
etc.. are welcome! :-)
Signed-off-by: Dario Faggioli <raistlin@linux.it>
Signed-off-by: Juri Lelli <juri.lelli@gmail.com>
[ Added !RT_MUTEXES build fix. ]
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1383831828-15501-11-git-send-email-juri.lelli@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-11-07 21:43:44 +08:00
|
|
|
* @dl_boosted tells if we are boosted due to DI. If so we are
|
|
|
|
* outside bandwidth enforcement mechanism (but only until we
|
2014-04-15 19:49:04 +08:00
|
|
|
* exit the critical section);
|
|
|
|
*
|
|
|
|
* @dl_yielded tells if task gave up the cpu before consuming
|
|
|
|
* all its available runtime during the last job.
|
sched/deadline: Add SCHED_DEADLINE structures & implementation
Introduces the data structures, constants and symbols needed for
SCHED_DEADLINE implementation.
Core data structure of SCHED_DEADLINE are defined, along with their
initializers. Hooks for checking if a task belong to the new policy
are also added where they are needed.
Adds a scheduling class, in sched/dl.c and a new policy called
SCHED_DEADLINE. It is an implementation of the Earliest Deadline
First (EDF) scheduling algorithm, augmented with a mechanism (called
Constant Bandwidth Server, CBS) that makes it possible to isolate
the behaviour of tasks between each other.
The typical -deadline task will be made up of a computation phase
(instance) which is activated on a periodic or sporadic fashion. The
expected (maximum) duration of such computation is called the task's
runtime; the time interval by which each instance need to be completed
is called the task's relative deadline. The task's absolute deadline
is dynamically calculated as the time instant a task (better, an
instance) activates plus the relative deadline.
The EDF algorithms selects the task with the smallest absolute
deadline as the one to be executed first, while the CBS ensures each
task to run for at most its runtime every (relative) deadline
length time interval, avoiding any interference between different
tasks (bandwidth isolation).
Thanks to this feature, also tasks that do not strictly comply with
the computational model sketched above can effectively use the new
policy.
To summarize, this patch:
- introduces the data structures, constants and symbols needed;
- implements the core logic of the scheduling algorithm in the new
scheduling class file;
- provides all the glue code between the new scheduling class and
the core scheduler and refines the interactions between sched/dl
and the other existing scheduling classes.
Signed-off-by: Dario Faggioli <raistlin@linux.it>
Signed-off-by: Michael Trimarchi <michael@amarulasolutions.com>
Signed-off-by: Fabio Checconi <fchecconi@gmail.com>
Signed-off-by: Juri Lelli <juri.lelli@gmail.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1383831828-15501-4-git-send-email-juri.lelli@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-11-28 18:14:43 +08:00
|
|
|
*/
|
2016-03-07 19:27:04 +08:00
|
|
|
int dl_throttled, dl_boosted, dl_yielded;
|
sched/deadline: Add SCHED_DEADLINE structures & implementation
Introduces the data structures, constants and symbols needed for
SCHED_DEADLINE implementation.
Core data structure of SCHED_DEADLINE are defined, along with their
initializers. Hooks for checking if a task belong to the new policy
are also added where they are needed.
Adds a scheduling class, in sched/dl.c and a new policy called
SCHED_DEADLINE. It is an implementation of the Earliest Deadline
First (EDF) scheduling algorithm, augmented with a mechanism (called
Constant Bandwidth Server, CBS) that makes it possible to isolate
the behaviour of tasks between each other.
The typical -deadline task will be made up of a computation phase
(instance) which is activated on a periodic or sporadic fashion. The
expected (maximum) duration of such computation is called the task's
runtime; the time interval by which each instance need to be completed
is called the task's relative deadline. The task's absolute deadline
is dynamically calculated as the time instant a task (better, an
instance) activates plus the relative deadline.
The EDF algorithms selects the task with the smallest absolute
deadline as the one to be executed first, while the CBS ensures each
task to run for at most its runtime every (relative) deadline
length time interval, avoiding any interference between different
tasks (bandwidth isolation).
Thanks to this feature, also tasks that do not strictly comply with
the computational model sketched above can effectively use the new
policy.
To summarize, this patch:
- introduces the data structures, constants and symbols needed;
- implements the core logic of the scheduling algorithm in the new
scheduling class file;
- provides all the glue code between the new scheduling class and
the core scheduler and refines the interactions between sched/dl
and the other existing scheduling classes.
Signed-off-by: Dario Faggioli <raistlin@linux.it>
Signed-off-by: Michael Trimarchi <michael@amarulasolutions.com>
Signed-off-by: Fabio Checconi <fchecconi@gmail.com>
Signed-off-by: Juri Lelli <juri.lelli@gmail.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1383831828-15501-4-git-send-email-juri.lelli@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-11-28 18:14:43 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Bandwidth enforcement timer. Each -deadline task has its
|
|
|
|
* own bandwidth to be enforced, thus we need one timer per task.
|
|
|
|
*/
|
|
|
|
struct hrtimer dl_timer;
|
|
|
|
};
|
2013-02-07 23:47:07 +08:00
|
|
|
|
2014-08-15 07:01:53 +08:00
|
|
|
union rcu_special {
|
|
|
|
struct {
|
2015-08-03 04:53:17 +08:00
|
|
|
u8 blocked;
|
|
|
|
u8 need_qs;
|
|
|
|
u8 exp_need_qs;
|
|
|
|
u8 pad; /* Otherwise the compiler can store garbage here. */
|
|
|
|
} b; /* Bits. */
|
|
|
|
u32 s; /* Set of bits. */
|
2014-08-15 07:01:53 +08:00
|
|
|
};
|
2009-08-28 06:00:12 +08:00
|
|
|
struct rcu_node;
|
|
|
|
|
2010-09-02 22:50:03 +08:00
|
|
|
enum perf_event_task_context {
|
|
|
|
perf_invalid_context = -1,
|
|
|
|
perf_hw_context = 0,
|
2010-09-07 23:34:50 +08:00
|
|
|
perf_sw_context,
|
2010-09-02 22:50:03 +08:00
|
|
|
perf_nr_task_contexts,
|
|
|
|
};
|
|
|
|
|
2015-09-05 06:47:32 +08:00
|
|
|
/* Track pages that require TLB flushes */
|
|
|
|
struct tlbflush_unmap_batch {
|
|
|
|
/*
|
|
|
|
* Each bit set is a CPU that potentially has a TLB entry for one of
|
|
|
|
* the PFNs being flushed. See set_tlb_ubc_flush_pending().
|
|
|
|
*/
|
|
|
|
struct cpumask cpumask;
|
|
|
|
|
|
|
|
/* True if any bit in cpumask is set */
|
|
|
|
bool flush_required;
|
2015-09-05 06:47:35 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If true then the PTE was dirty when unmapped. The entry must be
|
|
|
|
* flushed before IO is initiated or a stale TLB entry potentially
|
|
|
|
* allows an update without redirtying the page.
|
|
|
|
*/
|
|
|
|
bool writable;
|
2015-09-05 06:47:32 +08:00
|
|
|
};
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
struct task_struct {
|
|
|
|
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
|
2007-05-09 17:35:17 +08:00
|
|
|
void *stack;
|
2005-04-17 06:20:36 +08:00
|
|
|
atomic_t usage;
|
2007-05-08 15:23:41 +08:00
|
|
|
unsigned int flags; /* per process flags, defined below */
|
|
|
|
unsigned int ptrace;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it find ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_BALANCE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be a valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when it's nice
value and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups with in the same domain. This patch fixes some
of the listed down scenarios by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calcuations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalanace as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it need
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 17:54:34 +08:00
|
|
|
#ifdef CONFIG_SMP
|
2011-09-12 19:06:17 +08:00
|
|
|
struct llist_node wake_entry;
|
2011-04-05 23:23:40 +08:00
|
|
|
int on_cpu;
|
2015-07-14 23:39:50 +08:00
|
|
|
unsigned int wakee_flips;
|
sched: Implement smarter wake-affine logic
The wake-affine scheduler feature is currently always trying to pull
the wakee close to the waker. In theory this should be beneficial if
the waker's CPU caches hot data for the wakee, and it's also beneficial
in the extreme ping-pong high context switch rate case.
Testing shows it can benefit hackbench up to 15%.
However, the feature is somewhat blind, from which some workloads
such as pgbench suffer. It's also time-consuming algorithmically.
Testing shows it can damage pgbench up to 50% - far more than the
benefit it brings in the best case.
So wake-affine should be smarter and it should realize when to
stop its thankless effort at trying to find a suitable CPU to wake on.
This patch introduces 'wakee_flips', which will be increased each
time the task flips (switches) its wakee target.
So a high 'wakee_flips' value means the task has more than one
wakee, and the bigger the number, the higher the wakeup frequency.
Now when making the decision on whether to pull or not, pay attention to
the wakee with a high 'wakee_flips', pulling such a task may benefit
the wakee. Also imply that the waker will face cruel competition later,
it could be very cruel or very fast depends on the story behind
'wakee_flips', waker therefore suffers.
Furthermore, if waker also has a high 'wakee_flips', that implies that
multiple tasks rely on it, then waker's higher latency will damage all
of them, so pulling wakee seems to be a bad deal.
Thus, when 'waker->wakee_flips / wakee->wakee_flips' becomes
higher and higher, the cost of pulling seems to be worse and worse.
The patch therefore helps the wake-affine feature to stop its pulling
work when:
wakee->wakee_flips > factor &&
waker->wakee_flips > (factor * wakee->wakee_flips)
The 'factor' here is the number of CPUs in the current CPU's NUMA node,
so a bigger node will lead to more pulling since the trial becomes more
severe.
After applying the patch, pgbench shows up to 40% improvements and no regressions.
Tested with 12 cpu x86 server and tip 3.10.0-rc7.
The percentages in the final column highlight the areas with the biggest wins,
all other areas improved as well:
pgbench base smart
| db_size | clients | tps | | tps |
+---------+---------+-------+ +-------+
| 22 MB | 1 | 10598 | | 10796 |
| 22 MB | 2 | 21257 | | 21336 |
| 22 MB | 4 | 41386 | | 41622 |
| 22 MB | 8 | 51253 | | 57932 |
| 22 MB | 12 | 48570 | | 54000 |
| 22 MB | 16 | 46748 | | 55982 | +19.75%
| 22 MB | 24 | 44346 | | 55847 | +25.93%
| 22 MB | 32 | 43460 | | 54614 | +25.66%
| 7484 MB | 1 | 8951 | | 9193 |
| 7484 MB | 2 | 19233 | | 19240 |
| 7484 MB | 4 | 37239 | | 37302 |
| 7484 MB | 8 | 46087 | | 50018 |
| 7484 MB | 12 | 42054 | | 48763 |
| 7484 MB | 16 | 40765 | | 51633 | +26.66%
| 7484 MB | 24 | 37651 | | 52377 | +39.11%
| 7484 MB | 32 | 37056 | | 51108 | +37.92%
| 15 GB | 1 | 8845 | | 9104 |
| 15 GB | 2 | 19094 | | 19162 |
| 15 GB | 4 | 36979 | | 36983 |
| 15 GB | 8 | 46087 | | 49977 |
| 15 GB | 12 | 41901 | | 48591 |
| 15 GB | 16 | 40147 | | 50651 | +26.16%
| 15 GB | 24 | 37250 | | 52365 | +40.58%
| 15 GB | 32 | 36470 | | 50015 | +37.14%
Signed-off-by: Michael Wang <wangyun@linux.vnet.ibm.com>
Cc: Mike Galbraith <efault@gmx.de>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/51D50057.9000809@linux.vnet.ibm.com
[ Improved the changelog. ]
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-07-04 12:55:51 +08:00
|
|
|
unsigned long wakee_flip_decay_ts;
|
2015-07-14 23:39:50 +08:00
|
|
|
struct task_struct *last_wakee;
|
2013-10-07 18:29:16 +08:00
|
|
|
|
|
|
|
int wake_cpu;
|
[PATCH] sched: implement smpnice
Problem:
The introduction of separate run queues per CPU has brought with it "nice"
enforcement problems that are best described by a simple example.
For the sake of argument suppose that on a single CPU machine with a
nice==19 hard spinner and a nice==0 hard spinner running that the nice==0
task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now
suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and
2 nice==0 hard spinners running. The user of this system would be entitled
to expect that the nice==0 tasks each get 95% of a CPU and the nice==19
tasks only get 5% each. However, whether this expectation is met is pretty
much down to luck as there are four equally likely distributions of the
tasks to the CPUs that the load balancing code will consider to be balanced
with loads of 2.0 for each CPU. Two of these distributions involve one
nice==0 and one nice==19 task per CPU and in these circumstances the users
expectations will be met. The other two distributions both involve both
nice==0 tasks being on one CPU and both nice==19 being on the other CPU and
each task will get 50% of a CPU and the user's expectations will not be
met.
Solution:
The solution to this problem that is implemented in the attached patch is
to use weighted loads when determining if the system is balanced and, when
an imbalance is detected, to move an amount of weighted load between run
queues (as opposed to a number of tasks) to restore the balance. Once
again, the easiest way to explain why both of these measures are necessary
is to use a simple example. Suppose that (in a slight variation of the
above example) that we have a two CPU system with 4 nice==0 and 4 nice=19
hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and
the 4 nice==19 tasks are on the other CPU. The weighted loads for the two
CPUs would be 4.0 and 0.2 respectively and the load balancing code would
move 2 tasks resulting in one CPU with a load of 2.0 and the other with
load of 2.2. If this was considered to be a big enough imbalance to
justify moving a task and that task was moved using the current
move_tasks() then it would move the highest priority task that it found and
this would result in one CPU with a load of 3.0 and the other with a load
of 1.2 which would result in the movement of a task in the opposite
direction and so on -- infinite loop. If, on the other hand, an amount of
load to be moved is calculated from the imbalance (in this case 0.1) and
move_tasks() skips tasks until it find ones whose contributions to the
weighted load are less than this amount it would move two of the nice==19
tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with
loads of 2.1 for each CPU.
One of the advantages of this mechanism is that on a system where all tasks
have nice==0 the load balancing calculations would be mathematically
identical to the current load balancing code.
Notes:
struct task_struct:
has a new field load_weight which (in a trade off of space for speed)
stores the contribution that this task makes to a CPU's weighted load when
it is runnable.
struct runqueue:
has a new field raw_weighted_load which is the sum of the load_weight
values for the currently runnable tasks on this run queue. This field
always needs to be updated when nr_running is updated so two new inline
functions inc_nr_running() and dec_nr_running() have been created to make
sure that this happens. This also offers a convenient way to optimize away
this part of the smpnice mechanism when CONFIG_SMP is not defined.
int try_to_wake_up():
in this function the value SCHED_LOAD_BALANCE is used to represent the load
contribution of a single task in various calculations in the code that
decides which CPU to put the waking task on. While this would be a valid
on a system where the nice values for the runnable tasks were distributed
evenly around zero it will lead to anomalous load balancing if the
distribution is skewed in either direction. To overcome this problem
SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task
or by the average load_weight per task for the queue in question (as
appropriate).
int move_tasks():
The modifications to this function were complicated by the fact that
active_load_balance() uses it to move exactly one task without checking
whether an imbalance actually exists. This precluded the simple
overloading of max_nr_move with max_load_move and necessitated the addition
of the latter as an extra argument to the function. The internal
implementation is then modified to move up to max_nr_move tasks and
max_load_move of weighted load. This slightly complicates the code where
move_tasks() is called and if ever active_load_balance() is changed to not
use move_tasks() the implementation of move_tasks() should be simplified
accordingly.
struct sched_group *find_busiest_group():
Similar to try_to_wake_up(), there are places in this function where
SCHED_LOAD_SCALE is used to represent the load contribution of a single
task and the same issues are created. A similar solution is adopted except
that it is now the average per task contribution to a group's load (as
opposed to a run queue) that is required. As this value is not directly
available from the group it is calculated on the fly as the queues in the
groups are visited when determining the busiest group.
A key change to this function is that it is no longer to scale down
*imbalance on exit as move_tasks() uses the load in its scaled form.
void set_user_nice():
has been modified to update the task's load_weight field when it's nice
value and also to ensure that its run queue's raw_weighted_load field is
updated if it was runnable.
From: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
With smpnice, sched groups with highest priority tasks can mask the imbalance
between the other sched groups with in the same domain. This patch fixes some
of the listed down scenarios by not considering the sched groups which are
lightly loaded.
a) on a simple 4-way MP system, if we have one high priority and 4 normal
priority tasks, with smpnice we would like to see the high priority task
scheduled on one cpu, two other cpus getting one normal task each and the
fourth cpu getting the remaining two normal tasks. but with current
smpnice extra normal priority task keeps jumping from one cpu to another
cpu having the normal priority task. This is because of the
busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the
cpu with high priority task in max_load calculations but including that in
total and avg_load calcuations.. leading to max_load < avg_load and load
balance between cpus running normal priority tasks(2 Vs 1) will always show
imbalanace as one normal priority and the extra normal priority task will
keep moving from one cpu to another cpu having normal priority task..
b) 4-way system with HT (8 logical processors). Package-P0 T0 has a
highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal
priority task each.. P2 and P3 are idle. With this patch, one of the
normal priority tasks on P1 will be moved to P2 or P3..
c) With the current weighted smp nice calculations, it doesn't always make
sense to look at the highest weighted runqueue in the busy group..
Consider a load balance scenario on a DP with HT system, with Package-0
containing one high priority and one low priority, Package-1 containing one
low priority(with other thread being idle).. Package-1 thinks that it need
to take the low priority thread from Package-0. And find_busiest_queue()
returns the cpu thread with highest priority task.. And ultimately(with
help of active load balance) we move high priority task to Package-1. And
same continues with Package-0 now, moving high priority task from package-1
to package-0.. Even without the presence of active load balance, load
balance will fail to balance the above scenario.. Fix find_busiest_queue
to use "imbalance" when it is lightly loaded.
[kernel@kolivas.org: sched: store weighted load on up]
[kernel@kolivas.org: sched: add discrete weighted cpu load function]
[suresh.b.siddha@intel.com: sched: remove dead code]
Signed-off-by: Peter Williams <pwil3058@bigpond.com.au>
Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com>
Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com>
Acked-by: Ingo Molnar <mingo@elte.hu>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Signed-off-by: Con Kolivas <kernel@kolivas.org>
Cc: John Hawkes <hawkes@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 17:54:34 +08:00
|
|
|
#endif
|
2011-04-05 23:23:44 +08:00
|
|
|
int on_rq;
|
2007-07-10 00:52:00 +08:00
|
|
|
|
2006-06-27 17:54:51 +08:00
|
|
|
int prio, static_prio, normal_prio;
|
2008-05-15 19:09:15 +08:00
|
|
|
unsigned int rt_priority;
|
2007-10-15 23:00:12 +08:00
|
|
|
const struct sched_class *sched_class;
|
2007-07-10 00:51:58 +08:00
|
|
|
struct sched_entity se;
|
2008-01-26 04:08:27 +08:00
|
|
|
struct sched_rt_entity rt;
|
2012-06-22 19:36:05 +08:00
|
|
|
#ifdef CONFIG_CGROUP_SCHED
|
|
|
|
struct task_group *sched_task_group;
|
|
|
|
#endif
|
sched/deadline: Add SCHED_DEADLINE structures & implementation
Introduces the data structures, constants and symbols needed for
SCHED_DEADLINE implementation.
Core data structure of SCHED_DEADLINE are defined, along with their
initializers. Hooks for checking if a task belong to the new policy
are also added where they are needed.
Adds a scheduling class, in sched/dl.c and a new policy called
SCHED_DEADLINE. It is an implementation of the Earliest Deadline
First (EDF) scheduling algorithm, augmented with a mechanism (called
Constant Bandwidth Server, CBS) that makes it possible to isolate
the behaviour of tasks between each other.
The typical -deadline task will be made up of a computation phase
(instance) which is activated on a periodic or sporadic fashion. The
expected (maximum) duration of such computation is called the task's
runtime; the time interval by which each instance need to be completed
is called the task's relative deadline. The task's absolute deadline
is dynamically calculated as the time instant a task (better, an
instance) activates plus the relative deadline.
The EDF algorithms selects the task with the smallest absolute
deadline as the one to be executed first, while the CBS ensures each
task to run for at most its runtime every (relative) deadline
length time interval, avoiding any interference between different
tasks (bandwidth isolation).
Thanks to this feature, also tasks that do not strictly comply with
the computational model sketched above can effectively use the new
policy.
To summarize, this patch:
- introduces the data structures, constants and symbols needed;
- implements the core logic of the scheduling algorithm in the new
scheduling class file;
- provides all the glue code between the new scheduling class and
the core scheduler and refines the interactions between sched/dl
and the other existing scheduling classes.
Signed-off-by: Dario Faggioli <raistlin@linux.it>
Signed-off-by: Michael Trimarchi <michael@amarulasolutions.com>
Signed-off-by: Fabio Checconi <fchecconi@gmail.com>
Signed-off-by: Juri Lelli <juri.lelli@gmail.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1383831828-15501-4-git-send-email-juri.lelli@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-11-28 18:14:43 +08:00
|
|
|
struct sched_dl_entity dl;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-07-26 19:40:43 +08:00
|
|
|
#ifdef CONFIG_PREEMPT_NOTIFIERS
|
|
|
|
/* list of struct preempt_notifier: */
|
|
|
|
struct hlist_head preempt_notifiers;
|
|
|
|
#endif
|
|
|
|
|
2006-09-29 16:59:40 +08:00
|
|
|
#ifdef CONFIG_BLK_DEV_IO_TRACE
|
2006-03-24 03:00:26 +08:00
|
|
|
unsigned int btrace_seq;
|
2006-09-29 16:59:40 +08:00
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-05-08 15:23:41 +08:00
|
|
|
unsigned int policy;
|
2012-04-23 18:11:21 +08:00
|
|
|
int nr_cpus_allowed;
|
2005-04-17 06:20:36 +08:00
|
|
|
cpumask_t cpus_allowed;
|
|
|
|
|
2010-06-30 07:49:16 +08:00
|
|
|
#ifdef CONFIG_PREEMPT_RCU
|
2008-01-26 04:08:24 +08:00
|
|
|
int rcu_read_lock_nesting;
|
2014-08-15 07:01:53 +08:00
|
|
|
union rcu_special rcu_read_unlock_special;
|
rcu: Merge preemptable-RCU functionality into hierarchical RCU
Create a kernel/rcutree_plugin.h file that contains definitions
for preemptable RCU (or, under the #else branch of the #ifdef,
empty definitions for the classic non-preemptable semantics).
These definitions fit into plugins defined in kernel/rcutree.c
for this purpose.
This variant of preemptable RCU uses a new algorithm whose
read-side expense is roughly that of classic hierarchical RCU
under CONFIG_PREEMPT. This new algorithm's update-side expense
is similar to that of classic hierarchical RCU, and, in absence
of read-side preemption or blocking, is exactly that of classic
hierarchical RCU. Perhaps more important, this new algorithm
has a much simpler implementation, saving well over 1,000 lines
of code compared to mainline's implementation of preemptable
RCU, which will hopefully be retired in favor of this new
algorithm.
The simplifications are obtained by maintaining per-task
nesting state for running tasks, and using a simple
lock-protected algorithm to handle accounting when tasks block
within RCU read-side critical sections, making use of lessons
learned while creating numerous user-level RCU implementations
over the past 18 months.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josht@linux.vnet.ibm.com
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
LKML-Reference: <12509746134003-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-08-23 04:56:52 +08:00
|
|
|
struct list_head rcu_node_entry;
|
2010-06-30 07:49:16 +08:00
|
|
|
struct rcu_node *rcu_blocked_node;
|
2014-09-23 02:00:48 +08:00
|
|
|
#endif /* #ifdef CONFIG_PREEMPT_RCU */
|
2014-06-28 04:42:20 +08:00
|
|
|
#ifdef CONFIG_TASKS_RCU
|
|
|
|
unsigned long rcu_tasks_nvcsw;
|
|
|
|
bool rcu_tasks_holdout;
|
|
|
|
struct list_head rcu_tasks_holdout_list;
|
2014-08-05 08:43:50 +08:00
|
|
|
int rcu_tasks_idle_cpu;
|
2014-06-28 04:42:20 +08:00
|
|
|
#endif /* #ifdef CONFIG_TASKS_RCU */
|
2008-01-26 04:08:24 +08:00
|
|
|
|
2015-06-26 02:23:37 +08:00
|
|
|
#ifdef CONFIG_SCHED_INFO
|
2005-04-17 06:20:36 +08:00
|
|
|
struct sched_info sched_info;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
struct list_head tasks;
|
2010-12-01 02:51:33 +08:00
|
|
|
#ifdef CONFIG_SMP
|
sched: create "pushable_tasks" list to limit pushing to one attempt
The RT scheduler employs a "push/pull" design to actively balance tasks
within the system (on a per disjoint cpuset basis). When a task is
awoken, it is immediately determined if there are any lower priority
cpus which should be preempted. This is opposed to the way normal
SCHED_OTHER tasks behave, which will wait for a periodic rebalancing
operation to occur before spreading out load.
When a particular RQ has more than 1 active RT task, it is said to
be in an "overloaded" state. Once this occurs, the system enters
the active balancing mode, where it will try to push the task away,
or persuade a different cpu to pull it over. The system will stay
in this state until the system falls back below the <= 1 queued RT
task per RQ.
However, the current implementation suffers from a limitation in the
push logic. Once overloaded, all tasks (other than current) on the
RQ are analyzed on every push operation, even if it was previously
unpushable (due to affinity, etc). Whats more, the operation stops
at the first task that is unpushable and will not look at items
lower in the queue. This causes two problems:
1) We can have the same tasks analyzed over and over again during each
push, which extends out the fast path in the scheduler for no
gain. Consider a RQ that has dozens of tasks that are bound to a
core. Each one of those tasks will be encountered and skipped
for each push operation while they are queued.
2) There may be lower-priority tasks under the unpushable task that
could have been successfully pushed, but will never be considered
until either the unpushable task is cleared, or a pull operation
succeeds. The net result is a potential latency source for mid
priority tasks.
This patch aims to rectify these two conditions by introducing a new
priority sorted list: "pushable_tasks". A task is added to the list
each time a task is activated or preempted. It is removed from the
list any time it is deactivated, made current, or fails to push.
This works because a task only needs to be attempted to push once.
After an initial failure to push, the other cpus will eventually try to
pull the task when the conditions are proper. This also solves the
problem that we don't completely analyze all tasks due to encountering
an unpushable tasks. Now every task will have a push attempted (when
appropriate).
This reduces latency both by shorting the critical section of the
rq->lock for certain workloads, and by making sure the algorithm
considers all eligible tasks in the system.
[ rostedt: added a couple more BUG_ONs ]
Signed-off-by: Gregory Haskins <ghaskins@novell.com>
Acked-by: Steven Rostedt <srostedt@redhat.com>
2008-12-29 22:39:53 +08:00
|
|
|
struct plist_node pushable_tasks;
|
sched/deadline: Add SCHED_DEADLINE SMP-related data structures & logic
Introduces data structures relevant for implementing dynamic
migration of -deadline tasks and the logic for checking if
runqueues are overloaded with -deadline tasks and for choosing
where a task should migrate, when it is the case.
Adds also dynamic migrations to SCHED_DEADLINE, so that tasks can
be moved among CPUs when necessary. It is also possible to bind a
task to a (set of) CPU(s), thus restricting its capability of
migrating, or forbidding migrations at all.
The very same approach used in sched_rt is utilised:
- -deadline tasks are kept into CPU-specific runqueues,
- -deadline tasks are migrated among runqueues to achieve the
following:
* on an M-CPU system the M earliest deadline ready tasks
are always running;
* affinity/cpusets settings of all the -deadline tasks is
always respected.
Therefore, this very special form of "load balancing" is done with
an active method, i.e., the scheduler pushes or pulls tasks between
runqueues when they are woken up and/or (de)scheduled.
IOW, every time a preemption occurs, the descheduled task might be sent
to some other CPU (depending on its deadline) to continue executing
(push). On the other hand, every time a CPU becomes idle, it might pull
the second earliest deadline ready task from some other CPU.
To enforce this, a pull operation is always attempted before taking any
scheduling decision (pre_schedule()), as well as a push one after each
scheduling decision (post_schedule()). In addition, when a task arrives
or wakes up, the best CPU where to resume it is selected taking into
account its affinity mask, the system topology, but also its deadline.
E.g., from the scheduling point of view, the best CPU where to wake
up (and also where to push) a task is the one which is running the task
with the latest deadline among the M executing ones.
In order to facilitate these decisions, per-runqueue "caching" of the
deadlines of the currently running and of the first ready task is used.
Queued but not running tasks are also parked in another rb-tree to
speed-up pushes.
Signed-off-by: Juri Lelli <juri.lelli@gmail.com>
Signed-off-by: Dario Faggioli <raistlin@linux.it>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1383831828-15501-5-git-send-email-juri.lelli@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-11-07 21:43:38 +08:00
|
|
|
struct rb_node pushable_dl_tasks;
|
2010-12-01 02:51:33 +08:00
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
struct mm_struct *mm, *active_mm;
|
mm: per-thread vma caching
This patch is a continuation of efforts trying to optimize find_vma(),
avoiding potentially expensive rbtree walks to locate a vma upon faults.
The original approach (https://lkml.org/lkml/2013/11/1/410), where the
largest vma was also cached, ended up being too specific and random,
thus further comparison with other approaches were needed. There are
two things to consider when dealing with this, the cache hit rate and
the latency of find_vma(). Improving the hit-rate does not necessarily
translate in finding the vma any faster, as the overhead of any fancy
caching schemes can be too high to consider.
We currently cache the last used vma for the whole address space, which
provides a nice optimization, reducing the total cycles in find_vma() by
up to 250%, for workloads with good locality. On the other hand, this
simple scheme is pretty much useless for workloads with poor locality.
Analyzing ebizzy runs shows that, no matter how many threads are
running, the mmap_cache hit rate is less than 2%, and in many situations
below 1%.
The proposed approach is to replace this scheme with a small per-thread
cache, maximizing hit rates at a very low maintenance cost.
Invalidations are performed by simply bumping up a 32-bit sequence
number. The only expensive operation is in the rare case of a seq
number overflow, where all caches that share the same address space are
flushed. Upon a miss, the proposed replacement policy is based on the
page number that contains the virtual address in question. Concretely,
the following results are seen on an 80 core, 8 socket x86-64 box:
1) System bootup: Most programs are single threaded, so the per-thread
scheme does improve ~50% hit rate by just adding a few more slots to
the cache.
+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline | 50.61% | 19.90 |
| patched | 73.45% | 13.58 |
+----------------+----------+------------------+
2) Kernel build: This one is already pretty good with the current
approach as we're dealing with good locality.
+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline | 75.28% | 11.03 |
| patched | 88.09% | 9.31 |
+----------------+----------+------------------+
3) Oracle 11g Data Mining (4k pages): Similar to the kernel build workload.
+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline | 70.66% | 17.14 |
| patched | 91.15% | 12.57 |
+----------------+----------+------------------+
4) Ebizzy: There's a fair amount of variation from run to run, but this
approach always shows nearly perfect hit rates, while baseline is just
about non-existent. The amounts of cycles can fluctuate between
anywhere from ~60 to ~116 for the baseline scheme, but this approach
reduces it considerably. For instance, with 80 threads:
+----------------+----------+------------------+
| caching scheme | hit-rate | cycles (billion) |
+----------------+----------+------------------+
| baseline | 1.06% | 91.54 |
| patched | 99.97% | 14.18 |
+----------------+----------+------------------+
[akpm@linux-foundation.org: fix nommu build, per Davidlohr]
[akpm@linux-foundation.org: document vmacache_valid() logic]
[akpm@linux-foundation.org: attempt to untangle header files]
[akpm@linux-foundation.org: add vmacache_find() BUG_ON]
[hughd@google.com: add vmacache_valid_mm() (from Oleg)]
[akpm@linux-foundation.org: coding-style fixes]
[akpm@linux-foundation.org: adjust and enhance comments]
Signed-off-by: Davidlohr Bueso <davidlohr@hp.com>
Reviewed-by: Rik van Riel <riel@redhat.com>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Reviewed-by: Michel Lespinasse <walken@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Tested-by: Hugh Dickins <hughd@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-04-08 06:37:25 +08:00
|
|
|
/* per-thread vma caching */
|
|
|
|
u32 vmacache_seqnum;
|
|
|
|
struct vm_area_struct *vmacache[VMACACHE_SIZE];
|
2010-03-06 05:41:40 +08:00
|
|
|
#if defined(SPLIT_RSS_COUNTING)
|
|
|
|
struct task_rss_stat rss_stat;
|
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
/* task state */
|
2007-05-08 15:23:41 +08:00
|
|
|
int exit_state;
|
2005-04-17 06:20:36 +08:00
|
|
|
int exit_code, exit_signal;
|
|
|
|
int pdeath_signal; /* The signal sent when the parent dies */
|
2015-05-01 12:19:55 +08:00
|
|
|
unsigned long jobctl; /* JOBCTL_*, siglock protected */
|
2013-04-12 01:30:29 +08:00
|
|
|
|
|
|
|
/* Used for emulating ABI behavior of previous Linux versions */
|
2007-05-08 15:23:41 +08:00
|
|
|
unsigned int personality;
|
2013-04-12 01:30:29 +08:00
|
|
|
|
2015-11-25 23:02:07 +08:00
|
|
|
/* scheduler bits, serialized by scheduler locks */
|
2009-06-15 23:17:47 +08:00
|
|
|
unsigned sched_reset_on_fork:1;
|
2011-04-05 23:23:49 +08:00
|
|
|
unsigned sched_contributes_to_load:1;
|
2015-04-18 02:05:30 +08:00
|
|
|
unsigned sched_migrated:1;
|
2015-11-25 23:02:07 +08:00
|
|
|
unsigned :0; /* force alignment to the next boundary */
|
|
|
|
|
|
|
|
/* unserialized, strictly 'current' */
|
|
|
|
unsigned in_execve:1; /* bit to tell LSMs we're in execve */
|
|
|
|
unsigned in_iowait:1;
|
2015-11-06 10:46:09 +08:00
|
|
|
#ifdef CONFIG_MEMCG
|
|
|
|
unsigned memcg_may_oom:1;
|
2016-01-21 07:02:32 +08:00
|
|
|
#ifndef CONFIG_SLOB
|
2014-12-13 08:55:15 +08:00
|
|
|
unsigned memcg_kmem_skip_account:1;
|
|
|
|
#endif
|
2016-01-21 07:02:32 +08:00
|
|
|
#endif
|
2015-04-18 02:05:30 +08:00
|
|
|
#ifdef CONFIG_COMPAT_BRK
|
|
|
|
unsigned brk_randomized:1;
|
|
|
|
#endif
|
2014-12-13 08:55:15 +08:00
|
|
|
|
2014-05-22 06:23:46 +08:00
|
|
|
unsigned long atomic_flags; /* Flags needing atomic access. */
|
|
|
|
|
2015-02-13 07:01:14 +08:00
|
|
|
struct restart_block restart_block;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
pid_t pid;
|
|
|
|
pid_t tgid;
|
2006-09-26 16:52:38 +08:00
|
|
|
|
2009-08-18 14:06:02 +08:00
|
|
|
#ifdef CONFIG_CC_STACKPROTECTOR
|
2006-09-26 16:52:38 +08:00
|
|
|
/* Canary value for the -fstack-protector gcc feature */
|
|
|
|
unsigned long stack_canary;
|
2009-08-18 14:06:02 +08:00
|
|
|
#endif
|
2012-05-11 08:59:08 +08:00
|
|
|
/*
|
2005-04-17 06:20:36 +08:00
|
|
|
* pointers to (original) parent process, youngest child, younger sibling,
|
2012-05-11 08:59:08 +08:00
|
|
|
* older sibling, respectively. (p->father can be replaced with
|
2008-03-25 09:36:23 +08:00
|
|
|
* p->real_parent->pid)
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2011-12-15 06:39:26 +08:00
|
|
|
struct task_struct __rcu *real_parent; /* real parent process */
|
|
|
|
struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
2008-03-25 09:36:23 +08:00
|
|
|
* children/sibling forms the list of my natural children
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
|
|
|
struct list_head children; /* list of my children */
|
|
|
|
struct list_head sibling; /* linkage in my parent's children list */
|
|
|
|
struct task_struct *group_leader; /* threadgroup leader */
|
|
|
|
|
2008-03-25 09:36:23 +08:00
|
|
|
/*
|
|
|
|
* ptraced is the list of tasks this task is using ptrace on.
|
|
|
|
* This includes both natural children and PTRACE_ATTACH targets.
|
|
|
|
* p->ptrace_entry is p's link on the p->parent->ptraced list.
|
|
|
|
*/
|
|
|
|
struct list_head ptraced;
|
|
|
|
struct list_head ptrace_entry;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* PID/PID hash table linkage. */
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine wight 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an indepedent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an indpendent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 18:31:42 +08:00
|
|
|
struct pid_link pids[PIDTYPE_MAX];
|
2006-03-29 08:11:25 +08:00
|
|
|
struct list_head thread_group;
|
introduce for_each_thread() to replace the buggy while_each_thread()
while_each_thread() and next_thread() should die, almost every lockless
usage is wrong.
1. Unless g == current, the lockless while_each_thread() is not safe.
while_each_thread(g, t) can loop forever if g exits, next_thread()
can't reach the unhashed thread in this case. Note that this can
happen even if g is the group leader, it can exec.
2. Even if while_each_thread() itself was correct, people often use
it wrongly.
It was never safe to just take rcu_read_lock() and loop unless
you verify that pid_alive(g) == T, even the first next_thread()
can point to the already freed/reused memory.
This patch adds signal_struct->thread_head and task->thread_node to
create the normal rcu-safe list with the stable head. The new
for_each_thread(g, t) helper is always safe under rcu_read_lock() as
long as this task_struct can't go away.
Note: of course it is ugly to have both task_struct->thread_node and the
old task_struct->thread_group, we will kill it later, after we change
the users of while_each_thread() to use for_each_thread().
Perhaps we can kill it even before we convert all users, we can
reimplement next_thread(t) using the new thread_head/thread_node. But
we can't do this right now because this will lead to subtle behavioural
changes. For example, do/while_each_thread() always sees at least one
task, while for_each_thread() can do nothing if the whole thread group
has died. Or thread_group_empty(), currently its semantics is not clear
unless thread_group_leader(p) and we need to audit the callers before we
can change it.
So this patch adds the new interface which has to coexist with the old
one for some time, hopefully the next changes will be more or less
straightforward and the old one will go away soon.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Sergey Dyasly <dserrg@gmail.com>
Tested-by: Sergey Dyasly <dserrg@gmail.com>
Reviewed-by: Sameer Nanda <snanda@chromium.org>
Acked-by: David Rientjes <rientjes@google.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Mandeep Singh Baines <msb@chromium.org>
Cc: "Ma, Xindong" <xindong.ma@intel.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: "Tu, Xiaobing" <xiaobing.tu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-01-22 07:49:56 +08:00
|
|
|
struct list_head thread_node;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
struct completion *vfork_done; /* for vfork() */
|
|
|
|
int __user *set_child_tid; /* CLONE_CHILD_SETTID */
|
|
|
|
int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */
|
|
|
|
|
2007-10-18 18:06:34 +08:00
|
|
|
cputime_t utime, stime, utimescaled, stimescaled;
|
2007-10-15 23:00:19 +08:00
|
|
|
cputime_t gtime;
|
2015-06-30 17:30:54 +08:00
|
|
|
struct prev_cputime prev_cputime;
|
2012-12-17 03:00:34 +08:00
|
|
|
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
|
2015-11-19 23:47:34 +08:00
|
|
|
seqcount_t vtime_seqcount;
|
2012-12-17 03:00:34 +08:00
|
|
|
unsigned long long vtime_snap;
|
|
|
|
enum {
|
2015-11-19 23:47:30 +08:00
|
|
|
/* Task is sleeping or running in a CPU with VTIME inactive */
|
|
|
|
VTIME_INACTIVE = 0,
|
|
|
|
/* Task runs in userspace in a CPU with VTIME active */
|
2012-12-17 03:00:34 +08:00
|
|
|
VTIME_USER,
|
2015-11-19 23:47:30 +08:00
|
|
|
/* Task runs in kernelspace in a CPU with VTIME active */
|
2012-12-17 03:00:34 +08:00
|
|
|
VTIME_SYS,
|
|
|
|
} vtime_snap_whence;
|
2009-12-02 16:26:47 +08:00
|
|
|
#endif
|
2015-06-07 21:54:30 +08:00
|
|
|
|
|
|
|
#ifdef CONFIG_NO_HZ_FULL
|
2016-03-24 22:38:00 +08:00
|
|
|
atomic_t tick_dep_mask;
|
2015-06-07 21:54:30 +08:00
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
unsigned long nvcsw, nivcsw; /* context switch counts */
|
2014-07-17 05:04:34 +08:00
|
|
|
u64 start_time; /* monotonic time in nsec */
|
2014-07-17 05:04:32 +08:00
|
|
|
u64 real_start_time; /* boot based time in nsec */
|
2005-04-17 06:20:36 +08:00
|
|
|
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
|
|
|
|
unsigned long min_flt, maj_flt;
|
|
|
|
|
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
struct task_cputime totals;
};
struct thread_group_cputime {
struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values from
from all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
|
|
|
struct task_cputime cputime_expires;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct list_head cpu_timers[3];
|
|
|
|
|
|
|
|
/* process credentials */
|
2010-02-25 02:45:09 +08:00
|
|
|
const struct cred __rcu *real_cred; /* objective and real subjective task
|
2008-11-14 07:39:26 +08:00
|
|
|
* credentials (COW) */
|
2010-02-25 02:45:09 +08:00
|
|
|
const struct cred __rcu *cred; /* effective (overridable) subjective task
|
2008-11-14 07:39:26 +08:00
|
|
|
* credentials (COW) */
|
2005-05-06 07:16:12 +08:00
|
|
|
char comm[TASK_COMM_LEN]; /* executable name excluding path
|
|
|
|
- access with [gs]et_task_comm (which lock
|
|
|
|
it with task_lock())
|
Split 'flush_old_exec' into two functions
'flush_old_exec()' is the point of no return when doing an execve(), and
it is pretty badly misnamed. It doesn't just flush the old executable
environment, it also starts up the new one.
Which is very inconvenient for things like setting up the new
personality, because we want the new personality to affect the starting
of the new environment, but at the same time we do _not_ want the new
personality to take effect if flushing the old one fails.
As a result, the x86-64 '32-bit' personality is actually done using this
insane "I'm going to change the ABI, but I haven't done it yet" bit
(TIF_ABI_PENDING), with SET_PERSONALITY() not actually setting the
personality, but just the "pending" bit, so that "flush_thread()" can do
the actual personality magic.
This patch in no way changes any of that insanity, but it does split the
'flush_old_exec()' function up into a preparatory part that can fail
(still called flush_old_exec()), and a new part that will actually set
up the new exec environment (setup_new_exec()). All callers are changed
to trivially comply with the new world order.
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-01-29 14:14:42 +08:00
|
|
|
- initialized normally by setup_new_exec */
|
2005-04-17 06:20:36 +08:00
|
|
|
/* file system info */
|
2015-03-23 10:37:38 +08:00
|
|
|
struct nameidata *nameidata;
|
2006-09-29 16:59:40 +08:00
|
|
|
#ifdef CONFIG_SYSVIPC
|
2005-04-17 06:20:36 +08:00
|
|
|
/* ipc stuff */
|
|
|
|
struct sysv_sem sysvsem;
|
shm: make exit_shm work proportional to task activity
This is small set of patches our team has had kicking around for a few
versions internally that fixes tasks getting hung on shm_exit when there
are many threads hammering it at once.
Anton wrote a simple test to cause the issue:
http://ozlabs.org/~anton/junkcode/bust_shm_exit.c
Before applying this patchset, this test code will cause either hanging
tracebacks or pthread out of memory errors.
After this patchset, it will still produce output like:
root@somehost:~# ./bust_shm_exit 1024 160
...
INFO: rcu_sched detected stalls on CPUs/tasks: {} (detected by 116, t=2111 jiffies, g=241, c=240, q=7113)
INFO: Stall ended before state dump start
...
But the task will continue to run along happily, so we consider this an
improvement over hanging, even if it's a bit noisy.
This patch (of 3):
exit_shm obtains the ipc_ns shm rwsem for write and holds it while it
walks every shared memory segment in the namespace. Thus the amount of
work is related to the number of shm segments in the namespace not the
number of segments that might need to be cleaned.
In addition, this occurs after the task has been notified the thread has
exited, so the number of tasks waiting for the ns shm rwsem can grow
without bound until memory is exausted.
Add a list to the task struct of all shmids allocated by this task. Init
the list head in copy_process. Use the ns->rwsem for locking. Add
segments after id is added, remove before removing from id.
On unshare of NEW_IPCNS orphan any ids as if the task had exited, similar
to handling of semaphore undo.
I chose a define for the init sequence since its a simple list init,
otherwise it would require a function call to avoid include loops between
the semaphore code and the task struct. Converting the list_del to
list_del_init for the unshare cases would remove the exit followed by
init, but I left it blow up if not inited.
Signed-off-by: Milton Miller <miltonm@bga.com>
Signed-off-by: Jack Miller <millerjo@us.ibm.com>
Cc: Davidlohr Bueso <davidlohr@hp.com>
Cc: Manfred Spraul <manfred@colorfullife.com>
Cc: Anton Blanchard <anton@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 05:23:19 +08:00
|
|
|
struct sysv_shm sysvshm;
|
2006-09-29 16:59:40 +08:00
|
|
|
#endif
|
2009-01-16 03:08:40 +08:00
|
|
|
#ifdef CONFIG_DETECT_HUNG_TASK
|
2008-01-26 04:08:02 +08:00
|
|
|
/* hung task detection */
|
|
|
|
unsigned long last_switch_count;
|
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
/* filesystem information */
|
|
|
|
struct fs_struct *fs;
|
|
|
|
/* open file information */
|
|
|
|
struct files_struct *files;
|
2006-10-02 17:18:08 +08:00
|
|
|
/* namespaces */
|
2006-10-02 17:18:06 +08:00
|
|
|
struct nsproxy *nsproxy;
|
2005-04-17 06:20:36 +08:00
|
|
|
/* signal handlers */
|
|
|
|
struct signal_struct *signal;
|
|
|
|
struct sighand_struct *sighand;
|
|
|
|
|
|
|
|
sigset_t blocked, real_blocked;
|
2008-04-30 15:53:09 +08:00
|
|
|
sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
|
2005-04-17 06:20:36 +08:00
|
|
|
struct sigpending pending;
|
|
|
|
|
|
|
|
unsigned long sas_ss_sp;
|
|
|
|
size_t sas_ss_size;
|
2015-11-07 08:32:19 +08:00
|
|
|
|
2012-06-27 15:07:19 +08:00
|
|
|
struct callback_head *task_works;
|
2012-05-11 08:59:07 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
struct audit_context *audit_context;
|
2008-01-10 17:53:18 +08:00
|
|
|
#ifdef CONFIG_AUDITSYSCALL
|
2012-09-11 13:39:43 +08:00
|
|
|
kuid_t loginuid;
|
2008-01-08 23:06:53 +08:00
|
|
|
unsigned int sessionid;
|
2008-01-10 17:53:18 +08:00
|
|
|
#endif
|
2012-04-13 05:47:54 +08:00
|
|
|
struct seccomp seccomp;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Thread group tracking */
|
|
|
|
u32 parent_exec_id;
|
|
|
|
u32 self_exec_id;
|
cpuset,mm: update tasks' mems_allowed in time
Fix allocating page cache/slab object on the unallowed node when memory
spread is set by updating tasks' mems_allowed after its cpuset's mems is
changed.
In order to update tasks' mems_allowed in time, we must modify the code of
memory policy. Because the memory policy is applied in the process's
context originally. After applying this patch, one task directly
manipulates anothers mems_allowed, and we use alloc_lock in the
task_struct to protect mems_allowed and memory policy of the task.
But in the fast path, we didn't use lock to protect them, because adding a
lock may lead to performance regression. But if we don't add a lock,the
task might see no nodes when changing cpuset's mems_allowed to some
non-overlapping set. In order to avoid it, we set all new allowed nodes,
then clear newly disallowed ones.
[lee.schermerhorn@hp.com:
The rework of mpol_new() to extract the adjusting of the node mask to
apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind()
with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local
allocation. Fix this by adding the check for MPOL_PREFERRED and empty
node mask to mpol_new_mpolicy().
Remove the now unneeded 'nodes = NULL' from mpol_new().
Note that mpol_new_mempolicy() is always called with a non-NULL
'nodes' parameter now that it has been removed from mpol_new().
Therefore, we don't need to test nodes for NULL before testing it for
'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to
verify this assumption.]
[lee.schermerhorn@hp.com:
I don't think the function name 'mpol_new_mempolicy' is descriptive
enough to differentiate it from mpol_new().
This function applies cpuset set context, usually constraining nodes
to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag
is set, it also translates the nodes. So I settled on
'mpol_set_nodemask()', because the comment block for mpol_new() mentions
that we need to call this function to "set nodes".
Some additional minor line length, whitespace and typo cleanup.]
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Paul Menage <menage@google.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-17 06:31:49 +08:00
|
|
|
/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
|
|
|
|
* mempolicy */
|
2005-04-17 06:20:36 +08:00
|
|
|
spinlock_t alloc_lock;
|
|
|
|
|
2006-06-27 17:54:51 +08:00
|
|
|
/* Protection of the PI data structures: */
|
2009-11-17 21:54:03 +08:00
|
|
|
raw_spinlock_t pi_lock;
|
2006-06-27 17:54:51 +08:00
|
|
|
|
2015-05-01 23:27:50 +08:00
|
|
|
struct wake_q_node wake_q;
|
|
|
|
|
2006-06-27 17:54:53 +08:00
|
|
|
#ifdef CONFIG_RT_MUTEXES
|
|
|
|
/* PI waiters blocked on a rt_mutex held by this task */
|
rtmutex: Turn the plist into an rb-tree
Turn the pi-chains from plist to rb-tree, in the rt_mutex code,
and provide a proper comparison function for -deadline and
-priority tasks.
This is done mainly because:
- classical prio field of the plist is just an int, which might
not be enough for representing a deadline;
- manipulating such a list would become O(nr_deadline_tasks),
which might be to much, as the number of -deadline task increases.
Therefore, an rb-tree is used, and tasks are queued in it according
to the following logic:
- among two -priority (i.e., SCHED_BATCH/OTHER/RR/FIFO) tasks, the
one with the higher (lower, actually!) prio wins;
- among a -priority and a -deadline task, the latter always wins;
- among two -deadline tasks, the one with the earliest deadline
wins.
Queueing and dequeueing functions are changed accordingly, for both
the list of a task's pi-waiters and the list of tasks blocked on
a pi-lock.
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Signed-off-by: Dario Faggioli <raistlin@linux.it>
Signed-off-by: Juri Lelli <juri.lelli@gmail.com>
Signed-off-again-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1383831828-15501-10-git-send-email-juri.lelli@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-11-07 21:43:43 +08:00
|
|
|
struct rb_root pi_waiters;
|
|
|
|
struct rb_node *pi_waiters_leftmost;
|
2006-06-27 17:54:53 +08:00
|
|
|
/* Deadlock detection and priority inheritance handling */
|
|
|
|
struct rt_mutex_waiter *pi_blocked_on;
|
|
|
|
#endif
|
|
|
|
|
2006-01-10 07:59:20 +08:00
|
|
|
#ifdef CONFIG_DEBUG_MUTEXES
|
|
|
|
/* mutex deadlock detection */
|
|
|
|
struct mutex_waiter *blocked_on;
|
|
|
|
#endif
|
2006-07-03 15:24:42 +08:00
|
|
|
#ifdef CONFIG_TRACE_IRQFLAGS
|
|
|
|
unsigned int irq_events;
|
|
|
|
unsigned long hardirq_enable_ip;
|
|
|
|
unsigned long hardirq_disable_ip;
|
2009-11-30 13:59:44 +08:00
|
|
|
unsigned int hardirq_enable_event;
|
2006-07-03 15:24:42 +08:00
|
|
|
unsigned int hardirq_disable_event;
|
2009-11-30 13:59:44 +08:00
|
|
|
int hardirqs_enabled;
|
|
|
|
int hardirq_context;
|
2006-07-03 15:24:42 +08:00
|
|
|
unsigned long softirq_disable_ip;
|
|
|
|
unsigned long softirq_enable_ip;
|
2009-11-30 13:59:44 +08:00
|
|
|
unsigned int softirq_disable_event;
|
2006-07-03 15:24:42 +08:00
|
|
|
unsigned int softirq_enable_event;
|
2009-11-30 13:59:44 +08:00
|
|
|
int softirqs_enabled;
|
2006-07-03 15:24:42 +08:00
|
|
|
int softirq_context;
|
|
|
|
#endif
|
[PATCH] lockdep: core
Do 'make oldconfig' and accept all the defaults for new config options -
reboot into the kernel and if everything goes well it should boot up fine and
you should have /proc/lockdep and /proc/lockdep_stats files.
Typically if the lock validator finds some problem it will print out
voluminous debug output that begins with "BUG: ..." and which syslog output
can be used by kernel developers to figure out the precise locking scenario.
What does the lock validator do? It "observes" and maps all locking rules as
they occur dynamically (as triggered by the kernel's natural use of spinlocks,
rwlocks, mutexes and rwsems). Whenever the lock validator subsystem detects a
new locking scenario, it validates this new rule against the existing set of
rules. If this new rule is consistent with the existing set of rules then the
new rule is added transparently and the kernel continues as normal. If the
new rule could create a deadlock scenario then this condition is printed out.
When determining validity of locking, all possible "deadlock scenarios" are
considered: assuming arbitrary number of CPUs, arbitrary irq context and task
context constellations, running arbitrary combinations of all the existing
locking scenarios. In a typical system this means millions of separate
scenarios. This is why we call it a "locking correctness" validator - for all
rules that are observed the lock validator proves it with mathematical
certainty that a deadlock could not occur (assuming that the lock validator
implementation itself is correct and its internal data structures are not
corrupted by some other kernel subsystem). [see more details and conditionals
of this statement in include/linux/lockdep.h and
Documentation/lockdep-design.txt]
Furthermore, this "all possible scenarios" property of the validator also
enables the finding of complex, highly unlikely multi-CPU multi-context races
via single single-context rules, increasing the likelyhood of finding bugs
drastically. In practical terms: the lock validator already found a bug in
the upstream kernel that could only occur on systems with 3 or more CPUs, and
which needed 3 very unlikely code sequences to occur at once on the 3 CPUs.
That bug was found and reported on a single-CPU system (!). So in essence a
race will be found "piecemail-wise", triggering all the necessary components
for the race, without having to reproduce the race scenario itself! In its
short existence the lock validator found and reported many bugs before they
actually caused a real deadlock.
To further increase the efficiency of the validator, the mapping is not per
"lock instance", but per "lock-class". For example, all struct inode objects
in the kernel have inode->inotify_mutex. If there are 10,000 inodes cached,
then there are 10,000 lock objects. But ->inotify_mutex is a single "lock
type", and all locking activities that occur against ->inotify_mutex are
"unified" into this single lock-class. The advantage of the lock-class
approach is that all historical ->inotify_mutex uses are mapped into a single
(and as narrow as possible) set of locking rules - regardless of how many
different tasks or inode structures it took to build this set of rules. The
set of rules persist during the lifetime of the kernel.
To see the rough magnitude of checking that the lock validator does, here's a
portion of /proc/lockdep_stats, fresh after bootup:
lock-classes: 694 [max: 2048]
direct dependencies: 1598 [max: 8192]
indirect dependencies: 17896
all direct dependencies: 16206
dependency chains: 1910 [max: 8192]
in-hardirq chains: 17
in-softirq chains: 105
in-process chains: 1065
stack-trace entries: 38761 [max: 131072]
combined max dependencies: 2033928
hardirq-safe locks: 24
hardirq-unsafe locks: 176
softirq-safe locks: 53
softirq-unsafe locks: 137
irq-safe locks: 59
irq-unsafe locks: 176
The lock validator has observed 1598 actual single-thread locking patterns,
and has validated all possible 2033928 distinct locking scenarios.
More details about the design of the lock validator can be found in
Documentation/lockdep-design.txt, which can also found at:
http://redhat.com/~mingo/lockdep-patches/lockdep-design.txt
[bunk@stusta.de: cleanups]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-07-03 15:24:50 +08:00
|
|
|
#ifdef CONFIG_LOCKDEP
|
2008-02-26 06:02:48 +08:00
|
|
|
# define MAX_LOCK_DEPTH 48UL
|
[PATCH] lockdep: core
Do 'make oldconfig' and accept all the defaults for new config options -
reboot into the kernel and if everything goes well it should boot up fine and
you should have /proc/lockdep and /proc/lockdep_stats files.
Typically if the lock validator finds some problem it will print out
voluminous debug output that begins with "BUG: ..." and which syslog output
can be used by kernel developers to figure out the precise locking scenario.
What does the lock validator do? It "observes" and maps all locking rules as
they occur dynamically (as triggered by the kernel's natural use of spinlocks,
rwlocks, mutexes and rwsems). Whenever the lock validator subsystem detects a
new locking scenario, it validates this new rule against the existing set of
rules. If this new rule is consistent with the existing set of rules then the
new rule is added transparently and the kernel continues as normal. If the
new rule could create a deadlock scenario then this condition is printed out.
When determining validity of locking, all possible "deadlock scenarios" are
considered: assuming arbitrary number of CPUs, arbitrary irq context and task
context constellations, running arbitrary combinations of all the existing
locking scenarios. In a typical system this means millions of separate
scenarios. This is why we call it a "locking correctness" validator - for all
rules that are observed the lock validator proves it with mathematical
certainty that a deadlock could not occur (assuming that the lock validator
implementation itself is correct and its internal data structures are not
corrupted by some other kernel subsystem). [see more details and conditionals
of this statement in include/linux/lockdep.h and
Documentation/lockdep-design.txt]
Furthermore, this "all possible scenarios" property of the validator also
enables the finding of complex, highly unlikely multi-CPU multi-context races
via single single-context rules, increasing the likelyhood of finding bugs
drastically. In practical terms: the lock validator already found a bug in
the upstream kernel that could only occur on systems with 3 or more CPUs, and
which needed 3 very unlikely code sequences to occur at once on the 3 CPUs.
That bug was found and reported on a single-CPU system (!). So in essence a
race will be found "piecemail-wise", triggering all the necessary components
for the race, without having to reproduce the race scenario itself! In its
short existence the lock validator found and reported many bugs before they
actually caused a real deadlock.
To further increase the efficiency of the validator, the mapping is not per
"lock instance", but per "lock-class". For example, all struct inode objects
in the kernel have inode->inotify_mutex. If there are 10,000 inodes cached,
then there are 10,000 lock objects. But ->inotify_mutex is a single "lock
type", and all locking activities that occur against ->inotify_mutex are
"unified" into this single lock-class. The advantage of the lock-class
approach is that all historical ->inotify_mutex uses are mapped into a single
(and as narrow as possible) set of locking rules - regardless of how many
different tasks or inode structures it took to build this set of rules. The
set of rules persist during the lifetime of the kernel.
To see the rough magnitude of checking that the lock validator does, here's a
portion of /proc/lockdep_stats, fresh after bootup:
lock-classes: 694 [max: 2048]
direct dependencies: 1598 [max: 8192]
indirect dependencies: 17896
all direct dependencies: 16206
dependency chains: 1910 [max: 8192]
in-hardirq chains: 17
in-softirq chains: 105
in-process chains: 1065
stack-trace entries: 38761 [max: 131072]
combined max dependencies: 2033928
hardirq-safe locks: 24
hardirq-unsafe locks: 176
softirq-safe locks: 53
softirq-unsafe locks: 137
irq-safe locks: 59
irq-unsafe locks: 176
The lock validator has observed 1598 actual single-thread locking patterns,
and has validated all possible 2033928 distinct locking scenarios.
More details about the design of the lock validator can be found in
Documentation/lockdep-design.txt, which can also found at:
http://redhat.com/~mingo/lockdep-patches/lockdep-design.txt
[bunk@stusta.de: cleanups]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-07-03 15:24:50 +08:00
|
|
|
u64 curr_chain_key;
|
|
|
|
int lockdep_depth;
|
|
|
|
unsigned int lockdep_recursion;
|
2008-05-15 19:09:15 +08:00
|
|
|
struct held_lock held_locks[MAX_LOCK_DEPTH];
|
lockdep: annotate reclaim context (__GFP_NOFS)
Here is another version, with the incremental patch rolled up, and
added reclaim context annotation to kswapd, and allocation tracing
to slab allocators (which may only ever reach the page allocator
in rare cases, so it is good to put annotations here too).
Haven't tested this version as such, but it should be getting closer
to merge worthy ;)
--
After noticing some code in mm/filemap.c accidentally perform a __GFP_FS
allocation when it should not have been, I thought it might be a good idea to
try to catch this kind of thing with lockdep.
I coded up a little idea that seems to work. Unfortunately the system has to
actually be in __GFP_FS page reclaim, then take the lock, before it will mark
it. But at least that might still be some orders of magnitude more common
(and more debuggable) than an actual deadlock condition, so we have some
improvement I hope (the concept is no less complete than discovery of a lock's
interrupt contexts).
I guess we could even do the same thing with __GFP_IO (normal reclaim), and
even GFP_NOIO locks too... but filesystems will have the most locks and fiddly
code paths, so let's start there and see how it goes.
It *seems* to work. I did a quick test.
=================================
[ INFO: inconsistent lock state ]
2.6.28-rc6-00007-ged31348-dirty #26
---------------------------------
inconsistent {in-reclaim-W} -> {ov-reclaim-W} usage.
modprobe/8526 [HC0[0]:SC0[0]:HE1:SE1] takes:
(testlock){--..}, at: [<ffffffffa0020055>] brd_init+0x55/0x216 [brd]
{in-reclaim-W} state was registered at:
[<ffffffff80267bdb>] __lock_acquire+0x75b/0x1a60
[<ffffffff80268f71>] lock_acquire+0x91/0xc0
[<ffffffff8070f0e1>] mutex_lock_nested+0xb1/0x310
[<ffffffffa002002b>] brd_init+0x2b/0x216 [brd]
[<ffffffff8020903b>] _stext+0x3b/0x170
[<ffffffff80272ebf>] sys_init_module+0xaf/0x1e0
[<ffffffff8020c3fb>] system_call_fastpath+0x16/0x1b
[<ffffffffffffffff>] 0xffffffffffffffff
irq event stamp: 3929
hardirqs last enabled at (3929): [<ffffffff8070f2b5>] mutex_lock_nested+0x285/0x310
hardirqs last disabled at (3928): [<ffffffff8070f089>] mutex_lock_nested+0x59/0x310
softirqs last enabled at (3732): [<ffffffff8061f623>] sk_filter+0x83/0xe0
softirqs last disabled at (3730): [<ffffffff8061f5b6>] sk_filter+0x16/0xe0
other info that might help us debug this:
1 lock held by modprobe/8526:
#0: (testlock){--..}, at: [<ffffffffa0020055>] brd_init+0x55/0x216 [brd]
stack backtrace:
Pid: 8526, comm: modprobe Not tainted 2.6.28-rc6-00007-ged31348-dirty #26
Call Trace:
[<ffffffff80265483>] print_usage_bug+0x193/0x1d0
[<ffffffff80266530>] mark_lock+0xaf0/0xca0
[<ffffffff80266735>] mark_held_locks+0x55/0xc0
[<ffffffffa0020000>] ? brd_init+0x0/0x216 [brd]
[<ffffffff802667ca>] trace_reclaim_fs+0x2a/0x60
[<ffffffff80285005>] __alloc_pages_internal+0x475/0x580
[<ffffffff8070f29e>] ? mutex_lock_nested+0x26e/0x310
[<ffffffffa0020000>] ? brd_init+0x0/0x216 [brd]
[<ffffffffa002006a>] brd_init+0x6a/0x216 [brd]
[<ffffffffa0020000>] ? brd_init+0x0/0x216 [brd]
[<ffffffff8020903b>] _stext+0x3b/0x170
[<ffffffff8070f8b9>] ? mutex_unlock+0x9/0x10
[<ffffffff8070f83d>] ? __mutex_unlock_slowpath+0x10d/0x180
[<ffffffff802669ec>] ? trace_hardirqs_on_caller+0x12c/0x190
[<ffffffff80272ebf>] sys_init_module+0xaf/0x1e0
[<ffffffff8020c3fb>] system_call_fastpath+0x16/0x1b
Signed-off-by: Nick Piggin <npiggin@suse.de>
Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-01-21 15:12:39 +08:00
|
|
|
gfp_t lockdep_reclaim_gfp;
|
[PATCH] lockdep: core
Do 'make oldconfig' and accept all the defaults for new config options -
reboot into the kernel and if everything goes well it should boot up fine and
you should have /proc/lockdep and /proc/lockdep_stats files.
Typically if the lock validator finds some problem it will print out
voluminous debug output that begins with "BUG: ..." and which syslog output
can be used by kernel developers to figure out the precise locking scenario.
What does the lock validator do? It "observes" and maps all locking rules as
they occur dynamically (as triggered by the kernel's natural use of spinlocks,
rwlocks, mutexes and rwsems). Whenever the lock validator subsystem detects a
new locking scenario, it validates this new rule against the existing set of
rules. If this new rule is consistent with the existing set of rules then the
new rule is added transparently and the kernel continues as normal. If the
new rule could create a deadlock scenario then this condition is printed out.
When determining validity of locking, all possible "deadlock scenarios" are
considered: assuming arbitrary number of CPUs, arbitrary irq context and task
context constellations, running arbitrary combinations of all the existing
locking scenarios. In a typical system this means millions of separate
scenarios. This is why we call it a "locking correctness" validator - for all
rules that are observed the lock validator proves it with mathematical
certainty that a deadlock could not occur (assuming that the lock validator
implementation itself is correct and its internal data structures are not
corrupted by some other kernel subsystem). [see more details and conditionals
of this statement in include/linux/lockdep.h and
Documentation/lockdep-design.txt]
Furthermore, this "all possible scenarios" property of the validator also
enables the finding of complex, highly unlikely multi-CPU multi-context races
via single single-context rules, increasing the likelyhood of finding bugs
drastically. In practical terms: the lock validator already found a bug in
the upstream kernel that could only occur on systems with 3 or more CPUs, and
which needed 3 very unlikely code sequences to occur at once on the 3 CPUs.
That bug was found and reported on a single-CPU system (!). So in essence a
race will be found "piecemail-wise", triggering all the necessary components
for the race, without having to reproduce the race scenario itself! In its
short existence the lock validator found and reported many bugs before they
actually caused a real deadlock.
To further increase the efficiency of the validator, the mapping is not per
"lock instance", but per "lock-class". For example, all struct inode objects
in the kernel have inode->inotify_mutex. If there are 10,000 inodes cached,
then there are 10,000 lock objects. But ->inotify_mutex is a single "lock
type", and all locking activities that occur against ->inotify_mutex are
"unified" into this single lock-class. The advantage of the lock-class
approach is that all historical ->inotify_mutex uses are mapped into a single
(and as narrow as possible) set of locking rules - regardless of how many
different tasks or inode structures it took to build this set of rules. The
set of rules persist during the lifetime of the kernel.
To see the rough magnitude of checking that the lock validator does, here's a
portion of /proc/lockdep_stats, fresh after bootup:
lock-classes: 694 [max: 2048]
direct dependencies: 1598 [max: 8192]
indirect dependencies: 17896
all direct dependencies: 16206
dependency chains: 1910 [max: 8192]
in-hardirq chains: 17
in-softirq chains: 105
in-process chains: 1065
stack-trace entries: 38761 [max: 131072]
combined max dependencies: 2033928
hardirq-safe locks: 24
hardirq-unsafe locks: 176
softirq-safe locks: 53
softirq-unsafe locks: 137
irq-safe locks: 59
irq-unsafe locks: 176
The lock validator has observed 1598 actual single-thread locking patterns,
and has validated all possible 2033928 distinct locking scenarios.
More details about the design of the lock validator can be found in
Documentation/lockdep-design.txt, which can also found at:
http://redhat.com/~mingo/lockdep-patches/lockdep-design.txt
[bunk@stusta.de: cleanups]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Signed-off-by: Adrian Bunk <bunk@stusta.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-07-03 15:24:50 +08:00
|
|
|
#endif
|
2016-01-21 07:00:55 +08:00
|
|
|
#ifdef CONFIG_UBSAN
|
|
|
|
unsigned int in_ubsan;
|
|
|
|
#endif
|
2006-01-10 07:59:20 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* journalling filesystem info */
|
|
|
|
void *journal_info;
|
|
|
|
|
When stacked block devices are in-use (e.g. md or dm), the recursive calls
to generic_make_request can use up a lot of space, and we would rather they
didn't.
As generic_make_request is a void function, and as it is generally not
expected that it will have any effect immediately, it is safe to delay any
call to generic_make_request until there is sufficient stack space
available.
As ->bi_next is reserved for the driver to use, it can have no valid value
when generic_make_request is called, and as __make_request implicitly
assumes it will be NULL (ELEVATOR_BACK_MERGE fork of switch) we can be
certain that all callers set it to NULL. We can therefore safely use
bi_next to link pending requests together, providing we clear it before
making the real call.
So, we choose to allow each thread to only be active in one
generic_make_request at a time. If a subsequent (recursive) call is made,
the bio is linked into a per-thread list, and is handled when the active
call completes.
As the list of pending bios is per-thread, there are no locking issues to
worry about.
I say above that it is "safe to delay any call...". There are, however,
some behaviours of a make_request_fn which would make it unsafe. These
include any behaviour that assumes anything will have changed after a
recursive call to generic_make_request.
These could include:
- waiting for that call to finish and call it's bi_end_io function.
md use to sometimes do this (marking the superblock dirty before
completing a write) but doesn't any more
- inspecting the bio for fields that generic_make_request might
change, such as bi_sector or bi_bdev. It is hard to see a good
reason for this, and I don't think anyone actually does it.
- inspecing the queue to see if, e.g. it is 'full' yet. Again, I
think this is very unlikely to be useful, or to be done.
Signed-off-by: Neil Brown <neilb@suse.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: <dm-devel@redhat.com>
Alasdair G Kergon <agk@redhat.com> said:
I can see nothing wrong with this in principle.
For device-mapper at the moment though it's essential that, while the bio
mappings may now get delayed, they still get processed in exactly
the same order as they were passed to generic_make_request().
My main concern is whether the timing changes implicit in this patch
will make the rare data-corrupting races in the existing snapshot code
more likely. (I'm working on a fix for these races, but the unfinished
patch is already several hundred lines long.)
It would be helpful if some people on this mailing list would test
this patch in various scenarios and report back.
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
2007-05-01 15:53:42 +08:00
|
|
|
/* stacked block device info */
|
2010-02-23 15:55:42 +08:00
|
|
|
struct bio_list *bio_list;
|
When stacked block devices are in-use (e.g. md or dm), the recursive calls
to generic_make_request can use up a lot of space, and we would rather they
didn't.
As generic_make_request is a void function, and as it is generally not
expected that it will have any effect immediately, it is safe to delay any
call to generic_make_request until there is sufficient stack space
available.
As ->bi_next is reserved for the driver to use, it can have no valid value
when generic_make_request is called, and as __make_request implicitly
assumes it will be NULL (ELEVATOR_BACK_MERGE fork of switch) we can be
certain that all callers set it to NULL. We can therefore safely use
bi_next to link pending requests together, providing we clear it before
making the real call.
So, we choose to allow each thread to only be active in one
generic_make_request at a time. If a subsequent (recursive) call is made,
the bio is linked into a per-thread list, and is handled when the active
call completes.
As the list of pending bios is per-thread, there are no locking issues to
worry about.
I say above that it is "safe to delay any call...". There are, however,
some behaviours of a make_request_fn which would make it unsafe. These
include any behaviour that assumes anything will have changed after a
recursive call to generic_make_request.
These could include:
- waiting for that call to finish and call it's bi_end_io function.
md use to sometimes do this (marking the superblock dirty before
completing a write) but doesn't any more
- inspecting the bio for fields that generic_make_request might
change, such as bi_sector or bi_bdev. It is hard to see a good
reason for this, and I don't think anyone actually does it.
- inspecing the queue to see if, e.g. it is 'full' yet. Again, I
think this is very unlikely to be useful, or to be done.
Signed-off-by: Neil Brown <neilb@suse.de>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: <dm-devel@redhat.com>
Alasdair G Kergon <agk@redhat.com> said:
I can see nothing wrong with this in principle.
For device-mapper at the moment though it's essential that, while the bio
mappings may now get delayed, they still get processed in exactly
the same order as they were passed to generic_make_request().
My main concern is whether the timing changes implicit in this patch
will make the rare data-corrupting races in the existing snapshot code
more likely. (I'm working on a fix for these races, but the unfinished
patch is already several hundred lines long.)
It would be helpful if some people on this mailing list would test
this patch in various scenarios and report back.
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
2007-05-01 15:53:42 +08:00
|
|
|
|
2011-03-08 20:19:51 +08:00
|
|
|
#ifdef CONFIG_BLOCK
|
|
|
|
/* stack plugging */
|
|
|
|
struct blk_plug *plug;
|
|
|
|
#endif
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* VM state */
|
|
|
|
struct reclaim_state *reclaim_state;
|
|
|
|
|
|
|
|
struct backing_dev_info *backing_dev_info;
|
|
|
|
|
|
|
|
struct io_context *io_context;
|
|
|
|
|
|
|
|
unsigned long ptrace_message;
|
|
|
|
siginfo_t *last_siginfo; /* For ptrace use. */
|
2006-12-10 18:19:19 +08:00
|
|
|
struct task_io_accounting ioac;
|
2006-10-01 14:28:59 +08:00
|
|
|
#if defined(CONFIG_TASK_XACCT)
|
2005-04-17 06:20:36 +08:00
|
|
|
u64 acct_rss_mem1; /* accumulated rss usage */
|
|
|
|
u64 acct_vm_mem1; /* accumulated virtual memory usage */
|
2008-07-25 16:48:40 +08:00
|
|
|
cputime_t acct_timexpd; /* stime + utime since last update */
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_CPUSETS
|
cpuset,mm: update tasks' mems_allowed in time
Fix allocating page cache/slab object on the unallowed node when memory
spread is set by updating tasks' mems_allowed after its cpuset's mems is
changed.
In order to update tasks' mems_allowed in time, we must modify the code of
memory policy. Because the memory policy is applied in the process's
context originally. After applying this patch, one task directly
manipulates anothers mems_allowed, and we use alloc_lock in the
task_struct to protect mems_allowed and memory policy of the task.
But in the fast path, we didn't use lock to protect them, because adding a
lock may lead to performance regression. But if we don't add a lock,the
task might see no nodes when changing cpuset's mems_allowed to some
non-overlapping set. In order to avoid it, we set all new allowed nodes,
then clear newly disallowed ones.
[lee.schermerhorn@hp.com:
The rework of mpol_new() to extract the adjusting of the node mask to
apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind()
with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local
allocation. Fix this by adding the check for MPOL_PREFERRED and empty
node mask to mpol_new_mpolicy().
Remove the now unneeded 'nodes = NULL' from mpol_new().
Note that mpol_new_mempolicy() is always called with a non-NULL
'nodes' parameter now that it has been removed from mpol_new().
Therefore, we don't need to test nodes for NULL before testing it for
'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to
verify this assumption.]
[lee.schermerhorn@hp.com:
I don't think the function name 'mpol_new_mempolicy' is descriptive
enough to differentiate it from mpol_new().
This function applies cpuset set context, usually constraining nodes
to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag
is set, it also translates the nodes. So I settled on
'mpol_set_nodemask()', because the comment block for mpol_new() mentions
that we need to call this function to "set nodes".
Some additional minor line length, whitespace and typo cleanup.]
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Paul Menage <menage@google.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-17 06:31:49 +08:00
|
|
|
nodemask_t mems_allowed; /* Protected by alloc_lock */
|
cpuset: mm: reduce large amounts of memory barrier related damage v3
Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when
changing cpuset's mems") wins a super prize for the largest number of
memory barriers entered into fast paths for one commit.
[get|put]_mems_allowed is incredibly heavy with pairs of full memory
barriers inserted into a number of hot paths. This was detected while
investigating at large page allocator slowdown introduced some time
after 2.6.32. The largest portion of this overhead was shown by
oprofile to be at an mfence introduced by this commit into the page
allocator hot path.
For extra style points, the commit introduced the use of yield() in an
implementation of what looks like a spinning mutex.
This patch replaces the full memory barriers on both read and write
sides with a sequence counter with just read barriers on the fast path
side. This is much cheaper on some architectures, including x86. The
main bulk of the patch is the retry logic if the nodemask changes in a
manner that can cause a false failure.
While updating the nodemask, a check is made to see if a false failure
is a risk. If it is, the sequence number gets bumped and parallel
allocators will briefly stall while the nodemask update takes place.
In a page fault test microbenchmark, oprofile samples from
__alloc_pages_nodemask went from 4.53% of all samples to 1.15%. The
actual results were
3.3.0-rc3 3.3.0-rc3
rc3-vanilla nobarrier-v2r1
Clients 1 UserTime 0.07 ( 0.00%) 0.08 (-14.19%)
Clients 2 UserTime 0.07 ( 0.00%) 0.07 ( 2.72%)
Clients 4 UserTime 0.08 ( 0.00%) 0.07 ( 3.29%)
Clients 1 SysTime 0.70 ( 0.00%) 0.65 ( 6.65%)
Clients 2 SysTime 0.85 ( 0.00%) 0.82 ( 3.65%)
Clients 4 SysTime 1.41 ( 0.00%) 1.41 ( 0.32%)
Clients 1 WallTime 0.77 ( 0.00%) 0.74 ( 4.19%)
Clients 2 WallTime 0.47 ( 0.00%) 0.45 ( 3.73%)
Clients 4 WallTime 0.38 ( 0.00%) 0.37 ( 1.58%)
Clients 1 Flt/sec/cpu 497620.28 ( 0.00%) 520294.53 ( 4.56%)
Clients 2 Flt/sec/cpu 414639.05 ( 0.00%) 429882.01 ( 3.68%)
Clients 4 Flt/sec/cpu 257959.16 ( 0.00%) 258761.48 ( 0.31%)
Clients 1 Flt/sec 495161.39 ( 0.00%) 517292.87 ( 4.47%)
Clients 2 Flt/sec 820325.95 ( 0.00%) 850289.77 ( 3.65%)
Clients 4 Flt/sec 1020068.93 ( 0.00%) 1022674.06 ( 0.26%)
MMTests Statistics: duration
Sys Time Running Test (seconds) 135.68 132.17
User+Sys Time Running Test (seconds) 164.2 160.13
Total Elapsed Time (seconds) 123.46 120.87
The overall improvement is small but the System CPU time is much
improved and roughly in correlation to what oprofile reported (these
performance figures are without profiling so skew is expected). The
actual number of page faults is noticeably improved.
For benchmarks like kernel builds, the overall benefit is marginal but
the system CPU time is slightly reduced.
To test the actual bug the commit fixed I opened two terminals. The
first ran within a cpuset and continually ran a small program that
faulted 100M of anonymous data. In a second window, the nodemask of the
cpuset was continually randomised in a loop.
Without the commit, the program would fail every so often (usually
within 10 seconds) and obviously with the commit everything worked fine.
With this patch applied, it also worked fine so the fix should be
functionally equivalent.
Signed-off-by: Mel Gorman <mgorman@suse.de>
Cc: Miao Xie <miaox@cn.fujitsu.com>
Cc: David Rientjes <rientjes@google.com>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Christoph Lameter <cl@linux.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 07:34:11 +08:00
|
|
|
seqcount_t mems_allowed_seq; /* Seqence no to catch updates */
|
[PATCH] cpuset memory spread basic implementation
This patch provides the implementation and cpuset interface for an alternative
memory allocation policy that can be applied to certain kinds of memory
allocations, such as the page cache (file system buffers) and some slab caches
(such as inode caches).
The policy is called "memory spreading." If enabled, it spreads out these
kinds of memory allocations over all the nodes allowed to a task, instead of
preferring to place them on the node where the task is executing.
All other kinds of allocations, including anonymous pages for a tasks stack
and data regions, are not affected by this policy choice, and continue to be
allocated preferring the node local to execution, as modified by the NUMA
mempolicy.
There are two boolean flag files per cpuset that control where the kernel
allocates pages for the file system buffers and related in kernel data
structures. They are called 'memory_spread_page' and 'memory_spread_slab'.
If the per-cpuset boolean flag file 'memory_spread_page' is set, then the
kernel will spread the file system buffers (page cache) evenly over all the
nodes that the faulting task is allowed to use, instead of preferring to put
those pages on the node where the task is running.
If the per-cpuset boolean flag file 'memory_spread_slab' is set, then the
kernel will spread some file system related slab caches, such as for inodes
and dentries evenly over all the nodes that the faulting task is allowed to
use, instead of preferring to put those pages on the node where the task is
running.
The implementation is simple. Setting the cpuset flags 'memory_spread_page'
or 'memory_spread_cache' turns on the per-process flags PF_SPREAD_PAGE or
PF_SPREAD_SLAB, respectively, for each task that is in the cpuset or
subsequently joins that cpuset. In subsequent patches, the page allocation
calls for the affected page cache and slab caches are modified to perform an
inline check for these flags, and if set, a call to a new routine
cpuset_mem_spread_node() returns the node to prefer for the allocation.
The cpuset_mem_spread_node() routine is also simple. It uses the value of a
per-task rotor cpuset_mem_spread_rotor to select the next node in the current
tasks mems_allowed to prefer for the allocation.
This policy can provide substantial improvements for jobs that need to place
thread local data on the corresponding node, but that need to access large
file system data sets that need to be spread across the several nodes in the
jobs cpuset in order to fit. Without this patch, especially for jobs that
might have one thread reading in the data set, the memory allocation across
the nodes in the jobs cpuset can become very uneven.
A couple of Copyright year ranges are updated as well. And a couple of email
addresses that can be found in the MAINTAINERS file are removed.
Signed-off-by: Paul Jackson <pj@sgi.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-24 19:16:03 +08:00
|
|
|
int cpuset_mem_spread_rotor;
|
2010-05-27 05:42:49 +08:00
|
|
|
int cpuset_slab_spread_rotor;
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif
|
Task Control Groups: basic task cgroup framework
Generic Process Control Groups
--------------------------
There have recently been various proposals floating around for
resource management/accounting and other task grouping subsystems in
the kernel, including ResGroups, User BeanCounters, NSProxy
cgroups, and others. These all need the basic abstraction of being
able to group together multiple processes in an aggregate, in order to
track/limit the resources permitted to those processes, or control
other behaviour of the processes, and all implement this grouping in
different ways.
This patchset provides a framework for tracking and grouping processes
into arbitrary "cgroups" and assigning arbitrary state to those
groupings, in order to control the behaviour of the cgroup as an
aggregate.
The intention is that the various resource management and
virtualization/cgroup efforts can also become task cgroup
clients, with the result that:
- the userspace APIs are (somewhat) normalised
- it's easier to test e.g. the ResGroups CPU controller in
conjunction with the BeanCounters memory controller, or use either of
them as the resource-control portion of a virtual server system.
- the additional kernel footprint of any of the competing resource
management systems is substantially reduced, since it doesn't need
to provide process grouping/containment, hence improving their
chances of getting into the kernel
This patch:
Add the main task cgroups framework - the cgroup filesystem, and the
basic structures for tracking membership and associating subsystem state
objects to tasks.
Signed-off-by: Paul Menage <menage@google.com>
Cc: Serge E. Hallyn <serue@us.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-19 14:39:30 +08:00
|
|
|
#ifdef CONFIG_CGROUPS
|
2007-10-19 14:39:36 +08:00
|
|
|
/* Control Group info protected by css_set_lock */
|
2010-02-25 02:41:39 +08:00
|
|
|
struct css_set __rcu *cgroups;
|
2007-10-19 14:39:36 +08:00
|
|
|
/* cg_list protected by css_set_lock and tsk->alloc_lock */
|
|
|
|
struct list_head cg_list;
|
Task Control Groups: basic task cgroup framework
Generic Process Control Groups
--------------------------
There have recently been various proposals floating around for
resource management/accounting and other task grouping subsystems in
the kernel, including ResGroups, User BeanCounters, NSProxy
cgroups, and others. These all need the basic abstraction of being
able to group together multiple processes in an aggregate, in order to
track/limit the resources permitted to those processes, or control
other behaviour of the processes, and all implement this grouping in
different ways.
This patchset provides a framework for tracking and grouping processes
into arbitrary "cgroups" and assigning arbitrary state to those
groupings, in order to control the behaviour of the cgroup as an
aggregate.
The intention is that the various resource management and
virtualization/cgroup efforts can also become task cgroup
clients, with the result that:
- the userspace APIs are (somewhat) normalised
- it's easier to test e.g. the ResGroups CPU controller in
conjunction with the BeanCounters memory controller, or use either of
them as the resource-control portion of a virtual server system.
- the additional kernel footprint of any of the competing resource
management systems is substantially reduced, since it doesn't need
to provide process grouping/containment, hence improving their
chances of getting into the kernel
This patch:
Add the main task cgroups framework - the cgroup filesystem, and the
basic structures for tracking membership and associating subsystem state
objects to tasks.
Signed-off-by: Paul Menage <menage@google.com>
Cc: Serge E. Hallyn <serue@us.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-19 14:39:30 +08:00
|
|
|
#endif
|
2007-10-17 14:27:30 +08:00
|
|
|
#ifdef CONFIG_FUTEX
|
2006-03-27 17:16:22 +08:00
|
|
|
struct robust_list_head __user *robust_list;
|
2006-03-27 17:16:24 +08:00
|
|
|
#ifdef CONFIG_COMPAT
|
|
|
|
struct compat_robust_list_head __user *compat_robust_list;
|
|
|
|
#endif
|
2006-06-27 17:54:58 +08:00
|
|
|
struct list_head pi_state_list;
|
|
|
|
struct futex_pi_state *pi_state_cache;
|
2008-05-15 19:09:15 +08:00
|
|
|
#endif
|
perf: Do the big rename: Performance Counters -> Performance Events
Bye-bye Performance Counters, welcome Performance Events!
In the past few months the perfcounters subsystem has grown out its
initial role of counting hardware events, and has become (and is
becoming) a much broader generic event enumeration, reporting, logging,
monitoring, analysis facility.
Naming its core object 'perf_counter' and naming the subsystem
'perfcounters' has become more and more of a misnomer. With pending
code like hw-breakpoints support the 'counter' name is less and
less appropriate.
All in one, we've decided to rename the subsystem to 'performance
events' and to propagate this rename through all fields, variables
and API names. (in an ABI compatible fashion)
The word 'event' is also a bit shorter than 'counter' - which makes
it slightly more convenient to write/handle as well.
Thanks goes to Stephane Eranian who first observed this misnomer and
suggested a rename.
User-space tooling and ABI compatibility is not affected - this patch
should be function-invariant. (Also, defconfigs were not touched to
keep the size down.)
This patch has been generated via the following script:
FILES=$(find * -type f | grep -vE 'oprofile|[^K]config')
sed -i \
-e 's/PERF_EVENT_/PERF_RECORD_/g' \
-e 's/PERF_COUNTER/PERF_EVENT/g' \
-e 's/perf_counter/perf_event/g' \
-e 's/nb_counters/nb_events/g' \
-e 's/swcounter/swevent/g' \
-e 's/tpcounter_event/tp_event/g' \
$FILES
for N in $(find . -name perf_counter.[ch]); do
M=$(echo $N | sed 's/perf_counter/perf_event/g')
mv $N $M
done
FILES=$(find . -name perf_event.*)
sed -i \
-e 's/COUNTER_MASK/REG_MASK/g' \
-e 's/COUNTER/EVENT/g' \
-e 's/\<event\>/event_id/g' \
-e 's/counter/event/g' \
-e 's/Counter/Event/g' \
$FILES
... to keep it as correct as possible. This script can also be
used by anyone who has pending perfcounters patches - it converts
a Linux kernel tree over to the new naming. We tried to time this
change to the point in time where the amount of pending patches
is the smallest: the end of the merge window.
Namespace clashes were fixed up in a preparatory patch - and some
stylistic fallout will be fixed up in a subsequent patch.
( NOTE: 'counters' are still the proper terminology when we deal
with hardware registers - and these sed scripts are a bit
over-eager in renaming them. I've undone some of that, but
in case there's something left where 'counter' would be
better than 'event' we can undo that on an individual basis
instead of touching an otherwise nicely automated patch. )
Suggested-by: Stephane Eranian <eranian@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Reviewed-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <linux-arch@vger.kernel.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-21 18:02:48 +08:00
|
|
|
#ifdef CONFIG_PERF_EVENTS
|
2010-09-02 22:50:03 +08:00
|
|
|
struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
|
perf: Do the big rename: Performance Counters -> Performance Events
Bye-bye Performance Counters, welcome Performance Events!
In the past few months the perfcounters subsystem has grown out its
initial role of counting hardware events, and has become (and is
becoming) a much broader generic event enumeration, reporting, logging,
monitoring, analysis facility.
Naming its core object 'perf_counter' and naming the subsystem
'perfcounters' has become more and more of a misnomer. With pending
code like hw-breakpoints support the 'counter' name is less and
less appropriate.
All in one, we've decided to rename the subsystem to 'performance
events' and to propagate this rename through all fields, variables
and API names. (in an ABI compatible fashion)
The word 'event' is also a bit shorter than 'counter' - which makes
it slightly more convenient to write/handle as well.
Thanks goes to Stephane Eranian who first observed this misnomer and
suggested a rename.
User-space tooling and ABI compatibility is not affected - this patch
should be function-invariant. (Also, defconfigs were not touched to
keep the size down.)
This patch has been generated via the following script:
FILES=$(find * -type f | grep -vE 'oprofile|[^K]config')
sed -i \
-e 's/PERF_EVENT_/PERF_RECORD_/g' \
-e 's/PERF_COUNTER/PERF_EVENT/g' \
-e 's/perf_counter/perf_event/g' \
-e 's/nb_counters/nb_events/g' \
-e 's/swcounter/swevent/g' \
-e 's/tpcounter_event/tp_event/g' \
$FILES
for N in $(find . -name perf_counter.[ch]); do
M=$(echo $N | sed 's/perf_counter/perf_event/g')
mv $N $M
done
FILES=$(find . -name perf_event.*)
sed -i \
-e 's/COUNTER_MASK/REG_MASK/g' \
-e 's/COUNTER/EVENT/g' \
-e 's/\<event\>/event_id/g' \
-e 's/counter/event/g' \
-e 's/Counter/Event/g' \
$FILES
... to keep it as correct as possible. This script can also be
used by anyone who has pending perfcounters patches - it converts
a Linux kernel tree over to the new naming. We tried to time this
change to the point in time where the amount of pending patches
is the smallest: the end of the merge window.
Namespace clashes were fixed up in a preparatory patch - and some
stylistic fallout will be fixed up in a subsequent patch.
( NOTE: 'counters' are still the proper terminology when we deal
with hardware registers - and these sed scripts are a bit
over-eager in renaming them. I've undone some of that, but
in case there's something left where 'counter' would be
better than 'event' we can undo that on an individual basis
instead of touching an otherwise nicely automated patch. )
Suggested-by: Stephane Eranian <eranian@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Reviewed-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <linux-arch@vger.kernel.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-21 18:02:48 +08:00
|
|
|
struct mutex perf_event_mutex;
|
|
|
|
struct list_head perf_event_list;
|
perf_counter: Dynamically allocate tasks' perf_counter_context struct
This replaces the struct perf_counter_context in the task_struct with
a pointer to a dynamically allocated perf_counter_context struct. The
main reason for doing is this is to allow us to transfer a
perf_counter_context from one task to another when we do lazy PMU
switching in a later patch.
This has a few side-benefits: the task_struct becomes a little smaller,
we save some memory because only tasks that have perf_counters attached
get a perf_counter_context allocated for them, and we can remove the
inclusion of <linux/perf_counter.h> in sched.h, meaning that we don't
end up recompiling nearly everything whenever perf_counter.h changes.
The perf_counter_context structures are reference-counted and freed
when the last reference is dropped. A context can have references
from its task and the counters on its task. Counters can outlive the
task so it is possible that a context will be freed well after its
task has exited.
Contexts are allocated on fork if the parent had a context, or
otherwise the first time that a per-task counter is created on a task.
In the latter case, we set the context pointer in the task struct
locklessly using an atomic compare-and-exchange operation in case we
raced with some other task in creating a context for the subject task.
This also removes the task pointer from the perf_counter struct. The
task pointer was not used anywhere and would make it harder to move a
context from one task to another. Anything that needed to know which
task a counter was attached to was already using counter->ctx->task.
The __perf_counter_init_context function moves up in perf_counter.c
so that it can be called from find_get_context, and now initializes
the refcount, but is otherwise unchanged.
We were potentially calling list_del_counter twice: once from
__perf_counter_exit_task when the task exits and once from
__perf_counter_remove_from_context when the counter's fd gets closed.
This adds a check in list_del_counter so it doesn't do anything if
the counter has already been removed from the lists.
Since perf_counter_task_sched_in doesn't do anything if the task doesn't
have a context, and leaves cpuctx->task_ctx = NULL, this adds code to
__perf_install_in_context to set cpuctx->task_ctx if necessary, i.e. in
the case where the current task adds the first counter to itself and
thus creates a context for itself.
This also adds similar code to __perf_counter_enable to handle a
similar situation which can arise when the counters have been disabled
using prctl; that also leaves cpuctx->task_ctx = NULL.
[ Impact: refactor counter context management to prepare for new feature ]
Signed-off-by: Paul Mackerras <paulus@samba.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com>
Cc: Marcelo Tosatti <mtosatti@redhat.com>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
LKML-Reference: <18966.10075.781053.231153@cargo.ozlabs.ibm.com>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-05-22 12:17:31 +08:00
|
|
|
#endif
|
2014-02-08 03:58:39 +08:00
|
|
|
#ifdef CONFIG_DEBUG_PREEMPT
|
|
|
|
unsigned long preempt_disable_ip;
|
|
|
|
#endif
|
2008-05-15 19:09:15 +08:00
|
|
|
#ifdef CONFIG_NUMA
|
cpuset,mm: update tasks' mems_allowed in time
Fix allocating page cache/slab object on the unallowed node when memory
spread is set by updating tasks' mems_allowed after its cpuset's mems is
changed.
In order to update tasks' mems_allowed in time, we must modify the code of
memory policy. Because the memory policy is applied in the process's
context originally. After applying this patch, one task directly
manipulates anothers mems_allowed, and we use alloc_lock in the
task_struct to protect mems_allowed and memory policy of the task.
But in the fast path, we didn't use lock to protect them, because adding a
lock may lead to performance regression. But if we don't add a lock,the
task might see no nodes when changing cpuset's mems_allowed to some
non-overlapping set. In order to avoid it, we set all new allowed nodes,
then clear newly disallowed ones.
[lee.schermerhorn@hp.com:
The rework of mpol_new() to extract the adjusting of the node mask to
apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind()
with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local
allocation. Fix this by adding the check for MPOL_PREFERRED and empty
node mask to mpol_new_mpolicy().
Remove the now unneeded 'nodes = NULL' from mpol_new().
Note that mpol_new_mempolicy() is always called with a non-NULL
'nodes' parameter now that it has been removed from mpol_new().
Therefore, we don't need to test nodes for NULL before testing it for
'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to
verify this assumption.]
[lee.schermerhorn@hp.com:
I don't think the function name 'mpol_new_mempolicy' is descriptive
enough to differentiate it from mpol_new().
This function applies cpuset set context, usually constraining nodes
to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag
is set, it also translates the nodes. So I settled on
'mpol_set_nodemask()', because the comment block for mpol_new() mentions
that we need to call this function to "set nodes".
Some additional minor line length, whitespace and typo cleanup.]
Signed-off-by: Miao Xie <miaox@cn.fujitsu.com>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Christoph Lameter <cl@linux-foundation.org>
Cc: Paul Menage <menage@google.com>
Cc: Nick Piggin <nickpiggin@yahoo.com.au>
Cc: Yasunori Goto <y-goto@jp.fujitsu.com>
Cc: Pekka Enberg <penberg@cs.helsinki.fi>
Cc: David Rientjes <rientjes@google.com>
Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-17 06:31:49 +08:00
|
|
|
struct mempolicy *mempolicy; /* Protected by alloc_lock */
|
2008-05-15 19:09:15 +08:00
|
|
|
short il_next;
|
2011-03-23 07:30:44 +08:00
|
|
|
short pref_node_fork;
|
2007-10-17 14:27:30 +08:00
|
|
|
#endif
|
2012-10-25 20:16:43 +08:00
|
|
|
#ifdef CONFIG_NUMA_BALANCING
|
|
|
|
int numa_scan_seq;
|
|
|
|
unsigned int numa_scan_period;
|
2013-10-07 18:28:55 +08:00
|
|
|
unsigned int numa_scan_period_max;
|
2013-10-07 18:29:39 +08:00
|
|
|
int numa_preferred_nid;
|
2013-10-07 18:29:11 +08:00
|
|
|
unsigned long numa_migrate_retry;
|
2012-10-25 20:16:43 +08:00
|
|
|
u64 node_stamp; /* migration stamp */
|
sched/numa: Normalize faults_cpu stats and weigh by CPU use
Tracing the code that decides the active nodes has made it abundantly clear
that the naive implementation of the faults_from code has issues.
Specifically, the garbage collector in some workloads will access orders
of magnitudes more memory than the threads that do all the active work.
This resulted in the node with the garbage collector being marked the only
active node in the group.
This issue is avoided if we weigh the statistics by CPU use of each task in
the numa group, instead of by how many faults each thread has occurred.
To achieve this, we normalize the number of faults to the fraction of faults
that occurred on each node, and then multiply that fraction by the fraction
of CPU time the task has used since the last time task_numa_placement was
invoked.
This way the nodes in the active node mask will be the ones where the tasks
from the numa group are most actively running, and the influence of eg. the
garbage collector and other do-little threads is properly minimized.
On a 4 node system, using CPU use statistics calculated over a longer interval
results in about 1% fewer page migrations with two 32-warehouse specjbb runs
on a 4 node system, and about 5% fewer page migrations, as well as 1% better
throughput, with two 8-warehouse specjbb runs, as compared with the shorter
term statistics kept by the scheduler.
Signed-off-by: Rik van Riel <riel@redhat.com>
Acked-by: Mel Gorman <mgorman@suse.de>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Cc: Chegu Vinod <chegu_vinod@hp.com>
Link: http://lkml.kernel.org/r/1390860228-21539-7-git-send-email-riel@redhat.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2014-01-28 06:03:45 +08:00
|
|
|
u64 last_task_numa_placement;
|
|
|
|
u64 last_sum_exec_runtime;
|
2012-10-25 20:16:43 +08:00
|
|
|
struct callback_head numa_work;
|
2013-10-07 18:28:57 +08:00
|
|
|
|
2013-10-07 18:29:21 +08:00
|
|
|
struct list_head numa_entry;
|
|
|
|
struct numa_group *numa_group;
|
|
|
|
|
2013-10-07 18:28:59 +08:00
|
|
|
/*
|
2014-10-31 08:13:31 +08:00
|
|
|
* numa_faults is an array split into four regions:
|
|
|
|
* faults_memory, faults_cpu, faults_memory_buffer, faults_cpu_buffer
|
|
|
|
* in this precise order.
|
|
|
|
*
|
|
|
|
* faults_memory: Exponential decaying average of faults on a per-node
|
|
|
|
* basis. Scheduling placement decisions are made based on these
|
|
|
|
* counts. The values remain static for the duration of a PTE scan.
|
|
|
|
* faults_cpu: Track the nodes the process was running on when a NUMA
|
|
|
|
* hinting fault was incurred.
|
|
|
|
* faults_memory_buffer and faults_cpu_buffer: Record faults per node
|
|
|
|
* during the current scan window. When the scan completes, the counts
|
|
|
|
* in faults_memory and faults_cpu decay and these values are copied.
|
2013-10-07 18:28:59 +08:00
|
|
|
*/
|
2014-10-31 08:13:31 +08:00
|
|
|
unsigned long *numa_faults;
|
2013-10-07 18:29:27 +08:00
|
|
|
unsigned long total_numa_faults;
|
2013-10-07 18:28:59 +08:00
|
|
|
|
2013-10-07 18:29:36 +08:00
|
|
|
/*
|
|
|
|
* numa_faults_locality tracks if faults recorded during the last
|
2015-03-26 06:55:42 +08:00
|
|
|
* scan window were remote/local or failed to migrate. The task scan
|
|
|
|
* period is adapted based on the locality of the faults with different
|
|
|
|
* weights depending on whether they were shared or private faults
|
2013-10-07 18:29:36 +08:00
|
|
|
*/
|
2015-03-26 06:55:42 +08:00
|
|
|
unsigned long numa_faults_locality[3];
|
2013-10-07 18:29:36 +08:00
|
|
|
|
2013-10-07 18:29:30 +08:00
|
|
|
unsigned long numa_pages_migrated;
|
2012-10-25 20:16:43 +08:00
|
|
|
#endif /* CONFIG_NUMA_BALANCING */
|
|
|
|
|
2015-09-05 06:47:32 +08:00
|
|
|
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
|
|
|
|
struct tlbflush_unmap_batch tlb_ubc;
|
|
|
|
#endif
|
|
|
|
|
2006-01-08 17:01:37 +08:00
|
|
|
struct rcu_head rcu;
|
2006-04-11 19:52:07 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* cache last used pipe for splice
|
|
|
|
*/
|
|
|
|
struct pipe_inode_info *splice_pipe;
|
net: use a per task frag allocator
We currently use a per socket order-0 page cache for tcp_sendmsg()
operations.
This page is used to build fragments for skbs.
Its done to increase probability of coalescing small write() into
single segments in skbs still in write queue (not yet sent)
But it wastes a lot of memory for applications handling many mostly
idle sockets, since each socket holds one page in sk->sk_sndmsg_page
Its also quite inefficient to build TSO 64KB packets, because we need
about 16 pages per skb on arches where PAGE_SIZE = 4096, so we hit
page allocator more than wanted.
This patch adds a per task frag allocator and uses bigger pages,
if available. An automatic fallback is done in case of memory pressure.
(up to 32768 bytes per frag, thats order-3 pages on x86)
This increases TCP stream performance by 20% on loopback device,
but also benefits on other network devices, since 8x less frags are
mapped on transmit and unmapped on tx completion. Alexander Duyck
mentioned a probable performance win on systems with IOMMU enabled.
Its possible some SG enabled hardware cant cope with bigger fragments,
but their ndo_start_xmit() should already handle this, splitting a
fragment in sub fragments, since some arches have PAGE_SIZE=65536
Successfully tested on various ethernet devices.
(ixgbe, igb, bnx2x, tg3, mellanox mlx4)
Signed-off-by: Eric Dumazet <edumazet@google.com>
Cc: Ben Hutchings <bhutchings@solarflare.com>
Cc: Vijay Subramanian <subramanian.vijay@gmail.com>
Cc: Alexander Duyck <alexander.h.duyck@intel.com>
Tested-by: Vijay Subramanian <subramanian.vijay@gmail.com>
Signed-off-by: David S. Miller <davem@davemloft.net>
2012-09-24 07:04:42 +08:00
|
|
|
|
|
|
|
struct page_frag task_frag;
|
|
|
|
|
2006-07-14 15:24:36 +08:00
|
|
|
#ifdef CONFIG_TASK_DELAY_ACCT
|
|
|
|
struct task_delay_info *delays;
|
2006-12-08 18:39:47 +08:00
|
|
|
#endif
|
|
|
|
#ifdef CONFIG_FAULT_INJECTION
|
|
|
|
int make_it_fail;
|
2006-07-14 15:24:36 +08:00
|
|
|
#endif
|
writeback: per task dirty rate limit
Add two fields to task_struct.
1) account dirtied pages in the individual tasks, for accuracy
2) per-task balance_dirty_pages() call intervals, for flexibility
The balance_dirty_pages() call interval (ie. nr_dirtied_pause) will
scale near-sqrt to the safety gap between dirty pages and threshold.
The main problem of per-task nr_dirtied is, if 1k+ tasks start dirtying
pages at exactly the same time, each task will be assigned a large
initial nr_dirtied_pause, so that the dirty threshold will be exceeded
long before each task reached its nr_dirtied_pause and hence call
balance_dirty_pages().
The solution is to watch for the number of pages dirtied on each CPU in
between the calls into balance_dirty_pages(). If it exceeds ratelimit_pages
(3% dirty threshold), force call balance_dirty_pages() for a chance to
set bdi->dirty_exceeded. In normal situations, this safeguarding
condition is not expected to trigger at all.
On the sqrt in dirty_poll_interval():
It will serve as an initial guess when dirty pages are still in the
freerun area.
When dirty pages are floating inside the dirty control scope [freerun,
limit], a followup patch will use some refined dirty poll interval to
get the desired pause time.
thresh-dirty (MB) sqrt
1 16
2 22
4 32
8 45
16 64
32 90
64 128
128 181
256 256
512 362
1024 512
The above table means, given 1MB (or 1GB) gap and the dd tasks polling
balance_dirty_pages() on every 16 (or 512) pages, the dirty limit won't
be exceeded as long as there are less than 16 (or 512) concurrent dd's.
So sqrt naturally leads to less overheads and more safe concurrent tasks
for large memory servers, which have large (thresh-freerun) gaps.
peter: keep the per-CPU ratelimit for safeguarding the 1k+ tasks case
CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
Reviewed-by: Andrea Righi <andrea@betterlinux.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
2011-06-12 08:10:12 +08:00
|
|
|
/*
|
|
|
|
* when (nr_dirtied >= nr_dirtied_pause), it's time to call
|
|
|
|
* balance_dirty_pages() for some dirty throttling pause
|
|
|
|
*/
|
|
|
|
int nr_dirtied;
|
|
|
|
int nr_dirtied_pause;
|
2011-06-12 09:25:42 +08:00
|
|
|
unsigned long dirty_paused_when; /* start of a write-and-pause period */
|
writeback: per task dirty rate limit
Add two fields to task_struct.
1) account dirtied pages in the individual tasks, for accuracy
2) per-task balance_dirty_pages() call intervals, for flexibility
The balance_dirty_pages() call interval (ie. nr_dirtied_pause) will
scale near-sqrt to the safety gap between dirty pages and threshold.
The main problem of per-task nr_dirtied is, if 1k+ tasks start dirtying
pages at exactly the same time, each task will be assigned a large
initial nr_dirtied_pause, so that the dirty threshold will be exceeded
long before each task reached its nr_dirtied_pause and hence call
balance_dirty_pages().
The solution is to watch for the number of pages dirtied on each CPU in
between the calls into balance_dirty_pages(). If it exceeds ratelimit_pages
(3% dirty threshold), force call balance_dirty_pages() for a chance to
set bdi->dirty_exceeded. In normal situations, this safeguarding
condition is not expected to trigger at all.
On the sqrt in dirty_poll_interval():
It will serve as an initial guess when dirty pages are still in the
freerun area.
When dirty pages are floating inside the dirty control scope [freerun,
limit], a followup patch will use some refined dirty poll interval to
get the desired pause time.
thresh-dirty (MB) sqrt
1 16
2 22
4 32
8 45
16 64
32 90
64 128
128 181
256 256
512 362
1024 512
The above table means, given 1MB (or 1GB) gap and the dd tasks polling
balance_dirty_pages() on every 16 (or 512) pages, the dirty limit won't
be exceeded as long as there are less than 16 (or 512) concurrent dd's.
So sqrt naturally leads to less overheads and more safe concurrent tasks
for large memory servers, which have large (thresh-freerun) gaps.
peter: keep the per-CPU ratelimit for safeguarding the 1k+ tasks case
CC: Peter Zijlstra <a.p.zijlstra@chello.nl>
Reviewed-by: Andrea Righi <andrea@betterlinux.com>
Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
2011-06-12 08:10:12 +08:00
|
|
|
|
2008-01-26 04:08:34 +08:00
|
|
|
#ifdef CONFIG_LATENCYTOP
|
|
|
|
int latency_record_count;
|
|
|
|
struct latency_record latency_record[LT_SAVECOUNT];
|
|
|
|
#endif
|
2008-09-02 06:52:40 +08:00
|
|
|
/*
|
|
|
|
* time slack values; these are used to round up poll() and
|
|
|
|
* select() etc timeout values. These are in nanoseconds.
|
|
|
|
*/
|
timer: convert timer_slack_ns from unsigned long to u64
This patchset introduces a /proc/<pid>/timerslack_ns interface which
would allow controlling processes to be able to set the timerslack value
on other processes in order to save power by avoiding wakeups (Something
Android currently does via out-of-tree patches).
The first patch tries to fix the internal timer_slack_ns usage which was
defined as a long, which limits the slack range to ~4 seconds on 32bit
systems. It converts it to a u64, which provides the same basically
unlimited slack (500 years) on both 32bit and 64bit machines.
The second patch introduces the /proc/<pid>/timerslack_ns interface
which allows the full 64bit slack range for a task to be read or set on
both 32bit and 64bit machines.
With these two patches, on a 32bit machine, after setting the slack on
bash to 10 seconds:
$ time sleep 1
real 0m10.747s
user 0m0.001s
sys 0m0.005s
The first patch is a little ugly, since I had to chase the slack delta
arguments through a number of functions converting them to u64s. Let me
know if it makes sense to break that up more or not.
Other than that things are fairly straightforward.
This patch (of 2):
The timer_slack_ns value in the task struct is currently a unsigned
long. This means that on 32bit applications, the maximum slack is just
over 4 seconds. However, on 64bit machines, its much much larger (~500
years).
This disparity could make application development a little (as well as
the default_slack) to a u64. This means both 32bit and 64bit systems
have the same effective internal slack range.
Now the existing ABI via PR_GET_TIMERSLACK and PR_SET_TIMERSLACK specify
the interface as a unsigned long, so we preserve that limitation on
32bit systems, where SET_TIMERSLACK can only set the slack to a unsigned
long value, and GET_TIMERSLACK will return ULONG_MAX if the slack is
actually larger then what can be stored by an unsigned long.
This patch also modifies hrtimer functions which specified the slack
delta as a unsigned long.
Signed-off-by: John Stultz <john.stultz@linaro.org>
Cc: Arjan van de Ven <arjan@linux.intel.com>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Oren Laadan <orenl@cellrox.com>
Cc: Ruchi Kandoi <kandoiruchi@google.com>
Cc: Rom Lemarchand <romlem@android.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Android Kernel Team <kernel-team@android.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-18 05:20:51 +08:00
|
|
|
u64 timer_slack_ns;
|
|
|
|
u64 default_timer_slack_ns;
|
2008-11-06 16:37:40 +08:00
|
|
|
|
kasan: add kernel address sanitizer infrastructure
Kernel Address sanitizer (KASan) is a dynamic memory error detector. It
provides fast and comprehensive solution for finding use-after-free and
out-of-bounds bugs.
KASAN uses compile-time instrumentation for checking every memory access,
therefore GCC > v4.9.2 required. v4.9.2 almost works, but has issues with
putting symbol aliases into the wrong section, which breaks kasan
instrumentation of globals.
This patch only adds infrastructure for kernel address sanitizer. It's
not available for use yet. The idea and some code was borrowed from [1].
Basic idea:
The main idea of KASAN is to use shadow memory to record whether each byte
of memory is safe to access or not, and use compiler's instrumentation to
check the shadow memory on each memory access.
Address sanitizer uses 1/8 of the memory addressable in kernel for shadow
memory and uses direct mapping with a scale and offset to translate a
memory address to its corresponding shadow address.
Here is function to translate address to corresponding shadow address:
unsigned long kasan_mem_to_shadow(unsigned long addr)
{
return (addr >> KASAN_SHADOW_SCALE_SHIFT) + KASAN_SHADOW_OFFSET;
}
where KASAN_SHADOW_SCALE_SHIFT = 3.
So for every 8 bytes there is one corresponding byte of shadow memory.
The following encoding used for each shadow byte: 0 means that all 8 bytes
of the corresponding memory region are valid for access; k (1 <= k <= 7)
means that the first k bytes are valid for access, and other (8 - k) bytes
are not; Any negative value indicates that the entire 8-bytes are
inaccessible. Different negative values used to distinguish between
different kinds of inaccessible memory (redzones, freed memory) (see
mm/kasan/kasan.h).
To be able to detect accesses to bad memory we need a special compiler.
Such compiler inserts a specific function calls (__asan_load*(addr),
__asan_store*(addr)) before each memory access of size 1, 2, 4, 8 or 16.
These functions check whether memory region is valid to access or not by
checking corresponding shadow memory. If access is not valid an error
printed.
Historical background of the address sanitizer from Dmitry Vyukov:
"We've developed the set of tools, AddressSanitizer (Asan),
ThreadSanitizer and MemorySanitizer, for user space. We actively use
them for testing inside of Google (continuous testing, fuzzing,
running prod services). To date the tools have found more than 10'000
scary bugs in Chromium, Google internal codebase and various
open-source projects (Firefox, OpenSSL, gcc, clang, ffmpeg, MySQL and
lots of others): [2] [3] [4].
The tools are part of both gcc and clang compilers.
We have not yet done massive testing under the Kernel AddressSanitizer
(it's kind of chicken and egg problem, you need it to be upstream to
start applying it extensively). To date it has found about 50 bugs.
Bugs that we've found in upstream kernel are listed in [5].
We've also found ~20 bugs in out internal version of the kernel. Also
people from Samsung and Oracle have found some.
[...]
As others noted, the main feature of AddressSanitizer is its
performance due to inline compiler instrumentation and simple linear
shadow memory. User-space Asan has ~2x slowdown on computational
programs and ~2x memory consumption increase. Taking into account that
kernel usually consumes only small fraction of CPU and memory when
running real user-space programs, I would expect that kernel Asan will
have ~10-30% slowdown and similar memory consumption increase (when we
finish all tuning).
I agree that Asan can well replace kmemcheck. We have plans to start
working on Kernel MemorySanitizer that finds uses of unitialized
memory. Asan+Msan will provide feature-parity with kmemcheck. As
others noted, Asan will unlikely replace debug slab and pagealloc that
can be enabled at runtime. Asan uses compiler instrumentation, so even
if it is disabled, it still incurs visible overheads.
Asan technology is easily portable to other architectures. Compiler
instrumentation is fully portable. Runtime has some arch-dependent
parts like shadow mapping and atomic operation interception. They are
relatively easy to port."
Comparison with other debugging features:
========================================
KMEMCHECK:
- KASan can do almost everything that kmemcheck can. KASan uses
compile-time instrumentation, which makes it significantly faster than
kmemcheck. The only advantage of kmemcheck over KASan is detection of
uninitialized memory reads.
Some brief performance testing showed that kasan could be
x500-x600 times faster than kmemcheck:
$ netperf -l 30
MIGRATED TCP STREAM TEST from 0.0.0.0 (0.0.0.0) port 0 AF_INET to localhost (127.0.0.1) port 0 AF_INET
Recv Send Send
Socket Socket Message Elapsed
Size Size Size Time Throughput
bytes bytes bytes secs. 10^6bits/sec
no debug: 87380 16384 16384 30.00 41624.72
kasan inline: 87380 16384 16384 30.00 12870.54
kasan outline: 87380 16384 16384 30.00 10586.39
kmemcheck: 87380 16384 16384 30.03 20.23
- Also kmemcheck couldn't work on several CPUs. It always sets
number of CPUs to 1. KASan doesn't have such limitation.
DEBUG_PAGEALLOC:
- KASan is slower than DEBUG_PAGEALLOC, but KASan works on sub-page
granularity level, so it able to find more bugs.
SLUB_DEBUG (poisoning, redzones):
- SLUB_DEBUG has lower overhead than KASan.
- SLUB_DEBUG in most cases are not able to detect bad reads,
KASan able to detect both reads and writes.
- In some cases (e.g. redzone overwritten) SLUB_DEBUG detect
bugs only on allocation/freeing of object. KASan catch
bugs right before it will happen, so we always know exact
place of first bad read/write.
[1] https://code.google.com/p/address-sanitizer/wiki/AddressSanitizerForKernel
[2] https://code.google.com/p/address-sanitizer/wiki/FoundBugs
[3] https://code.google.com/p/thread-sanitizer/wiki/FoundBugs
[4] https://code.google.com/p/memory-sanitizer/wiki/FoundBugs
[5] https://code.google.com/p/address-sanitizer/wiki/AddressSanitizerForKernel#Trophies
Based on work by Andrey Konovalov.
Signed-off-by: Andrey Ryabinin <a.ryabinin@samsung.com>
Acked-by: Michal Marek <mmarek@suse.cz>
Signed-off-by: Andrey Konovalov <adech.fo@gmail.com>
Cc: Dmitry Vyukov <dvyukov@google.com>
Cc: Konstantin Serebryany <kcc@google.com>
Cc: Dmitry Chernenkov <dmitryc@google.com>
Cc: Yuri Gribov <tetra2005@gmail.com>
Cc: Konstantin Khlebnikov <koct9i@gmail.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Joonsoo Kim <iamjoonsoo.kim@lge.com>
Cc: Dave Hansen <dave.hansen@intel.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Christoph Lameter <cl@linux.com>
Cc: Pekka Enberg <penberg@kernel.org>
Cc: David Rientjes <rientjes@google.com>
Cc: Stephen Rothwell <sfr@canb.auug.org.au>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-02-14 06:39:17 +08:00
|
|
|
#ifdef CONFIG_KASAN
|
|
|
|
unsigned int kasan_depth;
|
|
|
|
#endif
|
2008-11-26 04:07:04 +08:00
|
|
|
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
|
tree-wide: Assorted spelling fixes
In particular, several occurances of funny versions of 'success',
'unknown', 'therefore', 'acknowledge', 'argument', 'achieve', 'address',
'beginning', 'desirable', 'separate' and 'necessary' are fixed.
Signed-off-by: Daniel Mack <daniel@caiaq.de>
Cc: Joe Perches <joe@perches.com>
Cc: Junio C Hamano <gitster@pobox.com>
Signed-off-by: Jiri Kosina <jkosina@suse.cz>
2010-02-03 08:01:28 +08:00
|
|
|
/* Index of current stored address in ret_stack */
|
2008-11-23 13:22:56 +08:00
|
|
|
int curr_ret_stack;
|
|
|
|
/* Stack of return addresses for return function tracing */
|
|
|
|
struct ftrace_ret_stack *ret_stack;
|
2009-03-24 13:10:15 +08:00
|
|
|
/* time stamp for last schedule */
|
|
|
|
unsigned long long ftrace_timestamp;
|
2008-11-23 13:22:56 +08:00
|
|
|
/*
|
|
|
|
* Number of functions that haven't been traced
|
|
|
|
* because of depth overrun.
|
|
|
|
*/
|
|
|
|
atomic_t trace_overrun;
|
2008-12-06 10:43:41 +08:00
|
|
|
/* Pause for the tracing */
|
|
|
|
atomic_t tracing_graph_pause;
|
2008-11-23 13:22:56 +08:00
|
|
|
#endif
|
2008-12-04 04:36:57 +08:00
|
|
|
#ifdef CONFIG_TRACING
|
|
|
|
/* state flags for use by tracers */
|
|
|
|
unsigned long trace;
|
ftrace: Add internal recursive checks
Witold reported a reboot caused by the selftests of the dynamic function
tracer. He sent me a config and I used ktest to do a config_bisect on it
(as my config did not cause the crash). It pointed out that the problem
config was CONFIG_PROVE_RCU.
What happened was that if multiple callbacks are attached to the
function tracer, we iterate a list of callbacks. Because the list is
managed by synchronize_sched() and preempt_disable, the access to the
pointers uses rcu_dereference_raw().
When PROVE_RCU is enabled, the rcu_dereference_raw() calls some
debugging functions, which happen to be traced. The tracing of the debug
function would then call rcu_dereference_raw() which would then call the
debug function and then... well you get the idea.
I first wrote two different patches to solve this bug.
1) add a __rcu_dereference_raw() that would not do any checks.
2) add notrace to the offending debug functions.
Both of these patches worked.
Talking with Paul McKenney on IRC, he suggested to add recursion
detection instead. This seemed to be a better solution, so I decided to
implement it. As the task_struct already has a trace_recursion to detect
recursion in the ring buffer, and that has a very small number it
allows, I decided to use that same variable to add flags that can detect
the recursion inside the infrastructure of the function tracer.
I plan to change it so that the task struct bit can be checked in
mcount, but as that requires changes to all archs, I will hold that off
to the next merge window.
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Link: http://lkml.kernel.org/r/1306348063.1465.116.camel@gandalf.stny.rr.com
Reported-by: Witold Baryluk <baryluk@smp.if.uj.edu.pl>
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2011-05-26 02:27:43 +08:00
|
|
|
/* bitmask and counter of trace recursion */
|
tracing: add same level recursion detection
The tracing infrastructure allows for recursion. That is, an interrupt
may interrupt the act of tracing an event, and that interrupt may very well
perform its own trace. This is a recursive trace, and is fine to do.
The problem arises when there is a bug, and the utility doing the trace
calls something that recurses back into the tracer. This recursion is not
caused by an external event like an interrupt, but by code that is not
expected to recurse. The result could be a lockup.
This patch adds a bitmask to the task structure that keeps track
of the trace recursion. To find the interrupt depth, the following
algorithm is used:
level = hardirq_count() + softirq_count() + in_nmi;
Here, level will be the depth of interrutps and softirqs, and even handles
the nmi. Then the corresponding bit is set in the recursion bitmask.
If the bit was already set, we know we had a recursion at the same level
and we warn about it and fail the writing to the buffer.
After the data has been committed to the buffer, we clear the bit.
No atomics are needed. The only races are with interrupts and they reset
the bitmask before returning anywy.
[ Impact: detect same irq level trace recursion ]
Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2009-04-17 09:41:52 +08:00
|
|
|
unsigned long trace_recursion;
|
|
|
|
#endif /* CONFIG_TRACING */
|
kernel: add kcov code coverage
kcov provides code coverage collection for coverage-guided fuzzing
(randomized testing). Coverage-guided fuzzing is a testing technique
that uses coverage feedback to determine new interesting inputs to a
system. A notable user-space example is AFL
(http://lcamtuf.coredump.cx/afl/). However, this technique is not
widely used for kernel testing due to missing compiler and kernel
support.
kcov does not aim to collect as much coverage as possible. It aims to
collect more or less stable coverage that is function of syscall inputs.
To achieve this goal it does not collect coverage in soft/hard
interrupts and instrumentation of some inherently non-deterministic or
non-interesting parts of kernel is disbled (e.g. scheduler, locking).
Currently there is a single coverage collection mode (tracing), but the
API anticipates additional collection modes. Initially I also
implemented a second mode which exposes coverage in a fixed-size hash
table of counters (what Quentin used in his original patch). I've
dropped the second mode for simplicity.
This patch adds the necessary support on kernel side. The complimentary
compiler support was added in gcc revision 231296.
We've used this support to build syzkaller system call fuzzer, which has
found 90 kernel bugs in just 2 months:
https://github.com/google/syzkaller/wiki/Found-Bugs
We've also found 30+ bugs in our internal systems with syzkaller.
Another (yet unexplored) direction where kcov coverage would greatly
help is more traditional "blob mutation". For example, mounting a
random blob as a filesystem, or receiving a random blob over wire.
Why not gcov. Typical fuzzing loop looks as follows: (1) reset
coverage, (2) execute a bit of code, (3) collect coverage, repeat. A
typical coverage can be just a dozen of basic blocks (e.g. an invalid
input). In such context gcov becomes prohibitively expensive as
reset/collect coverage steps depend on total number of basic
blocks/edges in program (in case of kernel it is about 2M). Cost of
kcov depends only on number of executed basic blocks/edges. On top of
that, kernel requires per-thread coverage because there are always
background threads and unrelated processes that also produce coverage.
With inlined gcov instrumentation per-thread coverage is not possible.
kcov exposes kernel PCs and control flow to user-space which is
insecure. But debugfs should not be mapped as user accessible.
Based on a patch by Quentin Casasnovas.
[akpm@linux-foundation.org: make task_struct.kcov_mode have type `enum kcov_mode']
[akpm@linux-foundation.org: unbreak allmodconfig]
[akpm@linux-foundation.org: follow x86 Makefile layout standards]
Signed-off-by: Dmitry Vyukov <dvyukov@google.com>
Reviewed-by: Kees Cook <keescook@chromium.org>
Cc: syzkaller <syzkaller@googlegroups.com>
Cc: Vegard Nossum <vegard.nossum@oracle.com>
Cc: Catalin Marinas <catalin.marinas@arm.com>
Cc: Tavis Ormandy <taviso@google.com>
Cc: Will Deacon <will.deacon@arm.com>
Cc: Quentin Casasnovas <quentin.casasnovas@oracle.com>
Cc: Kostya Serebryany <kcc@google.com>
Cc: Eric Dumazet <edumazet@google.com>
Cc: Alexander Potapenko <glider@google.com>
Cc: Kees Cook <keescook@google.com>
Cc: Bjorn Helgaas <bhelgaas@google.com>
Cc: Sasha Levin <sasha.levin@oracle.com>
Cc: David Drysdale <drysdale@google.com>
Cc: Ard Biesheuvel <ard.biesheuvel@linaro.org>
Cc: Andrey Ryabinin <ryabinin.a.a@gmail.com>
Cc: Kirill A. Shutemov <kirill@shutemov.name>
Cc: Jiri Slaby <jslaby@suse.cz>
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-03-23 05:27:30 +08:00
|
|
|
#ifdef CONFIG_KCOV
|
|
|
|
/* Coverage collection mode enabled for this task (0 if disabled). */
|
|
|
|
enum kcov_mode kcov_mode;
|
|
|
|
/* Size of the kcov_area. */
|
|
|
|
unsigned kcov_size;
|
|
|
|
/* Buffer for coverage collection. */
|
|
|
|
void *kcov_area;
|
|
|
|
/* kcov desciptor wired with this task or NULL. */
|
|
|
|
struct kcov *kcov;
|
|
|
|
#endif
|
2014-12-13 08:55:15 +08:00
|
|
|
#ifdef CONFIG_MEMCG
|
2015-11-06 10:46:09 +08:00
|
|
|
struct mem_cgroup *memcg_in_oom;
|
|
|
|
gfp_t memcg_oom_gfp_mask;
|
|
|
|
int memcg_oom_order;
|
memcg: punt high overage reclaim to return-to-userland path
Currently, try_charge() tries to reclaim memory synchronously when the
high limit is breached; however, if the allocation doesn't have
__GFP_WAIT, synchronous reclaim is skipped. If a process performs only
speculative allocations, it can blow way past the high limit. This is
actually easily reproducible by simply doing "find /". slab/slub
allocator tries speculative allocations first, so as long as there's
memory which can be consumed without blocking, it can keep allocating
memory regardless of the high limit.
This patch makes try_charge() always punt the over-high reclaim to the
return-to-userland path. If try_charge() detects that high limit is
breached, it adds the overage to current->memcg_nr_pages_over_high and
schedules execution of mem_cgroup_handle_over_high() which performs
synchronous reclaim from the return-to-userland path.
As long as kernel doesn't have a run-away allocation spree, this should
provide enough protection while making kmemcg behave more consistently.
It also has the following benefits.
- All over-high reclaims can use GFP_KERNEL regardless of the specific
gfp mask in use, e.g. GFP_NOFS, when the limit was breached.
- It copes with prio inversion. Previously, a low-prio task with
small memory.high might perform over-high reclaim with a bunch of
locks held. If a higher prio task needed any of these locks, it
would have to wait until the low prio task finished reclaim and
released the locks. By handing over-high reclaim to the task exit
path this issue can be avoided.
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Michal Hocko <mhocko@kernel.org>
Reviewed-by: Vladimir Davydov <vdavydov@parallels.com>
Acked-by: Johannes Weiner <hannes@cmpxchg.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-11-06 10:46:11 +08:00
|
|
|
|
|
|
|
/* number of pages to reclaim on returning to userland */
|
|
|
|
unsigned int memcg_nr_pages_over_high;
|
2009-12-16 08:47:03 +08:00
|
|
|
#endif
|
uprobes/core: Handle breakpoint and singlestep exceptions
Uprobes uses exception notifiers to get to know if a thread hit
a breakpoint or a singlestep exception.
When a thread hits a uprobe or is singlestepping post a uprobe
hit, the uprobe exception notifier sets its TIF_UPROBE bit,
which will then be checked on its return to userspace path
(do_notify_resume() ->uprobe_notify_resume()), where the
consumers handlers are run (in task context) based on the
defined filters.
Uprobe hits are thread specific and hence we need to maintain
information about if a task hit a uprobe, what uprobe was hit,
the slot where the original instruction was copied for xol so
that it can be singlestepped with appropriate fixups.
In some cases, special care is needed for instructions that are
executed out of line (xol). These are architecture specific
artefacts, such as handling RIP relative instructions on x86_64.
Since the instruction at which the uprobe was inserted is
executed out of line, architecture specific fixups are added so
that the thread continues normal execution in the presence of a
uprobe.
Postpone the signals until we execute the probed insn.
post_xol() path does a recalc_sigpending() before return to
user-mode, this ensures the signal can't be lost.
Uprobes relies on DIE_DEBUG notification to notify if a
singlestep is complete.
Adds x86 specific uprobe exception notifiers and appropriate
hooks needed to determine a uprobe hit and subsequent post
processing.
Add requisite x86 fixups for xol for uprobes. Specific cases
needing fixups include relative jumps (x86_64), calls, etc.
Where possible, we check and skip singlestepping the
breakpointed instructions. For now we skip single byte as well
as few multibyte nop instructions. However this can be extended
to other instructions too.
Credits to Oleg Nesterov for suggestions/patches related to
signal, breakpoint, singlestep handling code.
Signed-off-by: Srikar Dronamraju <srikar@linux.vnet.ibm.com>
Cc: Linus Torvalds <torvalds@linux-foundation.org>
Cc: Ananth N Mavinakayanahalli <ananth@in.ibm.com>
Cc: Jim Keniston <jkenisto@linux.vnet.ibm.com>
Cc: Linux-mm <linux-mm@kvack.org>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Andi Kleen <andi@firstfloor.org>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Arnaldo Carvalho de Melo <acme@infradead.org>
Cc: Masami Hiramatsu <masami.hiramatsu.pt@hitachi.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/20120313180011.29771.89027.sendpatchset@srdronam.in.ibm.com
[ Performed various cleanliness edits ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2012-03-14 02:00:11 +08:00
|
|
|
#ifdef CONFIG_UPROBES
|
|
|
|
struct uprobe_task *utask;
|
|
|
|
#endif
|
2013-03-24 07:11:31 +08:00
|
|
|
#if defined(CONFIG_BCACHE) || defined(CONFIG_BCACHE_MODULE)
|
|
|
|
unsigned int sequential_io;
|
|
|
|
unsigned int sequential_io_avg;
|
|
|
|
#endif
|
2014-09-24 16:18:55 +08:00
|
|
|
#ifdef CONFIG_DEBUG_ATOMIC_SLEEP
|
|
|
|
unsigned long task_state_change;
|
|
|
|
#endif
|
2015-05-11 23:52:06 +08:00
|
|
|
int pagefault_disabled;
|
2016-03-26 05:20:33 +08:00
|
|
|
#ifdef CONFIG_MMU
|
2016-03-26 05:20:39 +08:00
|
|
|
struct task_struct *oom_reaper_list;
|
2016-03-26 05:20:33 +08:00
|
|
|
#endif
|
2015-07-17 18:28:11 +08:00
|
|
|
/* CPU-specific state of this task */
|
|
|
|
struct thread_struct thread;
|
|
|
|
/*
|
|
|
|
* WARNING: on x86, 'thread_struct' contains a variable-sized
|
|
|
|
* structure. It *MUST* be at the end of 'task_struct'.
|
|
|
|
*
|
|
|
|
* Do not put anything below here!
|
|
|
|
*/
|
2005-04-17 06:20:36 +08:00
|
|
|
};
|
|
|
|
|
2015-07-17 18:28:12 +08:00
|
|
|
#ifdef CONFIG_ARCH_WANTS_DYNAMIC_TASK_STRUCT
|
|
|
|
extern int arch_task_struct_size __read_mostly;
|
|
|
|
#else
|
|
|
|
# define arch_task_struct_size (sizeof(struct task_struct))
|
|
|
|
#endif
|
2015-07-17 18:28:11 +08:00
|
|
|
|
2009-03-13 04:35:43 +08:00
|
|
|
/* Future-safe accessor for struct task_struct's cpus_allowed. */
|
2009-12-18 01:43:29 +08:00
|
|
|
#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
|
2009-03-13 04:35:43 +08:00
|
|
|
|
2013-10-07 18:29:24 +08:00
|
|
|
#define TNF_MIGRATED 0x01
|
|
|
|
#define TNF_NO_GROUP 0x02
|
2013-10-07 18:29:34 +08:00
|
|
|
#define TNF_SHARED 0x04
|
2013-10-07 18:29:36 +08:00
|
|
|
#define TNF_FAULT_LOCAL 0x08
|
2015-03-26 06:55:42 +08:00
|
|
|
#define TNF_MIGRATE_FAIL 0x10
|
2013-10-07 18:29:24 +08:00
|
|
|
|
2012-10-25 20:16:43 +08:00
|
|
|
#ifdef CONFIG_NUMA_BALANCING
|
2013-10-07 18:29:24 +08:00
|
|
|
extern void task_numa_fault(int last_node, int node, int pages, int flags);
|
2013-10-07 18:29:22 +08:00
|
|
|
extern pid_t task_numa_group_id(struct task_struct *p);
|
2012-11-22 19:16:36 +08:00
|
|
|
extern void set_numabalancing_state(bool enabled);
|
2013-10-07 18:29:28 +08:00
|
|
|
extern void task_numa_free(struct task_struct *p);
|
2014-01-28 06:03:44 +08:00
|
|
|
extern bool should_numa_migrate_memory(struct task_struct *p, struct page *page,
|
|
|
|
int src_nid, int dst_cpu);
|
2012-10-25 20:16:43 +08:00
|
|
|
#else
|
2013-10-07 18:29:03 +08:00
|
|
|
static inline void task_numa_fault(int last_node, int node, int pages,
|
2013-10-07 18:29:24 +08:00
|
|
|
int flags)
|
2012-10-25 20:16:43 +08:00
|
|
|
{
|
|
|
|
}
|
2013-10-07 18:29:22 +08:00
|
|
|
static inline pid_t task_numa_group_id(struct task_struct *p)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
2012-11-22 19:16:36 +08:00
|
|
|
static inline void set_numabalancing_state(bool enabled)
|
|
|
|
{
|
|
|
|
}
|
2013-10-07 18:29:28 +08:00
|
|
|
static inline void task_numa_free(struct task_struct *p)
|
|
|
|
{
|
|
|
|
}
|
2014-01-28 06:03:44 +08:00
|
|
|
static inline bool should_numa_migrate_memory(struct task_struct *p,
|
|
|
|
struct page *page, int src_nid, int dst_cpu)
|
|
|
|
{
|
|
|
|
return true;
|
|
|
|
}
|
2012-10-25 20:16:43 +08:00
|
|
|
#endif
|
|
|
|
|
2007-10-26 16:17:22 +08:00
|
|
|
static inline struct pid *task_pid(struct task_struct *task)
|
2006-10-02 17:17:09 +08:00
|
|
|
{
|
|
|
|
return task->pids[PIDTYPE_PID].pid;
|
|
|
|
}
|
|
|
|
|
2007-10-26 16:17:22 +08:00
|
|
|
static inline struct pid *task_tgid(struct task_struct *task)
|
2006-10-02 17:17:09 +08:00
|
|
|
{
|
|
|
|
return task->group_leader->pids[PIDTYPE_PID].pid;
|
|
|
|
}
|
|
|
|
|
2009-04-03 07:58:35 +08:00
|
|
|
/*
|
|
|
|
* Without tasklist or rcu lock it is not safe to dereference
|
|
|
|
* the result of task_pgrp/task_session even if task == current,
|
|
|
|
* we can race with another thread doing sys_setsid/sys_setpgid.
|
|
|
|
*/
|
2007-10-26 16:17:22 +08:00
|
|
|
static inline struct pid *task_pgrp(struct task_struct *task)
|
2006-10-02 17:17:09 +08:00
|
|
|
{
|
|
|
|
return task->group_leader->pids[PIDTYPE_PGID].pid;
|
|
|
|
}
|
|
|
|
|
2007-10-26 16:17:22 +08:00
|
|
|
static inline struct pid *task_session(struct task_struct *task)
|
2006-10-02 17:17:09 +08:00
|
|
|
{
|
|
|
|
return task->group_leader->pids[PIDTYPE_SID].pid;
|
|
|
|
}
|
|
|
|
|
2007-10-19 14:40:06 +08:00
|
|
|
struct pid_namespace;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* the helpers to get the task's different pids as they are seen
|
|
|
|
* from various namespaces
|
|
|
|
*
|
|
|
|
* task_xid_nr() : global id, i.e. the id seen from the init namespace;
|
2008-02-08 20:19:15 +08:00
|
|
|
* task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of
|
|
|
|
* current.
|
2007-10-19 14:40:06 +08:00
|
|
|
* task_xid_nr_ns() : id seen from the ns specified;
|
|
|
|
*
|
|
|
|
* set_task_vxid() : assigns a virtual id to a task;
|
|
|
|
*
|
|
|
|
* see also pid_nr() etc in include/linux/pid.h
|
|
|
|
*/
|
pids: refactor vnr/nr_ns helpers to make them safe
Inho, the safety rules for vnr/nr_ns helpers are horrible and buggy.
task_pid_nr_ns(task) needs rcu/tasklist depending on task == current.
As for "special" pids, vnr/nr_ns helpers always need rcu. However, if
task != current, they are unsafe even under rcu lock, we can't trust
task->group_leader without the special checks.
And almost every helper has a callsite which needs a fix.
Also, it is a bit annoying that the implementations of, say,
task_pgrp_vnr() and task_pgrp_nr_ns() are not "symmetrical".
This patch introduces the new helper, __task_pid_nr_ns(), which is always
safe to use, and turns all other helpers into the trivial wrappers.
After this I'll send another patch which converts task_tgid_xxx() as well,
they're are a bit special.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Louis Rilling <Louis.Rilling@kerlabs.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-03 07:58:38 +08:00
|
|
|
pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
|
|
|
|
struct pid_namespace *ns);
|
2007-10-19 14:40:06 +08:00
|
|
|
|
2007-10-26 16:17:22 +08:00
|
|
|
static inline pid_t task_pid_nr(struct task_struct *tsk)
|
2007-10-19 14:40:06 +08:00
|
|
|
{
|
|
|
|
return tsk->pid;
|
|
|
|
}
|
|
|
|
|
pids: refactor vnr/nr_ns helpers to make them safe
Inho, the safety rules for vnr/nr_ns helpers are horrible and buggy.
task_pid_nr_ns(task) needs rcu/tasklist depending on task == current.
As for "special" pids, vnr/nr_ns helpers always need rcu. However, if
task != current, they are unsafe even under rcu lock, we can't trust
task->group_leader without the special checks.
And almost every helper has a callsite which needs a fix.
Also, it is a bit annoying that the implementations of, say,
task_pgrp_vnr() and task_pgrp_nr_ns() are not "symmetrical".
This patch introduces the new helper, __task_pid_nr_ns(), which is always
safe to use, and turns all other helpers into the trivial wrappers.
After this I'll send another patch which converts task_tgid_xxx() as well,
they're are a bit special.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Louis Rilling <Louis.Rilling@kerlabs.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-03 07:58:38 +08:00
|
|
|
static inline pid_t task_pid_nr_ns(struct task_struct *tsk,
|
|
|
|
struct pid_namespace *ns)
|
|
|
|
{
|
|
|
|
return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
|
|
|
|
}
|
2007-10-19 14:40:06 +08:00
|
|
|
|
|
|
|
static inline pid_t task_pid_vnr(struct task_struct *tsk)
|
|
|
|
{
|
pids: refactor vnr/nr_ns helpers to make them safe
Inho, the safety rules for vnr/nr_ns helpers are horrible and buggy.
task_pid_nr_ns(task) needs rcu/tasklist depending on task == current.
As for "special" pids, vnr/nr_ns helpers always need rcu. However, if
task != current, they are unsafe even under rcu lock, we can't trust
task->group_leader without the special checks.
And almost every helper has a callsite which needs a fix.
Also, it is a bit annoying that the implementations of, say,
task_pgrp_vnr() and task_pgrp_nr_ns() are not "symmetrical".
This patch introduces the new helper, __task_pid_nr_ns(), which is always
safe to use, and turns all other helpers into the trivial wrappers.
After this I'll send another patch which converts task_tgid_xxx() as well,
they're are a bit special.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Louis Rilling <Louis.Rilling@kerlabs.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-03 07:58:38 +08:00
|
|
|
return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
|
2007-10-19 14:40:06 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
2007-10-26 16:17:22 +08:00
|
|
|
static inline pid_t task_tgid_nr(struct task_struct *tsk)
|
2007-10-19 14:40:06 +08:00
|
|
|
{
|
|
|
|
return tsk->tgid;
|
|
|
|
}
|
|
|
|
|
2007-10-19 14:40:19 +08:00
|
|
|
pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
|
2007-10-19 14:40:06 +08:00
|
|
|
|
|
|
|
static inline pid_t task_tgid_vnr(struct task_struct *tsk)
|
|
|
|
{
|
|
|
|
return pid_vnr(task_tgid(tsk));
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2014-03-17 02:00:19 +08:00
|
|
|
static inline int pid_alive(const struct task_struct *p);
|
2013-08-16 06:05:12 +08:00
|
|
|
static inline pid_t task_ppid_nr_ns(const struct task_struct *tsk, struct pid_namespace *ns)
|
|
|
|
{
|
|
|
|
pid_t pid = 0;
|
|
|
|
|
|
|
|
rcu_read_lock();
|
|
|
|
if (pid_alive(tsk))
|
|
|
|
pid = task_tgid_nr_ns(rcu_dereference(tsk->real_parent), ns);
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
|
|
|
return pid;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline pid_t task_ppid_nr(const struct task_struct *tsk)
|
|
|
|
{
|
|
|
|
return task_ppid_nr_ns(tsk, &init_pid_ns);
|
|
|
|
}
|
|
|
|
|
pids: refactor vnr/nr_ns helpers to make them safe
Inho, the safety rules for vnr/nr_ns helpers are horrible and buggy.
task_pid_nr_ns(task) needs rcu/tasklist depending on task == current.
As for "special" pids, vnr/nr_ns helpers always need rcu. However, if
task != current, they are unsafe even under rcu lock, we can't trust
task->group_leader without the special checks.
And almost every helper has a callsite which needs a fix.
Also, it is a bit annoying that the implementations of, say,
task_pgrp_vnr() and task_pgrp_nr_ns() are not "symmetrical".
This patch introduces the new helper, __task_pid_nr_ns(), which is always
safe to use, and turns all other helpers into the trivial wrappers.
After this I'll send another patch which converts task_tgid_xxx() as well,
they're are a bit special.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Louis Rilling <Louis.Rilling@kerlabs.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-03 07:58:38 +08:00
|
|
|
static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk,
|
|
|
|
struct pid_namespace *ns)
|
2007-10-19 14:40:06 +08:00
|
|
|
{
|
pids: refactor vnr/nr_ns helpers to make them safe
Inho, the safety rules for vnr/nr_ns helpers are horrible and buggy.
task_pid_nr_ns(task) needs rcu/tasklist depending on task == current.
As for "special" pids, vnr/nr_ns helpers always need rcu. However, if
task != current, they are unsafe even under rcu lock, we can't trust
task->group_leader without the special checks.
And almost every helper has a callsite which needs a fix.
Also, it is a bit annoying that the implementations of, say,
task_pgrp_vnr() and task_pgrp_nr_ns() are not "symmetrical".
This patch introduces the new helper, __task_pid_nr_ns(), which is always
safe to use, and turns all other helpers into the trivial wrappers.
After this I'll send another patch which converts task_tgid_xxx() as well,
they're are a bit special.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Louis Rilling <Louis.Rilling@kerlabs.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-03 07:58:38 +08:00
|
|
|
return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
|
2007-10-19 14:40:06 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
|
|
|
|
{
|
pids: refactor vnr/nr_ns helpers to make them safe
Inho, the safety rules for vnr/nr_ns helpers are horrible and buggy.
task_pid_nr_ns(task) needs rcu/tasklist depending on task == current.
As for "special" pids, vnr/nr_ns helpers always need rcu. However, if
task != current, they are unsafe even under rcu lock, we can't trust
task->group_leader without the special checks.
And almost every helper has a callsite which needs a fix.
Also, it is a bit annoying that the implementations of, say,
task_pgrp_vnr() and task_pgrp_nr_ns() are not "symmetrical".
This patch introduces the new helper, __task_pid_nr_ns(), which is always
safe to use, and turns all other helpers into the trivial wrappers.
After this I'll send another patch which converts task_tgid_xxx() as well,
they're are a bit special.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Louis Rilling <Louis.Rilling@kerlabs.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-03 07:58:38 +08:00
|
|
|
return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);
|
2007-10-19 14:40:06 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
|
pids: refactor vnr/nr_ns helpers to make them safe
Inho, the safety rules for vnr/nr_ns helpers are horrible and buggy.
task_pid_nr_ns(task) needs rcu/tasklist depending on task == current.
As for "special" pids, vnr/nr_ns helpers always need rcu. However, if
task != current, they are unsafe even under rcu lock, we can't trust
task->group_leader without the special checks.
And almost every helper has a callsite which needs a fix.
Also, it is a bit annoying that the implementations of, say,
task_pgrp_vnr() and task_pgrp_nr_ns() are not "symmetrical".
This patch introduces the new helper, __task_pid_nr_ns(), which is always
safe to use, and turns all other helpers into the trivial wrappers.
After this I'll send another patch which converts task_tgid_xxx() as well,
they're are a bit special.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Louis Rilling <Louis.Rilling@kerlabs.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-03 07:58:38 +08:00
|
|
|
static inline pid_t task_session_nr_ns(struct task_struct *tsk,
|
|
|
|
struct pid_namespace *ns)
|
2007-10-19 14:40:06 +08:00
|
|
|
{
|
pids: refactor vnr/nr_ns helpers to make them safe
Inho, the safety rules for vnr/nr_ns helpers are horrible and buggy.
task_pid_nr_ns(task) needs rcu/tasklist depending on task == current.
As for "special" pids, vnr/nr_ns helpers always need rcu. However, if
task != current, they are unsafe even under rcu lock, we can't trust
task->group_leader without the special checks.
And almost every helper has a callsite which needs a fix.
Also, it is a bit annoying that the implementations of, say,
task_pgrp_vnr() and task_pgrp_nr_ns() are not "symmetrical".
This patch introduces the new helper, __task_pid_nr_ns(), which is always
safe to use, and turns all other helpers into the trivial wrappers.
After this I'll send another patch which converts task_tgid_xxx() as well,
they're are a bit special.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Louis Rilling <Louis.Rilling@kerlabs.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-03 07:58:38 +08:00
|
|
|
return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
|
2007-10-19 14:40:06 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline pid_t task_session_vnr(struct task_struct *tsk)
|
|
|
|
{
|
pids: refactor vnr/nr_ns helpers to make them safe
Inho, the safety rules for vnr/nr_ns helpers are horrible and buggy.
task_pid_nr_ns(task) needs rcu/tasklist depending on task == current.
As for "special" pids, vnr/nr_ns helpers always need rcu. However, if
task != current, they are unsafe even under rcu lock, we can't trust
task->group_leader without the special checks.
And almost every helper has a callsite which needs a fix.
Also, it is a bit annoying that the implementations of, say,
task_pgrp_vnr() and task_pgrp_nr_ns() are not "symmetrical".
This patch introduces the new helper, __task_pid_nr_ns(), which is always
safe to use, and turns all other helpers into the trivial wrappers.
After this I'll send another patch which converts task_tgid_xxx() as well,
they're are a bit special.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Louis Rilling <Louis.Rilling@kerlabs.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Pavel Emelyanov <xemul@openvz.org>
Cc: Sukadev Bhattiprolu <sukadev@linux.vnet.ibm.com>
Cc: Roland McGrath <roland@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-03 07:58:38 +08:00
|
|
|
return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
|
2007-10-19 14:40:06 +08:00
|
|
|
}
|
|
|
|
|
2009-04-03 07:58:39 +08:00
|
|
|
/* obsolete, do not use */
|
|
|
|
static inline pid_t task_pgrp_nr(struct task_struct *tsk)
|
|
|
|
{
|
|
|
|
return task_pgrp_nr_ns(tsk, &init_pid_ns);
|
|
|
|
}
|
2007-10-19 14:40:06 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/**
|
|
|
|
* pid_alive - check that a task structure is not stale
|
|
|
|
* @p: Task structure to be checked.
|
|
|
|
*
|
|
|
|
* Test if a process is not yet dead (at most zombie state)
|
|
|
|
* If pid_alive fails, then pointers within the task structure
|
|
|
|
* can be stale and must not be dereferenced.
|
2013-07-13 02:45:47 +08:00
|
|
|
*
|
|
|
|
* Return: 1 if the process is alive. 0 otherwise.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2013-08-16 06:05:12 +08:00
|
|
|
static inline int pid_alive(const struct task_struct *p)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
[PATCH] pidhash: Refactor the pid hash table
Simplifies the code, reduces the need for 4 pid hash tables, and makes the
code more capable.
In the discussions I had with Oleg it was felt that to a large extent the
cleanup itself justified the work. With struct pid being dynamically
allocated meant we could create the hash table entry when the pid was
allocated and free the hash table entry when the pid was freed. Instead of
playing with the hash lists when ever a process would attach or detach to a
process.
For myself the fact that it gave what my previous task_ref patch gave for free
with simpler code was a big win. The problem is that if you hold a reference
to struct task_struct you lock in 10K of low memory. If you do that in a user
controllable way like /proc does, with an unprivileged but hostile user space
application with typical resource limits of 1000 fds and 100 processes I can
trigger the OOM killer by consuming all of low memory with task structs, on a
machine wight 1GB of low memory.
If I instead hold a reference to struct pid which holds a pointer to my
task_struct, I don't suffer from that problem because struct pid is 2 orders
of magnitude smaller. In fact struct pid is small enough that most other
kernel data structures dwarf it, so simply limiting the number of referring
data structures is enough to prevent exhaustion of low memory.
This splits the current struct pid into two structures, struct pid and struct
pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one.
struct pid_link is the per process linkage into the hash tables and lives in
struct task_struct. struct pid is given an indepedent lifetime, and holds
pointers to each of the pid types.
The independent life of struct pid simplifies attach_pid, and detach_pid,
because we are always manipulating the list of pids and not the hash table.
In addition in giving struct pid an indpendent life it makes the concept much
more powerful.
Kernel data structures can now embed a struct pid * instead of a pid_t and
not suffer from pid wrap around problems or from keeping unnecessarily
large amounts of memory allocated.
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 18:31:42 +08:00
|
|
|
return p->pids[PIDTYPE_PID].pid != NULL;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2006-09-29 17:00:07 +08:00
|
|
|
/**
|
2016-01-01 22:03:01 +08:00
|
|
|
* is_global_init - check if a task structure is init. Since init
|
|
|
|
* is free to have sub-threads we need to check tgid.
|
2006-10-06 15:44:01 +08:00
|
|
|
* @tsk: Task structure to be checked.
|
|
|
|
*
|
|
|
|
* Check if a task structure is the first user space task the kernel created.
|
2013-07-13 02:45:47 +08:00
|
|
|
*
|
|
|
|
* Return: 1 if the task structure is init. 0 otherwise.
|
2007-10-19 14:39:52 +08:00
|
|
|
*/
|
2007-10-26 16:17:22 +08:00
|
|
|
static inline int is_global_init(struct task_struct *tsk)
|
2007-10-19 14:40:09 +08:00
|
|
|
{
|
2016-01-01 22:03:01 +08:00
|
|
|
return task_tgid_nr(tsk) == 1;
|
2007-10-19 14:40:09 +08:00
|
|
|
}
|
2007-10-19 14:39:52 +08:00
|
|
|
|
2006-10-02 17:19:00 +08:00
|
|
|
extern struct pid *cad_pid;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
extern void free_task(struct task_struct *tsk);
|
|
|
|
#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
|
2006-01-08 17:01:37 +08:00
|
|
|
|
2006-03-31 18:31:34 +08:00
|
|
|
extern void __put_task_struct(struct task_struct *t);
|
2006-01-08 17:01:37 +08:00
|
|
|
|
|
|
|
static inline void put_task_struct(struct task_struct *t)
|
|
|
|
{
|
|
|
|
if (atomic_dec_and_test(&t->usage))
|
2006-03-31 18:31:37 +08:00
|
|
|
__put_task_struct(t);
|
2006-01-08 17:01:37 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2012-12-17 03:00:34 +08:00
|
|
|
#ifdef CONFIG_VIRT_CPU_ACCOUNTING_GEN
|
|
|
|
extern void task_cputime(struct task_struct *t,
|
|
|
|
cputime_t *utime, cputime_t *stime);
|
|
|
|
extern void task_cputime_scaled(struct task_struct *t,
|
|
|
|
cputime_t *utimescaled, cputime_t *stimescaled);
|
|
|
|
extern cputime_t task_gtime(struct task_struct *t);
|
|
|
|
#else
|
2012-11-13 21:20:55 +08:00
|
|
|
static inline void task_cputime(struct task_struct *t,
|
|
|
|
cputime_t *utime, cputime_t *stime)
|
|
|
|
{
|
|
|
|
if (utime)
|
|
|
|
*utime = t->utime;
|
|
|
|
if (stime)
|
|
|
|
*stime = t->stime;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void task_cputime_scaled(struct task_struct *t,
|
|
|
|
cputime_t *utimescaled,
|
|
|
|
cputime_t *stimescaled)
|
|
|
|
{
|
|
|
|
if (utimescaled)
|
|
|
|
*utimescaled = t->utimescaled;
|
|
|
|
if (stimescaled)
|
|
|
|
*stimescaled = t->stimescaled;
|
|
|
|
}
|
2012-12-17 03:00:34 +08:00
|
|
|
|
|
|
|
static inline cputime_t task_gtime(struct task_struct *t)
|
|
|
|
{
|
|
|
|
return t->gtime;
|
|
|
|
}
|
|
|
|
#endif
|
2012-11-21 23:26:44 +08:00
|
|
|
extern void task_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
|
|
|
|
extern void thread_group_cputime_adjusted(struct task_struct *p, cputime_t *ut, cputime_t *st);
|
2008-09-06 00:12:23 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Per process flags
|
|
|
|
*/
|
|
|
|
#define PF_EXITING 0x00000004 /* getting shut down */
|
2007-06-09 04:47:00 +08:00
|
|
|
#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
|
2007-10-15 23:00:19 +08:00
|
|
|
#define PF_VCPU 0x00000010 /* I'm a virtual CPU */
|
2010-06-09 03:40:37 +08:00
|
|
|
#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */
|
2005-04-17 06:20:36 +08:00
|
|
|
#define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */
|
2009-09-16 17:50:14 +08:00
|
|
|
#define PF_MCE_PROCESS 0x00000080 /* process policy on mce errors */
|
2005-04-17 06:20:36 +08:00
|
|
|
#define PF_SUPERPRIV 0x00000100 /* used super-user privileges */
|
|
|
|
#define PF_DUMPCORE 0x00000200 /* dumped core */
|
|
|
|
#define PF_SIGNALED 0x00000400 /* killed by a signal */
|
|
|
|
#define PF_MEMALLOC 0x00000800 /* Allocating memory */
|
2011-08-08 23:02:04 +08:00
|
|
|
#define PF_NPROC_EXCEEDED 0x00001000 /* set_user noticed that RLIMIT_NPROC was exceeded */
|
2005-04-17 06:20:36 +08:00
|
|
|
#define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */
|
2013-01-16 10:52:51 +08:00
|
|
|
#define PF_USED_ASYNC 0x00004000 /* used async_schedule*(), used by module init */
|
2005-04-17 06:20:36 +08:00
|
|
|
#define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */
|
|
|
|
#define PF_FROZEN 0x00010000 /* frozen for system suspend */
|
|
|
|
#define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */
|
|
|
|
#define PF_KSWAPD 0x00040000 /* I am kswapd */
|
mm: teach mm by current context info to not do I/O during memory allocation
This patch introduces PF_MEMALLOC_NOIO on process flag('flags' field of
'struct task_struct'), so that the flag can be set by one task to avoid
doing I/O inside memory allocation in the task's context.
The patch trys to solve one deadlock problem caused by block device, and
the problem may happen at least in the below situations:
- during block device runtime resume, if memory allocation with
GFP_KERNEL is called inside runtime resume callback of any one of its
ancestors(or the block device itself), the deadlock may be triggered
inside the memory allocation since it might not complete until the block
device becomes active and the involed page I/O finishes. The situation
is pointed out first by Alan Stern. It is not a good approach to
convert all GFP_KERNEL[1] in the path into GFP_NOIO because several
subsystems may be involved(for example, PCI, USB and SCSI may be
involved for usb mass stoarage device, network devices involved too in
the iSCSI case)
- during block device runtime suspend, because runtime resume need to
wait for completion of concurrent runtime suspend.
- during error handling of usb mass storage deivce, USB bus reset will
be put on the device, so there shouldn't have any memory allocation with
GFP_KERNEL during USB bus reset, otherwise the deadlock similar with
above may be triggered. Unfortunately, any usb device may include one
mass storage interface in theory, so it requires all usb interface
drivers to handle the situation. In fact, most usb drivers don't know
how to handle bus reset on the device and don't provide .pre_set() and
.post_reset() callback at all, so USB core has to unbind and bind driver
for these devices. So it is still not practical to resort to GFP_NOIO
for solving the problem.
Also the introduced solution can be used by block subsystem or block
drivers too, for example, set the PF_MEMALLOC_NOIO flag before doing
actual I/O transfer.
It is not a good idea to convert all these GFP_KERNEL in the affected
path into GFP_NOIO because these functions doing that may be implemented
as library and will be called in many other contexts.
In fact, memalloc_noio_flags() can convert some of current static
GFP_NOIO allocation into GFP_KERNEL back in other non-affected contexts,
at least almost all GFP_NOIO in USB subsystem can be converted into
GFP_KERNEL after applying the approach and make allocation with GFP_NOIO
only happen in runtime resume/bus reset/block I/O transfer contexts
generally.
[1], several GFP_KERNEL allocation examples in runtime resume path
- pci subsystem
acpi_os_allocate
<-acpi_ut_allocate
<-ACPI_ALLOCATE_ZEROED
<-acpi_evaluate_object
<-__acpi_bus_set_power
<-acpi_bus_set_power
<-acpi_pci_set_power_state
<-platform_pci_set_power_state
<-pci_platform_power_transition
<-__pci_complete_power_transition
<-pci_set_power_state
<-pci_restore_standard_config
<-pci_pm_runtime_resume
- usb subsystem
usb_get_status
<-finish_port_resume
<-usb_port_resume
<-generic_resume
<-usb_resume_device
<-usb_resume_both
<-usb_runtime_resume
- some individual usb drivers
usblp, uvc, gspca, most of dvb-usb-v2 media drivers, cpia2, az6007, ....
That is just what I have found. Unfortunately, this allocation can only
be found by human being now, and there should be many not found since
any function in the resume path(call tree) may allocate memory with
GFP_KERNEL.
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Oliver Neukum <oneukum@suse.de>
Cc: Jiri Kosina <jiri.kosina@suse.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Greg KH <greg@kroah.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: David Decotigny <david.decotigny@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-23 08:34:08 +08:00
|
|
|
#define PF_MEMALLOC_NOIO 0x00080000 /* Allocating memory without IO involved */
|
2005-04-17 06:20:36 +08:00
|
|
|
#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
|
2008-07-25 16:47:38 +08:00
|
|
|
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
|
2006-06-13 14:26:10 +08:00
|
|
|
#define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */
|
|
|
|
#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
|
2013-03-20 04:45:20 +08:00
|
|
|
#define PF_NO_SETAFFINITY 0x04000000 /* Userland is not allowed to meddle with cpus_allowed */
|
2009-09-16 17:50:14 +08:00
|
|
|
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
|
2006-06-27 17:54:56 +08:00
|
|
|
#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
|
2011-02-16 16:25:31 +08:00
|
|
|
#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
|
2013-07-25 08:41:33 +08:00
|
|
|
#define PF_SUSPEND_TASK 0x80000000 /* this thread called freeze_processes and should not be frozen */
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Only the _current_ task can read/write to tsk->flags, but other
|
|
|
|
* tasks can access tsk->flags in readonly mode for example
|
|
|
|
* with tsk_used_math (like during threaded core dumping).
|
|
|
|
* There is however an exception to this rule during ptrace
|
|
|
|
* or during fork: the ptracer task is allowed to write to the
|
|
|
|
* child->flags of its traced child (same goes for fork, the parent
|
|
|
|
* can write to the child->flags), because we're guaranteed the
|
|
|
|
* child is not running and in turn not changing child->flags
|
|
|
|
* at the same time the parent does it.
|
|
|
|
*/
|
|
|
|
#define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0)
|
|
|
|
#define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0)
|
|
|
|
#define clear_used_math() clear_stopped_child_used_math(current)
|
|
|
|
#define set_used_math() set_stopped_child_used_math(current)
|
|
|
|
#define conditional_stopped_child_used_math(condition, child) \
|
|
|
|
do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0)
|
|
|
|
#define conditional_used_math(condition) \
|
|
|
|
conditional_stopped_child_used_math(condition, current)
|
|
|
|
#define copy_to_stopped_child_used_math(child) \
|
|
|
|
do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0)
|
|
|
|
/* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */
|
|
|
|
#define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
|
|
|
|
#define used_math() tsk_used_math(current)
|
|
|
|
|
mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set
commit 21caf2fc1931 ("mm: teach mm by current context info to not do I/O
during memory allocation") introduces PF_MEMALLOC_NOIO flag to avoid doing
I/O inside memory allocation, __GFP_IO is cleared when this flag is set,
but __GFP_FS implies __GFP_IO, it should also be cleared. Or it may still
run into I/O, like in superblock shrinker. And this will make the kernel
run into the deadlock case described in that commit.
See Dave Chinner's comment about io in superblock shrinker:
Filesystem shrinkers do indeed perform IO from the superblock shrinker and
have for years. Even clean inodes can require IO before they can be freed
- e.g. on an orphan list, need truncation of post-eof blocks, need to
wait for ordered operations to complete before it can be freed, etc.
IOWs, Ext4, btrfs and XFS all can issue and/or block on arbitrary amounts
of IO in the superblock shrinker context. XFS, in particular, has been
doing transactions and IO from the VFS inode cache shrinker since it was
first introduced....
Fix this by clearing __GFP_FS in memalloc_noio_flags(), this function has
masked all the gfp_mask that will be passed into fs for the processes
setting PF_MEMALLOC_NOIO in the direct reclaim path.
v1 thread at: https://lkml.org/lkml/2014/9/3/32
Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: joyce.xue <xuejiufei@huawei.com>
Cc: Ming Lei <ming.lei@canonical.com>
Cc: Trond Myklebust <trond.myklebust@primarydata.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-10-10 06:28:23 +08:00
|
|
|
/* __GFP_IO isn't allowed if PF_MEMALLOC_NOIO is set in current->flags
|
|
|
|
* __GFP_FS is also cleared as it implies __GFP_IO.
|
|
|
|
*/
|
mm: teach mm by current context info to not do I/O during memory allocation
This patch introduces PF_MEMALLOC_NOIO on process flag('flags' field of
'struct task_struct'), so that the flag can be set by one task to avoid
doing I/O inside memory allocation in the task's context.
The patch trys to solve one deadlock problem caused by block device, and
the problem may happen at least in the below situations:
- during block device runtime resume, if memory allocation with
GFP_KERNEL is called inside runtime resume callback of any one of its
ancestors(or the block device itself), the deadlock may be triggered
inside the memory allocation since it might not complete until the block
device becomes active and the involed page I/O finishes. The situation
is pointed out first by Alan Stern. It is not a good approach to
convert all GFP_KERNEL[1] in the path into GFP_NOIO because several
subsystems may be involved(for example, PCI, USB and SCSI may be
involved for usb mass stoarage device, network devices involved too in
the iSCSI case)
- during block device runtime suspend, because runtime resume need to
wait for completion of concurrent runtime suspend.
- during error handling of usb mass storage deivce, USB bus reset will
be put on the device, so there shouldn't have any memory allocation with
GFP_KERNEL during USB bus reset, otherwise the deadlock similar with
above may be triggered. Unfortunately, any usb device may include one
mass storage interface in theory, so it requires all usb interface
drivers to handle the situation. In fact, most usb drivers don't know
how to handle bus reset on the device and don't provide .pre_set() and
.post_reset() callback at all, so USB core has to unbind and bind driver
for these devices. So it is still not practical to resort to GFP_NOIO
for solving the problem.
Also the introduced solution can be used by block subsystem or block
drivers too, for example, set the PF_MEMALLOC_NOIO flag before doing
actual I/O transfer.
It is not a good idea to convert all these GFP_KERNEL in the affected
path into GFP_NOIO because these functions doing that may be implemented
as library and will be called in many other contexts.
In fact, memalloc_noio_flags() can convert some of current static
GFP_NOIO allocation into GFP_KERNEL back in other non-affected contexts,
at least almost all GFP_NOIO in USB subsystem can be converted into
GFP_KERNEL after applying the approach and make allocation with GFP_NOIO
only happen in runtime resume/bus reset/block I/O transfer contexts
generally.
[1], several GFP_KERNEL allocation examples in runtime resume path
- pci subsystem
acpi_os_allocate
<-acpi_ut_allocate
<-ACPI_ALLOCATE_ZEROED
<-acpi_evaluate_object
<-__acpi_bus_set_power
<-acpi_bus_set_power
<-acpi_pci_set_power_state
<-platform_pci_set_power_state
<-pci_platform_power_transition
<-__pci_complete_power_transition
<-pci_set_power_state
<-pci_restore_standard_config
<-pci_pm_runtime_resume
- usb subsystem
usb_get_status
<-finish_port_resume
<-usb_port_resume
<-generic_resume
<-usb_resume_device
<-usb_resume_both
<-usb_runtime_resume
- some individual usb drivers
usblp, uvc, gspca, most of dvb-usb-v2 media drivers, cpia2, az6007, ....
That is just what I have found. Unfortunately, this allocation can only
be found by human being now, and there should be many not found since
any function in the resume path(call tree) may allocate memory with
GFP_KERNEL.
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Oliver Neukum <oneukum@suse.de>
Cc: Jiri Kosina <jiri.kosina@suse.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Greg KH <greg@kroah.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: David Decotigny <david.decotigny@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-23 08:34:08 +08:00
|
|
|
static inline gfp_t memalloc_noio_flags(gfp_t flags)
|
|
|
|
{
|
|
|
|
if (unlikely(current->flags & PF_MEMALLOC_NOIO))
|
mm: clear __GFP_FS when PF_MEMALLOC_NOIO is set
commit 21caf2fc1931 ("mm: teach mm by current context info to not do I/O
during memory allocation") introduces PF_MEMALLOC_NOIO flag to avoid doing
I/O inside memory allocation, __GFP_IO is cleared when this flag is set,
but __GFP_FS implies __GFP_IO, it should also be cleared. Or it may still
run into I/O, like in superblock shrinker. And this will make the kernel
run into the deadlock case described in that commit.
See Dave Chinner's comment about io in superblock shrinker:
Filesystem shrinkers do indeed perform IO from the superblock shrinker and
have for years. Even clean inodes can require IO before they can be freed
- e.g. on an orphan list, need truncation of post-eof blocks, need to
wait for ordered operations to complete before it can be freed, etc.
IOWs, Ext4, btrfs and XFS all can issue and/or block on arbitrary amounts
of IO in the superblock shrinker context. XFS, in particular, has been
doing transactions and IO from the VFS inode cache shrinker since it was
first introduced....
Fix this by clearing __GFP_FS in memalloc_noio_flags(), this function has
masked all the gfp_mask that will be passed into fs for the processes
setting PF_MEMALLOC_NOIO in the direct reclaim path.
v1 thread at: https://lkml.org/lkml/2014/9/3/32
Signed-off-by: Junxiao Bi <junxiao.bi@oracle.com>
Cc: Dave Chinner <david@fromorbit.com>
Cc: joyce.xue <xuejiufei@huawei.com>
Cc: Ming Lei <ming.lei@canonical.com>
Cc: Trond Myklebust <trond.myklebust@primarydata.com>
Cc: <stable@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-10-10 06:28:23 +08:00
|
|
|
flags &= ~(__GFP_IO | __GFP_FS);
|
mm: teach mm by current context info to not do I/O during memory allocation
This patch introduces PF_MEMALLOC_NOIO on process flag('flags' field of
'struct task_struct'), so that the flag can be set by one task to avoid
doing I/O inside memory allocation in the task's context.
The patch trys to solve one deadlock problem caused by block device, and
the problem may happen at least in the below situations:
- during block device runtime resume, if memory allocation with
GFP_KERNEL is called inside runtime resume callback of any one of its
ancestors(or the block device itself), the deadlock may be triggered
inside the memory allocation since it might not complete until the block
device becomes active and the involed page I/O finishes. The situation
is pointed out first by Alan Stern. It is not a good approach to
convert all GFP_KERNEL[1] in the path into GFP_NOIO because several
subsystems may be involved(for example, PCI, USB and SCSI may be
involved for usb mass stoarage device, network devices involved too in
the iSCSI case)
- during block device runtime suspend, because runtime resume need to
wait for completion of concurrent runtime suspend.
- during error handling of usb mass storage deivce, USB bus reset will
be put on the device, so there shouldn't have any memory allocation with
GFP_KERNEL during USB bus reset, otherwise the deadlock similar with
above may be triggered. Unfortunately, any usb device may include one
mass storage interface in theory, so it requires all usb interface
drivers to handle the situation. In fact, most usb drivers don't know
how to handle bus reset on the device and don't provide .pre_set() and
.post_reset() callback at all, so USB core has to unbind and bind driver
for these devices. So it is still not practical to resort to GFP_NOIO
for solving the problem.
Also the introduced solution can be used by block subsystem or block
drivers too, for example, set the PF_MEMALLOC_NOIO flag before doing
actual I/O transfer.
It is not a good idea to convert all these GFP_KERNEL in the affected
path into GFP_NOIO because these functions doing that may be implemented
as library and will be called in many other contexts.
In fact, memalloc_noio_flags() can convert some of current static
GFP_NOIO allocation into GFP_KERNEL back in other non-affected contexts,
at least almost all GFP_NOIO in USB subsystem can be converted into
GFP_KERNEL after applying the approach and make allocation with GFP_NOIO
only happen in runtime resume/bus reset/block I/O transfer contexts
generally.
[1], several GFP_KERNEL allocation examples in runtime resume path
- pci subsystem
acpi_os_allocate
<-acpi_ut_allocate
<-ACPI_ALLOCATE_ZEROED
<-acpi_evaluate_object
<-__acpi_bus_set_power
<-acpi_bus_set_power
<-acpi_pci_set_power_state
<-platform_pci_set_power_state
<-pci_platform_power_transition
<-__pci_complete_power_transition
<-pci_set_power_state
<-pci_restore_standard_config
<-pci_pm_runtime_resume
- usb subsystem
usb_get_status
<-finish_port_resume
<-usb_port_resume
<-generic_resume
<-usb_resume_device
<-usb_resume_both
<-usb_runtime_resume
- some individual usb drivers
usblp, uvc, gspca, most of dvb-usb-v2 media drivers, cpia2, az6007, ....
That is just what I have found. Unfortunately, this allocation can only
be found by human being now, and there should be many not found since
any function in the resume path(call tree) may allocate memory with
GFP_KERNEL.
Signed-off-by: Ming Lei <ming.lei@canonical.com>
Signed-off-by: Minchan Kim <minchan@kernel.org>
Cc: Alan Stern <stern@rowland.harvard.edu>
Cc: Oliver Neukum <oneukum@suse.de>
Cc: Jiri Kosina <jiri.kosina@suse.com>
Cc: Mel Gorman <mel@csn.ul.ie>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: "Rafael J. Wysocki" <rjw@sisk.pl>
Cc: Greg KH <greg@kroah.com>
Cc: Jens Axboe <axboe@kernel.dk>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Eric Dumazet <eric.dumazet@gmail.com>
Cc: David Decotigny <david.decotigny@google.com>
Cc: Tom Herbert <therbert@google.com>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2013-02-23 08:34:08 +08:00
|
|
|
return flags;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline unsigned int memalloc_noio_save(void)
|
|
|
|
{
|
|
|
|
unsigned int flags = current->flags & PF_MEMALLOC_NOIO;
|
|
|
|
current->flags |= PF_MEMALLOC_NOIO;
|
|
|
|
return flags;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void memalloc_noio_restore(unsigned int flags)
|
|
|
|
{
|
|
|
|
current->flags = (current->flags & ~PF_MEMALLOC_NOIO) | flags;
|
|
|
|
}
|
|
|
|
|
2014-05-22 06:23:46 +08:00
|
|
|
/* Per-process atomic flags. */
|
2014-09-25 09:40:17 +08:00
|
|
|
#define PFA_NO_NEW_PRIVS 0 /* May not gain new privileges. */
|
2014-09-25 09:41:02 +08:00
|
|
|
#define PFA_SPREAD_PAGE 1 /* Spread page cache over cpuset */
|
|
|
|
#define PFA_SPREAD_SLAB 2 /* Spread some slab caches over cpuset */
|
|
|
|
|
2014-05-22 06:23:46 +08:00
|
|
|
|
2014-09-25 09:40:40 +08:00
|
|
|
#define TASK_PFA_TEST(name, func) \
|
|
|
|
static inline bool task_##func(struct task_struct *p) \
|
|
|
|
{ return test_bit(PFA_##name, &p->atomic_flags); }
|
|
|
|
#define TASK_PFA_SET(name, func) \
|
|
|
|
static inline void task_set_##func(struct task_struct *p) \
|
|
|
|
{ set_bit(PFA_##name, &p->atomic_flags); }
|
|
|
|
#define TASK_PFA_CLEAR(name, func) \
|
|
|
|
static inline void task_clear_##func(struct task_struct *p) \
|
|
|
|
{ clear_bit(PFA_##name, &p->atomic_flags); }
|
|
|
|
|
|
|
|
TASK_PFA_TEST(NO_NEW_PRIVS, no_new_privs)
|
|
|
|
TASK_PFA_SET(NO_NEW_PRIVS, no_new_privs)
|
2014-05-22 06:23:46 +08:00
|
|
|
|
2014-09-25 09:41:02 +08:00
|
|
|
TASK_PFA_TEST(SPREAD_PAGE, spread_page)
|
|
|
|
TASK_PFA_SET(SPREAD_PAGE, spread_page)
|
|
|
|
TASK_PFA_CLEAR(SPREAD_PAGE, spread_page)
|
|
|
|
|
|
|
|
TASK_PFA_TEST(SPREAD_SLAB, spread_slab)
|
|
|
|
TASK_PFA_SET(SPREAD_SLAB, spread_slab)
|
|
|
|
TASK_PFA_CLEAR(SPREAD_SLAB, spread_slab)
|
2014-05-22 06:23:46 +08:00
|
|
|
|
signal: Fix premature completion of group stop when interfered by ptrace
task->signal->group_stop_count is used to track the progress of group
stop. It's initialized to the number of tasks which need to stop for
group stop to finish and each stopping or trapping task decrements.
However, each task doesn't keep track of whether it decremented the
counter or not and if woken up before the group stop is complete and
stops again, it can decrement the counter multiple times.
Please consider the following example code.
static void *worker(void *arg)
{
while (1) ;
return NULL;
}
int main(void)
{
pthread_t thread;
pid_t pid;
int i;
pid = fork();
if (!pid) {
for (i = 0; i < 5; i++)
pthread_create(&thread, NULL, worker, NULL);
while (1) ;
return 0;
}
ptrace(PTRACE_ATTACH, pid, NULL, NULL);
while (1) {
waitid(P_PID, pid, NULL, WSTOPPED);
ptrace(PTRACE_SINGLESTEP, pid, NULL, (void *)(long)SIGSTOP);
}
return 0;
}
The child creates five threads and the parent continuously traps the
first thread and whenever the child gets a signal, SIGSTOP is
delivered. If an external process sends SIGSTOP to the child, all
other threads in the process should reliably stop. However, due to
the above bug, the first thread will often end up consuming
group_stop_count multiple times and SIGSTOP often ends up stopping
none or part of the other four threads.
This patch adds a new field task->group_stop which is protected by
siglock and uses GROUP_STOP_CONSUME flag to track which task is still
to consume group_stop_count to fix this bug.
task_clear_group_stop_pending() and task_participate_group_stop() are
added to help manipulating group stop states. As ptrace_stop() now
also uses task_participate_group_stop(), it will set
SIGNAL_STOP_STOPPED if it completes a group stop.
There still are many issues regarding the interaction between group
stop and ptrace. Patches to address them will follow.
- Oleg spotted duplicate GROUP_STOP_CONSUME. Dropped.
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
2011-03-23 17:37:00 +08:00
|
|
|
/*
|
2011-06-02 17:13:59 +08:00
|
|
|
* task->jobctl flags
|
signal: Fix premature completion of group stop when interfered by ptrace
task->signal->group_stop_count is used to track the progress of group
stop. It's initialized to the number of tasks which need to stop for
group stop to finish and each stopping or trapping task decrements.
However, each task doesn't keep track of whether it decremented the
counter or not and if woken up before the group stop is complete and
stops again, it can decrement the counter multiple times.
Please consider the following example code.
static void *worker(void *arg)
{
while (1) ;
return NULL;
}
int main(void)
{
pthread_t thread;
pid_t pid;
int i;
pid = fork();
if (!pid) {
for (i = 0; i < 5; i++)
pthread_create(&thread, NULL, worker, NULL);
while (1) ;
return 0;
}
ptrace(PTRACE_ATTACH, pid, NULL, NULL);
while (1) {
waitid(P_PID, pid, NULL, WSTOPPED);
ptrace(PTRACE_SINGLESTEP, pid, NULL, (void *)(long)SIGSTOP);
}
return 0;
}
The child creates five threads and the parent continuously traps the
first thread and whenever the child gets a signal, SIGSTOP is
delivered. If an external process sends SIGSTOP to the child, all
other threads in the process should reliably stop. However, due to
the above bug, the first thread will often end up consuming
group_stop_count multiple times and SIGSTOP often ends up stopping
none or part of the other four threads.
This patch adds a new field task->group_stop which is protected by
siglock and uses GROUP_STOP_CONSUME flag to track which task is still
to consume group_stop_count to fix this bug.
task_clear_group_stop_pending() and task_participate_group_stop() are
added to help manipulating group stop states. As ptrace_stop() now
also uses task_participate_group_stop(), it will set
SIGNAL_STOP_STOPPED if it completes a group stop.
There still are many issues regarding the interaction between group
stop and ptrace. Patches to address them will follow.
- Oleg spotted duplicate GROUP_STOP_CONSUME. Dropped.
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
2011-03-23 17:37:00 +08:00
|
|
|
*/
|
2011-06-02 17:13:59 +08:00
|
|
|
#define JOBCTL_STOP_SIGMASK 0xffff /* signr of the last group stop */
|
signal: Fix premature completion of group stop when interfered by ptrace
task->signal->group_stop_count is used to track the progress of group
stop. It's initialized to the number of tasks which need to stop for
group stop to finish and each stopping or trapping task decrements.
However, each task doesn't keep track of whether it decremented the
counter or not and if woken up before the group stop is complete and
stops again, it can decrement the counter multiple times.
Please consider the following example code.
static void *worker(void *arg)
{
while (1) ;
return NULL;
}
int main(void)
{
pthread_t thread;
pid_t pid;
int i;
pid = fork();
if (!pid) {
for (i = 0; i < 5; i++)
pthread_create(&thread, NULL, worker, NULL);
while (1) ;
return 0;
}
ptrace(PTRACE_ATTACH, pid, NULL, NULL);
while (1) {
waitid(P_PID, pid, NULL, WSTOPPED);
ptrace(PTRACE_SINGLESTEP, pid, NULL, (void *)(long)SIGSTOP);
}
return 0;
}
The child creates five threads and the parent continuously traps the
first thread and whenever the child gets a signal, SIGSTOP is
delivered. If an external process sends SIGSTOP to the child, all
other threads in the process should reliably stop. However, due to
the above bug, the first thread will often end up consuming
group_stop_count multiple times and SIGSTOP often ends up stopping
none or part of the other four threads.
This patch adds a new field task->group_stop which is protected by
siglock and uses GROUP_STOP_CONSUME flag to track which task is still
to consume group_stop_count to fix this bug.
task_clear_group_stop_pending() and task_participate_group_stop() are
added to help manipulating group stop states. As ptrace_stop() now
also uses task_participate_group_stop(), it will set
SIGNAL_STOP_STOPPED if it completes a group stop.
There still are many issues regarding the interaction between group
stop and ptrace. Patches to address them will follow.
- Oleg spotted duplicate GROUP_STOP_CONSUME. Dropped.
Signed-off-by: Tejun Heo <tj@kernel.org>
Acked-by: Oleg Nesterov <oleg@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
2011-03-23 17:37:00 +08:00
|
|
|
|
2011-06-02 17:13:59 +08:00
|
|
|
#define JOBCTL_STOP_DEQUEUED_BIT 16 /* stop signal dequeued */
|
|
|
|
#define JOBCTL_STOP_PENDING_BIT 17 /* task should stop for group stop */
|
|
|
|
#define JOBCTL_STOP_CONSUME_BIT 18 /* consume group stop count */
|
2011-06-14 17:20:14 +08:00
|
|
|
#define JOBCTL_TRAP_STOP_BIT 19 /* trap for STOP */
|
ptrace: implement TRAP_NOTIFY and use it for group stop events
Currently there's no way for ptracer to find out whether group stop
finished other than polling with INTERRUPT - GETSIGINFO - CONT
sequence. This patch implements group stop notification for ptracer
using STOP traps.
When group stop state of a seized tracee changes, JOBCTL_TRAP_NOTIFY
is set, which schedules a STOP trap which is sticky - it isn't cleared
by other traps and at least one STOP trap will happen eventually.
STOP trap is synchronization point for event notification and the
tracer can determine the current group stop state by looking at the
signal number portion of exit code (si_status from waitid(2) or
si_code from PTRACE_GETSIGINFO).
Notifications are generated both on start and end of group stops but,
because group stop participation always happens before STOP trap, this
doesn't cause an extra trap while tracee is participating in group
stop. The symmetry will be useful later.
Note that this notification works iff tracee is not trapped.
Currently there is no way to be notified of group stop state changes
while tracee is trapped. This will be addressed by a later patch.
An example program follows.
#define PTRACE_SEIZE 0x4206
#define PTRACE_INTERRUPT 0x4207
#define PTRACE_SEIZE_DEVEL 0x80000000
static const struct timespec ts1s = { .tv_sec = 1 };
int main(int argc, char **argv)
{
pid_t tracee, tracer;
int i;
tracee = fork();
if (!tracee)
while (1)
pause();
tracer = fork();
if (!tracer) {
siginfo_t si;
ptrace(PTRACE_SEIZE, tracee, NULL,
(void *)(unsigned long)PTRACE_SEIZE_DEVEL);
ptrace(PTRACE_INTERRUPT, tracee, NULL, NULL);
repeat:
waitid(P_PID, tracee, NULL, WSTOPPED);
ptrace(PTRACE_GETSIGINFO, tracee, NULL, &si);
if (!si.si_code) {
printf("tracer: SIG %d\n", si.si_signo);
ptrace(PTRACE_CONT, tracee, NULL,
(void *)(unsigned long)si.si_signo);
goto repeat;
}
printf("tracer: stopped=%d signo=%d\n",
si.si_signo != SIGTRAP, si.si_signo);
ptrace(PTRACE_CONT, tracee, NULL, NULL);
goto repeat;
}
for (i = 0; i < 3; i++) {
nanosleep(&ts1s, NULL);
printf("mother: SIGSTOP\n");
kill(tracee, SIGSTOP);
nanosleep(&ts1s, NULL);
printf("mother: SIGCONT\n");
kill(tracee, SIGCONT);
}
nanosleep(&ts1s, NULL);
kill(tracer, SIGKILL);
kill(tracee, SIGKILL);
return 0;
}
In the above program, tracer keeps tracee running and gets
notification of each group stop state changes.
# ./test-notify
tracer: stopped=0 signo=5
mother: SIGSTOP
tracer: SIG 19
tracer: stopped=1 signo=19
mother: SIGCONT
tracer: stopped=0 signo=5
tracer: SIG 18
mother: SIGSTOP
tracer: SIG 19
tracer: stopped=1 signo=19
mother: SIGCONT
tracer: stopped=0 signo=5
tracer: SIG 18
mother: SIGSTOP
tracer: SIG 19
tracer: stopped=1 signo=19
mother: SIGCONT
tracer: stopped=0 signo=5
tracer: SIG 18
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
2011-06-14 17:20:17 +08:00
|
|
|
#define JOBCTL_TRAP_NOTIFY_BIT 20 /* trap for NOTIFY */
|
2011-06-02 17:13:59 +08:00
|
|
|
#define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */
|
ptrace: implement PTRACE_LISTEN
The previous patch implemented async notification for ptrace but it
only worked while trace is running. This patch introduces
PTRACE_LISTEN which is suggested by Oleg Nestrov.
It's allowed iff tracee is in STOP trap and puts tracee into
quasi-running state - tracee never really runs but wait(2) and
ptrace(2) consider it to be running. While ptracer is listening,
tracee is allowed to re-enter STOP to notify an async event.
Listening state is cleared on the first notification. Ptracer can
also clear it by issuing INTERRUPT - tracee will re-trap into STOP
with listening state cleared.
This allows ptracer to monitor group stop state without running tracee
- use INTERRUPT to put tracee into STOP trap, issue LISTEN and then
wait(2) to wait for the next group stop event. When it happens,
PTRACE_GETSIGINFO provides information to determine the current state.
Test program follows.
#define PTRACE_SEIZE 0x4206
#define PTRACE_INTERRUPT 0x4207
#define PTRACE_LISTEN 0x4208
#define PTRACE_SEIZE_DEVEL 0x80000000
static const struct timespec ts1s = { .tv_sec = 1 };
int main(int argc, char **argv)
{
pid_t tracee, tracer;
int i;
tracee = fork();
if (!tracee)
while (1)
pause();
tracer = fork();
if (!tracer) {
siginfo_t si;
ptrace(PTRACE_SEIZE, tracee, NULL,
(void *)(unsigned long)PTRACE_SEIZE_DEVEL);
ptrace(PTRACE_INTERRUPT, tracee, NULL, NULL);
repeat:
waitid(P_PID, tracee, NULL, WSTOPPED);
ptrace(PTRACE_GETSIGINFO, tracee, NULL, &si);
if (!si.si_code) {
printf("tracer: SIG %d\n", si.si_signo);
ptrace(PTRACE_CONT, tracee, NULL,
(void *)(unsigned long)si.si_signo);
goto repeat;
}
printf("tracer: stopped=%d signo=%d\n",
si.si_signo != SIGTRAP, si.si_signo);
if (si.si_signo != SIGTRAP)
ptrace(PTRACE_LISTEN, tracee, NULL, NULL);
else
ptrace(PTRACE_CONT, tracee, NULL, NULL);
goto repeat;
}
for (i = 0; i < 3; i++) {
nanosleep(&ts1s, NULL);
printf("mother: SIGSTOP\n");
kill(tracee, SIGSTOP);
nanosleep(&ts1s, NULL);
printf("mother: SIGCONT\n");
kill(tracee, SIGCONT);
}
nanosleep(&ts1s, NULL);
kill(tracer, SIGKILL);
kill(tracee, SIGKILL);
return 0;
}
This is identical to the program to test TRAP_NOTIFY except that
tracee is PTRACE_LISTEN'd instead of PTRACE_CONT'd when group stopped.
This allows ptracer to monitor when group stop ends without running
tracee.
# ./test-listen
tracer: stopped=0 signo=5
mother: SIGSTOP
tracer: SIG 19
tracer: stopped=1 signo=19
mother: SIGCONT
tracer: stopped=0 signo=5
tracer: SIG 18
mother: SIGSTOP
tracer: SIG 19
tracer: stopped=1 signo=19
mother: SIGCONT
tracer: stopped=0 signo=5
tracer: SIG 18
mother: SIGSTOP
tracer: SIG 19
tracer: stopped=1 signo=19
mother: SIGCONT
tracer: stopped=0 signo=5
tracer: SIG 18
-v2: Moved JOBCTL_LISTENING check in wait_task_stopped() into
task_stopped_code() as suggested by Oleg.
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
2011-06-14 17:20:18 +08:00
|
|
|
#define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */
|
2011-06-02 17:13:59 +08:00
|
|
|
|
2015-05-01 12:19:57 +08:00
|
|
|
#define JOBCTL_STOP_DEQUEUED (1UL << JOBCTL_STOP_DEQUEUED_BIT)
|
|
|
|
#define JOBCTL_STOP_PENDING (1UL << JOBCTL_STOP_PENDING_BIT)
|
|
|
|
#define JOBCTL_STOP_CONSUME (1UL << JOBCTL_STOP_CONSUME_BIT)
|
|
|
|
#define JOBCTL_TRAP_STOP (1UL << JOBCTL_TRAP_STOP_BIT)
|
|
|
|
#define JOBCTL_TRAP_NOTIFY (1UL << JOBCTL_TRAP_NOTIFY_BIT)
|
|
|
|
#define JOBCTL_TRAPPING (1UL << JOBCTL_TRAPPING_BIT)
|
|
|
|
#define JOBCTL_LISTENING (1UL << JOBCTL_LISTENING_BIT)
|
2011-06-02 17:13:59 +08:00
|
|
|
|
ptrace: implement TRAP_NOTIFY and use it for group stop events
Currently there's no way for ptracer to find out whether group stop
finished other than polling with INTERRUPT - GETSIGINFO - CONT
sequence. This patch implements group stop notification for ptracer
using STOP traps.
When group stop state of a seized tracee changes, JOBCTL_TRAP_NOTIFY
is set, which schedules a STOP trap which is sticky - it isn't cleared
by other traps and at least one STOP trap will happen eventually.
STOP trap is synchronization point for event notification and the
tracer can determine the current group stop state by looking at the
signal number portion of exit code (si_status from waitid(2) or
si_code from PTRACE_GETSIGINFO).
Notifications are generated both on start and end of group stops but,
because group stop participation always happens before STOP trap, this
doesn't cause an extra trap while tracee is participating in group
stop. The symmetry will be useful later.
Note that this notification works iff tracee is not trapped.
Currently there is no way to be notified of group stop state changes
while tracee is trapped. This will be addressed by a later patch.
An example program follows.
#define PTRACE_SEIZE 0x4206
#define PTRACE_INTERRUPT 0x4207
#define PTRACE_SEIZE_DEVEL 0x80000000
static const struct timespec ts1s = { .tv_sec = 1 };
int main(int argc, char **argv)
{
pid_t tracee, tracer;
int i;
tracee = fork();
if (!tracee)
while (1)
pause();
tracer = fork();
if (!tracer) {
siginfo_t si;
ptrace(PTRACE_SEIZE, tracee, NULL,
(void *)(unsigned long)PTRACE_SEIZE_DEVEL);
ptrace(PTRACE_INTERRUPT, tracee, NULL, NULL);
repeat:
waitid(P_PID, tracee, NULL, WSTOPPED);
ptrace(PTRACE_GETSIGINFO, tracee, NULL, &si);
if (!si.si_code) {
printf("tracer: SIG %d\n", si.si_signo);
ptrace(PTRACE_CONT, tracee, NULL,
(void *)(unsigned long)si.si_signo);
goto repeat;
}
printf("tracer: stopped=%d signo=%d\n",
si.si_signo != SIGTRAP, si.si_signo);
ptrace(PTRACE_CONT, tracee, NULL, NULL);
goto repeat;
}
for (i = 0; i < 3; i++) {
nanosleep(&ts1s, NULL);
printf("mother: SIGSTOP\n");
kill(tracee, SIGSTOP);
nanosleep(&ts1s, NULL);
printf("mother: SIGCONT\n");
kill(tracee, SIGCONT);
}
nanosleep(&ts1s, NULL);
kill(tracer, SIGKILL);
kill(tracee, SIGKILL);
return 0;
}
In the above program, tracer keeps tracee running and gets
notification of each group stop state changes.
# ./test-notify
tracer: stopped=0 signo=5
mother: SIGSTOP
tracer: SIG 19
tracer: stopped=1 signo=19
mother: SIGCONT
tracer: stopped=0 signo=5
tracer: SIG 18
mother: SIGSTOP
tracer: SIG 19
tracer: stopped=1 signo=19
mother: SIGCONT
tracer: stopped=0 signo=5
tracer: SIG 18
mother: SIGSTOP
tracer: SIG 19
tracer: stopped=1 signo=19
mother: SIGCONT
tracer: stopped=0 signo=5
tracer: SIG 18
Signed-off-by: Tejun Heo <tj@kernel.org>
Cc: Oleg Nesterov <oleg@redhat.com>
2011-06-14 17:20:17 +08:00
|
|
|
#define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
|
2011-06-14 17:20:14 +08:00
|
|
|
#define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)
|
2011-06-02 17:14:00 +08:00
|
|
|
|
2011-06-02 17:14:00 +08:00
|
|
|
extern bool task_set_jobctl_pending(struct task_struct *task,
|
2015-05-01 12:19:57 +08:00
|
|
|
unsigned long mask);
|
2011-06-14 17:20:14 +08:00
|
|
|
extern void task_clear_jobctl_trapping(struct task_struct *task);
|
2011-06-02 17:14:00 +08:00
|
|
|
extern void task_clear_jobctl_pending(struct task_struct *task,
|
2015-05-01 12:19:57 +08:00
|
|
|
unsigned long mask);
|
2011-03-23 17:37:00 +08:00
|
|
|
|
rcu: Merge preemptable-RCU functionality into hierarchical RCU
Create a kernel/rcutree_plugin.h file that contains definitions
for preemptable RCU (or, under the #else branch of the #ifdef,
empty definitions for the classic non-preemptable semantics).
These definitions fit into plugins defined in kernel/rcutree.c
for this purpose.
This variant of preemptable RCU uses a new algorithm whose
read-side expense is roughly that of classic hierarchical RCU
under CONFIG_PREEMPT. This new algorithm's update-side expense
is similar to that of classic hierarchical RCU, and, in absence
of read-side preemption or blocking, is exactly that of classic
hierarchical RCU. Perhaps more important, this new algorithm
has a much simpler implementation, saving well over 1,000 lines
of code compared to mainline's implementation of preemptable
RCU, which will hopefully be retired in favor of this new
algorithm.
The simplifications are obtained by maintaining per-task
nesting state for running tasks, and using a simple
lock-protected algorithm to handle accounting when tasks block
within RCU read-side critical sections, making use of lessons
learned while creating numerous user-level RCU implementations
over the past 18 months.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josht@linux.vnet.ibm.com
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
LKML-Reference: <12509746134003-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-08-23 04:56:52 +08:00
|
|
|
static inline void rcu_copy_process(struct task_struct *p)
|
|
|
|
{
|
2014-06-28 04:42:20 +08:00
|
|
|
#ifdef CONFIG_PREEMPT_RCU
|
rcu: Merge preemptable-RCU functionality into hierarchical RCU
Create a kernel/rcutree_plugin.h file that contains definitions
for preemptable RCU (or, under the #else branch of the #ifdef,
empty definitions for the classic non-preemptable semantics).
These definitions fit into plugins defined in kernel/rcutree.c
for this purpose.
This variant of preemptable RCU uses a new algorithm whose
read-side expense is roughly that of classic hierarchical RCU
under CONFIG_PREEMPT. This new algorithm's update-side expense
is similar to that of classic hierarchical RCU, and, in absence
of read-side preemption or blocking, is exactly that of classic
hierarchical RCU. Perhaps more important, this new algorithm
has a much simpler implementation, saving well over 1,000 lines
of code compared to mainline's implementation of preemptable
RCU, which will hopefully be retired in favor of this new
algorithm.
The simplifications are obtained by maintaining per-task
nesting state for running tasks, and using a simple
lock-protected algorithm to handle accounting when tasks block
within RCU read-side critical sections, making use of lessons
learned while creating numerous user-level RCU implementations
over the past 18 months.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josht@linux.vnet.ibm.com
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
LKML-Reference: <12509746134003-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-08-23 04:56:52 +08:00
|
|
|
p->rcu_read_lock_nesting = 0;
|
2014-08-15 07:01:53 +08:00
|
|
|
p->rcu_read_unlock_special.s = 0;
|
2009-08-28 05:58:16 +08:00
|
|
|
p->rcu_blocked_node = NULL;
|
rcu: Merge preemptable-RCU functionality into hierarchical RCU
Create a kernel/rcutree_plugin.h file that contains definitions
for preemptable RCU (or, under the #else branch of the #ifdef,
empty definitions for the classic non-preemptable semantics).
These definitions fit into plugins defined in kernel/rcutree.c
for this purpose.
This variant of preemptable RCU uses a new algorithm whose
read-side expense is roughly that of classic hierarchical RCU
under CONFIG_PREEMPT. This new algorithm's update-side expense
is similar to that of classic hierarchical RCU, and, in absence
of read-side preemption or blocking, is exactly that of classic
hierarchical RCU. Perhaps more important, this new algorithm
has a much simpler implementation, saving well over 1,000 lines
of code compared to mainline's implementation of preemptable
RCU, which will hopefully be retired in favor of this new
algorithm.
The simplifications are obtained by maintaining per-task
nesting state for running tasks, and using a simple
lock-protected algorithm to handle accounting when tasks block
within RCU read-side critical sections, making use of lessons
learned while creating numerous user-level RCU implementations
over the past 18 months.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josht@linux.vnet.ibm.com
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
LKML-Reference: <12509746134003-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-08-23 04:56:52 +08:00
|
|
|
INIT_LIST_HEAD(&p->rcu_node_entry);
|
2014-06-28 04:42:20 +08:00
|
|
|
#endif /* #ifdef CONFIG_PREEMPT_RCU */
|
|
|
|
#ifdef CONFIG_TASKS_RCU
|
|
|
|
p->rcu_tasks_holdout = false;
|
|
|
|
INIT_LIST_HEAD(&p->rcu_tasks_holdout_list);
|
2014-08-05 08:43:50 +08:00
|
|
|
p->rcu_tasks_idle_cpu = -1;
|
2014-06-28 04:42:20 +08:00
|
|
|
#endif /* #ifdef CONFIG_TASKS_RCU */
|
rcu: Merge preemptable-RCU functionality into hierarchical RCU
Create a kernel/rcutree_plugin.h file that contains definitions
for preemptable RCU (or, under the #else branch of the #ifdef,
empty definitions for the classic non-preemptable semantics).
These definitions fit into plugins defined in kernel/rcutree.c
for this purpose.
This variant of preemptable RCU uses a new algorithm whose
read-side expense is roughly that of classic hierarchical RCU
under CONFIG_PREEMPT. This new algorithm's update-side expense
is similar to that of classic hierarchical RCU, and, in absence
of read-side preemption or blocking, is exactly that of classic
hierarchical RCU. Perhaps more important, this new algorithm
has a much simpler implementation, saving well over 1,000 lines
of code compared to mainline's implementation of preemptable
RCU, which will hopefully be retired in favor of this new
algorithm.
The simplifications are obtained by maintaining per-task
nesting state for running tasks, and using a simple
lock-protected algorithm to handle accounting when tasks block
within RCU read-side critical sections, making use of lessons
learned while creating numerous user-level RCU implementations
over the past 18 months.
Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Cc: laijs@cn.fujitsu.com
Cc: dipankar@in.ibm.com
Cc: akpm@linux-foundation.org
Cc: mathieu.desnoyers@polymtl.ca
Cc: josht@linux.vnet.ibm.com
Cc: dvhltc@us.ibm.com
Cc: niv@us.ibm.com
Cc: peterz@infradead.org
Cc: rostedt@goodmis.org
LKML-Reference: <12509746134003-git-send-email->
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-08-23 04:56:52 +08:00
|
|
|
}
|
|
|
|
|
2012-08-01 07:44:07 +08:00
|
|
|
static inline void tsk_restore_flags(struct task_struct *task,
|
|
|
|
unsigned long orig_flags, unsigned long flags)
|
|
|
|
{
|
|
|
|
task->flags &= ~flags;
|
|
|
|
task->flags |= orig_flags & flags;
|
|
|
|
}
|
|
|
|
|
2014-10-07 16:52:11 +08:00
|
|
|
extern int cpuset_cpumask_can_shrink(const struct cpumask *cur,
|
|
|
|
const struct cpumask *trial);
|
2014-09-19 17:22:40 +08:00
|
|
|
extern int task_can_attach(struct task_struct *p,
|
|
|
|
const struct cpumask *cs_cpus_allowed);
|
2005-04-17 06:20:36 +08:00
|
|
|
#ifdef CONFIG_SMP
|
2011-05-19 14:08:58 +08:00
|
|
|
extern void do_set_cpus_allowed(struct task_struct *p,
|
|
|
|
const struct cpumask *new_mask);
|
|
|
|
|
2008-03-27 05:23:49 +08:00
|
|
|
extern int set_cpus_allowed_ptr(struct task_struct *p,
|
2008-11-25 00:05:14 +08:00
|
|
|
const struct cpumask *new_mask);
|
2005-04-17 06:20:36 +08:00
|
|
|
#else
|
2011-05-19 14:08:58 +08:00
|
|
|
static inline void do_set_cpus_allowed(struct task_struct *p,
|
|
|
|
const struct cpumask *new_mask)
|
|
|
|
{
|
|
|
|
}
|
2008-03-27 05:23:49 +08:00
|
|
|
static inline int set_cpus_allowed_ptr(struct task_struct *p,
|
2008-11-25 00:05:14 +08:00
|
|
|
const struct cpumask *new_mask)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2008-11-25 00:05:14 +08:00
|
|
|
if (!cpumask_test_cpu(0, new_mask))
|
2005-04-17 06:20:36 +08:00
|
|
|
return -EINVAL;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#endif
|
2009-09-24 23:34:38 +08:00
|
|
|
|
2011-08-11 05:21:01 +08:00
|
|
|
#ifdef CONFIG_NO_HZ_COMMON
|
2012-06-22 21:52:09 +08:00
|
|
|
void calc_load_enter_idle(void);
|
|
|
|
void calc_load_exit_idle(void);
|
|
|
|
#else
|
|
|
|
static inline void calc_load_enter_idle(void) { }
|
|
|
|
static inline void calc_load_exit_idle(void) { }
|
2011-08-11 05:21:01 +08:00
|
|
|
#endif /* CONFIG_NO_HZ_COMMON */
|
2012-06-22 21:52:09 +08:00
|
|
|
|
2009-02-27 03:20:29 +08:00
|
|
|
/*
|
2010-05-25 16:48:51 +08:00
|
|
|
* Do not use outside of architecture code which knows its limitations.
|
|
|
|
*
|
|
|
|
* sched_clock() has no promise of monotonicity or bounded drift between
|
|
|
|
* CPUs, use (which you should not) requires disabling IRQs.
|
|
|
|
*
|
|
|
|
* Please use one of the three interfaces below.
|
2009-02-27 03:20:29 +08:00
|
|
|
*/
|
2009-12-10 09:07:03 +08:00
|
|
|
extern unsigned long long notrace sched_clock(void);
|
2010-05-25 16:48:51 +08:00
|
|
|
/*
|
2012-04-02 16:00:44 +08:00
|
|
|
* See the comment in kernel/sched/clock.c
|
2010-05-25 16:48:51 +08:00
|
|
|
*/
|
2015-02-13 07:01:24 +08:00
|
|
|
extern u64 running_clock(void);
|
2010-05-25 16:48:51 +08:00
|
|
|
extern u64 sched_clock_cpu(int cpu);
|
|
|
|
|
2007-07-20 03:28:35 +08:00
|
|
|
|
2008-08-11 14:59:03 +08:00
|
|
|
extern void sched_clock_init(void);
|
2008-05-04 00:29:28 +08:00
|
|
|
|
2008-08-11 14:59:03 +08:00
|
|
|
#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
|
2008-05-04 00:29:28 +08:00
|
|
|
static inline void sched_clock_tick(void)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void sched_clock_idle_sleep_event(void)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
|
|
|
|
{
|
|
|
|
}
|
2016-04-11 22:38:34 +08:00
|
|
|
|
|
|
|
static inline u64 cpu_clock(int cpu)
|
|
|
|
{
|
|
|
|
return sched_clock();
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline u64 local_clock(void)
|
|
|
|
{
|
|
|
|
return sched_clock();
|
|
|
|
}
|
2008-05-04 00:29:28 +08:00
|
|
|
#else
|
2010-05-25 16:48:51 +08:00
|
|
|
/*
|
|
|
|
* Architectures can set this to 1 if they have specified
|
|
|
|
* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
|
|
|
|
* but then during bootup it turns out that sched_clock()
|
|
|
|
* is reliable after all:
|
|
|
|
*/
|
2013-11-29 02:38:42 +08:00
|
|
|
extern int sched_clock_stable(void);
|
|
|
|
extern void set_sched_clock_stable(void);
|
|
|
|
extern void clear_sched_clock_stable(void);
|
2010-05-25 16:48:51 +08:00
|
|
|
|
2008-05-04 00:29:28 +08:00
|
|
|
extern void sched_clock_tick(void);
|
|
|
|
extern void sched_clock_idle_sleep_event(void);
|
|
|
|
extern void sched_clock_idle_wakeup_event(u64 delta_ns);
|
2016-04-11 22:38:34 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* As outlined in clock.c, provides a fast, high resolution, nanosecond
|
|
|
|
* time source that is monotonic per cpu argument and has bounded drift
|
|
|
|
* between cpus.
|
|
|
|
*
|
|
|
|
* ######################### BIG FAT WARNING ##########################
|
|
|
|
* # when comparing cpu_clock(i) to cpu_clock(j) for i != j, time can #
|
|
|
|
* # go backwards !! #
|
|
|
|
* ####################################################################
|
|
|
|
*/
|
|
|
|
static inline u64 cpu_clock(int cpu)
|
|
|
|
{
|
|
|
|
return sched_clock_cpu(cpu);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline u64 local_clock(void)
|
|
|
|
{
|
|
|
|
return sched_clock_cpu(raw_smp_processor_id());
|
|
|
|
}
|
2008-05-04 00:29:28 +08:00
|
|
|
#endif
|
|
|
|
|
2010-10-05 08:03:19 +08:00
|
|
|
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
|
|
|
|
/*
|
|
|
|
* An i/f to runtime opt-in for irq time accounting based off of sched_clock.
|
|
|
|
* The reason for this explicit opt-in is not to have perf penalty with
|
|
|
|
* slow sched_clocks.
|
|
|
|
*/
|
|
|
|
extern void enable_sched_clock_irqtime(void);
|
|
|
|
extern void disable_sched_clock_irqtime(void);
|
|
|
|
#else
|
|
|
|
static inline void enable_sched_clock_irqtime(void) {}
|
|
|
|
static inline void disable_sched_clock_irqtime(void) {}
|
|
|
|
#endif
|
|
|
|
|
2006-07-03 15:25:41 +08:00
|
|
|
extern unsigned long long
|
2007-07-10 00:51:58 +08:00
|
|
|
task_sched_runtime(struct task_struct *task);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* sched_exec is called by processes performing an exec */
|
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
extern void sched_exec(void);
|
|
|
|
#else
|
|
|
|
#define sched_exec() {}
|
|
|
|
#endif
|
|
|
|
|
2007-08-23 21:18:02 +08:00
|
|
|
extern void sched_clock_idle_sleep_event(void);
|
|
|
|
extern void sched_clock_idle_wakeup_event(u64 delta_ns);
|
2007-07-10 00:51:59 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#ifdef CONFIG_HOTPLUG_CPU
|
|
|
|
extern void idle_task_exit(void);
|
|
|
|
#else
|
|
|
|
static inline void idle_task_exit(void) {}
|
|
|
|
#endif
|
|
|
|
|
2011-08-11 05:21:01 +08:00
|
|
|
#if defined(CONFIG_NO_HZ_COMMON) && defined(CONFIG_SMP)
|
2011-08-11 05:21:01 +08:00
|
|
|
extern void wake_up_nohz_cpu(int cpu);
|
2008-03-22 16:20:24 +08:00
|
|
|
#else
|
2011-08-11 05:21:01 +08:00
|
|
|
static inline void wake_up_nohz_cpu(int cpu) { }
|
2008-03-22 16:20:24 +08:00
|
|
|
#endif
|
|
|
|
|
2013-04-20 21:15:35 +08:00
|
|
|
#ifdef CONFIG_NO_HZ_FULL
|
2013-05-03 09:39:05 +08:00
|
|
|
extern u64 scheduler_tick_max_deferment(void);
|
2008-03-22 16:20:24 +08:00
|
|
|
#endif
|
|
|
|
|
sched: Add 'autogroup' scheduling feature: automated per session task groups
A recurring complaint from CFS users is that parallel kbuild has
a negative impact on desktop interactivity. This patch
implements an idea from Linus, to automatically create task
groups. Currently, only per session autogroups are implemented,
but the patch leaves the way open for enhancement.
Implementation: each task's signal struct contains an inherited
pointer to a refcounted autogroup struct containing a task group
pointer, the default for all tasks pointing to the
init_task_group. When a task calls setsid(), a new task group
is created, the process is moved into the new task group, and a
reference to the preveious task group is dropped. Child
processes inherit this task group thereafter, and increase it's
refcount. When the last thread of a process exits, the
process's reference is dropped, such that when the last process
referencing an autogroup exits, the autogroup is destroyed.
At runqueue selection time, IFF a task has no cgroup assignment,
its current autogroup is used.
Autogroup bandwidth is controllable via setting it's nice level
through the proc filesystem:
cat /proc/<pid>/autogroup
Displays the task's group and the group's nice level.
echo <nice level> > /proc/<pid>/autogroup
Sets the task group's shares to the weight of nice <level> task.
Setting nice level is rate limited for !admin users due to the
abuse risk of task group locking.
The feature is enabled from boot by default if
CONFIG_SCHED_AUTOGROUP=y is selected, but can be disabled via
the boot option noautogroup, and can also be turned on/off on
the fly via:
echo [01] > /proc/sys/kernel/sched_autogroup_enabled
... which will automatically move tasks to/from the root task group.
Signed-off-by: Mike Galbraith <efault@gmx.de>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Markus Trippelsdorf <markus@trippelsdorf.de>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Paul Turner <pjt@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
[ Removed the task_group_path() debug code, and fixed !EVENTFD build failure. ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <1290281700.28711.9.camel@maggy.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-11-30 21:18:03 +08:00
|
|
|
#ifdef CONFIG_SCHED_AUTOGROUP
|
|
|
|
extern void sched_autogroup_create_attach(struct task_struct *p);
|
|
|
|
extern void sched_autogroup_detach(struct task_struct *p);
|
|
|
|
extern void sched_autogroup_fork(struct signal_struct *sig);
|
|
|
|
extern void sched_autogroup_exit(struct signal_struct *sig);
|
|
|
|
#ifdef CONFIG_PROC_FS
|
|
|
|
extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m);
|
2012-02-23 16:41:27 +08:00
|
|
|
extern int proc_sched_autogroup_set_nice(struct task_struct *p, int nice);
|
sched: Add 'autogroup' scheduling feature: automated per session task groups
A recurring complaint from CFS users is that parallel kbuild has
a negative impact on desktop interactivity. This patch
implements an idea from Linus, to automatically create task
groups. Currently, only per session autogroups are implemented,
but the patch leaves the way open for enhancement.
Implementation: each task's signal struct contains an inherited
pointer to a refcounted autogroup struct containing a task group
pointer, the default for all tasks pointing to the
init_task_group. When a task calls setsid(), a new task group
is created, the process is moved into the new task group, and a
reference to the preveious task group is dropped. Child
processes inherit this task group thereafter, and increase it's
refcount. When the last thread of a process exits, the
process's reference is dropped, such that when the last process
referencing an autogroup exits, the autogroup is destroyed.
At runqueue selection time, IFF a task has no cgroup assignment,
its current autogroup is used.
Autogroup bandwidth is controllable via setting it's nice level
through the proc filesystem:
cat /proc/<pid>/autogroup
Displays the task's group and the group's nice level.
echo <nice level> > /proc/<pid>/autogroup
Sets the task group's shares to the weight of nice <level> task.
Setting nice level is rate limited for !admin users due to the
abuse risk of task group locking.
The feature is enabled from boot by default if
CONFIG_SCHED_AUTOGROUP=y is selected, but can be disabled via
the boot option noautogroup, and can also be turned on/off on
the fly via:
echo [01] > /proc/sys/kernel/sched_autogroup_enabled
... which will automatically move tasks to/from the root task group.
Signed-off-by: Mike Galbraith <efault@gmx.de>
Acked-by: Linus Torvalds <torvalds@linux-foundation.org>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Cc: Markus Trippelsdorf <markus@trippelsdorf.de>
Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Cc: Paul Turner <pjt@google.com>
Cc: Oleg Nesterov <oleg@redhat.com>
[ Removed the task_group_path() debug code, and fixed !EVENTFD build failure. ]
Signed-off-by: Ingo Molnar <mingo@elte.hu>
LKML-Reference: <1290281700.28711.9.camel@maggy.simson.net>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-11-30 21:18:03 +08:00
|
|
|
#endif
|
|
|
|
#else
|
|
|
|
static inline void sched_autogroup_create_attach(struct task_struct *p) { }
|
|
|
|
static inline void sched_autogroup_detach(struct task_struct *p) { }
|
|
|
|
static inline void sched_autogroup_fork(struct signal_struct *sig) { }
|
|
|
|
static inline void sched_autogroup_exit(struct signal_struct *sig) { }
|
|
|
|
#endif
|
|
|
|
|
2014-05-23 18:20:42 +08:00
|
|
|
extern int yield_to(struct task_struct *p, bool preempt);
|
2006-07-03 15:25:41 +08:00
|
|
|
extern void set_user_nice(struct task_struct *p, long nice);
|
|
|
|
extern int task_prio(const struct task_struct *p);
|
2014-01-28 11:00:45 +08:00
|
|
|
/**
|
|
|
|
* task_nice - return the nice value of a given task.
|
|
|
|
* @p: the task in question.
|
|
|
|
*
|
|
|
|
* Return: The nice value [ -20 ... 0 ... 19 ].
|
|
|
|
*/
|
|
|
|
static inline int task_nice(const struct task_struct *p)
|
|
|
|
{
|
|
|
|
return PRIO_TO_NICE((p)->static_prio);
|
|
|
|
}
|
2006-07-03 15:25:41 +08:00
|
|
|
extern int can_nice(const struct task_struct *p, const int nice);
|
|
|
|
extern int task_curr(const struct task_struct *p);
|
2005-04-17 06:20:36 +08:00
|
|
|
extern int idle_cpu(int cpu);
|
2010-10-21 07:01:12 +08:00
|
|
|
extern int sched_setscheduler(struct task_struct *, int,
|
|
|
|
const struct sched_param *);
|
2008-06-23 11:55:38 +08:00
|
|
|
extern int sched_setscheduler_nocheck(struct task_struct *, int,
|
2010-10-21 07:01:12 +08:00
|
|
|
const struct sched_param *);
|
sched: Add new scheduler syscalls to support an extended scheduling parameters ABI
Add the syscalls needed for supporting scheduling algorithms
with extended scheduling parameters (e.g., SCHED_DEADLINE).
In general, it makes possible to specify a periodic/sporadic task,
that executes for a given amount of runtime at each instance, and is
scheduled according to the urgency of their own timing constraints,
i.e.:
- a (maximum/typical) instance execution time,
- a minimum interval between consecutive instances,
- a time constraint by which each instance must be completed.
Thus, both the data structure that holds the scheduling parameters of
the tasks and the system calls dealing with it must be extended.
Unfortunately, modifying the existing struct sched_param would break
the ABI and result in potentially serious compatibility issues with
legacy binaries.
For these reasons, this patch:
- defines the new struct sched_attr, containing all the fields
that are necessary for specifying a task in the computational
model described above;
- defines and implements the new scheduling related syscalls that
manipulate it, i.e., sched_setattr() and sched_getattr().
Syscalls are introduced for x86 (32 and 64 bits) and ARM only, as a
proof of concept and for developing and testing purposes. Making them
available on other architectures is straightforward.
Since no "user" for these new parameters is introduced in this patch,
the implementation of the new system calls is just identical to their
already existing counterpart. Future patches that implement scheduling
policies able to exploit the new data structure must also take care of
modifying the sched_*attr() calls accordingly with their own purposes.
Signed-off-by: Dario Faggioli <raistlin@linux.it>
[ Rewrote to use sched_attr. ]
Signed-off-by: Juri Lelli <juri.lelli@gmail.com>
[ Removed sched_setscheduler2() for now. ]
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1383831828-15501-3-git-send-email-juri.lelli@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-11-07 21:43:36 +08:00
|
|
|
extern int sched_setattr(struct task_struct *,
|
|
|
|
const struct sched_attr *);
|
2006-07-03 15:25:41 +08:00
|
|
|
extern struct task_struct *idle_task(int cpu);
|
2011-11-11 04:41:56 +08:00
|
|
|
/**
|
|
|
|
* is_idle_task - is the specified task an idle task?
|
2012-01-22 03:03:13 +08:00
|
|
|
* @p: the task in question.
|
2013-07-13 02:45:47 +08:00
|
|
|
*
|
|
|
|
* Return: 1 if @p is an idle task. 0 otherwise.
|
2011-11-11 04:41:56 +08:00
|
|
|
*/
|
2011-12-21 00:20:46 +08:00
|
|
|
static inline bool is_idle_task(const struct task_struct *p)
|
2011-11-11 04:41:56 +08:00
|
|
|
{
|
|
|
|
return p->pid == 0;
|
|
|
|
}
|
2006-07-03 15:25:41 +08:00
|
|
|
extern struct task_struct *curr_task(int cpu);
|
|
|
|
extern void set_curr_task(int cpu, struct task_struct *p);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
void yield(void);
|
|
|
|
|
|
|
|
union thread_union {
|
|
|
|
struct thread_info thread_info;
|
|
|
|
unsigned long stack[THREAD_SIZE/sizeof(long)];
|
|
|
|
};
|
|
|
|
|
|
|
|
#ifndef __HAVE_ARCH_KSTACK_END
|
|
|
|
static inline int kstack_end(void *addr)
|
|
|
|
{
|
|
|
|
/* Reliable end of stack detection:
|
|
|
|
* Some APM bios versions misalign the stack
|
|
|
|
*/
|
|
|
|
return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*)));
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
extern union thread_union init_thread_union;
|
|
|
|
extern struct task_struct init_task;
|
|
|
|
|
|
|
|
extern struct mm_struct init_mm;
|
|
|
|
|
2007-10-19 14:40:06 +08:00
|
|
|
extern struct pid_namespace init_pid_ns;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* find a task by one of its numerical ids
|
|
|
|
*
|
|
|
|
* find_task_by_pid_ns():
|
|
|
|
* finds a task by its pid in the specified namespace
|
2007-10-19 14:40:16 +08:00
|
|
|
* find_task_by_vpid():
|
|
|
|
* finds a task by its virtual pid
|
2007-10-19 14:40:06 +08:00
|
|
|
*
|
2008-07-25 16:48:36 +08:00
|
|
|
* see also find_vpid() etc in include/linux/pid.h
|
2007-10-19 14:40:06 +08:00
|
|
|
*/
|
|
|
|
|
2007-10-19 14:40:16 +08:00
|
|
|
extern struct task_struct *find_task_by_vpid(pid_t nr);
|
|
|
|
extern struct task_struct *find_task_by_pid_ns(pid_t nr,
|
|
|
|
struct pid_namespace *ns);
|
2007-10-19 14:40:06 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* per-UID process charging. */
|
2011-11-17 15:20:58 +08:00
|
|
|
extern struct user_struct * alloc_uid(kuid_t);
|
2005-04-17 06:20:36 +08:00
|
|
|
static inline struct user_struct *get_uid(struct user_struct *u)
|
|
|
|
{
|
|
|
|
atomic_inc(&u->__count);
|
|
|
|
return u;
|
|
|
|
}
|
|
|
|
extern void free_uid(struct user_struct *);
|
|
|
|
|
|
|
|
#include <asm/current.h>
|
|
|
|
|
2011-01-27 22:59:10 +08:00
|
|
|
extern void xtime_update(unsigned long ticks);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-02-14 07:03:15 +08:00
|
|
|
extern int wake_up_state(struct task_struct *tsk, unsigned int state);
|
|
|
|
extern int wake_up_process(struct task_struct *tsk);
|
2011-05-12 00:18:05 +08:00
|
|
|
extern void wake_up_new_task(struct task_struct *tsk);
|
2005-04-17 06:20:36 +08:00
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
extern void kick_process(struct task_struct *tsk);
|
|
|
|
#else
|
|
|
|
static inline void kick_process(struct task_struct *tsk) { }
|
|
|
|
#endif
|
sched/deadline: Add SCHED_DEADLINE structures & implementation
Introduces the data structures, constants and symbols needed for
SCHED_DEADLINE implementation.
Core data structure of SCHED_DEADLINE are defined, along with their
initializers. Hooks for checking if a task belong to the new policy
are also added where they are needed.
Adds a scheduling class, in sched/dl.c and a new policy called
SCHED_DEADLINE. It is an implementation of the Earliest Deadline
First (EDF) scheduling algorithm, augmented with a mechanism (called
Constant Bandwidth Server, CBS) that makes it possible to isolate
the behaviour of tasks between each other.
The typical -deadline task will be made up of a computation phase
(instance) which is activated on a periodic or sporadic fashion. The
expected (maximum) duration of such computation is called the task's
runtime; the time interval by which each instance need to be completed
is called the task's relative deadline. The task's absolute deadline
is dynamically calculated as the time instant a task (better, an
instance) activates plus the relative deadline.
The EDF algorithms selects the task with the smallest absolute
deadline as the one to be executed first, while the CBS ensures each
task to run for at most its runtime every (relative) deadline
length time interval, avoiding any interference between different
tasks (bandwidth isolation).
Thanks to this feature, also tasks that do not strictly comply with
the computational model sketched above can effectively use the new
policy.
To summarize, this patch:
- introduces the data structures, constants and symbols needed;
- implements the core logic of the scheduling algorithm in the new
scheduling class file;
- provides all the glue code between the new scheduling class and
the core scheduler and refines the interactions between sched/dl
and the other existing scheduling classes.
Signed-off-by: Dario Faggioli <raistlin@linux.it>
Signed-off-by: Michael Trimarchi <michael@amarulasolutions.com>
Signed-off-by: Fabio Checconi <fchecconi@gmail.com>
Signed-off-by: Juri Lelli <juri.lelli@gmail.com>
Signed-off-by: Peter Zijlstra <peterz@infradead.org>
Link: http://lkml.kernel.org/r/1383831828-15501-4-git-send-email-juri.lelli@gmail.com
Signed-off-by: Ingo Molnar <mingo@kernel.org>
2013-11-28 18:14:43 +08:00
|
|
|
extern int sched_fork(unsigned long clone_flags, struct task_struct *p);
|
2007-07-10 00:52:00 +08:00
|
|
|
extern void sched_dead(struct task_struct *p);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
extern void proc_caches_init(void);
|
|
|
|
extern void flush_signals(struct task_struct *);
|
2007-05-09 17:34:37 +08:00
|
|
|
extern void ignore_signals(struct task_struct *);
|
2005-04-17 06:20:36 +08:00
|
|
|
extern void flush_signal_handlers(struct task_struct *, int force_default);
|
|
|
|
extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info);
|
|
|
|
|
2015-11-07 08:32:22 +08:00
|
|
|
static inline int kernel_dequeue_signal(siginfo_t *info)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2015-11-07 08:32:22 +08:00
|
|
|
struct task_struct *tsk = current;
|
|
|
|
siginfo_t __info;
|
2005-04-17 06:20:36 +08:00
|
|
|
int ret;
|
|
|
|
|
2015-11-07 08:32:22 +08:00
|
|
|
spin_lock_irq(&tsk->sighand->siglock);
|
|
|
|
ret = dequeue_signal(tsk, &tsk->blocked, info ?: &__info);
|
|
|
|
spin_unlock_irq(&tsk->sighand->siglock);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
return ret;
|
2011-06-23 05:08:18 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2015-11-07 08:32:25 +08:00
|
|
|
static inline void kernel_signal_stop(void)
|
|
|
|
{
|
|
|
|
spin_lock_irq(¤t->sighand->siglock);
|
|
|
|
if (current->jobctl & JOBCTL_STOP_DEQUEUED)
|
|
|
|
__set_current_state(TASK_STOPPED);
|
|
|
|
spin_unlock_irq(¤t->sighand->siglock);
|
|
|
|
|
|
|
|
schedule();
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
extern void release_task(struct task_struct * p);
|
|
|
|
extern int send_sig_info(int, struct siginfo *, struct task_struct *);
|
|
|
|
extern int force_sigsegv(int, struct task_struct *);
|
|
|
|
extern int force_sig_info(int, struct siginfo *, struct task_struct *);
|
2006-10-02 17:17:10 +08:00
|
|
|
extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp);
|
|
|
|
extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid);
|
2011-09-26 23:45:18 +08:00
|
|
|
extern int kill_pid_info_as_cred(int, struct siginfo *, struct pid *,
|
|
|
|
const struct cred *, u32);
|
2006-10-02 17:17:10 +08:00
|
|
|
extern int kill_pgrp(struct pid *pid, int sig, int priv);
|
|
|
|
extern int kill_pid(struct pid *pid, int sig, int priv);
|
2007-02-09 23:11:47 +08:00
|
|
|
extern int kill_proc_info(int, struct siginfo *, pid_t);
|
2011-06-23 05:09:09 +08:00
|
|
|
extern __must_check bool do_notify_parent(struct task_struct *, int);
|
ptrace: __ptrace_detach: do __wake_up_parent() if we reap the tracee
The bug is old, it wasn't cause by recent changes.
Test case:
static void *tfunc(void *arg)
{
int pid = (long)arg;
assert(ptrace(PTRACE_ATTACH, pid, NULL, NULL) == 0);
kill(pid, SIGKILL);
sleep(1);
return NULL;
}
int main(void)
{
pthread_t th;
long pid = fork();
if (!pid)
pause();
signal(SIGCHLD, SIG_IGN);
assert(pthread_create(&th, NULL, tfunc, (void*)pid) == 0);
int r = waitpid(-1, NULL, __WNOTHREAD);
printf("waitpid: %d %m\n", r);
return 0;
}
Before the patch this program hangs, after this patch waitpid() correctly
fails with errno == -ECHILD.
The problem is, __ptrace_detach() reaps the EXIT_ZOMBIE tracee if its
->real_parent is our sub-thread and we ignore SIGCHLD. But in this case
we should wake up other threads which can sleep in do_wait().
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Vitaly Mayatskikh <vmayatsk@redhat.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-24 06:56:44 +08:00
|
|
|
extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
|
2005-04-17 06:20:36 +08:00
|
|
|
extern void force_sig(int, struct task_struct *);
|
|
|
|
extern int send_sig(int, struct task_struct *, int);
|
2010-05-27 05:43:11 +08:00
|
|
|
extern int zap_other_threads(struct task_struct *p);
|
2005-04-17 06:20:36 +08:00
|
|
|
extern struct sigqueue *sigqueue_alloc(void);
|
|
|
|
extern void sigqueue_free(struct sigqueue *);
|
2008-04-30 15:52:57 +08:00
|
|
|
extern int send_sigqueue(struct sigqueue *, struct task_struct *, int group);
|
2006-02-10 03:41:50 +08:00
|
|
|
extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2012-05-22 11:33:55 +08:00
|
|
|
static inline void restore_saved_sigmask(void)
|
|
|
|
{
|
|
|
|
if (test_and_clear_restore_sigmask())
|
2012-04-28 01:58:59 +08:00
|
|
|
__set_current_blocked(¤t->saved_sigmask);
|
2012-05-22 11:33:55 +08:00
|
|
|
}
|
|
|
|
|
2012-05-02 21:59:21 +08:00
|
|
|
static inline sigset_t *sigmask_to_save(void)
|
|
|
|
{
|
|
|
|
sigset_t *res = ¤t->blocked;
|
|
|
|
if (unlikely(test_restore_sigmask()))
|
|
|
|
res = ¤t->saved_sigmask;
|
|
|
|
return res;
|
|
|
|
}
|
|
|
|
|
2006-10-02 17:19:00 +08:00
|
|
|
static inline int kill_cad_pid(int sig, int priv)
|
|
|
|
{
|
|
|
|
return kill_pid(cad_pid, sig, priv);
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* These can be the second arg to send_sig_info/send_group_sig_info. */
|
|
|
|
#define SEND_SIG_NOINFO ((struct siginfo *) 0)
|
|
|
|
#define SEND_SIG_PRIV ((struct siginfo *) 1)
|
|
|
|
#define SEND_SIG_FORCED ((struct siginfo *) 2)
|
|
|
|
|
2009-10-25 22:37:58 +08:00
|
|
|
/*
|
|
|
|
* True if we are on the alternate signal stack.
|
|
|
|
*/
|
2005-04-17 06:20:36 +08:00
|
|
|
static inline int on_sig_stack(unsigned long sp)
|
|
|
|
{
|
2009-10-25 22:37:58 +08:00
|
|
|
#ifdef CONFIG_STACK_GROWSUP
|
|
|
|
return sp >= current->sas_ss_sp &&
|
|
|
|
sp - current->sas_ss_sp < current->sas_ss_size;
|
|
|
|
#else
|
|
|
|
return sp > current->sas_ss_sp &&
|
|
|
|
sp - current->sas_ss_sp <= current->sas_ss_size;
|
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline int sas_ss_flags(unsigned long sp)
|
|
|
|
{
|
2014-03-05 22:15:22 +08:00
|
|
|
if (!current->sas_ss_size)
|
|
|
|
return SS_DISABLE;
|
|
|
|
|
|
|
|
return on_sig_stack(sp) ? SS_ONSTACK : 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2012-11-07 02:28:21 +08:00
|
|
|
static inline unsigned long sigsp(unsigned long sp, struct ksignal *ksig)
|
|
|
|
{
|
|
|
|
if (unlikely((ksig->ka.sa.sa_flags & SA_ONSTACK)) && ! sas_ss_flags(sp))
|
|
|
|
#ifdef CONFIG_STACK_GROWSUP
|
|
|
|
return current->sas_ss_sp;
|
|
|
|
#else
|
|
|
|
return current->sas_ss_sp + current->sas_ss_size;
|
|
|
|
#endif
|
|
|
|
return sp;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Routines for handling mm_structs
|
|
|
|
*/
|
|
|
|
extern struct mm_struct * mm_alloc(void);
|
|
|
|
|
|
|
|
/* mmdrop drops the mm and the page tables */
|
2008-02-14 07:03:15 +08:00
|
|
|
extern void __mmdrop(struct mm_struct *);
|
2005-04-17 06:20:36 +08:00
|
|
|
static inline void mmdrop(struct mm_struct * mm)
|
|
|
|
{
|
2007-07-10 00:52:01 +08:00
|
|
|
if (unlikely(atomic_dec_and_test(&mm->mm_count)))
|
2005-04-17 06:20:36 +08:00
|
|
|
__mmdrop(mm);
|
|
|
|
}
|
|
|
|
|
|
|
|
/* mmput gets rid of the mappings and all user-space */
|
|
|
|
extern void mmput(struct mm_struct *);
|
|
|
|
/* Grab a reference to a task's mm, if it is not already going away */
|
|
|
|
extern struct mm_struct *get_task_mm(struct task_struct *task);
|
2012-02-02 09:04:09 +08:00
|
|
|
/*
|
|
|
|
* Grab a reference to a task's mm, if it is not already going away
|
|
|
|
* and ptrace_may_access with the mode parameter passed to it
|
|
|
|
* succeeds.
|
|
|
|
*/
|
|
|
|
extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode);
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Remove the current tasks stale references to the old mm_struct */
|
|
|
|
extern void mm_release(struct task_struct *, struct mm_struct *);
|
|
|
|
|
clone: support passing tls argument via C rather than pt_regs magic
clone has some of the quirkiest syscall handling in the kernel, with a
pile of special cases, historical curiosities, and architecture-specific
calling conventions. In particular, clone with CLONE_SETTLS accepts a
parameter "tls" that the C entry point completely ignores and some
assembly entry points overwrite; instead, the low-level arch-specific
code pulls the tls parameter out of the arch-specific register captured
as part of pt_regs on entry to the kernel. That's a massive hack, and
it makes the arch-specific code only work when called via the specific
existing syscall entry points; because of this hack, any new clone-like
system call would have to accept an identical tls argument in exactly
the same arch-specific position, rather than providing a unified system
call entry point across architectures.
The first patch allows architectures to handle the tls argument via
normal C parameter passing, if they opt in by selecting
HAVE_COPY_THREAD_TLS. The second patch makes 32-bit and 64-bit x86 opt
into this.
These two patches came out of the clone4 series, which isn't ready for
this merge window, but these first two cleanup patches were entirely
uncontroversial and have acks. I'd like to go ahead and submit these
two so that other architectures can begin building on top of this and
opting into HAVE_COPY_THREAD_TLS. However, I'm also happy to wait and
send these through the next merge window (along with v3 of clone4) if
anyone would prefer that.
This patch (of 2):
clone with CLONE_SETTLS accepts an argument to set the thread-local
storage area for the new thread. sys_clone declares an int argument
tls_val in the appropriate point in the argument list (based on the
various CLONE_BACKWARDS variants), but doesn't actually use or pass along
that argument. Instead, sys_clone calls do_fork, which calls
copy_process, which calls the arch-specific copy_thread, and copy_thread
pulls the corresponding syscall argument out of the pt_regs captured at
kernel entry (knowing what argument of clone that architecture passes tls
in).
Apart from being awful and inscrutable, that also only works because only
one code path into copy_thread can pass the CLONE_SETTLS flag, and that
code path comes from sys_clone with its architecture-specific
argument-passing order. This prevents introducing a new version of the
clone system call without propagating the same architecture-specific
position of the tls argument.
However, there's no reason to pull the argument out of pt_regs when
sys_clone could just pass it down via C function call arguments.
Introduce a new CONFIG_HAVE_COPY_THREAD_TLS for architectures to opt into,
and a new copy_thread_tls that accepts the tls parameter as an additional
unsigned long (syscall-argument-sized) argument. Change sys_clone's tls
argument to an unsigned long (which does not change the ABI), and pass
that down to copy_thread_tls.
Architectures that don't opt into copy_thread_tls will continue to ignore
the C argument to sys_clone in favor of the pt_regs captured at kernel
entry, and thus will be unable to introduce new versions of the clone
syscall.
Patch co-authored by Josh Triplett and Thiago Macieira.
Signed-off-by: Josh Triplett <josh@joshtriplett.org>
Acked-by: Andy Lutomirski <luto@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thiago Macieira <thiago.macieira@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-06-26 06:01:19 +08:00
|
|
|
#ifdef CONFIG_HAVE_COPY_THREAD_TLS
|
|
|
|
extern int copy_thread_tls(unsigned long, unsigned long, unsigned long,
|
|
|
|
struct task_struct *, unsigned long);
|
|
|
|
#else
|
2009-04-03 07:56:59 +08:00
|
|
|
extern int copy_thread(unsigned long, unsigned long, unsigned long,
|
2012-10-23 10:51:14 +08:00
|
|
|
struct task_struct *);
|
clone: support passing tls argument via C rather than pt_regs magic
clone has some of the quirkiest syscall handling in the kernel, with a
pile of special cases, historical curiosities, and architecture-specific
calling conventions. In particular, clone with CLONE_SETTLS accepts a
parameter "tls" that the C entry point completely ignores and some
assembly entry points overwrite; instead, the low-level arch-specific
code pulls the tls parameter out of the arch-specific register captured
as part of pt_regs on entry to the kernel. That's a massive hack, and
it makes the arch-specific code only work when called via the specific
existing syscall entry points; because of this hack, any new clone-like
system call would have to accept an identical tls argument in exactly
the same arch-specific position, rather than providing a unified system
call entry point across architectures.
The first patch allows architectures to handle the tls argument via
normal C parameter passing, if they opt in by selecting
HAVE_COPY_THREAD_TLS. The second patch makes 32-bit and 64-bit x86 opt
into this.
These two patches came out of the clone4 series, which isn't ready for
this merge window, but these first two cleanup patches were entirely
uncontroversial and have acks. I'd like to go ahead and submit these
two so that other architectures can begin building on top of this and
opting into HAVE_COPY_THREAD_TLS. However, I'm also happy to wait and
send these through the next merge window (along with v3 of clone4) if
anyone would prefer that.
This patch (of 2):
clone with CLONE_SETTLS accepts an argument to set the thread-local
storage area for the new thread. sys_clone declares an int argument
tls_val in the appropriate point in the argument list (based on the
various CLONE_BACKWARDS variants), but doesn't actually use or pass along
that argument. Instead, sys_clone calls do_fork, which calls
copy_process, which calls the arch-specific copy_thread, and copy_thread
pulls the corresponding syscall argument out of the pt_regs captured at
kernel entry (knowing what argument of clone that architecture passes tls
in).
Apart from being awful and inscrutable, that also only works because only
one code path into copy_thread can pass the CLONE_SETTLS flag, and that
code path comes from sys_clone with its architecture-specific
argument-passing order. This prevents introducing a new version of the
clone system call without propagating the same architecture-specific
position of the tls argument.
However, there's no reason to pull the argument out of pt_regs when
sys_clone could just pass it down via C function call arguments.
Introduce a new CONFIG_HAVE_COPY_THREAD_TLS for architectures to opt into,
and a new copy_thread_tls that accepts the tls parameter as an additional
unsigned long (syscall-argument-sized) argument. Change sys_clone's tls
argument to an unsigned long (which does not change the ABI), and pass
that down to copy_thread_tls.
Architectures that don't opt into copy_thread_tls will continue to ignore
the C argument to sys_clone in favor of the pt_regs captured at kernel
entry, and thus will be unable to introduce new versions of the clone
syscall.
Patch co-authored by Josh Triplett and Thiago Macieira.
Signed-off-by: Josh Triplett <josh@joshtriplett.org>
Acked-by: Andy Lutomirski <luto@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thiago Macieira <thiago.macieira@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-06-26 06:01:19 +08:00
|
|
|
|
|
|
|
/* Architectures that haven't opted into copy_thread_tls get the tls argument
|
|
|
|
* via pt_regs, so ignore the tls argument passed via C. */
|
|
|
|
static inline int copy_thread_tls(
|
|
|
|
unsigned long clone_flags, unsigned long sp, unsigned long arg,
|
|
|
|
struct task_struct *p, unsigned long tls)
|
|
|
|
{
|
|
|
|
return copy_thread(clone_flags, sp, arg, p);
|
|
|
|
}
|
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
extern void flush_thread(void);
|
|
|
|
extern void exit_thread(void);
|
|
|
|
|
|
|
|
extern void exit_files(struct task_struct *);
|
2006-03-29 08:11:27 +08:00
|
|
|
extern void __cleanup_sighand(struct sighand_struct *);
|
2008-05-27 00:55:42 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
extern void exit_itimers(struct signal_struct *);
|
2008-05-27 00:55:42 +08:00
|
|
|
extern void flush_itimer_signals(void);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2012-01-13 09:17:17 +08:00
|
|
|
extern void do_group_exit(int);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2014-02-06 04:54:53 +08:00
|
|
|
extern int do_execve(struct filename *,
|
2010-08-18 06:52:56 +08:00
|
|
|
const char __user * const __user *,
|
2012-10-21 09:49:33 +08:00
|
|
|
const char __user * const __user *);
|
syscalls: implement execveat() system call
This patchset adds execveat(2) for x86, and is derived from Meredydd
Luff's patch from Sept 2012 (https://lkml.org/lkml/2012/9/11/528).
The primary aim of adding an execveat syscall is to allow an
implementation of fexecve(3) that does not rely on the /proc filesystem,
at least for executables (rather than scripts). The current glibc version
of fexecve(3) is implemented via /proc, which causes problems in sandboxed
or otherwise restricted environments.
Given the desire for a /proc-free fexecve() implementation, HPA suggested
(https://lkml.org/lkml/2006/7/11/556) that an execveat(2) syscall would be
an appropriate generalization.
Also, having a new syscall means that it can take a flags argument without
back-compatibility concerns. The current implementation just defines the
AT_EMPTY_PATH and AT_SYMLINK_NOFOLLOW flags, but other flags could be
added in future -- for example, flags for new namespaces (as suggested at
https://lkml.org/lkml/2006/7/11/474).
Related history:
- https://lkml.org/lkml/2006/12/27/123 is an example of someone
realizing that fexecve() is likely to fail in a chroot environment.
- http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=514043 covered
documenting the /proc requirement of fexecve(3) in its manpage, to
"prevent other people from wasting their time".
- https://bugzilla.redhat.com/show_bug.cgi?id=241609 described a
problem where a process that did setuid() could not fexecve()
because it no longer had access to /proc/self/fd; this has since
been fixed.
This patch (of 4):
Add a new execveat(2) system call. execveat() is to execve() as openat()
is to open(): it takes a file descriptor that refers to a directory, and
resolves the filename relative to that.
In addition, if the filename is empty and AT_EMPTY_PATH is specified,
execveat() executes the file to which the file descriptor refers. This
replicates the functionality of fexecve(), which is a system call in other
UNIXen, but in Linux glibc it depends on opening "/proc/self/fd/<fd>" (and
so relies on /proc being mounted).
The filename fed to the executed program as argv[0] (or the name of the
script fed to a script interpreter) will be of the form "/dev/fd/<fd>"
(for an empty filename) or "/dev/fd/<fd>/<filename>", effectively
reflecting how the executable was found. This does however mean that
execution of a script in a /proc-less environment won't work; also, script
execution via an O_CLOEXEC file descriptor fails (as the file will not be
accessible after exec).
Based on patches by Meredydd Luff.
Signed-off-by: David Drysdale <drysdale@google.com>
Cc: Meredydd Luff <meredydd@senatehouse.org>
Cc: Shuah Khan <shuah.kh@samsung.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Rich Felker <dalias@aerifal.cx>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-13 08:57:29 +08:00
|
|
|
extern int do_execveat(int, struct filename *,
|
|
|
|
const char __user * const __user *,
|
|
|
|
const char __user * const __user *,
|
|
|
|
int);
|
clone: support passing tls argument via C rather than pt_regs magic
clone has some of the quirkiest syscall handling in the kernel, with a
pile of special cases, historical curiosities, and architecture-specific
calling conventions. In particular, clone with CLONE_SETTLS accepts a
parameter "tls" that the C entry point completely ignores and some
assembly entry points overwrite; instead, the low-level arch-specific
code pulls the tls parameter out of the arch-specific register captured
as part of pt_regs on entry to the kernel. That's a massive hack, and
it makes the arch-specific code only work when called via the specific
existing syscall entry points; because of this hack, any new clone-like
system call would have to accept an identical tls argument in exactly
the same arch-specific position, rather than providing a unified system
call entry point across architectures.
The first patch allows architectures to handle the tls argument via
normal C parameter passing, if they opt in by selecting
HAVE_COPY_THREAD_TLS. The second patch makes 32-bit and 64-bit x86 opt
into this.
These two patches came out of the clone4 series, which isn't ready for
this merge window, but these first two cleanup patches were entirely
uncontroversial and have acks. I'd like to go ahead and submit these
two so that other architectures can begin building on top of this and
opting into HAVE_COPY_THREAD_TLS. However, I'm also happy to wait and
send these through the next merge window (along with v3 of clone4) if
anyone would prefer that.
This patch (of 2):
clone with CLONE_SETTLS accepts an argument to set the thread-local
storage area for the new thread. sys_clone declares an int argument
tls_val in the appropriate point in the argument list (based on the
various CLONE_BACKWARDS variants), but doesn't actually use or pass along
that argument. Instead, sys_clone calls do_fork, which calls
copy_process, which calls the arch-specific copy_thread, and copy_thread
pulls the corresponding syscall argument out of the pt_regs captured at
kernel entry (knowing what argument of clone that architecture passes tls
in).
Apart from being awful and inscrutable, that also only works because only
one code path into copy_thread can pass the CLONE_SETTLS flag, and that
code path comes from sys_clone with its architecture-specific
argument-passing order. This prevents introducing a new version of the
clone system call without propagating the same architecture-specific
position of the tls argument.
However, there's no reason to pull the argument out of pt_regs when
sys_clone could just pass it down via C function call arguments.
Introduce a new CONFIG_HAVE_COPY_THREAD_TLS for architectures to opt into,
and a new copy_thread_tls that accepts the tls parameter as an additional
unsigned long (syscall-argument-sized) argument. Change sys_clone's tls
argument to an unsigned long (which does not change the ABI), and pass
that down to copy_thread_tls.
Architectures that don't opt into copy_thread_tls will continue to ignore
the C argument to sys_clone in favor of the pt_regs captured at kernel
entry, and thus will be unable to introduce new versions of the clone
syscall.
Patch co-authored by Josh Triplett and Thiago Macieira.
Signed-off-by: Josh Triplett <josh@joshtriplett.org>
Acked-by: Andy Lutomirski <luto@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thiago Macieira <thiago.macieira@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-06-26 06:01:19 +08:00
|
|
|
extern long _do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *, unsigned long);
|
2012-10-23 11:10:08 +08:00
|
|
|
extern long do_fork(unsigned long, unsigned long, unsigned long, int __user *, int __user *);
|
2006-07-03 15:25:41 +08:00
|
|
|
struct task_struct *fork_idle(int);
|
2012-09-22 07:55:31 +08:00
|
|
|
extern pid_t kernel_thread(int (*fn)(void *), void *arg, unsigned long flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2014-05-28 16:45:04 +08:00
|
|
|
extern void __set_task_comm(struct task_struct *tsk, const char *from, bool exec);
|
|
|
|
static inline void set_task_comm(struct task_struct *tsk, const char *from)
|
|
|
|
{
|
|
|
|
__set_task_comm(tsk, from, false);
|
|
|
|
}
|
2008-02-05 14:27:21 +08:00
|
|
|
extern char *get_task_comm(char *to, struct task_struct *tsk);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#ifdef CONFIG_SMP
|
2011-04-05 23:23:58 +08:00
|
|
|
void scheduler_ipi(void);
|
2008-07-26 10:45:58 +08:00
|
|
|
extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
|
2005-04-17 06:20:36 +08:00
|
|
|
#else
|
2011-04-05 23:23:39 +08:00
|
|
|
static inline void scheduler_ipi(void) { }
|
2008-07-26 10:45:58 +08:00
|
|
|
static inline unsigned long wait_task_inactive(struct task_struct *p,
|
|
|
|
long match_state)
|
|
|
|
{
|
|
|
|
return 1;
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif
|
|
|
|
|
2015-05-07 00:04:24 +08:00
|
|
|
#define tasklist_empty() \
|
|
|
|
list_empty(&init_task.tasks)
|
|
|
|
|
2009-04-15 02:17:16 +08:00
|
|
|
#define next_task(p) \
|
|
|
|
list_entry_rcu((p)->tasks.next, struct task_struct, tasks)
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#define for_each_process(p) \
|
|
|
|
for (p = &init_task ; (p = next_task(p)) != &init_task ; )
|
|
|
|
|
2009-07-10 09:48:23 +08:00
|
|
|
extern bool current_is_single_threaded(void);
|
CRED: Inaugurate COW credentials
Inaugurate copy-on-write credentials management. This uses RCU to manage the
credentials pointer in the task_struct with respect to accesses by other tasks.
A process may only modify its own credentials, and so does not need locking to
access or modify its own credentials.
A mutex (cred_replace_mutex) is added to the task_struct to control the effect
of PTRACE_ATTACHED on credential calculations, particularly with respect to
execve().
With this patch, the contents of an active credentials struct may not be
changed directly; rather a new set of credentials must be prepared, modified
and committed using something like the following sequence of events:
struct cred *new = prepare_creds();
int ret = blah(new);
if (ret < 0) {
abort_creds(new);
return ret;
}
return commit_creds(new);
There are some exceptions to this rule: the keyrings pointed to by the active
credentials may be instantiated - keyrings violate the COW rule as managing
COW keyrings is tricky, given that it is possible for a task to directly alter
the keys in a keyring in use by another task.
To help enforce this, various pointers to sets of credentials, such as those in
the task_struct, are declared const. The purpose of this is compile-time
discouragement of altering credentials through those pointers. Once a set of
credentials has been made public through one of these pointers, it may not be
modified, except under special circumstances:
(1) Its reference count may incremented and decremented.
(2) The keyrings to which it points may be modified, but not replaced.
The only safe way to modify anything else is to create a replacement and commit
using the functions described in Documentation/credentials.txt (which will be
added by a later patch).
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
This now prepares and commits credentials in various places in the
security code rather than altering the current creds directly.
(2) Temporary credential overrides.
do_coredump() and sys_faccessat() now prepare their own credentials and
temporarily override the ones currently on the acting thread, whilst
preventing interference from other threads by holding cred_replace_mutex
on the thread being dumped.
This will be replaced in a future patch by something that hands down the
credentials directly to the functions being called, rather than altering
the task's objective credentials.
(3) LSM interface.
A number of functions have been changed, added or removed:
(*) security_capset_check(), ->capset_check()
(*) security_capset_set(), ->capset_set()
Removed in favour of security_capset().
(*) security_capset(), ->capset()
New. This is passed a pointer to the new creds, a pointer to the old
creds and the proposed capability sets. It should fill in the new
creds or return an error. All pointers, barring the pointer to the
new creds, are now const.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
Changed; now returns a value, which will cause the process to be
killed if it's an error.
(*) security_task_alloc(), ->task_alloc_security()
Removed in favour of security_prepare_creds().
(*) security_cred_free(), ->cred_free()
New. Free security data attached to cred->security.
(*) security_prepare_creds(), ->cred_prepare()
New. Duplicate any security data attached to cred->security.
(*) security_commit_creds(), ->cred_commit()
New. Apply any security effects for the upcoming installation of new
security by commit_creds().
(*) security_task_post_setuid(), ->task_post_setuid()
Removed in favour of security_task_fix_setuid().
(*) security_task_fix_setuid(), ->task_fix_setuid()
Fix up the proposed new credentials for setuid(). This is used by
cap_set_fix_setuid() to implicitly adjust capabilities in line with
setuid() changes. Changes are made to the new credentials, rather
than the task itself as in security_task_post_setuid().
(*) security_task_reparent_to_init(), ->task_reparent_to_init()
Removed. Instead the task being reparented to init is referred
directly to init's credentials.
NOTE! This results in the loss of some state: SELinux's osid no
longer records the sid of the thread that forked it.
(*) security_key_alloc(), ->key_alloc()
(*) security_key_permission(), ->key_permission()
Changed. These now take cred pointers rather than task pointers to
refer to the security context.
(4) sys_capset().
This has been simplified and uses less locking. The LSM functions it
calls have been merged.
(5) reparent_to_kthreadd().
This gives the current thread the same credentials as init by simply using
commit_thread() to point that way.
(6) __sigqueue_alloc() and switch_uid()
__sigqueue_alloc() can't stop the target task from changing its creds
beneath it, so this function gets a reference to the currently applicable
user_struct which it then passes into the sigqueue struct it returns if
successful.
switch_uid() is now called from commit_creds(), and possibly should be
folded into that. commit_creds() should take care of protecting
__sigqueue_alloc().
(7) [sg]et[ug]id() and co and [sg]et_current_groups.
The set functions now all use prepare_creds(), commit_creds() and
abort_creds() to build and check a new set of credentials before applying
it.
security_task_set[ug]id() is called inside the prepared section. This
guarantees that nothing else will affect the creds until we've finished.
The calling of set_dumpable() has been moved into commit_creds().
Much of the functionality of set_user() has been moved into
commit_creds().
The get functions all simply access the data directly.
(8) security_task_prctl() and cap_task_prctl().
security_task_prctl() has been modified to return -ENOSYS if it doesn't
want to handle a function, or otherwise return the return value directly
rather than through an argument.
Additionally, cap_task_prctl() now prepares a new set of credentials, even
if it doesn't end up using it.
(9) Keyrings.
A number of changes have been made to the keyrings code:
(a) switch_uid_keyring(), copy_keys(), exit_keys() and suid_keys() have
all been dropped and built in to the credentials functions directly.
They may want separating out again later.
(b) key_alloc() and search_process_keyrings() now take a cred pointer
rather than a task pointer to specify the security context.
(c) copy_creds() gives a new thread within the same thread group a new
thread keyring if its parent had one, otherwise it discards the thread
keyring.
(d) The authorisation key now points directly to the credentials to extend
the search into rather pointing to the task that carries them.
(e) Installing thread, process or session keyrings causes a new set of
credentials to be created, even though it's not strictly necessary for
process or session keyrings (they're shared).
(10) Usermode helper.
The usermode helper code now carries a cred struct pointer in its
subprocess_info struct instead of a new session keyring pointer. This set
of credentials is derived from init_cred and installed on the new process
after it has been cloned.
call_usermodehelper_setup() allocates the new credentials and
call_usermodehelper_freeinfo() discards them if they haven't been used. A
special cred function (prepare_usermodeinfo_creds()) is provided
specifically for call_usermodehelper_setup() to call.
call_usermodehelper_setkeys() adjusts the credentials to sport the
supplied keyring as the new session keyring.
(11) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) selinux_setprocattr() no longer does its check for whether the
current ptracer can access processes with the new SID inside the lock
that covers getting the ptracer's SID. Whilst this lock ensures that
the check is done with the ptracer pinned, the result is only valid
until the lock is released, so there's no point doing it inside the
lock.
(12) is_single_threaded().
This function has been extracted from selinux_setprocattr() and put into
a file of its own in the lib/ directory as join_session_keyring() now
wants to use it too.
The code in SELinux just checked to see whether a task shared mm_structs
with other tasks (CLONE_VM), but that isn't good enough. We really want
to know if they're part of the same thread group (CLONE_THREAD).
(13) nfsd.
The NFS server daemon now has to use the COW credentials to set the
credentials it is going to use. It really needs to pass the credentials
down to the functions it calls, but it can't do that until other patches
in this series have been applied.
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:23 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Careful: do_each_thread/while_each_thread is a double loop so
|
|
|
|
* 'break' will not work as expected - use goto instead.
|
|
|
|
*/
|
|
|
|
#define do_each_thread(g, t) \
|
|
|
|
for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do
|
|
|
|
|
|
|
|
#define while_each_thread(g, t) \
|
|
|
|
while ((t = next_thread(t)) != g)
|
|
|
|
|
introduce for_each_thread() to replace the buggy while_each_thread()
while_each_thread() and next_thread() should die, almost every lockless
usage is wrong.
1. Unless g == current, the lockless while_each_thread() is not safe.
while_each_thread(g, t) can loop forever if g exits, next_thread()
can't reach the unhashed thread in this case. Note that this can
happen even if g is the group leader, it can exec.
2. Even if while_each_thread() itself was correct, people often use
it wrongly.
It was never safe to just take rcu_read_lock() and loop unless
you verify that pid_alive(g) == T, even the first next_thread()
can point to the already freed/reused memory.
This patch adds signal_struct->thread_head and task->thread_node to
create the normal rcu-safe list with the stable head. The new
for_each_thread(g, t) helper is always safe under rcu_read_lock() as
long as this task_struct can't go away.
Note: of course it is ugly to have both task_struct->thread_node and the
old task_struct->thread_group, we will kill it later, after we change
the users of while_each_thread() to use for_each_thread().
Perhaps we can kill it even before we convert all users, we can
reimplement next_thread(t) using the new thread_head/thread_node. But
we can't do this right now because this will lead to subtle behavioural
changes. For example, do/while_each_thread() always sees at least one
task, while for_each_thread() can do nothing if the whole thread group
has died. Or thread_group_empty(), currently its semantics is not clear
unless thread_group_leader(p) and we need to audit the callers before we
can change it.
So this patch adds the new interface which has to coexist with the old
one for some time, hopefully the next changes will be more or less
straightforward and the old one will go away soon.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Reviewed-by: Sergey Dyasly <dserrg@gmail.com>
Tested-by: Sergey Dyasly <dserrg@gmail.com>
Reviewed-by: Sameer Nanda <snanda@chromium.org>
Acked-by: David Rientjes <rientjes@google.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Mandeep Singh Baines <msb@chromium.org>
Cc: "Ma, Xindong" <xindong.ma@intel.com>
Cc: Michal Hocko <mhocko@suse.cz>
Cc: "Tu, Xiaobing" <xiaobing.tu@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-01-22 07:49:56 +08:00
|
|
|
#define __for_each_thread(signal, t) \
|
|
|
|
list_for_each_entry_rcu(t, &(signal)->thread_head, thread_node)
|
|
|
|
|
|
|
|
#define for_each_thread(p, t) \
|
|
|
|
__for_each_thread((p)->signal, t)
|
|
|
|
|
|
|
|
/* Careful: this is a double loop, 'break' won't work as expected. */
|
|
|
|
#define for_each_process_thread(p, t) \
|
|
|
|
for_each_process(p) for_each_thread(p, t)
|
|
|
|
|
2010-05-27 05:43:22 +08:00
|
|
|
static inline int get_nr_threads(struct task_struct *tsk)
|
|
|
|
{
|
2010-05-27 05:43:24 +08:00
|
|
|
return tsk->signal->nr_threads;
|
2010-05-27 05:43:22 +08:00
|
|
|
}
|
|
|
|
|
2011-06-23 05:10:26 +08:00
|
|
|
static inline bool thread_group_leader(struct task_struct *p)
|
|
|
|
{
|
|
|
|
return p->exit_signal >= 0;
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
[PATCH] proc: readdir race fix (take 3)
The problem: An opendir, readdir, closedir sequence can fail to report
process ids that are continually in use throughout the sequence of system
calls. For this race to trigger the process that proc_pid_readdir stops at
must exit before readdir is called again.
This can cause ps to fail to report processes, and it is in violation of
posix guarantees and normal application expectations with respect to
readdir.
Currently there is no way to work around this problem in user space short
of providing a gargantuan buffer to user space so the directory read all
happens in on system call.
This patch implements the normal directory semantics for proc, that
guarantee that a directory entry that is neither created nor destroyed
while reading the directory entry will be returned. For directory that are
either created or destroyed during the readdir you may or may not see them.
Furthermore you may seek to a directory offset you have previously seen.
These are the guarantee that ext[23] provides and that posix requires, and
more importantly that user space expects. Plus it is a simple semantic to
implement reliable service. It is just a matter of calling readdir a
second time if you are wondering if something new has show up.
These better semantics are implemented by scanning through the pids in
numerical order and by making the file offset a pid plus a fixed offset.
The pid scan happens on the pid bitmap, which when you look at it is
remarkably efficient for a brute force algorithm. Given that a typical
cache line is 64 bytes and thus covers space for 64*8 == 200 pids. There
are only 40 cache lines for the entire 32K pid space. A typical system
will have 100 pids or more so this is actually fewer cache lines we have to
look at to scan a linked list, and the worst case of having to scan the
entire pid bitmap is pretty reasonable.
If we need something more efficient we can go to a more efficient data
structure for indexing the pids, but for now what we have should be
sufficient.
In addition this takes no additional locks and is actually less code than
what we are doing now.
Also another very subtle bug in this area has been fixed. It is possible
to catch a task in the middle of de_thread where a thread is assuming the
thread of it's thread group leader. This patch carefully handles that case
so if we hit it we don't fail to return the pid, that is undergoing the
de_thread dance.
Thanks to KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> for
providing the first fix, pointing this out and working on it.
[oleg@tv-sign.ru: fix it]
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Jean Delvare <jdelvare@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-02 17:17:04 +08:00
|
|
|
/* Do to the insanities of de_thread it is possible for a process
|
|
|
|
* to have the pid of the thread group leader without actually being
|
|
|
|
* the thread group leader. For iteration through the pids in proc
|
|
|
|
* all we care about is that we have a task with the appropriate
|
|
|
|
* pid, we don't actually care if we have the right task.
|
|
|
|
*/
|
2013-09-12 05:20:06 +08:00
|
|
|
static inline bool has_group_leader_pid(struct task_struct *p)
|
[PATCH] proc: readdir race fix (take 3)
The problem: An opendir, readdir, closedir sequence can fail to report
process ids that are continually in use throughout the sequence of system
calls. For this race to trigger the process that proc_pid_readdir stops at
must exit before readdir is called again.
This can cause ps to fail to report processes, and it is in violation of
posix guarantees and normal application expectations with respect to
readdir.
Currently there is no way to work around this problem in user space short
of providing a gargantuan buffer to user space so the directory read all
happens in on system call.
This patch implements the normal directory semantics for proc, that
guarantee that a directory entry that is neither created nor destroyed
while reading the directory entry will be returned. For directory that are
either created or destroyed during the readdir you may or may not see them.
Furthermore you may seek to a directory offset you have previously seen.
These are the guarantee that ext[23] provides and that posix requires, and
more importantly that user space expects. Plus it is a simple semantic to
implement reliable service. It is just a matter of calling readdir a
second time if you are wondering if something new has show up.
These better semantics are implemented by scanning through the pids in
numerical order and by making the file offset a pid plus a fixed offset.
The pid scan happens on the pid bitmap, which when you look at it is
remarkably efficient for a brute force algorithm. Given that a typical
cache line is 64 bytes and thus covers space for 64*8 == 200 pids. There
are only 40 cache lines for the entire 32K pid space. A typical system
will have 100 pids or more so this is actually fewer cache lines we have to
look at to scan a linked list, and the worst case of having to scan the
entire pid bitmap is pretty reasonable.
If we need something more efficient we can go to a more efficient data
structure for indexing the pids, but for now what we have should be
sufficient.
In addition this takes no additional locks and is actually less code than
what we are doing now.
Also another very subtle bug in this area has been fixed. It is possible
to catch a task in the middle of de_thread where a thread is assuming the
thread of it's thread group leader. This patch carefully handles that case
so if we hit it we don't fail to return the pid, that is undergoing the
de_thread dance.
Thanks to KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> for
providing the first fix, pointing this out and working on it.
[oleg@tv-sign.ru: fix it]
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Jean Delvare <jdelvare@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-02 17:17:04 +08:00
|
|
|
{
|
2013-09-12 05:20:06 +08:00
|
|
|
return task_pid(p) == p->signal->leader_pid;
|
[PATCH] proc: readdir race fix (take 3)
The problem: An opendir, readdir, closedir sequence can fail to report
process ids that are continually in use throughout the sequence of system
calls. For this race to trigger the process that proc_pid_readdir stops at
must exit before readdir is called again.
This can cause ps to fail to report processes, and it is in violation of
posix guarantees and normal application expectations with respect to
readdir.
Currently there is no way to work around this problem in user space short
of providing a gargantuan buffer to user space so the directory read all
happens in on system call.
This patch implements the normal directory semantics for proc, that
guarantee that a directory entry that is neither created nor destroyed
while reading the directory entry will be returned. For directory that are
either created or destroyed during the readdir you may or may not see them.
Furthermore you may seek to a directory offset you have previously seen.
These are the guarantee that ext[23] provides and that posix requires, and
more importantly that user space expects. Plus it is a simple semantic to
implement reliable service. It is just a matter of calling readdir a
second time if you are wondering if something new has show up.
These better semantics are implemented by scanning through the pids in
numerical order and by making the file offset a pid plus a fixed offset.
The pid scan happens on the pid bitmap, which when you look at it is
remarkably efficient for a brute force algorithm. Given that a typical
cache line is 64 bytes and thus covers space for 64*8 == 200 pids. There
are only 40 cache lines for the entire 32K pid space. A typical system
will have 100 pids or more so this is actually fewer cache lines we have to
look at to scan a linked list, and the worst case of having to scan the
entire pid bitmap is pretty reasonable.
If we need something more efficient we can go to a more efficient data
structure for indexing the pids, but for now what we have should be
sufficient.
In addition this takes no additional locks and is actually less code than
what we are doing now.
Also another very subtle bug in this area has been fixed. It is possible
to catch a task in the middle of de_thread where a thread is assuming the
thread of it's thread group leader. This patch carefully handles that case
so if we hit it we don't fail to return the pid, that is undergoing the
de_thread dance.
Thanks to KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> for
providing the first fix, pointing this out and working on it.
[oleg@tv-sign.ru: fix it]
Signed-off-by: Eric W. Biederman <ebiederm@xmission.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Cc: Jean Delvare <jdelvare@suse.de>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-02 17:17:04 +08:00
|
|
|
}
|
|
|
|
|
2007-10-19 14:40:18 +08:00
|
|
|
static inline
|
2013-09-12 05:20:06 +08:00
|
|
|
bool same_thread_group(struct task_struct *p1, struct task_struct *p2)
|
2007-10-19 14:40:18 +08:00
|
|
|
{
|
2013-09-12 05:20:06 +08:00
|
|
|
return p1->signal == p2->signal;
|
2007-10-19 14:40:18 +08:00
|
|
|
}
|
|
|
|
|
2006-07-03 15:25:41 +08:00
|
|
|
static inline struct task_struct *next_thread(const struct task_struct *p)
|
2006-03-29 08:11:25 +08:00
|
|
|
{
|
2009-04-15 02:17:16 +08:00
|
|
|
return list_entry_rcu(p->thread_group.next,
|
|
|
|
struct task_struct, thread_group);
|
2006-03-29 08:11:25 +08:00
|
|
|
}
|
|
|
|
|
2007-10-26 16:17:22 +08:00
|
|
|
static inline int thread_group_empty(struct task_struct *p)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2006-03-29 08:11:25 +08:00
|
|
|
return list_empty(&p->thread_group);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
#define delay_group_leader(p) \
|
|
|
|
(thread_group_leader(p) && !thread_group_empty(p))
|
|
|
|
|
|
|
|
/*
|
2006-06-23 17:05:18 +08:00
|
|
|
* Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
|
2005-06-27 16:55:12 +08:00
|
|
|
* subscriptions and synchronises with wait4(). Also used in procfs. Also
|
Task Control Groups: basic task cgroup framework
Generic Process Control Groups
--------------------------
There have recently been various proposals floating around for
resource management/accounting and other task grouping subsystems in
the kernel, including ResGroups, User BeanCounters, NSProxy
cgroups, and others. These all need the basic abstraction of being
able to group together multiple processes in an aggregate, in order to
track/limit the resources permitted to those processes, or control
other behaviour of the processes, and all implement this grouping in
different ways.
This patchset provides a framework for tracking and grouping processes
into arbitrary "cgroups" and assigning arbitrary state to those
groupings, in order to control the behaviour of the cgroup as an
aggregate.
The intention is that the various resource management and
virtualization/cgroup efforts can also become task cgroup
clients, with the result that:
- the userspace APIs are (somewhat) normalised
- it's easier to test e.g. the ResGroups CPU controller in
conjunction with the BeanCounters memory controller, or use either of
them as the resource-control portion of a virtual server system.
- the additional kernel footprint of any of the competing resource
management systems is substantially reduced, since it doesn't need
to provide process grouping/containment, hence improving their
chances of getting into the kernel
This patch:
Add the main task cgroups framework - the cgroup filesystem, and the
basic structures for tracking membership and associating subsystem state
objects to tasks.
Signed-off-by: Paul Menage <menage@google.com>
Cc: Serge E. Hallyn <serue@us.ibm.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Dave Hansen <haveblue@us.ibm.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Cc: Paul Jackson <pj@sgi.com>
Cc: Kirill Korotaev <dev@openvz.org>
Cc: Herbert Poetzl <herbert@13thfloor.at>
Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com>
Cc: Cedric Le Goater <clg@fr.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-19 14:39:30 +08:00
|
|
|
* pins the final release of task.io_context. Also protects ->cpuset and
|
2012-03-06 06:59:13 +08:00
|
|
|
* ->cgroup.subsys[]. And ->vfork_done.
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
|
|
|
* Nests both inside and outside of read_lock(&tasklist_lock).
|
|
|
|
* It must not be nested with write_lock_irq(&tasklist_lock),
|
|
|
|
* neither inside nor outside.
|
|
|
|
*/
|
|
|
|
static inline void task_lock(struct task_struct *p)
|
|
|
|
{
|
|
|
|
spin_lock(&p->alloc_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void task_unlock(struct task_struct *p)
|
|
|
|
{
|
|
|
|
spin_unlock(&p->alloc_lock);
|
|
|
|
}
|
|
|
|
|
2010-10-28 06:34:06 +08:00
|
|
|
extern struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
|
2006-03-29 08:11:13 +08:00
|
|
|
unsigned long *flags);
|
|
|
|
|
2012-02-10 00:45:19 +08:00
|
|
|
static inline struct sighand_struct *lock_task_sighand(struct task_struct *tsk,
|
|
|
|
unsigned long *flags)
|
|
|
|
{
|
|
|
|
struct sighand_struct *ret;
|
|
|
|
|
|
|
|
ret = __lock_task_sighand(tsk, flags);
|
|
|
|
(void)__cond_lock(&tsk->sighand->siglock, ret);
|
|
|
|
return ret;
|
|
|
|
}
|
2010-10-28 06:34:06 +08:00
|
|
|
|
2006-03-29 08:11:13 +08:00
|
|
|
static inline void unlock_task_sighand(struct task_struct *tsk,
|
|
|
|
unsigned long *flags)
|
|
|
|
{
|
|
|
|
spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
|
|
|
|
}
|
|
|
|
|
2011-12-13 10:12:21 +08:00
|
|
|
/**
|
2015-05-14 04:35:16 +08:00
|
|
|
* threadgroup_change_begin - mark the beginning of changes to a threadgroup
|
|
|
|
* @tsk: task causing the changes
|
2011-12-13 10:12:21 +08:00
|
|
|
*
|
2015-05-14 04:35:16 +08:00
|
|
|
* All operations which modify a threadgroup - a new thread joining the
|
|
|
|
* group, death of a member thread (the assertion of PF_EXITING) and
|
|
|
|
* exec(2) dethreading the process and replacing the leader - are wrapped
|
|
|
|
* by threadgroup_change_{begin|end}(). This is to provide a place which
|
|
|
|
* subsystems needing threadgroup stability can hook into for
|
|
|
|
* synchronization.
|
2011-12-13 10:12:21 +08:00
|
|
|
*/
|
2015-05-14 04:35:16 +08:00
|
|
|
static inline void threadgroup_change_begin(struct task_struct *tsk)
|
2011-05-27 07:25:18 +08:00
|
|
|
{
|
2015-05-14 04:35:16 +08:00
|
|
|
might_sleep();
|
|
|
|
cgroup_threadgroup_change_begin(tsk);
|
2011-05-27 07:25:18 +08:00
|
|
|
}
|
2011-12-13 10:12:21 +08:00
|
|
|
|
|
|
|
/**
|
2015-05-14 04:35:16 +08:00
|
|
|
* threadgroup_change_end - mark the end of changes to a threadgroup
|
|
|
|
* @tsk: task causing the changes
|
2011-12-13 10:12:21 +08:00
|
|
|
*
|
2015-05-14 04:35:16 +08:00
|
|
|
* See threadgroup_change_begin().
|
2011-12-13 10:12:21 +08:00
|
|
|
*/
|
2015-05-14 04:35:16 +08:00
|
|
|
static inline void threadgroup_change_end(struct task_struct *tsk)
|
2011-05-27 07:25:18 +08:00
|
|
|
{
|
2015-05-14 04:35:16 +08:00
|
|
|
cgroup_threadgroup_change_end(tsk);
|
2011-05-27 07:25:18 +08:00
|
|
|
}
|
|
|
|
|
2005-11-14 08:06:57 +08:00
|
|
|
#ifndef __HAVE_THREAD_FUNCTIONS
|
|
|
|
|
2007-05-09 17:35:17 +08:00
|
|
|
#define task_thread_info(task) ((struct thread_info *)(task)->stack)
|
|
|
|
#define task_stack_page(task) ((task)->stack)
|
2005-11-14 08:06:55 +08:00
|
|
|
|
2005-11-14 08:06:56 +08:00
|
|
|
static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
|
|
|
|
{
|
|
|
|
*task_thread_info(p) = *task_thread_info(org);
|
|
|
|
task_thread_info(p)->task = p;
|
|
|
|
}
|
|
|
|
|
2014-09-20 23:17:51 +08:00
|
|
|
/*
|
|
|
|
* Return the address of the last usable long on the stack.
|
|
|
|
*
|
|
|
|
* When the stack grows down, this is just above the thread
|
|
|
|
* info struct. Going any lower will corrupt the threadinfo.
|
|
|
|
*
|
|
|
|
* When the stack grows up, this is the highest address.
|
|
|
|
* Beyond that position, we corrupt data on the next page.
|
|
|
|
*/
|
2005-11-14 08:06:56 +08:00
|
|
|
static inline unsigned long *end_of_stack(struct task_struct *p)
|
|
|
|
{
|
2014-09-20 23:17:51 +08:00
|
|
|
#ifdef CONFIG_STACK_GROWSUP
|
|
|
|
return (unsigned long *)((unsigned long)task_thread_info(p) + THREAD_SIZE) - 1;
|
|
|
|
#else
|
2007-05-09 17:35:17 +08:00
|
|
|
return (unsigned long *)(task_thread_info(p) + 1);
|
2014-09-20 23:17:51 +08:00
|
|
|
#endif
|
2005-11-14 08:06:56 +08:00
|
|
|
}
|
|
|
|
|
2005-11-14 08:06:57 +08:00
|
|
|
#endif
|
2014-09-12 21:16:18 +08:00
|
|
|
#define task_stack_end_corrupted(task) \
|
|
|
|
(*(end_of_stack(task)) != STACK_END_MAGIC)
|
2005-11-14 08:06:57 +08:00
|
|
|
|
2008-07-24 12:26:53 +08:00
|
|
|
static inline int object_is_on_stack(void *obj)
|
|
|
|
{
|
|
|
|
void *stack = task_stack_page(current);
|
|
|
|
|
|
|
|
return (obj >= stack) && (obj < (stack + THREAD_SIZE));
|
|
|
|
}
|
|
|
|
|
2008-04-18 14:56:15 +08:00
|
|
|
extern void thread_info_cache_init(void);
|
|
|
|
|
2008-04-23 05:38:23 +08:00
|
|
|
#ifdef CONFIG_DEBUG_STACK_USAGE
|
|
|
|
static inline unsigned long stack_not_used(struct task_struct *p)
|
|
|
|
{
|
|
|
|
unsigned long *n = end_of_stack(p);
|
|
|
|
|
|
|
|
do { /* Skip over canary */
|
2016-03-20 00:54:10 +08:00
|
|
|
# ifdef CONFIG_STACK_GROWSUP
|
|
|
|
n--;
|
|
|
|
# else
|
2008-04-23 05:38:23 +08:00
|
|
|
n++;
|
2016-03-20 00:54:10 +08:00
|
|
|
# endif
|
2008-04-23 05:38:23 +08:00
|
|
|
} while (!*n);
|
|
|
|
|
2016-03-20 00:54:10 +08:00
|
|
|
# ifdef CONFIG_STACK_GROWSUP
|
|
|
|
return (unsigned long)end_of_stack(p) - (unsigned long)n;
|
|
|
|
# else
|
2008-04-23 05:38:23 +08:00
|
|
|
return (unsigned long)n - (unsigned long)end_of_stack(p);
|
2016-03-20 00:54:10 +08:00
|
|
|
# endif
|
2008-04-23 05:38:23 +08:00
|
|
|
}
|
|
|
|
#endif
|
2014-09-12 21:16:17 +08:00
|
|
|
extern void set_task_stack_end_magic(struct task_struct *tsk);
|
2008-04-23 05:38:23 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* set thread flags in other task's structures
|
|
|
|
* - see asm/thread_info.h for TIF_xxxx flags available
|
|
|
|
*/
|
|
|
|
static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag)
|
|
|
|
{
|
2005-11-14 08:06:55 +08:00
|
|
|
set_ti_thread_flag(task_thread_info(tsk), flag);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag)
|
|
|
|
{
|
2005-11-14 08:06:55 +08:00
|
|
|
clear_ti_thread_flag(task_thread_info(tsk), flag);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag)
|
|
|
|
{
|
2005-11-14 08:06:55 +08:00
|
|
|
return test_and_set_ti_thread_flag(task_thread_info(tsk), flag);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag)
|
|
|
|
{
|
2005-11-14 08:06:55 +08:00
|
|
|
return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
|
|
|
|
{
|
2005-11-14 08:06:55 +08:00
|
|
|
return test_ti_thread_flag(task_thread_info(tsk), flag);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void set_tsk_need_resched(struct task_struct *tsk)
|
|
|
|
{
|
|
|
|
set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void clear_tsk_need_resched(struct task_struct *tsk)
|
|
|
|
{
|
|
|
|
clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
|
|
|
|
}
|
|
|
|
|
2008-04-23 19:13:29 +08:00
|
|
|
static inline int test_tsk_need_resched(struct task_struct *tsk)
|
|
|
|
{
|
|
|
|
return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
|
|
|
|
}
|
|
|
|
|
2009-05-14 00:55:10 +08:00
|
|
|
static inline int restart_syscall(void)
|
|
|
|
{
|
|
|
|
set_tsk_thread_flag(current, TIF_SIGPENDING);
|
|
|
|
return -ERESTARTNOINTR;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
static inline int signal_pending(struct task_struct *p)
|
|
|
|
{
|
|
|
|
return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
|
|
|
|
}
|
2007-12-07 00:15:50 +08:00
|
|
|
|
2009-09-24 06:57:04 +08:00
|
|
|
static inline int __fatal_signal_pending(struct task_struct *p)
|
|
|
|
{
|
|
|
|
return unlikely(sigismember(&p->pending.signal, SIGKILL));
|
|
|
|
}
|
2007-12-07 00:15:50 +08:00
|
|
|
|
|
|
|
static inline int fatal_signal_pending(struct task_struct *p)
|
|
|
|
{
|
|
|
|
return signal_pending(p) && __fatal_signal_pending(p);
|
|
|
|
}
|
|
|
|
|
sched: fix TASK_WAKEKILL vs SIGKILL race
schedule() has the special "TASK_INTERRUPTIBLE && signal_pending()" case,
this allows us to do
current->state = TASK_INTERRUPTIBLE;
schedule();
without fear to sleep with pending signal.
However, the code like
current->state = TASK_KILLABLE;
schedule();
is not right, schedule() doesn't take TASK_WAKEKILL into account. This means
that mutex_lock_killable(), wait_for_completion_killable(), down_killable(),
schedule_timeout_killable() can miss SIGKILL (and btw the second SIGKILL has
no effect).
Introduce the new helper, signal_pending_state(), and change schedule() to
use it. Hopefully it will have more users, that is why the task's state is
passed separately.
Note this "__TASK_STOPPED | __TASK_TRACED" check in signal_pending_state().
This is needed to preserve the current behaviour (ptrace_notify). I hope
this check will be removed soon, but this (afaics good) change needs the
separate discussion.
The fast path is "(state & (INTERRUPTIBLE | WAKEKILL)) + signal_pending(p)",
basically the same that schedule() does now. However, this patch of course
bloats schedule().
Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-06-09 01:20:41 +08:00
|
|
|
static inline int signal_pending_state(long state, struct task_struct *p)
|
|
|
|
{
|
|
|
|
if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
|
|
|
|
return 0;
|
|
|
|
if (!signal_pending(p))
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* cond_resched() and cond_resched_lock(): latency reduction via
|
|
|
|
* explicit rescheduling in places that are safe. The return
|
|
|
|
* value indicates whether a reschedule was done in fact.
|
|
|
|
* cond_resched_lock() will drop the spinlock before scheduling,
|
|
|
|
* cond_resched_softirq() will enable bhs before scheduling.
|
|
|
|
*/
|
2008-05-12 07:04:48 +08:00
|
|
|
extern int _cond_resched(void);
|
2009-07-16 21:44:29 +08:00
|
|
|
|
2009-07-16 21:44:29 +08:00
|
|
|
#define cond_resched() ({ \
|
2014-09-24 16:18:56 +08:00
|
|
|
___might_sleep(__FILE__, __LINE__, 0); \
|
2009-07-16 21:44:29 +08:00
|
|
|
_cond_resched(); \
|
|
|
|
})
|
2009-07-16 21:44:29 +08:00
|
|
|
|
2009-07-16 21:44:29 +08:00
|
|
|
extern int __cond_resched_lock(spinlock_t *lock);
|
|
|
|
|
|
|
|
#define cond_resched_lock(lock) ({ \
|
2014-09-24 16:18:56 +08:00
|
|
|
___might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET);\
|
2009-07-16 21:44:29 +08:00
|
|
|
__cond_resched_lock(lock); \
|
|
|
|
})
|
|
|
|
|
|
|
|
extern int __cond_resched_softirq(void);
|
|
|
|
|
2010-10-05 08:03:16 +08:00
|
|
|
#define cond_resched_softirq() ({ \
|
2014-09-24 16:18:56 +08:00
|
|
|
___might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \
|
2010-10-05 08:03:16 +08:00
|
|
|
__cond_resched_softirq(); \
|
2009-07-16 21:44:29 +08:00
|
|
|
})
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2013-05-22 13:50:31 +08:00
|
|
|
static inline void cond_resched_rcu(void)
|
|
|
|
{
|
|
|
|
#if defined(CONFIG_DEBUG_ATOMIC_SLEEP) || !defined(CONFIG_PREEMPT_RCU)
|
|
|
|
rcu_read_unlock();
|
|
|
|
cond_resched();
|
|
|
|
rcu_read_lock();
|
|
|
|
#endif
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Does a critical section need to be broken due to another
|
2008-01-30 20:31:20 +08:00
|
|
|
* task waiting?: (technically does not depend on CONFIG_PREEMPT,
|
|
|
|
* but a general need for low latency)
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2008-01-30 20:31:20 +08:00
|
|
|
static inline int spin_needbreak(spinlock_t *lock)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2008-01-30 20:31:20 +08:00
|
|
|
#ifdef CONFIG_PREEMPT
|
|
|
|
return spin_is_contended(lock);
|
|
|
|
#else
|
2005-04-17 06:20:36 +08:00
|
|
|
return 0;
|
2008-01-30 20:31:20 +08:00
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2013-03-22 05:49:32 +08:00
|
|
|
/*
|
|
|
|
* Idle thread specific functions to determine the need_resched
|
2014-04-09 20:30:10 +08:00
|
|
|
* polling state.
|
2013-03-22 05:49:32 +08:00
|
|
|
*/
|
2014-04-09 20:30:10 +08:00
|
|
|
#ifdef TIF_POLLING_NRFLAG
|
2013-03-22 05:49:32 +08:00
|
|
|
static inline int tsk_is_polling(struct task_struct *p)
|
|
|
|
{
|
|
|
|
return test_tsk_thread_flag(p, TIF_POLLING_NRFLAG);
|
|
|
|
}
|
2013-09-11 18:43:13 +08:00
|
|
|
|
|
|
|
static inline void __current_set_polling(void)
|
2013-03-22 05:49:33 +08:00
|
|
|
{
|
|
|
|
set_thread_flag(TIF_POLLING_NRFLAG);
|
|
|
|
}
|
|
|
|
|
2013-09-11 18:43:13 +08:00
|
|
|
static inline bool __must_check current_set_polling_and_test(void)
|
|
|
|
{
|
|
|
|
__current_set_polling();
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Polling state must be visible before we test NEED_RESCHED,
|
2014-06-29 04:03:57 +08:00
|
|
|
* paired by resched_curr()
|
2013-09-11 18:43:13 +08:00
|
|
|
*/
|
2014-03-18 01:06:10 +08:00
|
|
|
smp_mb__after_atomic();
|
2013-09-11 18:43:13 +08:00
|
|
|
|
|
|
|
return unlikely(tif_need_resched());
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void __current_clr_polling(void)
|
2013-03-22 05:49:33 +08:00
|
|
|
{
|
|
|
|
clear_thread_flag(TIF_POLLING_NRFLAG);
|
|
|
|
}
|
2013-09-11 18:43:13 +08:00
|
|
|
|
|
|
|
static inline bool __must_check current_clr_polling_and_test(void)
|
|
|
|
{
|
|
|
|
__current_clr_polling();
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Polling state must be visible before we test NEED_RESCHED,
|
2014-06-29 04:03:57 +08:00
|
|
|
* paired by resched_curr()
|
2013-09-11 18:43:13 +08:00
|
|
|
*/
|
2014-03-18 01:06:10 +08:00
|
|
|
smp_mb__after_atomic();
|
2013-09-11 18:43:13 +08:00
|
|
|
|
|
|
|
return unlikely(tif_need_resched());
|
|
|
|
}
|
|
|
|
|
2013-03-22 05:49:32 +08:00
|
|
|
#else
|
|
|
|
static inline int tsk_is_polling(struct task_struct *p) { return 0; }
|
2013-09-11 18:43:13 +08:00
|
|
|
static inline void __current_set_polling(void) { }
|
|
|
|
static inline void __current_clr_polling(void) { }
|
|
|
|
|
|
|
|
static inline bool __must_check current_set_polling_and_test(void)
|
|
|
|
{
|
|
|
|
return unlikely(tif_need_resched());
|
|
|
|
}
|
|
|
|
static inline bool __must_check current_clr_polling_and_test(void)
|
|
|
|
{
|
|
|
|
return unlikely(tif_need_resched());
|
|
|
|
}
|
2013-03-22 05:49:32 +08:00
|
|
|
#endif
|
|
|
|
|
2013-11-20 19:22:37 +08:00
|
|
|
static inline void current_clr_polling(void)
|
|
|
|
{
|
|
|
|
__current_clr_polling();
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Ensure we check TIF_NEED_RESCHED after we clear the polling bit.
|
|
|
|
* Once the bit is cleared, we'll get IPIs with every new
|
|
|
|
* TIF_NEED_RESCHED and the IPI handler, scheduler_ipi(), will also
|
|
|
|
* fold.
|
|
|
|
*/
|
2014-06-29 04:03:57 +08:00
|
|
|
smp_mb(); /* paired with resched_curr() */
|
2013-11-20 19:22:37 +08:00
|
|
|
|
|
|
|
preempt_fold_need_resched();
|
|
|
|
}
|
|
|
|
|
2013-09-27 23:30:03 +08:00
|
|
|
static __always_inline bool need_resched(void)
|
|
|
|
{
|
|
|
|
return unlikely(tif_need_resched());
|
|
|
|
}
|
|
|
|
|
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
struct task_cputime totals;
};
struct thread_group_cputime {
struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values from
from all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
|
|
|
/*
|
|
|
|
* Thread group CPU time accounting.
|
|
|
|
*/
|
2009-02-05 19:24:16 +08:00
|
|
|
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
|
2009-02-11 18:30:27 +08:00
|
|
|
void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
|
timers: fix itimer/many thread hang
Overview
This patch reworks the handling of POSIX CPU timers, including the
ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together
with the help of Roland McGrath, the owner and original writer of this code.
The problem we ran into, and the reason for this rework, has to do with using
a profiling timer in a process with a large number of threads. It appears
that the performance of the old implementation of run_posix_cpu_timers() was
at least O(n*3) (where "n" is the number of threads in a process) or worse.
Everything is fine with an increasing number of threads until the time taken
for that routine to run becomes the same as or greater than the tick time, at
which point things degrade rather quickly.
This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF."
Code Changes
This rework corrects the implementation of run_posix_cpu_timers() to make it
run in constant time for a particular machine. (Performance may vary between
one machine and another depending upon whether the kernel is built as single-
or multiprocessor and, in the latter case, depending upon the number of
running processors.) To do this, at each tick we now update fields in
signal_struct as well as task_struct. The run_posix_cpu_timers() function
uses those fields to make its decisions.
We define a new structure, "task_cputime," to contain user, system and
scheduler times and use these in appropriate places:
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
This is included in the structure "thread_group_cputime," which is a new
substructure of signal_struct and which varies for uniprocessor versus
multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as
a simple substructure, while for multiprocessor kernels it is a pointer:
struct thread_group_cputime {
struct task_cputime totals;
};
struct thread_group_cputime {
struct task_cputime *totals;
};
We also add a new task_cputime substructure directly to signal_struct, to
cache the earliest expiration of process-wide timers, and task_cputime also
replaces the it_*_expires fields of task_struct (used for earliest expiration
of thread timers). The "thread_group_cputime" structure contains process-wide
timers that are updated via account_user_time() and friends. In the non-SMP
case the structure is a simple aggregator; unfortunately in the SMP case that
simplicity was not achievable due to cache-line contention between CPUs (in
one measured case performance was actually _worse_ on a 16-cpu system than
the same test on a 4-cpu system, due to this contention). For SMP, the
thread_group_cputime counters are maintained as a per-cpu structure allocated
using alloc_percpu(). The timer functions update only the timer field in
the structure corresponding to the running CPU, obtained using per_cpu_ptr().
We define a set of inline functions in sched.h that we use to maintain the
thread_group_cputime structure and hide the differences between UP and SMP
implementations from the rest of the kernel. The thread_group_cputime_init()
function initializes the thread_group_cputime structure for the given task.
The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the
out-of-line function thread_group_cputime_alloc_smp() to allocate and fill
in the per-cpu structures and fields. The thread_group_cputime_free()
function, also a no-op for UP, in SMP frees the per-cpu structures. The
thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls
thread_group_cputime_alloc() if the per-cpu structures haven't yet been
allocated. The thread_group_cputime() function fills the task_cputime
structure it is passed with the contents of the thread_group_cputime fields;
in UP it's that simple but in SMP it must also safely check that tsk->signal
is non-NULL (if it is it just uses the appropriate fields of task_struct) and,
if so, sums the per-cpu values for each online CPU. Finally, the three
functions account_group_user_time(), account_group_system_time() and
account_group_exec_runtime() are used by timer functions to update the
respective fields of the thread_group_cputime structure.
Non-SMP operation is trivial and will not be mentioned further.
The per-cpu structure is always allocated when a task creates its first new
thread, via a call to thread_group_cputime_clone_thread() from copy_signal().
It is freed at process exit via a call to thread_group_cputime_free() from
cleanup_signal().
All functions that formerly summed utime/stime/sum_sched_runtime values from
from all threads in the thread group now use thread_group_cputime() to
snapshot the values in the thread_group_cputime structure or the values in
the task structure itself if the per-cpu structure hasn't been allocated.
Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit.
The run_posix_cpu_timers() function has been split into a fast path and a
slow path; the former safely checks whether there are any expired thread
timers and, if not, just returns, while the slow path does the heavy lifting.
With the dedicated thread group fields, timers are no longer "rebalanced" and
the process_timer_rebalance() function and related code has gone away. All
summing loops are gone and all code that used them now uses the
thread_group_cputime() inline. When process-wide timers are set, the new
task_cputime structure in signal_struct is used to cache the earliest
expiration; this is checked in the fast path.
Performance
The fix appears not to add significant overhead to existing operations. It
generally performs the same as the current code except in two cases, one in
which it performs slightly worse (Case 5 below) and one in which it performs
very significantly better (Case 2 below). Overall it's a wash except in those
two cases.
I've since done somewhat more involved testing on a dual-core Opteron system.
Case 1: With no itimer running, for a test with 100,000 threads, the fixed
kernel took 1428.5 seconds, 513 seconds more than the unfixed system,
all of which was spent in the system. There were twice as many
voluntary context switches with the fix as without it.
Case 2: With an itimer running at .01 second ticks and 4000 threads (the most
an unmodified kernel can handle), the fixed kernel ran the test in
eight percent of the time (5.8 seconds as opposed to 70 seconds) and
had better tick accuracy (.012 seconds per tick as opposed to .023
seconds per tick).
Case 3: A 4000-thread test with an initial timer tick of .01 second and an
interval of 10,000 seconds (i.e. a timer that ticks only once) had
very nearly the same performance in both cases: 6.3 seconds elapsed
for the fixed kernel versus 5.5 seconds for the unfixed kernel.
With fewer threads (eight in these tests), the Case 1 test ran in essentially
the same time on both the modified and unmodified kernels (5.2 seconds versus
5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds
versus 5.4 seconds but again with much better tick accuracy, .013 seconds per
tick versus .025 seconds per tick for the unmodified kernel.
Since the fix affected the rlimit code, I also tested soft and hard CPU limits.
Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer
running), the modified kernel was very slightly favored in that while
it killed the process in 19.997 seconds of CPU time (5.002 seconds of
wall time), only .003 seconds of that was system time, the rest was
user time. The unmodified kernel killed the process in 20.001 seconds
of CPU (5.014 seconds of wall time) of which .016 seconds was system
time. Really, though, the results were too close to call. The results
were essentially the same with no itimer running.
Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds
(where the hard limit would never be reached) and an itimer running,
the modified kernel exhibited worse tick accuracy than the unmodified
kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise,
performance was almost indistinguishable. With no itimer running this
test exhibited virtually identical behavior and times in both cases.
In times past I did some limited performance testing. those results are below.
On a four-cpu Opteron system without this fix, a sixteen-thread test executed
in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On
the same system with the fix, user and elapsed time were about the same, but
system time dropped to 0.007 seconds. Performance with eight, four and one
thread were comparable. Interestingly, the timer ticks with the fix seemed
more accurate: The sixteen-thread test with the fix received 149543 ticks
for 0.024 seconds per tick, while the same test without the fix received 58720
for 0.061 seconds per tick. Both cases were configured for an interval of
0.01 seconds. Again, the other tests were comparable. Each thread in this
test computed the primes up to 25,000,000.
I also did a test with a large number of threads, 100,000 threads, which is
impossible without the fix. In this case each thread computed the primes only
up to 10,000 (to make the runtime manageable). System time dominated, at
1546.968 seconds out of a total 2176.906 seconds (giving a user time of
629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite
accurate. There is obviously no comparable test without the fix.
Signed-off-by: Frank Mayhar <fmayhar@google.com>
Cc: Roland McGrath <roland@redhat.com>
Cc: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
|
|
|
|
2007-05-24 04:57:44 +08:00
|
|
|
/*
|
|
|
|
* Reevaluate whether the task has signals pending delivery.
|
|
|
|
* Wake the task if so.
|
|
|
|
* This is required every time the blocked sigset_t changes.
|
|
|
|
* callers must hold sighand->siglock.
|
|
|
|
*/
|
|
|
|
extern void recalc_sigpending_and_wake(struct task_struct *t);
|
2005-04-17 06:20:36 +08:00
|
|
|
extern void recalc_sigpending(void);
|
|
|
|
|
2013-01-22 03:47:41 +08:00
|
|
|
extern void signal_wake_up_state(struct task_struct *t, unsigned int state);
|
|
|
|
|
|
|
|
static inline void signal_wake_up(struct task_struct *t, bool resume)
|
|
|
|
{
|
|
|
|
signal_wake_up_state(t, resume ? TASK_WAKEKILL : 0);
|
|
|
|
}
|
|
|
|
static inline void ptrace_signal_wake_up(struct task_struct *t, bool resume)
|
|
|
|
{
|
|
|
|
signal_wake_up_state(t, resume ? __TASK_TRACED : 0);
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Wrappers for p->thread_info->cpu access. No-op on UP.
|
|
|
|
*/
|
|
|
|
#ifdef CONFIG_SMP
|
|
|
|
|
|
|
|
static inline unsigned int task_cpu(const struct task_struct *p)
|
|
|
|
{
|
2005-11-14 08:06:55 +08:00
|
|
|
return task_thread_info(p)->cpu;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2013-10-07 18:29:30 +08:00
|
|
|
static inline int task_node(const struct task_struct *p)
|
|
|
|
{
|
|
|
|
return cpu_to_node(task_cpu(p));
|
|
|
|
}
|
|
|
|
|
2007-07-10 00:51:58 +08:00
|
|
|
extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#else
|
|
|
|
|
|
|
|
static inline unsigned int task_cpu(const struct task_struct *p)
|
|
|
|
{
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif /* CONFIG_SMP */
|
|
|
|
|
2008-11-25 00:05:14 +08:00
|
|
|
extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
|
|
|
|
extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
|
2006-06-27 17:54:42 +08:00
|
|
|
|
2010-01-20 20:26:18 +08:00
|
|
|
#ifdef CONFIG_CGROUP_SCHED
|
2011-01-07 15:17:36 +08:00
|
|
|
extern struct task_group root_task_group;
|
2012-06-22 19:36:05 +08:00
|
|
|
#endif /* CONFIG_CGROUP_SCHED */
|
2007-10-15 23:00:09 +08:00
|
|
|
|
2009-02-27 17:43:54 +08:00
|
|
|
extern int task_can_switch_user(struct user_struct *up,
|
|
|
|
struct task_struct *tsk);
|
|
|
|
|
[PATCH] ifdef ->rchar, ->wchar, ->syscr, ->syscw from task_struct
They are fat: 4x8 bytes in task_struct.
They are uncoditionally updated in every fork, read, write and sendfile.
They are used only if you have some "extended acct fields feature".
And please, please, please, read(2) knows about bytes, not characters,
why it is called "rchar"?
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-02-10 17:46:45 +08:00
|
|
|
#ifdef CONFIG_TASK_XACCT
|
|
|
|
static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
|
|
|
|
{
|
2008-07-28 06:48:12 +08:00
|
|
|
tsk->ioac.rchar += amt;
|
[PATCH] ifdef ->rchar, ->wchar, ->syscr, ->syscw from task_struct
They are fat: 4x8 bytes in task_struct.
They are uncoditionally updated in every fork, read, write and sendfile.
They are used only if you have some "extended acct fields feature".
And please, please, please, read(2) knows about bytes, not characters,
why it is called "rchar"?
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-02-10 17:46:45 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
|
|
|
|
{
|
2008-07-28 06:48:12 +08:00
|
|
|
tsk->ioac.wchar += amt;
|
[PATCH] ifdef ->rchar, ->wchar, ->syscr, ->syscw from task_struct
They are fat: 4x8 bytes in task_struct.
They are uncoditionally updated in every fork, read, write and sendfile.
They are used only if you have some "extended acct fields feature".
And please, please, please, read(2) knows about bytes, not characters,
why it is called "rchar"?
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-02-10 17:46:45 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void inc_syscr(struct task_struct *tsk)
|
|
|
|
{
|
2008-07-28 06:48:12 +08:00
|
|
|
tsk->ioac.syscr++;
|
[PATCH] ifdef ->rchar, ->wchar, ->syscr, ->syscw from task_struct
They are fat: 4x8 bytes in task_struct.
They are uncoditionally updated in every fork, read, write and sendfile.
They are used only if you have some "extended acct fields feature".
And please, please, please, read(2) knows about bytes, not characters,
why it is called "rchar"?
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-02-10 17:46:45 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline void inc_syscw(struct task_struct *tsk)
|
|
|
|
{
|
2008-07-28 06:48:12 +08:00
|
|
|
tsk->ioac.syscw++;
|
[PATCH] ifdef ->rchar, ->wchar, ->syscr, ->syscw from task_struct
They are fat: 4x8 bytes in task_struct.
They are uncoditionally updated in every fork, read, write and sendfile.
They are used only if you have some "extended acct fields feature".
And please, please, please, read(2) knows about bytes, not characters,
why it is called "rchar"?
Signed-off-by: Alexey Dobriyan <adobriyan@gmail.com>
Cc: Jay Lan <jlan@engr.sgi.com>
Cc: Balbir Singh <balbir@in.ibm.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-02-10 17:46:45 +08:00
|
|
|
}
|
|
|
|
#else
|
|
|
|
static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void inc_syscr(struct task_struct *tsk)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline void inc_syscw(struct task_struct *tsk)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
2008-02-05 14:28:59 +08:00
|
|
|
#ifndef TASK_SIZE_OF
|
|
|
|
#define TASK_SIZE_OF(tsk) TASK_SIZE
|
|
|
|
#endif
|
|
|
|
|
2014-06-05 07:07:34 +08:00
|
|
|
#ifdef CONFIG_MEMCG
|
cgroups: add an owner to the mm_struct
Remove the mem_cgroup member from mm_struct and instead adds an owner.
This approach was suggested by Paul Menage. The advantage of this approach
is that, once the mm->owner is known, using the subsystem id, the cgroup
can be determined. It also allows several control groups that are
virtually grouped by mm_struct, to exist independent of the memory
controller i.e., without adding mem_cgroup's for each controller, to
mm_struct.
A new config option CONFIG_MM_OWNER is added and the memory resource
controller selects this config option.
This patch also adds cgroup callbacks to notify subsystems when mm->owner
changes. The mm_cgroup_changed callback is called with the task_lock() of
the new task held and is called just prior to changing the mm->owner.
I am indebted to Paul Menage for the several reviews of this patchset and
helping me make it lighter and simpler.
This patch was tested on a powerpc box, it was compiled with both the
MM_OWNER config turned on and off.
After the thread group leader exits, it's moved to init_css_state by
cgroup_exit(), thus all future charges from runnings threads would be
redirected to the init_css_set's subsystem.
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Pavel Emelianov <xemul@openvz.org>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Sudhir Kumar <skumar@linux.vnet.ibm.com>
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Cc: Hirokazu Takahashi <taka@valinux.co.jp>
Cc: David Rientjes <rientjes@google.com>,
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Reviewed-by: Paul Menage <menage@google.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-29 16:00:16 +08:00
|
|
|
extern void mm_update_next_owner(struct mm_struct *mm);
|
|
|
|
#else
|
|
|
|
static inline void mm_update_next_owner(struct mm_struct *mm)
|
|
|
|
{
|
|
|
|
}
|
2014-06-05 07:07:34 +08:00
|
|
|
#endif /* CONFIG_MEMCG */
|
cgroups: add an owner to the mm_struct
Remove the mem_cgroup member from mm_struct and instead adds an owner.
This approach was suggested by Paul Menage. The advantage of this approach
is that, once the mm->owner is known, using the subsystem id, the cgroup
can be determined. It also allows several control groups that are
virtually grouped by mm_struct, to exist independent of the memory
controller i.e., without adding mem_cgroup's for each controller, to
mm_struct.
A new config option CONFIG_MM_OWNER is added and the memory resource
controller selects this config option.
This patch also adds cgroup callbacks to notify subsystems when mm->owner
changes. The mm_cgroup_changed callback is called with the task_lock() of
the new task held and is called just prior to changing the mm->owner.
I am indebted to Paul Menage for the several reviews of this patchset and
helping me make it lighter and simpler.
This patch was tested on a powerpc box, it was compiled with both the
MM_OWNER config turned on and off.
After the thread group leader exits, it's moved to init_css_state by
cgroup_exit(), thus all future charges from runnings threads would be
redirected to the init_css_set's subsystem.
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Cc: Pavel Emelianov <xemul@openvz.org>
Cc: Hugh Dickins <hugh@veritas.com>
Cc: Sudhir Kumar <skumar@linux.vnet.ibm.com>
Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp>
Cc: Hirokazu Takahashi <taka@valinux.co.jp>
Cc: David Rientjes <rientjes@google.com>,
Cc: Balbir Singh <balbir@linux.vnet.ibm.com>
Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Acked-by: Pekka Enberg <penberg@cs.helsinki.fi>
Reviewed-by: Paul Menage <menage@google.com>
Cc: Oleg Nesterov <oleg@tv-sign.ru>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-29 16:00:16 +08:00
|
|
|
|
2009-11-20 00:16:37 +08:00
|
|
|
static inline unsigned long task_rlimit(const struct task_struct *tsk,
|
|
|
|
unsigned int limit)
|
|
|
|
{
|
2015-04-29 04:00:20 +08:00
|
|
|
return READ_ONCE(tsk->signal->rlim[limit].rlim_cur);
|
2009-11-20 00:16:37 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline unsigned long task_rlimit_max(const struct task_struct *tsk,
|
|
|
|
unsigned int limit)
|
|
|
|
{
|
2015-04-29 04:00:20 +08:00
|
|
|
return READ_ONCE(tsk->signal->rlim[limit].rlim_max);
|
2009-11-20 00:16:37 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
static inline unsigned long rlimit(unsigned int limit)
|
|
|
|
{
|
|
|
|
return task_rlimit(current, limit);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline unsigned long rlimit_max(unsigned int limit)
|
|
|
|
{
|
|
|
|
return task_rlimit_max(current, limit);
|
|
|
|
}
|
|
|
|
|
2016-03-11 03:44:47 +08:00
|
|
|
#ifdef CONFIG_CPU_FREQ
|
|
|
|
struct update_util_data {
|
|
|
|
void (*func)(struct update_util_data *data,
|
|
|
|
u64 time, unsigned long util, unsigned long max);
|
|
|
|
};
|
|
|
|
|
|
|
|
void cpufreq_set_update_util_data(int cpu, struct update_util_data *data);
|
|
|
|
#endif /* CONFIG_CPU_FREQ */
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif
|