OpenCloudOS-Kernel/include/linux/sched.h

2824 lines
83 KiB
C
Raw Normal View History

#ifndef _LINUX_SCHED_H
#define _LINUX_SCHED_H
/*
* cloning flags:
*/
#define CSIGNAL 0x000000ff /* signal mask to be sent at exit */
#define CLONE_VM 0x00000100 /* set if VM shared between processes */
#define CLONE_FS 0x00000200 /* set if fs info shared between processes */
#define CLONE_FILES 0x00000400 /* set if open files shared between processes */
#define CLONE_SIGHAND 0x00000800 /* set if signal handlers and blocked signals shared */
#define CLONE_PTRACE 0x00002000 /* set if we want to let tracing continue on the child too */
#define CLONE_VFORK 0x00004000 /* set if the parent wants the child to wake it up on mm_release */
#define CLONE_PARENT 0x00008000 /* set if we want to have the same parent as the cloner */
#define CLONE_THREAD 0x00010000 /* Same thread group? */
#define CLONE_NEWNS 0x00020000 /* New namespace group? */
#define CLONE_SYSVSEM 0x00040000 /* share system V SEM_UNDO semantics */
#define CLONE_SETTLS 0x00080000 /* create a new TLS for the child */
#define CLONE_PARENT_SETTID 0x00100000 /* set the TID in the parent */
#define CLONE_CHILD_CLEARTID 0x00200000 /* clear the TID in the child */
#define CLONE_DETACHED 0x00400000 /* Unused, ignored */
#define CLONE_UNTRACED 0x00800000 /* set if the tracing process can't force CLONE_PTRACE on this clone */
#define CLONE_CHILD_SETTID 0x01000000 /* set the TID in the child */
/* 0x02000000 was previously the unused CLONE_STOPPED (Start in stopped state)
and is now available for re-use. */
#define CLONE_NEWUTS 0x04000000 /* New utsname group? */
#define CLONE_NEWIPC 0x08000000 /* New ipcs */
#define CLONE_NEWUSER 0x10000000 /* New user namespace */
#define CLONE_NEWPID 0x20000000 /* New pid namespace */
#define CLONE_NEWNET 0x40000000 /* New network namespace */
#define CLONE_IO 0x80000000 /* Clone io context */
/*
* Scheduling policies
*/
#define SCHED_NORMAL 0
#define SCHED_FIFO 1
#define SCHED_RR 2
#define SCHED_BATCH 3
/* SCHED_ISO: reserved but not implemented yet */
#define SCHED_IDLE 5
sched: Introduce SCHED_RESET_ON_FORK scheduling policy flag This patch introduces a new flag SCHED_RESET_ON_FORK which can be passed to the kernel via sched_setscheduler(), ORed in the policy parameter. If set this will make sure that when the process forks a) the scheduling priority is reset to DEFAULT_PRIO if it was higher and b) the scheduling policy is reset to SCHED_NORMAL if it was either SCHED_FIFO or SCHED_RR. Why have this? Currently, if a process is real-time scheduled this will 'leak' to all its child processes. For security reasons it is often (always?) a good idea to make sure that if a process acquires RT scheduling this is confined to this process and only this process. More specifically this makes the per-process resource limit RLIMIT_RTTIME useful for security purposes, because it makes it impossible to use a fork bomb to circumvent the per-process RLIMIT_RTTIME accounting. This feature is also useful for tools like 'renice' which can then change the nice level of a process without having this spill to all its child processes. Why expose this via sched_setscheduler() and not other syscalls such as prctl() or sched_setparam()? prctl() does not take a pid parameter. Due to that it would be impossible to modify this flag for other processes than the current one. The struct passed to sched_setparam() can unfortunately not be extended without breaking compatibility, since sched_setparam() lacks a size parameter. How to use this from userspace? In your RT program simply replace this: sched_setscheduler(pid, SCHED_FIFO, &param); by this: sched_setscheduler(pid, SCHED_FIFO|SCHED_RESET_ON_FORK, &param); Signed-off-by: Lennart Poettering <lennart@poettering.net> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> LKML-Reference: <20090615152714.GA29092@tango.0pointer.de> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-06-15 23:17:47 +08:00
/* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */
#define SCHED_RESET_ON_FORK 0x40000000
#ifdef __KERNEL__
struct sched_param {
int sched_priority;
};
#include <asm/param.h> /* for HZ */
#include <linux/capability.h>
#include <linux/threads.h>
#include <linux/kernel.h>
#include <linux/types.h>
#include <linux/timex.h>
#include <linux/jiffies.h>
#include <linux/rbtree.h>
#include <linux/thread_info.h>
#include <linux/cpumask.h>
#include <linux/errno.h>
#include <linux/nodemask.h>
#include <linux/mm_types.h>
#include <asm/page.h>
#include <asm/ptrace.h>
#include <asm/cputime.h>
#include <linux/smp.h>
#include <linux/sem.h>
#include <linux/signal.h>
#include <linux/compiler.h>
#include <linux/completion.h>
#include <linux/pid.h>
#include <linux/percpu.h>
#include <linux/topology.h>
#include <linux/proportions.h>
#include <linux/seccomp.h>
#include <linux/rcupdate.h>
#include <linux/rculist.h>
#include <linux/rtmutex.h>
#include <linux/time.h>
#include <linux/param.h>
#include <linux/resource.h>
#include <linux/timer.h>
#include <linux/hrtimer.h>
[PATCH] io-accounting: core statistics The present per-task IO accounting isn't very useful. It simply counts the number of bytes passed into read() and write(). So if a process reads 1MB from an already-cached file, it is accused of having performed 1MB of I/O, which is wrong. (David Wright had some comments on the applicability of the present logical IO accounting: For billing purposes it is useless but for workload analysis it is very useful read_bytes/read_calls average read request size write_bytes/write_calls average write request size read_bytes/read_blocks ie logical/physical can indicate hit rate or thrashing write_bytes/write_blocks ie logical/physical guess since pdflush writes can be missed I often look for logical larger than physical to see filesystem cache problems. And the bytes/cpusec can help find applications that are dominating the cache and causing slow interactive response from page cache contention. I want to find the IO intensive applications and make sure they are doing efficient IO. Thus the acctcms(sysV) or csacms command would give the high IO commands). This patchset adds new accounting which tries to be more accurate. We account for three things: reads: attempt to count the number of bytes which this process really did cause to be fetched from the storage layer. Done at the submit_bio() level, so it is accurate for block-backed filesystems. I also attempt to wire up NFS and CIFS. writes: attempt to count the number of bytes which this process caused to be sent to the storage layer. This is done at page-dirtying time. The big inaccuracy here is truncate. If a process writes 1MB to a file and then deletes the file, it will in fact perform no writeout. But it will have been accounted as having caused 1MB of write. So... cancelled_writes: account the number of bytes which this process caused to not happen, by truncating pagecache. We _could_ just subtract this from the process's `write' accounting. But that means that some processes would be reported to have done negative amounts of write IO, which is silly. So we just report the raw number and punt this decision up to userspace. Now, we _could_ account for writes at the physical I/O level. But - This would require that we track memory-dirtying tasks at the per-page level (would require a new pointer in struct page). - It would mean that IO statistics for a process are usually only available long after that process has exitted. Which means that we probably cannot communicate this info via taskstats. This patch: Wire up the kernel-private data structures and the accessor functions to manipulate them. Cc: Jay Lan <jlan@sgi.com> Cc: Shailabh Nagar <nagar@watson.ibm.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Chris Sturtivant <csturtiv@sgi.com> Cc: Tony Ernst <tee@sgi.com> Cc: Guillaume Thouvenin <guillaume.thouvenin@bull.net> Cc: David Wright <daw@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-10 18:19:19 +08:00
#include <linux/task_io_accounting.h>
#include <linux/latencytop.h>
#include <linux/cred.h>
#include <linux/llist.h>
#include <asm/processor.h>
struct exec_domain;
struct futex_pi_state;
struct robust_list_head;
struct bio_list;
struct fs_struct;
perf: Do the big rename: Performance Counters -> Performance Events Bye-bye Performance Counters, welcome Performance Events! In the past few months the perfcounters subsystem has grown out its initial role of counting hardware events, and has become (and is becoming) a much broader generic event enumeration, reporting, logging, monitoring, analysis facility. Naming its core object 'perf_counter' and naming the subsystem 'perfcounters' has become more and more of a misnomer. With pending code like hw-breakpoints support the 'counter' name is less and less appropriate. All in one, we've decided to rename the subsystem to 'performance events' and to propagate this rename through all fields, variables and API names. (in an ABI compatible fashion) The word 'event' is also a bit shorter than 'counter' - which makes it slightly more convenient to write/handle as well. Thanks goes to Stephane Eranian who first observed this misnomer and suggested a rename. User-space tooling and ABI compatibility is not affected - this patch should be function-invariant. (Also, defconfigs were not touched to keep the size down.) This patch has been generated via the following script: FILES=$(find * -type f | grep -vE 'oprofile|[^K]config') sed -i \ -e 's/PERF_EVENT_/PERF_RECORD_/g' \ -e 's/PERF_COUNTER/PERF_EVENT/g' \ -e 's/perf_counter/perf_event/g' \ -e 's/nb_counters/nb_events/g' \ -e 's/swcounter/swevent/g' \ -e 's/tpcounter_event/tp_event/g' \ $FILES for N in $(find . -name perf_counter.[ch]); do M=$(echo $N | sed 's/perf_counter/perf_event/g') mv $N $M done FILES=$(find . -name perf_event.*) sed -i \ -e 's/COUNTER_MASK/REG_MASK/g' \ -e 's/COUNTER/EVENT/g' \ -e 's/\<event\>/event_id/g' \ -e 's/counter/event/g' \ -e 's/Counter/Event/g' \ $FILES ... to keep it as correct as possible. This script can also be used by anyone who has pending perfcounters patches - it converts a Linux kernel tree over to the new naming. We tried to time this change to the point in time where the amount of pending patches is the smallest: the end of the merge window. Namespace clashes were fixed up in a preparatory patch - and some stylistic fallout will be fixed up in a subsequent patch. ( NOTE: 'counters' are still the proper terminology when we deal with hardware registers - and these sed scripts are a bit over-eager in renaming them. I've undone some of that, but in case there's something left where 'counter' would be better than 'event' we can undo that on an individual basis instead of touching an otherwise nicely automated patch. ) Suggested-by: Stephane Eranian <eranian@google.com> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Acked-by: Paul Mackerras <paulus@samba.org> Reviewed-by: Arjan van de Ven <arjan@linux.intel.com> Cc: Mike Galbraith <efault@gmx.de> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: David Howells <dhowells@redhat.com> Cc: Kyle McMartin <kyle@mcmartin.ca> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: "David S. Miller" <davem@davemloft.net> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: <linux-arch@vger.kernel.org> LKML-Reference: <new-submission> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-21 18:02:48 +08:00
struct perf_event_context;
struct blk_plug;
/*
* List of flags we want to share for kernel threads,
* if only because they are not used by them anyway.
*/
#define CLONE_KERNEL (CLONE_FS | CLONE_FILES | CLONE_SIGHAND)
/*
* These are the constant used to fake the fixed-point load-average
* counting. Some notes:
* - 11 bit fractions expand to 22 bits by the multiplies: this gives
* a load-average precision of 10 bits integer + 11 bits fractional
* - if you want to count load-averages more often, you need more
* precision, or rounding will get you. With 2-second counting freq,
* the EXP_n values would be 1981, 2034 and 2043 if still using only
* 11 bit fractions.
*/
extern unsigned long avenrun[]; /* Load averages */
extern void get_avenrun(unsigned long *loads, unsigned long offset, int shift);
#define FSHIFT 11 /* nr of bits of precision */
#define FIXED_1 (1<<FSHIFT) /* 1.0 as fixed-point */
#define LOAD_FREQ (5*HZ+1) /* 5 sec intervals */
#define EXP_1 1884 /* 1/exp(5sec/1min) as fixed-point */
#define EXP_5 2014 /* 1/exp(5sec/5min) */
#define EXP_15 2037 /* 1/exp(5sec/15min) */
#define CALC_LOAD(load,exp,n) \
load *= exp; \
load += n*(FIXED_1-exp); \
load >>= FSHIFT;
extern unsigned long total_forks;
extern int nr_threads;
DECLARE_PER_CPU(unsigned long, process_counts);
extern int nr_processes(void);
extern unsigned long nr_running(void);
extern unsigned long nr_uninterruptible(void);
extern unsigned long nr_iowait(void);
extern unsigned long nr_iowait_cpu(int cpu);
cpuidle: fix the menu governor to boost IO performance Fix the menu idle governor which balances power savings, energy efficiency and performance impact. The reason for a reworked governor is that there have been serious performance issues reported with the existing code on Nehalem server systems. To show this I'm sure Andrew wants to see benchmark results: (benchmark is "fio", "no cstates" is using "idle=poll") no cstates current linux new algorithm 1 disk 107 Mb/s 85 Mb/s 105 Mb/s 2 disks 215 Mb/s 123 Mb/s 209 Mb/s 12 disks 590 Mb/s 320 Mb/s 585 Mb/s In various power benchmark measurements, no degredation was found by our measurement&diagnostics team. Obviously a small percentage more power was used in the "fio" benchmark, due to the much higher performance. While it would be a novel idea to describe the new algorithm in this commit message, I cheaped out and described it in comments in the code instead. [changes since first post: spelling fixes from akpm, review feedback, folded menu-tng into menu.c] Signed-off-by: Arjan van de Ven <arjan@linux.intel.com> Cc: Venkatesh Pallipadi <venkatesh.pallipadi@intel.com> Cc: Len Brown <lenb@kernel.org> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Yanmin Zhang <yanmin_zhang@linux.intel.com> Acked-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-22 08:04:08 +08:00
extern unsigned long this_cpu_load(void);
2010-12-01 02:48:45 +08:00
extern void calc_global_load(unsigned long ticks);
extern unsigned long get_parent_ip(unsigned long addr);
struct seq_file;
struct cfs_rq;
struct task_group;
#ifdef CONFIG_SCHED_DEBUG
extern void proc_sched_show_task(struct task_struct *p, struct seq_file *m);
extern void proc_sched_set_task(struct task_struct *p);
extern void
print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq);
#else
static inline void
proc_sched_show_task(struct task_struct *p, struct seq_file *m)
{
}
static inline void proc_sched_set_task(struct task_struct *p)
{
}
static inline void
print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq)
{
}
#endif
/*
* Task state bitmask. NOTE! These bits are also
* encoded in fs/proc/array.c: get_task_state().
*
* We have two separate sets of flags: task->state
* is about runnability, while task->exit_state are
* about the task exiting. Confusing, but this way
* modifying one set can't modify the other one by
* mistake.
*/
#define TASK_RUNNING 0
#define TASK_INTERRUPTIBLE 1
#define TASK_UNINTERRUPTIBLE 2
#define __TASK_STOPPED 4
#define __TASK_TRACED 8
/* in tsk->exit_state */
#define EXIT_ZOMBIE 16
#define EXIT_DEAD 32
/* in tsk->state again */
#define TASK_DEAD 64
#define TASK_WAKEKILL 128
#define TASK_WAKING 256
#define TASK_STATE_MAX 512
#define TASK_STATE_TO_CHAR_STR "RSDTtZXxKW"
extern char ___assert_task_state[1 - 2*!!(
sizeof(TASK_STATE_TO_CHAR_STR)-1 != ilog2(TASK_STATE_MAX)+1)];
/* Convenience macros for the sake of set_task_state */
#define TASK_KILLABLE (TASK_WAKEKILL | TASK_UNINTERRUPTIBLE)
#define TASK_STOPPED (TASK_WAKEKILL | __TASK_STOPPED)
#define TASK_TRACED (TASK_WAKEKILL | __TASK_TRACED)
/* Convenience macros for the sake of wake_up */
#define TASK_NORMAL (TASK_INTERRUPTIBLE | TASK_UNINTERRUPTIBLE)
#define TASK_ALL (TASK_NORMAL | __TASK_STOPPED | __TASK_TRACED)
/* get_task_state() */
#define TASK_REPORT (TASK_RUNNING | TASK_INTERRUPTIBLE | \
TASK_UNINTERRUPTIBLE | __TASK_STOPPED | \
__TASK_TRACED)
#define task_is_traced(task) ((task->state & __TASK_TRACED) != 0)
#define task_is_stopped(task) ((task->state & __TASK_STOPPED) != 0)
#define task_is_dead(task) ((task)->exit_state != 0)
#define task_is_stopped_or_traced(task) \
((task->state & (__TASK_STOPPED | __TASK_TRACED)) != 0)
#define task_contributes_to_load(task) \
((task->state & TASK_UNINTERRUPTIBLE) != 0 && \
(task->flags & PF_FROZEN) == 0)
#define __set_task_state(tsk, state_value) \
do { (tsk)->state = (state_value); } while (0)
#define set_task_state(tsk, state_value) \
set_mb((tsk)->state, (state_value))
/*
* set_current_state() includes a barrier so that the write of current->state
* is correctly serialised wrt the caller's subsequent test of whether to
* actually sleep:
*
* set_current_state(TASK_UNINTERRUPTIBLE);
* if (do_i_need_to_sleep())
* schedule();
*
* If the caller does not need such serialisation then use __set_current_state()
*/
#define __set_current_state(state_value) \
do { current->state = (state_value); } while (0)
#define set_current_state(state_value) \
set_mb(current->state, (state_value))
/* Task command name length */
#define TASK_COMM_LEN 16
#include <linux/spinlock.h>
/*
* This serializes "schedule()" and also protects
* the run-queue from deletions/modifications (but
* _adding_ to the beginning of the run-queue has
* a separate lock).
*/
extern rwlock_t tasklist_lock;
extern spinlock_t mmlist_lock;
struct task_struct;
#ifdef CONFIG_PROVE_RCU
extern int lockdep_tasklist_lock_is_held(void);
#endif /* #ifdef CONFIG_PROVE_RCU */
extern void sched_init(void);
extern void sched_init_smp(void);
extern asmlinkage void schedule_tail(struct task_struct *prev);
extern void init_idle(struct task_struct *idle, int cpu);
extern void init_idle_bootup_task(struct task_struct *idle);
extern int runqueue_is_locked(int cpu);
#if defined(CONFIG_SMP) && defined(CONFIG_NO_HZ)
sched: Change nohz idle load balancing logic to push model In the new push model, all idle CPUs indeed go into nohz mode. There is still the concept of idle load balancer (performing the load balancing on behalf of all the idle cpu's in the system). Busy CPU kicks the nohz balancer when any of the nohz CPUs need idle load balancing. The kickee CPU does the idle load balancing on behalf of all idle CPUs instead of the normal idle balance. This addresses the below two problems with the current nohz ilb logic: * the idle load balancer continued to have periodic ticks during idle and wokeup frequently, even though it did not have any rebalancing to do on behalf of any of the idle CPUs. * On x86 and CPUs that have APIC timer stoppage on idle CPUs, this periodic wakeup can result in a periodic additional interrupt on a CPU doing the timer broadcast. Also currently we are migrating the unpinned timers from an idle to the cpu doing idle load balancing (when all the cpus in the system are idle, there is no idle load balancing cpu and timers get added to the same idle cpu where the request was made. So the existing optimization works only on semi idle system). And In semi idle system, we no longer have periodic ticks on the idle load balancer CPU. Using that cpu will add more delays to the timers than intended (as that cpu's timer base may not be uptodate wrt jiffies etc). This was causing mysterious slowdowns during boot etc. For now, in the semi idle case, use the nearest busy cpu for migrating timers from an idle cpu. This is good for power-savings anyway. Signed-off-by: Venkatesh Pallipadi <venki@google.com> Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Thomas Gleixner <tglx@linutronix.de> LKML-Reference: <1274486981.2840.46.camel@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-05-22 08:09:41 +08:00
extern void select_nohz_load_balancer(int stop_tick);
extern void set_cpu_sd_state_idle(void);
sched: Change nohz idle load balancing logic to push model In the new push model, all idle CPUs indeed go into nohz mode. There is still the concept of idle load balancer (performing the load balancing on behalf of all the idle cpu's in the system). Busy CPU kicks the nohz balancer when any of the nohz CPUs need idle load balancing. The kickee CPU does the idle load balancing on behalf of all idle CPUs instead of the normal idle balance. This addresses the below two problems with the current nohz ilb logic: * the idle load balancer continued to have periodic ticks during idle and wokeup frequently, even though it did not have any rebalancing to do on behalf of any of the idle CPUs. * On x86 and CPUs that have APIC timer stoppage on idle CPUs, this periodic wakeup can result in a periodic additional interrupt on a CPU doing the timer broadcast. Also currently we are migrating the unpinned timers from an idle to the cpu doing idle load balancing (when all the cpus in the system are idle, there is no idle load balancing cpu and timers get added to the same idle cpu where the request was made. So the existing optimization works only on semi idle system). And In semi idle system, we no longer have periodic ticks on the idle load balancer CPU. Using that cpu will add more delays to the timers than intended (as that cpu's timer base may not be uptodate wrt jiffies etc). This was causing mysterious slowdowns during boot etc. For now, in the semi idle case, use the nearest busy cpu for migrating timers from an idle cpu. This is good for power-savings anyway. Signed-off-by: Venkatesh Pallipadi <venki@google.com> Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Thomas Gleixner <tglx@linutronix.de> LKML-Reference: <1274486981.2840.46.camel@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-05-22 08:09:41 +08:00
extern int get_nohz_timer_target(void);
#else
sched: Change nohz idle load balancing logic to push model In the new push model, all idle CPUs indeed go into nohz mode. There is still the concept of idle load balancer (performing the load balancing on behalf of all the idle cpu's in the system). Busy CPU kicks the nohz balancer when any of the nohz CPUs need idle load balancing. The kickee CPU does the idle load balancing on behalf of all idle CPUs instead of the normal idle balance. This addresses the below two problems with the current nohz ilb logic: * the idle load balancer continued to have periodic ticks during idle and wokeup frequently, even though it did not have any rebalancing to do on behalf of any of the idle CPUs. * On x86 and CPUs that have APIC timer stoppage on idle CPUs, this periodic wakeup can result in a periodic additional interrupt on a CPU doing the timer broadcast. Also currently we are migrating the unpinned timers from an idle to the cpu doing idle load balancing (when all the cpus in the system are idle, there is no idle load balancing cpu and timers get added to the same idle cpu where the request was made. So the existing optimization works only on semi idle system). And In semi idle system, we no longer have periodic ticks on the idle load balancer CPU. Using that cpu will add more delays to the timers than intended (as that cpu's timer base may not be uptodate wrt jiffies etc). This was causing mysterious slowdowns during boot etc. For now, in the semi idle case, use the nearest busy cpu for migrating timers from an idle cpu. This is good for power-savings anyway. Signed-off-by: Venkatesh Pallipadi <venki@google.com> Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Thomas Gleixner <tglx@linutronix.de> LKML-Reference: <1274486981.2840.46.camel@sbs-t61.sc.intel.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-05-22 08:09:41 +08:00
static inline void select_nohz_load_balancer(int stop_tick) { }
static inline void set_cpu_sd_state_idle(void) { }
#endif
/*
* Only dump TASK_* tasks. (0 for all tasks)
*/
extern void show_state_filter(unsigned long state_filter);
static inline void show_state(void)
{
show_state_filter(0);
}
extern void show_regs(struct pt_regs *);
/*
* TASK is a pointer to the task whose backtrace we want to see (or NULL for current
* task), SP is the stack pointer of the first frame that should be shown in the back
* trace (or NULL if the entire call-chain of the task should be shown).
*/
extern void show_stack(struct task_struct *task, unsigned long *sp);
void io_schedule(void);
long io_schedule_timeout(long timeout);
extern void cpu_init (void);
extern void trap_init(void);
extern void update_process_times(int user);
extern void scheduler_tick(void);
softlockup: automatically detect hung TASK_UNINTERRUPTIBLE tasks this patch extends the soft-lockup detector to automatically detect hung TASK_UNINTERRUPTIBLE tasks. Such hung tasks are printed the following way: ------------------> INFO: task prctl:3042 blocked for more than 120 seconds. "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message prctl D fd5e3793 0 3042 2997 f6050f38 00000046 00000001 fd5e3793 00000009 c06d8264 c06dae80 00000286 f6050f40 f6050f00 f7d34d90 f7d34fc8 c1e1be80 00000001 f6050000 00000000 f7e92d00 00000286 f6050f18 c0489d1a f6050f40 00006605 00000000 c0133a5b Call Trace: [<c04883a5>] schedule_timeout+0x6d/0x8b [<c04883d8>] schedule_timeout_uninterruptible+0x15/0x17 [<c0133a76>] msleep+0x10/0x16 [<c0138974>] sys_prctl+0x30/0x1e2 [<c0104c52>] sysenter_past_esp+0x5f/0xa5 ======================= 2 locks held by prctl/3042: #0: (&sb->s_type->i_mutex_key#5){--..}, at: [<c0197d11>] do_fsync+0x38/0x7a #1: (jbd_handle){--..}, at: [<c01ca3d2>] journal_start+0xc7/0xe9 <------------------ the current default timeout is 120 seconds. Such messages are printed up to 10 times per bootup. If the system has crashed already then the messages are not printed. if lockdep is enabled then all held locks are printed as well. this feature is a natural extension to the softlockup-detector (kernel locked up without scheduling) and to the NMI watchdog (kernel locked up with IRQs disabled). [ Gautham R Shenoy <ego@in.ibm.com>: CPU hotplug fixes. ] [ Andrew Morton <akpm@linux-foundation.org>: build warning fix. ] Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
2008-01-26 04:08:02 +08:00
extern void sched_show_task(struct task_struct *p);
#ifdef CONFIG_LOCKUP_DETECTOR
extern void touch_softlockup_watchdog(void);
softlockup: Add sched_clock_tick() to avoid kernel warning on kgdb resume When CONFIG_HAVE_UNSTABLE_SCHED_CLOCK is set, sched_clock() gets the time from hardware such as the TSC on x86. In this configuration kgdb will report a softlock warning message on resuming or detaching from a debug session. Sequence of events in the problem case: 1) "cpu sched clock" and "hardware time" are at 100 sec prior to a call to kgdb_handle_exception() 2) Debugger waits in kgdb_handle_exception() for 80 sec and on exit the following is called ... touch_softlockup_watchdog() --> __raw_get_cpu_var(touch_timestamp) = 0; 3) "cpu sched clock" = 100s (it was not updated, because the interrupt was disabled in kgdb) but the "hardware time" = 180 sec 4) The first timer interrupt after resuming from kgdb_handle_exception updates the watchdog from the "cpu sched clock" update_process_times() { ... run_local_timers() --> softlockup_tick() --> check (touch_timestamp == 0) (it is "YES" here, we have set "touch_timestamp = 0" at kgdb) --> __touch_softlockup_watchdog() ***(A)--> reset "touch_timestamp" to "get_timestamp()" (Here, the "touch_timestamp" will still be set to 100s.) ... scheduler_tick() ***(B)--> sched_clock_tick() (update "cpu sched clock" to "hardware time" = 180s) ... } 5) The Second timer interrupt handler appears to have a large jump and trips the softlockup warning. update_process_times() { ... run_local_timers() --> softlockup_tick() --> "cpu sched clock" - "touch_timestamp" = 180s-100s > 60s --> printk "soft lockup error messages" ... } note: ***(A) reset "touch_timestamp" to "get_timestamp(this_cpu)" Why is "touch_timestamp" 100 sec, instead of 180 sec? When CONFIG_HAVE_UNSTABLE_SCHED_CLOCK is set, the call trace of get_timestamp() is: get_timestamp(this_cpu) -->cpu_clock(this_cpu) -->sched_clock_cpu(this_cpu) -->__update_sched_clock(sched_clock_data, now) The __update_sched_clock() function uses the GTOD tick value to create a window to normalize the "now" values. So if "now" value is too big for sched_clock_data, it will be ignored. The fix is to invoke sched_clock_tick() to update "cpu sched clock" in order to recover from this state. This is done by introducing the function touch_softlockup_watchdog_sync(). This allows kgdb to request that the sched clock is updated when the watchdog thread runs the first time after a resume from kgdb. [yong.zhang0@gmail.com: Use per cpu instead of an array] Signed-off-by: Jason Wessel <jason.wessel@windriver.com> Signed-off-by: Dongdong Deng <Dongdong.Deng@windriver.com> Cc: kgdb-bugreport@lists.sourceforge.net Cc: peterz@infradead.org LKML-Reference: <1264631124-4837-2-git-send-email-jason.wessel@windriver.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-01-28 06:25:22 +08:00
extern void touch_softlockup_watchdog_sync(void);
extern void touch_all_softlockup_watchdogs(void);
extern int proc_dowatchdog_thresh(struct ctl_table *table, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos);
extern unsigned int softlockup_panic;
void lockup_detector_init(void);
#else
static inline void touch_softlockup_watchdog(void)
{
}
softlockup: Add sched_clock_tick() to avoid kernel warning on kgdb resume When CONFIG_HAVE_UNSTABLE_SCHED_CLOCK is set, sched_clock() gets the time from hardware such as the TSC on x86. In this configuration kgdb will report a softlock warning message on resuming or detaching from a debug session. Sequence of events in the problem case: 1) "cpu sched clock" and "hardware time" are at 100 sec prior to a call to kgdb_handle_exception() 2) Debugger waits in kgdb_handle_exception() for 80 sec and on exit the following is called ... touch_softlockup_watchdog() --> __raw_get_cpu_var(touch_timestamp) = 0; 3) "cpu sched clock" = 100s (it was not updated, because the interrupt was disabled in kgdb) but the "hardware time" = 180 sec 4) The first timer interrupt after resuming from kgdb_handle_exception updates the watchdog from the "cpu sched clock" update_process_times() { ... run_local_timers() --> softlockup_tick() --> check (touch_timestamp == 0) (it is "YES" here, we have set "touch_timestamp = 0" at kgdb) --> __touch_softlockup_watchdog() ***(A)--> reset "touch_timestamp" to "get_timestamp()" (Here, the "touch_timestamp" will still be set to 100s.) ... scheduler_tick() ***(B)--> sched_clock_tick() (update "cpu sched clock" to "hardware time" = 180s) ... } 5) The Second timer interrupt handler appears to have a large jump and trips the softlockup warning. update_process_times() { ... run_local_timers() --> softlockup_tick() --> "cpu sched clock" - "touch_timestamp" = 180s-100s > 60s --> printk "soft lockup error messages" ... } note: ***(A) reset "touch_timestamp" to "get_timestamp(this_cpu)" Why is "touch_timestamp" 100 sec, instead of 180 sec? When CONFIG_HAVE_UNSTABLE_SCHED_CLOCK is set, the call trace of get_timestamp() is: get_timestamp(this_cpu) -->cpu_clock(this_cpu) -->sched_clock_cpu(this_cpu) -->__update_sched_clock(sched_clock_data, now) The __update_sched_clock() function uses the GTOD tick value to create a window to normalize the "now" values. So if "now" value is too big for sched_clock_data, it will be ignored. The fix is to invoke sched_clock_tick() to update "cpu sched clock" in order to recover from this state. This is done by introducing the function touch_softlockup_watchdog_sync(). This allows kgdb to request that the sched clock is updated when the watchdog thread runs the first time after a resume from kgdb. [yong.zhang0@gmail.com: Use per cpu instead of an array] Signed-off-by: Jason Wessel <jason.wessel@windriver.com> Signed-off-by: Dongdong Deng <Dongdong.Deng@windriver.com> Cc: kgdb-bugreport@lists.sourceforge.net Cc: peterz@infradead.org LKML-Reference: <1264631124-4837-2-git-send-email-jason.wessel@windriver.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-01-28 06:25:22 +08:00
static inline void touch_softlockup_watchdog_sync(void)
{
}
static inline void touch_all_softlockup_watchdogs(void)
{
}
static inline void lockup_detector_init(void)
{
}
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
extern unsigned int sysctl_hung_task_panic;
extern unsigned long sysctl_hung_task_check_count;
extern unsigned long sysctl_hung_task_timeout_secs;
extern unsigned long sysctl_hung_task_warnings;
extern int proc_dohung_task_timeout_secs(struct ctl_table *table, int write,
void __user *buffer,
size_t *lenp, loff_t *ppos);
#else
/* Avoid need for ifdefs elsewhere in the code */
enum { sysctl_hung_task_timeout_secs = 0 };
#endif
/* Attach to any functions which should be ignored in wchan output. */
#define __sched __attribute__((__section__(".sched.text")))
/* Linker adds these: start and end of __sched functions */
extern char __sched_text_start[], __sched_text_end[];
/* Is this address in the __sched functions? */
extern int in_sched_functions(unsigned long addr);
#define MAX_SCHEDULE_TIMEOUT LONG_MAX
extern signed long schedule_timeout(signed long timeout);
extern signed long schedule_timeout_interruptible(signed long timeout);
extern signed long schedule_timeout_killable(signed long timeout);
extern signed long schedule_timeout_uninterruptible(signed long timeout);
asmlinkage void schedule(void);
extern void schedule_preempt_disabled(void);
extern int mutex_spin_on_owner(struct mutex *lock, struct task_struct *owner);
struct nsproxy;
struct user_namespace;
/*
* Default maximum number of active map areas, this limits the number of vmas
* per mm struct. Users can overwrite this number by sysctl but there is a
* problem.
*
* When a program's coredump is generated as ELF format, a section is created
* per a vma. In ELF, the number of sections is represented in unsigned short.
* This means the number of sections should be smaller than 65535 at coredump.
* Because the kernel adds some informative sections to a image of program at
* generating coredump, we need some margin. The number of extra sections is
* 1-3 now and depends on arch. We use "5" as safe margin, here.
*/
#define MAPCOUNT_ELF_CORE_MARGIN (5)
#define DEFAULT_MAX_MAP_COUNT (USHRT_MAX - MAPCOUNT_ELF_CORE_MARGIN)
extern int sysctl_max_map_count;
#include <linux/aio.h>
#ifdef CONFIG_MMU
extern void arch_pick_mmap_layout(struct mm_struct *mm);
extern unsigned long
arch_get_unmapped_area(struct file *, unsigned long, unsigned long,
unsigned long, unsigned long);
extern unsigned long
arch_get_unmapped_area_topdown(struct file *filp, unsigned long addr,
unsigned long len, unsigned long pgoff,
unsigned long flags);
[PATCH] Avoiding mmap fragmentation Ingo recently introduced a great speedup for allocating new mmaps using the free_area_cache pointer which boosts the specweb SSL benchmark by 4-5% and causes huge performance increases in thread creation. The downside of this patch is that it does lead to fragmentation in the mmap-ed areas (visible via /proc/self/maps), such that some applications that work fine under 2.4 kernels quickly run out of memory on any 2.6 kernel. The problem is twofold: 1) the free_area_cache is used to continue a search for memory where the last search ended. Before the change new areas were always searched from the base address on. So now new small areas are cluttering holes of all sizes throughout the whole mmap-able region whereas before small holes tended to close holes near the base leaving holes far from the base large and available for larger requests. 2) the free_area_cache also is set to the location of the last munmap-ed area so in scenarios where we allocate e.g. five regions of 1K each, then free regions 4 2 3 in this order the next request for 1K will be placed in the position of the old region 3, whereas before we appended it to the still active region 1, placing it at the location of the old region 2. Before we had 1 free region of 2K, now we only get two free regions of 1K -> fragmentation. The patch addresses thes issues by introducing yet another cache descriptor cached_hole_size that contains the largest known hole size below the current free_area_cache. If a new request comes in the size is compared against the cached_hole_size and if the request can be filled with a hole below free_area_cache the search is started from the base instead. The results look promising: Whereas 2.6.12-rc4 fragments quickly and my (earlier posted) leakme.c test program terminates after 50000+ iterations with 96 distinct and fragmented maps in /proc/self/maps it performs nicely (as expected) with thread creation, Ingo's test_str02 with 20000 threads requires 0.7s system time. Taking out Ingo's patch (un-patch available per request) by basically deleting all mentions of free_area_cache from the kernel and starting the search for new memory always at the respective bases we observe: leakme terminates successfully with 11 distinctive hardly fragmented areas in /proc/self/maps but thread creating is gringdingly slow: 30+s(!) system time for Ingo's test_str02 with 20000 threads. Now - drumroll ;-) the appended patch works fine with leakme: it ends with only 7 distinct areas in /proc/self/maps and also thread creation seems sufficiently fast with 0.71s for 20000 threads. Signed-off-by: Wolfgang Wander <wwc@rentec.com> Credit-to: "Richard Purdie" <rpurdie@rpsys.net> Signed-off-by: Ken Chen <kenneth.w.chen@intel.com> Acked-by: Ingo Molnar <mingo@elte.hu> (partly) Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-06-22 08:14:49 +08:00
extern void arch_unmap_area(struct mm_struct *, unsigned long);
extern void arch_unmap_area_topdown(struct mm_struct *, unsigned long);
#else
static inline void arch_pick_mmap_layout(struct mm_struct *mm) {}
#endif
extern void set_dumpable(struct mm_struct *mm, int value);
extern int get_dumpable(struct mm_struct *mm);
/* mm flags */
/* dumpable bits */
#define MMF_DUMPABLE 0 /* core dump is permitted */
#define MMF_DUMP_SECURELY 1 /* core file is readable only by root */
ksm: the mm interface to ksm This patch presents the mm interface to a dummy version of ksm.c, for better scrutiny of that interface: the real ksm.c follows later. When CONFIG_KSM is not set, madvise(2) reject MADV_MERGEABLE and MADV_UNMERGEABLE with EINVAL, since that seems more helpful than pretending that they can be serviced. But when CONFIG_KSM=y, accept them even if KSM is not currently running, and even on areas which KSM will not touch (e.g. hugetlb or shared file or special driver mappings). Like other madvices, report ENOMEM despite success if any area in the range is unmapped, and use EAGAIN to report out of memory. Define vma flag VM_MERGEABLE to identify an area on which KSM may try merging pages: leave it to ksm_madvise() to decide whether to set it. Define mm flag MMF_VM_MERGEABLE to identify an mm which might contain VM_MERGEABLE areas, to minimize callouts when forking or exiting. Based upon earlier patches by Chris Wright and Izik Eidus. Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk> Signed-off-by: Chris Wright <chrisw@redhat.com> Signed-off-by: Izik Eidus <ieidus@redhat.com> Cc: Michael Kerrisk <mtk.manpages@gmail.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Rik van Riel <riel@redhat.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Avi Kivity <avi@redhat.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-22 08:01:57 +08:00
#define MMF_DUMPABLE_BITS 2
ksm: the mm interface to ksm This patch presents the mm interface to a dummy version of ksm.c, for better scrutiny of that interface: the real ksm.c follows later. When CONFIG_KSM is not set, madvise(2) reject MADV_MERGEABLE and MADV_UNMERGEABLE with EINVAL, since that seems more helpful than pretending that they can be serviced. But when CONFIG_KSM=y, accept them even if KSM is not currently running, and even on areas which KSM will not touch (e.g. hugetlb or shared file or special driver mappings). Like other madvices, report ENOMEM despite success if any area in the range is unmapped, and use EAGAIN to report out of memory. Define vma flag VM_MERGEABLE to identify an area on which KSM may try merging pages: leave it to ksm_madvise() to decide whether to set it. Define mm flag MMF_VM_MERGEABLE to identify an mm which might contain VM_MERGEABLE areas, to minimize callouts when forking or exiting. Based upon earlier patches by Chris Wright and Izik Eidus. Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk> Signed-off-by: Chris Wright <chrisw@redhat.com> Signed-off-by: Izik Eidus <ieidus@redhat.com> Cc: Michael Kerrisk <mtk.manpages@gmail.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Rik van Riel <riel@redhat.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Avi Kivity <avi@redhat.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-22 08:01:57 +08:00
#define MMF_DUMPABLE_MASK ((1 << MMF_DUMPABLE_BITS) - 1)
/* coredump filter bits */
#define MMF_DUMP_ANON_PRIVATE 2
#define MMF_DUMP_ANON_SHARED 3
#define MMF_DUMP_MAPPED_PRIVATE 4
#define MMF_DUMP_MAPPED_SHARED 5
#define MMF_DUMP_ELF_HEADERS 6
coredump_filter: add hugepage dumping Presently hugepage's vma has a VM_RESERVED flag in order not to be swapped. But a VM_RESERVED vma isn't core dumped because this flag is often used for some kernel vmas (e.g. vmalloc, sound related). Thus hugepages are never dumped and it can't be debugged easily. Many developers want hugepages to be included into core-dump. However, We can't read generic VM_RESERVED area because this area is often IO mapping area. then these area reading may change device state. it is definitly undesiable side-effect. So adding a hugepage specific bit to the coredump filter is better. It will be able to hugepage core dumping and doesn't cause any side-effect to any i/o devices. In additional, libhugetlb use hugetlb private mapping pages as anonymous page. Then, hugepage private mapping pages should be core dumped by default. Then, /proc/[pid]/core_dump_filter has two new bits. - bit 5 mean hugetlb private mapping pages are dumped or not. (default: yes) - bit 6 mean hugetlb shared mapping pages are dumped or not. (default: no) I tested by following method. % ulimit -c unlimited % ./crash_hugepage 50 % ./crash_hugepage 50 -p % ls -lh % gdb ./crash_hugepage core % % echo 0x43 > /proc/self/coredump_filter % ./crash_hugepage 50 % ./crash_hugepage 50 -p % ls -lh % gdb ./crash_hugepage core #include <stdlib.h> #include <stdio.h> #include <unistd.h> #include <sys/mman.h> #include <string.h> #include "hugetlbfs.h" int main(int argc, char** argv){ char* p; int ch; int mmap_flags = MAP_SHARED; int fd; int nr_pages; while((ch = getopt(argc, argv, "p")) != -1) { switch (ch) { case 'p': mmap_flags &= ~MAP_SHARED; mmap_flags |= MAP_PRIVATE; break; default: /* nothing*/ break; } } argc -= optind; argv += optind; if (argc == 0){ printf("need # of pages\n"); exit(1); } nr_pages = atoi(argv[0]); if (nr_pages < 2) { printf("nr_pages must >2\n"); exit(1); } fd = hugetlbfs_unlinked_fd(); p = mmap(NULL, nr_pages * gethugepagesize(), PROT_READ|PROT_WRITE, mmap_flags, fd, 0); sleep(2); *(p + gethugepagesize()) = 1; /* COW */ sleep(2); /* crash! */ *(int*)0 = 1; return 0; } Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Reviewed-by: Kawai Hidehiro <hidehiro.kawai.ez@hitachi.com> Cc: Hugh Dickins <hugh@veritas.com> Cc: William Irwin <wli@holomorphy.com> Cc: Adam Litke <agl@us.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 11:27:08 +08:00
#define MMF_DUMP_HUGETLB_PRIVATE 7
#define MMF_DUMP_HUGETLB_SHARED 8
ksm: the mm interface to ksm This patch presents the mm interface to a dummy version of ksm.c, for better scrutiny of that interface: the real ksm.c follows later. When CONFIG_KSM is not set, madvise(2) reject MADV_MERGEABLE and MADV_UNMERGEABLE with EINVAL, since that seems more helpful than pretending that they can be serviced. But when CONFIG_KSM=y, accept them even if KSM is not currently running, and even on areas which KSM will not touch (e.g. hugetlb or shared file or special driver mappings). Like other madvices, report ENOMEM despite success if any area in the range is unmapped, and use EAGAIN to report out of memory. Define vma flag VM_MERGEABLE to identify an area on which KSM may try merging pages: leave it to ksm_madvise() to decide whether to set it. Define mm flag MMF_VM_MERGEABLE to identify an mm which might contain VM_MERGEABLE areas, to minimize callouts when forking or exiting. Based upon earlier patches by Chris Wright and Izik Eidus. Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk> Signed-off-by: Chris Wright <chrisw@redhat.com> Signed-off-by: Izik Eidus <ieidus@redhat.com> Cc: Michael Kerrisk <mtk.manpages@gmail.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Rik van Riel <riel@redhat.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Avi Kivity <avi@redhat.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-22 08:01:57 +08:00
#define MMF_DUMP_FILTER_SHIFT MMF_DUMPABLE_BITS
coredump_filter: add hugepage dumping Presently hugepage's vma has a VM_RESERVED flag in order not to be swapped. But a VM_RESERVED vma isn't core dumped because this flag is often used for some kernel vmas (e.g. vmalloc, sound related). Thus hugepages are never dumped and it can't be debugged easily. Many developers want hugepages to be included into core-dump. However, We can't read generic VM_RESERVED area because this area is often IO mapping area. then these area reading may change device state. it is definitly undesiable side-effect. So adding a hugepage specific bit to the coredump filter is better. It will be able to hugepage core dumping and doesn't cause any side-effect to any i/o devices. In additional, libhugetlb use hugetlb private mapping pages as anonymous page. Then, hugepage private mapping pages should be core dumped by default. Then, /proc/[pid]/core_dump_filter has two new bits. - bit 5 mean hugetlb private mapping pages are dumped or not. (default: yes) - bit 6 mean hugetlb shared mapping pages are dumped or not. (default: no) I tested by following method. % ulimit -c unlimited % ./crash_hugepage 50 % ./crash_hugepage 50 -p % ls -lh % gdb ./crash_hugepage core % % echo 0x43 > /proc/self/coredump_filter % ./crash_hugepage 50 % ./crash_hugepage 50 -p % ls -lh % gdb ./crash_hugepage core #include <stdlib.h> #include <stdio.h> #include <unistd.h> #include <sys/mman.h> #include <string.h> #include "hugetlbfs.h" int main(int argc, char** argv){ char* p; int ch; int mmap_flags = MAP_SHARED; int fd; int nr_pages; while((ch = getopt(argc, argv, "p")) != -1) { switch (ch) { case 'p': mmap_flags &= ~MAP_SHARED; mmap_flags |= MAP_PRIVATE; break; default: /* nothing*/ break; } } argc -= optind; argv += optind; if (argc == 0){ printf("need # of pages\n"); exit(1); } nr_pages = atoi(argv[0]); if (nr_pages < 2) { printf("nr_pages must >2\n"); exit(1); } fd = hugetlbfs_unlinked_fd(); p = mmap(NULL, nr_pages * gethugepagesize(), PROT_READ|PROT_WRITE, mmap_flags, fd, 0); sleep(2); *(p + gethugepagesize()) = 1; /* COW */ sleep(2); /* crash! */ *(int*)0 = 1; return 0; } Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Reviewed-by: Kawai Hidehiro <hidehiro.kawai.ez@hitachi.com> Cc: Hugh Dickins <hugh@veritas.com> Cc: William Irwin <wli@holomorphy.com> Cc: Adam Litke <agl@us.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 11:27:08 +08:00
#define MMF_DUMP_FILTER_BITS 7
#define MMF_DUMP_FILTER_MASK \
(((1 << MMF_DUMP_FILTER_BITS) - 1) << MMF_DUMP_FILTER_SHIFT)
#define MMF_DUMP_FILTER_DEFAULT \
coredump_filter: add hugepage dumping Presently hugepage's vma has a VM_RESERVED flag in order not to be swapped. But a VM_RESERVED vma isn't core dumped because this flag is often used for some kernel vmas (e.g. vmalloc, sound related). Thus hugepages are never dumped and it can't be debugged easily. Many developers want hugepages to be included into core-dump. However, We can't read generic VM_RESERVED area because this area is often IO mapping area. then these area reading may change device state. it is definitly undesiable side-effect. So adding a hugepage specific bit to the coredump filter is better. It will be able to hugepage core dumping and doesn't cause any side-effect to any i/o devices. In additional, libhugetlb use hugetlb private mapping pages as anonymous page. Then, hugepage private mapping pages should be core dumped by default. Then, /proc/[pid]/core_dump_filter has two new bits. - bit 5 mean hugetlb private mapping pages are dumped or not. (default: yes) - bit 6 mean hugetlb shared mapping pages are dumped or not. (default: no) I tested by following method. % ulimit -c unlimited % ./crash_hugepage 50 % ./crash_hugepage 50 -p % ls -lh % gdb ./crash_hugepage core % % echo 0x43 > /proc/self/coredump_filter % ./crash_hugepage 50 % ./crash_hugepage 50 -p % ls -lh % gdb ./crash_hugepage core #include <stdlib.h> #include <stdio.h> #include <unistd.h> #include <sys/mman.h> #include <string.h> #include "hugetlbfs.h" int main(int argc, char** argv){ char* p; int ch; int mmap_flags = MAP_SHARED; int fd; int nr_pages; while((ch = getopt(argc, argv, "p")) != -1) { switch (ch) { case 'p': mmap_flags &= ~MAP_SHARED; mmap_flags |= MAP_PRIVATE; break; default: /* nothing*/ break; } } argc -= optind; argv += optind; if (argc == 0){ printf("need # of pages\n"); exit(1); } nr_pages = atoi(argv[0]); if (nr_pages < 2) { printf("nr_pages must >2\n"); exit(1); } fd = hugetlbfs_unlinked_fd(); p = mmap(NULL, nr_pages * gethugepagesize(), PROT_READ|PROT_WRITE, mmap_flags, fd, 0); sleep(2); *(p + gethugepagesize()) = 1; /* COW */ sleep(2); /* crash! */ *(int*)0 = 1; return 0; } Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Reviewed-by: Kawai Hidehiro <hidehiro.kawai.ez@hitachi.com> Cc: Hugh Dickins <hugh@veritas.com> Cc: William Irwin <wli@holomorphy.com> Cc: Adam Litke <agl@us.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-10-19 11:27:08 +08:00
((1 << MMF_DUMP_ANON_PRIVATE) | (1 << MMF_DUMP_ANON_SHARED) |\
(1 << MMF_DUMP_HUGETLB_PRIVATE) | MMF_DUMP_MASK_DEFAULT_ELF)
#ifdef CONFIG_CORE_DUMP_DEFAULT_ELF_HEADERS
# define MMF_DUMP_MASK_DEFAULT_ELF (1 << MMF_DUMP_ELF_HEADERS)
#else
# define MMF_DUMP_MASK_DEFAULT_ELF 0
#endif
ksm: the mm interface to ksm This patch presents the mm interface to a dummy version of ksm.c, for better scrutiny of that interface: the real ksm.c follows later. When CONFIG_KSM is not set, madvise(2) reject MADV_MERGEABLE and MADV_UNMERGEABLE with EINVAL, since that seems more helpful than pretending that they can be serviced. But when CONFIG_KSM=y, accept them even if KSM is not currently running, and even on areas which KSM will not touch (e.g. hugetlb or shared file or special driver mappings). Like other madvices, report ENOMEM despite success if any area in the range is unmapped, and use EAGAIN to report out of memory. Define vma flag VM_MERGEABLE to identify an area on which KSM may try merging pages: leave it to ksm_madvise() to decide whether to set it. Define mm flag MMF_VM_MERGEABLE to identify an mm which might contain VM_MERGEABLE areas, to minimize callouts when forking or exiting. Based upon earlier patches by Chris Wright and Izik Eidus. Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk> Signed-off-by: Chris Wright <chrisw@redhat.com> Signed-off-by: Izik Eidus <ieidus@redhat.com> Cc: Michael Kerrisk <mtk.manpages@gmail.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Rik van Riel <riel@redhat.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Avi Kivity <avi@redhat.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-22 08:01:57 +08:00
/* leave room for more dump flags */
#define MMF_VM_MERGEABLE 16 /* KSM may merge identical pages */
2011-01-14 07:46:58 +08:00
#define MMF_VM_HUGEPAGE 17 /* set when VM_HUGEPAGE is set on vma */
ksm: the mm interface to ksm This patch presents the mm interface to a dummy version of ksm.c, for better scrutiny of that interface: the real ksm.c follows later. When CONFIG_KSM is not set, madvise(2) reject MADV_MERGEABLE and MADV_UNMERGEABLE with EINVAL, since that seems more helpful than pretending that they can be serviced. But when CONFIG_KSM=y, accept them even if KSM is not currently running, and even on areas which KSM will not touch (e.g. hugetlb or shared file or special driver mappings). Like other madvices, report ENOMEM despite success if any area in the range is unmapped, and use EAGAIN to report out of memory. Define vma flag VM_MERGEABLE to identify an area on which KSM may try merging pages: leave it to ksm_madvise() to decide whether to set it. Define mm flag MMF_VM_MERGEABLE to identify an mm which might contain VM_MERGEABLE areas, to minimize callouts when forking or exiting. Based upon earlier patches by Chris Wright and Izik Eidus. Signed-off-by: Hugh Dickins <hugh.dickins@tiscali.co.uk> Signed-off-by: Chris Wright <chrisw@redhat.com> Signed-off-by: Izik Eidus <ieidus@redhat.com> Cc: Michael Kerrisk <mtk.manpages@gmail.com> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Rik van Riel <riel@redhat.com> Cc: Wu Fengguang <fengguang.wu@intel.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Lee Schermerhorn <lee.schermerhorn@hp.com> Cc: Avi Kivity <avi@redhat.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-22 08:01:57 +08:00
#define MMF_INIT_MASK (MMF_DUMPABLE_MASK | MMF_DUMP_FILTER_MASK)
struct sighand_struct {
atomic_t count;
struct k_sigaction action[_NSIG];
spinlock_t siglock;
wait_queue_head_t signalfd_wqh;
};
struct pacct_struct {
int ac_flag;
long ac_exitcode;
unsigned long ac_mem;
cputime_t ac_utime, ac_stime;
unsigned long ac_minflt, ac_majflt;
};
struct cpu_itimer {
cputime_t expires;
cputime_t incr;
u32 error;
u32 incr_error;
};
timers: fix itimer/many thread hang Overview This patch reworks the handling of POSIX CPU timers, including the ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together with the help of Roland McGrath, the owner and original writer of this code. The problem we ran into, and the reason for this rework, has to do with using a profiling timer in a process with a large number of threads. It appears that the performance of the old implementation of run_posix_cpu_timers() was at least O(n*3) (where "n" is the number of threads in a process) or worse. Everything is fine with an increasing number of threads until the time taken for that routine to run becomes the same as or greater than the tick time, at which point things degrade rather quickly. This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF." Code Changes This rework corrects the implementation of run_posix_cpu_timers() to make it run in constant time for a particular machine. (Performance may vary between one machine and another depending upon whether the kernel is built as single- or multiprocessor and, in the latter case, depending upon the number of running processors.) To do this, at each tick we now update fields in signal_struct as well as task_struct. The run_posix_cpu_timers() function uses those fields to make its decisions. We define a new structure, "task_cputime," to contain user, system and scheduler times and use these in appropriate places: struct task_cputime { cputime_t utime; cputime_t stime; unsigned long long sum_exec_runtime; }; This is included in the structure "thread_group_cputime," which is a new substructure of signal_struct and which varies for uniprocessor versus multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as a simple substructure, while for multiprocessor kernels it is a pointer: struct thread_group_cputime { struct task_cputime totals; }; struct thread_group_cputime { struct task_cputime *totals; }; We also add a new task_cputime substructure directly to signal_struct, to cache the earliest expiration of process-wide timers, and task_cputime also replaces the it_*_expires fields of task_struct (used for earliest expiration of thread timers). The "thread_group_cputime" structure contains process-wide timers that are updated via account_user_time() and friends. In the non-SMP case the structure is a simple aggregator; unfortunately in the SMP case that simplicity was not achievable due to cache-line contention between CPUs (in one measured case performance was actually _worse_ on a 16-cpu system than the same test on a 4-cpu system, due to this contention). For SMP, the thread_group_cputime counters are maintained as a per-cpu structure allocated using alloc_percpu(). The timer functions update only the timer field in the structure corresponding to the running CPU, obtained using per_cpu_ptr(). We define a set of inline functions in sched.h that we use to maintain the thread_group_cputime structure and hide the differences between UP and SMP implementations from the rest of the kernel. The thread_group_cputime_init() function initializes the thread_group_cputime structure for the given task. The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the out-of-line function thread_group_cputime_alloc_smp() to allocate and fill in the per-cpu structures and fields. The thread_group_cputime_free() function, also a no-op for UP, in SMP frees the per-cpu structures. The thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls thread_group_cputime_alloc() if the per-cpu structures haven't yet been allocated. The thread_group_cputime() function fills the task_cputime structure it is passed with the contents of the thread_group_cputime fields; in UP it's that simple but in SMP it must also safely check that tsk->signal is non-NULL (if it is it just uses the appropriate fields of task_struct) and, if so, sums the per-cpu values for each online CPU. Finally, the three functions account_group_user_time(), account_group_system_time() and account_group_exec_runtime() are used by timer functions to update the respective fields of the thread_group_cputime structure. Non-SMP operation is trivial and will not be mentioned further. The per-cpu structure is always allocated when a task creates its first new thread, via a call to thread_group_cputime_clone_thread() from copy_signal(). It is freed at process exit via a call to thread_group_cputime_free() from cleanup_signal(). All functions that formerly summed utime/stime/sum_sched_runtime values from from all threads in the thread group now use thread_group_cputime() to snapshot the values in the thread_group_cputime structure or the values in the task structure itself if the per-cpu structure hasn't been allocated. Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit. The run_posix_cpu_timers() function has been split into a fast path and a slow path; the former safely checks whether there are any expired thread timers and, if not, just returns, while the slow path does the heavy lifting. With the dedicated thread group fields, timers are no longer "rebalanced" and the process_timer_rebalance() function and related code has gone away. All summing loops are gone and all code that used them now uses the thread_group_cputime() inline. When process-wide timers are set, the new task_cputime structure in signal_struct is used to cache the earliest expiration; this is checked in the fast path. Performance The fix appears not to add significant overhead to existing operations. It generally performs the same as the current code except in two cases, one in which it performs slightly worse (Case 5 below) and one in which it performs very significantly better (Case 2 below). Overall it's a wash except in those two cases. I've since done somewhat more involved testing on a dual-core Opteron system. Case 1: With no itimer running, for a test with 100,000 threads, the fixed kernel took 1428.5 seconds, 513 seconds more than the unfixed system, all of which was spent in the system. There were twice as many voluntary context switches with the fix as without it. Case 2: With an itimer running at .01 second ticks and 4000 threads (the most an unmodified kernel can handle), the fixed kernel ran the test in eight percent of the time (5.8 seconds as opposed to 70 seconds) and had better tick accuracy (.012 seconds per tick as opposed to .023 seconds per tick). Case 3: A 4000-thread test with an initial timer tick of .01 second and an interval of 10,000 seconds (i.e. a timer that ticks only once) had very nearly the same performance in both cases: 6.3 seconds elapsed for the fixed kernel versus 5.5 seconds for the unfixed kernel. With fewer threads (eight in these tests), the Case 1 test ran in essentially the same time on both the modified and unmodified kernels (5.2 seconds versus 5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds versus 5.4 seconds but again with much better tick accuracy, .013 seconds per tick versus .025 seconds per tick for the unmodified kernel. Since the fix affected the rlimit code, I also tested soft and hard CPU limits. Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer running), the modified kernel was very slightly favored in that while it killed the process in 19.997 seconds of CPU time (5.002 seconds of wall time), only .003 seconds of that was system time, the rest was user time. The unmodified kernel killed the process in 20.001 seconds of CPU (5.014 seconds of wall time) of which .016 seconds was system time. Really, though, the results were too close to call. The results were essentially the same with no itimer running. Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds (where the hard limit would never be reached) and an itimer running, the modified kernel exhibited worse tick accuracy than the unmodified kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise, performance was almost indistinguishable. With no itimer running this test exhibited virtually identical behavior and times in both cases. In times past I did some limited performance testing. those results are below. On a four-cpu Opteron system without this fix, a sixteen-thread test executed in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On the same system with the fix, user and elapsed time were about the same, but system time dropped to 0.007 seconds. Performance with eight, four and one thread were comparable. Interestingly, the timer ticks with the fix seemed more accurate: The sixteen-thread test with the fix received 149543 ticks for 0.024 seconds per tick, while the same test without the fix received 58720 for 0.061 seconds per tick. Both cases were configured for an interval of 0.01 seconds. Again, the other tests were comparable. Each thread in this test computed the primes up to 25,000,000. I also did a test with a large number of threads, 100,000 threads, which is impossible without the fix. In this case each thread computed the primes only up to 10,000 (to make the runtime manageable). System time dominated, at 1546.968 seconds out of a total 2176.906 seconds (giving a user time of 629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite accurate. There is obviously no comparable test without the fix. Signed-off-by: Frank Mayhar <fmayhar@google.com> Cc: Roland McGrath <roland@redhat.com> Cc: Alexey Dobriyan <adobriyan@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
/**
* struct task_cputime - collected CPU time counts
* @utime: time spent in user mode, in &cputime_t units
* @stime: time spent in kernel mode, in &cputime_t units
* @sum_exec_runtime: total time spent on the CPU, in nanoseconds
*
timers: fix itimer/many thread hang Overview This patch reworks the handling of POSIX CPU timers, including the ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together with the help of Roland McGrath, the owner and original writer of this code. The problem we ran into, and the reason for this rework, has to do with using a profiling timer in a process with a large number of threads. It appears that the performance of the old implementation of run_posix_cpu_timers() was at least O(n*3) (where "n" is the number of threads in a process) or worse. Everything is fine with an increasing number of threads until the time taken for that routine to run becomes the same as or greater than the tick time, at which point things degrade rather quickly. This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF." Code Changes This rework corrects the implementation of run_posix_cpu_timers() to make it run in constant time for a particular machine. (Performance may vary between one machine and another depending upon whether the kernel is built as single- or multiprocessor and, in the latter case, depending upon the number of running processors.) To do this, at each tick we now update fields in signal_struct as well as task_struct. The run_posix_cpu_timers() function uses those fields to make its decisions. We define a new structure, "task_cputime," to contain user, system and scheduler times and use these in appropriate places: struct task_cputime { cputime_t utime; cputime_t stime; unsigned long long sum_exec_runtime; }; This is included in the structure "thread_group_cputime," which is a new substructure of signal_struct and which varies for uniprocessor versus multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as a simple substructure, while for multiprocessor kernels it is a pointer: struct thread_group_cputime { struct task_cputime totals; }; struct thread_group_cputime { struct task_cputime *totals; }; We also add a new task_cputime substructure directly to signal_struct, to cache the earliest expiration of process-wide timers, and task_cputime also replaces the it_*_expires fields of task_struct (used for earliest expiration of thread timers). The "thread_group_cputime" structure contains process-wide timers that are updated via account_user_time() and friends. In the non-SMP case the structure is a simple aggregator; unfortunately in the SMP case that simplicity was not achievable due to cache-line contention between CPUs (in one measured case performance was actually _worse_ on a 16-cpu system than the same test on a 4-cpu system, due to this contention). For SMP, the thread_group_cputime counters are maintained as a per-cpu structure allocated using alloc_percpu(). The timer functions update only the timer field in the structure corresponding to the running CPU, obtained using per_cpu_ptr(). We define a set of inline functions in sched.h that we use to maintain the thread_group_cputime structure and hide the differences between UP and SMP implementations from the rest of the kernel. The thread_group_cputime_init() function initializes the thread_group_cputime structure for the given task. The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the out-of-line function thread_group_cputime_alloc_smp() to allocate and fill in the per-cpu structures and fields. The thread_group_cputime_free() function, also a no-op for UP, in SMP frees the per-cpu structures. The thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls thread_group_cputime_alloc() if the per-cpu structures haven't yet been allocated. The thread_group_cputime() function fills the task_cputime structure it is passed with the contents of the thread_group_cputime fields; in UP it's that simple but in SMP it must also safely check that tsk->signal is non-NULL (if it is it just uses the appropriate fields of task_struct) and, if so, sums the per-cpu values for each online CPU. Finally, the three functions account_group_user_time(), account_group_system_time() and account_group_exec_runtime() are used by timer functions to update the respective fields of the thread_group_cputime structure. Non-SMP operation is trivial and will not be mentioned further. The per-cpu structure is always allocated when a task creates its first new thread, via a call to thread_group_cputime_clone_thread() from copy_signal(). It is freed at process exit via a call to thread_group_cputime_free() from cleanup_signal(). All functions that formerly summed utime/stime/sum_sched_runtime values from from all threads in the thread group now use thread_group_cputime() to snapshot the values in the thread_group_cputime structure or the values in the task structure itself if the per-cpu structure hasn't been allocated. Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit. The run_posix_cpu_timers() function has been split into a fast path and a slow path; the former safely checks whether there are any expired thread timers and, if not, just returns, while the slow path does the heavy lifting. With the dedicated thread group fields, timers are no longer "rebalanced" and the process_timer_rebalance() function and related code has gone away. All summing loops are gone and all code that used them now uses the thread_group_cputime() inline. When process-wide timers are set, the new task_cputime structure in signal_struct is used to cache the earliest expiration; this is checked in the fast path. Performance The fix appears not to add significant overhead to existing operations. It generally performs the same as the current code except in two cases, one in which it performs slightly worse (Case 5 below) and one in which it performs very significantly better (Case 2 below). Overall it's a wash except in those two cases. I've since done somewhat more involved testing on a dual-core Opteron system. Case 1: With no itimer running, for a test with 100,000 threads, the fixed kernel took 1428.5 seconds, 513 seconds more than the unfixed system, all of which was spent in the system. There were twice as many voluntary context switches with the fix as without it. Case 2: With an itimer running at .01 second ticks and 4000 threads (the most an unmodified kernel can handle), the fixed kernel ran the test in eight percent of the time (5.8 seconds as opposed to 70 seconds) and had better tick accuracy (.012 seconds per tick as opposed to .023 seconds per tick). Case 3: A 4000-thread test with an initial timer tick of .01 second and an interval of 10,000 seconds (i.e. a timer that ticks only once) had very nearly the same performance in both cases: 6.3 seconds elapsed for the fixed kernel versus 5.5 seconds for the unfixed kernel. With fewer threads (eight in these tests), the Case 1 test ran in essentially the same time on both the modified and unmodified kernels (5.2 seconds versus 5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds versus 5.4 seconds but again with much better tick accuracy, .013 seconds per tick versus .025 seconds per tick for the unmodified kernel. Since the fix affected the rlimit code, I also tested soft and hard CPU limits. Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer running), the modified kernel was very slightly favored in that while it killed the process in 19.997 seconds of CPU time (5.002 seconds of wall time), only .003 seconds of that was system time, the rest was user time. The unmodified kernel killed the process in 20.001 seconds of CPU (5.014 seconds of wall time) of which .016 seconds was system time. Really, though, the results were too close to call. The results were essentially the same with no itimer running. Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds (where the hard limit would never be reached) and an itimer running, the modified kernel exhibited worse tick accuracy than the unmodified kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise, performance was almost indistinguishable. With no itimer running this test exhibited virtually identical behavior and times in both cases. In times past I did some limited performance testing. those results are below. On a four-cpu Opteron system without this fix, a sixteen-thread test executed in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On the same system with the fix, user and elapsed time were about the same, but system time dropped to 0.007 seconds. Performance with eight, four and one thread were comparable. Interestingly, the timer ticks with the fix seemed more accurate: The sixteen-thread test with the fix received 149543 ticks for 0.024 seconds per tick, while the same test without the fix received 58720 for 0.061 seconds per tick. Both cases were configured for an interval of 0.01 seconds. Again, the other tests were comparable. Each thread in this test computed the primes up to 25,000,000. I also did a test with a large number of threads, 100,000 threads, which is impossible without the fix. In this case each thread computed the primes only up to 10,000 (to make the runtime manageable). System time dominated, at 1546.968 seconds out of a total 2176.906 seconds (giving a user time of 629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite accurate. There is obviously no comparable test without the fix. Signed-off-by: Frank Mayhar <fmayhar@google.com> Cc: Roland McGrath <roland@redhat.com> Cc: Alexey Dobriyan <adobriyan@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
* This structure groups together three kinds of CPU time that are
* tracked for threads and thread groups. Most things considering
* CPU time want to group these counts together and treat all three
* of them in parallel.
*/
struct task_cputime {
cputime_t utime;
cputime_t stime;
unsigned long long sum_exec_runtime;
};
/* Alternate field names when used to cache expirations. */
#define prof_exp stime
#define virt_exp utime
#define sched_exp sum_exec_runtime
#define INIT_CPUTIME \
(struct task_cputime) { \
.utime = 0, \
.stime = 0, \
.sum_exec_runtime = 0, \
}
/*
* Disable preemption until the scheduler is running.
* Reset by start_kernel()->sched_init()->init_idle().
*
* We include PREEMPT_ACTIVE to avoid cond_resched() from working
* before the scheduler is active -- see should_resched().
*/
#define INIT_PREEMPT_COUNT (1 + PREEMPT_ACTIVE)
timers: fix itimer/many thread hang Overview This patch reworks the handling of POSIX CPU timers, including the ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together with the help of Roland McGrath, the owner and original writer of this code. The problem we ran into, and the reason for this rework, has to do with using a profiling timer in a process with a large number of threads. It appears that the performance of the old implementation of run_posix_cpu_timers() was at least O(n*3) (where "n" is the number of threads in a process) or worse. Everything is fine with an increasing number of threads until the time taken for that routine to run becomes the same as or greater than the tick time, at which point things degrade rather quickly. This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF." Code Changes This rework corrects the implementation of run_posix_cpu_timers() to make it run in constant time for a particular machine. (Performance may vary between one machine and another depending upon whether the kernel is built as single- or multiprocessor and, in the latter case, depending upon the number of running processors.) To do this, at each tick we now update fields in signal_struct as well as task_struct. The run_posix_cpu_timers() function uses those fields to make its decisions. We define a new structure, "task_cputime," to contain user, system and scheduler times and use these in appropriate places: struct task_cputime { cputime_t utime; cputime_t stime; unsigned long long sum_exec_runtime; }; This is included in the structure "thread_group_cputime," which is a new substructure of signal_struct and which varies for uniprocessor versus multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as a simple substructure, while for multiprocessor kernels it is a pointer: struct thread_group_cputime { struct task_cputime totals; }; struct thread_group_cputime { struct task_cputime *totals; }; We also add a new task_cputime substructure directly to signal_struct, to cache the earliest expiration of process-wide timers, and task_cputime also replaces the it_*_expires fields of task_struct (used for earliest expiration of thread timers). The "thread_group_cputime" structure contains process-wide timers that are updated via account_user_time() and friends. In the non-SMP case the structure is a simple aggregator; unfortunately in the SMP case that simplicity was not achievable due to cache-line contention between CPUs (in one measured case performance was actually _worse_ on a 16-cpu system than the same test on a 4-cpu system, due to this contention). For SMP, the thread_group_cputime counters are maintained as a per-cpu structure allocated using alloc_percpu(). The timer functions update only the timer field in the structure corresponding to the running CPU, obtained using per_cpu_ptr(). We define a set of inline functions in sched.h that we use to maintain the thread_group_cputime structure and hide the differences between UP and SMP implementations from the rest of the kernel. The thread_group_cputime_init() function initializes the thread_group_cputime structure for the given task. The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the out-of-line function thread_group_cputime_alloc_smp() to allocate and fill in the per-cpu structures and fields. The thread_group_cputime_free() function, also a no-op for UP, in SMP frees the per-cpu structures. The thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls thread_group_cputime_alloc() if the per-cpu structures haven't yet been allocated. The thread_group_cputime() function fills the task_cputime structure it is passed with the contents of the thread_group_cputime fields; in UP it's that simple but in SMP it must also safely check that tsk->signal is non-NULL (if it is it just uses the appropriate fields of task_struct) and, if so, sums the per-cpu values for each online CPU. Finally, the three functions account_group_user_time(), account_group_system_time() and account_group_exec_runtime() are used by timer functions to update the respective fields of the thread_group_cputime structure. Non-SMP operation is trivial and will not be mentioned further. The per-cpu structure is always allocated when a task creates its first new thread, via a call to thread_group_cputime_clone_thread() from copy_signal(). It is freed at process exit via a call to thread_group_cputime_free() from cleanup_signal(). All functions that formerly summed utime/stime/sum_sched_runtime values from from all threads in the thread group now use thread_group_cputime() to snapshot the values in the thread_group_cputime structure or the values in the task structure itself if the per-cpu structure hasn't been allocated. Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit. The run_posix_cpu_timers() function has been split into a fast path and a slow path; the former safely checks whether there are any expired thread timers and, if not, just returns, while the slow path does the heavy lifting. With the dedicated thread group fields, timers are no longer "rebalanced" and the process_timer_rebalance() function and related code has gone away. All summing loops are gone and all code that used them now uses the thread_group_cputime() inline. When process-wide timers are set, the new task_cputime structure in signal_struct is used to cache the earliest expiration; this is checked in the fast path. Performance The fix appears not to add significant overhead to existing operations. It generally performs the same as the current code except in two cases, one in which it performs slightly worse (Case 5 below) and one in which it performs very significantly better (Case 2 below). Overall it's a wash except in those two cases. I've since done somewhat more involved testing on a dual-core Opteron system. Case 1: With no itimer running, for a test with 100,000 threads, the fixed kernel took 1428.5 seconds, 513 seconds more than the unfixed system, all of which was spent in the system. There were twice as many voluntary context switches with the fix as without it. Case 2: With an itimer running at .01 second ticks and 4000 threads (the most an unmodified kernel can handle), the fixed kernel ran the test in eight percent of the time (5.8 seconds as opposed to 70 seconds) and had better tick accuracy (.012 seconds per tick as opposed to .023 seconds per tick). Case 3: A 4000-thread test with an initial timer tick of .01 second and an interval of 10,000 seconds (i.e. a timer that ticks only once) had very nearly the same performance in both cases: 6.3 seconds elapsed for the fixed kernel versus 5.5 seconds for the unfixed kernel. With fewer threads (eight in these tests), the Case 1 test ran in essentially the same time on both the modified and unmodified kernels (5.2 seconds versus 5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds versus 5.4 seconds but again with much better tick accuracy, .013 seconds per tick versus .025 seconds per tick for the unmodified kernel. Since the fix affected the rlimit code, I also tested soft and hard CPU limits. Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer running), the modified kernel was very slightly favored in that while it killed the process in 19.997 seconds of CPU time (5.002 seconds of wall time), only .003 seconds of that was system time, the rest was user time. The unmodified kernel killed the process in 20.001 seconds of CPU (5.014 seconds of wall time) of which .016 seconds was system time. Really, though, the results were too close to call. The results were essentially the same with no itimer running. Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds (where the hard limit would never be reached) and an itimer running, the modified kernel exhibited worse tick accuracy than the unmodified kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise, performance was almost indistinguishable. With no itimer running this test exhibited virtually identical behavior and times in both cases. In times past I did some limited performance testing. those results are below. On a four-cpu Opteron system without this fix, a sixteen-thread test executed in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On the same system with the fix, user and elapsed time were about the same, but system time dropped to 0.007 seconds. Performance with eight, four and one thread were comparable. Interestingly, the timer ticks with the fix seemed more accurate: The sixteen-thread test with the fix received 149543 ticks for 0.024 seconds per tick, while the same test without the fix received 58720 for 0.061 seconds per tick. Both cases were configured for an interval of 0.01 seconds. Again, the other tests were comparable. Each thread in this test computed the primes up to 25,000,000. I also did a test with a large number of threads, 100,000 threads, which is impossible without the fix. In this case each thread computed the primes only up to 10,000 (to make the runtime manageable). System time dominated, at 1546.968 seconds out of a total 2176.906 seconds (giving a user time of 629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite accurate. There is obviously no comparable test without the fix. Signed-off-by: Frank Mayhar <fmayhar@google.com> Cc: Roland McGrath <roland@redhat.com> Cc: Alexey Dobriyan <adobriyan@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
/**
* struct thread_group_cputimer - thread group interval timer counts
* @cputime: thread group interval timers.
* @running: non-zero when there are timers running and
* @cputime receives updates.
* @lock: lock for fields in this struct.
timers: fix itimer/many thread hang Overview This patch reworks the handling of POSIX CPU timers, including the ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together with the help of Roland McGrath, the owner and original writer of this code. The problem we ran into, and the reason for this rework, has to do with using a profiling timer in a process with a large number of threads. It appears that the performance of the old implementation of run_posix_cpu_timers() was at least O(n*3) (where "n" is the number of threads in a process) or worse. Everything is fine with an increasing number of threads until the time taken for that routine to run becomes the same as or greater than the tick time, at which point things degrade rather quickly. This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF." Code Changes This rework corrects the implementation of run_posix_cpu_timers() to make it run in constant time for a particular machine. (Performance may vary between one machine and another depending upon whether the kernel is built as single- or multiprocessor and, in the latter case, depending upon the number of running processors.) To do this, at each tick we now update fields in signal_struct as well as task_struct. The run_posix_cpu_timers() function uses those fields to make its decisions. We define a new structure, "task_cputime," to contain user, system and scheduler times and use these in appropriate places: struct task_cputime { cputime_t utime; cputime_t stime; unsigned long long sum_exec_runtime; }; This is included in the structure "thread_group_cputime," which is a new substructure of signal_struct and which varies for uniprocessor versus multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as a simple substructure, while for multiprocessor kernels it is a pointer: struct thread_group_cputime { struct task_cputime totals; }; struct thread_group_cputime { struct task_cputime *totals; }; We also add a new task_cputime substructure directly to signal_struct, to cache the earliest expiration of process-wide timers, and task_cputime also replaces the it_*_expires fields of task_struct (used for earliest expiration of thread timers). The "thread_group_cputime" structure contains process-wide timers that are updated via account_user_time() and friends. In the non-SMP case the structure is a simple aggregator; unfortunately in the SMP case that simplicity was not achievable due to cache-line contention between CPUs (in one measured case performance was actually _worse_ on a 16-cpu system than the same test on a 4-cpu system, due to this contention). For SMP, the thread_group_cputime counters are maintained as a per-cpu structure allocated using alloc_percpu(). The timer functions update only the timer field in the structure corresponding to the running CPU, obtained using per_cpu_ptr(). We define a set of inline functions in sched.h that we use to maintain the thread_group_cputime structure and hide the differences between UP and SMP implementations from the rest of the kernel. The thread_group_cputime_init() function initializes the thread_group_cputime structure for the given task. The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the out-of-line function thread_group_cputime_alloc_smp() to allocate and fill in the per-cpu structures and fields. The thread_group_cputime_free() function, also a no-op for UP, in SMP frees the per-cpu structures. The thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls thread_group_cputime_alloc() if the per-cpu structures haven't yet been allocated. The thread_group_cputime() function fills the task_cputime structure it is passed with the contents of the thread_group_cputime fields; in UP it's that simple but in SMP it must also safely check that tsk->signal is non-NULL (if it is it just uses the appropriate fields of task_struct) and, if so, sums the per-cpu values for each online CPU. Finally, the three functions account_group_user_time(), account_group_system_time() and account_group_exec_runtime() are used by timer functions to update the respective fields of the thread_group_cputime structure. Non-SMP operation is trivial and will not be mentioned further. The per-cpu structure is always allocated when a task creates its first new thread, via a call to thread_group_cputime_clone_thread() from copy_signal(). It is freed at process exit via a call to thread_group_cputime_free() from cleanup_signal(). All functions that formerly summed utime/stime/sum_sched_runtime values from from all threads in the thread group now use thread_group_cputime() to snapshot the values in the thread_group_cputime structure or the values in the task structure itself if the per-cpu structure hasn't been allocated. Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit. The run_posix_cpu_timers() function has been split into a fast path and a slow path; the former safely checks whether there are any expired thread timers and, if not, just returns, while the slow path does the heavy lifting. With the dedicated thread group fields, timers are no longer "rebalanced" and the process_timer_rebalance() function and related code has gone away. All summing loops are gone and all code that used them now uses the thread_group_cputime() inline. When process-wide timers are set, the new task_cputime structure in signal_struct is used to cache the earliest expiration; this is checked in the fast path. Performance The fix appears not to add significant overhead to existing operations. It generally performs the same as the current code except in two cases, one in which it performs slightly worse (Case 5 below) and one in which it performs very significantly better (Case 2 below). Overall it's a wash except in those two cases. I've since done somewhat more involved testing on a dual-core Opteron system. Case 1: With no itimer running, for a test with 100,000 threads, the fixed kernel took 1428.5 seconds, 513 seconds more than the unfixed system, all of which was spent in the system. There were twice as many voluntary context switches with the fix as without it. Case 2: With an itimer running at .01 second ticks and 4000 threads (the most an unmodified kernel can handle), the fixed kernel ran the test in eight percent of the time (5.8 seconds as opposed to 70 seconds) and had better tick accuracy (.012 seconds per tick as opposed to .023 seconds per tick). Case 3: A 4000-thread test with an initial timer tick of .01 second and an interval of 10,000 seconds (i.e. a timer that ticks only once) had very nearly the same performance in both cases: 6.3 seconds elapsed for the fixed kernel versus 5.5 seconds for the unfixed kernel. With fewer threads (eight in these tests), the Case 1 test ran in essentially the same time on both the modified and unmodified kernels (5.2 seconds versus 5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds versus 5.4 seconds but again with much better tick accuracy, .013 seconds per tick versus .025 seconds per tick for the unmodified kernel. Since the fix affected the rlimit code, I also tested soft and hard CPU limits. Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer running), the modified kernel was very slightly favored in that while it killed the process in 19.997 seconds of CPU time (5.002 seconds of wall time), only .003 seconds of that was system time, the rest was user time. The unmodified kernel killed the process in 20.001 seconds of CPU (5.014 seconds of wall time) of which .016 seconds was system time. Really, though, the results were too close to call. The results were essentially the same with no itimer running. Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds (where the hard limit would never be reached) and an itimer running, the modified kernel exhibited worse tick accuracy than the unmodified kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise, performance was almost indistinguishable. With no itimer running this test exhibited virtually identical behavior and times in both cases. In times past I did some limited performance testing. those results are below. On a four-cpu Opteron system without this fix, a sixteen-thread test executed in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On the same system with the fix, user and elapsed time were about the same, but system time dropped to 0.007 seconds. Performance with eight, four and one thread were comparable. Interestingly, the timer ticks with the fix seemed more accurate: The sixteen-thread test with the fix received 149543 ticks for 0.024 seconds per tick, while the same test without the fix received 58720 for 0.061 seconds per tick. Both cases were configured for an interval of 0.01 seconds. Again, the other tests were comparable. Each thread in this test computed the primes up to 25,000,000. I also did a test with a large number of threads, 100,000 threads, which is impossible without the fix. In this case each thread computed the primes only up to 10,000 (to make the runtime manageable). System time dominated, at 1546.968 seconds out of a total 2176.906 seconds (giving a user time of 629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite accurate. There is obviously no comparable test without the fix. Signed-off-by: Frank Mayhar <fmayhar@google.com> Cc: Roland McGrath <roland@redhat.com> Cc: Alexey Dobriyan <adobriyan@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
*
* This structure contains the version of task_cputime, above, that is
* used for thread group CPU timer calculations.
timers: fix itimer/many thread hang Overview This patch reworks the handling of POSIX CPU timers, including the ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together with the help of Roland McGrath, the owner and original writer of this code. The problem we ran into, and the reason for this rework, has to do with using a profiling timer in a process with a large number of threads. It appears that the performance of the old implementation of run_posix_cpu_timers() was at least O(n*3) (where "n" is the number of threads in a process) or worse. Everything is fine with an increasing number of threads until the time taken for that routine to run becomes the same as or greater than the tick time, at which point things degrade rather quickly. This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF." Code Changes This rework corrects the implementation of run_posix_cpu_timers() to make it run in constant time for a particular machine. (Performance may vary between one machine and another depending upon whether the kernel is built as single- or multiprocessor and, in the latter case, depending upon the number of running processors.) To do this, at each tick we now update fields in signal_struct as well as task_struct. The run_posix_cpu_timers() function uses those fields to make its decisions. We define a new structure, "task_cputime," to contain user, system and scheduler times and use these in appropriate places: struct task_cputime { cputime_t utime; cputime_t stime; unsigned long long sum_exec_runtime; }; This is included in the structure "thread_group_cputime," which is a new substructure of signal_struct and which varies for uniprocessor versus multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as a simple substructure, while for multiprocessor kernels it is a pointer: struct thread_group_cputime { struct task_cputime totals; }; struct thread_group_cputime { struct task_cputime *totals; }; We also add a new task_cputime substructure directly to signal_struct, to cache the earliest expiration of process-wide timers, and task_cputime also replaces the it_*_expires fields of task_struct (used for earliest expiration of thread timers). The "thread_group_cputime" structure contains process-wide timers that are updated via account_user_time() and friends. In the non-SMP case the structure is a simple aggregator; unfortunately in the SMP case that simplicity was not achievable due to cache-line contention between CPUs (in one measured case performance was actually _worse_ on a 16-cpu system than the same test on a 4-cpu system, due to this contention). For SMP, the thread_group_cputime counters are maintained as a per-cpu structure allocated using alloc_percpu(). The timer functions update only the timer field in the structure corresponding to the running CPU, obtained using per_cpu_ptr(). We define a set of inline functions in sched.h that we use to maintain the thread_group_cputime structure and hide the differences between UP and SMP implementations from the rest of the kernel. The thread_group_cputime_init() function initializes the thread_group_cputime structure for the given task. The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the out-of-line function thread_group_cputime_alloc_smp() to allocate and fill in the per-cpu structures and fields. The thread_group_cputime_free() function, also a no-op for UP, in SMP frees the per-cpu structures. The thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls thread_group_cputime_alloc() if the per-cpu structures haven't yet been allocated. The thread_group_cputime() function fills the task_cputime structure it is passed with the contents of the thread_group_cputime fields; in UP it's that simple but in SMP it must also safely check that tsk->signal is non-NULL (if it is it just uses the appropriate fields of task_struct) and, if so, sums the per-cpu values for each online CPU. Finally, the three functions account_group_user_time(), account_group_system_time() and account_group_exec_runtime() are used by timer functions to update the respective fields of the thread_group_cputime structure. Non-SMP operation is trivial and will not be mentioned further. The per-cpu structure is always allocated when a task creates its first new thread, via a call to thread_group_cputime_clone_thread() from copy_signal(). It is freed at process exit via a call to thread_group_cputime_free() from cleanup_signal(). All functions that formerly summed utime/stime/sum_sched_runtime values from from all threads in the thread group now use thread_group_cputime() to snapshot the values in the thread_group_cputime structure or the values in the task structure itself if the per-cpu structure hasn't been allocated. Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit. The run_posix_cpu_timers() function has been split into a fast path and a slow path; the former safely checks whether there are any expired thread timers and, if not, just returns, while the slow path does the heavy lifting. With the dedicated thread group fields, timers are no longer "rebalanced" and the process_timer_rebalance() function and related code has gone away. All summing loops are gone and all code that used them now uses the thread_group_cputime() inline. When process-wide timers are set, the new task_cputime structure in signal_struct is used to cache the earliest expiration; this is checked in the fast path. Performance The fix appears not to add significant overhead to existing operations. It generally performs the same as the current code except in two cases, one in which it performs slightly worse (Case 5 below) and one in which it performs very significantly better (Case 2 below). Overall it's a wash except in those two cases. I've since done somewhat more involved testing on a dual-core Opteron system. Case 1: With no itimer running, for a test with 100,000 threads, the fixed kernel took 1428.5 seconds, 513 seconds more than the unfixed system, all of which was spent in the system. There were twice as many voluntary context switches with the fix as without it. Case 2: With an itimer running at .01 second ticks and 4000 threads (the most an unmodified kernel can handle), the fixed kernel ran the test in eight percent of the time (5.8 seconds as opposed to 70 seconds) and had better tick accuracy (.012 seconds per tick as opposed to .023 seconds per tick). Case 3: A 4000-thread test with an initial timer tick of .01 second and an interval of 10,000 seconds (i.e. a timer that ticks only once) had very nearly the same performance in both cases: 6.3 seconds elapsed for the fixed kernel versus 5.5 seconds for the unfixed kernel. With fewer threads (eight in these tests), the Case 1 test ran in essentially the same time on both the modified and unmodified kernels (5.2 seconds versus 5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds versus 5.4 seconds but again with much better tick accuracy, .013 seconds per tick versus .025 seconds per tick for the unmodified kernel. Since the fix affected the rlimit code, I also tested soft and hard CPU limits. Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer running), the modified kernel was very slightly favored in that while it killed the process in 19.997 seconds of CPU time (5.002 seconds of wall time), only .003 seconds of that was system time, the rest was user time. The unmodified kernel killed the process in 20.001 seconds of CPU (5.014 seconds of wall time) of which .016 seconds was system time. Really, though, the results were too close to call. The results were essentially the same with no itimer running. Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds (where the hard limit would never be reached) and an itimer running, the modified kernel exhibited worse tick accuracy than the unmodified kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise, performance was almost indistinguishable. With no itimer running this test exhibited virtually identical behavior and times in both cases. In times past I did some limited performance testing. those results are below. On a four-cpu Opteron system without this fix, a sixteen-thread test executed in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On the same system with the fix, user and elapsed time were about the same, but system time dropped to 0.007 seconds. Performance with eight, four and one thread were comparable. Interestingly, the timer ticks with the fix seemed more accurate: The sixteen-thread test with the fix received 149543 ticks for 0.024 seconds per tick, while the same test without the fix received 58720 for 0.061 seconds per tick. Both cases were configured for an interval of 0.01 seconds. Again, the other tests were comparable. Each thread in this test computed the primes up to 25,000,000. I also did a test with a large number of threads, 100,000 threads, which is impossible without the fix. In this case each thread computed the primes only up to 10,000 (to make the runtime manageable). System time dominated, at 1546.968 seconds out of a total 2176.906 seconds (giving a user time of 629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite accurate. There is obviously no comparable test without the fix. Signed-off-by: Frank Mayhar <fmayhar@google.com> Cc: Roland McGrath <roland@redhat.com> Cc: Alexey Dobriyan <adobriyan@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
*/
struct thread_group_cputimer {
struct task_cputime cputime;
int running;
raw_spinlock_t lock;
timers: fix itimer/many thread hang Overview This patch reworks the handling of POSIX CPU timers, including the ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together with the help of Roland McGrath, the owner and original writer of this code. The problem we ran into, and the reason for this rework, has to do with using a profiling timer in a process with a large number of threads. It appears that the performance of the old implementation of run_posix_cpu_timers() was at least O(n*3) (where "n" is the number of threads in a process) or worse. Everything is fine with an increasing number of threads until the time taken for that routine to run becomes the same as or greater than the tick time, at which point things degrade rather quickly. This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF." Code Changes This rework corrects the implementation of run_posix_cpu_timers() to make it run in constant time for a particular machine. (Performance may vary between one machine and another depending upon whether the kernel is built as single- or multiprocessor and, in the latter case, depending upon the number of running processors.) To do this, at each tick we now update fields in signal_struct as well as task_struct. The run_posix_cpu_timers() function uses those fields to make its decisions. We define a new structure, "task_cputime," to contain user, system and scheduler times and use these in appropriate places: struct task_cputime { cputime_t utime; cputime_t stime; unsigned long long sum_exec_runtime; }; This is included in the structure "thread_group_cputime," which is a new substructure of signal_struct and which varies for uniprocessor versus multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as a simple substructure, while for multiprocessor kernels it is a pointer: struct thread_group_cputime { struct task_cputime totals; }; struct thread_group_cputime { struct task_cputime *totals; }; We also add a new task_cputime substructure directly to signal_struct, to cache the earliest expiration of process-wide timers, and task_cputime also replaces the it_*_expires fields of task_struct (used for earliest expiration of thread timers). The "thread_group_cputime" structure contains process-wide timers that are updated via account_user_time() and friends. In the non-SMP case the structure is a simple aggregator; unfortunately in the SMP case that simplicity was not achievable due to cache-line contention between CPUs (in one measured case performance was actually _worse_ on a 16-cpu system than the same test on a 4-cpu system, due to this contention). For SMP, the thread_group_cputime counters are maintained as a per-cpu structure allocated using alloc_percpu(). The timer functions update only the timer field in the structure corresponding to the running CPU, obtained using per_cpu_ptr(). We define a set of inline functions in sched.h that we use to maintain the thread_group_cputime structure and hide the differences between UP and SMP implementations from the rest of the kernel. The thread_group_cputime_init() function initializes the thread_group_cputime structure for the given task. The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the out-of-line function thread_group_cputime_alloc_smp() to allocate and fill in the per-cpu structures and fields. The thread_group_cputime_free() function, also a no-op for UP, in SMP frees the per-cpu structures. The thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls thread_group_cputime_alloc() if the per-cpu structures haven't yet been allocated. The thread_group_cputime() function fills the task_cputime structure it is passed with the contents of the thread_group_cputime fields; in UP it's that simple but in SMP it must also safely check that tsk->signal is non-NULL (if it is it just uses the appropriate fields of task_struct) and, if so, sums the per-cpu values for each online CPU. Finally, the three functions account_group_user_time(), account_group_system_time() and account_group_exec_runtime() are used by timer functions to update the respective fields of the thread_group_cputime structure. Non-SMP operation is trivial and will not be mentioned further. The per-cpu structure is always allocated when a task creates its first new thread, via a call to thread_group_cputime_clone_thread() from copy_signal(). It is freed at process exit via a call to thread_group_cputime_free() from cleanup_signal(). All functions that formerly summed utime/stime/sum_sched_runtime values from from all threads in the thread group now use thread_group_cputime() to snapshot the values in the thread_group_cputime structure or the values in the task structure itself if the per-cpu structure hasn't been allocated. Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit. The run_posix_cpu_timers() function has been split into a fast path and a slow path; the former safely checks whether there are any expired thread timers and, if not, just returns, while the slow path does the heavy lifting. With the dedicated thread group fields, timers are no longer "rebalanced" and the process_timer_rebalance() function and related code has gone away. All summing loops are gone and all code that used them now uses the thread_group_cputime() inline. When process-wide timers are set, the new task_cputime structure in signal_struct is used to cache the earliest expiration; this is checked in the fast path. Performance The fix appears not to add significant overhead to existing operations. It generally performs the same as the current code except in two cases, one in which it performs slightly worse (Case 5 below) and one in which it performs very significantly better (Case 2 below). Overall it's a wash except in those two cases. I've since done somewhat more involved testing on a dual-core Opteron system. Case 1: With no itimer running, for a test with 100,000 threads, the fixed kernel took 1428.5 seconds, 513 seconds more than the unfixed system, all of which was spent in the system. There were twice as many voluntary context switches with the fix as without it. Case 2: With an itimer running at .01 second ticks and 4000 threads (the most an unmodified kernel can handle), the fixed kernel ran the test in eight percent of the time (5.8 seconds as opposed to 70 seconds) and had better tick accuracy (.012 seconds per tick as opposed to .023 seconds per tick). Case 3: A 4000-thread test with an initial timer tick of .01 second and an interval of 10,000 seconds (i.e. a timer that ticks only once) had very nearly the same performance in both cases: 6.3 seconds elapsed for the fixed kernel versus 5.5 seconds for the unfixed kernel. With fewer threads (eight in these tests), the Case 1 test ran in essentially the same time on both the modified and unmodified kernels (5.2 seconds versus 5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds versus 5.4 seconds but again with much better tick accuracy, .013 seconds per tick versus .025 seconds per tick for the unmodified kernel. Since the fix affected the rlimit code, I also tested soft and hard CPU limits. Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer running), the modified kernel was very slightly favored in that while it killed the process in 19.997 seconds of CPU time (5.002 seconds of wall time), only .003 seconds of that was system time, the rest was user time. The unmodified kernel killed the process in 20.001 seconds of CPU (5.014 seconds of wall time) of which .016 seconds was system time. Really, though, the results were too close to call. The results were essentially the same with no itimer running. Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds (where the hard limit would never be reached) and an itimer running, the modified kernel exhibited worse tick accuracy than the unmodified kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise, performance was almost indistinguishable. With no itimer running this test exhibited virtually identical behavior and times in both cases. In times past I did some limited performance testing. those results are below. On a four-cpu Opteron system without this fix, a sixteen-thread test executed in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On the same system with the fix, user and elapsed time were about the same, but system time dropped to 0.007 seconds. Performance with eight, four and one thread were comparable. Interestingly, the timer ticks with the fix seemed more accurate: The sixteen-thread test with the fix received 149543 ticks for 0.024 seconds per tick, while the same test without the fix received 58720 for 0.061 seconds per tick. Both cases were configured for an interval of 0.01 seconds. Again, the other tests were comparable. Each thread in this test computed the primes up to 25,000,000. I also did a test with a large number of threads, 100,000 threads, which is impossible without the fix. In this case each thread computed the primes only up to 10,000 (to make the runtime manageable). System time dominated, at 1546.968 seconds out of a total 2176.906 seconds (giving a user time of 629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite accurate. There is obviously no comparable test without the fix. Signed-off-by: Frank Mayhar <fmayhar@google.com> Cc: Roland McGrath <roland@redhat.com> Cc: Alexey Dobriyan <adobriyan@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
};
#include <linux/rwsem.h>
sched: Add 'autogroup' scheduling feature: automated per session task groups A recurring complaint from CFS users is that parallel kbuild has a negative impact on desktop interactivity. This patch implements an idea from Linus, to automatically create task groups. Currently, only per session autogroups are implemented, but the patch leaves the way open for enhancement. Implementation: each task's signal struct contains an inherited pointer to a refcounted autogroup struct containing a task group pointer, the default for all tasks pointing to the init_task_group. When a task calls setsid(), a new task group is created, the process is moved into the new task group, and a reference to the preveious task group is dropped. Child processes inherit this task group thereafter, and increase it's refcount. When the last thread of a process exits, the process's reference is dropped, such that when the last process referencing an autogroup exits, the autogroup is destroyed. At runqueue selection time, IFF a task has no cgroup assignment, its current autogroup is used. Autogroup bandwidth is controllable via setting it's nice level through the proc filesystem: cat /proc/<pid>/autogroup Displays the task's group and the group's nice level. echo <nice level> > /proc/<pid>/autogroup Sets the task group's shares to the weight of nice <level> task. Setting nice level is rate limited for !admin users due to the abuse risk of task group locking. The feature is enabled from boot by default if CONFIG_SCHED_AUTOGROUP=y is selected, but can be disabled via the boot option noautogroup, and can also be turned on/off on the fly via: echo [01] > /proc/sys/kernel/sched_autogroup_enabled ... which will automatically move tasks to/from the root task group. Signed-off-by: Mike Galbraith <efault@gmx.de> Acked-by: Linus Torvalds <torvalds@linux-foundation.org> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Paul Turner <pjt@google.com> Cc: Oleg Nesterov <oleg@redhat.com> [ Removed the task_group_path() debug code, and fixed !EVENTFD build failure. ] Signed-off-by: Ingo Molnar <mingo@elte.hu> LKML-Reference: <1290281700.28711.9.camel@maggy.simson.net> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-11-30 21:18:03 +08:00
struct autogroup;
/*
* NOTE! "signal_struct" does not have its own
* locking, because a shared signal_struct always
* implies a shared sighand_struct, so locking
* sighand_struct is always a proper superset of
* the locking of signal_struct.
*/
struct signal_struct {
atomic_t sigcnt;
atomic_t live;
int nr_threads;
wait_queue_head_t wait_chldexit; /* for wait4() */
/* current thread group signal load-balancing target: */
struct task_struct *curr_target;
/* shared signal handling: */
struct sigpending shared_pending;
/* thread group exit support */
int group_exit_code;
/* overloaded:
* - notify group_exit_task when ->count is equal to notify_count
* - everyone except group_exit_task is stopped during signal delivery
* of fatal signals, group_exit_task processes the signal.
*/
int notify_count;
struct task_struct *group_exit_task;
/* thread group stop support, overloads group_exit_code too */
int group_stop_count;
unsigned int flags; /* see SIGNAL_* flags below */
prctl: add PR_{SET,GET}_CHILD_SUBREAPER to allow simple process supervision Userspace service managers/supervisors need to track their started services. Many services daemonize by double-forking and get implicitly re-parented to PID 1. The service manager will no longer be able to receive the SIGCHLD signals for them, and is no longer in charge of reaping the children with wait(). All information about the children is lost at the moment PID 1 cleans up the re-parented processes. With this prctl, a service manager process can mark itself as a sort of 'sub-init', able to stay as the parent for all orphaned processes created by the started services. All SIGCHLD signals will be delivered to the service manager. Receiving SIGCHLD and doing wait() is in cases of a service-manager much preferred over any possible asynchronous notification about specific PIDs, because the service manager has full access to the child process data in /proc and the PID can not be re-used until the wait(), the service-manager itself is in charge of, has happened. As a side effect, the relevant parent PID information does not get lost by a double-fork, which results in a more elaborate process tree and 'ps' output: before: # ps afx 253 ? Ss 0:00 /bin/dbus-daemon --system --nofork 294 ? Sl 0:00 /usr/libexec/polkit-1/polkitd 328 ? S 0:00 /usr/sbin/modem-manager 608 ? Sl 0:00 /usr/libexec/colord 658 ? Sl 0:00 /usr/libexec/upowerd 819 ? Sl 0:00 /usr/libexec/imsettings-daemon 916 ? Sl 0:00 /usr/libexec/udisks-daemon 917 ? S 0:00 \_ udisks-daemon: not polling any devices after: # ps afx 294 ? Ss 0:00 /bin/dbus-daemon --system --nofork 426 ? Sl 0:00 \_ /usr/libexec/polkit-1/polkitd 449 ? S 0:00 \_ /usr/sbin/modem-manager 635 ? Sl 0:00 \_ /usr/libexec/colord 705 ? Sl 0:00 \_ /usr/libexec/upowerd 959 ? Sl 0:00 \_ /usr/libexec/udisks-daemon 960 ? S 0:00 | \_ udisks-daemon: not polling any devices 977 ? Sl 0:00 \_ /usr/libexec/packagekitd This prctl is orthogonal to PID namespaces. PID namespaces are isolated from each other, while a service management process usually requires the services to live in the same namespace, to be able to talk to each other. Users of this will be the systemd per-user instance, which provides init-like functionality for the user's login session and D-Bus, which activates bus services on-demand. Both need init-like capabilities to be able to properly keep track of the services they start. Many thanks to Oleg for several rounds of review and insights. [akpm@linux-foundation.org: fix comment layout and spelling] [akpm@linux-foundation.org: add lengthy code comment from Oleg] Reviewed-by: Oleg Nesterov <oleg@redhat.com> Signed-off-by: Lennart Poettering <lennart@poettering.net> Signed-off-by: Kay Sievers <kay.sievers@vrfy.org> Acked-by: Valdis Kletnieks <Valdis.Kletnieks@vt.edu> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-24 06:01:54 +08:00
/*
* PR_SET_CHILD_SUBREAPER marks a process, like a service
* manager, to re-parent orphan (double-forking) child processes
* to this process instead of 'init'. The service manager is
* able to receive SIGCHLD signals and is able to investigate
* the process until it calls wait(). All children of this
* process will inherit a flag if they should look for a
* child_subreaper process at exit.
*/
unsigned int is_child_subreaper:1;
unsigned int has_child_subreaper:1;
/* POSIX.1b Interval Timers */
struct list_head posix_timers;
/* ITIMER_REAL timer for the process */
struct hrtimer real_timer;
struct pid *leader_pid;
ktime_t it_real_incr;
/*
* ITIMER_PROF and ITIMER_VIRTUAL timers for the process, we use
* CPUCLOCK_PROF and CPUCLOCK_VIRT for indexing array as these
* values are defined to 0 and 1 respectively
*/
struct cpu_itimer it[2];
timers: fix itimer/many thread hang Overview This patch reworks the handling of POSIX CPU timers, including the ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together with the help of Roland McGrath, the owner and original writer of this code. The problem we ran into, and the reason for this rework, has to do with using a profiling timer in a process with a large number of threads. It appears that the performance of the old implementation of run_posix_cpu_timers() was at least O(n*3) (where "n" is the number of threads in a process) or worse. Everything is fine with an increasing number of threads until the time taken for that routine to run becomes the same as or greater than the tick time, at which point things degrade rather quickly. This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF." Code Changes This rework corrects the implementation of run_posix_cpu_timers() to make it run in constant time for a particular machine. (Performance may vary between one machine and another depending upon whether the kernel is built as single- or multiprocessor and, in the latter case, depending upon the number of running processors.) To do this, at each tick we now update fields in signal_struct as well as task_struct. The run_posix_cpu_timers() function uses those fields to make its decisions. We define a new structure, "task_cputime," to contain user, system and scheduler times and use these in appropriate places: struct task_cputime { cputime_t utime; cputime_t stime; unsigned long long sum_exec_runtime; }; This is included in the structure "thread_group_cputime," which is a new substructure of signal_struct and which varies for uniprocessor versus multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as a simple substructure, while for multiprocessor kernels it is a pointer: struct thread_group_cputime { struct task_cputime totals; }; struct thread_group_cputime { struct task_cputime *totals; }; We also add a new task_cputime substructure directly to signal_struct, to cache the earliest expiration of process-wide timers, and task_cputime also replaces the it_*_expires fields of task_struct (used for earliest expiration of thread timers). The "thread_group_cputime" structure contains process-wide timers that are updated via account_user_time() and friends. In the non-SMP case the structure is a simple aggregator; unfortunately in the SMP case that simplicity was not achievable due to cache-line contention between CPUs (in one measured case performance was actually _worse_ on a 16-cpu system than the same test on a 4-cpu system, due to this contention). For SMP, the thread_group_cputime counters are maintained as a per-cpu structure allocated using alloc_percpu(). The timer functions update only the timer field in the structure corresponding to the running CPU, obtained using per_cpu_ptr(). We define a set of inline functions in sched.h that we use to maintain the thread_group_cputime structure and hide the differences between UP and SMP implementations from the rest of the kernel. The thread_group_cputime_init() function initializes the thread_group_cputime structure for the given task. The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the out-of-line function thread_group_cputime_alloc_smp() to allocate and fill in the per-cpu structures and fields. The thread_group_cputime_free() function, also a no-op for UP, in SMP frees the per-cpu structures. The thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls thread_group_cputime_alloc() if the per-cpu structures haven't yet been allocated. The thread_group_cputime() function fills the task_cputime structure it is passed with the contents of the thread_group_cputime fields; in UP it's that simple but in SMP it must also safely check that tsk->signal is non-NULL (if it is it just uses the appropriate fields of task_struct) and, if so, sums the per-cpu values for each online CPU. Finally, the three functions account_group_user_time(), account_group_system_time() and account_group_exec_runtime() are used by timer functions to update the respective fields of the thread_group_cputime structure. Non-SMP operation is trivial and will not be mentioned further. The per-cpu structure is always allocated when a task creates its first new thread, via a call to thread_group_cputime_clone_thread() from copy_signal(). It is freed at process exit via a call to thread_group_cputime_free() from cleanup_signal(). All functions that formerly summed utime/stime/sum_sched_runtime values from from all threads in the thread group now use thread_group_cputime() to snapshot the values in the thread_group_cputime structure or the values in the task structure itself if the per-cpu structure hasn't been allocated. Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit. The run_posix_cpu_timers() function has been split into a fast path and a slow path; the former safely checks whether there are any expired thread timers and, if not, just returns, while the slow path does the heavy lifting. With the dedicated thread group fields, timers are no longer "rebalanced" and the process_timer_rebalance() function and related code has gone away. All summing loops are gone and all code that used them now uses the thread_group_cputime() inline. When process-wide timers are set, the new task_cputime structure in signal_struct is used to cache the earliest expiration; this is checked in the fast path. Performance The fix appears not to add significant overhead to existing operations. It generally performs the same as the current code except in two cases, one in which it performs slightly worse (Case 5 below) and one in which it performs very significantly better (Case 2 below). Overall it's a wash except in those two cases. I've since done somewhat more involved testing on a dual-core Opteron system. Case 1: With no itimer running, for a test with 100,000 threads, the fixed kernel took 1428.5 seconds, 513 seconds more than the unfixed system, all of which was spent in the system. There were twice as many voluntary context switches with the fix as without it. Case 2: With an itimer running at .01 second ticks and 4000 threads (the most an unmodified kernel can handle), the fixed kernel ran the test in eight percent of the time (5.8 seconds as opposed to 70 seconds) and had better tick accuracy (.012 seconds per tick as opposed to .023 seconds per tick). Case 3: A 4000-thread test with an initial timer tick of .01 second and an interval of 10,000 seconds (i.e. a timer that ticks only once) had very nearly the same performance in both cases: 6.3 seconds elapsed for the fixed kernel versus 5.5 seconds for the unfixed kernel. With fewer threads (eight in these tests), the Case 1 test ran in essentially the same time on both the modified and unmodified kernels (5.2 seconds versus 5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds versus 5.4 seconds but again with much better tick accuracy, .013 seconds per tick versus .025 seconds per tick for the unmodified kernel. Since the fix affected the rlimit code, I also tested soft and hard CPU limits. Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer running), the modified kernel was very slightly favored in that while it killed the process in 19.997 seconds of CPU time (5.002 seconds of wall time), only .003 seconds of that was system time, the rest was user time. The unmodified kernel killed the process in 20.001 seconds of CPU (5.014 seconds of wall time) of which .016 seconds was system time. Really, though, the results were too close to call. The results were essentially the same with no itimer running. Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds (where the hard limit would never be reached) and an itimer running, the modified kernel exhibited worse tick accuracy than the unmodified kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise, performance was almost indistinguishable. With no itimer running this test exhibited virtually identical behavior and times in both cases. In times past I did some limited performance testing. those results are below. On a four-cpu Opteron system without this fix, a sixteen-thread test executed in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On the same system with the fix, user and elapsed time were about the same, but system time dropped to 0.007 seconds. Performance with eight, four and one thread were comparable. Interestingly, the timer ticks with the fix seemed more accurate: The sixteen-thread test with the fix received 149543 ticks for 0.024 seconds per tick, while the same test without the fix received 58720 for 0.061 seconds per tick. Both cases were configured for an interval of 0.01 seconds. Again, the other tests were comparable. Each thread in this test computed the primes up to 25,000,000. I also did a test with a large number of threads, 100,000 threads, which is impossible without the fix. In this case each thread computed the primes only up to 10,000 (to make the runtime manageable). System time dominated, at 1546.968 seconds out of a total 2176.906 seconds (giving a user time of 629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite accurate. There is obviously no comparable test without the fix. Signed-off-by: Frank Mayhar <fmayhar@google.com> Cc: Roland McGrath <roland@redhat.com> Cc: Alexey Dobriyan <adobriyan@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
/*
* Thread group totals for process CPU timers.
* See thread_group_cputimer(), et al, for details.
timers: fix itimer/many thread hang Overview This patch reworks the handling of POSIX CPU timers, including the ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together with the help of Roland McGrath, the owner and original writer of this code. The problem we ran into, and the reason for this rework, has to do with using a profiling timer in a process with a large number of threads. It appears that the performance of the old implementation of run_posix_cpu_timers() was at least O(n*3) (where "n" is the number of threads in a process) or worse. Everything is fine with an increasing number of threads until the time taken for that routine to run becomes the same as or greater than the tick time, at which point things degrade rather quickly. This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF." Code Changes This rework corrects the implementation of run_posix_cpu_timers() to make it run in constant time for a particular machine. (Performance may vary between one machine and another depending upon whether the kernel is built as single- or multiprocessor and, in the latter case, depending upon the number of running processors.) To do this, at each tick we now update fields in signal_struct as well as task_struct. The run_posix_cpu_timers() function uses those fields to make its decisions. We define a new structure, "task_cputime," to contain user, system and scheduler times and use these in appropriate places: struct task_cputime { cputime_t utime; cputime_t stime; unsigned long long sum_exec_runtime; }; This is included in the structure "thread_group_cputime," which is a new substructure of signal_struct and which varies for uniprocessor versus multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as a simple substructure, while for multiprocessor kernels it is a pointer: struct thread_group_cputime { struct task_cputime totals; }; struct thread_group_cputime { struct task_cputime *totals; }; We also add a new task_cputime substructure directly to signal_struct, to cache the earliest expiration of process-wide timers, and task_cputime also replaces the it_*_expires fields of task_struct (used for earliest expiration of thread timers). The "thread_group_cputime" structure contains process-wide timers that are updated via account_user_time() and friends. In the non-SMP case the structure is a simple aggregator; unfortunately in the SMP case that simplicity was not achievable due to cache-line contention between CPUs (in one measured case performance was actually _worse_ on a 16-cpu system than the same test on a 4-cpu system, due to this contention). For SMP, the thread_group_cputime counters are maintained as a per-cpu structure allocated using alloc_percpu(). The timer functions update only the timer field in the structure corresponding to the running CPU, obtained using per_cpu_ptr(). We define a set of inline functions in sched.h that we use to maintain the thread_group_cputime structure and hide the differences between UP and SMP implementations from the rest of the kernel. The thread_group_cputime_init() function initializes the thread_group_cputime structure for the given task. The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the out-of-line function thread_group_cputime_alloc_smp() to allocate and fill in the per-cpu structures and fields. The thread_group_cputime_free() function, also a no-op for UP, in SMP frees the per-cpu structures. The thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls thread_group_cputime_alloc() if the per-cpu structures haven't yet been allocated. The thread_group_cputime() function fills the task_cputime structure it is passed with the contents of the thread_group_cputime fields; in UP it's that simple but in SMP it must also safely check that tsk->signal is non-NULL (if it is it just uses the appropriate fields of task_struct) and, if so, sums the per-cpu values for each online CPU. Finally, the three functions account_group_user_time(), account_group_system_time() and account_group_exec_runtime() are used by timer functions to update the respective fields of the thread_group_cputime structure. Non-SMP operation is trivial and will not be mentioned further. The per-cpu structure is always allocated when a task creates its first new thread, via a call to thread_group_cputime_clone_thread() from copy_signal(). It is freed at process exit via a call to thread_group_cputime_free() from cleanup_signal(). All functions that formerly summed utime/stime/sum_sched_runtime values from from all threads in the thread group now use thread_group_cputime() to snapshot the values in the thread_group_cputime structure or the values in the task structure itself if the per-cpu structure hasn't been allocated. Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit. The run_posix_cpu_timers() function has been split into a fast path and a slow path; the former safely checks whether there are any expired thread timers and, if not, just returns, while the slow path does the heavy lifting. With the dedicated thread group fields, timers are no longer "rebalanced" and the process_timer_rebalance() function and related code has gone away. All summing loops are gone and all code that used them now uses the thread_group_cputime() inline. When process-wide timers are set, the new task_cputime structure in signal_struct is used to cache the earliest expiration; this is checked in the fast path. Performance The fix appears not to add significant overhead to existing operations. It generally performs the same as the current code except in two cases, one in which it performs slightly worse (Case 5 below) and one in which it performs very significantly better (Case 2 below). Overall it's a wash except in those two cases. I've since done somewhat more involved testing on a dual-core Opteron system. Case 1: With no itimer running, for a test with 100,000 threads, the fixed kernel took 1428.5 seconds, 513 seconds more than the unfixed system, all of which was spent in the system. There were twice as many voluntary context switches with the fix as without it. Case 2: With an itimer running at .01 second ticks and 4000 threads (the most an unmodified kernel can handle), the fixed kernel ran the test in eight percent of the time (5.8 seconds as opposed to 70 seconds) and had better tick accuracy (.012 seconds per tick as opposed to .023 seconds per tick). Case 3: A 4000-thread test with an initial timer tick of .01 second and an interval of 10,000 seconds (i.e. a timer that ticks only once) had very nearly the same performance in both cases: 6.3 seconds elapsed for the fixed kernel versus 5.5 seconds for the unfixed kernel. With fewer threads (eight in these tests), the Case 1 test ran in essentially the same time on both the modified and unmodified kernels (5.2 seconds versus 5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds versus 5.4 seconds but again with much better tick accuracy, .013 seconds per tick versus .025 seconds per tick for the unmodified kernel. Since the fix affected the rlimit code, I also tested soft and hard CPU limits. Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer running), the modified kernel was very slightly favored in that while it killed the process in 19.997 seconds of CPU time (5.002 seconds of wall time), only .003 seconds of that was system time, the rest was user time. The unmodified kernel killed the process in 20.001 seconds of CPU (5.014 seconds of wall time) of which .016 seconds was system time. Really, though, the results were too close to call. The results were essentially the same with no itimer running. Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds (where the hard limit would never be reached) and an itimer running, the modified kernel exhibited worse tick accuracy than the unmodified kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise, performance was almost indistinguishable. With no itimer running this test exhibited virtually identical behavior and times in both cases. In times past I did some limited performance testing. those results are below. On a four-cpu Opteron system without this fix, a sixteen-thread test executed in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On the same system with the fix, user and elapsed time were about the same, but system time dropped to 0.007 seconds. Performance with eight, four and one thread were comparable. Interestingly, the timer ticks with the fix seemed more accurate: The sixteen-thread test with the fix received 149543 ticks for 0.024 seconds per tick, while the same test without the fix received 58720 for 0.061 seconds per tick. Both cases were configured for an interval of 0.01 seconds. Again, the other tests were comparable. Each thread in this test computed the primes up to 25,000,000. I also did a test with a large number of threads, 100,000 threads, which is impossible without the fix. In this case each thread computed the primes only up to 10,000 (to make the runtime manageable). System time dominated, at 1546.968 seconds out of a total 2176.906 seconds (giving a user time of 629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite accurate. There is obviously no comparable test without the fix. Signed-off-by: Frank Mayhar <fmayhar@google.com> Cc: Roland McGrath <roland@redhat.com> Cc: Alexey Dobriyan <adobriyan@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
*/
struct thread_group_cputimer cputimer;
timers: fix itimer/many thread hang Overview This patch reworks the handling of POSIX CPU timers, including the ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together with the help of Roland McGrath, the owner and original writer of this code. The problem we ran into, and the reason for this rework, has to do with using a profiling timer in a process with a large number of threads. It appears that the performance of the old implementation of run_posix_cpu_timers() was at least O(n*3) (where "n" is the number of threads in a process) or worse. Everything is fine with an increasing number of threads until the time taken for that routine to run becomes the same as or greater than the tick time, at which point things degrade rather quickly. This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF." Code Changes This rework corrects the implementation of run_posix_cpu_timers() to make it run in constant time for a particular machine. (Performance may vary between one machine and another depending upon whether the kernel is built as single- or multiprocessor and, in the latter case, depending upon the number of running processors.) To do this, at each tick we now update fields in signal_struct as well as task_struct. The run_posix_cpu_timers() function uses those fields to make its decisions. We define a new structure, "task_cputime," to contain user, system and scheduler times and use these in appropriate places: struct task_cputime { cputime_t utime; cputime_t stime; unsigned long long sum_exec_runtime; }; This is included in the structure "thread_group_cputime," which is a new substructure of signal_struct and which varies for uniprocessor versus multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as a simple substructure, while for multiprocessor kernels it is a pointer: struct thread_group_cputime { struct task_cputime totals; }; struct thread_group_cputime { struct task_cputime *totals; }; We also add a new task_cputime substructure directly to signal_struct, to cache the earliest expiration of process-wide timers, and task_cputime also replaces the it_*_expires fields of task_struct (used for earliest expiration of thread timers). The "thread_group_cputime" structure contains process-wide timers that are updated via account_user_time() and friends. In the non-SMP case the structure is a simple aggregator; unfortunately in the SMP case that simplicity was not achievable due to cache-line contention between CPUs (in one measured case performance was actually _worse_ on a 16-cpu system than the same test on a 4-cpu system, due to this contention). For SMP, the thread_group_cputime counters are maintained as a per-cpu structure allocated using alloc_percpu(). The timer functions update only the timer field in the structure corresponding to the running CPU, obtained using per_cpu_ptr(). We define a set of inline functions in sched.h that we use to maintain the thread_group_cputime structure and hide the differences between UP and SMP implementations from the rest of the kernel. The thread_group_cputime_init() function initializes the thread_group_cputime structure for the given task. The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the out-of-line function thread_group_cputime_alloc_smp() to allocate and fill in the per-cpu structures and fields. The thread_group_cputime_free() function, also a no-op for UP, in SMP frees the per-cpu structures. The thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls thread_group_cputime_alloc() if the per-cpu structures haven't yet been allocated. The thread_group_cputime() function fills the task_cputime structure it is passed with the contents of the thread_group_cputime fields; in UP it's that simple but in SMP it must also safely check that tsk->signal is non-NULL (if it is it just uses the appropriate fields of task_struct) and, if so, sums the per-cpu values for each online CPU. Finally, the three functions account_group_user_time(), account_group_system_time() and account_group_exec_runtime() are used by timer functions to update the respective fields of the thread_group_cputime structure. Non-SMP operation is trivial and will not be mentioned further. The per-cpu structure is always allocated when a task creates its first new thread, via a call to thread_group_cputime_clone_thread() from copy_signal(). It is freed at process exit via a call to thread_group_cputime_free() from cleanup_signal(). All functions that formerly summed utime/stime/sum_sched_runtime values from from all threads in the thread group now use thread_group_cputime() to snapshot the values in the thread_group_cputime structure or the values in the task structure itself if the per-cpu structure hasn't been allocated. Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit. The run_posix_cpu_timers() function has been split into a fast path and a slow path; the former safely checks whether there are any expired thread timers and, if not, just returns, while the slow path does the heavy lifting. With the dedicated thread group fields, timers are no longer "rebalanced" and the process_timer_rebalance() function and related code has gone away. All summing loops are gone and all code that used them now uses the thread_group_cputime() inline. When process-wide timers are set, the new task_cputime structure in signal_struct is used to cache the earliest expiration; this is checked in the fast path. Performance The fix appears not to add significant overhead to existing operations. It generally performs the same as the current code except in two cases, one in which it performs slightly worse (Case 5 below) and one in which it performs very significantly better (Case 2 below). Overall it's a wash except in those two cases. I've since done somewhat more involved testing on a dual-core Opteron system. Case 1: With no itimer running, for a test with 100,000 threads, the fixed kernel took 1428.5 seconds, 513 seconds more than the unfixed system, all of which was spent in the system. There were twice as many voluntary context switches with the fix as without it. Case 2: With an itimer running at .01 second ticks and 4000 threads (the most an unmodified kernel can handle), the fixed kernel ran the test in eight percent of the time (5.8 seconds as opposed to 70 seconds) and had better tick accuracy (.012 seconds per tick as opposed to .023 seconds per tick). Case 3: A 4000-thread test with an initial timer tick of .01 second and an interval of 10,000 seconds (i.e. a timer that ticks only once) had very nearly the same performance in both cases: 6.3 seconds elapsed for the fixed kernel versus 5.5 seconds for the unfixed kernel. With fewer threads (eight in these tests), the Case 1 test ran in essentially the same time on both the modified and unmodified kernels (5.2 seconds versus 5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds versus 5.4 seconds but again with much better tick accuracy, .013 seconds per tick versus .025 seconds per tick for the unmodified kernel. Since the fix affected the rlimit code, I also tested soft and hard CPU limits. Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer running), the modified kernel was very slightly favored in that while it killed the process in 19.997 seconds of CPU time (5.002 seconds of wall time), only .003 seconds of that was system time, the rest was user time. The unmodified kernel killed the process in 20.001 seconds of CPU (5.014 seconds of wall time) of which .016 seconds was system time. Really, though, the results were too close to call. The results were essentially the same with no itimer running. Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds (where the hard limit would never be reached) and an itimer running, the modified kernel exhibited worse tick accuracy than the unmodified kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise, performance was almost indistinguishable. With no itimer running this test exhibited virtually identical behavior and times in both cases. In times past I did some limited performance testing. those results are below. On a four-cpu Opteron system without this fix, a sixteen-thread test executed in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On the same system with the fix, user and elapsed time were about the same, but system time dropped to 0.007 seconds. Performance with eight, four and one thread were comparable. Interestingly, the timer ticks with the fix seemed more accurate: The sixteen-thread test with the fix received 149543 ticks for 0.024 seconds per tick, while the same test without the fix received 58720 for 0.061 seconds per tick. Both cases were configured for an interval of 0.01 seconds. Again, the other tests were comparable. Each thread in this test computed the primes up to 25,000,000. I also did a test with a large number of threads, 100,000 threads, which is impossible without the fix. In this case each thread computed the primes only up to 10,000 (to make the runtime manageable). System time dominated, at 1546.968 seconds out of a total 2176.906 seconds (giving a user time of 629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite accurate. There is obviously no comparable test without the fix. Signed-off-by: Frank Mayhar <fmayhar@google.com> Cc: Roland McGrath <roland@redhat.com> Cc: Alexey Dobriyan <adobriyan@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
/* Earliest-expiration cache. */
struct task_cputime cputime_expires;
struct list_head cpu_timers[3];
struct pid *tty_old_pgrp;
/* boolean value for session group leader */
int leader;
struct tty_struct *tty; /* NULL if no tty */
sched: Add 'autogroup' scheduling feature: automated per session task groups A recurring complaint from CFS users is that parallel kbuild has a negative impact on desktop interactivity. This patch implements an idea from Linus, to automatically create task groups. Currently, only per session autogroups are implemented, but the patch leaves the way open for enhancement. Implementation: each task's signal struct contains an inherited pointer to a refcounted autogroup struct containing a task group pointer, the default for all tasks pointing to the init_task_group. When a task calls setsid(), a new task group is created, the process is moved into the new task group, and a reference to the preveious task group is dropped. Child processes inherit this task group thereafter, and increase it's refcount. When the last thread of a process exits, the process's reference is dropped, such that when the last process referencing an autogroup exits, the autogroup is destroyed. At runqueue selection time, IFF a task has no cgroup assignment, its current autogroup is used. Autogroup bandwidth is controllable via setting it's nice level through the proc filesystem: cat /proc/<pid>/autogroup Displays the task's group and the group's nice level. echo <nice level> > /proc/<pid>/autogroup Sets the task group's shares to the weight of nice <level> task. Setting nice level is rate limited for !admin users due to the abuse risk of task group locking. The feature is enabled from boot by default if CONFIG_SCHED_AUTOGROUP=y is selected, but can be disabled via the boot option noautogroup, and can also be turned on/off on the fly via: echo [01] > /proc/sys/kernel/sched_autogroup_enabled ... which will automatically move tasks to/from the root task group. Signed-off-by: Mike Galbraith <efault@gmx.de> Acked-by: Linus Torvalds <torvalds@linux-foundation.org> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Paul Turner <pjt@google.com> Cc: Oleg Nesterov <oleg@redhat.com> [ Removed the task_group_path() debug code, and fixed !EVENTFD build failure. ] Signed-off-by: Ingo Molnar <mingo@elte.hu> LKML-Reference: <1290281700.28711.9.camel@maggy.simson.net> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-11-30 21:18:03 +08:00
#ifdef CONFIG_SCHED_AUTOGROUP
struct autogroup *autogroup;
#endif
/*
* Cumulative resource counters for dead threads in the group,
* and for reaped dead child processes forked by this group.
* Live threads maintain their own counters and add to these
* in __exit_signal, except for the group leader.
*/
cputime_t utime, stime, cutime, cstime;
cputime_t gtime;
cputime_t cgtime;
sched, cputime: Introduce thread_group_times() This is a real fix for problem of utime/stime values decreasing described in the thread: http://lkml.org/lkml/2009/11/3/522 Now cputime is accounted in the following way: - {u,s}time in task_struct are increased every time when the thread is interrupted by a tick (timer interrupt). - When a thread exits, its {u,s}time are added to signal->{u,s}time, after adjusted by task_times(). - When all threads in a thread_group exits, accumulated {u,s}time (and also c{u,s}time) in signal struct are added to c{u,s}time in signal struct of the group's parent. So {u,s}time in task struct are "raw" tick count, while {u,s}time and c{u,s}time in signal struct are "adjusted" values. And accounted values are used by: - task_times(), to get cputime of a thread: This function returns adjusted values that originates from raw {u,s}time and scaled by sum_exec_runtime that accounted by CFS. - thread_group_cputime(), to get cputime of a thread group: This function returns sum of all {u,s}time of living threads in the group, plus {u,s}time in the signal struct that is sum of adjusted cputimes of all exited threads belonged to the group. The problem is the return value of thread_group_cputime(), because it is mixed sum of "raw" value and "adjusted" value: group's {u,s}time = foreach(thread){{u,s}time} + exited({u,s}time) This misbehavior can break {u,s}time monotonicity. Assume that if there is a thread that have raw values greater than adjusted values (e.g. interrupted by 1000Hz ticks 50 times but only runs 45ms) and if it exits, cputime will decrease (e.g. -5ms). To fix this, we could do: group's {u,s}time = foreach(t){task_times(t)} + exited({u,s}time) But task_times() contains hard divisions, so applying it for every thread should be avoided. This patch fixes the above problem in the following way: - Modify thread's exit (= __exit_signal()) not to use task_times(). It means {u,s}time in signal struct accumulates raw values instead of adjusted values. As the result it makes thread_group_cputime() to return pure sum of "raw" values. - Introduce a new function thread_group_times(*task, *utime, *stime) that converts "raw" values of thread_group_cputime() to "adjusted" values, in same calculation procedure as task_times(). - Modify group's exit (= wait_task_zombie()) to use this introduced thread_group_times(). It make c{u,s}time in signal struct to have adjusted values like before this patch. - Replace some thread_group_cputime() by thread_group_times(). This replacements are only applied where conveys the "adjusted" cputime to users, and where already uses task_times() near by it. (i.e. sys_times(), getrusage(), and /proc/<PID>/stat.) This patch have a positive side effect: - Before this patch, if a group contains many short-life threads (e.g. runs 0.9ms and not interrupted by ticks), the group's cputime could be invisible since thread's cputime was accumulated after adjusted: imagine adjustment function as adj(ticks, runtime), {adj(0, 0.9) + adj(0, 0.9) + ....} = {0 + 0 + ....} = 0. After this patch it will not happen because the adjustment is applied after accumulated. v2: - remove if()s, put new variables into signal_struct. Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com> Acked-by: Peter Zijlstra <peterz@infradead.org> Cc: Spencer Candland <spencer@bluehost.com> Cc: Americo Wang <xiyou.wangcong@gmail.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Stanislaw Gruszka <sgruszka@redhat.com> LKML-Reference: <4B162517.8040909@jp.fujitsu.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-12-02 16:28:07 +08:00
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
cputime_t prev_utime, prev_stime;
#endif
unsigned long nvcsw, nivcsw, cnvcsw, cnivcsw;
unsigned long min_flt, maj_flt, cmin_flt, cmaj_flt;
unsigned long inblock, oublock, cinblock, coublock;
getrusage: fill ru_maxrss value Make ->ru_maxrss value in struct rusage filled accordingly to rss hiwater mark. This struct is filled as a parameter to getrusage syscall. ->ru_maxrss value is set to KBs which is the way it is done in BSD systems. /usr/bin/time (gnu time) application converts ->ru_maxrss to KBs which seems to be incorrect behavior. Maintainer of this util was notified by me with the patch which corrects it and cc'ed. To make this happen we extend struct signal_struct by two fields. The first one is ->maxrss which we use to store rss hiwater of the task. The second one is ->cmaxrss which we use to store highest rss hiwater of all task childs. These values are used in k_getrusage() to actually fill ->ru_maxrss. k_getrusage() uses current rss hiwater value directly if mm struct exists. Note: exec() clear mm->hiwater_rss, but doesn't clear sig->maxrss. it is intetionally behavior. *BSD getrusage have exec() inheriting. test programs ======================================================== getrusage.c =========== #include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/types.h> #include <sys/time.h> #include <sys/resource.h> #include <sys/types.h> #include <sys/wait.h> #include <unistd.h> #include <signal.h> #include <sys/mman.h> #include "common.h" #define err(str) perror(str), exit(1) int main(int argc, char** argv) { int status; printf("allocate 100MB\n"); consume(100); printf("testcase1: fork inherit? \n"); printf(" expect: initial.self ~= child.self\n"); show_rusage("initial"); if (__fork()) { wait(&status); } else { show_rusage("fork child"); _exit(0); } printf("\n"); printf("testcase2: fork inherit? (cont.) \n"); printf(" expect: initial.children ~= 100MB, but child.children = 0\n"); show_rusage("initial"); if (__fork()) { wait(&status); } else { show_rusage("child"); _exit(0); } printf("\n"); printf("testcase3: fork + malloc \n"); printf(" expect: child.self ~= initial.self + 50MB\n"); show_rusage("initial"); if (__fork()) { wait(&status); } else { printf("allocate +50MB\n"); consume(50); show_rusage("fork child"); _exit(0); } printf("\n"); printf("testcase4: grandchild maxrss\n"); printf(" expect: post_wait.children ~= 300MB\n"); show_rusage("initial"); if (__fork()) { wait(&status); show_rusage("post_wait"); } else { system("./child -n 0 -g 300"); _exit(0); } printf("\n"); printf("testcase5: zombie\n"); printf(" expect: pre_wait ~= initial, IOW the zombie process is not accounted.\n"); printf(" post_wait ~= 400MB, IOW wait() collect child's max_rss. \n"); show_rusage("initial"); if (__fork()) { sleep(1); /* children become zombie */ show_rusage("pre_wait"); wait(&status); show_rusage("post_wait"); } else { system("./child -n 400"); _exit(0); } printf("\n"); printf("testcase6: SIG_IGN\n"); printf(" expect: initial ~= after_zombie (child's 500MB alloc should be ignored).\n"); show_rusage("initial"); signal(SIGCHLD, SIG_IGN); if (__fork()) { sleep(1); /* children become zombie */ show_rusage("after_zombie"); } else { system("./child -n 500"); _exit(0); } printf("\n"); signal(SIGCHLD, SIG_DFL); printf("testcase7: exec (without fork) \n"); printf(" expect: initial ~= exec \n"); show_rusage("initial"); execl("./child", "child", "-v", NULL); return 0; } child.c ======= #include <sys/types.h> #include <unistd.h> #include <sys/types.h> #include <sys/wait.h> #include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/types.h> #include <sys/time.h> #include <sys/resource.h> #include "common.h" int main(int argc, char** argv) { int status; int c; long consume_size = 0; long grandchild_consume_size = 0; int show = 0; while ((c = getopt(argc, argv, "n:g:v")) != -1) { switch (c) { case 'n': consume_size = atol(optarg); break; case 'v': show = 1; break; case 'g': grandchild_consume_size = atol(optarg); break; default: break; } } if (show) show_rusage("exec"); if (consume_size) { printf("child alloc %ldMB\n", consume_size); consume(consume_size); } if (grandchild_consume_size) { if (fork()) { wait(&status); } else { printf("grandchild alloc %ldMB\n", grandchild_consume_size); consume(grandchild_consume_size); exit(0); } } return 0; } common.c ======== #include <stdio.h> #include <stdlib.h> #include <string.h> #include <sys/types.h> #include <sys/time.h> #include <sys/resource.h> #include <sys/types.h> #include <sys/wait.h> #include <unistd.h> #include <signal.h> #include <sys/mman.h> #include "common.h" #define err(str) perror(str), exit(1) void show_rusage(char *prefix) { int err, err2; struct rusage rusage_self; struct rusage rusage_children; printf("%s: ", prefix); err = getrusage(RUSAGE_SELF, &rusage_self); if (!err) printf("self %ld ", rusage_self.ru_maxrss); err2 = getrusage(RUSAGE_CHILDREN, &rusage_children); if (!err2) printf("children %ld ", rusage_children.ru_maxrss); printf("\n"); } /* Some buggy OS need this worthless CPU waste. */ void make_pagefault(void) { void *addr; int size = getpagesize(); int i; for (i=0; i<1000; i++) { addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0); if (addr == MAP_FAILED) err("make_pagefault"); memset(addr, 0, size); munmap(addr, size); } } void consume(int mega) { size_t sz = mega * 1024 * 1024; void *ptr; ptr = malloc(sz); memset(ptr, 0, sz); make_pagefault(); } pid_t __fork(void) { pid_t pid; pid = fork(); make_pagefault(); return pid; } common.h ======== void show_rusage(char *prefix); void make_pagefault(void); void consume(int mega); pid_t __fork(void); FreeBSD result (expected result) ======================================================== allocate 100MB testcase1: fork inherit? expect: initial.self ~= child.self initial: self 103492 children 0 fork child: self 103540 children 0 testcase2: fork inherit? (cont.) expect: initial.children ~= 100MB, but child.children = 0 initial: self 103540 children 103540 child: self 103564 children 0 testcase3: fork + malloc expect: child.self ~= initial.self + 50MB initial: self 103564 children 103564 allocate +50MB fork child: self 154860 children 0 testcase4: grandchild maxrss expect: post_wait.children ~= 300MB initial: self 103564 children 154860 grandchild alloc 300MB post_wait: self 103564 children 308720 testcase5: zombie expect: pre_wait ~= initial, IOW the zombie process is not accounted. post_wait ~= 400MB, IOW wait() collect child's max_rss. initial: self 103564 children 308720 child alloc 400MB pre_wait: self 103564 children 308720 post_wait: self 103564 children 411312 testcase6: SIG_IGN expect: initial ~= after_zombie (child's 500MB alloc should be ignored). initial: self 103564 children 411312 child alloc 500MB after_zombie: self 103624 children 411312 testcase7: exec (without fork) expect: initial ~= exec initial: self 103624 children 411312 exec: self 103624 children 411312 Linux result (actual test result) ======================================================== allocate 100MB testcase1: fork inherit? expect: initial.self ~= child.self initial: self 102848 children 0 fork child: self 102572 children 0 testcase2: fork inherit? (cont.) expect: initial.children ~= 100MB, but child.children = 0 initial: self 102876 children 102644 child: self 102572 children 0 testcase3: fork + malloc expect: child.self ~= initial.self + 50MB initial: self 102876 children 102644 allocate +50MB fork child: self 153804 children 0 testcase4: grandchild maxrss expect: post_wait.children ~= 300MB initial: self 102876 children 153864 grandchild alloc 300MB post_wait: self 102876 children 307536 testcase5: zombie expect: pre_wait ~= initial, IOW the zombie process is not accounted. post_wait ~= 400MB, IOW wait() collect child's max_rss. initial: self 102876 children 307536 child alloc 400MB pre_wait: self 102876 children 307536 post_wait: self 102876 children 410076 testcase6: SIG_IGN expect: initial ~= after_zombie (child's 500MB alloc should be ignored). initial: self 102876 children 410076 child alloc 500MB after_zombie: self 102880 children 410076 testcase7: exec (without fork) expect: initial ~= exec initial: self 102880 children 410076 exec: self 102880 children 410076 Signed-off-by: Jiri Pirko <jpirko@redhat.com> Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk> Cc: Ingo Molnar <mingo@elte.hu> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-23 07:44:10 +08:00
unsigned long maxrss, cmaxrss;
struct task_io_accounting ioac;
/*
* Cumulative ns of schedule CPU time fo dead threads in the
* group, not including a zombie group leader, (This only differs
* from jiffies_to_ns(utime + stime) if sched_clock uses something
* other than jiffies.)
*/
unsigned long long sum_sched_runtime;
/*
* We don't bother to synchronize most readers of this at all,
* because there is no reader checking a limit that actually needs
* to get both rlim_cur and rlim_max atomically, and either one
* alone is a single word that can safely be read normally.
* getrlimit/setrlimit use task_lock(current->group_leader) to
* protect this instead of the siglock, because they really
* have no need to disable irqs.
*/
struct rlimit rlim[RLIM_NLIMITS];
#ifdef CONFIG_BSD_PROCESS_ACCT
struct pacct_struct pacct; /* per-process accounting information */
#endif
#ifdef CONFIG_TASKSTATS
struct taskstats *stats;
#endif
Audit: add TTY input auditing Add TTY input auditing, used to audit system administrator's actions. This is required by various security standards such as DCID 6/3 and PCI to provide non-repudiation of administrator's actions and to allow a review of past actions if the administrator seems to overstep their duties or if the system becomes misconfigured for unknown reasons. These requirements do not make it necessary to audit TTY output as well. Compared to an user-space keylogger, this approach records TTY input using the audit subsystem, correlated with other audit events, and it is completely transparent to the user-space application (e.g. the console ioctls still work). TTY input auditing works on a higher level than auditing all system calls within the session, which would produce an overwhelming amount of mostly useless audit events. Add an "audit_tty" attribute, inherited across fork (). Data read from TTYs by process with the attribute is sent to the audit subsystem by the kernel. The audit netlink interface is extended to allow modifying the audit_tty attribute, and to allow sending explanatory audit events from user-space (for example, a shell might send an event containing the final command, after the interactive command-line editing and history expansion is performed, which might be difficult to decipher from the TTY input alone). Because the "audit_tty" attribute is inherited across fork (), it would be set e.g. for sshd restarted within an audited session. To prevent this, the audit_tty attribute is cleared when a process with no open TTY file descriptors (e.g. after daemon startup) opens a TTY. See https://www.redhat.com/archives/linux-audit/2007-June/msg00000.html for a more detailed rationale document for an older version of this patch. [akpm@linux-foundation.org: build fix] Signed-off-by: Miloslav Trmac <mitr@redhat.com> Cc: Al Viro <viro@zeniv.linux.org.uk> Cc: Alan Cox <alan@lxorguk.ukuu.org.uk> Cc: Paul Fulghum <paulkf@microgate.com> Cc: Casey Schaufler <casey@schaufler-ca.com> Cc: Steve Grubb <sgrubb@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-07-16 14:40:56 +08:00
#ifdef CONFIG_AUDIT
unsigned audit_tty;
struct tty_audit_buf *tty_audit_buf;
#endif
#ifdef CONFIG_CGROUPS
/*
threadgroup: extend threadgroup_lock() to cover exit and exec threadgroup_lock() protected only protected against new addition to the threadgroup, which was inherently somewhat incomplete and problematic for its only user cgroup. On-going migration could race against exec and exit leading to interesting problems - the symmetry between various attach methods, task exiting during method execution, ->exit() racing against attach methods, migrating task switching basic properties during exec and so on. This patch extends threadgroup_lock() such that it protects against all three threadgroup altering operations - fork, exit and exec. For exit, threadgroup_change_begin/end() calls are added to exit_signals around assertion of PF_EXITING. For exec, threadgroup_[un]lock() are updated to also grab and release cred_guard_mutex. With this change, threadgroup_lock() guarantees that the target threadgroup will remain stable - no new task will be added, no new PF_EXITING will be set and exec won't happen. The next patch will update cgroup so that it can take full advantage of this change. -v2: beefed up comment as suggested by Frederic. -v3: narrowed scope of protection in exit path as suggested by Frederic. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Acked-by: Li Zefan <lizf@cn.fujitsu.com> Acked-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Paul Menage <paul@paulmenage.org> Cc: Linus Torvalds <torvalds@linux-foundation.org>
2011-12-13 10:12:21 +08:00
* group_rwsem prevents new tasks from entering the threadgroup and
* member tasks from exiting,a more specifically, setting of
* PF_EXITING. fork and exit paths are protected with this rwsem
* using threadgroup_change_begin/end(). Users which require
* threadgroup to remain stable should use threadgroup_[un]lock()
* which also takes care of exec path. Currently, cgroup is the
* only user.
*/
struct rw_semaphore group_rwsem;
#endif
oom: move oom_adj value from task_struct to signal_struct Currently, OOM logic callflow is here. __out_of_memory() select_bad_process() for each task badness() calculate badness of one task oom_kill_process() search child oom_kill_task() kill target task and mm shared tasks with it example, process-A have two thread, thread-A and thread-B and it have very fat memory and each thread have following oom_adj and oom_score. thread-A: oom_adj = OOM_DISABLE, oom_score = 0 thread-B: oom_adj = 0, oom_score = very-high Then, select_bad_process() select thread-B, but oom_kill_task() refuse kill the task because thread-A have OOM_DISABLE. Thus __out_of_memory() call select_bad_process() again. but select_bad_process() select the same task. It mean kernel fall in livelock. The fact is, select_bad_process() must select killable task. otherwise OOM logic go into livelock. And root cause is, oom_adj shouldn't be per-thread value. it should be per-process value because OOM-killer kill a process, not thread. Thus This patch moves oomkilladj (now more appropriately named oom_adj) from struct task_struct to struct signal_struct. it naturally prevent select_bad_process() choose wrong task. Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Paul Menage <menage@google.com> Cc: David Rientjes <rientjes@google.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Rik van Riel <riel@redhat.com> Cc: Oleg Nesterov <oleg@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-22 08:03:13 +08:00
oom: badness heuristic rewrite This a complete rewrite of the oom killer's badness() heuristic which is used to determine which task to kill in oom conditions. The goal is to make it as simple and predictable as possible so the results are better understood and we end up killing the task which will lead to the most memory freeing while still respecting the fine-tuning from userspace. Instead of basing the heuristic on mm->total_vm for each task, the task's rss and swap space is used instead. This is a better indication of the amount of memory that will be freeable if the oom killed task is chosen and subsequently exits. This helps specifically in cases where KDE or GNOME is chosen for oom kill on desktop systems instead of a memory hogging task. The baseline for the heuristic is a proportion of memory that each task is currently using in memory plus swap compared to the amount of "allowable" memory. "Allowable," in this sense, means the system-wide resources for unconstrained oom conditions, the set of mempolicy nodes, the mems attached to current's cpuset, or a memory controller's limit. The proportion is given on a scale of 0 (never kill) to 1000 (always kill), roughly meaning that if a task has a badness() score of 500 that the task consumes approximately 50% of allowable memory resident in RAM or in swap space. The proportion is always relative to the amount of "allowable" memory and not the total amount of RAM systemwide so that mempolicies and cpusets may operate in isolation; they shall not need to know the true size of the machine on which they are running if they are bound to a specific set of nodes or mems, respectively. Root tasks are given 3% extra memory just like __vm_enough_memory() provides in LSMs. In the event of two tasks consuming similar amounts of memory, it is generally better to save root's task. Because of the change in the badness() heuristic's baseline, it is also necessary to introduce a new user interface to tune it. It's not possible to redefine the meaning of /proc/pid/oom_adj with a new scale since the ABI cannot be changed for backward compatability. Instead, a new tunable, /proc/pid/oom_score_adj, is added that ranges from -1000 to +1000. It may be used to polarize the heuristic such that certain tasks are never considered for oom kill while others may always be considered. The value is added directly into the badness() score so a value of -500, for example, means to discount 50% of its memory consumption in comparison to other tasks either on the system, bound to the mempolicy, in the cpuset, or sharing the same memory controller. /proc/pid/oom_adj is changed so that its meaning is rescaled into the units used by /proc/pid/oom_score_adj, and vice versa. Changing one of these per-task tunables will rescale the value of the other to an equivalent meaning. Although /proc/pid/oom_adj was originally defined as a bitshift on the badness score, it now shares the same linear growth as /proc/pid/oom_score_adj but with different granularity. This is required so the ABI is not broken with userspace applications and allows oom_adj to be deprecated for future removal. Signed-off-by: David Rientjes <rientjes@google.com> Cc: Nick Piggin <npiggin@suse.de> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Balbir Singh <balbir@in.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-08-10 08:19:46 +08:00
int oom_adj; /* OOM kill score adjustment (bit shift) */
int oom_score_adj; /* OOM kill score adjustment */
int oom_score_adj_min; /* OOM kill score adjustment minimum value.
* Only settable by CAP_SYS_RESOURCE. */
struct mutex cred_guard_mutex; /* guard against foreign influences on
* credential calculations
* (notably. ptrace) */
};
/* Context switch must be unlocked if interrupts are to be enabled */
#ifdef __ARCH_WANT_INTERRUPTS_ON_CTXSW
# define __ARCH_WANT_UNLOCKED_CTXSW
#endif
/*
* Bits in flags field of signal_struct.
*/
#define SIGNAL_STOP_STOPPED 0x00000001 /* job control stop in effect */
2011-04-02 02:12:38 +08:00
#define SIGNAL_STOP_CONTINUED 0x00000002 /* SIGCONT since WCONTINUED reap */
#define SIGNAL_GROUP_EXIT 0x00000004 /* group exit in progress */
signals: re-assign CLD_CONTINUED notification from the sender to reciever Based on discussion with Jiri and Roland. In short: currently handle_stop_signal(SIGCONT, p) sends the notification to p->parent, with this patch p itself notifies its parent when it becomes running. handle_stop_signal(SIGCONT) has to drop ->siglock temporary in order to notify the parent with do_notify_parent_cldstop(). This leads to multiple problems: - as Jiri Kosina pointed out, the stopped task can resume without actually seeing SIGCONT which may have a handler. - we race with another sig_kernel_stop() signal which may come in that window. - we race with sig_fatal() signals which may set SIGNAL_GROUP_EXIT in that window. - we can't avoid taking tasklist_lock() while sending SIGCONT. With this patch handle_stop_signal() just sets the new SIGNAL_CLD_CONTINUED flag in p->signal->flags and returns. The notification is sent by the first task which returns from finish_stop() (there should be at least one) or any other signalled thread from get_signal_to_deliver(). This is a user-visible change. Say, currently kill(SIGCONT, stopped_child) can't return without seeing SIGCHLD, with this patch SIGCHLD can be delayed unpredictably. Another difference is that if the child is ptraced by another process, CLD_CONTINUED may be delivered to ->real_parent after ptrace_detach() while currently it always goes to the tracer which doesn't actually need this notification. Hopefully not a problem. The patch asks for the futher obvious cleanups, I'll send them separately. Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru> Cc: Roland McGrath <roland@redhat.com> Cc: Jiri Kosina <jkosina@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-30 15:52:44 +08:00
/*
* Pending notifications to parent.
*/
#define SIGNAL_CLD_STOPPED 0x00000010
#define SIGNAL_CLD_CONTINUED 0x00000020
#define SIGNAL_CLD_MASK (SIGNAL_CLD_STOPPED|SIGNAL_CLD_CONTINUED)
#define SIGNAL_UNKILLABLE 0x00000040 /* for init: ignore fatal signals */
/* If true, all threads except ->group_exit_task have pending SIGKILL */
static inline int signal_group_exit(const struct signal_struct *sig)
{
return (sig->flags & SIGNAL_GROUP_EXIT) ||
(sig->group_exit_task != NULL);
}
/*
* Some day this will be a full-fledged user tracking system..
*/
struct user_struct {
atomic_t __count; /* reference count */
atomic_t processes; /* How many processes does this user have? */
atomic_t files; /* How many open files does this user have? */
atomic_t sigpending; /* How many pending signals does this user have? */
#ifdef CONFIG_INOTIFY_USER
atomic_t inotify_watches; /* How many inotify watches does this user have? */
atomic_t inotify_devs; /* How many inotify devs does this user have opened? */
#endif
#ifdef CONFIG_FANOTIFY
atomic_t fanotify_listeners;
#endif
epoll: introduce resource usage limits It has been thought that the per-user file descriptors limit would also limit the resources that a normal user can request via the epoll interface. Vegard Nossum reported a very simple program (a modified version attached) that can make a normal user to request a pretty large amount of kernel memory, well within the its maximum number of fds. To solve such problem, default limits are now imposed, and /proc based configuration has been introduced. A new directory has been created, named /proc/sys/fs/epoll/ and inside there, there are two configuration points: max_user_instances = Maximum number of devices - per user max_user_watches = Maximum number of "watched" fds - per user The current default for "max_user_watches" limits the memory used by epoll to store "watches", to 1/32 of the amount of the low RAM. As example, a 256MB 32bit machine, will have "max_user_watches" set to roughly 90000. That should be enough to not break existing heavy epoll users. The default value for "max_user_instances" is set to 128, that should be enough too. This also changes the userspace, because a new error code can now come out from EPOLL_CTL_ADD (-ENOSPC). The EMFILE from epoll_create() was already listed, so that should be ok. [akpm@linux-foundation.org: use get_current_user()] Signed-off-by: Davide Libenzi <davidel@xmailserver.org> Cc: Michael Kerrisk <mtk.manpages@gmail.com> Cc: <stable@kernel.org> Cc: Cyrill Gorcunov <gorcunov@gmail.com> Reported-by: Vegard Nossum <vegardno@ifi.uio.no> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-12-02 05:13:55 +08:00
#ifdef CONFIG_EPOLL
atomic_long_t epoll_watches; /* The number of file descriptors currently watched */
epoll: introduce resource usage limits It has been thought that the per-user file descriptors limit would also limit the resources that a normal user can request via the epoll interface. Vegard Nossum reported a very simple program (a modified version attached) that can make a normal user to request a pretty large amount of kernel memory, well within the its maximum number of fds. To solve such problem, default limits are now imposed, and /proc based configuration has been introduced. A new directory has been created, named /proc/sys/fs/epoll/ and inside there, there are two configuration points: max_user_instances = Maximum number of devices - per user max_user_watches = Maximum number of "watched" fds - per user The current default for "max_user_watches" limits the memory used by epoll to store "watches", to 1/32 of the amount of the low RAM. As example, a 256MB 32bit machine, will have "max_user_watches" set to roughly 90000. That should be enough to not break existing heavy epoll users. The default value for "max_user_instances" is set to 128, that should be enough too. This also changes the userspace, because a new error code can now come out from EPOLL_CTL_ADD (-ENOSPC). The EMFILE from epoll_create() was already listed, so that should be ok. [akpm@linux-foundation.org: use get_current_user()] Signed-off-by: Davide Libenzi <davidel@xmailserver.org> Cc: Michael Kerrisk <mtk.manpages@gmail.com> Cc: <stable@kernel.org> Cc: Cyrill Gorcunov <gorcunov@gmail.com> Reported-by: Vegard Nossum <vegardno@ifi.uio.no> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-12-02 05:13:55 +08:00
#endif
#ifdef CONFIG_POSIX_MQUEUE
/* protected by mq_lock */
unsigned long mq_bytes; /* How many bytes can be allocated to mqueue? */
#endif
unsigned long locked_shm; /* How many pages of mlocked shm ? */
#ifdef CONFIG_KEYS
struct key *uid_keyring; /* UID specific keyring */
struct key *session_keyring; /* UID's default session keyring */
#endif
/* Hash table maintenance information */
struct hlist_node uidhash_node;
uid_t uid;
User namespaces: set of cleanups (v2) The user_ns is moved from nsproxy to user_struct, so that a struct cred by itself is sufficient to determine access (which it otherwise would not be). Corresponding ecryptfs fixes (by David Howells) are here as well. Fix refcounting. The following rules now apply: 1. The task pins the user struct. 2. The user struct pins its user namespace. 3. The user namespace pins the struct user which created it. User namespaces are cloned during copy_creds(). Unsharing a new user_ns is no longer possible. (We could re-add that, but it'll cause code duplication and doesn't seem useful if PAM doesn't need to clone user namespaces). When a user namespace is created, its first user (uid 0) gets empty keyrings and a clean group_info. This incorporates a previous patch by David Howells. Here is his original patch description: >I suggest adding the attached incremental patch. It makes the following >changes: > > (1) Provides a current_user_ns() macro to wrap accesses to current's user > namespace. > > (2) Fixes eCryptFS. > > (3) Renames create_new_userns() to create_user_ns() to be more consistent > with the other associated functions and because the 'new' in the name is > superfluous. > > (4) Moves the argument and permission checks made for CLONE_NEWUSER to the > beginning of do_fork() so that they're done prior to making any attempts > at allocation. > > (5) Calls create_user_ns() after prepare_creds(), and gives it the new creds > to fill in rather than have it return the new root user. I don't imagine > the new root user being used for anything other than filling in a cred > struct. > > This also permits me to get rid of a get_uid() and a free_uid(), as the > reference the creds were holding on the old user_struct can just be > transferred to the new namespace's creator pointer. > > (6) Makes create_user_ns() reset the UIDs and GIDs of the creds under > preparation rather than doing it in copy_creds(). > >David >Signed-off-by: David Howells <dhowells@redhat.com> Changelog: Oct 20: integrate dhowells comments 1. leave thread_keyring alone 2. use current_user_ns() in set_user() Signed-off-by: Serge Hallyn <serue@us.ibm.com>
2008-10-16 05:38:45 +08:00
struct user_namespace *user_ns;
perf: Do the big rename: Performance Counters -> Performance Events Bye-bye Performance Counters, welcome Performance Events! In the past few months the perfcounters subsystem has grown out its initial role of counting hardware events, and has become (and is becoming) a much broader generic event enumeration, reporting, logging, monitoring, analysis facility. Naming its core object 'perf_counter' and naming the subsystem 'perfcounters' has become more and more of a misnomer. With pending code like hw-breakpoints support the 'counter' name is less and less appropriate. All in one, we've decided to rename the subsystem to 'performance events' and to propagate this rename through all fields, variables and API names. (in an ABI compatible fashion) The word 'event' is also a bit shorter than 'counter' - which makes it slightly more convenient to write/handle as well. Thanks goes to Stephane Eranian who first observed this misnomer and suggested a rename. User-space tooling and ABI compatibility is not affected - this patch should be function-invariant. (Also, defconfigs were not touched to keep the size down.) This patch has been generated via the following script: FILES=$(find * -type f | grep -vE 'oprofile|[^K]config') sed -i \ -e 's/PERF_EVENT_/PERF_RECORD_/g' \ -e 's/PERF_COUNTER/PERF_EVENT/g' \ -e 's/perf_counter/perf_event/g' \ -e 's/nb_counters/nb_events/g' \ -e 's/swcounter/swevent/g' \ -e 's/tpcounter_event/tp_event/g' \ $FILES for N in $(find . -name perf_counter.[ch]); do M=$(echo $N | sed 's/perf_counter/perf_event/g') mv $N $M done FILES=$(find . -name perf_event.*) sed -i \ -e 's/COUNTER_MASK/REG_MASK/g' \ -e 's/COUNTER/EVENT/g' \ -e 's/\<event\>/event_id/g' \ -e 's/counter/event/g' \ -e 's/Counter/Event/g' \ $FILES ... to keep it as correct as possible. This script can also be used by anyone who has pending perfcounters patches - it converts a Linux kernel tree over to the new naming. We tried to time this change to the point in time where the amount of pending patches is the smallest: the end of the merge window. Namespace clashes were fixed up in a preparatory patch - and some stylistic fallout will be fixed up in a subsequent patch. ( NOTE: 'counters' are still the proper terminology when we deal with hardware registers - and these sed scripts are a bit over-eager in renaming them. I've undone some of that, but in case there's something left where 'counter' would be better than 'event' we can undo that on an individual basis instead of touching an otherwise nicely automated patch. ) Suggested-by: Stephane Eranian <eranian@google.com> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Acked-by: Paul Mackerras <paulus@samba.org> Reviewed-by: Arjan van de Ven <arjan@linux.intel.com> Cc: Mike Galbraith <efault@gmx.de> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: David Howells <dhowells@redhat.com> Cc: Kyle McMartin <kyle@mcmartin.ca> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: "David S. Miller" <davem@davemloft.net> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: <linux-arch@vger.kernel.org> LKML-Reference: <new-submission> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-21 18:02:48 +08:00
#ifdef CONFIG_PERF_EVENTS
atomic_long_t locked_vm;
#endif
};
extern int uids_sysfs_init(void);
extern struct user_struct *find_user(uid_t);
extern struct user_struct root_user;
#define INIT_USER (&root_user)
struct backing_dev_info;
struct reclaim_state;
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
struct sched_info {
/* cumulative counters */
unsigned long pcount; /* # of times run on this cpu */
unsigned long long run_delay; /* time spent waiting on a runqueue */
/* timestamps */
unsigned long long last_arrival,/* when we last ran on a cpu */
last_queued; /* when we were last queued to run */
};
#endif /* defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT) */
#ifdef CONFIG_TASK_DELAY_ACCT
struct task_delay_info {
spinlock_t lock;
unsigned int flags; /* Private per-task flags */
/* For each stat XXX, add following, aligned appropriately
*
* struct timespec XXX_start, XXX_end;
* u64 XXX_delay;
* u32 XXX_count;
*
* Atomicity of updates to XXX_delay, XXX_count protected by
* single lock above (split into XXX_lock if contention is an issue).
*/
/*
* XXX_count is incremented on every XXX operation, the delay
* associated with the operation is added to XXX_delay.
* XXX_delay contains the accumulated delay time in nanoseconds.
*/
struct timespec blkio_start, blkio_end; /* Shared by blkio, swapin */
u64 blkio_delay; /* wait for sync block io completion */
u64 swapin_delay; /* wait for swapin block io completion */
u32 blkio_count; /* total count of the number of sync block */
/* io operations performed */
u32 swapin_count; /* total count of the number of swapin block */
/* io operations performed */
per-task-delay-accounting: add memory reclaim delay Sometimes, application responses become bad under heavy memory load. Applications take a bit time to reclaim memory. The statistics, how long memory reclaim takes, will be useful to measure memory usage. This patch adds accounting memory reclaim to per-task-delay-accounting for accounting the time of do_try_to_free_pages(). <i.e> - When System is under low memory load, memory reclaim may not occur. $ free total used free shared buffers cached Mem: 8197800 1577300 6620500 0 4808 1516724 -/+ buffers/cache: 55768 8142032 Swap: 16386292 0 16386292 $ vmstat 1 procs -----------memory---------- ---swap-- -----io---- -system-- ----cpu---- r b swpd free buff cache si so bi bo in cs us sy id wa 0 0 0 5069748 10612 3014060 0 0 0 0 3 26 0 0 100 0 0 0 0 5069748 10612 3014060 0 0 0 0 4 22 0 0 100 0 0 0 0 5069748 10612 3014060 0 0 0 0 3 18 0 0 100 0 Measure the time of tar command. $ ls -s test.dat 1501472 test.dat $ time tar cvf test.tar test.dat real 0m13.388s user 0m0.116s sys 0m5.304s $ ./delayget -d -p <pid> CPU count real total virtual total delay total 428 5528345500 5477116080 62749891 IO count delay total 338 8078977189 SWAP count delay total 0 0 RECLAIM count delay total 0 0 - When system is under heavy memory load memory reclaim may occur. $ vmstat 1 procs -----------memory---------- ---swap-- -----io---- -system-- ----cpu---- r b swpd free buff cache si so bi bo in cs us sy id wa 0 0 7159032 49724 1812 3012 0 0 0 0 3 24 0 0 100 0 0 0 7159032 49724 1812 3012 0 0 0 0 4 24 0 0 100 0 0 0 7159032 49848 1812 3012 0 0 0 0 3 22 0 0 100 0 In this case, one process uses more 8G memory by execution of malloc() and memset(). $ time tar cvf test.tar test.dat real 1m38.563s <- increased by 85 sec user 0m0.140s sys 0m7.060s $ ./delayget -d -p <pid> CPU count real total virtual total delay total 9021 7140446250 7315277975 923201824 IO count delay total 8965 90466349669 SWAP count delay total 3 21036367 RECLAIM count delay total 740 61011951153 In the later case, the value of RECLAIM is increasing. So, taskstats can show how much memory reclaim influences TAT. Signed-off-by: Keika Kobayashi <kobayashi.kk@ncos.nec.co.jp> Acked-by: Balbir Singh <balbir@linux.vnet.ibm.com> Acked-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujistu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-25 16:48:52 +08:00
struct timespec freepages_start, freepages_end;
u64 freepages_delay; /* wait for memory reclaim */
u32 freepages_count; /* total count of memory reclaim */
};
#endif /* CONFIG_TASK_DELAY_ACCT */
static inline int sched_info_on(void)
{
#ifdef CONFIG_SCHEDSTATS
return 1;
#elif defined(CONFIG_TASK_DELAY_ACCT)
extern int delayacct_on;
return delayacct_on;
#else
return 0;
#endif
}
enum cpu_idle_type {
CPU_IDLE,
CPU_NOT_IDLE,
CPU_NEWLY_IDLE,
CPU_MAX_IDLE_TYPES
};
/*
sched: Increase SCHED_LOAD_SCALE resolution Introduce SCHED_LOAD_RESOLUTION, which scales is added to SCHED_LOAD_SHIFT and increases the resolution of SCHED_LOAD_SCALE. This patch sets the value of SCHED_LOAD_RESOLUTION to 10, scaling up the weights for all sched entities by a factor of 1024. With this extra resolution, we can handle deeper cgroup hiearchies and the scheduler can do better shares distribution and load load balancing on larger systems (especially for low weight task groups). This does not change the existing user interface, the scaled weights are only used internally. We do not modify prio_to_weight values or inverses, but use the original weights when calculating the inverse which is used to scale execution time delta in calc_delta_mine(). This ensures we do not lose accuracy when accounting time to the sched entities. Thanks to Nikunj Dadhania for fixing an bug in c_d_m() that broken fairness. Below is some analysis of the performance costs/improvements of this patch. 1. Micro-arch performance costs: Experiment was to run Ingo's pipe_test_100k 200 times with the task pinned to one cpu. I measured instruction, cycles and stalled-cycles for the runs. See: http://thread.gmane.org/gmane.linux.kernel/1129232/focus=1129389 for more info. -tip (baseline): Performance counter stats for '/root/load-scale/pipe-test-100k' (200 runs): 964,991,769 instructions # 0.82 insns per cycle # 0.33 stalled cycles per insn # ( +- 0.05% ) 1,171,186,635 cycles # 0.000 GHz ( +- 0.08% ) 306,373,664 stalled-cycles-backend # 26.16% backend cycles idle ( +- 0.28% ) 314,933,621 stalled-cycles-frontend # 26.89% frontend cycles idle ( +- 0.34% ) 1.122405684 seconds time elapsed ( +- 0.05% ) -tip+patches: Performance counter stats for './load-scale/pipe-test-100k' (200 runs): 963,624,821 instructions # 0.82 insns per cycle # 0.33 stalled cycles per insn # ( +- 0.04% ) 1,175,215,649 cycles # 0.000 GHz ( +- 0.08% ) 315,321,126 stalled-cycles-backend # 26.83% backend cycles idle ( +- 0.28% ) 316,835,873 stalled-cycles-frontend # 26.96% frontend cycles idle ( +- 0.29% ) 1.122238659 seconds time elapsed ( +- 0.06% ) With this patch, instructions decrease by ~0.10% and cycles increase by 0.27%. This doesn't look statistically significant. The number of stalled cycles in the backend increased from 26.16% to 26.83%. This can be attributed to the shifts we do in c_d_m() and other places. The fraction of stalled cycles in the frontend remains about the same, at 26.96% compared to 26.89% in -tip. 2. Balancing low-weight task groups Test setup: run 50 tasks with random sleep/busy times (biased around 100ms) in a low weight container (with cpu.shares = 2). Measure %idle as reported by mpstat over a 10s window. -tip (baseline): 06:47:48 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %idle intr/s 06:47:49 PM all 94.32 0.00 0.06 0.00 0.00 0.00 0.00 0.00 5.62 15888.00 06:47:50 PM all 94.57 0.00 0.62 0.00 0.00 0.00 0.00 0.00 4.81 16180.00 06:47:51 PM all 94.69 0.00 0.06 0.00 0.00 0.00 0.00 0.00 5.25 15966.00 06:47:52 PM all 95.81 0.00 0.00 0.00 0.00 0.00 0.00 0.00 4.19 16053.00 06:47:53 PM all 94.88 0.06 0.00 0.00 0.00 0.00 0.00 0.00 5.06 15984.00 06:47:54 PM all 93.31 0.00 0.00 0.00 0.00 0.00 0.00 0.00 6.69 15806.00 06:47:55 PM all 94.19 0.00 0.06 0.00 0.00 0.00 0.00 0.00 5.75 15896.00 06:47:56 PM all 92.87 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.13 15716.00 06:47:57 PM all 94.88 0.00 0.00 0.00 0.00 0.00 0.00 0.00 5.12 15982.00 06:47:58 PM all 95.44 0.00 0.00 0.00 0.00 0.00 0.00 0.00 4.56 16075.00 Average: all 94.49 0.01 0.08 0.00 0.00 0.00 0.00 0.00 5.42 15954.60 -tip+patches: 06:47:03 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %idle intr/s 06:47:04 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16630.00 06:47:05 PM all 99.69 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.31 16580.20 06:47:06 PM all 99.69 0.00 0.06 0.00 0.00 0.00 0.00 0.00 0.25 16596.00 06:47:07 PM all 99.20 0.00 0.74 0.00 0.00 0.06 0.00 0.00 0.00 17838.61 06:47:08 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16540.00 06:47:09 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16575.00 06:47:10 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16614.00 06:47:11 PM all 99.94 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.06 16588.00 06:47:12 PM all 99.94 0.00 0.06 0.00 0.00 0.00 0.00 0.00 0.00 16593.00 06:47:13 PM all 99.94 0.00 0.06 0.00 0.00 0.00 0.00 0.00 0.00 16551.00 Average: all 99.84 0.00 0.09 0.00 0.00 0.01 0.00 0.00 0.06 16711.58 We see an improvement in idle% on the system (drops from 5.42% on -tip to 0.06% with the patches). We see an improvement in idle% on the system (drops from 5.42% on -tip to 0.06% with the patches). Signed-off-by: Nikhil Rao <ncrao@google.com> Acked-by: Peter Zijlstra <peterz@infradead.org> Cc: Nikunj A. Dadhania <nikunj@linux.vnet.ibm.com> Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com> Cc: Stephan Barwolf <stephan.baerwolf@tu-ilmenau.de> Cc: Mike Galbraith <efault@gmx.de> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andrew Morton <akpm@linux-foundation.org> Link: http://lkml.kernel.org/r/1305754668-18792-1-git-send-email-ncrao@google.com Signed-off-by: Ingo Molnar <mingo@elte.hu>
2011-05-19 05:37:48 +08:00
* Increase resolution of nice-level calculations for 64-bit architectures.
* The extra resolution improves shares distribution and load balancing of
* low-weight task groups (eg. nice +19 on an autogroup), deeper taskgroup
* hierarchies, especially on larger systems. This is not a user-visible change
* and does not change the user-interface for setting shares/weights.
*
* We increase resolution only if we have enough bits to allow this increased
* resolution (i.e. BITS_PER_LONG > 32). The costs for increasing resolution
* when BITS_PER_LONG <= 32 are pretty high and the returns do not justify the
* increased costs.
*/
#if 0 /* BITS_PER_LONG > 32 -- currently broken: it increases power usage under light load */
sched: Increase SCHED_LOAD_SCALE resolution Introduce SCHED_LOAD_RESOLUTION, which scales is added to SCHED_LOAD_SHIFT and increases the resolution of SCHED_LOAD_SCALE. This patch sets the value of SCHED_LOAD_RESOLUTION to 10, scaling up the weights for all sched entities by a factor of 1024. With this extra resolution, we can handle deeper cgroup hiearchies and the scheduler can do better shares distribution and load load balancing on larger systems (especially for low weight task groups). This does not change the existing user interface, the scaled weights are only used internally. We do not modify prio_to_weight values or inverses, but use the original weights when calculating the inverse which is used to scale execution time delta in calc_delta_mine(). This ensures we do not lose accuracy when accounting time to the sched entities. Thanks to Nikunj Dadhania for fixing an bug in c_d_m() that broken fairness. Below is some analysis of the performance costs/improvements of this patch. 1. Micro-arch performance costs: Experiment was to run Ingo's pipe_test_100k 200 times with the task pinned to one cpu. I measured instruction, cycles and stalled-cycles for the runs. See: http://thread.gmane.org/gmane.linux.kernel/1129232/focus=1129389 for more info. -tip (baseline): Performance counter stats for '/root/load-scale/pipe-test-100k' (200 runs): 964,991,769 instructions # 0.82 insns per cycle # 0.33 stalled cycles per insn # ( +- 0.05% ) 1,171,186,635 cycles # 0.000 GHz ( +- 0.08% ) 306,373,664 stalled-cycles-backend # 26.16% backend cycles idle ( +- 0.28% ) 314,933,621 stalled-cycles-frontend # 26.89% frontend cycles idle ( +- 0.34% ) 1.122405684 seconds time elapsed ( +- 0.05% ) -tip+patches: Performance counter stats for './load-scale/pipe-test-100k' (200 runs): 963,624,821 instructions # 0.82 insns per cycle # 0.33 stalled cycles per insn # ( +- 0.04% ) 1,175,215,649 cycles # 0.000 GHz ( +- 0.08% ) 315,321,126 stalled-cycles-backend # 26.83% backend cycles idle ( +- 0.28% ) 316,835,873 stalled-cycles-frontend # 26.96% frontend cycles idle ( +- 0.29% ) 1.122238659 seconds time elapsed ( +- 0.06% ) With this patch, instructions decrease by ~0.10% and cycles increase by 0.27%. This doesn't look statistically significant. The number of stalled cycles in the backend increased from 26.16% to 26.83%. This can be attributed to the shifts we do in c_d_m() and other places. The fraction of stalled cycles in the frontend remains about the same, at 26.96% compared to 26.89% in -tip. 2. Balancing low-weight task groups Test setup: run 50 tasks with random sleep/busy times (biased around 100ms) in a low weight container (with cpu.shares = 2). Measure %idle as reported by mpstat over a 10s window. -tip (baseline): 06:47:48 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %idle intr/s 06:47:49 PM all 94.32 0.00 0.06 0.00 0.00 0.00 0.00 0.00 5.62 15888.00 06:47:50 PM all 94.57 0.00 0.62 0.00 0.00 0.00 0.00 0.00 4.81 16180.00 06:47:51 PM all 94.69 0.00 0.06 0.00 0.00 0.00 0.00 0.00 5.25 15966.00 06:47:52 PM all 95.81 0.00 0.00 0.00 0.00 0.00 0.00 0.00 4.19 16053.00 06:47:53 PM all 94.88 0.06 0.00 0.00 0.00 0.00 0.00 0.00 5.06 15984.00 06:47:54 PM all 93.31 0.00 0.00 0.00 0.00 0.00 0.00 0.00 6.69 15806.00 06:47:55 PM all 94.19 0.00 0.06 0.00 0.00 0.00 0.00 0.00 5.75 15896.00 06:47:56 PM all 92.87 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.13 15716.00 06:47:57 PM all 94.88 0.00 0.00 0.00 0.00 0.00 0.00 0.00 5.12 15982.00 06:47:58 PM all 95.44 0.00 0.00 0.00 0.00 0.00 0.00 0.00 4.56 16075.00 Average: all 94.49 0.01 0.08 0.00 0.00 0.00 0.00 0.00 5.42 15954.60 -tip+patches: 06:47:03 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %idle intr/s 06:47:04 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16630.00 06:47:05 PM all 99.69 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.31 16580.20 06:47:06 PM all 99.69 0.00 0.06 0.00 0.00 0.00 0.00 0.00 0.25 16596.00 06:47:07 PM all 99.20 0.00 0.74 0.00 0.00 0.06 0.00 0.00 0.00 17838.61 06:47:08 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16540.00 06:47:09 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16575.00 06:47:10 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16614.00 06:47:11 PM all 99.94 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.06 16588.00 06:47:12 PM all 99.94 0.00 0.06 0.00 0.00 0.00 0.00 0.00 0.00 16593.00 06:47:13 PM all 99.94 0.00 0.06 0.00 0.00 0.00 0.00 0.00 0.00 16551.00 Average: all 99.84 0.00 0.09 0.00 0.00 0.01 0.00 0.00 0.06 16711.58 We see an improvement in idle% on the system (drops from 5.42% on -tip to 0.06% with the patches). We see an improvement in idle% on the system (drops from 5.42% on -tip to 0.06% with the patches). Signed-off-by: Nikhil Rao <ncrao@google.com> Acked-by: Peter Zijlstra <peterz@infradead.org> Cc: Nikunj A. Dadhania <nikunj@linux.vnet.ibm.com> Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com> Cc: Stephan Barwolf <stephan.baerwolf@tu-ilmenau.de> Cc: Mike Galbraith <efault@gmx.de> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andrew Morton <akpm@linux-foundation.org> Link: http://lkml.kernel.org/r/1305754668-18792-1-git-send-email-ncrao@google.com Signed-off-by: Ingo Molnar <mingo@elte.hu>
2011-05-19 05:37:48 +08:00
# define SCHED_LOAD_RESOLUTION 10
# define scale_load(w) ((w) << SCHED_LOAD_RESOLUTION)
# define scale_load_down(w) ((w) >> SCHED_LOAD_RESOLUTION)
#else
# define SCHED_LOAD_RESOLUTION 0
# define scale_load(w) (w)
# define scale_load_down(w) (w)
#endif
sched: Increase SCHED_LOAD_SCALE resolution Introduce SCHED_LOAD_RESOLUTION, which scales is added to SCHED_LOAD_SHIFT and increases the resolution of SCHED_LOAD_SCALE. This patch sets the value of SCHED_LOAD_RESOLUTION to 10, scaling up the weights for all sched entities by a factor of 1024. With this extra resolution, we can handle deeper cgroup hiearchies and the scheduler can do better shares distribution and load load balancing on larger systems (especially for low weight task groups). This does not change the existing user interface, the scaled weights are only used internally. We do not modify prio_to_weight values or inverses, but use the original weights when calculating the inverse which is used to scale execution time delta in calc_delta_mine(). This ensures we do not lose accuracy when accounting time to the sched entities. Thanks to Nikunj Dadhania for fixing an bug in c_d_m() that broken fairness. Below is some analysis of the performance costs/improvements of this patch. 1. Micro-arch performance costs: Experiment was to run Ingo's pipe_test_100k 200 times with the task pinned to one cpu. I measured instruction, cycles and stalled-cycles for the runs. See: http://thread.gmane.org/gmane.linux.kernel/1129232/focus=1129389 for more info. -tip (baseline): Performance counter stats for '/root/load-scale/pipe-test-100k' (200 runs): 964,991,769 instructions # 0.82 insns per cycle # 0.33 stalled cycles per insn # ( +- 0.05% ) 1,171,186,635 cycles # 0.000 GHz ( +- 0.08% ) 306,373,664 stalled-cycles-backend # 26.16% backend cycles idle ( +- 0.28% ) 314,933,621 stalled-cycles-frontend # 26.89% frontend cycles idle ( +- 0.34% ) 1.122405684 seconds time elapsed ( +- 0.05% ) -tip+patches: Performance counter stats for './load-scale/pipe-test-100k' (200 runs): 963,624,821 instructions # 0.82 insns per cycle # 0.33 stalled cycles per insn # ( +- 0.04% ) 1,175,215,649 cycles # 0.000 GHz ( +- 0.08% ) 315,321,126 stalled-cycles-backend # 26.83% backend cycles idle ( +- 0.28% ) 316,835,873 stalled-cycles-frontend # 26.96% frontend cycles idle ( +- 0.29% ) 1.122238659 seconds time elapsed ( +- 0.06% ) With this patch, instructions decrease by ~0.10% and cycles increase by 0.27%. This doesn't look statistically significant. The number of stalled cycles in the backend increased from 26.16% to 26.83%. This can be attributed to the shifts we do in c_d_m() and other places. The fraction of stalled cycles in the frontend remains about the same, at 26.96% compared to 26.89% in -tip. 2. Balancing low-weight task groups Test setup: run 50 tasks with random sleep/busy times (biased around 100ms) in a low weight container (with cpu.shares = 2). Measure %idle as reported by mpstat over a 10s window. -tip (baseline): 06:47:48 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %idle intr/s 06:47:49 PM all 94.32 0.00 0.06 0.00 0.00 0.00 0.00 0.00 5.62 15888.00 06:47:50 PM all 94.57 0.00 0.62 0.00 0.00 0.00 0.00 0.00 4.81 16180.00 06:47:51 PM all 94.69 0.00 0.06 0.00 0.00 0.00 0.00 0.00 5.25 15966.00 06:47:52 PM all 95.81 0.00 0.00 0.00 0.00 0.00 0.00 0.00 4.19 16053.00 06:47:53 PM all 94.88 0.06 0.00 0.00 0.00 0.00 0.00 0.00 5.06 15984.00 06:47:54 PM all 93.31 0.00 0.00 0.00 0.00 0.00 0.00 0.00 6.69 15806.00 06:47:55 PM all 94.19 0.00 0.06 0.00 0.00 0.00 0.00 0.00 5.75 15896.00 06:47:56 PM all 92.87 0.00 0.00 0.00 0.00 0.00 0.00 0.00 7.13 15716.00 06:47:57 PM all 94.88 0.00 0.00 0.00 0.00 0.00 0.00 0.00 5.12 15982.00 06:47:58 PM all 95.44 0.00 0.00 0.00 0.00 0.00 0.00 0.00 4.56 16075.00 Average: all 94.49 0.01 0.08 0.00 0.00 0.00 0.00 0.00 5.42 15954.60 -tip+patches: 06:47:03 PM CPU %usr %nice %sys %iowait %irq %soft %steal %guest %idle intr/s 06:47:04 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16630.00 06:47:05 PM all 99.69 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.31 16580.20 06:47:06 PM all 99.69 0.00 0.06 0.00 0.00 0.00 0.00 0.00 0.25 16596.00 06:47:07 PM all 99.20 0.00 0.74 0.00 0.00 0.06 0.00 0.00 0.00 17838.61 06:47:08 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16540.00 06:47:09 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16575.00 06:47:10 PM all 100.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.00 16614.00 06:47:11 PM all 99.94 0.00 0.00 0.00 0.00 0.00 0.00 0.00 0.06 16588.00 06:47:12 PM all 99.94 0.00 0.06 0.00 0.00 0.00 0.00 0.00 0.00 16593.00 06:47:13 PM all 99.94 0.00 0.06 0.00 0.00 0.00 0.00 0.00 0.00 16551.00 Average: all 99.84 0.00 0.09 0.00 0.00 0.01 0.00 0.00 0.06 16711.58 We see an improvement in idle% on the system (drops from 5.42% on -tip to 0.06% with the patches). We see an improvement in idle% on the system (drops from 5.42% on -tip to 0.06% with the patches). Signed-off-by: Nikhil Rao <ncrao@google.com> Acked-by: Peter Zijlstra <peterz@infradead.org> Cc: Nikunj A. Dadhania <nikunj@linux.vnet.ibm.com> Cc: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com> Cc: Stephan Barwolf <stephan.baerwolf@tu-ilmenau.de> Cc: Mike Galbraith <efault@gmx.de> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andrew Morton <akpm@linux-foundation.org> Link: http://lkml.kernel.org/r/1305754668-18792-1-git-send-email-ncrao@google.com Signed-off-by: Ingo Molnar <mingo@elte.hu>
2011-05-19 05:37:48 +08:00
#define SCHED_LOAD_SHIFT (10 + SCHED_LOAD_RESOLUTION)
#define SCHED_LOAD_SCALE (1L << SCHED_LOAD_SHIFT)
/*
* Increase resolution of cpu_power calculations
*/
#define SCHED_POWER_SHIFT 10
#define SCHED_POWER_SCALE (1L << SCHED_POWER_SHIFT)
/*
* sched-domains (multiprocessor balancing) declarations:
*/
[PATCH] sched: implement smpnice Problem: The introduction of separate run queues per CPU has brought with it "nice" enforcement problems that are best described by a simple example. For the sake of argument suppose that on a single CPU machine with a nice==19 hard spinner and a nice==0 hard spinner running that the nice==0 task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and 2 nice==0 hard spinners running. The user of this system would be entitled to expect that the nice==0 tasks each get 95% of a CPU and the nice==19 tasks only get 5% each. However, whether this expectation is met is pretty much down to luck as there are four equally likely distributions of the tasks to the CPUs that the load balancing code will consider to be balanced with loads of 2.0 for each CPU. Two of these distributions involve one nice==0 and one nice==19 task per CPU and in these circumstances the users expectations will be met. The other two distributions both involve both nice==0 tasks being on one CPU and both nice==19 being on the other CPU and each task will get 50% of a CPU and the user's expectations will not be met. Solution: The solution to this problem that is implemented in the attached patch is to use weighted loads when determining if the system is balanced and, when an imbalance is detected, to move an amount of weighted load between run queues (as opposed to a number of tasks) to restore the balance. Once again, the easiest way to explain why both of these measures are necessary is to use a simple example. Suppose that (in a slight variation of the above example) that we have a two CPU system with 4 nice==0 and 4 nice=19 hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and the 4 nice==19 tasks are on the other CPU. The weighted loads for the two CPUs would be 4.0 and 0.2 respectively and the load balancing code would move 2 tasks resulting in one CPU with a load of 2.0 and the other with load of 2.2. If this was considered to be a big enough imbalance to justify moving a task and that task was moved using the current move_tasks() then it would move the highest priority task that it found and this would result in one CPU with a load of 3.0 and the other with a load of 1.2 which would result in the movement of a task in the opposite direction and so on -- infinite loop. If, on the other hand, an amount of load to be moved is calculated from the imbalance (in this case 0.1) and move_tasks() skips tasks until it find ones whose contributions to the weighted load are less than this amount it would move two of the nice==19 tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with loads of 2.1 for each CPU. One of the advantages of this mechanism is that on a system where all tasks have nice==0 the load balancing calculations would be mathematically identical to the current load balancing code. Notes: struct task_struct: has a new field load_weight which (in a trade off of space for speed) stores the contribution that this task makes to a CPU's weighted load when it is runnable. struct runqueue: has a new field raw_weighted_load which is the sum of the load_weight values for the currently runnable tasks on this run queue. This field always needs to be updated when nr_running is updated so two new inline functions inc_nr_running() and dec_nr_running() have been created to make sure that this happens. This also offers a convenient way to optimize away this part of the smpnice mechanism when CONFIG_SMP is not defined. int try_to_wake_up(): in this function the value SCHED_LOAD_BALANCE is used to represent the load contribution of a single task in various calculations in the code that decides which CPU to put the waking task on. While this would be a valid on a system where the nice values for the runnable tasks were distributed evenly around zero it will lead to anomalous load balancing if the distribution is skewed in either direction. To overcome this problem SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task or by the average load_weight per task for the queue in question (as appropriate). int move_tasks(): The modifications to this function were complicated by the fact that active_load_balance() uses it to move exactly one task without checking whether an imbalance actually exists. This precluded the simple overloading of max_nr_move with max_load_move and necessitated the addition of the latter as an extra argument to the function. The internal implementation is then modified to move up to max_nr_move tasks and max_load_move of weighted load. This slightly complicates the code where move_tasks() is called and if ever active_load_balance() is changed to not use move_tasks() the implementation of move_tasks() should be simplified accordingly. struct sched_group *find_busiest_group(): Similar to try_to_wake_up(), there are places in this function where SCHED_LOAD_SCALE is used to represent the load contribution of a single task and the same issues are created. A similar solution is adopted except that it is now the average per task contribution to a group's load (as opposed to a run queue) that is required. As this value is not directly available from the group it is calculated on the fly as the queues in the groups are visited when determining the busiest group. A key change to this function is that it is no longer to scale down *imbalance on exit as move_tasks() uses the load in its scaled form. void set_user_nice(): has been modified to update the task's load_weight field when it's nice value and also to ensure that its run queue's raw_weighted_load field is updated if it was runnable. From: "Siddha, Suresh B" <suresh.b.siddha@intel.com> With smpnice, sched groups with highest priority tasks can mask the imbalance between the other sched groups with in the same domain. This patch fixes some of the listed down scenarios by not considering the sched groups which are lightly loaded. a) on a simple 4-way MP system, if we have one high priority and 4 normal priority tasks, with smpnice we would like to see the high priority task scheduled on one cpu, two other cpus getting one normal task each and the fourth cpu getting the remaining two normal tasks. but with current smpnice extra normal priority task keeps jumping from one cpu to another cpu having the normal priority task. This is because of the busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the cpu with high priority task in max_load calculations but including that in total and avg_load calcuations.. leading to max_load < avg_load and load balance between cpus running normal priority tasks(2 Vs 1) will always show imbalanace as one normal priority and the extra normal priority task will keep moving from one cpu to another cpu having normal priority task.. b) 4-way system with HT (8 logical processors). Package-P0 T0 has a highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal priority task each.. P2 and P3 are idle. With this patch, one of the normal priority tasks on P1 will be moved to P2 or P3.. c) With the current weighted smp nice calculations, it doesn't always make sense to look at the highest weighted runqueue in the busy group.. Consider a load balance scenario on a DP with HT system, with Package-0 containing one high priority and one low priority, Package-1 containing one low priority(with other thread being idle).. Package-1 thinks that it need to take the low priority thread from Package-0. And find_busiest_queue() returns the cpu thread with highest priority task.. And ultimately(with help of active load balance) we move high priority task to Package-1. And same continues with Package-0 now, moving high priority task from package-1 to package-0.. Even without the presence of active load balance, load balance will fail to balance the above scenario.. Fix find_busiest_queue to use "imbalance" when it is lightly loaded. [kernel@kolivas.org: sched: store weighted load on up] [kernel@kolivas.org: sched: add discrete weighted cpu load function] [suresh.b.siddha@intel.com: sched: remove dead code] Signed-off-by: Peter Williams <pwil3058@bigpond.com.au> Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com> Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com> Acked-by: Ingo Molnar <mingo@elte.hu> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Signed-off-by: Con Kolivas <kernel@kolivas.org> Cc: John Hawkes <hawkes@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 17:54:34 +08:00
#ifdef CONFIG_SMP
#define SD_LOAD_BALANCE 0x0001 /* Do load balancing on this domain. */
#define SD_BALANCE_NEWIDLE 0x0002 /* Balance when about to become idle */
#define SD_BALANCE_EXEC 0x0004 /* Balance on exec */
#define SD_BALANCE_FORK 0x0008 /* Balance on fork, clone */
#define SD_BALANCE_WAKE 0x0010 /* Balance on wakeup */
#define SD_WAKE_AFFINE 0x0020 /* Wake task to waking CPU */
#define SD_PREFER_LOCAL 0x0040 /* Prefer to keep tasks local to this domain */
#define SD_SHARE_CPUPOWER 0x0080 /* Domain members share cpu power */
#define SD_POWERSAVINGS_BALANCE 0x0100 /* Balance for power savings */
#define SD_SHARE_PKG_RESOURCES 0x0200 /* Domain members share cpu pkg resources */
#define SD_SERIALIZE 0x0400 /* Only a single load balancing instance */
sched: Add asymmetric group packing option for sibling domain Check to see if the group is packed in a sched doman. This is primarily intended to used at the sibling level. Some cores like POWER7 prefer to use lower numbered SMT threads. In the case of POWER7, it can move to lower SMT modes only when higher threads are idle. When in lower SMT modes, the threads will perform better since they share less core resources. Hence when we have idle threads, we want them to be the higher ones. This adds a hook into f_b_g() called check_asym_packing() to check the packing. This packing function is run on idle threads. It checks to see if the busiest CPU in this domain (core in the P7 case) has a higher CPU number than what where the packing function is being run on. If it is, calculate the imbalance and return the higher busier thread as the busiest group to f_b_g(). Here we are assuming a lower CPU number will be equivalent to a lower SMT thread number. It also creates a new SD_ASYM_PACKING flag to enable this feature at any scheduler domain level. It also creates an arch hook to enable this feature at the sibling level. The default function doesn't enable this feature. Based heavily on patch from Peter Zijlstra. Fixes from Srivatsa Vaddagiri. Signed-off-by: Michael Neuling <mikey@neuling.org> Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Arjan van de Ven <arjan@linux.intel.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Thomas Gleixner <tglx@linutronix.de> LKML-Reference: <20100608045702.2936CCC897@localhost.localdomain> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-06-08 12:57:02 +08:00
#define SD_ASYM_PACKING 0x0800 /* Place busy groups earlier in the domain */
#define SD_PREFER_SIBLING 0x1000 /* Prefer to place tasks in a sibling domain */
#define SD_OVERLAP 0x2000 /* sched_domains of this level overlap */
enum powersavings_balance_level {
POWERSAVINGS_BALANCE_NONE = 0, /* No power saving load balance */
POWERSAVINGS_BALANCE_BASIC, /* Fill one thread/core/package
* first for long running threads
*/
POWERSAVINGS_BALANCE_WAKEUP, /* Also bias task wakeups to semi-idle
* cpu package for power savings
*/
MAX_POWERSAVINGS_BALANCE_LEVELS
};
extern int sched_mc_power_savings, sched_smt_power_savings;
static inline int sd_balance_for_mc_power(void)
{
if (sched_smt_power_savings)
return SD_POWERSAVINGS_BALANCE;
if (!sched_mc_power_savings)
return SD_PREFER_SIBLING;
return 0;
}
static inline int sd_balance_for_package_power(void)
{
if (sched_mc_power_savings | sched_smt_power_savings)
return SD_POWERSAVINGS_BALANCE;
return SD_PREFER_SIBLING;
}
sched: Add asymmetric group packing option for sibling domain Check to see if the group is packed in a sched doman. This is primarily intended to used at the sibling level. Some cores like POWER7 prefer to use lower numbered SMT threads. In the case of POWER7, it can move to lower SMT modes only when higher threads are idle. When in lower SMT modes, the threads will perform better since they share less core resources. Hence when we have idle threads, we want them to be the higher ones. This adds a hook into f_b_g() called check_asym_packing() to check the packing. This packing function is run on idle threads. It checks to see if the busiest CPU in this domain (core in the P7 case) has a higher CPU number than what where the packing function is being run on. If it is, calculate the imbalance and return the higher busier thread as the busiest group to f_b_g(). Here we are assuming a lower CPU number will be equivalent to a lower SMT thread number. It also creates a new SD_ASYM_PACKING flag to enable this feature at any scheduler domain level. It also creates an arch hook to enable this feature at the sibling level. The default function doesn't enable this feature. Based heavily on patch from Peter Zijlstra. Fixes from Srivatsa Vaddagiri. Signed-off-by: Michael Neuling <mikey@neuling.org> Signed-off-by: Srivatsa Vaddagiri <vatsa@linux.vnet.ibm.com> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Arjan van de Ven <arjan@linux.intel.com> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: Thomas Gleixner <tglx@linutronix.de> LKML-Reference: <20100608045702.2936CCC897@localhost.localdomain> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-06-08 12:57:02 +08:00
extern int __weak arch_sd_sibiling_asym_packing(void);
/*
* Optimise SD flags for power savings:
* SD_BALANCE_NEWIDLE helps aggressive task consolidation and power savings.
* Keep default SD flags if sched_{smt,mc}_power_saving=0
*/
static inline int sd_power_saving_flags(void)
{
if (sched_mc_power_savings | sched_smt_power_savings)
return SD_BALANCE_NEWIDLE;
return 0;
}
struct sched_group_power {
atomic_t ref;
/*
* CPU power of this group, SCHED_LOAD_SCALE being max power for a
* single CPU.
Speed up divides by cpu_power in scheduler I noticed expensive divides done in try_to_wakeup() and find_busiest_group() on a bi dual core Opteron machine (total of 4 cores), moderatly loaded (15.000 context switch per second) oprofile numbers : CPU: AMD64 processors, speed 2600.05 MHz (estimated) Counted CPU_CLK_UNHALTED events (Cycles outside of halt state) with a unit mask of 0x00 (No unit mask) count 50000 samples % symbol name ... 613914 1.0498 try_to_wake_up 834 0.0013 :ffffffff80227ae1: div %rcx 77513 0.1191 :ffffffff80227ae4: mov %rax,%r11 608893 1.0413 find_busiest_group 1841 0.0031 :ffffffff802260bf: div %rdi 140109 0.2394 :ffffffff802260c2: test %sil,%sil Some of these divides can use the reciprocal divides we introduced some time ago (currently used in slab AFAIK) We can assume a load will fit in a 32bits number, because with a SCHED_LOAD_SCALE=128 value, its still a theorical limit of 33554432 When/if we reach this limit one day, probably cpus will have a fast hardware divide and we can zap the reciprocal divide trick. Ingo suggested to rename cpu_power to __cpu_power to make clear it should not be modified without changing its reciprocal value too. I did not convert the divide in cpu_avg_load_per_task(), because tracking nr_running changes may be not worth it ? We could use a static table of 32 reciprocal values but it would add a conditional branch and table lookup. [akpm@linux-foundation.org: !SMP build fix] Signed-off-by: Eric Dumazet <dada1@cosmosbay.com> Acked-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-08 15:32:57 +08:00
*/
unsigned int power, power_orig;
unsigned long next_update;
/*
* Number of busy cpus in this group.
*/
atomic_t nr_busy_cpus;
};
struct sched_group {
struct sched_group *next; /* Must be a circular list */
atomic_t ref;
sched: Use group weight, idle cpu metrics to fix imbalances during idle Currently we consider a sched domain to be well balanced when the imbalance is less than the domain's imablance_pct. As the number of cores and threads are increasing, current values of imbalance_pct (for example 25% for a NUMA domain) are not enough to detect imbalances like: a) On a WSM-EP system (two sockets, each having 6 cores and 12 logical threads), 24 cpu-hogging tasks get scheduled as 13 on one socket and 11 on another socket. Leading to an idle HT cpu. b) On a hypothetial 2 socket NHM-EX system (each socket having 8 cores and 16 logical threads), 16 cpu-hogging tasks can get scheduled as 9 on one socket and 7 on another socket. Leaving one core in a socket idle whereas in another socket we have a core having both its HT siblings busy. While this issue can be fixed by decreasing the domain's imbalance_pct (by making it a function of number of logical cpus in the domain), it can potentially cause more task migrations across sched groups in an overloaded case. Fix this by using imbalance_pct only during newly_idle and busy load balancing. And during idle load balancing, check if there is an imbalance in number of idle cpu's across the busiest and this sched_group or if the busiest group has more tasks than its weight that the idle cpu in this_group can pull. Reported-by: Nikhil Rao <ncrao@google.com> Signed-off-by: Suresh Siddha <suresh.b.siddha@intel.com> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> LKML-Reference: <1284760952.2676.11.camel@sbsiddha-MOBL3.sc.intel.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-09-18 06:02:32 +08:00
unsigned int group_weight;
struct sched_group_power *sgp;
/*
* The CPUs this group covers.
*
* NOTE: this field is variable length. (Allocated dynamically
* by attaching extra space to the end of the structure,
* depending on how many CPUs the kernel has booted up with)
*/
unsigned long cpumask[0];
};
static inline struct cpumask *sched_group_cpus(struct sched_group *sg)
{
return to_cpumask(sg->cpumask);
}
/**
* group_first_cpu - Returns the first cpu in the cpumask of a sched_group.
* @group: The group whose first cpu is to be returned.
*/
static inline unsigned int group_first_cpu(struct sched_group *group)
{
return cpumask_first(sched_group_cpus(group));
}
struct sched_domain_attr {
int relax_domain_level;
};
#define SD_ATTR_INIT (struct sched_domain_attr) { \
.relax_domain_level = -1, \
}
extern int sched_domain_level_max;
struct sched_domain {
/* These fields must be setup */
struct sched_domain *parent; /* top domain must be null terminated */
struct sched_domain *child; /* bottom domain must be null terminated */
struct sched_group *groups; /* the balancing groups of the domain */
unsigned long min_interval; /* Minimum balance interval ms */
unsigned long max_interval; /* Maximum balance interval ms */
unsigned int busy_factor; /* less balancing by factor if busy */
unsigned int imbalance_pct; /* No balance until over watermark */
unsigned int cache_nice_tries; /* Leave cache hot tasks for # tries */
unsigned int busy_idx;
unsigned int idle_idx;
unsigned int newidle_idx;
unsigned int wake_idx;
unsigned int forkexec_idx;
unsigned int smt_gain;
int flags; /* See SD_* */
int level;
/* Runtime fields. */
unsigned long last_balance; /* init to jiffies. units in jiffies */
unsigned int balance_interval; /* initialise to 1. units in ms. */
unsigned int nr_balance_failed; /* initialise to 0 */
u64 last_update;
#ifdef CONFIG_SCHEDSTATS
/* load_balance() stats */
unsigned int lb_count[CPU_MAX_IDLE_TYPES];
unsigned int lb_failed[CPU_MAX_IDLE_TYPES];
unsigned int lb_balanced[CPU_MAX_IDLE_TYPES];
unsigned int lb_imbalance[CPU_MAX_IDLE_TYPES];
unsigned int lb_gained[CPU_MAX_IDLE_TYPES];
unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES];
unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES];
unsigned int lb_nobusyq[CPU_MAX_IDLE_TYPES];
/* Active load balancing */
unsigned int alb_count;
unsigned int alb_failed;
unsigned int alb_pushed;
/* SD_BALANCE_EXEC stats */
unsigned int sbe_count;
unsigned int sbe_balanced;
unsigned int sbe_pushed;
/* SD_BALANCE_FORK stats */
unsigned int sbf_count;
unsigned int sbf_balanced;
unsigned int sbf_pushed;
/* try_to_wake_up() stats */
unsigned int ttwu_wake_remote;
unsigned int ttwu_move_affine;
unsigned int ttwu_move_balance;
#endif
#ifdef CONFIG_SCHED_DEBUG
char *name;
#endif
sched: Dynamically allocate sched_domain/sched_group data-structures Instead of relying on static allocations for the sched_domain and sched_group trees, dynamically allocate and RCU free them. Allocating this dynamically also allows for some build_sched_groups() simplification since we can now (like with other simplifications) rely on the sched_domain tree instead of hard-coded knowledge. One tricky to note is that detach_destroy_domains() needs to hold rcu_read_lock() over the entire tear-down, per-cpu is not sufficient since that can lead to partial sched_group existance (could possibly be solved by doing the tear-down backwards but this is much more robust). A concequence of the above is that we can no longer print the sched_domain debug stuff from cpu_attach_domain() since that might now run with preemption disabled (due to classic RCU etc.) and sched_domain_debug() does some GFP_KERNEL allocations. Another thing to note is that we now fully rely on normal RCU and not RCU-sched, this is because with the new and exiting RCU flavours we grew over the years BH doesn't necessarily hold off RCU-sched grace periods (-rt is known to break this). This would in fact already cause us grief since we do sched_domain/sched_group iterations from softirq context. This patch is somewhat larger than I would like it to be, but I didn't find any means of shrinking/splitting this. Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Mike Galbraith <efault@gmx.de> Cc: Nick Piggin <npiggin@kernel.dk> Cc: Linus Torvalds <torvalds@linux-foundation.org> Cc: Andrew Morton <akpm@linux-foundation.org> Link: http://lkml.kernel.org/r/20110407122942.245307941@chello.nl Signed-off-by: Ingo Molnar <mingo@elte.hu>
2011-04-07 20:09:50 +08:00
union {
void *private; /* used during construction */
struct rcu_head rcu; /* used during destruction */
};
unsigned int span_weight;
/*
* Span of all CPUs in this domain.
*
* NOTE: this field is variable length. (Allocated dynamically
* by attaching extra space to the end of the structure,
* depending on how many CPUs the kernel has booted up with)
*/
unsigned long span[0];
};
static inline struct cpumask *sched_domain_span(struct sched_domain *sd)
{
return to_cpumask(sd->span);
}
extern void partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
struct sched_domain_attr *dattr_new);
cpuset sched_load_balance flag Add a new per-cpuset flag called 'sched_load_balance'. When enabled in a cpuset (the default value) it tells the kernel scheduler that the scheduler should provide the normal load balancing on the CPUs in that cpuset, sometimes moving tasks from one CPU to a second CPU if the second CPU is less loaded and if that task is allowed to run there. When disabled (write "0" to the file) then it tells the kernel scheduler that load balancing is not required for the CPUs in that cpuset. Now even if this flag is disabled for some cpuset, the kernel may still have to load balance some or all the CPUs in that cpuset, if some overlapping cpuset has its sched_load_balance flag enabled. If there are some CPUs that are not in any cpuset whose sched_load_balance flag is enabled, the kernel scheduler will not load balance tasks to those CPUs. Moreover the kernel will partition the 'sched domains' (non-overlapping sets of CPUs over which load balancing is attempted) into the finest granularity partition that it can find, while still keeping any two CPUs that are in the same shed_load_balance enabled cpuset in the same element of the partition. This serves two purposes: 1) It provides a mechanism for real time isolation of some CPUs, and 2) it can be used to improve performance on systems with many CPUs by supporting configurations in which load balancing is not done across all CPUs at once, but rather only done in several smaller disjoint sets of CPUs. This mechanism replaces the earlier overloading of the per-cpuset flag 'cpu_exclusive', which overloading was removed in an earlier patch: cpuset-remove-sched-domain-hooks-from-cpusets See further the Documentation and comments in the code itself. [akpm@linux-foundation.org: don't be weird] Signed-off-by: Paul Jackson <pj@sgi.com> Acked-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-19 14:40:20 +08:00
/* Allocate an array of sched domains, for partition_sched_domains(). */
cpumask_var_t *alloc_sched_domains(unsigned int ndoms);
void free_sched_domains(cpumask_var_t doms[], unsigned int ndoms);
/* Test a flag in parent sched domain */
static inline int test_sd_parent(struct sched_domain *sd, int flag)
{
if (sd->parent && (sd->parent->flags & flag))
return 1;
return 0;
}
cpuset sched_load_balance flag Add a new per-cpuset flag called 'sched_load_balance'. When enabled in a cpuset (the default value) it tells the kernel scheduler that the scheduler should provide the normal load balancing on the CPUs in that cpuset, sometimes moving tasks from one CPU to a second CPU if the second CPU is less loaded and if that task is allowed to run there. When disabled (write "0" to the file) then it tells the kernel scheduler that load balancing is not required for the CPUs in that cpuset. Now even if this flag is disabled for some cpuset, the kernel may still have to load balance some or all the CPUs in that cpuset, if some overlapping cpuset has its sched_load_balance flag enabled. If there are some CPUs that are not in any cpuset whose sched_load_balance flag is enabled, the kernel scheduler will not load balance tasks to those CPUs. Moreover the kernel will partition the 'sched domains' (non-overlapping sets of CPUs over which load balancing is attempted) into the finest granularity partition that it can find, while still keeping any two CPUs that are in the same shed_load_balance enabled cpuset in the same element of the partition. This serves two purposes: 1) It provides a mechanism for real time isolation of some CPUs, and 2) it can be used to improve performance on systems with many CPUs by supporting configurations in which load balancing is not done across all CPUs at once, but rather only done in several smaller disjoint sets of CPUs. This mechanism replaces the earlier overloading of the per-cpuset flag 'cpu_exclusive', which overloading was removed in an earlier patch: cpuset-remove-sched-domain-hooks-from-cpusets See further the Documentation and comments in the code itself. [akpm@linux-foundation.org: don't be weird] Signed-off-by: Paul Jackson <pj@sgi.com> Acked-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-19 14:40:20 +08:00
unsigned long default_scale_freq_power(struct sched_domain *sd, int cpu);
unsigned long default_scale_smt_power(struct sched_domain *sd, int cpu);
bool cpus_share_cache(int this_cpu, int that_cpu);
#else /* CONFIG_SMP */
struct sched_domain_attr;
static inline void
partition_sched_domains(int ndoms_new, cpumask_var_t doms_new[],
struct sched_domain_attr *dattr_new)
{
}
static inline bool cpus_share_cache(int this_cpu, int that_cpu)
{
return true;
}
#endif /* !CONFIG_SMP */
struct io_context; /* See blkdev.h */
#ifdef ARCH_HAS_PREFETCH_SWITCH_STACK
extern void prefetch_stack(struct task_struct *t);
#else
static inline void prefetch_stack(struct task_struct *t) { }
#endif
struct audit_context; /* See audit.c */
struct mempolicy;
struct pipe_inode_info;
struct uts_namespace;
struct rq;
struct sched_domain;
/*
* wake flags
*/
#define WF_SYNC 0x01 /* waker goes to sleep after wakup */
#define WF_FORK 0x02 /* child wakeup after fork */
#define WF_MIGRATED 0x04 /* internal use, task got migrated */
#define ENQUEUE_WAKEUP 1
#define ENQUEUE_HEAD 2
#ifdef CONFIG_SMP
#define ENQUEUE_WAKING 4 /* sched_class::task_waking was called */
#else
#define ENQUEUE_WAKING 0
#endif
#define DEQUEUE_SLEEP 1
struct sched_class {
const struct sched_class *next;
void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags);
void (*yield_task) (struct rq *rq);
bool (*yield_to_task) (struct rq *rq, struct task_struct *p, bool preempt);
void (*check_preempt_curr) (struct rq *rq, struct task_struct *p, int flags);
struct task_struct * (*pick_next_task) (struct rq *rq);
void (*put_prev_task) (struct rq *rq, struct task_struct *p);
#ifdef CONFIG_SMP
int (*select_task_rq)(struct task_struct *p, int sd_flag, int flags);
void (*pre_schedule) (struct rq *this_rq, struct task_struct *task);
void (*post_schedule) (struct rq *this_rq);
void (*task_waking) (struct task_struct *task);
void (*task_woken) (struct rq *this_rq, struct task_struct *task);
void (*set_cpus_allowed)(struct task_struct *p,
const struct cpumask *newmask);
void (*rq_online)(struct rq *rq);
void (*rq_offline)(struct rq *rq);
#endif
void (*set_curr_task) (struct rq *rq);
void (*task_tick) (struct rq *rq, struct task_struct *p, int queued);
void (*task_fork) (struct task_struct *p);
void (*switched_from) (struct rq *this_rq, struct task_struct *task);
void (*switched_to) (struct rq *this_rq, struct task_struct *task);
void (*prio_changed) (struct rq *this_rq, struct task_struct *task,
int oldprio);
unsigned int (*get_rr_interval) (struct rq *rq,
struct task_struct *task);
#ifdef CONFIG_FAIR_GROUP_SCHED
void (*task_move_group) (struct task_struct *p, int on_rq);
#endif
};
struct load_weight {
unsigned long weight, inv_weight;
};
#ifdef CONFIG_SCHEDSTATS
struct sched_statistics {
u64 wait_start;
u64 wait_max;
u64 wait_count;
u64 wait_sum;
u64 iowait_count;
u64 iowait_sum;
u64 sleep_start;
u64 sleep_max;
s64 sum_sleep_runtime;
u64 block_start;
u64 block_max;
u64 exec_max;
u64 slice_max;
u64 nr_migrations_cold;
u64 nr_failed_migrations_affine;
u64 nr_failed_migrations_running;
u64 nr_failed_migrations_hot;
u64 nr_forced_migrations;
u64 nr_wakeups;
u64 nr_wakeups_sync;
u64 nr_wakeups_migrate;
u64 nr_wakeups_local;
u64 nr_wakeups_remote;
u64 nr_wakeups_affine;
u64 nr_wakeups_affine_attempts;
u64 nr_wakeups_passive;
u64 nr_wakeups_idle;
};
#endif
struct sched_entity {
struct load_weight load; /* for load-balancing */
struct rb_node run_node;
struct list_head group_node;
unsigned int on_rq;
u64 exec_start;
u64 sum_exec_runtime;
u64 vruntime;
u64 prev_sum_exec_runtime;
u64 nr_migrations;
#ifdef CONFIG_SCHEDSTATS
struct sched_statistics statistics;
#endif
#ifdef CONFIG_FAIR_GROUP_SCHED
struct sched_entity *parent;
/* rq on which this entity is (to be) queued: */
struct cfs_rq *cfs_rq;
/* rq "owned" by this entity/group: */
struct cfs_rq *my_q;
#endif
};
struct sched_rt_entity {
struct list_head run_list;
unsigned long timeout;
unsigned int time_slice;
int nr_cpus_allowed;
struct sched_rt_entity *back;
#ifdef CONFIG_RT_GROUP_SCHED
struct sched_rt_entity *parent;
/* rq on which this entity is (to be) queued: */
struct rt_rq *rt_rq;
/* rq "owned" by this entity/group: */
struct rt_rq *my_q;
#endif
};
/*
* default timeslice is 100 msecs (used only for SCHED_RR tasks).
* Timeslices get refilled after they expire.
*/
#define RR_TIMESLICE (100 * HZ / 1000)
struct rcu_node;
enum perf_event_task_context {
perf_invalid_context = -1,
perf_hw_context = 0,
perf_sw_context,
perf_nr_task_contexts,
};
struct task_struct {
volatile long state; /* -1 unrunnable, 0 runnable, >0 stopped */
rename thread_info to stack This finally renames the thread_info field in task structure to stack, so that the assumptions about this field are gone and archs have more freedom about placing the thread_info structure. Nonbroken archs which have a proper thread pointer can do the access to both current thread and task structure via a single pointer. It'll allow for a few more cleanups of the fork code, from which e.g. ia64 could benefit. Signed-off-by: Roman Zippel <zippel@linux-m68k.org> [akpm@linux-foundation.org: build fix] Cc: Richard Henderson <rth@twiddle.net> Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru> Cc: Russell King <rmk@arm.linux.org.uk> Cc: Ian Molton <spyro@f2s.com> Cc: Haavard Skinnemoen <hskinnemoen@atmel.com> Cc: Mikael Starvik <starvik@axis.com> Cc: David Howells <dhowells@redhat.com> Cc: Yoshinori Sato <ysato@users.sourceforge.jp> Cc: "Luck, Tony" <tony.luck@intel.com> Cc: Hirokazu Takata <takata@linux-m32r.org> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Roman Zippel <zippel@linux-m68k.org> Cc: Greg Ungerer <gerg@uclinux.org> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Paul Mackerras <paulus@samba.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Paul Mundt <lethal@linux-sh.org> Cc: Kazumoto Kojima <kkojima@rr.iij4u.or.jp> Cc: Richard Curnow <rc@rc0.org.uk> Cc: William Lee Irwin III <wli@holomorphy.com> Cc: "David S. Miller" <davem@davemloft.net> Cc: Jeff Dike <jdike@addtoit.com> Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it> Cc: Miles Bader <uclinux-v850@lsi.nec.co.jp> Cc: Andi Kleen <ak@muc.de> Cc: Chris Zankel <chris@zankel.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-09 17:35:17 +08:00
void *stack;
atomic_t usage;
unsigned int flags; /* per process flags, defined below */
unsigned int ptrace;
[PATCH] sched: implement smpnice Problem: The introduction of separate run queues per CPU has brought with it "nice" enforcement problems that are best described by a simple example. For the sake of argument suppose that on a single CPU machine with a nice==19 hard spinner and a nice==0 hard spinner running that the nice==0 task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and 2 nice==0 hard spinners running. The user of this system would be entitled to expect that the nice==0 tasks each get 95% of a CPU and the nice==19 tasks only get 5% each. However, whether this expectation is met is pretty much down to luck as there are four equally likely distributions of the tasks to the CPUs that the load balancing code will consider to be balanced with loads of 2.0 for each CPU. Two of these distributions involve one nice==0 and one nice==19 task per CPU and in these circumstances the users expectations will be met. The other two distributions both involve both nice==0 tasks being on one CPU and both nice==19 being on the other CPU and each task will get 50% of a CPU and the user's expectations will not be met. Solution: The solution to this problem that is implemented in the attached patch is to use weighted loads when determining if the system is balanced and, when an imbalance is detected, to move an amount of weighted load between run queues (as opposed to a number of tasks) to restore the balance. Once again, the easiest way to explain why both of these measures are necessary is to use a simple example. Suppose that (in a slight variation of the above example) that we have a two CPU system with 4 nice==0 and 4 nice=19 hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and the 4 nice==19 tasks are on the other CPU. The weighted loads for the two CPUs would be 4.0 and 0.2 respectively and the load balancing code would move 2 tasks resulting in one CPU with a load of 2.0 and the other with load of 2.2. If this was considered to be a big enough imbalance to justify moving a task and that task was moved using the current move_tasks() then it would move the highest priority task that it found and this would result in one CPU with a load of 3.0 and the other with a load of 1.2 which would result in the movement of a task in the opposite direction and so on -- infinite loop. If, on the other hand, an amount of load to be moved is calculated from the imbalance (in this case 0.1) and move_tasks() skips tasks until it find ones whose contributions to the weighted load are less than this amount it would move two of the nice==19 tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with loads of 2.1 for each CPU. One of the advantages of this mechanism is that on a system where all tasks have nice==0 the load balancing calculations would be mathematically identical to the current load balancing code. Notes: struct task_struct: has a new field load_weight which (in a trade off of space for speed) stores the contribution that this task makes to a CPU's weighted load when it is runnable. struct runqueue: has a new field raw_weighted_load which is the sum of the load_weight values for the currently runnable tasks on this run queue. This field always needs to be updated when nr_running is updated so two new inline functions inc_nr_running() and dec_nr_running() have been created to make sure that this happens. This also offers a convenient way to optimize away this part of the smpnice mechanism when CONFIG_SMP is not defined. int try_to_wake_up(): in this function the value SCHED_LOAD_BALANCE is used to represent the load contribution of a single task in various calculations in the code that decides which CPU to put the waking task on. While this would be a valid on a system where the nice values for the runnable tasks were distributed evenly around zero it will lead to anomalous load balancing if the distribution is skewed in either direction. To overcome this problem SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task or by the average load_weight per task for the queue in question (as appropriate). int move_tasks(): The modifications to this function were complicated by the fact that active_load_balance() uses it to move exactly one task without checking whether an imbalance actually exists. This precluded the simple overloading of max_nr_move with max_load_move and necessitated the addition of the latter as an extra argument to the function. The internal implementation is then modified to move up to max_nr_move tasks and max_load_move of weighted load. This slightly complicates the code where move_tasks() is called and if ever active_load_balance() is changed to not use move_tasks() the implementation of move_tasks() should be simplified accordingly. struct sched_group *find_busiest_group(): Similar to try_to_wake_up(), there are places in this function where SCHED_LOAD_SCALE is used to represent the load contribution of a single task and the same issues are created. A similar solution is adopted except that it is now the average per task contribution to a group's load (as opposed to a run queue) that is required. As this value is not directly available from the group it is calculated on the fly as the queues in the groups are visited when determining the busiest group. A key change to this function is that it is no longer to scale down *imbalance on exit as move_tasks() uses the load in its scaled form. void set_user_nice(): has been modified to update the task's load_weight field when it's nice value and also to ensure that its run queue's raw_weighted_load field is updated if it was runnable. From: "Siddha, Suresh B" <suresh.b.siddha@intel.com> With smpnice, sched groups with highest priority tasks can mask the imbalance between the other sched groups with in the same domain. This patch fixes some of the listed down scenarios by not considering the sched groups which are lightly loaded. a) on a simple 4-way MP system, if we have one high priority and 4 normal priority tasks, with smpnice we would like to see the high priority task scheduled on one cpu, two other cpus getting one normal task each and the fourth cpu getting the remaining two normal tasks. but with current smpnice extra normal priority task keeps jumping from one cpu to another cpu having the normal priority task. This is because of the busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the cpu with high priority task in max_load calculations but including that in total and avg_load calcuations.. leading to max_load < avg_load and load balance between cpus running normal priority tasks(2 Vs 1) will always show imbalanace as one normal priority and the extra normal priority task will keep moving from one cpu to another cpu having normal priority task.. b) 4-way system with HT (8 logical processors). Package-P0 T0 has a highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal priority task each.. P2 and P3 are idle. With this patch, one of the normal priority tasks on P1 will be moved to P2 or P3.. c) With the current weighted smp nice calculations, it doesn't always make sense to look at the highest weighted runqueue in the busy group.. Consider a load balance scenario on a DP with HT system, with Package-0 containing one high priority and one low priority, Package-1 containing one low priority(with other thread being idle).. Package-1 thinks that it need to take the low priority thread from Package-0. And find_busiest_queue() returns the cpu thread with highest priority task.. And ultimately(with help of active load balance) we move high priority task to Package-1. And same continues with Package-0 now, moving high priority task from package-1 to package-0.. Even without the presence of active load balance, load balance will fail to balance the above scenario.. Fix find_busiest_queue to use "imbalance" when it is lightly loaded. [kernel@kolivas.org: sched: store weighted load on up] [kernel@kolivas.org: sched: add discrete weighted cpu load function] [suresh.b.siddha@intel.com: sched: remove dead code] Signed-off-by: Peter Williams <pwil3058@bigpond.com.au> Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com> Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com> Acked-by: Ingo Molnar <mingo@elte.hu> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Signed-off-by: Con Kolivas <kernel@kolivas.org> Cc: John Hawkes <hawkes@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 17:54:34 +08:00
#ifdef CONFIG_SMP
struct llist_node wake_entry;
int on_cpu;
[PATCH] sched: implement smpnice Problem: The introduction of separate run queues per CPU has brought with it "nice" enforcement problems that are best described by a simple example. For the sake of argument suppose that on a single CPU machine with a nice==19 hard spinner and a nice==0 hard spinner running that the nice==0 task gets 95% of the CPU and the nice==19 task gets 5% of the CPU. Now suppose that there is a system with 2 CPUs and 2 nice==19 hard spinners and 2 nice==0 hard spinners running. The user of this system would be entitled to expect that the nice==0 tasks each get 95% of a CPU and the nice==19 tasks only get 5% each. However, whether this expectation is met is pretty much down to luck as there are four equally likely distributions of the tasks to the CPUs that the load balancing code will consider to be balanced with loads of 2.0 for each CPU. Two of these distributions involve one nice==0 and one nice==19 task per CPU and in these circumstances the users expectations will be met. The other two distributions both involve both nice==0 tasks being on one CPU and both nice==19 being on the other CPU and each task will get 50% of a CPU and the user's expectations will not be met. Solution: The solution to this problem that is implemented in the attached patch is to use weighted loads when determining if the system is balanced and, when an imbalance is detected, to move an amount of weighted load between run queues (as opposed to a number of tasks) to restore the balance. Once again, the easiest way to explain why both of these measures are necessary is to use a simple example. Suppose that (in a slight variation of the above example) that we have a two CPU system with 4 nice==0 and 4 nice=19 hard spinning tasks running and that the 4 nice==0 tasks are on one CPU and the 4 nice==19 tasks are on the other CPU. The weighted loads for the two CPUs would be 4.0 and 0.2 respectively and the load balancing code would move 2 tasks resulting in one CPU with a load of 2.0 and the other with load of 2.2. If this was considered to be a big enough imbalance to justify moving a task and that task was moved using the current move_tasks() then it would move the highest priority task that it found and this would result in one CPU with a load of 3.0 and the other with a load of 1.2 which would result in the movement of a task in the opposite direction and so on -- infinite loop. If, on the other hand, an amount of load to be moved is calculated from the imbalance (in this case 0.1) and move_tasks() skips tasks until it find ones whose contributions to the weighted load are less than this amount it would move two of the nice==19 tasks resulting in a system with 2 nice==0 and 2 nice=19 on each CPU with loads of 2.1 for each CPU. One of the advantages of this mechanism is that on a system where all tasks have nice==0 the load balancing calculations would be mathematically identical to the current load balancing code. Notes: struct task_struct: has a new field load_weight which (in a trade off of space for speed) stores the contribution that this task makes to a CPU's weighted load when it is runnable. struct runqueue: has a new field raw_weighted_load which is the sum of the load_weight values for the currently runnable tasks on this run queue. This field always needs to be updated when nr_running is updated so two new inline functions inc_nr_running() and dec_nr_running() have been created to make sure that this happens. This also offers a convenient way to optimize away this part of the smpnice mechanism when CONFIG_SMP is not defined. int try_to_wake_up(): in this function the value SCHED_LOAD_BALANCE is used to represent the load contribution of a single task in various calculations in the code that decides which CPU to put the waking task on. While this would be a valid on a system where the nice values for the runnable tasks were distributed evenly around zero it will lead to anomalous load balancing if the distribution is skewed in either direction. To overcome this problem SCHED_LOAD_SCALE has been replaced by the load_weight for the relevant task or by the average load_weight per task for the queue in question (as appropriate). int move_tasks(): The modifications to this function were complicated by the fact that active_load_balance() uses it to move exactly one task without checking whether an imbalance actually exists. This precluded the simple overloading of max_nr_move with max_load_move and necessitated the addition of the latter as an extra argument to the function. The internal implementation is then modified to move up to max_nr_move tasks and max_load_move of weighted load. This slightly complicates the code where move_tasks() is called and if ever active_load_balance() is changed to not use move_tasks() the implementation of move_tasks() should be simplified accordingly. struct sched_group *find_busiest_group(): Similar to try_to_wake_up(), there are places in this function where SCHED_LOAD_SCALE is used to represent the load contribution of a single task and the same issues are created. A similar solution is adopted except that it is now the average per task contribution to a group's load (as opposed to a run queue) that is required. As this value is not directly available from the group it is calculated on the fly as the queues in the groups are visited when determining the busiest group. A key change to this function is that it is no longer to scale down *imbalance on exit as move_tasks() uses the load in its scaled form. void set_user_nice(): has been modified to update the task's load_weight field when it's nice value and also to ensure that its run queue's raw_weighted_load field is updated if it was runnable. From: "Siddha, Suresh B" <suresh.b.siddha@intel.com> With smpnice, sched groups with highest priority tasks can mask the imbalance between the other sched groups with in the same domain. This patch fixes some of the listed down scenarios by not considering the sched groups which are lightly loaded. a) on a simple 4-way MP system, if we have one high priority and 4 normal priority tasks, with smpnice we would like to see the high priority task scheduled on one cpu, two other cpus getting one normal task each and the fourth cpu getting the remaining two normal tasks. but with current smpnice extra normal priority task keeps jumping from one cpu to another cpu having the normal priority task. This is because of the busiest_has_loaded_cpus, nr_loaded_cpus logic.. We are not including the cpu with high priority task in max_load calculations but including that in total and avg_load calcuations.. leading to max_load < avg_load and load balance between cpus running normal priority tasks(2 Vs 1) will always show imbalanace as one normal priority and the extra normal priority task will keep moving from one cpu to another cpu having normal priority task.. b) 4-way system with HT (8 logical processors). Package-P0 T0 has a highest priority task, T1 is idle. Package-P1 Both T0 and T1 have 1 normal priority task each.. P2 and P3 are idle. With this patch, one of the normal priority tasks on P1 will be moved to P2 or P3.. c) With the current weighted smp nice calculations, it doesn't always make sense to look at the highest weighted runqueue in the busy group.. Consider a load balance scenario on a DP with HT system, with Package-0 containing one high priority and one low priority, Package-1 containing one low priority(with other thread being idle).. Package-1 thinks that it need to take the low priority thread from Package-0. And find_busiest_queue() returns the cpu thread with highest priority task.. And ultimately(with help of active load balance) we move high priority task to Package-1. And same continues with Package-0 now, moving high priority task from package-1 to package-0.. Even without the presence of active load balance, load balance will fail to balance the above scenario.. Fix find_busiest_queue to use "imbalance" when it is lightly loaded. [kernel@kolivas.org: sched: store weighted load on up] [kernel@kolivas.org: sched: add discrete weighted cpu load function] [suresh.b.siddha@intel.com: sched: remove dead code] Signed-off-by: Peter Williams <pwil3058@bigpond.com.au> Cc: "Siddha, Suresh B" <suresh.b.siddha@intel.com> Cc: "Chen, Kenneth W" <kenneth.w.chen@intel.com> Acked-by: Ingo Molnar <mingo@elte.hu> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Signed-off-by: Con Kolivas <kernel@kolivas.org> Cc: John Hawkes <hawkes@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 17:54:34 +08:00
#endif
int on_rq;
int prio, static_prio, normal_prio;
unsigned int rt_priority;
const struct sched_class *sched_class;
struct sched_entity se;
struct sched_rt_entity rt;
#ifdef CONFIG_PREEMPT_NOTIFIERS
/* list of struct preempt_notifier: */
struct hlist_head preempt_notifiers;
#endif
/*
* fpu_counter contains the number of consecutive context switches
* that the FPU is used. If this is over a threshold, the lazy fpu
* saving becomes unlazy to save the trap. This is an unsigned char
* so that after 256 times the counter wraps and the behavior turns
* lazy again; this to deal with bursty apps that only use FPU for
* a short time
*/
unsigned char fpu_counter;
#ifdef CONFIG_BLK_DEV_IO_TRACE
unsigned int btrace_seq;
#endif
unsigned int policy;
cpumask_t cpus_allowed;
rcu: Add a TINY_PREEMPT_RCU Implement a small-memory-footprint uniprocessor-only implementation of preemptible RCU. This implementation uses but a single blocked-tasks list rather than the combinatorial number used per leaf rcu_node by TREE_PREEMPT_RCU, which reduces memory consumption and greatly simplifies processing. This version also takes advantage of uniprocessor execution to accelerate grace periods in the case where there are no readers. The general design is otherwise broadly similar to that of TREE_PREEMPT_RCU. This implementation is a step towards having RCU implementation driven off of the SMP and PREEMPT kernel configuration variables, which can happen once this implementation has accumulated sufficient experience. Removed ACCESS_ONCE() from __rcu_read_unlock() and added barrier() as suggested by Steve Rostedt in order to avoid the compiler-reordering issue noted by Mathieu Desnoyers (http://lkml.org/lkml/2010/8/16/183). As can be seen below, CONFIG_TINY_PREEMPT_RCU represents almost 5Kbyte savings compared to CONFIG_TREE_PREEMPT_RCU. Of course, for non-real-time workloads, CONFIG_TINY_RCU is even better. CONFIG_TREE_PREEMPT_RCU text data bss dec filename 13 0 0 13 kernel/rcupdate.o 6170 825 28 7023 kernel/rcutree.o ---- 7026 Total CONFIG_TINY_PREEMPT_RCU text data bss dec filename 13 0 0 13 kernel/rcupdate.o 2081 81 8 2170 kernel/rcutiny.o ---- 2183 Total CONFIG_TINY_RCU (non-preemptible) text data bss dec filename 13 0 0 13 kernel/rcupdate.o 719 25 0 744 kernel/rcutiny.o --- 757 Total Requested-by: Loïc Minier <loic.minier@canonical.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
2010-06-30 07:49:16 +08:00
#ifdef CONFIG_PREEMPT_RCU
int rcu_read_lock_nesting;
rcu: Merge preemptable-RCU functionality into hierarchical RCU Create a kernel/rcutree_plugin.h file that contains definitions for preemptable RCU (or, under the #else branch of the #ifdef, empty definitions for the classic non-preemptable semantics). These definitions fit into plugins defined in kernel/rcutree.c for this purpose. This variant of preemptable RCU uses a new algorithm whose read-side expense is roughly that of classic hierarchical RCU under CONFIG_PREEMPT. This new algorithm's update-side expense is similar to that of classic hierarchical RCU, and, in absence of read-side preemption or blocking, is exactly that of classic hierarchical RCU. Perhaps more important, this new algorithm has a much simpler implementation, saving well over 1,000 lines of code compared to mainline's implementation of preemptable RCU, which will hopefully be retired in favor of this new algorithm. The simplifications are obtained by maintaining per-task nesting state for running tasks, and using a simple lock-protected algorithm to handle accounting when tasks block within RCU read-side critical sections, making use of lessons learned while creating numerous user-level RCU implementations over the past 18 months. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josht@linux.vnet.ibm.com Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org LKML-Reference: <12509746134003-git-send-email-> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-08-23 04:56:52 +08:00
char rcu_read_unlock_special;
struct list_head rcu_node_entry;
rcu: Add a TINY_PREEMPT_RCU Implement a small-memory-footprint uniprocessor-only implementation of preemptible RCU. This implementation uses but a single blocked-tasks list rather than the combinatorial number used per leaf rcu_node by TREE_PREEMPT_RCU, which reduces memory consumption and greatly simplifies processing. This version also takes advantage of uniprocessor execution to accelerate grace periods in the case where there are no readers. The general design is otherwise broadly similar to that of TREE_PREEMPT_RCU. This implementation is a step towards having RCU implementation driven off of the SMP and PREEMPT kernel configuration variables, which can happen once this implementation has accumulated sufficient experience. Removed ACCESS_ONCE() from __rcu_read_unlock() and added barrier() as suggested by Steve Rostedt in order to avoid the compiler-reordering issue noted by Mathieu Desnoyers (http://lkml.org/lkml/2010/8/16/183). As can be seen below, CONFIG_TINY_PREEMPT_RCU represents almost 5Kbyte savings compared to CONFIG_TREE_PREEMPT_RCU. Of course, for non-real-time workloads, CONFIG_TINY_RCU is even better. CONFIG_TREE_PREEMPT_RCU text data bss dec filename 13 0 0 13 kernel/rcupdate.o 6170 825 28 7023 kernel/rcutree.o ---- 7026 Total CONFIG_TINY_PREEMPT_RCU text data bss dec filename 13 0 0 13 kernel/rcupdate.o 2081 81 8 2170 kernel/rcutiny.o ---- 2183 Total CONFIG_TINY_RCU (non-preemptible) text data bss dec filename 13 0 0 13 kernel/rcupdate.o 719 25 0 744 kernel/rcutiny.o --- 757 Total Requested-by: Loïc Minier <loic.minier@canonical.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
2010-06-30 07:49:16 +08:00
#endif /* #ifdef CONFIG_PREEMPT_RCU */
#ifdef CONFIG_TREE_PREEMPT_RCU
struct rcu_node *rcu_blocked_node;
rcu: Merge preemptable-RCU functionality into hierarchical RCU Create a kernel/rcutree_plugin.h file that contains definitions for preemptable RCU (or, under the #else branch of the #ifdef, empty definitions for the classic non-preemptable semantics). These definitions fit into plugins defined in kernel/rcutree.c for this purpose. This variant of preemptable RCU uses a new algorithm whose read-side expense is roughly that of classic hierarchical RCU under CONFIG_PREEMPT. This new algorithm's update-side expense is similar to that of classic hierarchical RCU, and, in absence of read-side preemption or blocking, is exactly that of classic hierarchical RCU. Perhaps more important, this new algorithm has a much simpler implementation, saving well over 1,000 lines of code compared to mainline's implementation of preemptable RCU, which will hopefully be retired in favor of this new algorithm. The simplifications are obtained by maintaining per-task nesting state for running tasks, and using a simple lock-protected algorithm to handle accounting when tasks block within RCU read-side critical sections, making use of lessons learned while creating numerous user-level RCU implementations over the past 18 months. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josht@linux.vnet.ibm.com Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org LKML-Reference: <12509746134003-git-send-email-> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-08-23 04:56:52 +08:00
#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
#ifdef CONFIG_RCU_BOOST
struct rt_mutex *rcu_boost_mutex;
#endif /* #ifdef CONFIG_RCU_BOOST */
#if defined(CONFIG_SCHEDSTATS) || defined(CONFIG_TASK_DELAY_ACCT)
struct sched_info sched_info;
#endif
struct list_head tasks;
#ifdef CONFIG_SMP
sched: create "pushable_tasks" list to limit pushing to one attempt The RT scheduler employs a "push/pull" design to actively balance tasks within the system (on a per disjoint cpuset basis). When a task is awoken, it is immediately determined if there are any lower priority cpus which should be preempted. This is opposed to the way normal SCHED_OTHER tasks behave, which will wait for a periodic rebalancing operation to occur before spreading out load. When a particular RQ has more than 1 active RT task, it is said to be in an "overloaded" state. Once this occurs, the system enters the active balancing mode, where it will try to push the task away, or persuade a different cpu to pull it over. The system will stay in this state until the system falls back below the <= 1 queued RT task per RQ. However, the current implementation suffers from a limitation in the push logic. Once overloaded, all tasks (other than current) on the RQ are analyzed on every push operation, even if it was previously unpushable (due to affinity, etc). Whats more, the operation stops at the first task that is unpushable and will not look at items lower in the queue. This causes two problems: 1) We can have the same tasks analyzed over and over again during each push, which extends out the fast path in the scheduler for no gain. Consider a RQ that has dozens of tasks that are bound to a core. Each one of those tasks will be encountered and skipped for each push operation while they are queued. 2) There may be lower-priority tasks under the unpushable task that could have been successfully pushed, but will never be considered until either the unpushable task is cleared, or a pull operation succeeds. The net result is a potential latency source for mid priority tasks. This patch aims to rectify these two conditions by introducing a new priority sorted list: "pushable_tasks". A task is added to the list each time a task is activated or preempted. It is removed from the list any time it is deactivated, made current, or fails to push. This works because a task only needs to be attempted to push once. After an initial failure to push, the other cpus will eventually try to pull the task when the conditions are proper. This also solves the problem that we don't completely analyze all tasks due to encountering an unpushable tasks. Now every task will have a push attempted (when appropriate). This reduces latency both by shorting the critical section of the rq->lock for certain workloads, and by making sure the algorithm considers all eligible tasks in the system. [ rostedt: added a couple more BUG_ONs ] Signed-off-by: Gregory Haskins <ghaskins@novell.com> Acked-by: Steven Rostedt <srostedt@redhat.com>
2008-12-29 22:39:53 +08:00
struct plist_node pushable_tasks;
#endif
struct mm_struct *mm, *active_mm;
#ifdef CONFIG_COMPAT_BRK
unsigned brk_randomized:1;
#endif
#if defined(SPLIT_RSS_COUNTING)
struct task_rss_stat rss_stat;
#endif
/* task state */
int exit_state;
int exit_code, exit_signal;
int pdeath_signal; /* The signal sent when the parent dies */
unsigned int jobctl; /* JOBCTL_*, siglock protected */
/* ??? */
unsigned int personality;
unsigned did_exec:1;
unsigned in_execve:1; /* Tell the LSMs that the process is doing an
* execve */
unsigned in_iowait:1;
sched: Introduce SCHED_RESET_ON_FORK scheduling policy flag This patch introduces a new flag SCHED_RESET_ON_FORK which can be passed to the kernel via sched_setscheduler(), ORed in the policy parameter. If set this will make sure that when the process forks a) the scheduling priority is reset to DEFAULT_PRIO if it was higher and b) the scheduling policy is reset to SCHED_NORMAL if it was either SCHED_FIFO or SCHED_RR. Why have this? Currently, if a process is real-time scheduled this will 'leak' to all its child processes. For security reasons it is often (always?) a good idea to make sure that if a process acquires RT scheduling this is confined to this process and only this process. More specifically this makes the per-process resource limit RLIMIT_RTTIME useful for security purposes, because it makes it impossible to use a fork bomb to circumvent the per-process RLIMIT_RTTIME accounting. This feature is also useful for tools like 'renice' which can then change the nice level of a process without having this spill to all its child processes. Why expose this via sched_setscheduler() and not other syscalls such as prctl() or sched_setparam()? prctl() does not take a pid parameter. Due to that it would be impossible to modify this flag for other processes than the current one. The struct passed to sched_setparam() can unfortunately not be extended without breaking compatibility, since sched_setparam() lacks a size parameter. How to use this from userspace? In your RT program simply replace this: sched_setscheduler(pid, SCHED_FIFO, &param); by this: sched_setscheduler(pid, SCHED_FIFO|SCHED_RESET_ON_FORK, &param); Signed-off-by: Lennart Poettering <lennart@poettering.net> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> LKML-Reference: <20090615152714.GA29092@tango.0pointer.de> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-06-15 23:17:47 +08:00
/* Revert to default priority/policy when forking */
unsigned sched_reset_on_fork:1;
unsigned sched_contributes_to_load:1;
sched: Introduce SCHED_RESET_ON_FORK scheduling policy flag This patch introduces a new flag SCHED_RESET_ON_FORK which can be passed to the kernel via sched_setscheduler(), ORed in the policy parameter. If set this will make sure that when the process forks a) the scheduling priority is reset to DEFAULT_PRIO if it was higher and b) the scheduling policy is reset to SCHED_NORMAL if it was either SCHED_FIFO or SCHED_RR. Why have this? Currently, if a process is real-time scheduled this will 'leak' to all its child processes. For security reasons it is often (always?) a good idea to make sure that if a process acquires RT scheduling this is confined to this process and only this process. More specifically this makes the per-process resource limit RLIMIT_RTTIME useful for security purposes, because it makes it impossible to use a fork bomb to circumvent the per-process RLIMIT_RTTIME accounting. This feature is also useful for tools like 'renice' which can then change the nice level of a process without having this spill to all its child processes. Why expose this via sched_setscheduler() and not other syscalls such as prctl() or sched_setparam()? prctl() does not take a pid parameter. Due to that it would be impossible to modify this flag for other processes than the current one. The struct passed to sched_setparam() can unfortunately not be extended without breaking compatibility, since sched_setparam() lacks a size parameter. How to use this from userspace? In your RT program simply replace this: sched_setscheduler(pid, SCHED_FIFO, &param); by this: sched_setscheduler(pid, SCHED_FIFO|SCHED_RESET_ON_FORK, &param); Signed-off-by: Lennart Poettering <lennart@poettering.net> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> LKML-Reference: <20090615152714.GA29092@tango.0pointer.de> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-06-15 23:17:47 +08:00
#ifdef CONFIG_GENERIC_HARDIRQS
/* IRQ handler threads */
unsigned irq_thread:1;
#endif
pid_t pid;
pid_t tgid;
#ifdef CONFIG_CC_STACKPROTECTOR
/* Canary value for the -fstack-protector gcc feature */
unsigned long stack_canary;
#endif
/*
* pointers to (original) parent process, youngest child, younger sibling,
* older sibling, respectively. (p->father can be replaced with
* p->real_parent->pid)
*/
struct task_struct __rcu *real_parent; /* real parent process */
struct task_struct __rcu *parent; /* recipient of SIGCHLD, wait4() reports */
/*
* children/sibling forms the list of my natural children
*/
struct list_head children; /* list of my children */
struct list_head sibling; /* linkage in my parent's children list */
struct task_struct *group_leader; /* threadgroup leader */
/*
* ptraced is the list of tasks this task is using ptrace on.
* This includes both natural children and PTRACE_ATTACH targets.
* p->ptrace_entry is p's link on the p->parent->ptraced list.
*/
struct list_head ptraced;
struct list_head ptrace_entry;
/* PID/PID hash table linkage. */
[PATCH] pidhash: Refactor the pid hash table Simplifies the code, reduces the need for 4 pid hash tables, and makes the code more capable. In the discussions I had with Oleg it was felt that to a large extent the cleanup itself justified the work. With struct pid being dynamically allocated meant we could create the hash table entry when the pid was allocated and free the hash table entry when the pid was freed. Instead of playing with the hash lists when ever a process would attach or detach to a process. For myself the fact that it gave what my previous task_ref patch gave for free with simpler code was a big win. The problem is that if you hold a reference to struct task_struct you lock in 10K of low memory. If you do that in a user controllable way like /proc does, with an unprivileged but hostile user space application with typical resource limits of 1000 fds and 100 processes I can trigger the OOM killer by consuming all of low memory with task structs, on a machine wight 1GB of low memory. If I instead hold a reference to struct pid which holds a pointer to my task_struct, I don't suffer from that problem because struct pid is 2 orders of magnitude smaller. In fact struct pid is small enough that most other kernel data structures dwarf it, so simply limiting the number of referring data structures is enough to prevent exhaustion of low memory. This splits the current struct pid into two structures, struct pid and struct pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one. struct pid_link is the per process linkage into the hash tables and lives in struct task_struct. struct pid is given an indepedent lifetime, and holds pointers to each of the pid types. The independent life of struct pid simplifies attach_pid, and detach_pid, because we are always manipulating the list of pids and not the hash table. In addition in giving struct pid an indpendent life it makes the concept much more powerful. Kernel data structures can now embed a struct pid * instead of a pid_t and not suffer from pid wrap around problems or from keeping unnecessarily large amounts of memory allocated. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 18:31:42 +08:00
struct pid_link pids[PIDTYPE_MAX];
struct list_head thread_group;
struct completion *vfork_done; /* for vfork() */
int __user *set_child_tid; /* CLONE_CHILD_SETTID */
int __user *clear_child_tid; /* CLONE_CHILD_CLEARTID */
cputime_t utime, stime, utimescaled, stimescaled;
cputime_t gtime;
#ifndef CONFIG_VIRT_CPU_ACCOUNTING
cputime_t prev_utime, prev_stime;
#endif
unsigned long nvcsw, nivcsw; /* context switch counts */
struct timespec start_time; /* monotonic time */
struct timespec real_start_time; /* boot based time */
/* mm fault and swap info: this can arguably be seen as either mm-specific or thread-specific */
unsigned long min_flt, maj_flt;
timers: fix itimer/many thread hang Overview This patch reworks the handling of POSIX CPU timers, including the ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together with the help of Roland McGrath, the owner and original writer of this code. The problem we ran into, and the reason for this rework, has to do with using a profiling timer in a process with a large number of threads. It appears that the performance of the old implementation of run_posix_cpu_timers() was at least O(n*3) (where "n" is the number of threads in a process) or worse. Everything is fine with an increasing number of threads until the time taken for that routine to run becomes the same as or greater than the tick time, at which point things degrade rather quickly. This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF." Code Changes This rework corrects the implementation of run_posix_cpu_timers() to make it run in constant time for a particular machine. (Performance may vary between one machine and another depending upon whether the kernel is built as single- or multiprocessor and, in the latter case, depending upon the number of running processors.) To do this, at each tick we now update fields in signal_struct as well as task_struct. The run_posix_cpu_timers() function uses those fields to make its decisions. We define a new structure, "task_cputime," to contain user, system and scheduler times and use these in appropriate places: struct task_cputime { cputime_t utime; cputime_t stime; unsigned long long sum_exec_runtime; }; This is included in the structure "thread_group_cputime," which is a new substructure of signal_struct and which varies for uniprocessor versus multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as a simple substructure, while for multiprocessor kernels it is a pointer: struct thread_group_cputime { struct task_cputime totals; }; struct thread_group_cputime { struct task_cputime *totals; }; We also add a new task_cputime substructure directly to signal_struct, to cache the earliest expiration of process-wide timers, and task_cputime also replaces the it_*_expires fields of task_struct (used for earliest expiration of thread timers). The "thread_group_cputime" structure contains process-wide timers that are updated via account_user_time() and friends. In the non-SMP case the structure is a simple aggregator; unfortunately in the SMP case that simplicity was not achievable due to cache-line contention between CPUs (in one measured case performance was actually _worse_ on a 16-cpu system than the same test on a 4-cpu system, due to this contention). For SMP, the thread_group_cputime counters are maintained as a per-cpu structure allocated using alloc_percpu(). The timer functions update only the timer field in the structure corresponding to the running CPU, obtained using per_cpu_ptr(). We define a set of inline functions in sched.h that we use to maintain the thread_group_cputime structure and hide the differences between UP and SMP implementations from the rest of the kernel. The thread_group_cputime_init() function initializes the thread_group_cputime structure for the given task. The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the out-of-line function thread_group_cputime_alloc_smp() to allocate and fill in the per-cpu structures and fields. The thread_group_cputime_free() function, also a no-op for UP, in SMP frees the per-cpu structures. The thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls thread_group_cputime_alloc() if the per-cpu structures haven't yet been allocated. The thread_group_cputime() function fills the task_cputime structure it is passed with the contents of the thread_group_cputime fields; in UP it's that simple but in SMP it must also safely check that tsk->signal is non-NULL (if it is it just uses the appropriate fields of task_struct) and, if so, sums the per-cpu values for each online CPU. Finally, the three functions account_group_user_time(), account_group_system_time() and account_group_exec_runtime() are used by timer functions to update the respective fields of the thread_group_cputime structure. Non-SMP operation is trivial and will not be mentioned further. The per-cpu structure is always allocated when a task creates its first new thread, via a call to thread_group_cputime_clone_thread() from copy_signal(). It is freed at process exit via a call to thread_group_cputime_free() from cleanup_signal(). All functions that formerly summed utime/stime/sum_sched_runtime values from from all threads in the thread group now use thread_group_cputime() to snapshot the values in the thread_group_cputime structure or the values in the task structure itself if the per-cpu structure hasn't been allocated. Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit. The run_posix_cpu_timers() function has been split into a fast path and a slow path; the former safely checks whether there are any expired thread timers and, if not, just returns, while the slow path does the heavy lifting. With the dedicated thread group fields, timers are no longer "rebalanced" and the process_timer_rebalance() function and related code has gone away. All summing loops are gone and all code that used them now uses the thread_group_cputime() inline. When process-wide timers are set, the new task_cputime structure in signal_struct is used to cache the earliest expiration; this is checked in the fast path. Performance The fix appears not to add significant overhead to existing operations. It generally performs the same as the current code except in two cases, one in which it performs slightly worse (Case 5 below) and one in which it performs very significantly better (Case 2 below). Overall it's a wash except in those two cases. I've since done somewhat more involved testing on a dual-core Opteron system. Case 1: With no itimer running, for a test with 100,000 threads, the fixed kernel took 1428.5 seconds, 513 seconds more than the unfixed system, all of which was spent in the system. There were twice as many voluntary context switches with the fix as without it. Case 2: With an itimer running at .01 second ticks and 4000 threads (the most an unmodified kernel can handle), the fixed kernel ran the test in eight percent of the time (5.8 seconds as opposed to 70 seconds) and had better tick accuracy (.012 seconds per tick as opposed to .023 seconds per tick). Case 3: A 4000-thread test with an initial timer tick of .01 second and an interval of 10,000 seconds (i.e. a timer that ticks only once) had very nearly the same performance in both cases: 6.3 seconds elapsed for the fixed kernel versus 5.5 seconds for the unfixed kernel. With fewer threads (eight in these tests), the Case 1 test ran in essentially the same time on both the modified and unmodified kernels (5.2 seconds versus 5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds versus 5.4 seconds but again with much better tick accuracy, .013 seconds per tick versus .025 seconds per tick for the unmodified kernel. Since the fix affected the rlimit code, I also tested soft and hard CPU limits. Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer running), the modified kernel was very slightly favored in that while it killed the process in 19.997 seconds of CPU time (5.002 seconds of wall time), only .003 seconds of that was system time, the rest was user time. The unmodified kernel killed the process in 20.001 seconds of CPU (5.014 seconds of wall time) of which .016 seconds was system time. Really, though, the results were too close to call. The results were essentially the same with no itimer running. Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds (where the hard limit would never be reached) and an itimer running, the modified kernel exhibited worse tick accuracy than the unmodified kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise, performance was almost indistinguishable. With no itimer running this test exhibited virtually identical behavior and times in both cases. In times past I did some limited performance testing. those results are below. On a four-cpu Opteron system without this fix, a sixteen-thread test executed in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On the same system with the fix, user and elapsed time were about the same, but system time dropped to 0.007 seconds. Performance with eight, four and one thread were comparable. Interestingly, the timer ticks with the fix seemed more accurate: The sixteen-thread test with the fix received 149543 ticks for 0.024 seconds per tick, while the same test without the fix received 58720 for 0.061 seconds per tick. Both cases were configured for an interval of 0.01 seconds. Again, the other tests were comparable. Each thread in this test computed the primes up to 25,000,000. I also did a test with a large number of threads, 100,000 threads, which is impossible without the fix. In this case each thread computed the primes only up to 10,000 (to make the runtime manageable). System time dominated, at 1546.968 seconds out of a total 2176.906 seconds (giving a user time of 629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite accurate. There is obviously no comparable test without the fix. Signed-off-by: Frank Mayhar <fmayhar@google.com> Cc: Roland McGrath <roland@redhat.com> Cc: Alexey Dobriyan <adobriyan@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
struct task_cputime cputime_expires;
struct list_head cpu_timers[3];
/* process credentials */
const struct cred __rcu *real_cred; /* objective and real subjective task
* credentials (COW) */
const struct cred __rcu *cred; /* effective (overridable) subjective task
* credentials (COW) */
KEYS: Add a keyctl to install a process's session keyring on its parent [try #6] Add a keyctl to install a process's session keyring onto its parent. This replaces the parent's session keyring. Because the COW credential code does not permit one process to change another process's credentials directly, the change is deferred until userspace next starts executing again. Normally this will be after a wait*() syscall. To support this, three new security hooks have been provided: cred_alloc_blank() to allocate unset security creds, cred_transfer() to fill in the blank security creds and key_session_to_parent() - which asks the LSM if the process may replace its parent's session keyring. The replacement may only happen if the process has the same ownership details as its parent, and the process has LINK permission on the session keyring, and the session keyring is owned by the process, and the LSM permits it. Note that this requires alteration to each architecture's notify_resume path. This has been done for all arches barring blackfin, m68k* and xtensa, all of which need assembly alteration to support TIF_NOTIFY_RESUME. This allows the replacement to be performed at the point the parent process resumes userspace execution. This allows the userspace AFS pioctl emulation to fully emulate newpag() and the VIOCSETTOK and VIOCSETTOK2 pioctls, all of which require the ability to alter the parent process's PAG membership. However, since kAFS doesn't use PAGs per se, but rather dumps the keys into the session keyring, the session keyring of the parent must be replaced if, for example, VIOCSETTOK is passed the newpag flag. This can be tested with the following program: #include <stdio.h> #include <stdlib.h> #include <keyutils.h> #define KEYCTL_SESSION_TO_PARENT 18 #define OSERROR(X, S) do { if ((long)(X) == -1) { perror(S); exit(1); } } while(0) int main(int argc, char **argv) { key_serial_t keyring, key; long ret; keyring = keyctl_join_session_keyring(argv[1]); OSERROR(keyring, "keyctl_join_session_keyring"); key = add_key("user", "a", "b", 1, keyring); OSERROR(key, "add_key"); ret = keyctl(KEYCTL_SESSION_TO_PARENT); OSERROR(ret, "KEYCTL_SESSION_TO_PARENT"); return 0; } Compiled and linked with -lkeyutils, you should see something like: [dhowells@andromeda ~]$ keyctl show Session Keyring -3 --alswrv 4043 4043 keyring: _ses 355907932 --alswrv 4043 -1 \_ keyring: _uid.4043 [dhowells@andromeda ~]$ /tmp/newpag [dhowells@andromeda ~]$ keyctl show Session Keyring -3 --alswrv 4043 4043 keyring: _ses 1055658746 --alswrv 4043 4043 \_ user: a [dhowells@andromeda ~]$ /tmp/newpag hello [dhowells@andromeda ~]$ keyctl show Session Keyring -3 --alswrv 4043 4043 keyring: hello 340417692 --alswrv 4043 4043 \_ user: a Where the test program creates a new session keyring, sticks a user key named 'a' into it and then installs it on its parent. Signed-off-by: David Howells <dhowells@redhat.com> Signed-off-by: James Morris <jmorris@namei.org>
2009-09-02 16:14:21 +08:00
struct cred *replacement_session_keyring; /* for KEYCTL_SESSION_TO_PARENT */
char comm[TASK_COMM_LEN]; /* executable name excluding path
- access with [gs]et_task_comm (which lock
it with task_lock())
- initialized normally by setup_new_exec */
/* file system info */
int link_count, total_link_count;
#ifdef CONFIG_SYSVIPC
/* ipc stuff */
struct sysv_sem sysvsem;
#endif
#ifdef CONFIG_DETECT_HUNG_TASK
softlockup: automatically detect hung TASK_UNINTERRUPTIBLE tasks this patch extends the soft-lockup detector to automatically detect hung TASK_UNINTERRUPTIBLE tasks. Such hung tasks are printed the following way: ------------------> INFO: task prctl:3042 blocked for more than 120 seconds. "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message prctl D fd5e3793 0 3042 2997 f6050f38 00000046 00000001 fd5e3793 00000009 c06d8264 c06dae80 00000286 f6050f40 f6050f00 f7d34d90 f7d34fc8 c1e1be80 00000001 f6050000 00000000 f7e92d00 00000286 f6050f18 c0489d1a f6050f40 00006605 00000000 c0133a5b Call Trace: [<c04883a5>] schedule_timeout+0x6d/0x8b [<c04883d8>] schedule_timeout_uninterruptible+0x15/0x17 [<c0133a76>] msleep+0x10/0x16 [<c0138974>] sys_prctl+0x30/0x1e2 [<c0104c52>] sysenter_past_esp+0x5f/0xa5 ======================= 2 locks held by prctl/3042: #0: (&sb->s_type->i_mutex_key#5){--..}, at: [<c0197d11>] do_fsync+0x38/0x7a #1: (jbd_handle){--..}, at: [<c01ca3d2>] journal_start+0xc7/0xe9 <------------------ the current default timeout is 120 seconds. Such messages are printed up to 10 times per bootup. If the system has crashed already then the messages are not printed. if lockdep is enabled then all held locks are printed as well. this feature is a natural extension to the softlockup-detector (kernel locked up without scheduling) and to the NMI watchdog (kernel locked up with IRQs disabled). [ Gautham R Shenoy <ego@in.ibm.com>: CPU hotplug fixes. ] [ Andrew Morton <akpm@linux-foundation.org>: build warning fix. ] Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
2008-01-26 04:08:02 +08:00
/* hung task detection */
unsigned long last_switch_count;
#endif
/* CPU-specific state of this task */
struct thread_struct thread;
/* filesystem information */
struct fs_struct *fs;
/* open file information */
struct files_struct *files;
/* namespaces */
struct nsproxy *nsproxy;
/* signal handlers */
struct signal_struct *signal;
struct sighand_struct *sighand;
sigset_t blocked, real_blocked;
sigset_t saved_sigmask; /* restored if set_restore_sigmask() was used */
struct sigpending pending;
unsigned long sas_ss_sp;
size_t sas_ss_size;
int (*notifier)(void *priv);
void *notifier_data;
sigset_t *notifier_mask;
struct audit_context *audit_context;
#ifdef CONFIG_AUDITSYSCALL
uid_t loginuid;
unsigned int sessionid;
#endif
seccomp_t seccomp;
/* Thread group tracking */
u32 parent_exec_id;
u32 self_exec_id;
cpuset,mm: update tasks' mems_allowed in time Fix allocating page cache/slab object on the unallowed node when memory spread is set by updating tasks' mems_allowed after its cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the code of memory policy. Because the memory policy is applied in the process's context originally. After applying this patch, one task directly manipulates anothers mems_allowed, and we use alloc_lock in the task_struct to protect mems_allowed and memory policy of the task. But in the fast path, we didn't use lock to protect them, because adding a lock may lead to performance regression. But if we don't add a lock,the task might see no nodes when changing cpuset's mems_allowed to some non-overlapping set. In order to avoid it, we set all new allowed nodes, then clear newly disallowed ones. [lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mpolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that it has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-17 06:31:49 +08:00
/* Protection of (de-)allocation: mm, files, fs, tty, keyrings, mems_allowed,
* mempolicy */
spinlock_t alloc_lock;
/* Protection of the PI data structures: */
raw_spinlock_t pi_lock;
#ifdef CONFIG_RT_MUTEXES
/* PI waiters blocked on a rt_mutex held by this task */
struct plist_head pi_waiters;
/* Deadlock detection and priority inheritance handling */
struct rt_mutex_waiter *pi_blocked_on;
#endif
#ifdef CONFIG_DEBUG_MUTEXES
/* mutex deadlock detection */
struct mutex_waiter *blocked_on;
#endif
#ifdef CONFIG_TRACE_IRQFLAGS
unsigned int irq_events;
unsigned long hardirq_enable_ip;
unsigned long hardirq_disable_ip;
unsigned int hardirq_enable_event;
unsigned int hardirq_disable_event;
int hardirqs_enabled;
int hardirq_context;
unsigned long softirq_disable_ip;
unsigned long softirq_enable_ip;
unsigned int softirq_disable_event;
unsigned int softirq_enable_event;
int softirqs_enabled;
int softirq_context;
#endif
[PATCH] lockdep: core Do 'make oldconfig' and accept all the defaults for new config options - reboot into the kernel and if everything goes well it should boot up fine and you should have /proc/lockdep and /proc/lockdep_stats files. Typically if the lock validator finds some problem it will print out voluminous debug output that begins with "BUG: ..." and which syslog output can be used by kernel developers to figure out the precise locking scenario. What does the lock validator do? It "observes" and maps all locking rules as they occur dynamically (as triggered by the kernel's natural use of spinlocks, rwlocks, mutexes and rwsems). Whenever the lock validator subsystem detects a new locking scenario, it validates this new rule against the existing set of rules. If this new rule is consistent with the existing set of rules then the new rule is added transparently and the kernel continues as normal. If the new rule could create a deadlock scenario then this condition is printed out. When determining validity of locking, all possible "deadlock scenarios" are considered: assuming arbitrary number of CPUs, arbitrary irq context and task context constellations, running arbitrary combinations of all the existing locking scenarios. In a typical system this means millions of separate scenarios. This is why we call it a "locking correctness" validator - for all rules that are observed the lock validator proves it with mathematical certainty that a deadlock could not occur (assuming that the lock validator implementation itself is correct and its internal data structures are not corrupted by some other kernel subsystem). [see more details and conditionals of this statement in include/linux/lockdep.h and Documentation/lockdep-design.txt] Furthermore, this "all possible scenarios" property of the validator also enables the finding of complex, highly unlikely multi-CPU multi-context races via single single-context rules, increasing the likelyhood of finding bugs drastically. In practical terms: the lock validator already found a bug in the upstream kernel that could only occur on systems with 3 or more CPUs, and which needed 3 very unlikely code sequences to occur at once on the 3 CPUs. That bug was found and reported on a single-CPU system (!). So in essence a race will be found "piecemail-wise", triggering all the necessary components for the race, without having to reproduce the race scenario itself! In its short existence the lock validator found and reported many bugs before they actually caused a real deadlock. To further increase the efficiency of the validator, the mapping is not per "lock instance", but per "lock-class". For example, all struct inode objects in the kernel have inode->inotify_mutex. If there are 10,000 inodes cached, then there are 10,000 lock objects. But ->inotify_mutex is a single "lock type", and all locking activities that occur against ->inotify_mutex are "unified" into this single lock-class. The advantage of the lock-class approach is that all historical ->inotify_mutex uses are mapped into a single (and as narrow as possible) set of locking rules - regardless of how many different tasks or inode structures it took to build this set of rules. The set of rules persist during the lifetime of the kernel. To see the rough magnitude of checking that the lock validator does, here's a portion of /proc/lockdep_stats, fresh after bootup: lock-classes: 694 [max: 2048] direct dependencies: 1598 [max: 8192] indirect dependencies: 17896 all direct dependencies: 16206 dependency chains: 1910 [max: 8192] in-hardirq chains: 17 in-softirq chains: 105 in-process chains: 1065 stack-trace entries: 38761 [max: 131072] combined max dependencies: 2033928 hardirq-safe locks: 24 hardirq-unsafe locks: 176 softirq-safe locks: 53 softirq-unsafe locks: 137 irq-safe locks: 59 irq-unsafe locks: 176 The lock validator has observed 1598 actual single-thread locking patterns, and has validated all possible 2033928 distinct locking scenarios. More details about the design of the lock validator can be found in Documentation/lockdep-design.txt, which can also found at: http://redhat.com/~mingo/lockdep-patches/lockdep-design.txt [bunk@stusta.de: cleanups] Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Arjan van de Ven <arjan@linux.intel.com> Signed-off-by: Adrian Bunk <bunk@stusta.de> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-07-03 15:24:50 +08:00
#ifdef CONFIG_LOCKDEP
# define MAX_LOCK_DEPTH 48UL
[PATCH] lockdep: core Do 'make oldconfig' and accept all the defaults for new config options - reboot into the kernel and if everything goes well it should boot up fine and you should have /proc/lockdep and /proc/lockdep_stats files. Typically if the lock validator finds some problem it will print out voluminous debug output that begins with "BUG: ..." and which syslog output can be used by kernel developers to figure out the precise locking scenario. What does the lock validator do? It "observes" and maps all locking rules as they occur dynamically (as triggered by the kernel's natural use of spinlocks, rwlocks, mutexes and rwsems). Whenever the lock validator subsystem detects a new locking scenario, it validates this new rule against the existing set of rules. If this new rule is consistent with the existing set of rules then the new rule is added transparently and the kernel continues as normal. If the new rule could create a deadlock scenario then this condition is printed out. When determining validity of locking, all possible "deadlock scenarios" are considered: assuming arbitrary number of CPUs, arbitrary irq context and task context constellations, running arbitrary combinations of all the existing locking scenarios. In a typical system this means millions of separate scenarios. This is why we call it a "locking correctness" validator - for all rules that are observed the lock validator proves it with mathematical certainty that a deadlock could not occur (assuming that the lock validator implementation itself is correct and its internal data structures are not corrupted by some other kernel subsystem). [see more details and conditionals of this statement in include/linux/lockdep.h and Documentation/lockdep-design.txt] Furthermore, this "all possible scenarios" property of the validator also enables the finding of complex, highly unlikely multi-CPU multi-context races via single single-context rules, increasing the likelyhood of finding bugs drastically. In practical terms: the lock validator already found a bug in the upstream kernel that could only occur on systems with 3 or more CPUs, and which needed 3 very unlikely code sequences to occur at once on the 3 CPUs. That bug was found and reported on a single-CPU system (!). So in essence a race will be found "piecemail-wise", triggering all the necessary components for the race, without having to reproduce the race scenario itself! In its short existence the lock validator found and reported many bugs before they actually caused a real deadlock. To further increase the efficiency of the validator, the mapping is not per "lock instance", but per "lock-class". For example, all struct inode objects in the kernel have inode->inotify_mutex. If there are 10,000 inodes cached, then there are 10,000 lock objects. But ->inotify_mutex is a single "lock type", and all locking activities that occur against ->inotify_mutex are "unified" into this single lock-class. The advantage of the lock-class approach is that all historical ->inotify_mutex uses are mapped into a single (and as narrow as possible) set of locking rules - regardless of how many different tasks or inode structures it took to build this set of rules. The set of rules persist during the lifetime of the kernel. To see the rough magnitude of checking that the lock validator does, here's a portion of /proc/lockdep_stats, fresh after bootup: lock-classes: 694 [max: 2048] direct dependencies: 1598 [max: 8192] indirect dependencies: 17896 all direct dependencies: 16206 dependency chains: 1910 [max: 8192] in-hardirq chains: 17 in-softirq chains: 105 in-process chains: 1065 stack-trace entries: 38761 [max: 131072] combined max dependencies: 2033928 hardirq-safe locks: 24 hardirq-unsafe locks: 176 softirq-safe locks: 53 softirq-unsafe locks: 137 irq-safe locks: 59 irq-unsafe locks: 176 The lock validator has observed 1598 actual single-thread locking patterns, and has validated all possible 2033928 distinct locking scenarios. More details about the design of the lock validator can be found in Documentation/lockdep-design.txt, which can also found at: http://redhat.com/~mingo/lockdep-patches/lockdep-design.txt [bunk@stusta.de: cleanups] Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Arjan van de Ven <arjan@linux.intel.com> Signed-off-by: Adrian Bunk <bunk@stusta.de> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-07-03 15:24:50 +08:00
u64 curr_chain_key;
int lockdep_depth;
unsigned int lockdep_recursion;
struct held_lock held_locks[MAX_LOCK_DEPTH];
lockdep: annotate reclaim context (__GFP_NOFS) Here is another version, with the incremental patch rolled up, and added reclaim context annotation to kswapd, and allocation tracing to slab allocators (which may only ever reach the page allocator in rare cases, so it is good to put annotations here too). Haven't tested this version as such, but it should be getting closer to merge worthy ;) -- After noticing some code in mm/filemap.c accidentally perform a __GFP_FS allocation when it should not have been, I thought it might be a good idea to try to catch this kind of thing with lockdep. I coded up a little idea that seems to work. Unfortunately the system has to actually be in __GFP_FS page reclaim, then take the lock, before it will mark it. But at least that might still be some orders of magnitude more common (and more debuggable) than an actual deadlock condition, so we have some improvement I hope (the concept is no less complete than discovery of a lock's interrupt contexts). I guess we could even do the same thing with __GFP_IO (normal reclaim), and even GFP_NOIO locks too... but filesystems will have the most locks and fiddly code paths, so let's start there and see how it goes. It *seems* to work. I did a quick test. ================================= [ INFO: inconsistent lock state ] 2.6.28-rc6-00007-ged31348-dirty #26 --------------------------------- inconsistent {in-reclaim-W} -> {ov-reclaim-W} usage. modprobe/8526 [HC0[0]:SC0[0]:HE1:SE1] takes: (testlock){--..}, at: [<ffffffffa0020055>] brd_init+0x55/0x216 [brd] {in-reclaim-W} state was registered at: [<ffffffff80267bdb>] __lock_acquire+0x75b/0x1a60 [<ffffffff80268f71>] lock_acquire+0x91/0xc0 [<ffffffff8070f0e1>] mutex_lock_nested+0xb1/0x310 [<ffffffffa002002b>] brd_init+0x2b/0x216 [brd] [<ffffffff8020903b>] _stext+0x3b/0x170 [<ffffffff80272ebf>] sys_init_module+0xaf/0x1e0 [<ffffffff8020c3fb>] system_call_fastpath+0x16/0x1b [<ffffffffffffffff>] 0xffffffffffffffff irq event stamp: 3929 hardirqs last enabled at (3929): [<ffffffff8070f2b5>] mutex_lock_nested+0x285/0x310 hardirqs last disabled at (3928): [<ffffffff8070f089>] mutex_lock_nested+0x59/0x310 softirqs last enabled at (3732): [<ffffffff8061f623>] sk_filter+0x83/0xe0 softirqs last disabled at (3730): [<ffffffff8061f5b6>] sk_filter+0x16/0xe0 other info that might help us debug this: 1 lock held by modprobe/8526: #0: (testlock){--..}, at: [<ffffffffa0020055>] brd_init+0x55/0x216 [brd] stack backtrace: Pid: 8526, comm: modprobe Not tainted 2.6.28-rc6-00007-ged31348-dirty #26 Call Trace: [<ffffffff80265483>] print_usage_bug+0x193/0x1d0 [<ffffffff80266530>] mark_lock+0xaf0/0xca0 [<ffffffff80266735>] mark_held_locks+0x55/0xc0 [<ffffffffa0020000>] ? brd_init+0x0/0x216 [brd] [<ffffffff802667ca>] trace_reclaim_fs+0x2a/0x60 [<ffffffff80285005>] __alloc_pages_internal+0x475/0x580 [<ffffffff8070f29e>] ? mutex_lock_nested+0x26e/0x310 [<ffffffffa0020000>] ? brd_init+0x0/0x216 [brd] [<ffffffffa002006a>] brd_init+0x6a/0x216 [brd] [<ffffffffa0020000>] ? brd_init+0x0/0x216 [brd] [<ffffffff8020903b>] _stext+0x3b/0x170 [<ffffffff8070f8b9>] ? mutex_unlock+0x9/0x10 [<ffffffff8070f83d>] ? __mutex_unlock_slowpath+0x10d/0x180 [<ffffffff802669ec>] ? trace_hardirqs_on_caller+0x12c/0x190 [<ffffffff80272ebf>] sys_init_module+0xaf/0x1e0 [<ffffffff8020c3fb>] system_call_fastpath+0x16/0x1b Signed-off-by: Nick Piggin <npiggin@suse.de> Signed-off-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-01-21 15:12:39 +08:00
gfp_t lockdep_reclaim_gfp;
[PATCH] lockdep: core Do 'make oldconfig' and accept all the defaults for new config options - reboot into the kernel and if everything goes well it should boot up fine and you should have /proc/lockdep and /proc/lockdep_stats files. Typically if the lock validator finds some problem it will print out voluminous debug output that begins with "BUG: ..." and which syslog output can be used by kernel developers to figure out the precise locking scenario. What does the lock validator do? It "observes" and maps all locking rules as they occur dynamically (as triggered by the kernel's natural use of spinlocks, rwlocks, mutexes and rwsems). Whenever the lock validator subsystem detects a new locking scenario, it validates this new rule against the existing set of rules. If this new rule is consistent with the existing set of rules then the new rule is added transparently and the kernel continues as normal. If the new rule could create a deadlock scenario then this condition is printed out. When determining validity of locking, all possible "deadlock scenarios" are considered: assuming arbitrary number of CPUs, arbitrary irq context and task context constellations, running arbitrary combinations of all the existing locking scenarios. In a typical system this means millions of separate scenarios. This is why we call it a "locking correctness" validator - for all rules that are observed the lock validator proves it with mathematical certainty that a deadlock could not occur (assuming that the lock validator implementation itself is correct and its internal data structures are not corrupted by some other kernel subsystem). [see more details and conditionals of this statement in include/linux/lockdep.h and Documentation/lockdep-design.txt] Furthermore, this "all possible scenarios" property of the validator also enables the finding of complex, highly unlikely multi-CPU multi-context races via single single-context rules, increasing the likelyhood of finding bugs drastically. In practical terms: the lock validator already found a bug in the upstream kernel that could only occur on systems with 3 or more CPUs, and which needed 3 very unlikely code sequences to occur at once on the 3 CPUs. That bug was found and reported on a single-CPU system (!). So in essence a race will be found "piecemail-wise", triggering all the necessary components for the race, without having to reproduce the race scenario itself! In its short existence the lock validator found and reported many bugs before they actually caused a real deadlock. To further increase the efficiency of the validator, the mapping is not per "lock instance", but per "lock-class". For example, all struct inode objects in the kernel have inode->inotify_mutex. If there are 10,000 inodes cached, then there are 10,000 lock objects. But ->inotify_mutex is a single "lock type", and all locking activities that occur against ->inotify_mutex are "unified" into this single lock-class. The advantage of the lock-class approach is that all historical ->inotify_mutex uses are mapped into a single (and as narrow as possible) set of locking rules - regardless of how many different tasks or inode structures it took to build this set of rules. The set of rules persist during the lifetime of the kernel. To see the rough magnitude of checking that the lock validator does, here's a portion of /proc/lockdep_stats, fresh after bootup: lock-classes: 694 [max: 2048] direct dependencies: 1598 [max: 8192] indirect dependencies: 17896 all direct dependencies: 16206 dependency chains: 1910 [max: 8192] in-hardirq chains: 17 in-softirq chains: 105 in-process chains: 1065 stack-trace entries: 38761 [max: 131072] combined max dependencies: 2033928 hardirq-safe locks: 24 hardirq-unsafe locks: 176 softirq-safe locks: 53 softirq-unsafe locks: 137 irq-safe locks: 59 irq-unsafe locks: 176 The lock validator has observed 1598 actual single-thread locking patterns, and has validated all possible 2033928 distinct locking scenarios. More details about the design of the lock validator can be found in Documentation/lockdep-design.txt, which can also found at: http://redhat.com/~mingo/lockdep-patches/lockdep-design.txt [bunk@stusta.de: cleanups] Signed-off-by: Ingo Molnar <mingo@elte.hu> Signed-off-by: Arjan van de Ven <arjan@linux.intel.com> Signed-off-by: Adrian Bunk <bunk@stusta.de> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-07-03 15:24:50 +08:00
#endif
/* journalling filesystem info */
void *journal_info;
When stacked block devices are in-use (e.g. md or dm), the recursive calls to generic_make_request can use up a lot of space, and we would rather they didn't. As generic_make_request is a void function, and as it is generally not expected that it will have any effect immediately, it is safe to delay any call to generic_make_request until there is sufficient stack space available. As ->bi_next is reserved for the driver to use, it can have no valid value when generic_make_request is called, and as __make_request implicitly assumes it will be NULL (ELEVATOR_BACK_MERGE fork of switch) we can be certain that all callers set it to NULL. We can therefore safely use bi_next to link pending requests together, providing we clear it before making the real call. So, we choose to allow each thread to only be active in one generic_make_request at a time. If a subsequent (recursive) call is made, the bio is linked into a per-thread list, and is handled when the active call completes. As the list of pending bios is per-thread, there are no locking issues to worry about. I say above that it is "safe to delay any call...". There are, however, some behaviours of a make_request_fn which would make it unsafe. These include any behaviour that assumes anything will have changed after a recursive call to generic_make_request. These could include: - waiting for that call to finish and call it's bi_end_io function. md use to sometimes do this (marking the superblock dirty before completing a write) but doesn't any more - inspecting the bio for fields that generic_make_request might change, such as bi_sector or bi_bdev. It is hard to see a good reason for this, and I don't think anyone actually does it. - inspecing the queue to see if, e.g. it is 'full' yet. Again, I think this is very unlikely to be useful, or to be done. Signed-off-by: Neil Brown <neilb@suse.de> Cc: Jens Axboe <axboe@kernel.dk> Cc: <dm-devel@redhat.com> Alasdair G Kergon <agk@redhat.com> said: I can see nothing wrong with this in principle. For device-mapper at the moment though it's essential that, while the bio mappings may now get delayed, they still get processed in exactly the same order as they were passed to generic_make_request(). My main concern is whether the timing changes implicit in this patch will make the rare data-corrupting races in the existing snapshot code more likely. (I'm working on a fix for these races, but the unfinished patch is already several hundred lines long.) It would be helpful if some people on this mailing list would test this patch in various scenarios and report back. Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
2007-05-01 15:53:42 +08:00
/* stacked block device info */
struct bio_list *bio_list;
When stacked block devices are in-use (e.g. md or dm), the recursive calls to generic_make_request can use up a lot of space, and we would rather they didn't. As generic_make_request is a void function, and as it is generally not expected that it will have any effect immediately, it is safe to delay any call to generic_make_request until there is sufficient stack space available. As ->bi_next is reserved for the driver to use, it can have no valid value when generic_make_request is called, and as __make_request implicitly assumes it will be NULL (ELEVATOR_BACK_MERGE fork of switch) we can be certain that all callers set it to NULL. We can therefore safely use bi_next to link pending requests together, providing we clear it before making the real call. So, we choose to allow each thread to only be active in one generic_make_request at a time. If a subsequent (recursive) call is made, the bio is linked into a per-thread list, and is handled when the active call completes. As the list of pending bios is per-thread, there are no locking issues to worry about. I say above that it is "safe to delay any call...". There are, however, some behaviours of a make_request_fn which would make it unsafe. These include any behaviour that assumes anything will have changed after a recursive call to generic_make_request. These could include: - waiting for that call to finish and call it's bi_end_io function. md use to sometimes do this (marking the superblock dirty before completing a write) but doesn't any more - inspecting the bio for fields that generic_make_request might change, such as bi_sector or bi_bdev. It is hard to see a good reason for this, and I don't think anyone actually does it. - inspecing the queue to see if, e.g. it is 'full' yet. Again, I think this is very unlikely to be useful, or to be done. Signed-off-by: Neil Brown <neilb@suse.de> Cc: Jens Axboe <axboe@kernel.dk> Cc: <dm-devel@redhat.com> Alasdair G Kergon <agk@redhat.com> said: I can see nothing wrong with this in principle. For device-mapper at the moment though it's essential that, while the bio mappings may now get delayed, they still get processed in exactly the same order as they were passed to generic_make_request(). My main concern is whether the timing changes implicit in this patch will make the rare data-corrupting races in the existing snapshot code more likely. (I'm working on a fix for these races, but the unfinished patch is already several hundred lines long.) It would be helpful if some people on this mailing list would test this patch in various scenarios and report back. Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Jens Axboe <jens.axboe@oracle.com>
2007-05-01 15:53:42 +08:00
#ifdef CONFIG_BLOCK
/* stack plugging */
struct blk_plug *plug;
#endif
/* VM state */
struct reclaim_state *reclaim_state;
struct backing_dev_info *backing_dev_info;
struct io_context *io_context;
unsigned long ptrace_message;
siginfo_t *last_siginfo; /* For ptrace use. */
[PATCH] io-accounting: core statistics The present per-task IO accounting isn't very useful. It simply counts the number of bytes passed into read() and write(). So if a process reads 1MB from an already-cached file, it is accused of having performed 1MB of I/O, which is wrong. (David Wright had some comments on the applicability of the present logical IO accounting: For billing purposes it is useless but for workload analysis it is very useful read_bytes/read_calls average read request size write_bytes/write_calls average write request size read_bytes/read_blocks ie logical/physical can indicate hit rate or thrashing write_bytes/write_blocks ie logical/physical guess since pdflush writes can be missed I often look for logical larger than physical to see filesystem cache problems. And the bytes/cpusec can help find applications that are dominating the cache and causing slow interactive response from page cache contention. I want to find the IO intensive applications and make sure they are doing efficient IO. Thus the acctcms(sysV) or csacms command would give the high IO commands). This patchset adds new accounting which tries to be more accurate. We account for three things: reads: attempt to count the number of bytes which this process really did cause to be fetched from the storage layer. Done at the submit_bio() level, so it is accurate for block-backed filesystems. I also attempt to wire up NFS and CIFS. writes: attempt to count the number of bytes which this process caused to be sent to the storage layer. This is done at page-dirtying time. The big inaccuracy here is truncate. If a process writes 1MB to a file and then deletes the file, it will in fact perform no writeout. But it will have been accounted as having caused 1MB of write. So... cancelled_writes: account the number of bytes which this process caused to not happen, by truncating pagecache. We _could_ just subtract this from the process's `write' accounting. But that means that some processes would be reported to have done negative amounts of write IO, which is silly. So we just report the raw number and punt this decision up to userspace. Now, we _could_ account for writes at the physical I/O level. But - This would require that we track memory-dirtying tasks at the per-page level (would require a new pointer in struct page). - It would mean that IO statistics for a process are usually only available long after that process has exitted. Which means that we probably cannot communicate this info via taskstats. This patch: Wire up the kernel-private data structures and the accessor functions to manipulate them. Cc: Jay Lan <jlan@sgi.com> Cc: Shailabh Nagar <nagar@watson.ibm.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Chris Sturtivant <csturtiv@sgi.com> Cc: Tony Ernst <tee@sgi.com> Cc: Guillaume Thouvenin <guillaume.thouvenin@bull.net> Cc: David Wright <daw@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-12-10 18:19:19 +08:00
struct task_io_accounting ioac;
#if defined(CONFIG_TASK_XACCT)
u64 acct_rss_mem1; /* accumulated rss usage */
u64 acct_vm_mem1; /* accumulated virtual memory usage */
cputime_t acct_timexpd; /* stime + utime since last update */
#endif
#ifdef CONFIG_CPUSETS
cpuset,mm: update tasks' mems_allowed in time Fix allocating page cache/slab object on the unallowed node when memory spread is set by updating tasks' mems_allowed after its cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the code of memory policy. Because the memory policy is applied in the process's context originally. After applying this patch, one task directly manipulates anothers mems_allowed, and we use alloc_lock in the task_struct to protect mems_allowed and memory policy of the task. But in the fast path, we didn't use lock to protect them, because adding a lock may lead to performance regression. But if we don't add a lock,the task might see no nodes when changing cpuset's mems_allowed to some non-overlapping set. In order to avoid it, we set all new allowed nodes, then clear newly disallowed ones. [lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mpolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that it has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-17 06:31:49 +08:00
nodemask_t mems_allowed; /* Protected by alloc_lock */
cpuset: mm: reduce large amounts of memory barrier related damage v3 Commit c0ff7453bb5c ("cpuset,mm: fix no node to alloc memory when changing cpuset's mems") wins a super prize for the largest number of memory barriers entered into fast paths for one commit. [get|put]_mems_allowed is incredibly heavy with pairs of full memory barriers inserted into a number of hot paths. This was detected while investigating at large page allocator slowdown introduced some time after 2.6.32. The largest portion of this overhead was shown by oprofile to be at an mfence introduced by this commit into the page allocator hot path. For extra style points, the commit introduced the use of yield() in an implementation of what looks like a spinning mutex. This patch replaces the full memory barriers on both read and write sides with a sequence counter with just read barriers on the fast path side. This is much cheaper on some architectures, including x86. The main bulk of the patch is the retry logic if the nodemask changes in a manner that can cause a false failure. While updating the nodemask, a check is made to see if a false failure is a risk. If it is, the sequence number gets bumped and parallel allocators will briefly stall while the nodemask update takes place. In a page fault test microbenchmark, oprofile samples from __alloc_pages_nodemask went from 4.53% of all samples to 1.15%. The actual results were 3.3.0-rc3 3.3.0-rc3 rc3-vanilla nobarrier-v2r1 Clients 1 UserTime 0.07 ( 0.00%) 0.08 (-14.19%) Clients 2 UserTime 0.07 ( 0.00%) 0.07 ( 2.72%) Clients 4 UserTime 0.08 ( 0.00%) 0.07 ( 3.29%) Clients 1 SysTime 0.70 ( 0.00%) 0.65 ( 6.65%) Clients 2 SysTime 0.85 ( 0.00%) 0.82 ( 3.65%) Clients 4 SysTime 1.41 ( 0.00%) 1.41 ( 0.32%) Clients 1 WallTime 0.77 ( 0.00%) 0.74 ( 4.19%) Clients 2 WallTime 0.47 ( 0.00%) 0.45 ( 3.73%) Clients 4 WallTime 0.38 ( 0.00%) 0.37 ( 1.58%) Clients 1 Flt/sec/cpu 497620.28 ( 0.00%) 520294.53 ( 4.56%) Clients 2 Flt/sec/cpu 414639.05 ( 0.00%) 429882.01 ( 3.68%) Clients 4 Flt/sec/cpu 257959.16 ( 0.00%) 258761.48 ( 0.31%) Clients 1 Flt/sec 495161.39 ( 0.00%) 517292.87 ( 4.47%) Clients 2 Flt/sec 820325.95 ( 0.00%) 850289.77 ( 3.65%) Clients 4 Flt/sec 1020068.93 ( 0.00%) 1022674.06 ( 0.26%) MMTests Statistics: duration Sys Time Running Test (seconds) 135.68 132.17 User+Sys Time Running Test (seconds) 164.2 160.13 Total Elapsed Time (seconds) 123.46 120.87 The overall improvement is small but the System CPU time is much improved and roughly in correlation to what oprofile reported (these performance figures are without profiling so skew is expected). The actual number of page faults is noticeably improved. For benchmarks like kernel builds, the overall benefit is marginal but the system CPU time is slightly reduced. To test the actual bug the commit fixed I opened two terminals. The first ran within a cpuset and continually ran a small program that faulted 100M of anonymous data. In a second window, the nodemask of the cpuset was continually randomised in a loop. Without the commit, the program would fail every so often (usually within 10 seconds) and obviously with the commit everything worked fine. With this patch applied, it also worked fine so the fix should be functionally equivalent. Signed-off-by: Mel Gorman <mgorman@suse.de> Cc: Miao Xie <miaox@cn.fujitsu.com> Cc: David Rientjes <rientjes@google.com> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2012-03-22 07:34:11 +08:00
seqcount_t mems_allowed_seq; /* Seqence no to catch updates */
[PATCH] cpuset memory spread basic implementation This patch provides the implementation and cpuset interface for an alternative memory allocation policy that can be applied to certain kinds of memory allocations, such as the page cache (file system buffers) and some slab caches (such as inode caches). The policy is called "memory spreading." If enabled, it spreads out these kinds of memory allocations over all the nodes allowed to a task, instead of preferring to place them on the node where the task is executing. All other kinds of allocations, including anonymous pages for a tasks stack and data regions, are not affected by this policy choice, and continue to be allocated preferring the node local to execution, as modified by the NUMA mempolicy. There are two boolean flag files per cpuset that control where the kernel allocates pages for the file system buffers and related in kernel data structures. They are called 'memory_spread_page' and 'memory_spread_slab'. If the per-cpuset boolean flag file 'memory_spread_page' is set, then the kernel will spread the file system buffers (page cache) evenly over all the nodes that the faulting task is allowed to use, instead of preferring to put those pages on the node where the task is running. If the per-cpuset boolean flag file 'memory_spread_slab' is set, then the kernel will spread some file system related slab caches, such as for inodes and dentries evenly over all the nodes that the faulting task is allowed to use, instead of preferring to put those pages on the node where the task is running. The implementation is simple. Setting the cpuset flags 'memory_spread_page' or 'memory_spread_cache' turns on the per-process flags PF_SPREAD_PAGE or PF_SPREAD_SLAB, respectively, for each task that is in the cpuset or subsequently joins that cpuset. In subsequent patches, the page allocation calls for the affected page cache and slab caches are modified to perform an inline check for these flags, and if set, a call to a new routine cpuset_mem_spread_node() returns the node to prefer for the allocation. The cpuset_mem_spread_node() routine is also simple. It uses the value of a per-task rotor cpuset_mem_spread_rotor to select the next node in the current tasks mems_allowed to prefer for the allocation. This policy can provide substantial improvements for jobs that need to place thread local data on the corresponding node, but that need to access large file system data sets that need to be spread across the several nodes in the jobs cpuset in order to fit. Without this patch, especially for jobs that might have one thread reading in the data set, the memory allocation across the nodes in the jobs cpuset can become very uneven. A couple of Copyright year ranges are updated as well. And a couple of email addresses that can be found in the MAINTAINERS file are removed. Signed-off-by: Paul Jackson <pj@sgi.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-24 19:16:03 +08:00
int cpuset_mem_spread_rotor;
cpusets: new round-robin rotor for SLAB allocations We have observed several workloads running on multi-node systems where memory is assigned unevenly across the nodes in the system. There are numerous reasons for this but one is the round-robin rotor in cpuset_mem_spread_node(). For example, a simple test that writes a multi-page file will allocate pages on nodes 0 2 4 6 ... Odd nodes are skipped. (Sometimes it allocates on odd nodes & skips even nodes). An example is shown below. The program "lfile" writes a file consisting of 10 pages. The program then mmaps the file & uses get_mempolicy(..., MPOL_F_NODE) to determine the nodes where the file pages were allocated. The output is shown below: # ./lfile allocated on nodes: 2 4 6 0 1 2 6 0 2 There is a single rotor that is used for allocating both file pages & slab pages. Writing the file allocates both a data page & a slab page (buffer_head). This advances the RR rotor 2 nodes for each page allocated. A quick confirmation seems to confirm this is the cause of the uneven allocation: # echo 0 >/dev/cpuset/memory_spread_slab # ./lfile allocated on nodes: 6 7 8 9 0 1 2 3 4 5 This patch introduces a second rotor that is used for slab allocations. Signed-off-by: Jack Steiner <steiner@sgi.com> Acked-by: Christoph Lameter <cl@linux-foundation.org> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: Paul Menage <menage@google.com> Cc: Jack Steiner <steiner@sgi.com> Cc: Robin Holt <holt@sgi.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-05-27 05:42:49 +08:00
int cpuset_slab_spread_rotor;
#endif
Task Control Groups: basic task cgroup framework Generic Process Control Groups -------------------------- There have recently been various proposals floating around for resource management/accounting and other task grouping subsystems in the kernel, including ResGroups, User BeanCounters, NSProxy cgroups, and others. These all need the basic abstraction of being able to group together multiple processes in an aggregate, in order to track/limit the resources permitted to those processes, or control other behaviour of the processes, and all implement this grouping in different ways. This patchset provides a framework for tracking and grouping processes into arbitrary "cgroups" and assigning arbitrary state to those groupings, in order to control the behaviour of the cgroup as an aggregate. The intention is that the various resource management and virtualization/cgroup efforts can also become task cgroup clients, with the result that: - the userspace APIs are (somewhat) normalised - it's easier to test e.g. the ResGroups CPU controller in conjunction with the BeanCounters memory controller, or use either of them as the resource-control portion of a virtual server system. - the additional kernel footprint of any of the competing resource management systems is substantially reduced, since it doesn't need to provide process grouping/containment, hence improving their chances of getting into the kernel This patch: Add the main task cgroups framework - the cgroup filesystem, and the basic structures for tracking membership and associating subsystem state objects to tasks. Signed-off-by: Paul Menage <menage@google.com> Cc: Serge E. Hallyn <serue@us.ibm.com> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Cc: Dave Hansen <haveblue@us.ibm.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Paul Jackson <pj@sgi.com> Cc: Kirill Korotaev <dev@openvz.org> Cc: Herbert Poetzl <herbert@13thfloor.at> Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com> Cc: Cedric Le Goater <clg@fr.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-19 14:39:30 +08:00
#ifdef CONFIG_CGROUPS
Task Control Groups: shared cgroup subsystem group arrays Replace the struct css_set embedded in task_struct with a pointer; all tasks that have the same set of memberships across all hierarchies will share a css_set object, and will be linked via their css_sets field to the "tasks" list_head in the css_set. Assuming that many tasks share the same cgroup assignments, this reduces overall space usage and keeps the size of the task_struct down (three pointers added to task_struct compared to a non-cgroups kernel, no matter how many subsystems are registered). [akpm@linux-foundation.org: fix a printk] [akpm@linux-foundation.org: build fix] Signed-off-by: Paul Menage <menage@google.com> Cc: Serge E. Hallyn <serue@us.ibm.com> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Cc: Dave Hansen <haveblue@us.ibm.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Paul Jackson <pj@sgi.com> Cc: Kirill Korotaev <dev@openvz.org> Cc: Herbert Poetzl <herbert@13thfloor.at> Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com> Cc: Cedric Le Goater <clg@fr.ibm.com> Cc: Serge E. Hallyn <serue@us.ibm.com> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Cc: Dave Hansen <haveblue@us.ibm.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Paul Jackson <pj@sgi.com> Cc: Kirill Korotaev <dev@openvz.org> Cc: Herbert Poetzl <herbert@13thfloor.at> Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com> Cc: Cedric Le Goater <clg@fr.ibm.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-19 14:39:36 +08:00
/* Control Group info protected by css_set_lock */
struct css_set __rcu *cgroups;
Task Control Groups: shared cgroup subsystem group arrays Replace the struct css_set embedded in task_struct with a pointer; all tasks that have the same set of memberships across all hierarchies will share a css_set object, and will be linked via their css_sets field to the "tasks" list_head in the css_set. Assuming that many tasks share the same cgroup assignments, this reduces overall space usage and keeps the size of the task_struct down (three pointers added to task_struct compared to a non-cgroups kernel, no matter how many subsystems are registered). [akpm@linux-foundation.org: fix a printk] [akpm@linux-foundation.org: build fix] Signed-off-by: Paul Menage <menage@google.com> Cc: Serge E. Hallyn <serue@us.ibm.com> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Cc: Dave Hansen <haveblue@us.ibm.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Paul Jackson <pj@sgi.com> Cc: Kirill Korotaev <dev@openvz.org> Cc: Herbert Poetzl <herbert@13thfloor.at> Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com> Cc: Cedric Le Goater <clg@fr.ibm.com> Cc: Serge E. Hallyn <serue@us.ibm.com> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Cc: Dave Hansen <haveblue@us.ibm.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Paul Jackson <pj@sgi.com> Cc: Kirill Korotaev <dev@openvz.org> Cc: Herbert Poetzl <herbert@13thfloor.at> Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com> Cc: Cedric Le Goater <clg@fr.ibm.com> Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-19 14:39:36 +08:00
/* cg_list protected by css_set_lock and tsk->alloc_lock */
struct list_head cg_list;
Task Control Groups: basic task cgroup framework Generic Process Control Groups -------------------------- There have recently been various proposals floating around for resource management/accounting and other task grouping subsystems in the kernel, including ResGroups, User BeanCounters, NSProxy cgroups, and others. These all need the basic abstraction of being able to group together multiple processes in an aggregate, in order to track/limit the resources permitted to those processes, or control other behaviour of the processes, and all implement this grouping in different ways. This patchset provides a framework for tracking and grouping processes into arbitrary "cgroups" and assigning arbitrary state to those groupings, in order to control the behaviour of the cgroup as an aggregate. The intention is that the various resource management and virtualization/cgroup efforts can also become task cgroup clients, with the result that: - the userspace APIs are (somewhat) normalised - it's easier to test e.g. the ResGroups CPU controller in conjunction with the BeanCounters memory controller, or use either of them as the resource-control portion of a virtual server system. - the additional kernel footprint of any of the competing resource management systems is substantially reduced, since it doesn't need to provide process grouping/containment, hence improving their chances of getting into the kernel This patch: Add the main task cgroups framework - the cgroup filesystem, and the basic structures for tracking membership and associating subsystem state objects to tasks. Signed-off-by: Paul Menage <menage@google.com> Cc: Serge E. Hallyn <serue@us.ibm.com> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Cc: Dave Hansen <haveblue@us.ibm.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Paul Jackson <pj@sgi.com> Cc: Kirill Korotaev <dev@openvz.org> Cc: Herbert Poetzl <herbert@13thfloor.at> Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com> Cc: Cedric Le Goater <clg@fr.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-19 14:39:30 +08:00
#endif
#ifdef CONFIG_FUTEX
struct robust_list_head __user *robust_list;
#ifdef CONFIG_COMPAT
struct compat_robust_list_head __user *compat_robust_list;
#endif
struct list_head pi_state_list;
struct futex_pi_state *pi_state_cache;
#endif
perf: Do the big rename: Performance Counters -> Performance Events Bye-bye Performance Counters, welcome Performance Events! In the past few months the perfcounters subsystem has grown out its initial role of counting hardware events, and has become (and is becoming) a much broader generic event enumeration, reporting, logging, monitoring, analysis facility. Naming its core object 'perf_counter' and naming the subsystem 'perfcounters' has become more and more of a misnomer. With pending code like hw-breakpoints support the 'counter' name is less and less appropriate. All in one, we've decided to rename the subsystem to 'performance events' and to propagate this rename through all fields, variables and API names. (in an ABI compatible fashion) The word 'event' is also a bit shorter than 'counter' - which makes it slightly more convenient to write/handle as well. Thanks goes to Stephane Eranian who first observed this misnomer and suggested a rename. User-space tooling and ABI compatibility is not affected - this patch should be function-invariant. (Also, defconfigs were not touched to keep the size down.) This patch has been generated via the following script: FILES=$(find * -type f | grep -vE 'oprofile|[^K]config') sed -i \ -e 's/PERF_EVENT_/PERF_RECORD_/g' \ -e 's/PERF_COUNTER/PERF_EVENT/g' \ -e 's/perf_counter/perf_event/g' \ -e 's/nb_counters/nb_events/g' \ -e 's/swcounter/swevent/g' \ -e 's/tpcounter_event/tp_event/g' \ $FILES for N in $(find . -name perf_counter.[ch]); do M=$(echo $N | sed 's/perf_counter/perf_event/g') mv $N $M done FILES=$(find . -name perf_event.*) sed -i \ -e 's/COUNTER_MASK/REG_MASK/g' \ -e 's/COUNTER/EVENT/g' \ -e 's/\<event\>/event_id/g' \ -e 's/counter/event/g' \ -e 's/Counter/Event/g' \ $FILES ... to keep it as correct as possible. This script can also be used by anyone who has pending perfcounters patches - it converts a Linux kernel tree over to the new naming. We tried to time this change to the point in time where the amount of pending patches is the smallest: the end of the merge window. Namespace clashes were fixed up in a preparatory patch - and some stylistic fallout will be fixed up in a subsequent patch. ( NOTE: 'counters' are still the proper terminology when we deal with hardware registers - and these sed scripts are a bit over-eager in renaming them. I've undone some of that, but in case there's something left where 'counter' would be better than 'event' we can undo that on an individual basis instead of touching an otherwise nicely automated patch. ) Suggested-by: Stephane Eranian <eranian@google.com> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Acked-by: Paul Mackerras <paulus@samba.org> Reviewed-by: Arjan van de Ven <arjan@linux.intel.com> Cc: Mike Galbraith <efault@gmx.de> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: David Howells <dhowells@redhat.com> Cc: Kyle McMartin <kyle@mcmartin.ca> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: "David S. Miller" <davem@davemloft.net> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: <linux-arch@vger.kernel.org> LKML-Reference: <new-submission> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-21 18:02:48 +08:00
#ifdef CONFIG_PERF_EVENTS
struct perf_event_context *perf_event_ctxp[perf_nr_task_contexts];
perf: Do the big rename: Performance Counters -> Performance Events Bye-bye Performance Counters, welcome Performance Events! In the past few months the perfcounters subsystem has grown out its initial role of counting hardware events, and has become (and is becoming) a much broader generic event enumeration, reporting, logging, monitoring, analysis facility. Naming its core object 'perf_counter' and naming the subsystem 'perfcounters' has become more and more of a misnomer. With pending code like hw-breakpoints support the 'counter' name is less and less appropriate. All in one, we've decided to rename the subsystem to 'performance events' and to propagate this rename through all fields, variables and API names. (in an ABI compatible fashion) The word 'event' is also a bit shorter than 'counter' - which makes it slightly more convenient to write/handle as well. Thanks goes to Stephane Eranian who first observed this misnomer and suggested a rename. User-space tooling and ABI compatibility is not affected - this patch should be function-invariant. (Also, defconfigs were not touched to keep the size down.) This patch has been generated via the following script: FILES=$(find * -type f | grep -vE 'oprofile|[^K]config') sed -i \ -e 's/PERF_EVENT_/PERF_RECORD_/g' \ -e 's/PERF_COUNTER/PERF_EVENT/g' \ -e 's/perf_counter/perf_event/g' \ -e 's/nb_counters/nb_events/g' \ -e 's/swcounter/swevent/g' \ -e 's/tpcounter_event/tp_event/g' \ $FILES for N in $(find . -name perf_counter.[ch]); do M=$(echo $N | sed 's/perf_counter/perf_event/g') mv $N $M done FILES=$(find . -name perf_event.*) sed -i \ -e 's/COUNTER_MASK/REG_MASK/g' \ -e 's/COUNTER/EVENT/g' \ -e 's/\<event\>/event_id/g' \ -e 's/counter/event/g' \ -e 's/Counter/Event/g' \ $FILES ... to keep it as correct as possible. This script can also be used by anyone who has pending perfcounters patches - it converts a Linux kernel tree over to the new naming. We tried to time this change to the point in time where the amount of pending patches is the smallest: the end of the merge window. Namespace clashes were fixed up in a preparatory patch - and some stylistic fallout will be fixed up in a subsequent patch. ( NOTE: 'counters' are still the proper terminology when we deal with hardware registers - and these sed scripts are a bit over-eager in renaming them. I've undone some of that, but in case there's something left where 'counter' would be better than 'event' we can undo that on an individual basis instead of touching an otherwise nicely automated patch. ) Suggested-by: Stephane Eranian <eranian@google.com> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Acked-by: Paul Mackerras <paulus@samba.org> Reviewed-by: Arjan van de Ven <arjan@linux.intel.com> Cc: Mike Galbraith <efault@gmx.de> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: David Howells <dhowells@redhat.com> Cc: Kyle McMartin <kyle@mcmartin.ca> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: "David S. Miller" <davem@davemloft.net> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: "H. Peter Anvin" <hpa@zytor.com> Cc: <linux-arch@vger.kernel.org> LKML-Reference: <new-submission> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-21 18:02:48 +08:00
struct mutex perf_event_mutex;
struct list_head perf_event_list;
perf_counter: Dynamically allocate tasks' perf_counter_context struct This replaces the struct perf_counter_context in the task_struct with a pointer to a dynamically allocated perf_counter_context struct. The main reason for doing is this is to allow us to transfer a perf_counter_context from one task to another when we do lazy PMU switching in a later patch. This has a few side-benefits: the task_struct becomes a little smaller, we save some memory because only tasks that have perf_counters attached get a perf_counter_context allocated for them, and we can remove the inclusion of <linux/perf_counter.h> in sched.h, meaning that we don't end up recompiling nearly everything whenever perf_counter.h changes. The perf_counter_context structures are reference-counted and freed when the last reference is dropped. A context can have references from its task and the counters on its task. Counters can outlive the task so it is possible that a context will be freed well after its task has exited. Contexts are allocated on fork if the parent had a context, or otherwise the first time that a per-task counter is created on a task. In the latter case, we set the context pointer in the task struct locklessly using an atomic compare-and-exchange operation in case we raced with some other task in creating a context for the subject task. This also removes the task pointer from the perf_counter struct. The task pointer was not used anywhere and would make it harder to move a context from one task to another. Anything that needed to know which task a counter was attached to was already using counter->ctx->task. The __perf_counter_init_context function moves up in perf_counter.c so that it can be called from find_get_context, and now initializes the refcount, but is otherwise unchanged. We were potentially calling list_del_counter twice: once from __perf_counter_exit_task when the task exits and once from __perf_counter_remove_from_context when the counter's fd gets closed. This adds a check in list_del_counter so it doesn't do anything if the counter has already been removed from the lists. Since perf_counter_task_sched_in doesn't do anything if the task doesn't have a context, and leaves cpuctx->task_ctx = NULL, this adds code to __perf_install_in_context to set cpuctx->task_ctx if necessary, i.e. in the case where the current task adds the first counter to itself and thus creates a context for itself. This also adds similar code to __perf_counter_enable to handle a similar situation which can arise when the counters have been disabled using prctl; that also leaves cpuctx->task_ctx = NULL. [ Impact: refactor counter context management to prepare for new feature ] Signed-off-by: Paul Mackerras <paulus@samba.org> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Corey Ashford <cjashfor@linux.vnet.ibm.com> Cc: Marcelo Tosatti <mtosatti@redhat.com> Cc: Arnaldo Carvalho de Melo <acme@redhat.com> LKML-Reference: <18966.10075.781053.231153@cargo.ozlabs.ibm.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-05-22 12:17:31 +08:00
#endif
#ifdef CONFIG_NUMA
cpuset,mm: update tasks' mems_allowed in time Fix allocating page cache/slab object on the unallowed node when memory spread is set by updating tasks' mems_allowed after its cpuset's mems is changed. In order to update tasks' mems_allowed in time, we must modify the code of memory policy. Because the memory policy is applied in the process's context originally. After applying this patch, one task directly manipulates anothers mems_allowed, and we use alloc_lock in the task_struct to protect mems_allowed and memory policy of the task. But in the fast path, we didn't use lock to protect them, because adding a lock may lead to performance regression. But if we don't add a lock,the task might see no nodes when changing cpuset's mems_allowed to some non-overlapping set. In order to avoid it, we set all new allowed nodes, then clear newly disallowed ones. [lee.schermerhorn@hp.com: The rework of mpol_new() to extract the adjusting of the node mask to apply cpuset and mpol flags "context" breaks set_mempolicy() and mbind() with MPOL_PREFERRED and a NULL nodemask--i.e., explicit local allocation. Fix this by adding the check for MPOL_PREFERRED and empty node mask to mpol_new_mpolicy(). Remove the now unneeded 'nodes = NULL' from mpol_new(). Note that mpol_new_mempolicy() is always called with a non-NULL 'nodes' parameter now that it has been removed from mpol_new(). Therefore, we don't need to test nodes for NULL before testing it for 'empty'. However, just to be extra paranoid, add a VM_BUG_ON() to verify this assumption.] [lee.schermerhorn@hp.com: I don't think the function name 'mpol_new_mempolicy' is descriptive enough to differentiate it from mpol_new(). This function applies cpuset set context, usually constraining nodes to those allowed by the cpuset. However, when the 'RELATIVE_NODES flag is set, it also translates the nodes. So I settled on 'mpol_set_nodemask()', because the comment block for mpol_new() mentions that we need to call this function to "set nodes". Some additional minor line length, whitespace and typo cleanup.] Signed-off-by: Miao Xie <miaox@cn.fujitsu.com> Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Christoph Lameter <cl@linux-foundation.org> Cc: Paul Menage <menage@google.com> Cc: Nick Piggin <nickpiggin@yahoo.com.au> Cc: Yasunori Goto <y-goto@jp.fujitsu.com> Cc: Pekka Enberg <penberg@cs.helsinki.fi> Cc: David Rientjes <rientjes@google.com> Signed-off-by: Lee Schermerhorn <lee.schermerhorn@hp.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-06-17 06:31:49 +08:00
struct mempolicy *mempolicy; /* Protected by alloc_lock */
short il_next;
short pref_node_fork;
#endif
struct rcu_head rcu;
/*
* cache last used pipe for splice
*/
struct pipe_inode_info *splice_pipe;
#ifdef CONFIG_TASK_DELAY_ACCT
struct task_delay_info *delays;
#endif
#ifdef CONFIG_FAULT_INJECTION
int make_it_fail;
#endif
writeback: per task dirty rate limit Add two fields to task_struct. 1) account dirtied pages in the individual tasks, for accuracy 2) per-task balance_dirty_pages() call intervals, for flexibility The balance_dirty_pages() call interval (ie. nr_dirtied_pause) will scale near-sqrt to the safety gap between dirty pages and threshold. The main problem of per-task nr_dirtied is, if 1k+ tasks start dirtying pages at exactly the same time, each task will be assigned a large initial nr_dirtied_pause, so that the dirty threshold will be exceeded long before each task reached its nr_dirtied_pause and hence call balance_dirty_pages(). The solution is to watch for the number of pages dirtied on each CPU in between the calls into balance_dirty_pages(). If it exceeds ratelimit_pages (3% dirty threshold), force call balance_dirty_pages() for a chance to set bdi->dirty_exceeded. In normal situations, this safeguarding condition is not expected to trigger at all. On the sqrt in dirty_poll_interval(): It will serve as an initial guess when dirty pages are still in the freerun area. When dirty pages are floating inside the dirty control scope [freerun, limit], a followup patch will use some refined dirty poll interval to get the desired pause time. thresh-dirty (MB) sqrt 1 16 2 22 4 32 8 45 16 64 32 90 64 128 128 181 256 256 512 362 1024 512 The above table means, given 1MB (or 1GB) gap and the dd tasks polling balance_dirty_pages() on every 16 (or 512) pages, the dirty limit won't be exceeded as long as there are less than 16 (or 512) concurrent dd's. So sqrt naturally leads to less overheads and more safe concurrent tasks for large memory servers, which have large (thresh-freerun) gaps. peter: keep the per-CPU ratelimit for safeguarding the 1k+ tasks case CC: Peter Zijlstra <a.p.zijlstra@chello.nl> Reviewed-by: Andrea Righi <andrea@betterlinux.com> Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
2011-06-12 08:10:12 +08:00
/*
* when (nr_dirtied >= nr_dirtied_pause), it's time to call
* balance_dirty_pages() for some dirty throttling pause
*/
int nr_dirtied;
int nr_dirtied_pause;
unsigned long dirty_paused_when; /* start of a write-and-pause period */
writeback: per task dirty rate limit Add two fields to task_struct. 1) account dirtied pages in the individual tasks, for accuracy 2) per-task balance_dirty_pages() call intervals, for flexibility The balance_dirty_pages() call interval (ie. nr_dirtied_pause) will scale near-sqrt to the safety gap between dirty pages and threshold. The main problem of per-task nr_dirtied is, if 1k+ tasks start dirtying pages at exactly the same time, each task will be assigned a large initial nr_dirtied_pause, so that the dirty threshold will be exceeded long before each task reached its nr_dirtied_pause and hence call balance_dirty_pages(). The solution is to watch for the number of pages dirtied on each CPU in between the calls into balance_dirty_pages(). If it exceeds ratelimit_pages (3% dirty threshold), force call balance_dirty_pages() for a chance to set bdi->dirty_exceeded. In normal situations, this safeguarding condition is not expected to trigger at all. On the sqrt in dirty_poll_interval(): It will serve as an initial guess when dirty pages are still in the freerun area. When dirty pages are floating inside the dirty control scope [freerun, limit], a followup patch will use some refined dirty poll interval to get the desired pause time. thresh-dirty (MB) sqrt 1 16 2 22 4 32 8 45 16 64 32 90 64 128 128 181 256 256 512 362 1024 512 The above table means, given 1MB (or 1GB) gap and the dd tasks polling balance_dirty_pages() on every 16 (or 512) pages, the dirty limit won't be exceeded as long as there are less than 16 (or 512) concurrent dd's. So sqrt naturally leads to less overheads and more safe concurrent tasks for large memory servers, which have large (thresh-freerun) gaps. peter: keep the per-CPU ratelimit for safeguarding the 1k+ tasks case CC: Peter Zijlstra <a.p.zijlstra@chello.nl> Reviewed-by: Andrea Righi <andrea@betterlinux.com> Signed-off-by: Wu Fengguang <fengguang.wu@intel.com>
2011-06-12 08:10:12 +08:00
#ifdef CONFIG_LATENCYTOP
int latency_record_count;
struct latency_record latency_record[LT_SAVECOUNT];
#endif
/*
* time slack values; these are used to round up poll() and
* select() etc timeout values. These are in nanoseconds.
*/
unsigned long timer_slack_ns;
unsigned long default_timer_slack_ns;
struct list_head *scm_work_list;
#ifdef CONFIG_FUNCTION_GRAPH_TRACER
/* Index of current stored address in ret_stack */
int curr_ret_stack;
/* Stack of return addresses for return function tracing */
struct ftrace_ret_stack *ret_stack;
/* time stamp for last schedule */
unsigned long long ftrace_timestamp;
/*
* Number of functions that haven't been traced
* because of depth overrun.
*/
atomic_t trace_overrun;
/* Pause for the tracing */
atomic_t tracing_graph_pause;
#endif
ftrace: graph of a single function This patch adds the file: /debugfs/tracing/set_graph_function which can be used along with the function graph tracer. When this file is empty, the function graph tracer will act as usual. When the file has a function in it, the function graph tracer will only trace that function. For example: # echo blk_unplug > /debugfs/tracing/set_graph_function # cat /debugfs/tracing/trace [...] ------------------------------------------ | 2) make-19003 => kjournald-2219 ------------------------------------------ 2) | blk_unplug() { 2) | dm_unplug_all() { 2) | dm_get_table() { 2) 1.381 us | _read_lock(); 2) 0.911 us | dm_table_get(); 2) 1. 76 us | _read_unlock(); 2) + 12.912 us | } 2) | dm_table_unplug_all() { 2) | blk_unplug() { 2) 0.778 us | generic_unplug_device(); 2) 2.409 us | } 2) 5.992 us | } 2) 0.813 us | dm_table_put(); 2) + 29. 90 us | } 2) + 34.532 us | } You can add up to 32 functions into this file. Currently we limit it to 32, but this may change with later improvements. To add another function, use the append '>>': # echo sys_read >> /debugfs/tracing/set_graph_function # cat /debugfs/tracing/set_graph_function blk_unplug sys_read Using the '>' will clear out the function and write anew: # echo sys_write > /debug/tracing/set_graph_function # cat /debug/tracing/set_graph_function sys_write Note, if you have function graph running while doing this, the small time between clearing it and updating it will cause the graph to record all functions. This should not be an issue because after it sets the filter, only those functions will be recorded from then on. If you need to only record a particular function then set this file first before starting the function graph tracer. In the future this side effect may be corrected. The set_graph_function file is similar to the set_ftrace_filter but it does not take wild cards nor does it allow for more than one function to be set with a single write. There is no technical reason why this is the case, I just do not have the time yet to implement that. Note, dynamic ftrace must be enabled for this to appear because it uses the dynamic ftrace records to match the name to the mcount call sites. Signed-off-by: Steven Rostedt <srostedt@redhat.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-12-04 04:36:57 +08:00
#ifdef CONFIG_TRACING
/* state flags for use by tracers */
unsigned long trace;
ftrace: Add internal recursive checks Witold reported a reboot caused by the selftests of the dynamic function tracer. He sent me a config and I used ktest to do a config_bisect on it (as my config did not cause the crash). It pointed out that the problem config was CONFIG_PROVE_RCU. What happened was that if multiple callbacks are attached to the function tracer, we iterate a list of callbacks. Because the list is managed by synchronize_sched() and preempt_disable, the access to the pointers uses rcu_dereference_raw(). When PROVE_RCU is enabled, the rcu_dereference_raw() calls some debugging functions, which happen to be traced. The tracing of the debug function would then call rcu_dereference_raw() which would then call the debug function and then... well you get the idea. I first wrote two different patches to solve this bug. 1) add a __rcu_dereference_raw() that would not do any checks. 2) add notrace to the offending debug functions. Both of these patches worked. Talking with Paul McKenney on IRC, he suggested to add recursion detection instead. This seemed to be a better solution, so I decided to implement it. As the task_struct already has a trace_recursion to detect recursion in the ring buffer, and that has a very small number it allows, I decided to use that same variable to add flags that can detect the recursion inside the infrastructure of the function tracer. I plan to change it so that the task struct bit can be checked in mcount, but as that requires changes to all archs, I will hold that off to the next merge window. Cc: Ingo Molnar <mingo@elte.hu> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Thomas Gleixner <tglx@linutronix.de> Cc: Frederic Weisbecker <fweisbec@gmail.com> Cc: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Link: http://lkml.kernel.org/r/1306348063.1465.116.camel@gandalf.stny.rr.com Reported-by: Witold Baryluk <baryluk@smp.if.uj.edu.pl> Signed-off-by: Steven Rostedt <rostedt@goodmis.org>
2011-05-26 02:27:43 +08:00
/* bitmask and counter of trace recursion */
unsigned long trace_recursion;
#endif /* CONFIG_TRACING */
memcg: coalesce uncharge during unmap/truncate In massive parallel enviroment, res_counter can be a performance bottleneck. One strong techinque to reduce lock contention is reducing calls by coalescing some amount of calls into one. Considering charge/uncharge chatacteristic, - charge is done one by one via demand-paging. - uncharge is done by - in chunk at munmap, truncate, exit, execve... - one by one via vmscan/paging. It seems we have a chance to coalesce uncharges for improving scalability at unmap/truncation. This patch is a for coalescing uncharge. For avoiding scattering memcg's structure to functions under /mm, this patch adds memcg batch uncharge information to the task. A reason for per-task batching is for making use of caller's context information. We do batched uncharge (deleyed uncharge) when truncation/unmap occurs but do direct uncharge when uncharge is called by memory reclaim (vmscan.c). The degree of coalescing depends on callers - at invalidate/trucate... pagevec size - at unmap ....ZAP_BLOCK_SIZE (memory itself will be freed in this degree.) Then, we'll not coalescing too much. On x86-64 8cpu server, I tested overheads of memcg at page fault by running a program which does map/fault/unmap in a loop. Running a task per a cpu by taskset and see sum of the number of page faults in 60secs. [without memcg config] 40156968 page-faults # 0.085 M/sec ( +- 0.046% ) 27.67 cache-miss/faults [root cgroup] 36659599 page-faults # 0.077 M/sec ( +- 0.247% ) 31.58 miss/faults [in a child cgroup] 18444157 page-faults # 0.039 M/sec ( +- 0.133% ) 69.96 miss/faults [child with this patch] 27133719 page-faults # 0.057 M/sec ( +- 0.155% ) 47.16 miss/faults We can see some amounts of improvement. (root cgroup doesn't affected by this patch) Another patch for "charge" will follow this and above will be improved more. Changelog(since 2009/10/02): - renamed filed of memcg_batch (as pages to bytes, memsw to memsw_bytes) - some clean up and commentary/description updates. - added initialize code to copy_process(). (possible bug fix) Changelog(old): - fixed !CONFIG_MEM_CGROUP case. - rebased onto the latest mmotm + softlimit fix patches. - unified patch for callers - added commetns. - make ->do_batch as bool. - removed css_get() at el. We don't need it. Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-12-16 08:47:03 +08:00
#ifdef CONFIG_CGROUP_MEM_RES_CTLR /* memcg uses this to do batch job */
struct memcg_batch_info {
int do_batch; /* incremented when batch uncharge started */
struct mem_cgroup *memcg; /* target memcg of uncharge */
unsigned long nr_pages; /* uncharged usage */
unsigned long memsw_nr_pages; /* uncharged mem+swap usage */
memcg: coalesce uncharge during unmap/truncate In massive parallel enviroment, res_counter can be a performance bottleneck. One strong techinque to reduce lock contention is reducing calls by coalescing some amount of calls into one. Considering charge/uncharge chatacteristic, - charge is done one by one via demand-paging. - uncharge is done by - in chunk at munmap, truncate, exit, execve... - one by one via vmscan/paging. It seems we have a chance to coalesce uncharges for improving scalability at unmap/truncation. This patch is a for coalescing uncharge. For avoiding scattering memcg's structure to functions under /mm, this patch adds memcg batch uncharge information to the task. A reason for per-task batching is for making use of caller's context information. We do batched uncharge (deleyed uncharge) when truncation/unmap occurs but do direct uncharge when uncharge is called by memory reclaim (vmscan.c). The degree of coalescing depends on callers - at invalidate/trucate... pagevec size - at unmap ....ZAP_BLOCK_SIZE (memory itself will be freed in this degree.) Then, we'll not coalescing too much. On x86-64 8cpu server, I tested overheads of memcg at page fault by running a program which does map/fault/unmap in a loop. Running a task per a cpu by taskset and see sum of the number of page faults in 60secs. [without memcg config] 40156968 page-faults # 0.085 M/sec ( +- 0.046% ) 27.67 cache-miss/faults [root cgroup] 36659599 page-faults # 0.077 M/sec ( +- 0.247% ) 31.58 miss/faults [in a child cgroup] 18444157 page-faults # 0.039 M/sec ( +- 0.133% ) 69.96 miss/faults [child with this patch] 27133719 page-faults # 0.057 M/sec ( +- 0.155% ) 47.16 miss/faults We can see some amounts of improvement. (root cgroup doesn't affected by this patch) Another patch for "charge" will follow this and above will be improved more. Changelog(since 2009/10/02): - renamed filed of memcg_batch (as pages to bytes, memsw to memsw_bytes) - some clean up and commentary/description updates. - added initialize code to copy_process(). (possible bug fix) Changelog(old): - fixed !CONFIG_MEM_CGROUP case. - rebased onto the latest mmotm + softlimit fix patches. - unified patch for callers - added commetns. - make ->do_batch as bool. - removed css_get() at el. We don't need it. Signed-off-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-12-16 08:47:03 +08:00
} memcg_batch;
#endif
#ifdef CONFIG_HAVE_HW_BREAKPOINT
atomic_t ptrace_bp_refcnt;
#endif
};
/* Future-safe accessor for struct task_struct's cpus_allowed. */
#define tsk_cpus_allowed(tsk) (&(tsk)->cpus_allowed)
/*
* Priority of a process goes from 0..MAX_PRIO-1, valid RT
* priority is 0..MAX_RT_PRIO-1, and SCHED_NORMAL/SCHED_BATCH
* tasks are in the range MAX_RT_PRIO..MAX_PRIO-1. Priority
* values are inverted: lower p->prio value means higher priority.
*
* The MAX_USER_RT_PRIO value allows the actual maximum
* RT priority to be separate from the value exported to
* user-space. This allows kernel threads to set their
* priority to a value higher than any user task. Note:
* MAX_RT_PRIO must not be smaller than MAX_USER_RT_PRIO.
*/
#define MAX_USER_RT_PRIO 100
#define MAX_RT_PRIO MAX_USER_RT_PRIO
#define MAX_PRIO (MAX_RT_PRIO + 40)
#define DEFAULT_PRIO (MAX_RT_PRIO + 20)
static inline int rt_prio(int prio)
{
if (unlikely(prio < MAX_RT_PRIO))
return 1;
return 0;
}
static inline int rt_task(struct task_struct *p)
{
return rt_prio(p->prio);
}
static inline struct pid *task_pid(struct task_struct *task)
[PATCH] pid: implement access helpers for a tacks various process groups In the last round of cleaning up the pid hash table a more general struct pid was introduced, that can be referenced counted. With the more general struct pid most if not all places where we store a pid_t we can now store a struct pid * and remove the need for a hash table lookup, and avoid any possible problems with pid roll over. Looking forward to the pid namespaces struct pid * gives us an absolute form a pid so we can compare and use them without caring which pid namespace we are in. This patchset introduces the infrastructure needed to use struct pid instead of pid_t, and then it goes on to convert two different kernel users that currently store a pid_t value. There are a lot more places to go but this is enough to get the basic idea. Before we can merge a pid namespace patch all of the kernel pid_t users need to be examined. Those that deal with user space processes need to be converted to using a struct pid *. Those that deal with kernel processes need to converted to using the kthread api. A rare few that only use their current processes pid values get to be left alone. This patch: task_session returns the struct pid of a tasks session. task_pgrp returns the struct pid of a tasks process group. task_tgid returns the struct pid of a tasks thread group. task_pid returns the struct pid of a tasks process id. These can be used to avoid unnecessary hash table lookups, and to implement safe pid comparisions in the face of a pid namespace. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-02 17:17:09 +08:00
{
return task->pids[PIDTYPE_PID].pid;
}
static inline struct pid *task_tgid(struct task_struct *task)
[PATCH] pid: implement access helpers for a tacks various process groups In the last round of cleaning up the pid hash table a more general struct pid was introduced, that can be referenced counted. With the more general struct pid most if not all places where we store a pid_t we can now store a struct pid * and remove the need for a hash table lookup, and avoid any possible problems with pid roll over. Looking forward to the pid namespaces struct pid * gives us an absolute form a pid so we can compare and use them without caring which pid namespace we are in. This patchset introduces the infrastructure needed to use struct pid instead of pid_t, and then it goes on to convert two different kernel users that currently store a pid_t value. There are a lot more places to go but this is enough to get the basic idea. Before we can merge a pid namespace patch all of the kernel pid_t users need to be examined. Those that deal with user space processes need to be converted to using a struct pid *. Those that deal with kernel processes need to converted to using the kthread api. A rare few that only use their current processes pid values get to be left alone. This patch: task_session returns the struct pid of a tasks session. task_pgrp returns the struct pid of a tasks process group. task_tgid returns the struct pid of a tasks thread group. task_pid returns the struct pid of a tasks process id. These can be used to avoid unnecessary hash table lookups, and to implement safe pid comparisions in the face of a pid namespace. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-02 17:17:09 +08:00
{
return task->group_leader->pids[PIDTYPE_PID].pid;
}
/*
* Without tasklist or rcu lock it is not safe to dereference
* the result of task_pgrp/task_session even if task == current,
* we can race with another thread doing sys_setsid/sys_setpgid.
*/
static inline struct pid *task_pgrp(struct task_struct *task)
[PATCH] pid: implement access helpers for a tacks various process groups In the last round of cleaning up the pid hash table a more general struct pid was introduced, that can be referenced counted. With the more general struct pid most if not all places where we store a pid_t we can now store a struct pid * and remove the need for a hash table lookup, and avoid any possible problems with pid roll over. Looking forward to the pid namespaces struct pid * gives us an absolute form a pid so we can compare and use them without caring which pid namespace we are in. This patchset introduces the infrastructure needed to use struct pid instead of pid_t, and then it goes on to convert two different kernel users that currently store a pid_t value. There are a lot more places to go but this is enough to get the basic idea. Before we can merge a pid namespace patch all of the kernel pid_t users need to be examined. Those that deal with user space processes need to be converted to using a struct pid *. Those that deal with kernel processes need to converted to using the kthread api. A rare few that only use their current processes pid values get to be left alone. This patch: task_session returns the struct pid of a tasks session. task_pgrp returns the struct pid of a tasks process group. task_tgid returns the struct pid of a tasks thread group. task_pid returns the struct pid of a tasks process id. These can be used to avoid unnecessary hash table lookups, and to implement safe pid comparisions in the face of a pid namespace. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-02 17:17:09 +08:00
{
return task->group_leader->pids[PIDTYPE_PGID].pid;
}
static inline struct pid *task_session(struct task_struct *task)
[PATCH] pid: implement access helpers for a tacks various process groups In the last round of cleaning up the pid hash table a more general struct pid was introduced, that can be referenced counted. With the more general struct pid most if not all places where we store a pid_t we can now store a struct pid * and remove the need for a hash table lookup, and avoid any possible problems with pid roll over. Looking forward to the pid namespaces struct pid * gives us an absolute form a pid so we can compare and use them without caring which pid namespace we are in. This patchset introduces the infrastructure needed to use struct pid instead of pid_t, and then it goes on to convert two different kernel users that currently store a pid_t value. There are a lot more places to go but this is enough to get the basic idea. Before we can merge a pid namespace patch all of the kernel pid_t users need to be examined. Those that deal with user space processes need to be converted to using a struct pid *. Those that deal with kernel processes need to converted to using the kthread api. A rare few that only use their current processes pid values get to be left alone. This patch: task_session returns the struct pid of a tasks session. task_pgrp returns the struct pid of a tasks process group. task_tgid returns the struct pid of a tasks thread group. task_pid returns the struct pid of a tasks process id. These can be used to avoid unnecessary hash table lookups, and to implement safe pid comparisions in the face of a pid namespace. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-02 17:17:09 +08:00
{
return task->group_leader->pids[PIDTYPE_SID].pid;
}
struct pid_namespace;
/*
* the helpers to get the task's different pids as they are seen
* from various namespaces
*
* task_xid_nr() : global id, i.e. the id seen from the init namespace;
* task_xid_vnr() : virtual id, i.e. the id seen from the pid namespace of
* current.
* task_xid_nr_ns() : id seen from the ns specified;
*
* set_task_vxid() : assigns a virtual id to a task;
*
* see also pid_nr() etc in include/linux/pid.h
*/
pid_t __task_pid_nr_ns(struct task_struct *task, enum pid_type type,
struct pid_namespace *ns);
static inline pid_t task_pid_nr(struct task_struct *tsk)
{
return tsk->pid;
}
static inline pid_t task_pid_nr_ns(struct task_struct *tsk,
struct pid_namespace *ns)
{
return __task_pid_nr_ns(tsk, PIDTYPE_PID, ns);
}
static inline pid_t task_pid_vnr(struct task_struct *tsk)
{
return __task_pid_nr_ns(tsk, PIDTYPE_PID, NULL);
}
static inline pid_t task_tgid_nr(struct task_struct *tsk)
{
return tsk->tgid;
}
pid_t task_tgid_nr_ns(struct task_struct *tsk, struct pid_namespace *ns);
static inline pid_t task_tgid_vnr(struct task_struct *tsk)
{
return pid_vnr(task_tgid(tsk));
}
static inline pid_t task_pgrp_nr_ns(struct task_struct *tsk,
struct pid_namespace *ns)
{
return __task_pid_nr_ns(tsk, PIDTYPE_PGID, ns);
}
static inline pid_t task_pgrp_vnr(struct task_struct *tsk)
{
return __task_pid_nr_ns(tsk, PIDTYPE_PGID, NULL);
}
static inline pid_t task_session_nr_ns(struct task_struct *tsk,
struct pid_namespace *ns)
{
return __task_pid_nr_ns(tsk, PIDTYPE_SID, ns);
}
static inline pid_t task_session_vnr(struct task_struct *tsk)
{
return __task_pid_nr_ns(tsk, PIDTYPE_SID, NULL);
}
/* obsolete, do not use */
static inline pid_t task_pgrp_nr(struct task_struct *tsk)
{
return task_pgrp_nr_ns(tsk, &init_pid_ns);
}
/**
* pid_alive - check that a task structure is not stale
* @p: Task structure to be checked.
*
* Test if a process is not yet dead (at most zombie state)
* If pid_alive fails, then pointers within the task structure
* can be stale and must not be dereferenced.
*/
static inline int pid_alive(struct task_struct *p)
{
[PATCH] pidhash: Refactor the pid hash table Simplifies the code, reduces the need for 4 pid hash tables, and makes the code more capable. In the discussions I had with Oleg it was felt that to a large extent the cleanup itself justified the work. With struct pid being dynamically allocated meant we could create the hash table entry when the pid was allocated and free the hash table entry when the pid was freed. Instead of playing with the hash lists when ever a process would attach or detach to a process. For myself the fact that it gave what my previous task_ref patch gave for free with simpler code was a big win. The problem is that if you hold a reference to struct task_struct you lock in 10K of low memory. If you do that in a user controllable way like /proc does, with an unprivileged but hostile user space application with typical resource limits of 1000 fds and 100 processes I can trigger the OOM killer by consuming all of low memory with task structs, on a machine wight 1GB of low memory. If I instead hold a reference to struct pid which holds a pointer to my task_struct, I don't suffer from that problem because struct pid is 2 orders of magnitude smaller. In fact struct pid is small enough that most other kernel data structures dwarf it, so simply limiting the number of referring data structures is enough to prevent exhaustion of low memory. This splits the current struct pid into two structures, struct pid and struct pid_link, and reduces our number of hash tables from PIDTYPE_MAX to just one. struct pid_link is the per process linkage into the hash tables and lives in struct task_struct. struct pid is given an indepedent lifetime, and holds pointers to each of the pid types. The independent life of struct pid simplifies attach_pid, and detach_pid, because we are always manipulating the list of pids and not the hash table. In addition in giving struct pid an indpendent life it makes the concept much more powerful. Kernel data structures can now embed a struct pid * instead of a pid_t and not suffer from pid wrap around problems or from keeping unnecessarily large amounts of memory allocated. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 18:31:42 +08:00
return p->pids[PIDTYPE_PID].pid != NULL;
}
/**
pid namespaces: define is_global_init() and is_container_init() is_init() is an ambiguous name for the pid==1 check. Split it into is_global_init() and is_container_init(). A cgroup init has it's tsk->pid == 1. A global init also has it's tsk->pid == 1 and it's active pid namespace is the init_pid_ns. But rather than check the active pid namespace, compare the task structure with 'init_pid_ns.child_reaper', which is initialized during boot to the /sbin/init process and never changes. Changelog: 2.6.22-rc4-mm2-pidns1: - Use 'init_pid_ns.child_reaper' to determine if a given task is the global init (/sbin/init) process. This would improve performance and remove dependence on the task_pid(). 2.6.21-mm2-pidns2: - [Sukadev Bhattiprolu] Changed is_container_init() calls in {powerpc, ppc,avr32}/traps.c for the _exception() call to is_global_init(). This way, we kill only the cgroup if the cgroup's init has a bug rather than force a kernel panic. [akpm@linux-foundation.org: fix comment] [sukadev@us.ibm.com: Use is_global_init() in arch/m32r/mm/fault.c] [bunk@stusta.de: kernel/pid.c: remove unused exports] [sukadev@us.ibm.com: Fix capability.c to work with threaded init] Signed-off-by: Serge E. Hallyn <serue@us.ibm.com> Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com> Acked-by: Pavel Emelianov <xemul@openvz.org> Cc: Eric W. Biederman <ebiederm@xmission.com> Cc: Cedric Le Goater <clg@fr.ibm.com> Cc: Dave Hansen <haveblue@us.ibm.com> Cc: Herbert Poetzel <herbert@13thfloor.at> Cc: Kirill Korotaev <dev@sw.ru> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-19 14:39:52 +08:00
* is_global_init - check if a task structure is init
* @tsk: Task structure to be checked.
*
* Check if a task structure is the first user space task the kernel created.
pid namespaces: define is_global_init() and is_container_init() is_init() is an ambiguous name for the pid==1 check. Split it into is_global_init() and is_container_init(). A cgroup init has it's tsk->pid == 1. A global init also has it's tsk->pid == 1 and it's active pid namespace is the init_pid_ns. But rather than check the active pid namespace, compare the task structure with 'init_pid_ns.child_reaper', which is initialized during boot to the /sbin/init process and never changes. Changelog: 2.6.22-rc4-mm2-pidns1: - Use 'init_pid_ns.child_reaper' to determine if a given task is the global init (/sbin/init) process. This would improve performance and remove dependence on the task_pid(). 2.6.21-mm2-pidns2: - [Sukadev Bhattiprolu] Changed is_container_init() calls in {powerpc, ppc,avr32}/traps.c for the _exception() call to is_global_init(). This way, we kill only the cgroup if the cgroup's init has a bug rather than force a kernel panic. [akpm@linux-foundation.org: fix comment] [sukadev@us.ibm.com: Use is_global_init() in arch/m32r/mm/fault.c] [bunk@stusta.de: kernel/pid.c: remove unused exports] [sukadev@us.ibm.com: Fix capability.c to work with threaded init] Signed-off-by: Serge E. Hallyn <serue@us.ibm.com> Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com> Acked-by: Pavel Emelianov <xemul@openvz.org> Cc: Eric W. Biederman <ebiederm@xmission.com> Cc: Cedric Le Goater <clg@fr.ibm.com> Cc: Dave Hansen <haveblue@us.ibm.com> Cc: Herbert Poetzel <herbert@13thfloor.at> Cc: Kirill Korotaev <dev@sw.ru> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-19 14:39:52 +08:00
*/
static inline int is_global_init(struct task_struct *tsk)
{
return tsk->pid == 1;
}
pid namespaces: define is_global_init() and is_container_init() is_init() is an ambiguous name for the pid==1 check. Split it into is_global_init() and is_container_init(). A cgroup init has it's tsk->pid == 1. A global init also has it's tsk->pid == 1 and it's active pid namespace is the init_pid_ns. But rather than check the active pid namespace, compare the task structure with 'init_pid_ns.child_reaper', which is initialized during boot to the /sbin/init process and never changes. Changelog: 2.6.22-rc4-mm2-pidns1: - Use 'init_pid_ns.child_reaper' to determine if a given task is the global init (/sbin/init) process. This would improve performance and remove dependence on the task_pid(). 2.6.21-mm2-pidns2: - [Sukadev Bhattiprolu] Changed is_container_init() calls in {powerpc, ppc,avr32}/traps.c for the _exception() call to is_global_init(). This way, we kill only the cgroup if the cgroup's init has a bug rather than force a kernel panic. [akpm@linux-foundation.org: fix comment] [sukadev@us.ibm.com: Use is_global_init() in arch/m32r/mm/fault.c] [bunk@stusta.de: kernel/pid.c: remove unused exports] [sukadev@us.ibm.com: Fix capability.c to work with threaded init] Signed-off-by: Serge E. Hallyn <serue@us.ibm.com> Signed-off-by: Sukadev Bhattiprolu <sukadev@us.ibm.com> Acked-by: Pavel Emelianov <xemul@openvz.org> Cc: Eric W. Biederman <ebiederm@xmission.com> Cc: Cedric Le Goater <clg@fr.ibm.com> Cc: Dave Hansen <haveblue@us.ibm.com> Cc: Herbert Poetzel <herbert@13thfloor.at> Cc: Kirill Korotaev <dev@sw.ru> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-19 14:39:52 +08:00
/*
* is_container_init:
* check whether in the task is init in its own pid namespace.
*/
extern int is_container_init(struct task_struct *tsk);
extern struct pid *cad_pid;
extern void free_task(struct task_struct *tsk);
#define get_task_struct(tsk) do { atomic_inc(&(tsk)->usage); } while(0)
extern void __put_task_struct(struct task_struct *t);
static inline void put_task_struct(struct task_struct *t)
{
if (atomic_dec_and_test(&t->usage))
[PATCH] task: RCU protect task->usage A big problem with rcu protected data structures that are also reference counted is that you must jump through several hoops to increase the reference count. I think someone finally implemented atomic_inc_not_zero(&count) to automate the common case. Unfortunately this means you must special case the rcu access case. When data structures are only visible via rcu in a manner that is not determined by the reference count on the object (i.e. tasks are visible until their zombies are reaped) there is a much simpler technique we can employ. Simply delaying the decrement of the reference count until the rcu interval is over. What that means is that the proc code that looks up a task and later wants to sleep can now do: rcu_read_lock(); task = find_task_by_pid(some_pid); if (task) { get_task_struct(task); } rcu_read_unlock(); The effect on the rest of the kernel is that put_task_struct becomes cheaper and immediate, and in the case where the task has been reaped it frees the task immediate instead of unnecessarily waiting an until the rcu interval is over. Cleanup of task_struct does not happen when its reference count drops to zero, instead cleanup happens when release_task is called. Tasks can only be looked up via rcu before release_task is called. All rcu protected members of task_struct are freed by release_task. Therefore we can move call_rcu from put_task_struct into release_task. And we can modify release_task to not immediately release the reference count but instead have it call put_task_struct from the function it gives to call_rcu. The end result: - get_task_struct is safe in an rcu context where we have just looked up the task. - put_task_struct() simplifies into its old pre rcu self. This reorganization also makes put_task_struct uncallable from modules as it is not exported but it does not appear to be called from any modules so this should not be an issue, and is trivially fixed. Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-03-31 18:31:37 +08:00
__put_task_struct(t);
}
extern void task_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
sched, cputime: Introduce thread_group_times() This is a real fix for problem of utime/stime values decreasing described in the thread: http://lkml.org/lkml/2009/11/3/522 Now cputime is accounted in the following way: - {u,s}time in task_struct are increased every time when the thread is interrupted by a tick (timer interrupt). - When a thread exits, its {u,s}time are added to signal->{u,s}time, after adjusted by task_times(). - When all threads in a thread_group exits, accumulated {u,s}time (and also c{u,s}time) in signal struct are added to c{u,s}time in signal struct of the group's parent. So {u,s}time in task struct are "raw" tick count, while {u,s}time and c{u,s}time in signal struct are "adjusted" values. And accounted values are used by: - task_times(), to get cputime of a thread: This function returns adjusted values that originates from raw {u,s}time and scaled by sum_exec_runtime that accounted by CFS. - thread_group_cputime(), to get cputime of a thread group: This function returns sum of all {u,s}time of living threads in the group, plus {u,s}time in the signal struct that is sum of adjusted cputimes of all exited threads belonged to the group. The problem is the return value of thread_group_cputime(), because it is mixed sum of "raw" value and "adjusted" value: group's {u,s}time = foreach(thread){{u,s}time} + exited({u,s}time) This misbehavior can break {u,s}time monotonicity. Assume that if there is a thread that have raw values greater than adjusted values (e.g. interrupted by 1000Hz ticks 50 times but only runs 45ms) and if it exits, cputime will decrease (e.g. -5ms). To fix this, we could do: group's {u,s}time = foreach(t){task_times(t)} + exited({u,s}time) But task_times() contains hard divisions, so applying it for every thread should be avoided. This patch fixes the above problem in the following way: - Modify thread's exit (= __exit_signal()) not to use task_times(). It means {u,s}time in signal struct accumulates raw values instead of adjusted values. As the result it makes thread_group_cputime() to return pure sum of "raw" values. - Introduce a new function thread_group_times(*task, *utime, *stime) that converts "raw" values of thread_group_cputime() to "adjusted" values, in same calculation procedure as task_times(). - Modify group's exit (= wait_task_zombie()) to use this introduced thread_group_times(). It make c{u,s}time in signal struct to have adjusted values like before this patch. - Replace some thread_group_cputime() by thread_group_times(). This replacements are only applied where conveys the "adjusted" cputime to users, and where already uses task_times() near by it. (i.e. sys_times(), getrusage(), and /proc/<PID>/stat.) This patch have a positive side effect: - Before this patch, if a group contains many short-life threads (e.g. runs 0.9ms and not interrupted by ticks), the group's cputime could be invisible since thread's cputime was accumulated after adjusted: imagine adjustment function as adj(ticks, runtime), {adj(0, 0.9) + adj(0, 0.9) + ....} = {0 + 0 + ....} = 0. After this patch it will not happen because the adjustment is applied after accumulated. v2: - remove if()s, put new variables into signal_struct. Signed-off-by: Hidetoshi Seto <seto.hidetoshi@jp.fujitsu.com> Acked-by: Peter Zijlstra <peterz@infradead.org> Cc: Spencer Candland <spencer@bluehost.com> Cc: Americo Wang <xiyou.wangcong@gmail.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Stanislaw Gruszka <sgruszka@redhat.com> LKML-Reference: <4B162517.8040909@jp.fujitsu.com> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-12-02 16:28:07 +08:00
extern void thread_group_times(struct task_struct *p, cputime_t *ut, cputime_t *st);
/*
* Per process flags
*/
#define PF_EXITING 0x00000004 /* getting shut down */
pi-futex: fix exit races and locking problems 1. New entries can be added to tsk->pi_state_list after task completed exit_pi_state_list(). The result is memory leakage and deadlocks. 2. handle_mm_fault() is called under spinlock. The result is obvious. 3. results in self-inflicted deadlock inside glibc. Sometimes futex_lock_pi returns -ESRCH, when it is not expected and glibc enters to for(;;) sleep() to simulate deadlock. This problem is quite obvious and I think the patch is right. Though it looks like each "if" in futex_lock_pi() got some stupid special case "else if". :-) 4. sometimes futex_lock_pi() returns -EDEADLK, when nobody has the lock. The reason is also obvious (see comment in the patch), but correct fix is far beyond my comprehension. I guess someone already saw this, the chunk: if (rt_mutex_trylock(&q.pi_state->pi_mutex)) ret = 0; is obviously from the same opera. But it does not work, because the rtmutex is really taken at this point: wake_futex_pi() of previous owner reassigned it to us. My fix works. But it looks very stupid. I would think about removal of shift of ownership in wake_futex_pi() and making all the work in context of process taking lock. From: Thomas Gleixner <tglx@linutronix.de> Fix 1) Avoid the tasklist lock variant of the exit race fix by adding an additional state transition to the exit code. This fixes also the issue, when a task with recursive segfaults is not able to release the futexes. Fix 2) Cleanup the lookup_pi_state() failure path and solve the -ESRCH problem finally. Fix 3) Solve the fixup_pi_state_owner() problem which needs to do the fixup in the lock protected section by using the in_atomic userspace access functions. This removes also the ugly lock drop / unqueue inside of fixup_pi_state() Fix 4) Fix a stale lock in the error path of futex_wake_pi() Added some error checks for verification. The -EDEADLK problem is solved by the rtmutex fixups. Signed-off-by: Thomas Gleixner <tglx@linutronix.de> Acked-by: Ingo Molnar <mingo@elte.hu> Cc: Steven Rostedt <rostedt@goodmis.org> Cc: Ulrich Drepper <drepper@redhat.com> Cc: Eric Dumazet <dada1@cosmosbay.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-06-09 04:47:00 +08:00
#define PF_EXITPIDONE 0x00000008 /* pi exit done on shut down */
#define PF_VCPU 0x00000010 /* I'm a virtual CPU */
#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */
#define PF_FORKNOEXEC 0x00000040 /* forked but didn't exec */
#define PF_MCE_PROCESS 0x00000080 /* process policy on mce errors */
#define PF_SUPERPRIV 0x00000100 /* used super-user privileges */
#define PF_DUMPCORE 0x00000200 /* dumped core */
#define PF_SIGNALED 0x00000400 /* killed by a signal */
#define PF_MEMALLOC 0x00000800 /* Allocating memory */
move RLIMIT_NPROC check from set_user() to do_execve_common() The patch http://lkml.org/lkml/2003/7/13/226 introduced an RLIMIT_NPROC check in set_user() to check for NPROC exceeding via setuid() and similar functions. Before the check there was a possibility to greatly exceed the allowed number of processes by an unprivileged user if the program relied on rlimit only. But the check created new security threat: many poorly written programs simply don't check setuid() return code and believe it cannot fail if executed with root privileges. So, the check is removed in this patch because of too often privilege escalations related to buggy programs. The NPROC can still be enforced in the common code flow of daemons spawning user processes. Most of daemons do fork()+setuid()+execve(). The check introduced in execve() (1) enforces the same limit as in setuid() and (2) doesn't create similar security issues. Neil Brown suggested to track what specific process has exceeded the limit by setting PF_NPROC_EXCEEDED process flag. With the change only this process would fail on execve(), and other processes' execve() behaviour is not changed. Solar Designer suggested to re-check whether NPROC limit is still exceeded at the moment of execve(). If the process was sleeping for days between set*uid() and execve(), and the NPROC counter step down under the limit, the defered execve() failure because NPROC limit was exceeded days ago would be unexpected. If the limit is not exceeded anymore, we clear the flag on successful calls to execve() and fork(). The flag is also cleared on successful calls to set_user() as the limit was exceeded for the previous user, not the current one. Similar check was introduced in -ow patches (without the process flag). v3 - clear PF_NPROC_EXCEEDED on successful calls to set_user(). Reviewed-by: James Morris <jmorris@namei.org> Signed-off-by: Vasiliy Kulikov <segoon@openwall.com> Acked-by: NeilBrown <neilb@suse.de> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2011-08-08 23:02:04 +08:00
#define PF_NPROC_EXCEEDED 0x00001000 /* set_user noticed that RLIMIT_NPROC was exceeded */
#define PF_USED_MATH 0x00002000 /* if unset the fpu must be initialized before use */
#define PF_NOFREEZE 0x00008000 /* this thread should not be frozen */
#define PF_FROZEN 0x00010000 /* frozen for system suspend */
#define PF_FSTRANS 0x00020000 /* inside a filesystem transaction */
#define PF_KSWAPD 0x00040000 /* I am kswapd */
#define PF_LESS_THROTTLE 0x00100000 /* Throttle me less: I clean memory */
#define PF_KTHREAD 0x00200000 /* I am a kernel thread */
#define PF_RANDOMIZE 0x00400000 /* randomize virtual address space */
#define PF_SWAPWRITE 0x00800000 /* Allowed to write to swap */
#define PF_SPREAD_PAGE 0x01000000 /* Spread page cache over cpuset */
#define PF_SPREAD_SLAB 0x02000000 /* Spread some slab caches over cpuset */
#define PF_THREAD_BOUND 0x04000000 /* Thread bound to specific cpu */
#define PF_MCE_EARLY 0x08000000 /* Early kill for mce process policy */
#define PF_MEMPOLICY 0x10000000 /* Non-default NUMA mempolicy */
#define PF_MUTEX_TESTER 0x20000000 /* Thread belongs to the rt mutex tester */
#define PF_FREEZER_SKIP 0x40000000 /* Freezer should not count it as freezable */
/*
* Only the _current_ task can read/write to tsk->flags, but other
* tasks can access tsk->flags in readonly mode for example
* with tsk_used_math (like during threaded core dumping).
* There is however an exception to this rule during ptrace
* or during fork: the ptracer task is allowed to write to the
* child->flags of its traced child (same goes for fork, the parent
* can write to the child->flags), because we're guaranteed the
* child is not running and in turn not changing child->flags
* at the same time the parent does it.
*/
#define clear_stopped_child_used_math(child) do { (child)->flags &= ~PF_USED_MATH; } while (0)
#define set_stopped_child_used_math(child) do { (child)->flags |= PF_USED_MATH; } while (0)
#define clear_used_math() clear_stopped_child_used_math(current)
#define set_used_math() set_stopped_child_used_math(current)
#define conditional_stopped_child_used_math(condition, child) \
do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= (condition) ? PF_USED_MATH : 0; } while (0)
#define conditional_used_math(condition) \
conditional_stopped_child_used_math(condition, current)
#define copy_to_stopped_child_used_math(child) \
do { (child)->flags &= ~PF_USED_MATH, (child)->flags |= current->flags & PF_USED_MATH; } while (0)
/* NOTE: this will return 0 or PF_USED_MATH, it will never return 1 */
#define tsk_used_math(p) ((p)->flags & PF_USED_MATH)
#define used_math() tsk_used_math(current)
signal: Fix premature completion of group stop when interfered by ptrace task->signal->group_stop_count is used to track the progress of group stop. It's initialized to the number of tasks which need to stop for group stop to finish and each stopping or trapping task decrements. However, each task doesn't keep track of whether it decremented the counter or not and if woken up before the group stop is complete and stops again, it can decrement the counter multiple times. Please consider the following example code. static void *worker(void *arg) { while (1) ; return NULL; } int main(void) { pthread_t thread; pid_t pid; int i; pid = fork(); if (!pid) { for (i = 0; i < 5; i++) pthread_create(&thread, NULL, worker, NULL); while (1) ; return 0; } ptrace(PTRACE_ATTACH, pid, NULL, NULL); while (1) { waitid(P_PID, pid, NULL, WSTOPPED); ptrace(PTRACE_SINGLESTEP, pid, NULL, (void *)(long)SIGSTOP); } return 0; } The child creates five threads and the parent continuously traps the first thread and whenever the child gets a signal, SIGSTOP is delivered. If an external process sends SIGSTOP to the child, all other threads in the process should reliably stop. However, due to the above bug, the first thread will often end up consuming group_stop_count multiple times and SIGSTOP often ends up stopping none or part of the other four threads. This patch adds a new field task->group_stop which is protected by siglock and uses GROUP_STOP_CONSUME flag to track which task is still to consume group_stop_count to fix this bug. task_clear_group_stop_pending() and task_participate_group_stop() are added to help manipulating group stop states. As ptrace_stop() now also uses task_participate_group_stop(), it will set SIGNAL_STOP_STOPPED if it completes a group stop. There still are many issues regarding the interaction between group stop and ptrace. Patches to address them will follow. - Oleg spotted duplicate GROUP_STOP_CONSUME. Dropped. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com>
2011-03-23 17:37:00 +08:00
/*
* task->jobctl flags
signal: Fix premature completion of group stop when interfered by ptrace task->signal->group_stop_count is used to track the progress of group stop. It's initialized to the number of tasks which need to stop for group stop to finish and each stopping or trapping task decrements. However, each task doesn't keep track of whether it decremented the counter or not and if woken up before the group stop is complete and stops again, it can decrement the counter multiple times. Please consider the following example code. static void *worker(void *arg) { while (1) ; return NULL; } int main(void) { pthread_t thread; pid_t pid; int i; pid = fork(); if (!pid) { for (i = 0; i < 5; i++) pthread_create(&thread, NULL, worker, NULL); while (1) ; return 0; } ptrace(PTRACE_ATTACH, pid, NULL, NULL); while (1) { waitid(P_PID, pid, NULL, WSTOPPED); ptrace(PTRACE_SINGLESTEP, pid, NULL, (void *)(long)SIGSTOP); } return 0; } The child creates five threads and the parent continuously traps the first thread and whenever the child gets a signal, SIGSTOP is delivered. If an external process sends SIGSTOP to the child, all other threads in the process should reliably stop. However, due to the above bug, the first thread will often end up consuming group_stop_count multiple times and SIGSTOP often ends up stopping none or part of the other four threads. This patch adds a new field task->group_stop which is protected by siglock and uses GROUP_STOP_CONSUME flag to track which task is still to consume group_stop_count to fix this bug. task_clear_group_stop_pending() and task_participate_group_stop() are added to help manipulating group stop states. As ptrace_stop() now also uses task_participate_group_stop(), it will set SIGNAL_STOP_STOPPED if it completes a group stop. There still are many issues regarding the interaction between group stop and ptrace. Patches to address them will follow. - Oleg spotted duplicate GROUP_STOP_CONSUME. Dropped. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com>
2011-03-23 17:37:00 +08:00
*/
#define JOBCTL_STOP_SIGMASK 0xffff /* signr of the last group stop */
signal: Fix premature completion of group stop when interfered by ptrace task->signal->group_stop_count is used to track the progress of group stop. It's initialized to the number of tasks which need to stop for group stop to finish and each stopping or trapping task decrements. However, each task doesn't keep track of whether it decremented the counter or not and if woken up before the group stop is complete and stops again, it can decrement the counter multiple times. Please consider the following example code. static void *worker(void *arg) { while (1) ; return NULL; } int main(void) { pthread_t thread; pid_t pid; int i; pid = fork(); if (!pid) { for (i = 0; i < 5; i++) pthread_create(&thread, NULL, worker, NULL); while (1) ; return 0; } ptrace(PTRACE_ATTACH, pid, NULL, NULL); while (1) { waitid(P_PID, pid, NULL, WSTOPPED); ptrace(PTRACE_SINGLESTEP, pid, NULL, (void *)(long)SIGSTOP); } return 0; } The child creates five threads and the parent continuously traps the first thread and whenever the child gets a signal, SIGSTOP is delivered. If an external process sends SIGSTOP to the child, all other threads in the process should reliably stop. However, due to the above bug, the first thread will often end up consuming group_stop_count multiple times and SIGSTOP often ends up stopping none or part of the other four threads. This patch adds a new field task->group_stop which is protected by siglock and uses GROUP_STOP_CONSUME flag to track which task is still to consume group_stop_count to fix this bug. task_clear_group_stop_pending() and task_participate_group_stop() are added to help manipulating group stop states. As ptrace_stop() now also uses task_participate_group_stop(), it will set SIGNAL_STOP_STOPPED if it completes a group stop. There still are many issues regarding the interaction between group stop and ptrace. Patches to address them will follow. - Oleg spotted duplicate GROUP_STOP_CONSUME. Dropped. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com>
2011-03-23 17:37:00 +08:00
#define JOBCTL_STOP_DEQUEUED_BIT 16 /* stop signal dequeued */
#define JOBCTL_STOP_PENDING_BIT 17 /* task should stop for group stop */
#define JOBCTL_STOP_CONSUME_BIT 18 /* consume group stop count */
job control: introduce JOBCTL_TRAP_STOP and use it for group stop trap do_signal_stop() implemented both normal group stop and trap for group stop while ptraced. This approach has been enough but scheduled changes require trap mechanism which can be used in more generic manner and using group stop trap for generic trap site simplifies both userland visible interface and implementation. This patch adds a new jobctl flag - JOBCTL_TRAP_STOP. When set, it triggers a trap site, which behaves like group stop trap, in get_signal_to_deliver() after checking for pending signals. While ptraced, do_signal_stop() doesn't stop itself. It initiates group stop if requested and schedules JOBCTL_TRAP_STOP and returns. The caller - get_signal_to_deliver() - is responsible for checking whether TRAP_STOP is pending afterwards and handling it. ptrace_attach() is updated to use JOBCTL_TRAP_STOP instead of JOBCTL_STOP_PENDING and __ptrace_unlink() to clear all pending trap bits and TRAPPING so that TRAP_STOP and future trap bits don't linger after detach. While at it, add proper function comment to do_signal_stop() and make it return bool. -v2: __ptrace_unlink() updated to clear JOBCTL_TRAP_MASK and TRAPPING instead of JOBCTL_PENDING_MASK. This avoids accidentally clearing JOBCTL_STOP_CONSUME. Spotted by Oleg. -v3: do_signal_stop() updated to return %false without dropping siglock while ptraced and TRAP_STOP check moved inside for(;;) loop after group stop participation. This avoids unnecessary relocking and also will help avoiding unnecessary traps by consuming group stop before handling pending traps. -v4: Jobctl trap handling moved into a separate function - do_jobctl_trap(). Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Oleg Nesterov <oleg@redhat.com>
2011-06-14 17:20:14 +08:00
#define JOBCTL_TRAP_STOP_BIT 19 /* trap for STOP */
ptrace: implement TRAP_NOTIFY and use it for group stop events Currently there's no way for ptracer to find out whether group stop finished other than polling with INTERRUPT - GETSIGINFO - CONT sequence. This patch implements group stop notification for ptracer using STOP traps. When group stop state of a seized tracee changes, JOBCTL_TRAP_NOTIFY is set, which schedules a STOP trap which is sticky - it isn't cleared by other traps and at least one STOP trap will happen eventually. STOP trap is synchronization point for event notification and the tracer can determine the current group stop state by looking at the signal number portion of exit code (si_status from waitid(2) or si_code from PTRACE_GETSIGINFO). Notifications are generated both on start and end of group stops but, because group stop participation always happens before STOP trap, this doesn't cause an extra trap while tracee is participating in group stop. The symmetry will be useful later. Note that this notification works iff tracee is not trapped. Currently there is no way to be notified of group stop state changes while tracee is trapped. This will be addressed by a later patch. An example program follows. #define PTRACE_SEIZE 0x4206 #define PTRACE_INTERRUPT 0x4207 #define PTRACE_SEIZE_DEVEL 0x80000000 static const struct timespec ts1s = { .tv_sec = 1 }; int main(int argc, char **argv) { pid_t tracee, tracer; int i; tracee = fork(); if (!tracee) while (1) pause(); tracer = fork(); if (!tracer) { siginfo_t si; ptrace(PTRACE_SEIZE, tracee, NULL, (void *)(unsigned long)PTRACE_SEIZE_DEVEL); ptrace(PTRACE_INTERRUPT, tracee, NULL, NULL); repeat: waitid(P_PID, tracee, NULL, WSTOPPED); ptrace(PTRACE_GETSIGINFO, tracee, NULL, &si); if (!si.si_code) { printf("tracer: SIG %d\n", si.si_signo); ptrace(PTRACE_CONT, tracee, NULL, (void *)(unsigned long)si.si_signo); goto repeat; } printf("tracer: stopped=%d signo=%d\n", si.si_signo != SIGTRAP, si.si_signo); ptrace(PTRACE_CONT, tracee, NULL, NULL); goto repeat; } for (i = 0; i < 3; i++) { nanosleep(&ts1s, NULL); printf("mother: SIGSTOP\n"); kill(tracee, SIGSTOP); nanosleep(&ts1s, NULL); printf("mother: SIGCONT\n"); kill(tracee, SIGCONT); } nanosleep(&ts1s, NULL); kill(tracer, SIGKILL); kill(tracee, SIGKILL); return 0; } In the above program, tracer keeps tracee running and gets notification of each group stop state changes. # ./test-notify tracer: stopped=0 signo=5 mother: SIGSTOP tracer: SIG 19 tracer: stopped=1 signo=19 mother: SIGCONT tracer: stopped=0 signo=5 tracer: SIG 18 mother: SIGSTOP tracer: SIG 19 tracer: stopped=1 signo=19 mother: SIGCONT tracer: stopped=0 signo=5 tracer: SIG 18 mother: SIGSTOP tracer: SIG 19 tracer: stopped=1 signo=19 mother: SIGCONT tracer: stopped=0 signo=5 tracer: SIG 18 Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Oleg Nesterov <oleg@redhat.com>
2011-06-14 17:20:17 +08:00
#define JOBCTL_TRAP_NOTIFY_BIT 20 /* trap for NOTIFY */
#define JOBCTL_TRAPPING_BIT 21 /* switching to TRACED */
ptrace: implement PTRACE_LISTEN The previous patch implemented async notification for ptrace but it only worked while trace is running. This patch introduces PTRACE_LISTEN which is suggested by Oleg Nestrov. It's allowed iff tracee is in STOP trap and puts tracee into quasi-running state - tracee never really runs but wait(2) and ptrace(2) consider it to be running. While ptracer is listening, tracee is allowed to re-enter STOP to notify an async event. Listening state is cleared on the first notification. Ptracer can also clear it by issuing INTERRUPT - tracee will re-trap into STOP with listening state cleared. This allows ptracer to monitor group stop state without running tracee - use INTERRUPT to put tracee into STOP trap, issue LISTEN and then wait(2) to wait for the next group stop event. When it happens, PTRACE_GETSIGINFO provides information to determine the current state. Test program follows. #define PTRACE_SEIZE 0x4206 #define PTRACE_INTERRUPT 0x4207 #define PTRACE_LISTEN 0x4208 #define PTRACE_SEIZE_DEVEL 0x80000000 static const struct timespec ts1s = { .tv_sec = 1 }; int main(int argc, char **argv) { pid_t tracee, tracer; int i; tracee = fork(); if (!tracee) while (1) pause(); tracer = fork(); if (!tracer) { siginfo_t si; ptrace(PTRACE_SEIZE, tracee, NULL, (void *)(unsigned long)PTRACE_SEIZE_DEVEL); ptrace(PTRACE_INTERRUPT, tracee, NULL, NULL); repeat: waitid(P_PID, tracee, NULL, WSTOPPED); ptrace(PTRACE_GETSIGINFO, tracee, NULL, &si); if (!si.si_code) { printf("tracer: SIG %d\n", si.si_signo); ptrace(PTRACE_CONT, tracee, NULL, (void *)(unsigned long)si.si_signo); goto repeat; } printf("tracer: stopped=%d signo=%d\n", si.si_signo != SIGTRAP, si.si_signo); if (si.si_signo != SIGTRAP) ptrace(PTRACE_LISTEN, tracee, NULL, NULL); else ptrace(PTRACE_CONT, tracee, NULL, NULL); goto repeat; } for (i = 0; i < 3; i++) { nanosleep(&ts1s, NULL); printf("mother: SIGSTOP\n"); kill(tracee, SIGSTOP); nanosleep(&ts1s, NULL); printf("mother: SIGCONT\n"); kill(tracee, SIGCONT); } nanosleep(&ts1s, NULL); kill(tracer, SIGKILL); kill(tracee, SIGKILL); return 0; } This is identical to the program to test TRAP_NOTIFY except that tracee is PTRACE_LISTEN'd instead of PTRACE_CONT'd when group stopped. This allows ptracer to monitor when group stop ends without running tracee. # ./test-listen tracer: stopped=0 signo=5 mother: SIGSTOP tracer: SIG 19 tracer: stopped=1 signo=19 mother: SIGCONT tracer: stopped=0 signo=5 tracer: SIG 18 mother: SIGSTOP tracer: SIG 19 tracer: stopped=1 signo=19 mother: SIGCONT tracer: stopped=0 signo=5 tracer: SIG 18 mother: SIGSTOP tracer: SIG 19 tracer: stopped=1 signo=19 mother: SIGCONT tracer: stopped=0 signo=5 tracer: SIG 18 -v2: Moved JOBCTL_LISTENING check in wait_task_stopped() into task_stopped_code() as suggested by Oleg. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Oleg Nesterov <oleg@redhat.com>
2011-06-14 17:20:18 +08:00
#define JOBCTL_LISTENING_BIT 22 /* ptracer is listening for events */
#define JOBCTL_STOP_DEQUEUED (1 << JOBCTL_STOP_DEQUEUED_BIT)
#define JOBCTL_STOP_PENDING (1 << JOBCTL_STOP_PENDING_BIT)
#define JOBCTL_STOP_CONSUME (1 << JOBCTL_STOP_CONSUME_BIT)
job control: introduce JOBCTL_TRAP_STOP and use it for group stop trap do_signal_stop() implemented both normal group stop and trap for group stop while ptraced. This approach has been enough but scheduled changes require trap mechanism which can be used in more generic manner and using group stop trap for generic trap site simplifies both userland visible interface and implementation. This patch adds a new jobctl flag - JOBCTL_TRAP_STOP. When set, it triggers a trap site, which behaves like group stop trap, in get_signal_to_deliver() after checking for pending signals. While ptraced, do_signal_stop() doesn't stop itself. It initiates group stop if requested and schedules JOBCTL_TRAP_STOP and returns. The caller - get_signal_to_deliver() - is responsible for checking whether TRAP_STOP is pending afterwards and handling it. ptrace_attach() is updated to use JOBCTL_TRAP_STOP instead of JOBCTL_STOP_PENDING and __ptrace_unlink() to clear all pending trap bits and TRAPPING so that TRAP_STOP and future trap bits don't linger after detach. While at it, add proper function comment to do_signal_stop() and make it return bool. -v2: __ptrace_unlink() updated to clear JOBCTL_TRAP_MASK and TRAPPING instead of JOBCTL_PENDING_MASK. This avoids accidentally clearing JOBCTL_STOP_CONSUME. Spotted by Oleg. -v3: do_signal_stop() updated to return %false without dropping siglock while ptraced and TRAP_STOP check moved inside for(;;) loop after group stop participation. This avoids unnecessary relocking and also will help avoiding unnecessary traps by consuming group stop before handling pending traps. -v4: Jobctl trap handling moved into a separate function - do_jobctl_trap(). Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Oleg Nesterov <oleg@redhat.com>
2011-06-14 17:20:14 +08:00
#define JOBCTL_TRAP_STOP (1 << JOBCTL_TRAP_STOP_BIT)
ptrace: implement TRAP_NOTIFY and use it for group stop events Currently there's no way for ptracer to find out whether group stop finished other than polling with INTERRUPT - GETSIGINFO - CONT sequence. This patch implements group stop notification for ptracer using STOP traps. When group stop state of a seized tracee changes, JOBCTL_TRAP_NOTIFY is set, which schedules a STOP trap which is sticky - it isn't cleared by other traps and at least one STOP trap will happen eventually. STOP trap is synchronization point for event notification and the tracer can determine the current group stop state by looking at the signal number portion of exit code (si_status from waitid(2) or si_code from PTRACE_GETSIGINFO). Notifications are generated both on start and end of group stops but, because group stop participation always happens before STOP trap, this doesn't cause an extra trap while tracee is participating in group stop. The symmetry will be useful later. Note that this notification works iff tracee is not trapped. Currently there is no way to be notified of group stop state changes while tracee is trapped. This will be addressed by a later patch. An example program follows. #define PTRACE_SEIZE 0x4206 #define PTRACE_INTERRUPT 0x4207 #define PTRACE_SEIZE_DEVEL 0x80000000 static const struct timespec ts1s = { .tv_sec = 1 }; int main(int argc, char **argv) { pid_t tracee, tracer; int i; tracee = fork(); if (!tracee) while (1) pause(); tracer = fork(); if (!tracer) { siginfo_t si; ptrace(PTRACE_SEIZE, tracee, NULL, (void *)(unsigned long)PTRACE_SEIZE_DEVEL); ptrace(PTRACE_INTERRUPT, tracee, NULL, NULL); repeat: waitid(P_PID, tracee, NULL, WSTOPPED); ptrace(PTRACE_GETSIGINFO, tracee, NULL, &si); if (!si.si_code) { printf("tracer: SIG %d\n", si.si_signo); ptrace(PTRACE_CONT, tracee, NULL, (void *)(unsigned long)si.si_signo); goto repeat; } printf("tracer: stopped=%d signo=%d\n", si.si_signo != SIGTRAP, si.si_signo); ptrace(PTRACE_CONT, tracee, NULL, NULL); goto repeat; } for (i = 0; i < 3; i++) { nanosleep(&ts1s, NULL); printf("mother: SIGSTOP\n"); kill(tracee, SIGSTOP); nanosleep(&ts1s, NULL); printf("mother: SIGCONT\n"); kill(tracee, SIGCONT); } nanosleep(&ts1s, NULL); kill(tracer, SIGKILL); kill(tracee, SIGKILL); return 0; } In the above program, tracer keeps tracee running and gets notification of each group stop state changes. # ./test-notify tracer: stopped=0 signo=5 mother: SIGSTOP tracer: SIG 19 tracer: stopped=1 signo=19 mother: SIGCONT tracer: stopped=0 signo=5 tracer: SIG 18 mother: SIGSTOP tracer: SIG 19 tracer: stopped=1 signo=19 mother: SIGCONT tracer: stopped=0 signo=5 tracer: SIG 18 mother: SIGSTOP tracer: SIG 19 tracer: stopped=1 signo=19 mother: SIGCONT tracer: stopped=0 signo=5 tracer: SIG 18 Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Oleg Nesterov <oleg@redhat.com>
2011-06-14 17:20:17 +08:00
#define JOBCTL_TRAP_NOTIFY (1 << JOBCTL_TRAP_NOTIFY_BIT)
#define JOBCTL_TRAPPING (1 << JOBCTL_TRAPPING_BIT)
ptrace: implement PTRACE_LISTEN The previous patch implemented async notification for ptrace but it only worked while trace is running. This patch introduces PTRACE_LISTEN which is suggested by Oleg Nestrov. It's allowed iff tracee is in STOP trap and puts tracee into quasi-running state - tracee never really runs but wait(2) and ptrace(2) consider it to be running. While ptracer is listening, tracee is allowed to re-enter STOP to notify an async event. Listening state is cleared on the first notification. Ptracer can also clear it by issuing INTERRUPT - tracee will re-trap into STOP with listening state cleared. This allows ptracer to monitor group stop state without running tracee - use INTERRUPT to put tracee into STOP trap, issue LISTEN and then wait(2) to wait for the next group stop event. When it happens, PTRACE_GETSIGINFO provides information to determine the current state. Test program follows. #define PTRACE_SEIZE 0x4206 #define PTRACE_INTERRUPT 0x4207 #define PTRACE_LISTEN 0x4208 #define PTRACE_SEIZE_DEVEL 0x80000000 static const struct timespec ts1s = { .tv_sec = 1 }; int main(int argc, char **argv) { pid_t tracee, tracer; int i; tracee = fork(); if (!tracee) while (1) pause(); tracer = fork(); if (!tracer) { siginfo_t si; ptrace(PTRACE_SEIZE, tracee, NULL, (void *)(unsigned long)PTRACE_SEIZE_DEVEL); ptrace(PTRACE_INTERRUPT, tracee, NULL, NULL); repeat: waitid(P_PID, tracee, NULL, WSTOPPED); ptrace(PTRACE_GETSIGINFO, tracee, NULL, &si); if (!si.si_code) { printf("tracer: SIG %d\n", si.si_signo); ptrace(PTRACE_CONT, tracee, NULL, (void *)(unsigned long)si.si_signo); goto repeat; } printf("tracer: stopped=%d signo=%d\n", si.si_signo != SIGTRAP, si.si_signo); if (si.si_signo != SIGTRAP) ptrace(PTRACE_LISTEN, tracee, NULL, NULL); else ptrace(PTRACE_CONT, tracee, NULL, NULL); goto repeat; } for (i = 0; i < 3; i++) { nanosleep(&ts1s, NULL); printf("mother: SIGSTOP\n"); kill(tracee, SIGSTOP); nanosleep(&ts1s, NULL); printf("mother: SIGCONT\n"); kill(tracee, SIGCONT); } nanosleep(&ts1s, NULL); kill(tracer, SIGKILL); kill(tracee, SIGKILL); return 0; } This is identical to the program to test TRAP_NOTIFY except that tracee is PTRACE_LISTEN'd instead of PTRACE_CONT'd when group stopped. This allows ptracer to monitor when group stop ends without running tracee. # ./test-listen tracer: stopped=0 signo=5 mother: SIGSTOP tracer: SIG 19 tracer: stopped=1 signo=19 mother: SIGCONT tracer: stopped=0 signo=5 tracer: SIG 18 mother: SIGSTOP tracer: SIG 19 tracer: stopped=1 signo=19 mother: SIGCONT tracer: stopped=0 signo=5 tracer: SIG 18 mother: SIGSTOP tracer: SIG 19 tracer: stopped=1 signo=19 mother: SIGCONT tracer: stopped=0 signo=5 tracer: SIG 18 -v2: Moved JOBCTL_LISTENING check in wait_task_stopped() into task_stopped_code() as suggested by Oleg. Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Oleg Nesterov <oleg@redhat.com>
2011-06-14 17:20:18 +08:00
#define JOBCTL_LISTENING (1 << JOBCTL_LISTENING_BIT)
ptrace: implement TRAP_NOTIFY and use it for group stop events Currently there's no way for ptracer to find out whether group stop finished other than polling with INTERRUPT - GETSIGINFO - CONT sequence. This patch implements group stop notification for ptracer using STOP traps. When group stop state of a seized tracee changes, JOBCTL_TRAP_NOTIFY is set, which schedules a STOP trap which is sticky - it isn't cleared by other traps and at least one STOP trap will happen eventually. STOP trap is synchronization point for event notification and the tracer can determine the current group stop state by looking at the signal number portion of exit code (si_status from waitid(2) or si_code from PTRACE_GETSIGINFO). Notifications are generated both on start and end of group stops but, because group stop participation always happens before STOP trap, this doesn't cause an extra trap while tracee is participating in group stop. The symmetry will be useful later. Note that this notification works iff tracee is not trapped. Currently there is no way to be notified of group stop state changes while tracee is trapped. This will be addressed by a later patch. An example program follows. #define PTRACE_SEIZE 0x4206 #define PTRACE_INTERRUPT 0x4207 #define PTRACE_SEIZE_DEVEL 0x80000000 static const struct timespec ts1s = { .tv_sec = 1 }; int main(int argc, char **argv) { pid_t tracee, tracer; int i; tracee = fork(); if (!tracee) while (1) pause(); tracer = fork(); if (!tracer) { siginfo_t si; ptrace(PTRACE_SEIZE, tracee, NULL, (void *)(unsigned long)PTRACE_SEIZE_DEVEL); ptrace(PTRACE_INTERRUPT, tracee, NULL, NULL); repeat: waitid(P_PID, tracee, NULL, WSTOPPED); ptrace(PTRACE_GETSIGINFO, tracee, NULL, &si); if (!si.si_code) { printf("tracer: SIG %d\n", si.si_signo); ptrace(PTRACE_CONT, tracee, NULL, (void *)(unsigned long)si.si_signo); goto repeat; } printf("tracer: stopped=%d signo=%d\n", si.si_signo != SIGTRAP, si.si_signo); ptrace(PTRACE_CONT, tracee, NULL, NULL); goto repeat; } for (i = 0; i < 3; i++) { nanosleep(&ts1s, NULL); printf("mother: SIGSTOP\n"); kill(tracee, SIGSTOP); nanosleep(&ts1s, NULL); printf("mother: SIGCONT\n"); kill(tracee, SIGCONT); } nanosleep(&ts1s, NULL); kill(tracer, SIGKILL); kill(tracee, SIGKILL); return 0; } In the above program, tracer keeps tracee running and gets notification of each group stop state changes. # ./test-notify tracer: stopped=0 signo=5 mother: SIGSTOP tracer: SIG 19 tracer: stopped=1 signo=19 mother: SIGCONT tracer: stopped=0 signo=5 tracer: SIG 18 mother: SIGSTOP tracer: SIG 19 tracer: stopped=1 signo=19 mother: SIGCONT tracer: stopped=0 signo=5 tracer: SIG 18 mother: SIGSTOP tracer: SIG 19 tracer: stopped=1 signo=19 mother: SIGCONT tracer: stopped=0 signo=5 tracer: SIG 18 Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Oleg Nesterov <oleg@redhat.com>
2011-06-14 17:20:17 +08:00
#define JOBCTL_TRAP_MASK (JOBCTL_TRAP_STOP | JOBCTL_TRAP_NOTIFY)
job control: introduce JOBCTL_TRAP_STOP and use it for group stop trap do_signal_stop() implemented both normal group stop and trap for group stop while ptraced. This approach has been enough but scheduled changes require trap mechanism which can be used in more generic manner and using group stop trap for generic trap site simplifies both userland visible interface and implementation. This patch adds a new jobctl flag - JOBCTL_TRAP_STOP. When set, it triggers a trap site, which behaves like group stop trap, in get_signal_to_deliver() after checking for pending signals. While ptraced, do_signal_stop() doesn't stop itself. It initiates group stop if requested and schedules JOBCTL_TRAP_STOP and returns. The caller - get_signal_to_deliver() - is responsible for checking whether TRAP_STOP is pending afterwards and handling it. ptrace_attach() is updated to use JOBCTL_TRAP_STOP instead of JOBCTL_STOP_PENDING and __ptrace_unlink() to clear all pending trap bits and TRAPPING so that TRAP_STOP and future trap bits don't linger after detach. While at it, add proper function comment to do_signal_stop() and make it return bool. -v2: __ptrace_unlink() updated to clear JOBCTL_TRAP_MASK and TRAPPING instead of JOBCTL_PENDING_MASK. This avoids accidentally clearing JOBCTL_STOP_CONSUME. Spotted by Oleg. -v3: do_signal_stop() updated to return %false without dropping siglock while ptraced and TRAP_STOP check moved inside for(;;) loop after group stop participation. This avoids unnecessary relocking and also will help avoiding unnecessary traps by consuming group stop before handling pending traps. -v4: Jobctl trap handling moved into a separate function - do_jobctl_trap(). Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Oleg Nesterov <oleg@redhat.com>
2011-06-14 17:20:14 +08:00
#define JOBCTL_PENDING_MASK (JOBCTL_STOP_PENDING | JOBCTL_TRAP_MASK)
extern bool task_set_jobctl_pending(struct task_struct *task,
unsigned int mask);
job control: introduce JOBCTL_TRAP_STOP and use it for group stop trap do_signal_stop() implemented both normal group stop and trap for group stop while ptraced. This approach has been enough but scheduled changes require trap mechanism which can be used in more generic manner and using group stop trap for generic trap site simplifies both userland visible interface and implementation. This patch adds a new jobctl flag - JOBCTL_TRAP_STOP. When set, it triggers a trap site, which behaves like group stop trap, in get_signal_to_deliver() after checking for pending signals. While ptraced, do_signal_stop() doesn't stop itself. It initiates group stop if requested and schedules JOBCTL_TRAP_STOP and returns. The caller - get_signal_to_deliver() - is responsible for checking whether TRAP_STOP is pending afterwards and handling it. ptrace_attach() is updated to use JOBCTL_TRAP_STOP instead of JOBCTL_STOP_PENDING and __ptrace_unlink() to clear all pending trap bits and TRAPPING so that TRAP_STOP and future trap bits don't linger after detach. While at it, add proper function comment to do_signal_stop() and make it return bool. -v2: __ptrace_unlink() updated to clear JOBCTL_TRAP_MASK and TRAPPING instead of JOBCTL_PENDING_MASK. This avoids accidentally clearing JOBCTL_STOP_CONSUME. Spotted by Oleg. -v3: do_signal_stop() updated to return %false without dropping siglock while ptraced and TRAP_STOP check moved inside for(;;) loop after group stop participation. This avoids unnecessary relocking and also will help avoiding unnecessary traps by consuming group stop before handling pending traps. -v4: Jobctl trap handling moved into a separate function - do_jobctl_trap(). Signed-off-by: Tejun Heo <tj@kernel.org> Cc: Oleg Nesterov <oleg@redhat.com>
2011-06-14 17:20:14 +08:00
extern void task_clear_jobctl_trapping(struct task_struct *task);
extern void task_clear_jobctl_pending(struct task_struct *task,
unsigned int mask);
signal: Use GROUP_STOP_PENDING to stop once for a single group stop Currently task->signal->group_stop_count is used to decide whether to stop for group stop. However, if there is a task in the group which is taking a long time to stop, other tasks which are continued by ptrace would repeatedly stop for the same group stop until the group stop is complete. Conversely, if a ptraced task is in TASK_TRACED state, the debugger won't get notified of group stops which is inconsistent compared to the ptraced task in any other state. This patch introduces GROUP_STOP_PENDING which tracks whether a task is yet to stop for the group stop in progress. The flag is set when a group stop starts and cleared when the task stops the first time for the group stop, and consulted whenever whether the task should participate in a group stop needs to be determined. Note that now tasks in TASK_TRACED also participate in group stop. This results in the following behavior changes. * For a single group stop, a ptracer would see at most one stop reported. * A ptracee in TASK_TRACED now also participates in group stop and the tracer would get the notification. However, as a ptraced task could be in TASK_STOPPED state or any ptrace trap could consume group stop, the notification may still be missing. These will be addressed with further patches. * A ptracee may start a group stop while one is still in progress if the tracer let it continue with stop signal delivery. Group stop code handles this correctly. Oleg: * Spotted that a task might skip signal check even when its GROUP_STOP_PENDING is set. Fixed by updating recalc_sigpending_tsk() to check GROUP_STOP_PENDING instead of group_stop_count. * Pointed out that task->group_stop should be cleared whenever task->signal->group_stop_count is cleared. Fixed accordingly. * Pointed out the behavior inconsistency between TASK_TRACED and RUNNING and the last behavior change. Signed-off-by: Tejun Heo <tj@kernel.org> Acked-by: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com>
2011-03-23 17:37:00 +08:00
rcu: Add a TINY_PREEMPT_RCU Implement a small-memory-footprint uniprocessor-only implementation of preemptible RCU. This implementation uses but a single blocked-tasks list rather than the combinatorial number used per leaf rcu_node by TREE_PREEMPT_RCU, which reduces memory consumption and greatly simplifies processing. This version also takes advantage of uniprocessor execution to accelerate grace periods in the case where there are no readers. The general design is otherwise broadly similar to that of TREE_PREEMPT_RCU. This implementation is a step towards having RCU implementation driven off of the SMP and PREEMPT kernel configuration variables, which can happen once this implementation has accumulated sufficient experience. Removed ACCESS_ONCE() from __rcu_read_unlock() and added barrier() as suggested by Steve Rostedt in order to avoid the compiler-reordering issue noted by Mathieu Desnoyers (http://lkml.org/lkml/2010/8/16/183). As can be seen below, CONFIG_TINY_PREEMPT_RCU represents almost 5Kbyte savings compared to CONFIG_TREE_PREEMPT_RCU. Of course, for non-real-time workloads, CONFIG_TINY_RCU is even better. CONFIG_TREE_PREEMPT_RCU text data bss dec filename 13 0 0 13 kernel/rcupdate.o 6170 825 28 7023 kernel/rcutree.o ---- 7026 Total CONFIG_TINY_PREEMPT_RCU text data bss dec filename 13 0 0 13 kernel/rcupdate.o 2081 81 8 2170 kernel/rcutiny.o ---- 2183 Total CONFIG_TINY_RCU (non-preemptible) text data bss dec filename 13 0 0 13 kernel/rcupdate.o 719 25 0 744 kernel/rcutiny.o --- 757 Total Requested-by: Loïc Minier <loic.minier@canonical.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
2010-06-30 07:49:16 +08:00
#ifdef CONFIG_PREEMPT_RCU
rcu: Merge preemptable-RCU functionality into hierarchical RCU Create a kernel/rcutree_plugin.h file that contains definitions for preemptable RCU (or, under the #else branch of the #ifdef, empty definitions for the classic non-preemptable semantics). These definitions fit into plugins defined in kernel/rcutree.c for this purpose. This variant of preemptable RCU uses a new algorithm whose read-side expense is roughly that of classic hierarchical RCU under CONFIG_PREEMPT. This new algorithm's update-side expense is similar to that of classic hierarchical RCU, and, in absence of read-side preemption or blocking, is exactly that of classic hierarchical RCU. Perhaps more important, this new algorithm has a much simpler implementation, saving well over 1,000 lines of code compared to mainline's implementation of preemptable RCU, which will hopefully be retired in favor of this new algorithm. The simplifications are obtained by maintaining per-task nesting state for running tasks, and using a simple lock-protected algorithm to handle accounting when tasks block within RCU read-side critical sections, making use of lessons learned while creating numerous user-level RCU implementations over the past 18 months. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josht@linux.vnet.ibm.com Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org LKML-Reference: <12509746134003-git-send-email-> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-08-23 04:56:52 +08:00
#define RCU_READ_UNLOCK_BLOCKED (1 << 0) /* blocked while in RCU read-side. */
#define RCU_READ_UNLOCK_NEED_QS (1 << 1) /* RCU core needs CPU response. */
rcu: Merge preemptable-RCU functionality into hierarchical RCU Create a kernel/rcutree_plugin.h file that contains definitions for preemptable RCU (or, under the #else branch of the #ifdef, empty definitions for the classic non-preemptable semantics). These definitions fit into plugins defined in kernel/rcutree.c for this purpose. This variant of preemptable RCU uses a new algorithm whose read-side expense is roughly that of classic hierarchical RCU under CONFIG_PREEMPT. This new algorithm's update-side expense is similar to that of classic hierarchical RCU, and, in absence of read-side preemption or blocking, is exactly that of classic hierarchical RCU. Perhaps more important, this new algorithm has a much simpler implementation, saving well over 1,000 lines of code compared to mainline's implementation of preemptable RCU, which will hopefully be retired in favor of this new algorithm. The simplifications are obtained by maintaining per-task nesting state for running tasks, and using a simple lock-protected algorithm to handle accounting when tasks block within RCU read-side critical sections, making use of lessons learned while creating numerous user-level RCU implementations over the past 18 months. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josht@linux.vnet.ibm.com Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org LKML-Reference: <12509746134003-git-send-email-> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-08-23 04:56:52 +08:00
static inline void rcu_copy_process(struct task_struct *p)
{
p->rcu_read_lock_nesting = 0;
p->rcu_read_unlock_special = 0;
rcu: Add a TINY_PREEMPT_RCU Implement a small-memory-footprint uniprocessor-only implementation of preemptible RCU. This implementation uses but a single blocked-tasks list rather than the combinatorial number used per leaf rcu_node by TREE_PREEMPT_RCU, which reduces memory consumption and greatly simplifies processing. This version also takes advantage of uniprocessor execution to accelerate grace periods in the case where there are no readers. The general design is otherwise broadly similar to that of TREE_PREEMPT_RCU. This implementation is a step towards having RCU implementation driven off of the SMP and PREEMPT kernel configuration variables, which can happen once this implementation has accumulated sufficient experience. Removed ACCESS_ONCE() from __rcu_read_unlock() and added barrier() as suggested by Steve Rostedt in order to avoid the compiler-reordering issue noted by Mathieu Desnoyers (http://lkml.org/lkml/2010/8/16/183). As can be seen below, CONFIG_TINY_PREEMPT_RCU represents almost 5Kbyte savings compared to CONFIG_TREE_PREEMPT_RCU. Of course, for non-real-time workloads, CONFIG_TINY_RCU is even better. CONFIG_TREE_PREEMPT_RCU text data bss dec filename 13 0 0 13 kernel/rcupdate.o 6170 825 28 7023 kernel/rcutree.o ---- 7026 Total CONFIG_TINY_PREEMPT_RCU text data bss dec filename 13 0 0 13 kernel/rcupdate.o 2081 81 8 2170 kernel/rcutiny.o ---- 2183 Total CONFIG_TINY_RCU (non-preemptible) text data bss dec filename 13 0 0 13 kernel/rcupdate.o 719 25 0 744 kernel/rcutiny.o --- 757 Total Requested-by: Loïc Minier <loic.minier@canonical.com> Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
2010-06-30 07:49:16 +08:00
#ifdef CONFIG_TREE_PREEMPT_RCU
p->rcu_blocked_node = NULL;
#endif /* #ifdef CONFIG_TREE_PREEMPT_RCU */
#ifdef CONFIG_RCU_BOOST
p->rcu_boost_mutex = NULL;
#endif /* #ifdef CONFIG_RCU_BOOST */
rcu: Merge preemptable-RCU functionality into hierarchical RCU Create a kernel/rcutree_plugin.h file that contains definitions for preemptable RCU (or, under the #else branch of the #ifdef, empty definitions for the classic non-preemptable semantics). These definitions fit into plugins defined in kernel/rcutree.c for this purpose. This variant of preemptable RCU uses a new algorithm whose read-side expense is roughly that of classic hierarchical RCU under CONFIG_PREEMPT. This new algorithm's update-side expense is similar to that of classic hierarchical RCU, and, in absence of read-side preemption or blocking, is exactly that of classic hierarchical RCU. Perhaps more important, this new algorithm has a much simpler implementation, saving well over 1,000 lines of code compared to mainline's implementation of preemptable RCU, which will hopefully be retired in favor of this new algorithm. The simplifications are obtained by maintaining per-task nesting state for running tasks, and using a simple lock-protected algorithm to handle accounting when tasks block within RCU read-side critical sections, making use of lessons learned while creating numerous user-level RCU implementations over the past 18 months. Signed-off-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com> Cc: laijs@cn.fujitsu.com Cc: dipankar@in.ibm.com Cc: akpm@linux-foundation.org Cc: mathieu.desnoyers@polymtl.ca Cc: josht@linux.vnet.ibm.com Cc: dvhltc@us.ibm.com Cc: niv@us.ibm.com Cc: peterz@infradead.org Cc: rostedt@goodmis.org LKML-Reference: <12509746134003-git-send-email-> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-08-23 04:56:52 +08:00
INIT_LIST_HEAD(&p->rcu_node_entry);
}
#else
static inline void rcu_copy_process(struct task_struct *p)
{
}
#endif
#ifdef CONFIG_SMP
extern void do_set_cpus_allowed(struct task_struct *p,
const struct cpumask *new_mask);
extern int set_cpus_allowed_ptr(struct task_struct *p,
const struct cpumask *new_mask);
#else
static inline void do_set_cpus_allowed(struct task_struct *p,
const struct cpumask *new_mask)
{
}
static inline int set_cpus_allowed_ptr(struct task_struct *p,
const struct cpumask *new_mask)
{
if (!cpumask_test_cpu(0, new_mask))
return -EINVAL;
return 0;
}
#endif
#ifndef CONFIG_CPUMASK_OFFSTACK
static inline int set_cpus_allowed(struct task_struct *p, cpumask_t new_mask)
{
return set_cpus_allowed_ptr(p, &new_mask);
}
#endif
/*
* Do not use outside of architecture code which knows its limitations.
*
* sched_clock() has no promise of monotonicity or bounded drift between
* CPUs, use (which you should not) requires disabling IRQs.
*
* Please use one of the three interfaces below.
*/
extern unsigned long long notrace sched_clock(void);
/*
* See the comment in kernel/sched_clock.c
*/
extern u64 cpu_clock(int cpu);
extern u64 local_clock(void);
extern u64 sched_clock_cpu(int cpu);
extern void sched_clock_init(void);
#ifndef CONFIG_HAVE_UNSTABLE_SCHED_CLOCK
static inline void sched_clock_tick(void)
{
}
static inline void sched_clock_idle_sleep_event(void)
{
}
static inline void sched_clock_idle_wakeup_event(u64 delta_ns)
{
}
#else
/*
* Architectures can set this to 1 if they have specified
* CONFIG_HAVE_UNSTABLE_SCHED_CLOCK in their arch Kconfig,
* but then during bootup it turns out that sched_clock()
* is reliable after all:
*/
extern int sched_clock_stable;
extern void sched_clock_tick(void);
extern void sched_clock_idle_sleep_event(void);
extern void sched_clock_idle_wakeup_event(u64 delta_ns);
#endif
#ifdef CONFIG_IRQ_TIME_ACCOUNTING
/*
* An i/f to runtime opt-in for irq time accounting based off of sched_clock.
* The reason for this explicit opt-in is not to have perf penalty with
* slow sched_clocks.
*/
extern void enable_sched_clock_irqtime(void);
extern void disable_sched_clock_irqtime(void);
#else
static inline void enable_sched_clock_irqtime(void) {}
static inline void disable_sched_clock_irqtime(void) {}
#endif
extern unsigned long long
task_sched_runtime(struct task_struct *task);
/* sched_exec is called by processes performing an exec */
#ifdef CONFIG_SMP
extern void sched_exec(void);
#else
#define sched_exec() {}
#endif
extern void sched_clock_idle_sleep_event(void);
extern void sched_clock_idle_wakeup_event(u64 delta_ns);
#ifdef CONFIG_HOTPLUG_CPU
extern void idle_task_exit(void);
#else
static inline void idle_task_exit(void) {}
#endif
#if defined(CONFIG_NO_HZ) && defined(CONFIG_SMP)
extern void wake_up_idle_cpu(int cpu);
#else
static inline void wake_up_idle_cpu(int cpu) { }
#endif
extern unsigned int sysctl_sched_latency;
extern unsigned int sysctl_sched_min_granularity;
extern unsigned int sysctl_sched_wakeup_granularity;
extern unsigned int sysctl_sched_child_runs_first;
enum sched_tunable_scaling {
SCHED_TUNABLESCALING_NONE,
SCHED_TUNABLESCALING_LOG,
SCHED_TUNABLESCALING_LINEAR,
SCHED_TUNABLESCALING_END,
};
extern enum sched_tunable_scaling sysctl_sched_tunable_scaling;
#ifdef CONFIG_SCHED_DEBUG
extern unsigned int sysctl_sched_migration_cost;
extern unsigned int sysctl_sched_nr_migrate;
extern unsigned int sysctl_sched_time_avg;
extern unsigned int sysctl_timer_migration;
extern unsigned int sysctl_sched_shares_window;
int sched_proc_update_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *length,
loff_t *ppos);
#endif
#ifdef CONFIG_SCHED_DEBUG
static inline unsigned int get_sysctl_timer_migration(void)
{
return sysctl_timer_migration;
}
#else
static inline unsigned int get_sysctl_timer_migration(void)
{
return 1;
}
#endif
extern unsigned int sysctl_sched_rt_period;
extern int sysctl_sched_rt_runtime;
int sched_rt_handler(struct ctl_table *table, int write,
void __user *buffer, size_t *lenp,
loff_t *ppos);
sched: Add 'autogroup' scheduling feature: automated per session task groups A recurring complaint from CFS users is that parallel kbuild has a negative impact on desktop interactivity. This patch implements an idea from Linus, to automatically create task groups. Currently, only per session autogroups are implemented, but the patch leaves the way open for enhancement. Implementation: each task's signal struct contains an inherited pointer to a refcounted autogroup struct containing a task group pointer, the default for all tasks pointing to the init_task_group. When a task calls setsid(), a new task group is created, the process is moved into the new task group, and a reference to the preveious task group is dropped. Child processes inherit this task group thereafter, and increase it's refcount. When the last thread of a process exits, the process's reference is dropped, such that when the last process referencing an autogroup exits, the autogroup is destroyed. At runqueue selection time, IFF a task has no cgroup assignment, its current autogroup is used. Autogroup bandwidth is controllable via setting it's nice level through the proc filesystem: cat /proc/<pid>/autogroup Displays the task's group and the group's nice level. echo <nice level> > /proc/<pid>/autogroup Sets the task group's shares to the weight of nice <level> task. Setting nice level is rate limited for !admin users due to the abuse risk of task group locking. The feature is enabled from boot by default if CONFIG_SCHED_AUTOGROUP=y is selected, but can be disabled via the boot option noautogroup, and can also be turned on/off on the fly via: echo [01] > /proc/sys/kernel/sched_autogroup_enabled ... which will automatically move tasks to/from the root task group. Signed-off-by: Mike Galbraith <efault@gmx.de> Acked-by: Linus Torvalds <torvalds@linux-foundation.org> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Paul Turner <pjt@google.com> Cc: Oleg Nesterov <oleg@redhat.com> [ Removed the task_group_path() debug code, and fixed !EVENTFD build failure. ] Signed-off-by: Ingo Molnar <mingo@elte.hu> LKML-Reference: <1290281700.28711.9.camel@maggy.simson.net> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-11-30 21:18:03 +08:00
#ifdef CONFIG_SCHED_AUTOGROUP
extern unsigned int sysctl_sched_autogroup_enabled;
extern void sched_autogroup_create_attach(struct task_struct *p);
extern void sched_autogroup_detach(struct task_struct *p);
extern void sched_autogroup_fork(struct signal_struct *sig);
extern void sched_autogroup_exit(struct signal_struct *sig);
#ifdef CONFIG_PROC_FS
extern void proc_sched_autogroup_show_task(struct task_struct *p, struct seq_file *m);
extern int proc_sched_autogroup_set_nice(struct task_struct *p, int nice);
sched: Add 'autogroup' scheduling feature: automated per session task groups A recurring complaint from CFS users is that parallel kbuild has a negative impact on desktop interactivity. This patch implements an idea from Linus, to automatically create task groups. Currently, only per session autogroups are implemented, but the patch leaves the way open for enhancement. Implementation: each task's signal struct contains an inherited pointer to a refcounted autogroup struct containing a task group pointer, the default for all tasks pointing to the init_task_group. When a task calls setsid(), a new task group is created, the process is moved into the new task group, and a reference to the preveious task group is dropped. Child processes inherit this task group thereafter, and increase it's refcount. When the last thread of a process exits, the process's reference is dropped, such that when the last process referencing an autogroup exits, the autogroup is destroyed. At runqueue selection time, IFF a task has no cgroup assignment, its current autogroup is used. Autogroup bandwidth is controllable via setting it's nice level through the proc filesystem: cat /proc/<pid>/autogroup Displays the task's group and the group's nice level. echo <nice level> > /proc/<pid>/autogroup Sets the task group's shares to the weight of nice <level> task. Setting nice level is rate limited for !admin users due to the abuse risk of task group locking. The feature is enabled from boot by default if CONFIG_SCHED_AUTOGROUP=y is selected, but can be disabled via the boot option noautogroup, and can also be turned on/off on the fly via: echo [01] > /proc/sys/kernel/sched_autogroup_enabled ... which will automatically move tasks to/from the root task group. Signed-off-by: Mike Galbraith <efault@gmx.de> Acked-by: Linus Torvalds <torvalds@linux-foundation.org> Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl> Cc: Markus Trippelsdorf <markus@trippelsdorf.de> Cc: Mathieu Desnoyers <mathieu.desnoyers@efficios.com> Cc: Paul Turner <pjt@google.com> Cc: Oleg Nesterov <oleg@redhat.com> [ Removed the task_group_path() debug code, and fixed !EVENTFD build failure. ] Signed-off-by: Ingo Molnar <mingo@elte.hu> LKML-Reference: <1290281700.28711.9.camel@maggy.simson.net> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2010-11-30 21:18:03 +08:00
#endif
#else
static inline void sched_autogroup_create_attach(struct task_struct *p) { }
static inline void sched_autogroup_detach(struct task_struct *p) { }
static inline void sched_autogroup_fork(struct signal_struct *sig) { }
static inline void sched_autogroup_exit(struct signal_struct *sig) { }
#endif
#ifdef CONFIG_CFS_BANDWIDTH
extern unsigned int sysctl_sched_cfs_bandwidth_slice;
#endif
#ifdef CONFIG_RT_MUTEXES
extern int rt_mutex_getprio(struct task_struct *p);
extern void rt_mutex_setprio(struct task_struct *p, int prio);
extern void rt_mutex_adjust_pi(struct task_struct *p);
static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
{
return tsk->pi_blocked_on != NULL;
}
#else
static inline int rt_mutex_getprio(struct task_struct *p)
{
return p->normal_prio;
}
# define rt_mutex_adjust_pi(p) do { } while (0)
static inline bool tsk_is_pi_blocked(struct task_struct *tsk)
{
return false;
}
#endif
extern bool yield_to(struct task_struct *p, bool preempt);
extern void set_user_nice(struct task_struct *p, long nice);
extern int task_prio(const struct task_struct *p);
extern int task_nice(const struct task_struct *p);
extern int can_nice(const struct task_struct *p, const int nice);
extern int task_curr(const struct task_struct *p);
extern int idle_cpu(int cpu);
extern int sched_setscheduler(struct task_struct *, int,
const struct sched_param *);
extern int sched_setscheduler_nocheck(struct task_struct *, int,
const struct sched_param *);
extern struct task_struct *idle_task(int cpu);
/**
* is_idle_task - is the specified task an idle task?
* @p: the task in question.
*/
static inline bool is_idle_task(const struct task_struct *p)
{
return p->pid == 0;
}
extern struct task_struct *curr_task(int cpu);
extern void set_curr_task(int cpu, struct task_struct *p);
void yield(void);
/*
* The default (Linux) execution domain.
*/
extern struct exec_domain default_exec_domain;
union thread_union {
struct thread_info thread_info;
unsigned long stack[THREAD_SIZE/sizeof(long)];
};
#ifndef __HAVE_ARCH_KSTACK_END
static inline int kstack_end(void *addr)
{
/* Reliable end of stack detection:
* Some APM bios versions misalign the stack
*/
return !(((unsigned long)addr+sizeof(void*)-1) & (THREAD_SIZE-sizeof(void*)));
}
#endif
extern union thread_union init_thread_union;
extern struct task_struct init_task;
extern struct mm_struct init_mm;
extern struct pid_namespace init_pid_ns;
/*
* find a task by one of its numerical ids
*
* find_task_by_pid_ns():
* finds a task by its pid in the specified namespace
* find_task_by_vpid():
* finds a task by its virtual pid
*
* see also find_vpid() etc in include/linux/pid.h
*/
extern struct task_struct *find_task_by_vpid(pid_t nr);
extern struct task_struct *find_task_by_pid_ns(pid_t nr,
struct pid_namespace *ns);
extern void __set_special_pids(struct pid *pid);
/* per-UID process charging. */
extern struct user_struct * alloc_uid(struct user_namespace *, uid_t);
static inline struct user_struct *get_uid(struct user_struct *u)
{
atomic_inc(&u->__count);
return u;
}
extern void free_uid(struct user_struct *);
extern void release_uids(struct user_namespace *ns);
#include <asm/current.h>
extern void xtime_update(unsigned long ticks);
extern int wake_up_state(struct task_struct *tsk, unsigned int state);
extern int wake_up_process(struct task_struct *tsk);
extern void wake_up_new_task(struct task_struct *tsk);
#ifdef CONFIG_SMP
extern void kick_process(struct task_struct *tsk);
#else
static inline void kick_process(struct task_struct *tsk) { }
#endif
extern void sched_fork(struct task_struct *p);
extern void sched_dead(struct task_struct *p);
extern void proc_caches_init(void);
extern void flush_signals(struct task_struct *);
extern void __flush_signals(struct task_struct *);
extern void ignore_signals(struct task_struct *);
extern void flush_signal_handlers(struct task_struct *, int force_default);
extern int dequeue_signal(struct task_struct *tsk, sigset_t *mask, siginfo_t *info);
static inline int dequeue_signal_lock(struct task_struct *tsk, sigset_t *mask, siginfo_t *info)
{
unsigned long flags;
int ret;
spin_lock_irqsave(&tsk->sighand->siglock, flags);
ret = dequeue_signal(tsk, mask, info);
spin_unlock_irqrestore(&tsk->sighand->siglock, flags);
return ret;
}
extern void block_all_signals(int (*notifier)(void *priv), void *priv,
sigset_t *mask);
extern void unblock_all_signals(void);
extern void release_task(struct task_struct * p);
extern int send_sig_info(int, struct siginfo *, struct task_struct *);
extern int force_sigsegv(int, struct task_struct *);
extern int force_sig_info(int, struct siginfo *, struct task_struct *);
extern int __kill_pgrp_info(int sig, struct siginfo *info, struct pid *pgrp);
extern int kill_pid_info(int sig, struct siginfo *info, struct pid *pid);
extern int kill_pid_info_as_cred(int, struct siginfo *, struct pid *,
const struct cred *, u32);
extern int kill_pgrp(struct pid *pid, int sig, int priv);
extern int kill_pid(struct pid *pid, int sig, int priv);
extern int kill_proc_info(int, struct siginfo *, pid_t);
extern __must_check bool do_notify_parent(struct task_struct *, int);
extern void __wake_up_parent(struct task_struct *p, struct task_struct *parent);
extern void force_sig(int, struct task_struct *);
extern int send_sig(int, struct task_struct *, int);
extern int zap_other_threads(struct task_struct *p);
extern struct sigqueue *sigqueue_alloc(void);
extern void sigqueue_free(struct sigqueue *);
extern int send_sigqueue(struct sigqueue *, struct task_struct *, int group);
extern int do_sigaction(int, struct k_sigaction *, struct k_sigaction *);
extern int do_sigaltstack(const stack_t __user *, stack_t __user *, unsigned long);
static inline int kill_cad_pid(int sig, int priv)
{
return kill_pid(cad_pid, sig, priv);
}
/* These can be the second arg to send_sig_info/send_group_sig_info. */
#define SEND_SIG_NOINFO ((struct siginfo *) 0)
#define SEND_SIG_PRIV ((struct siginfo *) 1)
#define SEND_SIG_FORCED ((struct siginfo *) 2)
signal: Fix alternate signal stack check All architectures in the kernel increment/decrement the stack pointer before storing values on the stack. On architectures which have the stack grow down sas_ss_sp == sp is not on the alternate signal stack while sas_ss_sp + sas_ss_size == sp is on the alternate signal stack. On architectures which have the stack grow up sas_ss_sp == sp is on the alternate signal stack while sas_ss_sp + sas_ss_size == sp is not on the alternate signal stack. The current implementation fails for architectures which have the stack grow down on the corner case where sas_ss_sp == sp.This was reported as Debian bug #544905 on AMD64. Simplified test case: http://download.breakpoint.cc/tc-sig-stack.c The test case creates the following stack scenario: 0xn0300 stack top 0xn0200 alt stack pointer top (when switching to alt stack) 0xn01ff alt stack end 0xn0100 alt stack start == stack pointer If the signal is sent the stack pointer is pointing to the base address of the alt stack and the kernel erroneously decides that it has already switched to the alternate stack because of the current check for "sp - sas_ss_sp < sas_ss_size" On parisc (stack grows up) the scenario would be: 0xn0200 stack pointer 0xn01ff alt stack end 0xn0100 alt stack start = alt stack pointer base (when switching to alt stack) 0xn0000 stack base This is handled correctly by the current implementation. [ tglx: Modified for archs which have the stack grow up (parisc) which would fail with the correct implementation for stack grows down. Added a check for sp >= current->sas_ss_sp which is strictly not necessary but makes the code symetric for both variants ] Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com> Cc: Kyle McMartin <kyle@mcmartin.ca> Cc: stable@kernel.org LKML-Reference: <20091025143758.GA6653@Chamillionaire.breakpoint.cc> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2009-10-25 22:37:58 +08:00
/*
* True if we are on the alternate signal stack.
*/
static inline int on_sig_stack(unsigned long sp)
{
signal: Fix alternate signal stack check All architectures in the kernel increment/decrement the stack pointer before storing values on the stack. On architectures which have the stack grow down sas_ss_sp == sp is not on the alternate signal stack while sas_ss_sp + sas_ss_size == sp is on the alternate signal stack. On architectures which have the stack grow up sas_ss_sp == sp is on the alternate signal stack while sas_ss_sp + sas_ss_size == sp is not on the alternate signal stack. The current implementation fails for architectures which have the stack grow down on the corner case where sas_ss_sp == sp.This was reported as Debian bug #544905 on AMD64. Simplified test case: http://download.breakpoint.cc/tc-sig-stack.c The test case creates the following stack scenario: 0xn0300 stack top 0xn0200 alt stack pointer top (when switching to alt stack) 0xn01ff alt stack end 0xn0100 alt stack start == stack pointer If the signal is sent the stack pointer is pointing to the base address of the alt stack and the kernel erroneously decides that it has already switched to the alternate stack because of the current check for "sp - sas_ss_sp < sas_ss_size" On parisc (stack grows up) the scenario would be: 0xn0200 stack pointer 0xn01ff alt stack end 0xn0100 alt stack start = alt stack pointer base (when switching to alt stack) 0xn0000 stack base This is handled correctly by the current implementation. [ tglx: Modified for archs which have the stack grow up (parisc) which would fail with the correct implementation for stack grows down. Added a check for sp >= current->sas_ss_sp which is strictly not necessary but makes the code symetric for both variants ] Signed-off-by: Sebastian Andrzej Siewior <sebastian@breakpoint.cc> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Roland McGrath <roland@redhat.com> Cc: Kyle McMartin <kyle@mcmartin.ca> Cc: stable@kernel.org LKML-Reference: <20091025143758.GA6653@Chamillionaire.breakpoint.cc> Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
2009-10-25 22:37:58 +08:00
#ifdef CONFIG_STACK_GROWSUP
return sp >= current->sas_ss_sp &&
sp - current->sas_ss_sp < current->sas_ss_size;
#else
return sp > current->sas_ss_sp &&
sp - current->sas_ss_sp <= current->sas_ss_size;
#endif
}
static inline int sas_ss_flags(unsigned long sp)
{
return (current->sas_ss_size == 0 ? SS_DISABLE
: on_sig_stack(sp) ? SS_ONSTACK : 0);
}
/*
* Routines for handling mm_structs
*/
extern struct mm_struct * mm_alloc(void);
/* mmdrop drops the mm and the page tables */
extern void __mmdrop(struct mm_struct *);
static inline void mmdrop(struct mm_struct * mm)
{
if (unlikely(atomic_dec_and_test(&mm->mm_count)))
__mmdrop(mm);
}
/* mmput gets rid of the mappings and all user-space */
extern void mmput(struct mm_struct *);
/* Grab a reference to a task's mm, if it is not already going away */
extern struct mm_struct *get_task_mm(struct task_struct *task);
/*
* Grab a reference to a task's mm, if it is not already going away
* and ptrace_may_access with the mode parameter passed to it
* succeeds.
*/
extern struct mm_struct *mm_access(struct task_struct *task, unsigned int mode);
/* Remove the current tasks stale references to the old mm_struct */
extern void mm_release(struct task_struct *, struct mm_struct *);
/* Allocate a new mm structure and copy contents from tsk->mm */
extern struct mm_struct *dup_mm(struct task_struct *tsk);
extern int copy_thread(unsigned long, unsigned long, unsigned long,
struct task_struct *, struct pt_regs *);
extern void flush_thread(void);
extern void exit_thread(void);
extern void exit_files(struct task_struct *);
extern void __cleanup_sighand(struct sighand_struct *);
extern void exit_itimers(struct signal_struct *);
extern void flush_itimer_signals(void);
extern void do_group_exit(int);
extern void daemonize(const char *, ...);
extern int allow_signal(int);
extern int disallow_signal(int);
extern int do_execve(const char *,
const char __user * const __user *,
const char __user * const __user *, struct pt_regs *);
extern long do_fork(unsigned long, unsigned long, struct pt_regs *, unsigned long, int __user *, int __user *);
struct task_struct *fork_idle(int);
extern void set_task_comm(struct task_struct *tsk, char *from);
extern char *get_task_comm(char *to, struct task_struct *tsk);
#ifdef CONFIG_SMP
void scheduler_ipi(void);
extern unsigned long wait_task_inactive(struct task_struct *, long match_state);
#else
static inline void scheduler_ipi(void) { }
static inline unsigned long wait_task_inactive(struct task_struct *p,
long match_state)
{
return 1;
}
#endif
#define next_task(p) \
list_entry_rcu((p)->tasks.next, struct task_struct, tasks)
#define for_each_process(p) \
for (p = &init_task ; (p = next_task(p)) != &init_task ; )
extern bool current_is_single_threaded(void);
CRED: Inaugurate COW credentials Inaugurate copy-on-write credentials management. This uses RCU to manage the credentials pointer in the task_struct with respect to accesses by other tasks. A process may only modify its own credentials, and so does not need locking to access or modify its own credentials. A mutex (cred_replace_mutex) is added to the task_struct to control the effect of PTRACE_ATTACHED on credential calculations, particularly with respect to execve(). With this patch, the contents of an active credentials struct may not be changed directly; rather a new set of credentials must be prepared, modified and committed using something like the following sequence of events: struct cred *new = prepare_creds(); int ret = blah(new); if (ret < 0) { abort_creds(new); return ret; } return commit_creds(new); There are some exceptions to this rule: the keyrings pointed to by the active credentials may be instantiated - keyrings violate the COW rule as managing COW keyrings is tricky, given that it is possible for a task to directly alter the keys in a keyring in use by another task. To help enforce this, various pointers to sets of credentials, such as those in the task_struct, are declared const. The purpose of this is compile-time discouragement of altering credentials through those pointers. Once a set of credentials has been made public through one of these pointers, it may not be modified, except under special circumstances: (1) Its reference count may incremented and decremented. (2) The keyrings to which it points may be modified, but not replaced. The only safe way to modify anything else is to create a replacement and commit using the functions described in Documentation/credentials.txt (which will be added by a later patch). This patch and the preceding patches have been tested with the LTP SELinux testsuite. This patch makes several logical sets of alteration: (1) execve(). This now prepares and commits credentials in various places in the security code rather than altering the current creds directly. (2) Temporary credential overrides. do_coredump() and sys_faccessat() now prepare their own credentials and temporarily override the ones currently on the acting thread, whilst preventing interference from other threads by holding cred_replace_mutex on the thread being dumped. This will be replaced in a future patch by something that hands down the credentials directly to the functions being called, rather than altering the task's objective credentials. (3) LSM interface. A number of functions have been changed, added or removed: (*) security_capset_check(), ->capset_check() (*) security_capset_set(), ->capset_set() Removed in favour of security_capset(). (*) security_capset(), ->capset() New. This is passed a pointer to the new creds, a pointer to the old creds and the proposed capability sets. It should fill in the new creds or return an error. All pointers, barring the pointer to the new creds, are now const. (*) security_bprm_apply_creds(), ->bprm_apply_creds() Changed; now returns a value, which will cause the process to be killed if it's an error. (*) security_task_alloc(), ->task_alloc_security() Removed in favour of security_prepare_creds(). (*) security_cred_free(), ->cred_free() New. Free security data attached to cred->security. (*) security_prepare_creds(), ->cred_prepare() New. Duplicate any security data attached to cred->security. (*) security_commit_creds(), ->cred_commit() New. Apply any security effects for the upcoming installation of new security by commit_creds(). (*) security_task_post_setuid(), ->task_post_setuid() Removed in favour of security_task_fix_setuid(). (*) security_task_fix_setuid(), ->task_fix_setuid() Fix up the proposed new credentials for setuid(). This is used by cap_set_fix_setuid() to implicitly adjust capabilities in line with setuid() changes. Changes are made to the new credentials, rather than the task itself as in security_task_post_setuid(). (*) security_task_reparent_to_init(), ->task_reparent_to_init() Removed. Instead the task being reparented to init is referred directly to init's credentials. NOTE! This results in the loss of some state: SELinux's osid no longer records the sid of the thread that forked it. (*) security_key_alloc(), ->key_alloc() (*) security_key_permission(), ->key_permission() Changed. These now take cred pointers rather than task pointers to refer to the security context. (4) sys_capset(). This has been simplified and uses less locking. The LSM functions it calls have been merged. (5) reparent_to_kthreadd(). This gives the current thread the same credentials as init by simply using commit_thread() to point that way. (6) __sigqueue_alloc() and switch_uid() __sigqueue_alloc() can't stop the target task from changing its creds beneath it, so this function gets a reference to the currently applicable user_struct which it then passes into the sigqueue struct it returns if successful. switch_uid() is now called from commit_creds(), and possibly should be folded into that. commit_creds() should take care of protecting __sigqueue_alloc(). (7) [sg]et[ug]id() and co and [sg]et_current_groups. The set functions now all use prepare_creds(), commit_creds() and abort_creds() to build and check a new set of credentials before applying it. security_task_set[ug]id() is called inside the prepared section. This guarantees that nothing else will affect the creds until we've finished. The calling of set_dumpable() has been moved into commit_creds(). Much of the functionality of set_user() has been moved into commit_creds(). The get functions all simply access the data directly. (8) security_task_prctl() and cap_task_prctl(). security_task_prctl() has been modified to return -ENOSYS if it doesn't want to handle a function, or otherwise return the return value directly rather than through an argument. Additionally, cap_task_prctl() now prepares a new set of credentials, even if it doesn't end up using it. (9) Keyrings. A number of changes have been made to the keyrings code: (a) switch_uid_keyring(), copy_keys(), exit_keys() and suid_keys() have all been dropped and built in to the credentials functions directly. They may want separating out again later. (b) key_alloc() and search_process_keyrings() now take a cred pointer rather than a task pointer to specify the security context. (c) copy_creds() gives a new thread within the same thread group a new thread keyring if its parent had one, otherwise it discards the thread keyring. (d) The authorisation key now points directly to the credentials to extend the search into rather pointing to the task that carries them. (e) Installing thread, process or session keyrings causes a new set of credentials to be created, even though it's not strictly necessary for process or session keyrings (they're shared). (10) Usermode helper. The usermode helper code now carries a cred struct pointer in its subprocess_info struct instead of a new session keyring pointer. This set of credentials is derived from init_cred and installed on the new process after it has been cloned. call_usermodehelper_setup() allocates the new credentials and call_usermodehelper_freeinfo() discards them if they haven't been used. A special cred function (prepare_usermodeinfo_creds()) is provided specifically for call_usermodehelper_setup() to call. call_usermodehelper_setkeys() adjusts the credentials to sport the supplied keyring as the new session keyring. (11) SELinux. SELinux has a number of changes, in addition to those to support the LSM interface changes mentioned above: (a) selinux_setprocattr() no longer does its check for whether the current ptracer can access processes with the new SID inside the lock that covers getting the ptracer's SID. Whilst this lock ensures that the check is done with the ptracer pinned, the result is only valid until the lock is released, so there's no point doing it inside the lock. (12) is_single_threaded(). This function has been extracted from selinux_setprocattr() and put into a file of its own in the lib/ directory as join_session_keyring() now wants to use it too. The code in SELinux just checked to see whether a task shared mm_structs with other tasks (CLONE_VM), but that isn't good enough. We really want to know if they're part of the same thread group (CLONE_THREAD). (13) nfsd. The NFS server daemon now has to use the COW credentials to set the credentials it is going to use. It really needs to pass the credentials down to the functions it calls, but it can't do that until other patches in this series have been applied. Signed-off-by: David Howells <dhowells@redhat.com> Acked-by: James Morris <jmorris@namei.org> Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:23 +08:00
/*
* Careful: do_each_thread/while_each_thread is a double loop so
* 'break' will not work as expected - use goto instead.
*/
#define do_each_thread(g, t) \
for (g = t = &init_task ; (g = t = next_task(g)) != &init_task ; ) do
#define while_each_thread(g, t) \
while ((t = next_thread(t)) != g)
static inline int get_nr_threads(struct task_struct *tsk)
{
return tsk->signal->nr_threads;
}
static inline bool thread_group_leader(struct task_struct *p)
{
return p->exit_signal >= 0;
}
[PATCH] proc: readdir race fix (take 3) The problem: An opendir, readdir, closedir sequence can fail to report process ids that are continually in use throughout the sequence of system calls. For this race to trigger the process that proc_pid_readdir stops at must exit before readdir is called again. This can cause ps to fail to report processes, and it is in violation of posix guarantees and normal application expectations with respect to readdir. Currently there is no way to work around this problem in user space short of providing a gargantuan buffer to user space so the directory read all happens in on system call. This patch implements the normal directory semantics for proc, that guarantee that a directory entry that is neither created nor destroyed while reading the directory entry will be returned. For directory that are either created or destroyed during the readdir you may or may not see them. Furthermore you may seek to a directory offset you have previously seen. These are the guarantee that ext[23] provides and that posix requires, and more importantly that user space expects. Plus it is a simple semantic to implement reliable service. It is just a matter of calling readdir a second time if you are wondering if something new has show up. These better semantics are implemented by scanning through the pids in numerical order and by making the file offset a pid plus a fixed offset. The pid scan happens on the pid bitmap, which when you look at it is remarkably efficient for a brute force algorithm. Given that a typical cache line is 64 bytes and thus covers space for 64*8 == 200 pids. There are only 40 cache lines for the entire 32K pid space. A typical system will have 100 pids or more so this is actually fewer cache lines we have to look at to scan a linked list, and the worst case of having to scan the entire pid bitmap is pretty reasonable. If we need something more efficient we can go to a more efficient data structure for indexing the pids, but for now what we have should be sufficient. In addition this takes no additional locks and is actually less code than what we are doing now. Also another very subtle bug in this area has been fixed. It is possible to catch a task in the middle of de_thread where a thread is assuming the thread of it's thread group leader. This patch carefully handles that case so if we hit it we don't fail to return the pid, that is undergoing the de_thread dance. Thanks to KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> for providing the first fix, pointing this out and working on it. [oleg@tv-sign.ru: fix it] Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru> Cc: Jean Delvare <jdelvare@suse.de> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-02 17:17:04 +08:00
/* Do to the insanities of de_thread it is possible for a process
* to have the pid of the thread group leader without actually being
* the thread group leader. For iteration through the pids in proc
* all we care about is that we have a task with the appropriate
* pid, we don't actually care if we have the right task.
*/
static inline int has_group_leader_pid(struct task_struct *p)
[PATCH] proc: readdir race fix (take 3) The problem: An opendir, readdir, closedir sequence can fail to report process ids that are continually in use throughout the sequence of system calls. For this race to trigger the process that proc_pid_readdir stops at must exit before readdir is called again. This can cause ps to fail to report processes, and it is in violation of posix guarantees and normal application expectations with respect to readdir. Currently there is no way to work around this problem in user space short of providing a gargantuan buffer to user space so the directory read all happens in on system call. This patch implements the normal directory semantics for proc, that guarantee that a directory entry that is neither created nor destroyed while reading the directory entry will be returned. For directory that are either created or destroyed during the readdir you may or may not see them. Furthermore you may seek to a directory offset you have previously seen. These are the guarantee that ext[23] provides and that posix requires, and more importantly that user space expects. Plus it is a simple semantic to implement reliable service. It is just a matter of calling readdir a second time if you are wondering if something new has show up. These better semantics are implemented by scanning through the pids in numerical order and by making the file offset a pid plus a fixed offset. The pid scan happens on the pid bitmap, which when you look at it is remarkably efficient for a brute force algorithm. Given that a typical cache line is 64 bytes and thus covers space for 64*8 == 200 pids. There are only 40 cache lines for the entire 32K pid space. A typical system will have 100 pids or more so this is actually fewer cache lines we have to look at to scan a linked list, and the worst case of having to scan the entire pid bitmap is pretty reasonable. If we need something more efficient we can go to a more efficient data structure for indexing the pids, but for now what we have should be sufficient. In addition this takes no additional locks and is actually less code than what we are doing now. Also another very subtle bug in this area has been fixed. It is possible to catch a task in the middle of de_thread where a thread is assuming the thread of it's thread group leader. This patch carefully handles that case so if we hit it we don't fail to return the pid, that is undergoing the de_thread dance. Thanks to KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> for providing the first fix, pointing this out and working on it. [oleg@tv-sign.ru: fix it] Signed-off-by: Eric W. Biederman <ebiederm@xmission.com> Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Signed-off-by: Oleg Nesterov <oleg@tv-sign.ru> Cc: Jean Delvare <jdelvare@suse.de> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-10-02 17:17:04 +08:00
{
return p->pid == p->tgid;
}
static inline
int same_thread_group(struct task_struct *p1, struct task_struct *p2)
{
return p1->tgid == p2->tgid;
}
static inline struct task_struct *next_thread(const struct task_struct *p)
{
return list_entry_rcu(p->thread_group.next,
struct task_struct, thread_group);
}
static inline int thread_group_empty(struct task_struct *p)
{
return list_empty(&p->thread_group);
}
#define delay_group_leader(p) \
(thread_group_leader(p) && !thread_group_empty(p))
/*
* Protects ->fs, ->files, ->mm, ->group_info, ->comm, keyring
* subscriptions and synchronises with wait4(). Also used in procfs. Also
Task Control Groups: basic task cgroup framework Generic Process Control Groups -------------------------- There have recently been various proposals floating around for resource management/accounting and other task grouping subsystems in the kernel, including ResGroups, User BeanCounters, NSProxy cgroups, and others. These all need the basic abstraction of being able to group together multiple processes in an aggregate, in order to track/limit the resources permitted to those processes, or control other behaviour of the processes, and all implement this grouping in different ways. This patchset provides a framework for tracking and grouping processes into arbitrary "cgroups" and assigning arbitrary state to those groupings, in order to control the behaviour of the cgroup as an aggregate. The intention is that the various resource management and virtualization/cgroup efforts can also become task cgroup clients, with the result that: - the userspace APIs are (somewhat) normalised - it's easier to test e.g. the ResGroups CPU controller in conjunction with the BeanCounters memory controller, or use either of them as the resource-control portion of a virtual server system. - the additional kernel footprint of any of the competing resource management systems is substantially reduced, since it doesn't need to provide process grouping/containment, hence improving their chances of getting into the kernel This patch: Add the main task cgroups framework - the cgroup filesystem, and the basic structures for tracking membership and associating subsystem state objects to tasks. Signed-off-by: Paul Menage <menage@google.com> Cc: Serge E. Hallyn <serue@us.ibm.com> Cc: "Eric W. Biederman" <ebiederm@xmission.com> Cc: Dave Hansen <haveblue@us.ibm.com> Cc: Balbir Singh <balbir@in.ibm.com> Cc: Paul Jackson <pj@sgi.com> Cc: Kirill Korotaev <dev@openvz.org> Cc: Herbert Poetzl <herbert@13thfloor.at> Cc: Srivatsa Vaddagiri <vatsa@in.ibm.com> Cc: Cedric Le Goater <clg@fr.ibm.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-10-19 14:39:30 +08:00
* pins the final release of task.io_context. Also protects ->cpuset and
* ->cgroup.subsys[]. And ->vfork_done.
*
* Nests both inside and outside of read_lock(&tasklist_lock).
* It must not be nested with write_lock_irq(&tasklist_lock),
* neither inside nor outside.
*/
static inline void task_lock(struct task_struct *p)
{
spin_lock(&p->alloc_lock);
}
static inline void task_unlock(struct task_struct *p)
{
spin_unlock(&p->alloc_lock);
}
extern struct sighand_struct *__lock_task_sighand(struct task_struct *tsk,
unsigned long *flags);
static inline struct sighand_struct *lock_task_sighand(struct task_struct *tsk,
unsigned long *flags)
{
struct sighand_struct *ret;
ret = __lock_task_sighand(tsk, flags);
(void)__cond_lock(&tsk->sighand->siglock, ret);
return ret;
}
static inline void unlock_task_sighand(struct task_struct *tsk,
unsigned long *flags)
{
spin_unlock_irqrestore(&tsk->sighand->siglock, *flags);
}
#ifdef CONFIG_CGROUPS
static inline void threadgroup_change_begin(struct task_struct *tsk)
{
down_read(&tsk->signal->group_rwsem);
}
static inline void threadgroup_change_end(struct task_struct *tsk)
{
up_read(&tsk->signal->group_rwsem);
}
threadgroup: extend threadgroup_lock() to cover exit and exec threadgroup_lock() protected only protected against new addition to the threadgroup, which was inherently somewhat incomplete and problematic for its only user cgroup. On-going migration could race against exec and exit leading to interesting problems - the symmetry between various attach methods, task exiting during method execution, ->exit() racing against attach methods, migrating task switching basic properties during exec and so on. This patch extends threadgroup_lock() such that it protects against all three threadgroup altering operations - fork, exit and exec. For exit, threadgroup_change_begin/end() calls are added to exit_signals around assertion of PF_EXITING. For exec, threadgroup_[un]lock() are updated to also grab and release cred_guard_mutex. With this change, threadgroup_lock() guarantees that the target threadgroup will remain stable - no new task will be added, no new PF_EXITING will be set and exec won't happen. The next patch will update cgroup so that it can take full advantage of this change. -v2: beefed up comment as suggested by Frederic. -v3: narrowed scope of protection in exit path as suggested by Frederic. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Acked-by: Li Zefan <lizf@cn.fujitsu.com> Acked-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Paul Menage <paul@paulmenage.org> Cc: Linus Torvalds <torvalds@linux-foundation.org>
2011-12-13 10:12:21 +08:00
/**
* threadgroup_lock - lock threadgroup
* @tsk: member task of the threadgroup to lock
*
* Lock the threadgroup @tsk belongs to. No new task is allowed to enter
* and member tasks aren't allowed to exit (as indicated by PF_EXITING) or
* perform exec. This is useful for cases where the threadgroup needs to
* stay stable across blockable operations.
*
* fork and exit paths explicitly call threadgroup_change_{begin|end}() for
* synchronization. While held, no new task will be added to threadgroup
* and no existing live task will have its PF_EXITING set.
*
* During exec, a task goes and puts its thread group through unusual
* changes. After de-threading, exclusive access is assumed to resources
* which are usually shared by tasks in the same group - e.g. sighand may
* be replaced with a new one. Also, the exec'ing task takes over group
* leader role including its pid. Exclude these changes while locked by
* grabbing cred_guard_mutex which is used to synchronize exec path.
*/
static inline void threadgroup_lock(struct task_struct *tsk)
{
threadgroup: extend threadgroup_lock() to cover exit and exec threadgroup_lock() protected only protected against new addition to the threadgroup, which was inherently somewhat incomplete and problematic for its only user cgroup. On-going migration could race against exec and exit leading to interesting problems - the symmetry between various attach methods, task exiting during method execution, ->exit() racing against attach methods, migrating task switching basic properties during exec and so on. This patch extends threadgroup_lock() such that it protects against all three threadgroup altering operations - fork, exit and exec. For exit, threadgroup_change_begin/end() calls are added to exit_signals around assertion of PF_EXITING. For exec, threadgroup_[un]lock() are updated to also grab and release cred_guard_mutex. With this change, threadgroup_lock() guarantees that the target threadgroup will remain stable - no new task will be added, no new PF_EXITING will be set and exec won't happen. The next patch will update cgroup so that it can take full advantage of this change. -v2: beefed up comment as suggested by Frederic. -v3: narrowed scope of protection in exit path as suggested by Frederic. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Acked-by: Li Zefan <lizf@cn.fujitsu.com> Acked-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Paul Menage <paul@paulmenage.org> Cc: Linus Torvalds <torvalds@linux-foundation.org>
2011-12-13 10:12:21 +08:00
/*
* exec uses exit for de-threading nesting group_rwsem inside
* cred_guard_mutex. Grab cred_guard_mutex first.
*/
mutex_lock(&tsk->signal->cred_guard_mutex);
down_write(&tsk->signal->group_rwsem);
}
threadgroup: extend threadgroup_lock() to cover exit and exec threadgroup_lock() protected only protected against new addition to the threadgroup, which was inherently somewhat incomplete and problematic for its only user cgroup. On-going migration could race against exec and exit leading to interesting problems - the symmetry between various attach methods, task exiting during method execution, ->exit() racing against attach methods, migrating task switching basic properties during exec and so on. This patch extends threadgroup_lock() such that it protects against all three threadgroup altering operations - fork, exit and exec. For exit, threadgroup_change_begin/end() calls are added to exit_signals around assertion of PF_EXITING. For exec, threadgroup_[un]lock() are updated to also grab and release cred_guard_mutex. With this change, threadgroup_lock() guarantees that the target threadgroup will remain stable - no new task will be added, no new PF_EXITING will be set and exec won't happen. The next patch will update cgroup so that it can take full advantage of this change. -v2: beefed up comment as suggested by Frederic. -v3: narrowed scope of protection in exit path as suggested by Frederic. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Acked-by: Li Zefan <lizf@cn.fujitsu.com> Acked-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Paul Menage <paul@paulmenage.org> Cc: Linus Torvalds <torvalds@linux-foundation.org>
2011-12-13 10:12:21 +08:00
/**
* threadgroup_unlock - unlock threadgroup
* @tsk: member task of the threadgroup to unlock
*
* Reverse threadgroup_lock().
*/
static inline void threadgroup_unlock(struct task_struct *tsk)
{
up_write(&tsk->signal->group_rwsem);
threadgroup: extend threadgroup_lock() to cover exit and exec threadgroup_lock() protected only protected against new addition to the threadgroup, which was inherently somewhat incomplete and problematic for its only user cgroup. On-going migration could race against exec and exit leading to interesting problems - the symmetry between various attach methods, task exiting during method execution, ->exit() racing against attach methods, migrating task switching basic properties during exec and so on. This patch extends threadgroup_lock() such that it protects against all three threadgroup altering operations - fork, exit and exec. For exit, threadgroup_change_begin/end() calls are added to exit_signals around assertion of PF_EXITING. For exec, threadgroup_[un]lock() are updated to also grab and release cred_guard_mutex. With this change, threadgroup_lock() guarantees that the target threadgroup will remain stable - no new task will be added, no new PF_EXITING will be set and exec won't happen. The next patch will update cgroup so that it can take full advantage of this change. -v2: beefed up comment as suggested by Frederic. -v3: narrowed scope of protection in exit path as suggested by Frederic. Signed-off-by: Tejun Heo <tj@kernel.org> Reviewed-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Acked-by: Li Zefan <lizf@cn.fujitsu.com> Acked-by: Frederic Weisbecker <fweisbec@gmail.com> Cc: Oleg Nesterov <oleg@redhat.com> Cc: Andrew Morton <akpm@linux-foundation.org> Cc: Paul Menage <paul@paulmenage.org> Cc: Linus Torvalds <torvalds@linux-foundation.org>
2011-12-13 10:12:21 +08:00
mutex_unlock(&tsk->signal->cred_guard_mutex);
}
#else
static inline void threadgroup_change_begin(struct task_struct *tsk) {}
static inline void threadgroup_change_end(struct task_struct *tsk) {}
static inline void threadgroup_lock(struct task_struct *tsk) {}
static inline void threadgroup_unlock(struct task_struct *tsk) {}
#endif
#ifndef __HAVE_THREAD_FUNCTIONS
rename thread_info to stack This finally renames the thread_info field in task structure to stack, so that the assumptions about this field are gone and archs have more freedom about placing the thread_info structure. Nonbroken archs which have a proper thread pointer can do the access to both current thread and task structure via a single pointer. It'll allow for a few more cleanups of the fork code, from which e.g. ia64 could benefit. Signed-off-by: Roman Zippel <zippel@linux-m68k.org> [akpm@linux-foundation.org: build fix] Cc: Richard Henderson <rth@twiddle.net> Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru> Cc: Russell King <rmk@arm.linux.org.uk> Cc: Ian Molton <spyro@f2s.com> Cc: Haavard Skinnemoen <hskinnemoen@atmel.com> Cc: Mikael Starvik <starvik@axis.com> Cc: David Howells <dhowells@redhat.com> Cc: Yoshinori Sato <ysato@users.sourceforge.jp> Cc: "Luck, Tony" <tony.luck@intel.com> Cc: Hirokazu Takata <takata@linux-m32r.org> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Roman Zippel <zippel@linux-m68k.org> Cc: Greg Ungerer <gerg@uclinux.org> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Paul Mackerras <paulus@samba.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Paul Mundt <lethal@linux-sh.org> Cc: Kazumoto Kojima <kkojima@rr.iij4u.or.jp> Cc: Richard Curnow <rc@rc0.org.uk> Cc: William Lee Irwin III <wli@holomorphy.com> Cc: "David S. Miller" <davem@davemloft.net> Cc: Jeff Dike <jdike@addtoit.com> Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it> Cc: Miles Bader <uclinux-v850@lsi.nec.co.jp> Cc: Andi Kleen <ak@muc.de> Cc: Chris Zankel <chris@zankel.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-09 17:35:17 +08:00
#define task_thread_info(task) ((struct thread_info *)(task)->stack)
#define task_stack_page(task) ((task)->stack)
static inline void setup_thread_stack(struct task_struct *p, struct task_struct *org)
{
*task_thread_info(p) = *task_thread_info(org);
task_thread_info(p)->task = p;
}
static inline unsigned long *end_of_stack(struct task_struct *p)
{
rename thread_info to stack This finally renames the thread_info field in task structure to stack, so that the assumptions about this field are gone and archs have more freedom about placing the thread_info structure. Nonbroken archs which have a proper thread pointer can do the access to both current thread and task structure via a single pointer. It'll allow for a few more cleanups of the fork code, from which e.g. ia64 could benefit. Signed-off-by: Roman Zippel <zippel@linux-m68k.org> [akpm@linux-foundation.org: build fix] Cc: Richard Henderson <rth@twiddle.net> Cc: Ivan Kokshaysky <ink@jurassic.park.msu.ru> Cc: Russell King <rmk@arm.linux.org.uk> Cc: Ian Molton <spyro@f2s.com> Cc: Haavard Skinnemoen <hskinnemoen@atmel.com> Cc: Mikael Starvik <starvik@axis.com> Cc: David Howells <dhowells@redhat.com> Cc: Yoshinori Sato <ysato@users.sourceforge.jp> Cc: "Luck, Tony" <tony.luck@intel.com> Cc: Hirokazu Takata <takata@linux-m32r.org> Cc: Geert Uytterhoeven <geert@linux-m68k.org> Cc: Roman Zippel <zippel@linux-m68k.org> Cc: Greg Ungerer <gerg@uclinux.org> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Ralf Baechle <ralf@linux-mips.org> Cc: Paul Mackerras <paulus@samba.org> Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org> Cc: Heiko Carstens <heiko.carstens@de.ibm.com> Cc: Martin Schwidefsky <schwidefsky@de.ibm.com> Cc: Paul Mundt <lethal@linux-sh.org> Cc: Kazumoto Kojima <kkojima@rr.iij4u.or.jp> Cc: Richard Curnow <rc@rc0.org.uk> Cc: William Lee Irwin III <wli@holomorphy.com> Cc: "David S. Miller" <davem@davemloft.net> Cc: Jeff Dike <jdike@addtoit.com> Cc: Paolo 'Blaisorblade' Giarrusso <blaisorblade@yahoo.it> Cc: Miles Bader <uclinux-v850@lsi.nec.co.jp> Cc: Andi Kleen <ak@muc.de> Cc: Chris Zankel <chris@zankel.net> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-09 17:35:17 +08:00
return (unsigned long *)(task_thread_info(p) + 1);
}
#endif
static inline int object_is_on_stack(void *obj)
{
void *stack = task_stack_page(current);
return (obj >= stack) && (obj < (stack + THREAD_SIZE));
}
extern void thread_info_cache_init(void);
#ifdef CONFIG_DEBUG_STACK_USAGE
static inline unsigned long stack_not_used(struct task_struct *p)
{
unsigned long *n = end_of_stack(p);
do { /* Skip over canary */
n++;
} while (!*n);
return (unsigned long)n - (unsigned long)end_of_stack(p);
}
#endif
/* set thread flags in other task's structures
* - see asm/thread_info.h for TIF_xxxx flags available
*/
static inline void set_tsk_thread_flag(struct task_struct *tsk, int flag)
{
set_ti_thread_flag(task_thread_info(tsk), flag);
}
static inline void clear_tsk_thread_flag(struct task_struct *tsk, int flag)
{
clear_ti_thread_flag(task_thread_info(tsk), flag);
}
static inline int test_and_set_tsk_thread_flag(struct task_struct *tsk, int flag)
{
return test_and_set_ti_thread_flag(task_thread_info(tsk), flag);
}
static inline int test_and_clear_tsk_thread_flag(struct task_struct *tsk, int flag)
{
return test_and_clear_ti_thread_flag(task_thread_info(tsk), flag);
}
static inline int test_tsk_thread_flag(struct task_struct *tsk, int flag)
{
return test_ti_thread_flag(task_thread_info(tsk), flag);
}
static inline void set_tsk_need_resched(struct task_struct *tsk)
{
set_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
}
static inline void clear_tsk_need_resched(struct task_struct *tsk)
{
clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED);
}
static inline int test_tsk_need_resched(struct task_struct *tsk)
{
return unlikely(test_tsk_thread_flag(tsk,TIF_NEED_RESCHED));
}
static inline int restart_syscall(void)
{
set_tsk_thread_flag(current, TIF_SIGPENDING);
return -ERESTARTNOINTR;
}
static inline int signal_pending(struct task_struct *p)
{
return unlikely(test_tsk_thread_flag(p,TIF_SIGPENDING));
}
static inline int __fatal_signal_pending(struct task_struct *p)
{
return unlikely(sigismember(&p->pending.signal, SIGKILL));
}
static inline int fatal_signal_pending(struct task_struct *p)
{
return signal_pending(p) && __fatal_signal_pending(p);
}
static inline int signal_pending_state(long state, struct task_struct *p)
{
if (!(state & (TASK_INTERRUPTIBLE | TASK_WAKEKILL)))
return 0;
if (!signal_pending(p))
return 0;
return (state & TASK_INTERRUPTIBLE) || __fatal_signal_pending(p);
}
static inline int need_resched(void)
{
return unlikely(test_thread_flag(TIF_NEED_RESCHED));
}
/*
* cond_resched() and cond_resched_lock(): latency reduction via
* explicit rescheduling in places that are safe. The return
* value indicates whether a reschedule was done in fact.
* cond_resched_lock() will drop the spinlock before scheduling,
* cond_resched_softirq() will enable bhs before scheduling.
*/
extern int _cond_resched(void);
#define cond_resched() ({ \
__might_sleep(__FILE__, __LINE__, 0); \
_cond_resched(); \
})
extern int __cond_resched_lock(spinlock_t *lock);
#ifdef CONFIG_PREEMPT_COUNT
#define PREEMPT_LOCK_OFFSET PREEMPT_OFFSET
#else
#define PREEMPT_LOCK_OFFSET 0
#endif
#define cond_resched_lock(lock) ({ \
__might_sleep(__FILE__, __LINE__, PREEMPT_LOCK_OFFSET); \
__cond_resched_lock(lock); \
})
extern int __cond_resched_softirq(void);
#define cond_resched_softirq() ({ \
__might_sleep(__FILE__, __LINE__, SOFTIRQ_DISABLE_OFFSET); \
__cond_resched_softirq(); \
})
/*
* Does a critical section need to be broken due to another
* task waiting?: (technically does not depend on CONFIG_PREEMPT,
* but a general need for low latency)
*/
static inline int spin_needbreak(spinlock_t *lock)
{
#ifdef CONFIG_PREEMPT
return spin_is_contended(lock);
#else
return 0;
#endif
}
timers: fix itimer/many thread hang Overview This patch reworks the handling of POSIX CPU timers, including the ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together with the help of Roland McGrath, the owner and original writer of this code. The problem we ran into, and the reason for this rework, has to do with using a profiling timer in a process with a large number of threads. It appears that the performance of the old implementation of run_posix_cpu_timers() was at least O(n*3) (where "n" is the number of threads in a process) or worse. Everything is fine with an increasing number of threads until the time taken for that routine to run becomes the same as or greater than the tick time, at which point things degrade rather quickly. This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF." Code Changes This rework corrects the implementation of run_posix_cpu_timers() to make it run in constant time for a particular machine. (Performance may vary between one machine and another depending upon whether the kernel is built as single- or multiprocessor and, in the latter case, depending upon the number of running processors.) To do this, at each tick we now update fields in signal_struct as well as task_struct. The run_posix_cpu_timers() function uses those fields to make its decisions. We define a new structure, "task_cputime," to contain user, system and scheduler times and use these in appropriate places: struct task_cputime { cputime_t utime; cputime_t stime; unsigned long long sum_exec_runtime; }; This is included in the structure "thread_group_cputime," which is a new substructure of signal_struct and which varies for uniprocessor versus multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as a simple substructure, while for multiprocessor kernels it is a pointer: struct thread_group_cputime { struct task_cputime totals; }; struct thread_group_cputime { struct task_cputime *totals; }; We also add a new task_cputime substructure directly to signal_struct, to cache the earliest expiration of process-wide timers, and task_cputime also replaces the it_*_expires fields of task_struct (used for earliest expiration of thread timers). The "thread_group_cputime" structure contains process-wide timers that are updated via account_user_time() and friends. In the non-SMP case the structure is a simple aggregator; unfortunately in the SMP case that simplicity was not achievable due to cache-line contention between CPUs (in one measured case performance was actually _worse_ on a 16-cpu system than the same test on a 4-cpu system, due to this contention). For SMP, the thread_group_cputime counters are maintained as a per-cpu structure allocated using alloc_percpu(). The timer functions update only the timer field in the structure corresponding to the running CPU, obtained using per_cpu_ptr(). We define a set of inline functions in sched.h that we use to maintain the thread_group_cputime structure and hide the differences between UP and SMP implementations from the rest of the kernel. The thread_group_cputime_init() function initializes the thread_group_cputime structure for the given task. The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the out-of-line function thread_group_cputime_alloc_smp() to allocate and fill in the per-cpu structures and fields. The thread_group_cputime_free() function, also a no-op for UP, in SMP frees the per-cpu structures. The thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls thread_group_cputime_alloc() if the per-cpu structures haven't yet been allocated. The thread_group_cputime() function fills the task_cputime structure it is passed with the contents of the thread_group_cputime fields; in UP it's that simple but in SMP it must also safely check that tsk->signal is non-NULL (if it is it just uses the appropriate fields of task_struct) and, if so, sums the per-cpu values for each online CPU. Finally, the three functions account_group_user_time(), account_group_system_time() and account_group_exec_runtime() are used by timer functions to update the respective fields of the thread_group_cputime structure. Non-SMP operation is trivial and will not be mentioned further. The per-cpu structure is always allocated when a task creates its first new thread, via a call to thread_group_cputime_clone_thread() from copy_signal(). It is freed at process exit via a call to thread_group_cputime_free() from cleanup_signal(). All functions that formerly summed utime/stime/sum_sched_runtime values from from all threads in the thread group now use thread_group_cputime() to snapshot the values in the thread_group_cputime structure or the values in the task structure itself if the per-cpu structure hasn't been allocated. Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit. The run_posix_cpu_timers() function has been split into a fast path and a slow path; the former safely checks whether there are any expired thread timers and, if not, just returns, while the slow path does the heavy lifting. With the dedicated thread group fields, timers are no longer "rebalanced" and the process_timer_rebalance() function and related code has gone away. All summing loops are gone and all code that used them now uses the thread_group_cputime() inline. When process-wide timers are set, the new task_cputime structure in signal_struct is used to cache the earliest expiration; this is checked in the fast path. Performance The fix appears not to add significant overhead to existing operations. It generally performs the same as the current code except in two cases, one in which it performs slightly worse (Case 5 below) and one in which it performs very significantly better (Case 2 below). Overall it's a wash except in those two cases. I've since done somewhat more involved testing on a dual-core Opteron system. Case 1: With no itimer running, for a test with 100,000 threads, the fixed kernel took 1428.5 seconds, 513 seconds more than the unfixed system, all of which was spent in the system. There were twice as many voluntary context switches with the fix as without it. Case 2: With an itimer running at .01 second ticks and 4000 threads (the most an unmodified kernel can handle), the fixed kernel ran the test in eight percent of the time (5.8 seconds as opposed to 70 seconds) and had better tick accuracy (.012 seconds per tick as opposed to .023 seconds per tick). Case 3: A 4000-thread test with an initial timer tick of .01 second and an interval of 10,000 seconds (i.e. a timer that ticks only once) had very nearly the same performance in both cases: 6.3 seconds elapsed for the fixed kernel versus 5.5 seconds for the unfixed kernel. With fewer threads (eight in these tests), the Case 1 test ran in essentially the same time on both the modified and unmodified kernels (5.2 seconds versus 5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds versus 5.4 seconds but again with much better tick accuracy, .013 seconds per tick versus .025 seconds per tick for the unmodified kernel. Since the fix affected the rlimit code, I also tested soft and hard CPU limits. Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer running), the modified kernel was very slightly favored in that while it killed the process in 19.997 seconds of CPU time (5.002 seconds of wall time), only .003 seconds of that was system time, the rest was user time. The unmodified kernel killed the process in 20.001 seconds of CPU (5.014 seconds of wall time) of which .016 seconds was system time. Really, though, the results were too close to call. The results were essentially the same with no itimer running. Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds (where the hard limit would never be reached) and an itimer running, the modified kernel exhibited worse tick accuracy than the unmodified kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise, performance was almost indistinguishable. With no itimer running this test exhibited virtually identical behavior and times in both cases. In times past I did some limited performance testing. those results are below. On a four-cpu Opteron system without this fix, a sixteen-thread test executed in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On the same system with the fix, user and elapsed time were about the same, but system time dropped to 0.007 seconds. Performance with eight, four and one thread were comparable. Interestingly, the timer ticks with the fix seemed more accurate: The sixteen-thread test with the fix received 149543 ticks for 0.024 seconds per tick, while the same test without the fix received 58720 for 0.061 seconds per tick. Both cases were configured for an interval of 0.01 seconds. Again, the other tests were comparable. Each thread in this test computed the primes up to 25,000,000. I also did a test with a large number of threads, 100,000 threads, which is impossible without the fix. In this case each thread computed the primes only up to 10,000 (to make the runtime manageable). System time dominated, at 1546.968 seconds out of a total 2176.906 seconds (giving a user time of 629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite accurate. There is obviously no comparable test without the fix. Signed-off-by: Frank Mayhar <fmayhar@google.com> Cc: Roland McGrath <roland@redhat.com> Cc: Alexey Dobriyan <adobriyan@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
/*
* Thread group CPU time accounting.
*/
void thread_group_cputime(struct task_struct *tsk, struct task_cputime *times);
void thread_group_cputimer(struct task_struct *tsk, struct task_cputime *times);
timers: fix itimer/many thread hang Overview This patch reworks the handling of POSIX CPU timers, including the ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together with the help of Roland McGrath, the owner and original writer of this code. The problem we ran into, and the reason for this rework, has to do with using a profiling timer in a process with a large number of threads. It appears that the performance of the old implementation of run_posix_cpu_timers() was at least O(n*3) (where "n" is the number of threads in a process) or worse. Everything is fine with an increasing number of threads until the time taken for that routine to run becomes the same as or greater than the tick time, at which point things degrade rather quickly. This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF." Code Changes This rework corrects the implementation of run_posix_cpu_timers() to make it run in constant time for a particular machine. (Performance may vary between one machine and another depending upon whether the kernel is built as single- or multiprocessor and, in the latter case, depending upon the number of running processors.) To do this, at each tick we now update fields in signal_struct as well as task_struct. The run_posix_cpu_timers() function uses those fields to make its decisions. We define a new structure, "task_cputime," to contain user, system and scheduler times and use these in appropriate places: struct task_cputime { cputime_t utime; cputime_t stime; unsigned long long sum_exec_runtime; }; This is included in the structure "thread_group_cputime," which is a new substructure of signal_struct and which varies for uniprocessor versus multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as a simple substructure, while for multiprocessor kernels it is a pointer: struct thread_group_cputime { struct task_cputime totals; }; struct thread_group_cputime { struct task_cputime *totals; }; We also add a new task_cputime substructure directly to signal_struct, to cache the earliest expiration of process-wide timers, and task_cputime also replaces the it_*_expires fields of task_struct (used for earliest expiration of thread timers). The "thread_group_cputime" structure contains process-wide timers that are updated via account_user_time() and friends. In the non-SMP case the structure is a simple aggregator; unfortunately in the SMP case that simplicity was not achievable due to cache-line contention between CPUs (in one measured case performance was actually _worse_ on a 16-cpu system than the same test on a 4-cpu system, due to this contention). For SMP, the thread_group_cputime counters are maintained as a per-cpu structure allocated using alloc_percpu(). The timer functions update only the timer field in the structure corresponding to the running CPU, obtained using per_cpu_ptr(). We define a set of inline functions in sched.h that we use to maintain the thread_group_cputime structure and hide the differences between UP and SMP implementations from the rest of the kernel. The thread_group_cputime_init() function initializes the thread_group_cputime structure for the given task. The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the out-of-line function thread_group_cputime_alloc_smp() to allocate and fill in the per-cpu structures and fields. The thread_group_cputime_free() function, also a no-op for UP, in SMP frees the per-cpu structures. The thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls thread_group_cputime_alloc() if the per-cpu structures haven't yet been allocated. The thread_group_cputime() function fills the task_cputime structure it is passed with the contents of the thread_group_cputime fields; in UP it's that simple but in SMP it must also safely check that tsk->signal is non-NULL (if it is it just uses the appropriate fields of task_struct) and, if so, sums the per-cpu values for each online CPU. Finally, the three functions account_group_user_time(), account_group_system_time() and account_group_exec_runtime() are used by timer functions to update the respective fields of the thread_group_cputime structure. Non-SMP operation is trivial and will not be mentioned further. The per-cpu structure is always allocated when a task creates its first new thread, via a call to thread_group_cputime_clone_thread() from copy_signal(). It is freed at process exit via a call to thread_group_cputime_free() from cleanup_signal(). All functions that formerly summed utime/stime/sum_sched_runtime values from from all threads in the thread group now use thread_group_cputime() to snapshot the values in the thread_group_cputime structure or the values in the task structure itself if the per-cpu structure hasn't been allocated. Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit. The run_posix_cpu_timers() function has been split into a fast path and a slow path; the former safely checks whether there are any expired thread timers and, if not, just returns, while the slow path does the heavy lifting. With the dedicated thread group fields, timers are no longer "rebalanced" and the process_timer_rebalance() function and related code has gone away. All summing loops are gone and all code that used them now uses the thread_group_cputime() inline. When process-wide timers are set, the new task_cputime structure in signal_struct is used to cache the earliest expiration; this is checked in the fast path. Performance The fix appears not to add significant overhead to existing operations. It generally performs the same as the current code except in two cases, one in which it performs slightly worse (Case 5 below) and one in which it performs very significantly better (Case 2 below). Overall it's a wash except in those two cases. I've since done somewhat more involved testing on a dual-core Opteron system. Case 1: With no itimer running, for a test with 100,000 threads, the fixed kernel took 1428.5 seconds, 513 seconds more than the unfixed system, all of which was spent in the system. There were twice as many voluntary context switches with the fix as without it. Case 2: With an itimer running at .01 second ticks and 4000 threads (the most an unmodified kernel can handle), the fixed kernel ran the test in eight percent of the time (5.8 seconds as opposed to 70 seconds) and had better tick accuracy (.012 seconds per tick as opposed to .023 seconds per tick). Case 3: A 4000-thread test with an initial timer tick of .01 second and an interval of 10,000 seconds (i.e. a timer that ticks only once) had very nearly the same performance in both cases: 6.3 seconds elapsed for the fixed kernel versus 5.5 seconds for the unfixed kernel. With fewer threads (eight in these tests), the Case 1 test ran in essentially the same time on both the modified and unmodified kernels (5.2 seconds versus 5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds versus 5.4 seconds but again with much better tick accuracy, .013 seconds per tick versus .025 seconds per tick for the unmodified kernel. Since the fix affected the rlimit code, I also tested soft and hard CPU limits. Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer running), the modified kernel was very slightly favored in that while it killed the process in 19.997 seconds of CPU time (5.002 seconds of wall time), only .003 seconds of that was system time, the rest was user time. The unmodified kernel killed the process in 20.001 seconds of CPU (5.014 seconds of wall time) of which .016 seconds was system time. Really, though, the results were too close to call. The results were essentially the same with no itimer running. Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds (where the hard limit would never be reached) and an itimer running, the modified kernel exhibited worse tick accuracy than the unmodified kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise, performance was almost indistinguishable. With no itimer running this test exhibited virtually identical behavior and times in both cases. In times past I did some limited performance testing. those results are below. On a four-cpu Opteron system without this fix, a sixteen-thread test executed in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On the same system with the fix, user and elapsed time were about the same, but system time dropped to 0.007 seconds. Performance with eight, four and one thread were comparable. Interestingly, the timer ticks with the fix seemed more accurate: The sixteen-thread test with the fix received 149543 ticks for 0.024 seconds per tick, while the same test without the fix received 58720 for 0.061 seconds per tick. Both cases were configured for an interval of 0.01 seconds. Again, the other tests were comparable. Each thread in this test computed the primes up to 25,000,000. I also did a test with a large number of threads, 100,000 threads, which is impossible without the fix. In this case each thread computed the primes only up to 10,000 (to make the runtime manageable). System time dominated, at 1546.968 seconds out of a total 2176.906 seconds (giving a user time of 629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite accurate. There is obviously no comparable test without the fix. Signed-off-by: Frank Mayhar <fmayhar@google.com> Cc: Roland McGrath <roland@redhat.com> Cc: Alexey Dobriyan <adobriyan@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
static inline void thread_group_cputime_init(struct signal_struct *sig)
timers: fix itimer/many thread hang Overview This patch reworks the handling of POSIX CPU timers, including the ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together with the help of Roland McGrath, the owner and original writer of this code. The problem we ran into, and the reason for this rework, has to do with using a profiling timer in a process with a large number of threads. It appears that the performance of the old implementation of run_posix_cpu_timers() was at least O(n*3) (where "n" is the number of threads in a process) or worse. Everything is fine with an increasing number of threads until the time taken for that routine to run becomes the same as or greater than the tick time, at which point things degrade rather quickly. This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF." Code Changes This rework corrects the implementation of run_posix_cpu_timers() to make it run in constant time for a particular machine. (Performance may vary between one machine and another depending upon whether the kernel is built as single- or multiprocessor and, in the latter case, depending upon the number of running processors.) To do this, at each tick we now update fields in signal_struct as well as task_struct. The run_posix_cpu_timers() function uses those fields to make its decisions. We define a new structure, "task_cputime," to contain user, system and scheduler times and use these in appropriate places: struct task_cputime { cputime_t utime; cputime_t stime; unsigned long long sum_exec_runtime; }; This is included in the structure "thread_group_cputime," which is a new substructure of signal_struct and which varies for uniprocessor versus multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as a simple substructure, while for multiprocessor kernels it is a pointer: struct thread_group_cputime { struct task_cputime totals; }; struct thread_group_cputime { struct task_cputime *totals; }; We also add a new task_cputime substructure directly to signal_struct, to cache the earliest expiration of process-wide timers, and task_cputime also replaces the it_*_expires fields of task_struct (used for earliest expiration of thread timers). The "thread_group_cputime" structure contains process-wide timers that are updated via account_user_time() and friends. In the non-SMP case the structure is a simple aggregator; unfortunately in the SMP case that simplicity was not achievable due to cache-line contention between CPUs (in one measured case performance was actually _worse_ on a 16-cpu system than the same test on a 4-cpu system, due to this contention). For SMP, the thread_group_cputime counters are maintained as a per-cpu structure allocated using alloc_percpu(). The timer functions update only the timer field in the structure corresponding to the running CPU, obtained using per_cpu_ptr(). We define a set of inline functions in sched.h that we use to maintain the thread_group_cputime structure and hide the differences between UP and SMP implementations from the rest of the kernel. The thread_group_cputime_init() function initializes the thread_group_cputime structure for the given task. The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the out-of-line function thread_group_cputime_alloc_smp() to allocate and fill in the per-cpu structures and fields. The thread_group_cputime_free() function, also a no-op for UP, in SMP frees the per-cpu structures. The thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls thread_group_cputime_alloc() if the per-cpu structures haven't yet been allocated. The thread_group_cputime() function fills the task_cputime structure it is passed with the contents of the thread_group_cputime fields; in UP it's that simple but in SMP it must also safely check that tsk->signal is non-NULL (if it is it just uses the appropriate fields of task_struct) and, if so, sums the per-cpu values for each online CPU. Finally, the three functions account_group_user_time(), account_group_system_time() and account_group_exec_runtime() are used by timer functions to update the respective fields of the thread_group_cputime structure. Non-SMP operation is trivial and will not be mentioned further. The per-cpu structure is always allocated when a task creates its first new thread, via a call to thread_group_cputime_clone_thread() from copy_signal(). It is freed at process exit via a call to thread_group_cputime_free() from cleanup_signal(). All functions that formerly summed utime/stime/sum_sched_runtime values from from all threads in the thread group now use thread_group_cputime() to snapshot the values in the thread_group_cputime structure or the values in the task structure itself if the per-cpu structure hasn't been allocated. Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit. The run_posix_cpu_timers() function has been split into a fast path and a slow path; the former safely checks whether there are any expired thread timers and, if not, just returns, while the slow path does the heavy lifting. With the dedicated thread group fields, timers are no longer "rebalanced" and the process_timer_rebalance() function and related code has gone away. All summing loops are gone and all code that used them now uses the thread_group_cputime() inline. When process-wide timers are set, the new task_cputime structure in signal_struct is used to cache the earliest expiration; this is checked in the fast path. Performance The fix appears not to add significant overhead to existing operations. It generally performs the same as the current code except in two cases, one in which it performs slightly worse (Case 5 below) and one in which it performs very significantly better (Case 2 below). Overall it's a wash except in those two cases. I've since done somewhat more involved testing on a dual-core Opteron system. Case 1: With no itimer running, for a test with 100,000 threads, the fixed kernel took 1428.5 seconds, 513 seconds more than the unfixed system, all of which was spent in the system. There were twice as many voluntary context switches with the fix as without it. Case 2: With an itimer running at .01 second ticks and 4000 threads (the most an unmodified kernel can handle), the fixed kernel ran the test in eight percent of the time (5.8 seconds as opposed to 70 seconds) and had better tick accuracy (.012 seconds per tick as opposed to .023 seconds per tick). Case 3: A 4000-thread test with an initial timer tick of .01 second and an interval of 10,000 seconds (i.e. a timer that ticks only once) had very nearly the same performance in both cases: 6.3 seconds elapsed for the fixed kernel versus 5.5 seconds for the unfixed kernel. With fewer threads (eight in these tests), the Case 1 test ran in essentially the same time on both the modified and unmodified kernels (5.2 seconds versus 5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds versus 5.4 seconds but again with much better tick accuracy, .013 seconds per tick versus .025 seconds per tick for the unmodified kernel. Since the fix affected the rlimit code, I also tested soft and hard CPU limits. Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer running), the modified kernel was very slightly favored in that while it killed the process in 19.997 seconds of CPU time (5.002 seconds of wall time), only .003 seconds of that was system time, the rest was user time. The unmodified kernel killed the process in 20.001 seconds of CPU (5.014 seconds of wall time) of which .016 seconds was system time. Really, though, the results were too close to call. The results were essentially the same with no itimer running. Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds (where the hard limit would never be reached) and an itimer running, the modified kernel exhibited worse tick accuracy than the unmodified kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise, performance was almost indistinguishable. With no itimer running this test exhibited virtually identical behavior and times in both cases. In times past I did some limited performance testing. those results are below. On a four-cpu Opteron system without this fix, a sixteen-thread test executed in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On the same system with the fix, user and elapsed time were about the same, but system time dropped to 0.007 seconds. Performance with eight, four and one thread were comparable. Interestingly, the timer ticks with the fix seemed more accurate: The sixteen-thread test with the fix received 149543 ticks for 0.024 seconds per tick, while the same test without the fix received 58720 for 0.061 seconds per tick. Both cases were configured for an interval of 0.01 seconds. Again, the other tests were comparable. Each thread in this test computed the primes up to 25,000,000. I also did a test with a large number of threads, 100,000 threads, which is impossible without the fix. In this case each thread computed the primes only up to 10,000 (to make the runtime manageable). System time dominated, at 1546.968 seconds out of a total 2176.906 seconds (giving a user time of 629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite accurate. There is obviously no comparable test without the fix. Signed-off-by: Frank Mayhar <fmayhar@google.com> Cc: Roland McGrath <roland@redhat.com> Cc: Alexey Dobriyan <adobriyan@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
{
raw_spin_lock_init(&sig->cputimer.lock);
timers: fix itimer/many thread hang Overview This patch reworks the handling of POSIX CPU timers, including the ITIMER_PROF, ITIMER_VIRT timers and rlimit handling. It was put together with the help of Roland McGrath, the owner and original writer of this code. The problem we ran into, and the reason for this rework, has to do with using a profiling timer in a process with a large number of threads. It appears that the performance of the old implementation of run_posix_cpu_timers() was at least O(n*3) (where "n" is the number of threads in a process) or worse. Everything is fine with an increasing number of threads until the time taken for that routine to run becomes the same as or greater than the tick time, at which point things degrade rather quickly. This patch fixes bug 9906, "Weird hang with NPTL and SIGPROF." Code Changes This rework corrects the implementation of run_posix_cpu_timers() to make it run in constant time for a particular machine. (Performance may vary between one machine and another depending upon whether the kernel is built as single- or multiprocessor and, in the latter case, depending upon the number of running processors.) To do this, at each tick we now update fields in signal_struct as well as task_struct. The run_posix_cpu_timers() function uses those fields to make its decisions. We define a new structure, "task_cputime," to contain user, system and scheduler times and use these in appropriate places: struct task_cputime { cputime_t utime; cputime_t stime; unsigned long long sum_exec_runtime; }; This is included in the structure "thread_group_cputime," which is a new substructure of signal_struct and which varies for uniprocessor versus multiprocessor kernels. For uniprocessor kernels, it uses "task_cputime" as a simple substructure, while for multiprocessor kernels it is a pointer: struct thread_group_cputime { struct task_cputime totals; }; struct thread_group_cputime { struct task_cputime *totals; }; We also add a new task_cputime substructure directly to signal_struct, to cache the earliest expiration of process-wide timers, and task_cputime also replaces the it_*_expires fields of task_struct (used for earliest expiration of thread timers). The "thread_group_cputime" structure contains process-wide timers that are updated via account_user_time() and friends. In the non-SMP case the structure is a simple aggregator; unfortunately in the SMP case that simplicity was not achievable due to cache-line contention between CPUs (in one measured case performance was actually _worse_ on a 16-cpu system than the same test on a 4-cpu system, due to this contention). For SMP, the thread_group_cputime counters are maintained as a per-cpu structure allocated using alloc_percpu(). The timer functions update only the timer field in the structure corresponding to the running CPU, obtained using per_cpu_ptr(). We define a set of inline functions in sched.h that we use to maintain the thread_group_cputime structure and hide the differences between UP and SMP implementations from the rest of the kernel. The thread_group_cputime_init() function initializes the thread_group_cputime structure for the given task. The thread_group_cputime_alloc() is a no-op for UP; for SMP it calls the out-of-line function thread_group_cputime_alloc_smp() to allocate and fill in the per-cpu structures and fields. The thread_group_cputime_free() function, also a no-op for UP, in SMP frees the per-cpu structures. The thread_group_cputime_clone_thread() function (also a UP no-op) for SMP calls thread_group_cputime_alloc() if the per-cpu structures haven't yet been allocated. The thread_group_cputime() function fills the task_cputime structure it is passed with the contents of the thread_group_cputime fields; in UP it's that simple but in SMP it must also safely check that tsk->signal is non-NULL (if it is it just uses the appropriate fields of task_struct) and, if so, sums the per-cpu values for each online CPU. Finally, the three functions account_group_user_time(), account_group_system_time() and account_group_exec_runtime() are used by timer functions to update the respective fields of the thread_group_cputime structure. Non-SMP operation is trivial and will not be mentioned further. The per-cpu structure is always allocated when a task creates its first new thread, via a call to thread_group_cputime_clone_thread() from copy_signal(). It is freed at process exit via a call to thread_group_cputime_free() from cleanup_signal(). All functions that formerly summed utime/stime/sum_sched_runtime values from from all threads in the thread group now use thread_group_cputime() to snapshot the values in the thread_group_cputime structure or the values in the task structure itself if the per-cpu structure hasn't been allocated. Finally, the code in kernel/posix-cpu-timers.c has changed quite a bit. The run_posix_cpu_timers() function has been split into a fast path and a slow path; the former safely checks whether there are any expired thread timers and, if not, just returns, while the slow path does the heavy lifting. With the dedicated thread group fields, timers are no longer "rebalanced" and the process_timer_rebalance() function and related code has gone away. All summing loops are gone and all code that used them now uses the thread_group_cputime() inline. When process-wide timers are set, the new task_cputime structure in signal_struct is used to cache the earliest expiration; this is checked in the fast path. Performance The fix appears not to add significant overhead to existing operations. It generally performs the same as the current code except in two cases, one in which it performs slightly worse (Case 5 below) and one in which it performs very significantly better (Case 2 below). Overall it's a wash except in those two cases. I've since done somewhat more involved testing on a dual-core Opteron system. Case 1: With no itimer running, for a test with 100,000 threads, the fixed kernel took 1428.5 seconds, 513 seconds more than the unfixed system, all of which was spent in the system. There were twice as many voluntary context switches with the fix as without it. Case 2: With an itimer running at .01 second ticks and 4000 threads (the most an unmodified kernel can handle), the fixed kernel ran the test in eight percent of the time (5.8 seconds as opposed to 70 seconds) and had better tick accuracy (.012 seconds per tick as opposed to .023 seconds per tick). Case 3: A 4000-thread test with an initial timer tick of .01 second and an interval of 10,000 seconds (i.e. a timer that ticks only once) had very nearly the same performance in both cases: 6.3 seconds elapsed for the fixed kernel versus 5.5 seconds for the unfixed kernel. With fewer threads (eight in these tests), the Case 1 test ran in essentially the same time on both the modified and unmodified kernels (5.2 seconds versus 5.8 seconds). The Case 2 test ran in about the same time as well, 5.9 seconds versus 5.4 seconds but again with much better tick accuracy, .013 seconds per tick versus .025 seconds per tick for the unmodified kernel. Since the fix affected the rlimit code, I also tested soft and hard CPU limits. Case 4: With a hard CPU limit of 20 seconds and eight threads (and an itimer running), the modified kernel was very slightly favored in that while it killed the process in 19.997 seconds of CPU time (5.002 seconds of wall time), only .003 seconds of that was system time, the rest was user time. The unmodified kernel killed the process in 20.001 seconds of CPU (5.014 seconds of wall time) of which .016 seconds was system time. Really, though, the results were too close to call. The results were essentially the same with no itimer running. Case 5: With a soft limit of 20 seconds and a hard limit of 2000 seconds (where the hard limit would never be reached) and an itimer running, the modified kernel exhibited worse tick accuracy than the unmodified kernel: .050 seconds/tick versus .028 seconds/tick. Otherwise, performance was almost indistinguishable. With no itimer running this test exhibited virtually identical behavior and times in both cases. In times past I did some limited performance testing. those results are below. On a four-cpu Opteron system without this fix, a sixteen-thread test executed in 3569.991 seconds, of which user was 3568.435s and system was 1.556s. On the same system with the fix, user and elapsed time were about the same, but system time dropped to 0.007 seconds. Performance with eight, four and one thread were comparable. Interestingly, the timer ticks with the fix seemed more accurate: The sixteen-thread test with the fix received 149543 ticks for 0.024 seconds per tick, while the same test without the fix received 58720 for 0.061 seconds per tick. Both cases were configured for an interval of 0.01 seconds. Again, the other tests were comparable. Each thread in this test computed the primes up to 25,000,000. I also did a test with a large number of threads, 100,000 threads, which is impossible without the fix. In this case each thread computed the primes only up to 10,000 (to make the runtime manageable). System time dominated, at 1546.968 seconds out of a total 2176.906 seconds (giving a user time of 629.938s). It received 147651 ticks for 0.015 seconds per tick, still quite accurate. There is obviously no comparable test without the fix. Signed-off-by: Frank Mayhar <fmayhar@google.com> Cc: Roland McGrath <roland@redhat.com> Cc: Alexey Dobriyan <adobriyan@gmail.com> Cc: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Ingo Molnar <mingo@elte.hu>
2008-09-13 00:54:39 +08:00
}
/*
* Reevaluate whether the task has signals pending delivery.
* Wake the task if so.
* This is required every time the blocked sigset_t changes.
* callers must hold sighand->siglock.
*/
extern void recalc_sigpending_and_wake(struct task_struct *t);
extern void recalc_sigpending(void);
extern void signal_wake_up(struct task_struct *t, int resume_stopped);
/*
* Wrappers for p->thread_info->cpu access. No-op on UP.
*/
#ifdef CONFIG_SMP
static inline unsigned int task_cpu(const struct task_struct *p)
{
return task_thread_info(p)->cpu;
}
extern void set_task_cpu(struct task_struct *p, unsigned int cpu);
#else
static inline unsigned int task_cpu(const struct task_struct *p)
{
return 0;
}
static inline void set_task_cpu(struct task_struct *p, unsigned int cpu)
{
}
#endif /* CONFIG_SMP */
extern long sched_setaffinity(pid_t pid, const struct cpumask *new_mask);
extern long sched_getaffinity(pid_t pid, struct cpumask *mask);
extern void normalize_rt_tasks(void);
#ifdef CONFIG_CGROUP_SCHED
extern struct task_group root_task_group;
extern struct task_group *sched_create_group(struct task_group *parent);
extern void sched_destroy_group(struct task_group *tg);
extern void sched_move_task(struct task_struct *tsk);
#ifdef CONFIG_FAIR_GROUP_SCHED
extern int sched_group_set_shares(struct task_group *tg, unsigned long shares);
extern unsigned long sched_group_shares(struct task_group *tg);
#endif
#ifdef CONFIG_RT_GROUP_SCHED
extern int sched_group_set_rt_runtime(struct task_group *tg,
long rt_runtime_us);
extern long sched_group_rt_runtime(struct task_group *tg);
extern int sched_group_set_rt_period(struct task_group *tg,
long rt_period_us);
extern long sched_group_rt_period(struct task_group *tg);
extern int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk);
#endif
#endif
extern int task_can_switch_user(struct user_struct *up,
struct task_struct *tsk);
#ifdef CONFIG_TASK_XACCT
static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
{
tsk->ioac.rchar += amt;
}
static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
{
tsk->ioac.wchar += amt;
}
static inline void inc_syscr(struct task_struct *tsk)
{
tsk->ioac.syscr++;
}
static inline void inc_syscw(struct task_struct *tsk)
{
tsk->ioac.syscw++;
}
#else
static inline void add_rchar(struct task_struct *tsk, ssize_t amt)
{
}
static inline void add_wchar(struct task_struct *tsk, ssize_t amt)
{
}
static inline void inc_syscr(struct task_struct *tsk)
{
}
static inline void inc_syscw(struct task_struct *tsk)
{
}
#endif
#ifndef TASK_SIZE_OF
#define TASK_SIZE_OF(tsk) TASK_SIZE
#endif
cgroups: add an owner to the mm_struct Remove the mem_cgroup member from mm_struct and instead adds an owner. This approach was suggested by Paul Menage. The advantage of this approach is that, once the mm->owner is known, using the subsystem id, the cgroup can be determined. It also allows several control groups that are virtually grouped by mm_struct, to exist independent of the memory controller i.e., without adding mem_cgroup's for each controller, to mm_struct. A new config option CONFIG_MM_OWNER is added and the memory resource controller selects this config option. This patch also adds cgroup callbacks to notify subsystems when mm->owner changes. The mm_cgroup_changed callback is called with the task_lock() of the new task held and is called just prior to changing the mm->owner. I am indebted to Paul Menage for the several reviews of this patchset and helping me make it lighter and simpler. This patch was tested on a powerpc box, it was compiled with both the MM_OWNER config turned on and off. After the thread group leader exits, it's moved to init_css_state by cgroup_exit(), thus all future charges from runnings threads would be redirected to the init_css_set's subsystem. Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com> Cc: Pavel Emelianov <xemul@openvz.org> Cc: Hugh Dickins <hugh@veritas.com> Cc: Sudhir Kumar <skumar@linux.vnet.ibm.com> Cc: YAMAMOTO Takashi <yamamoto@valinux.co.jp> Cc: Hirokazu Takahashi <taka@valinux.co.jp> Cc: David Rientjes <rientjes@google.com>, Cc: Balbir Singh <balbir@linux.vnet.ibm.com> Acked-by: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com> Acked-by: Pekka Enberg <penberg@cs.helsinki.fi> Reviewed-by: Paul Menage <menage@google.com> Cc: Oleg Nesterov <oleg@tv-sign.ru> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-04-29 16:00:16 +08:00
#ifdef CONFIG_MM_OWNER
extern void mm_update_next_owner(struct mm_struct *mm);
extern void mm_init_owner(struct mm_struct *mm, struct task_struct *p);
#else
static inline void mm_update_next_owner(struct mm_struct *mm)
{
}
static inline void mm_init_owner(struct mm_struct *mm, struct task_struct *p)
{
}
#endif /* CONFIG_MM_OWNER */
static inline unsigned long task_rlimit(const struct task_struct *tsk,
unsigned int limit)
{
return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_cur);
}
static inline unsigned long task_rlimit_max(const struct task_struct *tsk,
unsigned int limit)
{
return ACCESS_ONCE(tsk->signal->rlim[limit].rlim_max);
}
static inline unsigned long rlimit(unsigned int limit)
{
return task_rlimit(current, limit);
}
static inline unsigned long rlimit_max(unsigned int limit)
{
return task_rlimit_max(current, limit);
}
#endif /* __KERNEL__ */
#endif