2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* linux/fs/exec.c
|
|
|
|
*
|
|
|
|
* Copyright (C) 1991, 1992 Linus Torvalds
|
|
|
|
*/
|
|
|
|
|
|
|
|
/*
|
|
|
|
* #!-checking implemented by tytso.
|
|
|
|
*/
|
|
|
|
/*
|
|
|
|
* Demand-loading implemented 01.12.91 - no need to read anything but
|
|
|
|
* the header into memory. The inode of the executable is put into
|
|
|
|
* "current->executable", and page faults do the actual loading. Clean.
|
|
|
|
*
|
|
|
|
* Once more I can proudly say that linux stood up to being changed: it
|
|
|
|
* was less than 2 hours work to get demand-loading completely implemented.
|
|
|
|
*
|
|
|
|
* Demand loading changed July 1993 by Eric Youngdale. Use mmap instead,
|
|
|
|
* current->executable is only used by the procfs. This allows a dispatch
|
|
|
|
* table to check for several different types of binary formats. We keep
|
|
|
|
* trying until we recognize the file or we run out of supported binary
|
|
|
|
* formats.
|
|
|
|
*/
|
|
|
|
|
|
|
|
#include <linux/slab.h>
|
|
|
|
#include <linux/file.h>
|
2008-04-24 19:44:08 +08:00
|
|
|
#include <linux/fdtable.h>
|
2008-07-25 16:45:43 +08:00
|
|
|
#include <linux/mm.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/stat.h>
|
|
|
|
#include <linux/fcntl.h>
|
2008-07-25 16:45:43 +08:00
|
|
|
#include <linux/swap.h>
|
2007-10-17 14:26:35 +08:00
|
|
|
#include <linux/string.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/init.h>
|
2008-07-29 06:46:18 +08:00
|
|
|
#include <linux/pagemap.h>
|
perf: Do the big rename: Performance Counters -> Performance Events
Bye-bye Performance Counters, welcome Performance Events!
In the past few months the perfcounters subsystem has grown out its
initial role of counting hardware events, and has become (and is
becoming) a much broader generic event enumeration, reporting, logging,
monitoring, analysis facility.
Naming its core object 'perf_counter' and naming the subsystem
'perfcounters' has become more and more of a misnomer. With pending
code like hw-breakpoints support the 'counter' name is less and
less appropriate.
All in one, we've decided to rename the subsystem to 'performance
events' and to propagate this rename through all fields, variables
and API names. (in an ABI compatible fashion)
The word 'event' is also a bit shorter than 'counter' - which makes
it slightly more convenient to write/handle as well.
Thanks goes to Stephane Eranian who first observed this misnomer and
suggested a rename.
User-space tooling and ABI compatibility is not affected - this patch
should be function-invariant. (Also, defconfigs were not touched to
keep the size down.)
This patch has been generated via the following script:
FILES=$(find * -type f | grep -vE 'oprofile|[^K]config')
sed -i \
-e 's/PERF_EVENT_/PERF_RECORD_/g' \
-e 's/PERF_COUNTER/PERF_EVENT/g' \
-e 's/perf_counter/perf_event/g' \
-e 's/nb_counters/nb_events/g' \
-e 's/swcounter/swevent/g' \
-e 's/tpcounter_event/tp_event/g' \
$FILES
for N in $(find . -name perf_counter.[ch]); do
M=$(echo $N | sed 's/perf_counter/perf_event/g')
mv $N $M
done
FILES=$(find . -name perf_event.*)
sed -i \
-e 's/COUNTER_MASK/REG_MASK/g' \
-e 's/COUNTER/EVENT/g' \
-e 's/\<event\>/event_id/g' \
-e 's/counter/event/g' \
-e 's/Counter/Event/g' \
$FILES
... to keep it as correct as possible. This script can also be
used by anyone who has pending perfcounters patches - it converts
a Linux kernel tree over to the new naming. We tried to time this
change to the point in time where the amount of pending patches
is the smallest: the end of the merge window.
Namespace clashes were fixed up in a preparatory patch - and some
stylistic fallout will be fixed up in a subsequent patch.
( NOTE: 'counters' are still the proper terminology when we deal
with hardware registers - and these sed scripts are a bit
over-eager in renaming them. I've undone some of that, but
in case there's something left where 'counter' would be
better than 'event' we can undo that on an individual basis
instead of touching an otherwise nicely automated patch. )
Suggested-by: Stephane Eranian <eranian@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Reviewed-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <linux-arch@vger.kernel.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-21 18:02:48 +08:00
|
|
|
#include <linux/perf_event.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/highmem.h>
|
|
|
|
#include <linux/spinlock.h>
|
|
|
|
#include <linux/key.h>
|
|
|
|
#include <linux/personality.h>
|
|
|
|
#include <linux/binfmts.h>
|
|
|
|
#include <linux/utsname.h>
|
2006-12-08 18:38:01 +08:00
|
|
|
#include <linux/pid_namespace.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
#include <linux/module.h>
|
|
|
|
#include <linux/namei.h>
|
|
|
|
#include <linux/mount.h>
|
|
|
|
#include <linux/security.h>
|
|
|
|
#include <linux/syscalls.h>
|
2006-10-01 14:28:59 +08:00
|
|
|
#include <linux/tsacct_kern.h>
|
2005-11-07 16:59:16 +08:00
|
|
|
#include <linux/cn_proc.h>
|
2006-04-27 02:04:08 +08:00
|
|
|
#include <linux/audit.h>
|
2008-07-26 10:45:44 +08:00
|
|
|
#include <linux/tracehook.h>
|
2008-07-09 16:28:40 +08:00
|
|
|
#include <linux/kmod.h>
|
2008-12-18 02:53:20 +08:00
|
|
|
#include <linux/fsnotify.h>
|
2009-03-30 07:50:06 +08:00
|
|
|
#include <linux/fs_struct.h>
|
2009-09-24 06:56:58 +08:00
|
|
|
#include <linux/pipe_fs_i.h>
|
2010-10-27 05:21:23 +08:00
|
|
|
#include <linux/oom.h>
|
2011-03-07 01:02:54 +08:00
|
|
|
#include <linux/compat.h>
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#include <asm/uaccess.h>
|
|
|
|
#include <asm/mmu_context.h>
|
2007-07-19 16:48:16 +08:00
|
|
|
#include <asm/tlb.h>
|
CRED: Make execve() take advantage of copy-on-write credentials
Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
The credential bits from struct linux_binprm are, for the most part,
replaced with a single credentials pointer (bprm->cred). This means that
all the creds can be calculated in advance and then applied at the point
of no return with no possibility of failure.
I would like to replace bprm->cap_effective with:
cap_isclear(bprm->cap_effective)
but this seems impossible due to special behaviour for processes of pid 1
(they always retain their parent's capability masks where normally they'd
be changed - see cap_bprm_set_creds()).
The following sequence of events now happens:
(a) At the start of do_execve, the current task's cred_exec_mutex is
locked to prevent PTRACE_ATTACH from obsoleting the calculation of
creds that we make.
(a) prepare_exec_creds() is then called to make a copy of the current
task's credentials and prepare it. This copy is then assigned to
bprm->cred.
This renders security_bprm_alloc() and security_bprm_free()
unnecessary, and so they've been removed.
(b) The determination of unsafe execution is now performed immediately
after (a) rather than later on in the code. The result is stored in
bprm->unsafe for future reference.
(c) prepare_binprm() is called, possibly multiple times.
(i) This applies the result of set[ug]id binaries to the new creds
attached to bprm->cred. Personality bit clearance is recorded,
but now deferred on the basis that the exec procedure may yet
fail.
(ii) This then calls the new security_bprm_set_creds(). This should
calculate the new LSM and capability credentials into *bprm->cred.
This folds together security_bprm_set() and parts of
security_bprm_apply_creds() (these two have been removed).
Anything that might fail must be done at this point.
(iii) bprm->cred_prepared is set to 1.
bprm->cred_prepared is 0 on the first pass of the security
calculations, and 1 on all subsequent passes. This allows SELinux
in (ii) to base its calculations only on the initial script and
not on the interpreter.
(d) flush_old_exec() is called to commit the task to execution. This
performs the following steps with regard to credentials:
(i) Clear pdeath_signal and set dumpable on certain circumstances that
may not be covered by commit_creds().
(ii) Clear any bits in current->personality that were deferred from
(c.i).
(e) install_exec_creds() [compute_creds() as was] is called to install the
new credentials. This performs the following steps with regard to
credentials:
(i) Calls security_bprm_committing_creds() to apply any security
requirements, such as flushing unauthorised files in SELinux, that
must be done before the credentials are changed.
This is made up of bits of security_bprm_apply_creds() and
security_bprm_post_apply_creds(), both of which have been removed.
This function is not allowed to fail; anything that might fail
must have been done in (c.ii).
(ii) Calls commit_creds() to apply the new credentials in a single
assignment (more or less). Possibly pdeath_signal and dumpable
should be part of struct creds.
(iii) Unlocks the task's cred_replace_mutex, thus allowing
PTRACE_ATTACH to take place.
(iv) Clears The bprm->cred pointer as the credentials it was holding
are now immutable.
(v) Calls security_bprm_committed_creds() to apply any security
alterations that must be done after the creds have been changed.
SELinux uses this to flush signals and signal handlers.
(f) If an error occurs before (d.i), bprm_free() will call abort_creds()
to destroy the proposed new credentials and will then unlock
cred_replace_mutex. No changes to the credentials will have been
made.
(2) LSM interface.
A number of functions have been changed, added or removed:
(*) security_bprm_alloc(), ->bprm_alloc_security()
(*) security_bprm_free(), ->bprm_free_security()
Removed in favour of preparing new credentials and modifying those.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
(*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()
Removed; split between security_bprm_set_creds(),
security_bprm_committing_creds() and security_bprm_committed_creds().
(*) security_bprm_set(), ->bprm_set_security()
Removed; folded into security_bprm_set_creds().
(*) security_bprm_set_creds(), ->bprm_set_creds()
New. The new credentials in bprm->creds should be checked and set up
as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the
second and subsequent calls.
(*) security_bprm_committing_creds(), ->bprm_committing_creds()
(*) security_bprm_committed_creds(), ->bprm_committed_creds()
New. Apply the security effects of the new credentials. This
includes closing unauthorised files in SELinux. This function may not
fail. When the former is called, the creds haven't yet been applied
to the process; when the latter is called, they have.
The former may access bprm->cred, the latter may not.
(3) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) The bprm_security_struct struct has been removed in favour of using
the credentials-under-construction approach.
(c) flush_unauthorized_files() now takes a cred pointer and passes it on
to inode_has_perm(), file_has_perm() and dentry_open().
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:24 +08:00
|
|
|
#include "internal.h"
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
int core_uses_pid;
|
2007-05-17 13:11:16 +08:00
|
|
|
char core_pattern[CORENAME_MAX_SIZE] = "core";
|
2009-09-24 06:56:56 +08:00
|
|
|
unsigned int core_pipe_limit;
|
2005-06-23 15:09:43 +08:00
|
|
|
int suid_dumpable = 0;
|
|
|
|
|
2010-10-28 06:34:08 +08:00
|
|
|
struct core_name {
|
|
|
|
char *corename;
|
|
|
|
int used, size;
|
|
|
|
};
|
|
|
|
static atomic_t call_count = ATOMIC_INIT(1);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* The maximal length of core_pattern is also specified in sysctl.c */
|
|
|
|
|
2007-10-17 14:26:03 +08:00
|
|
|
static LIST_HEAD(formats);
|
2005-04-17 06:20:36 +08:00
|
|
|
static DEFINE_RWLOCK(binfmt_lock);
|
|
|
|
|
2009-05-01 06:08:49 +08:00
|
|
|
int __register_binfmt(struct linux_binfmt * fmt, int insert)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
if (!fmt)
|
|
|
|
return -EINVAL;
|
|
|
|
write_lock(&binfmt_lock);
|
2009-05-01 06:08:49 +08:00
|
|
|
insert ? list_add(&fmt->lh, &formats) :
|
|
|
|
list_add_tail(&fmt->lh, &formats);
|
2005-04-17 06:20:36 +08:00
|
|
|
write_unlock(&binfmt_lock);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-05-01 06:08:49 +08:00
|
|
|
EXPORT_SYMBOL(__register_binfmt);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-10-17 14:26:04 +08:00
|
|
|
void unregister_binfmt(struct linux_binfmt * fmt)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
write_lock(&binfmt_lock);
|
2007-10-17 14:26:03 +08:00
|
|
|
list_del(&fmt->lh);
|
2005-04-17 06:20:36 +08:00
|
|
|
write_unlock(&binfmt_lock);
|
|
|
|
}
|
|
|
|
|
|
|
|
EXPORT_SYMBOL(unregister_binfmt);
|
|
|
|
|
|
|
|
static inline void put_binfmt(struct linux_binfmt * fmt)
|
|
|
|
{
|
|
|
|
module_put(fmt->module);
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Note that a shared library must be both readable and executable due to
|
|
|
|
* security reasons.
|
|
|
|
*
|
|
|
|
* Also note that we take the address to load from from the file itself.
|
|
|
|
*/
|
2009-01-14 21:14:29 +08:00
|
|
|
SYSCALL_DEFINE1(uselib, const char __user *, library)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2008-07-26 15:33:14 +08:00
|
|
|
struct file *file;
|
|
|
|
char *tmp = getname(library);
|
|
|
|
int error = PTR_ERR(tmp);
|
2011-02-24 06:44:09 +08:00
|
|
|
static const struct open_flags uselib_flags = {
|
|
|
|
.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
|
|
|
|
.acc_mode = MAY_READ | MAY_EXEC | MAY_OPEN,
|
|
|
|
.intent = LOOKUP_OPEN
|
|
|
|
};
|
2008-07-26 15:33:14 +08:00
|
|
|
|
2009-04-06 23:16:22 +08:00
|
|
|
if (IS_ERR(tmp))
|
|
|
|
goto out;
|
|
|
|
|
2011-02-24 06:44:09 +08:00
|
|
|
file = do_filp_open(AT_FDCWD, tmp, &uselib_flags, LOOKUP_FOLLOW);
|
2009-04-06 23:16:22 +08:00
|
|
|
putname(tmp);
|
|
|
|
error = PTR_ERR(file);
|
|
|
|
if (IS_ERR(file))
|
2005-04-17 06:20:36 +08:00
|
|
|
goto out;
|
|
|
|
|
|
|
|
error = -EINVAL;
|
2009-04-06 23:16:22 +08:00
|
|
|
if (!S_ISREG(file->f_path.dentry->d_inode->i_mode))
|
2005-04-17 06:20:36 +08:00
|
|
|
goto exit;
|
|
|
|
|
2008-07-22 12:02:33 +08:00
|
|
|
error = -EACCES;
|
2009-04-06 23:16:22 +08:00
|
|
|
if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
|
2005-04-17 06:20:36 +08:00
|
|
|
goto exit;
|
|
|
|
|
2009-12-18 10:24:21 +08:00
|
|
|
fsnotify_open(file);
|
2008-12-18 02:53:20 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
error = -ENOEXEC;
|
|
|
|
if(file->f_op) {
|
|
|
|
struct linux_binfmt * fmt;
|
|
|
|
|
|
|
|
read_lock(&binfmt_lock);
|
2007-10-17 14:26:03 +08:00
|
|
|
list_for_each_entry(fmt, &formats, lh) {
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!fmt->load_shlib)
|
|
|
|
continue;
|
|
|
|
if (!try_module_get(fmt->module))
|
|
|
|
continue;
|
|
|
|
read_unlock(&binfmt_lock);
|
|
|
|
error = fmt->load_shlib(file);
|
|
|
|
read_lock(&binfmt_lock);
|
|
|
|
put_binfmt(fmt);
|
|
|
|
if (error != -ENOEXEC)
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
read_unlock(&binfmt_lock);
|
|
|
|
}
|
2009-04-06 23:16:22 +08:00
|
|
|
exit:
|
2005-04-17 06:20:36 +08:00
|
|
|
fput(file);
|
|
|
|
out:
|
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
2007-07-19 16:48:16 +08:00
|
|
|
#ifdef CONFIG_MMU
|
2011-03-07 01:03:11 +08:00
|
|
|
/*
|
|
|
|
* The nascent bprm->mm is not visible until exec_mmap() but it can
|
|
|
|
* use a lot of memory, account these pages in current->mm temporary
|
|
|
|
* for oom_badness()->get_mm_rss(). Once exec succeeds or fails, we
|
|
|
|
* change the counter back via acct_arg_size(0).
|
|
|
|
*/
|
2011-03-07 01:02:54 +08:00
|
|
|
static void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
|
2010-12-01 03:55:34 +08:00
|
|
|
{
|
|
|
|
struct mm_struct *mm = current->mm;
|
|
|
|
long diff = (long)(pages - bprm->vma_pages);
|
|
|
|
|
|
|
|
if (!mm || !diff)
|
|
|
|
return;
|
|
|
|
|
|
|
|
bprm->vma_pages = pages;
|
|
|
|
add_mm_counter(mm, MM_ANONPAGES, diff);
|
|
|
|
}
|
|
|
|
|
2011-03-07 01:02:54 +08:00
|
|
|
static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
|
2007-07-19 16:48:16 +08:00
|
|
|
int write)
|
|
|
|
{
|
|
|
|
struct page *page;
|
|
|
|
int ret;
|
|
|
|
|
|
|
|
#ifdef CONFIG_STACK_GROWSUP
|
|
|
|
if (write) {
|
2011-05-25 08:11:44 +08:00
|
|
|
ret = expand_downwards(bprm->vma, pos);
|
2007-07-19 16:48:16 +08:00
|
|
|
if (ret < 0)
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
ret = get_user_pages(current, bprm->mm, pos,
|
|
|
|
1, write, 1, &page, NULL);
|
|
|
|
if (ret <= 0)
|
|
|
|
return NULL;
|
|
|
|
|
|
|
|
if (write) {
|
|
|
|
unsigned long size = bprm->vma->vm_end - bprm->vma->vm_start;
|
2008-03-04 02:12:14 +08:00
|
|
|
struct rlimit *rlim;
|
|
|
|
|
2010-12-01 03:55:34 +08:00
|
|
|
acct_arg_size(bprm, size / PAGE_SIZE);
|
|
|
|
|
2008-03-04 02:12:14 +08:00
|
|
|
/*
|
|
|
|
* We've historically supported up to 32 pages (ARG_MAX)
|
|
|
|
* of argument strings even with small stacks
|
|
|
|
*/
|
|
|
|
if (size <= ARG_MAX)
|
|
|
|
return page;
|
2007-07-19 16:48:16 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Limit to 1/4-th the stack size for the argv+env strings.
|
|
|
|
* This ensures that:
|
|
|
|
* - the remaining binfmt code will not run out of stack space,
|
|
|
|
* - the program will have a reasonable amount of stack left
|
|
|
|
* to work from.
|
|
|
|
*/
|
2008-03-04 02:12:14 +08:00
|
|
|
rlim = current->signal->rlim;
|
2010-03-06 05:42:42 +08:00
|
|
|
if (size > ACCESS_ONCE(rlim[RLIMIT_STACK].rlim_cur) / 4) {
|
2007-07-19 16:48:16 +08:00
|
|
|
put_page(page);
|
|
|
|
return NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
return page;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void put_arg_page(struct page *page)
|
|
|
|
{
|
|
|
|
put_page(page);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void free_arg_page(struct linux_binprm *bprm, int i)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static void free_arg_pages(struct linux_binprm *bprm)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
|
|
|
|
struct page *page)
|
|
|
|
{
|
|
|
|
flush_cache_page(bprm->vma, pos, page_to_pfn(page));
|
|
|
|
}
|
|
|
|
|
|
|
|
static int __bprm_mm_init(struct linux_binprm *bprm)
|
|
|
|
{
|
2009-01-07 06:40:44 +08:00
|
|
|
int err;
|
2007-07-19 16:48:16 +08:00
|
|
|
struct vm_area_struct *vma = NULL;
|
|
|
|
struct mm_struct *mm = bprm->mm;
|
|
|
|
|
|
|
|
bprm->vma = vma = kmem_cache_zalloc(vm_area_cachep, GFP_KERNEL);
|
|
|
|
if (!vma)
|
2009-01-07 06:40:44 +08:00
|
|
|
return -ENOMEM;
|
2007-07-19 16:48:16 +08:00
|
|
|
|
|
|
|
down_write(&mm->mmap_sem);
|
|
|
|
vma->vm_mm = mm;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Place the stack at the largest stack address the architecture
|
|
|
|
* supports. Later, we'll move this to an appropriate place. We don't
|
|
|
|
* use STACK_TOP because that can depend on attributes which aren't
|
|
|
|
* configured yet.
|
|
|
|
*/
|
2011-07-27 07:08:40 +08:00
|
|
|
BUILD_BUG_ON(VM_STACK_FLAGS & VM_STACK_INCOMPLETE_SETUP);
|
2007-07-19 16:48:16 +08:00
|
|
|
vma->vm_end = STACK_TOP_MAX;
|
|
|
|
vma->vm_start = vma->vm_end - PAGE_SIZE;
|
2010-05-25 05:32:24 +08:00
|
|
|
vma->vm_flags = VM_STACK_FLAGS | VM_STACK_INCOMPLETE_SETUP;
|
2007-10-19 14:39:15 +08:00
|
|
|
vma->vm_page_prot = vm_get_page_prot(vma->vm_flags);
|
mm: change anon_vma linking to fix multi-process server scalability issue
The old anon_vma code can lead to scalability issues with heavily forking
workloads. Specifically, each anon_vma will be shared between the parent
process and all its child processes.
In a workload with 1000 child processes and a VMA with 1000 anonymous
pages per process that get COWed, this leads to a system with a million
anonymous pages in the same anon_vma, each of which is mapped in just one
of the 1000 processes. However, the current rmap code needs to walk them
all, leading to O(N) scanning complexity for each page.
This can result in systems where one CPU is walking the page tables of
1000 processes in page_referenced_one, while all other CPUs are stuck on
the anon_vma lock. This leads to catastrophic failure for a benchmark
like AIM7, where the total number of processes can reach in the tens of
thousands. Real workloads are still a factor 10 less process intensive
than AIM7, but they are catching up.
This patch changes the way anon_vmas and VMAs are linked, which allows us
to associate multiple anon_vmas with a VMA. At fork time, each child
process gets its own anon_vmas, in which its COWed pages will be
instantiated. The parents' anon_vma is also linked to the VMA, because
non-COWed pages could be present in any of the children.
This reduces rmap scanning complexity to O(1) for the pages of the 1000
child processes, with O(N) complexity for at most 1/N pages in the system.
This reduces the average scanning cost in heavily forking workloads from
O(N) to 2.
The only real complexity in this patch stems from the fact that linking a
VMA to anon_vmas now involves memory allocations. This means vma_adjust
can fail, if it needs to attach a VMA to anon_vma structures. This in
turn means error handling needs to be added to the calling functions.
A second source of complexity is that, because there can be multiple
anon_vmas, the anon_vma linking in vma_adjust can no longer be done under
"the" anon_vma lock. To prevent the rmap code from walking up an
incomplete VMA, this patch introduces the VM_LOCK_RMAP VMA flag. This bit
flag uses the same slot as the NOMMU VM_MAPPED_COPY, with an ifdef in mm.h
to make sure it is impossible to compile a kernel that needs both symbolic
values for the same bitflag.
Some test results:
Without the anon_vma changes, when AIM7 hits around 9.7k users (on a test
box with 16GB RAM and not quite enough IO), the system ends up running
>99% in system time, with every CPU on the same anon_vma lock in the
pageout code.
With these changes, AIM7 hits the cross-over point around 29.7k users.
This happens with ~99% IO wait time, there never seems to be any spike in
system time. The anon_vma lock contention appears to be resolved.
[akpm@linux-foundation.org: cleanups]
Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-03-06 05:42:07 +08:00
|
|
|
INIT_LIST_HEAD(&vma->anon_vma_chain);
|
2010-12-09 22:29:42 +08:00
|
|
|
|
|
|
|
err = security_file_mmap(NULL, 0, 0, 0, vma->vm_start, 1);
|
|
|
|
if (err)
|
|
|
|
goto err;
|
|
|
|
|
2007-07-19 16:48:16 +08:00
|
|
|
err = insert_vm_struct(mm, vma);
|
2009-01-07 06:40:44 +08:00
|
|
|
if (err)
|
2007-07-19 16:48:16 +08:00
|
|
|
goto err;
|
|
|
|
|
|
|
|
mm->stack_vm = mm->total_vm = 1;
|
|
|
|
up_write(&mm->mmap_sem);
|
|
|
|
bprm->p = vma->vm_end - sizeof(void *);
|
|
|
|
return 0;
|
|
|
|
err:
|
2009-01-07 06:40:44 +08:00
|
|
|
up_write(&mm->mmap_sem);
|
|
|
|
bprm->vma = NULL;
|
|
|
|
kmem_cache_free(vm_area_cachep, vma);
|
2007-07-19 16:48:16 +08:00
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool valid_arg_len(struct linux_binprm *bprm, long len)
|
|
|
|
{
|
|
|
|
return len <= MAX_ARG_STRLEN;
|
|
|
|
}
|
|
|
|
|
|
|
|
#else
|
|
|
|
|
2011-03-07 01:02:54 +08:00
|
|
|
static inline void acct_arg_size(struct linux_binprm *bprm, unsigned long pages)
|
2010-12-01 03:55:34 +08:00
|
|
|
{
|
|
|
|
}
|
|
|
|
|
2011-03-07 01:02:54 +08:00
|
|
|
static struct page *get_arg_page(struct linux_binprm *bprm, unsigned long pos,
|
2007-07-19 16:48:16 +08:00
|
|
|
int write)
|
|
|
|
{
|
|
|
|
struct page *page;
|
|
|
|
|
|
|
|
page = bprm->page[pos / PAGE_SIZE];
|
|
|
|
if (!page && write) {
|
|
|
|
page = alloc_page(GFP_HIGHUSER|__GFP_ZERO);
|
|
|
|
if (!page)
|
|
|
|
return NULL;
|
|
|
|
bprm->page[pos / PAGE_SIZE] = page;
|
|
|
|
}
|
|
|
|
|
|
|
|
return page;
|
|
|
|
}
|
|
|
|
|
|
|
|
static void put_arg_page(struct page *page)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static void free_arg_page(struct linux_binprm *bprm, int i)
|
|
|
|
{
|
|
|
|
if (bprm->page[i]) {
|
|
|
|
__free_page(bprm->page[i]);
|
|
|
|
bprm->page[i] = NULL;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
static void free_arg_pages(struct linux_binprm *bprm)
|
|
|
|
{
|
|
|
|
int i;
|
|
|
|
|
|
|
|
for (i = 0; i < MAX_ARG_PAGES; i++)
|
|
|
|
free_arg_page(bprm, i);
|
|
|
|
}
|
|
|
|
|
|
|
|
static void flush_arg_page(struct linux_binprm *bprm, unsigned long pos,
|
|
|
|
struct page *page)
|
|
|
|
{
|
|
|
|
}
|
|
|
|
|
|
|
|
static int __bprm_mm_init(struct linux_binprm *bprm)
|
|
|
|
{
|
|
|
|
bprm->p = PAGE_SIZE * MAX_ARG_PAGES - sizeof(void *);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static bool valid_arg_len(struct linux_binprm *bprm, long len)
|
|
|
|
{
|
|
|
|
return len <= bprm->p;
|
|
|
|
}
|
|
|
|
|
|
|
|
#endif /* CONFIG_MMU */
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Create a new mm_struct and populate it with a temporary stack
|
|
|
|
* vm_area_struct. We don't have enough context at this point to set the stack
|
|
|
|
* flags, permissions, and offset, so we use temporary values. We'll update
|
|
|
|
* them later in setup_arg_pages().
|
|
|
|
*/
|
|
|
|
int bprm_mm_init(struct linux_binprm *bprm)
|
|
|
|
{
|
|
|
|
int err;
|
|
|
|
struct mm_struct *mm = NULL;
|
|
|
|
|
|
|
|
bprm->mm = mm = mm_alloc();
|
|
|
|
err = -ENOMEM;
|
|
|
|
if (!mm)
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
err = init_new_context(current, mm);
|
|
|
|
if (err)
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
err = __bprm_mm_init(bprm);
|
|
|
|
if (err)
|
|
|
|
goto err;
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
err:
|
|
|
|
if (mm) {
|
|
|
|
bprm->mm = NULL;
|
|
|
|
mmdrop(mm);
|
|
|
|
}
|
|
|
|
|
|
|
|
return err;
|
|
|
|
}
|
|
|
|
|
2011-03-07 01:02:37 +08:00
|
|
|
struct user_arg_ptr {
|
2011-03-07 01:02:54 +08:00
|
|
|
#ifdef CONFIG_COMPAT
|
|
|
|
bool is_compat;
|
|
|
|
#endif
|
|
|
|
union {
|
|
|
|
const char __user *const __user *native;
|
|
|
|
#ifdef CONFIG_COMPAT
|
|
|
|
compat_uptr_t __user *compat;
|
|
|
|
#endif
|
|
|
|
} ptr;
|
2011-03-07 01:02:37 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
static const char __user *get_user_arg_ptr(struct user_arg_ptr argv, int nr)
|
2011-03-07 01:02:21 +08:00
|
|
|
{
|
2011-03-07 01:02:54 +08:00
|
|
|
const char __user *native;
|
|
|
|
|
|
|
|
#ifdef CONFIG_COMPAT
|
|
|
|
if (unlikely(argv.is_compat)) {
|
|
|
|
compat_uptr_t compat;
|
|
|
|
|
|
|
|
if (get_user(compat, argv.ptr.compat + nr))
|
|
|
|
return ERR_PTR(-EFAULT);
|
2011-03-07 01:02:21 +08:00
|
|
|
|
2011-03-07 01:02:54 +08:00
|
|
|
return compat_ptr(compat);
|
|
|
|
}
|
|
|
|
#endif
|
|
|
|
|
|
|
|
if (get_user(native, argv.ptr.native + nr))
|
2011-03-07 01:02:21 +08:00
|
|
|
return ERR_PTR(-EFAULT);
|
|
|
|
|
2011-03-07 01:02:54 +08:00
|
|
|
return native;
|
2011-03-07 01:02:21 +08:00
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* count() counts the number of strings in array ARGV.
|
|
|
|
*/
|
2011-03-07 01:02:37 +08:00
|
|
|
static int count(struct user_arg_ptr argv, int max)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
int i = 0;
|
|
|
|
|
2011-03-07 01:02:54 +08:00
|
|
|
if (argv.ptr.native != NULL) {
|
2005-04-17 06:20:36 +08:00
|
|
|
for (;;) {
|
2011-03-07 01:02:21 +08:00
|
|
|
const char __user *p = get_user_arg_ptr(argv, i);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
if (!p)
|
|
|
|
break;
|
2011-03-07 01:02:21 +08:00
|
|
|
|
|
|
|
if (IS_ERR(p))
|
|
|
|
return -EFAULT;
|
|
|
|
|
2008-10-16 13:01:52 +08:00
|
|
|
if (i++ >= max)
|
2005-04-17 06:20:36 +08:00
|
|
|
return -E2BIG;
|
2010-09-08 10:37:06 +08:00
|
|
|
|
|
|
|
if (fatal_signal_pending(current))
|
|
|
|
return -ERESTARTNOHAND;
|
2005-04-17 06:20:36 +08:00
|
|
|
cond_resched();
|
|
|
|
}
|
|
|
|
}
|
|
|
|
return i;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2007-07-19 16:48:16 +08:00
|
|
|
* 'copy_strings()' copies argument/environment strings from the old
|
|
|
|
* processes's memory to the new process's stack. The call to get_user_pages()
|
|
|
|
* ensures the destination page is created and not swapped out.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2011-03-07 01:02:37 +08:00
|
|
|
static int copy_strings(int argc, struct user_arg_ptr argv,
|
2005-05-06 07:16:09 +08:00
|
|
|
struct linux_binprm *bprm)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
struct page *kmapped_page = NULL;
|
|
|
|
char *kaddr = NULL;
|
2007-07-19 16:48:16 +08:00
|
|
|
unsigned long kpos = 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
while (argc-- > 0) {
|
2010-08-18 06:52:56 +08:00
|
|
|
const char __user *str;
|
2005-04-17 06:20:36 +08:00
|
|
|
int len;
|
|
|
|
unsigned long pos;
|
|
|
|
|
2011-03-07 01:02:21 +08:00
|
|
|
ret = -EFAULT;
|
|
|
|
str = get_user_arg_ptr(argv, argc);
|
|
|
|
if (IS_ERR(str))
|
2005-04-17 06:20:36 +08:00
|
|
|
goto out;
|
|
|
|
|
2011-03-07 01:02:21 +08:00
|
|
|
len = strnlen_user(str, MAX_ARG_STRLEN);
|
|
|
|
if (!len)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
ret = -E2BIG;
|
|
|
|
if (!valid_arg_len(bprm, len))
|
2005-04-17 06:20:36 +08:00
|
|
|
goto out;
|
|
|
|
|
2007-07-19 16:48:16 +08:00
|
|
|
/* We're going to work our way backwords. */
|
2005-04-17 06:20:36 +08:00
|
|
|
pos = bprm->p;
|
2007-07-19 16:48:16 +08:00
|
|
|
str += len;
|
|
|
|
bprm->p -= len;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
while (len > 0) {
|
|
|
|
int offset, bytes_to_copy;
|
|
|
|
|
2010-09-08 10:37:06 +08:00
|
|
|
if (fatal_signal_pending(current)) {
|
|
|
|
ret = -ERESTARTNOHAND;
|
|
|
|
goto out;
|
|
|
|
}
|
2010-09-08 10:36:28 +08:00
|
|
|
cond_resched();
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
offset = pos % PAGE_SIZE;
|
2007-07-19 16:48:16 +08:00
|
|
|
if (offset == 0)
|
|
|
|
offset = PAGE_SIZE;
|
|
|
|
|
|
|
|
bytes_to_copy = offset;
|
|
|
|
if (bytes_to_copy > len)
|
|
|
|
bytes_to_copy = len;
|
|
|
|
|
|
|
|
offset -= bytes_to_copy;
|
|
|
|
pos -= bytes_to_copy;
|
|
|
|
str -= bytes_to_copy;
|
|
|
|
len -= bytes_to_copy;
|
|
|
|
|
|
|
|
if (!kmapped_page || kpos != (pos & PAGE_MASK)) {
|
|
|
|
struct page *page;
|
|
|
|
|
|
|
|
page = get_arg_page(bprm, pos, 1);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!page) {
|
2007-07-19 16:48:16 +08:00
|
|
|
ret = -E2BIG;
|
2005-04-17 06:20:36 +08:00
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
|
2007-07-19 16:48:16 +08:00
|
|
|
if (kmapped_page) {
|
|
|
|
flush_kernel_dcache_page(kmapped_page);
|
2005-04-17 06:20:36 +08:00
|
|
|
kunmap(kmapped_page);
|
2007-07-19 16:48:16 +08:00
|
|
|
put_arg_page(kmapped_page);
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
kmapped_page = page;
|
|
|
|
kaddr = kmap(kmapped_page);
|
2007-07-19 16:48:16 +08:00
|
|
|
kpos = pos & PAGE_MASK;
|
|
|
|
flush_arg_page(bprm, kpos, kmapped_page);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2007-07-19 16:48:16 +08:00
|
|
|
if (copy_from_user(kaddr+offset, str, bytes_to_copy)) {
|
2005-04-17 06:20:36 +08:00
|
|
|
ret = -EFAULT;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
ret = 0;
|
|
|
|
out:
|
2007-07-19 16:48:16 +08:00
|
|
|
if (kmapped_page) {
|
|
|
|
flush_kernel_dcache_page(kmapped_page);
|
2005-04-17 06:20:36 +08:00
|
|
|
kunmap(kmapped_page);
|
2007-07-19 16:48:16 +08:00
|
|
|
put_arg_page(kmapped_page);
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Like copy_strings, but get argv and its values from kernel memory.
|
|
|
|
*/
|
2011-03-07 01:02:37 +08:00
|
|
|
int copy_strings_kernel(int argc, const char *const *__argv,
|
2010-08-18 06:52:56 +08:00
|
|
|
struct linux_binprm *bprm)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
int r;
|
|
|
|
mm_segment_t oldfs = get_fs();
|
2011-03-07 01:02:37 +08:00
|
|
|
struct user_arg_ptr argv = {
|
2011-03-07 01:02:54 +08:00
|
|
|
.ptr.native = (const char __user *const __user *)__argv,
|
2011-03-07 01:02:37 +08:00
|
|
|
};
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
set_fs(KERNEL_DS);
|
2011-03-07 01:02:37 +08:00
|
|
|
r = copy_strings(argc, argv, bprm);
|
2005-04-17 06:20:36 +08:00
|
|
|
set_fs(oldfs);
|
2011-03-07 01:02:37 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
return r;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(copy_strings_kernel);
|
|
|
|
|
|
|
|
#ifdef CONFIG_MMU
|
2007-07-19 16:48:16 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
2007-07-19 16:48:16 +08:00
|
|
|
* During bprm_mm_init(), we create a temporary stack at STACK_TOP_MAX. Once
|
|
|
|
* the binfmt code determines where the new stack should reside, we shift it to
|
|
|
|
* its final location. The process proceeds as follows:
|
2005-04-17 06:20:36 +08:00
|
|
|
*
|
2007-07-19 16:48:16 +08:00
|
|
|
* 1) Use shift to calculate the new vma endpoints.
|
|
|
|
* 2) Extend vma to cover both the old and new ranges. This ensures the
|
|
|
|
* arguments passed to subsequent functions are consistent.
|
|
|
|
* 3) Move vma's page tables to the new range.
|
|
|
|
* 4) Free up any cleared pgd range.
|
|
|
|
* 5) Shrink the vma to cover only the new range.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2007-07-19 16:48:16 +08:00
|
|
|
static int shift_arg_pages(struct vm_area_struct *vma, unsigned long shift)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
struct mm_struct *mm = vma->vm_mm;
|
2007-07-19 16:48:16 +08:00
|
|
|
unsigned long old_start = vma->vm_start;
|
|
|
|
unsigned long old_end = vma->vm_end;
|
|
|
|
unsigned long length = old_end - old_start;
|
|
|
|
unsigned long new_start = old_start - shift;
|
|
|
|
unsigned long new_end = old_end - shift;
|
2011-05-25 08:11:45 +08:00
|
|
|
struct mmu_gather tlb;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-07-19 16:48:16 +08:00
|
|
|
BUG_ON(new_start > new_end);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-07-19 16:48:16 +08:00
|
|
|
/*
|
|
|
|
* ensure there are no vmas between where we want to go
|
|
|
|
* and where we are
|
|
|
|
*/
|
|
|
|
if (vma != find_vma(mm, new_start))
|
|
|
|
return -EFAULT;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* cover the whole range: [new_start, old_end)
|
|
|
|
*/
|
mm: change anon_vma linking to fix multi-process server scalability issue
The old anon_vma code can lead to scalability issues with heavily forking
workloads. Specifically, each anon_vma will be shared between the parent
process and all its child processes.
In a workload with 1000 child processes and a VMA with 1000 anonymous
pages per process that get COWed, this leads to a system with a million
anonymous pages in the same anon_vma, each of which is mapped in just one
of the 1000 processes. However, the current rmap code needs to walk them
all, leading to O(N) scanning complexity for each page.
This can result in systems where one CPU is walking the page tables of
1000 processes in page_referenced_one, while all other CPUs are stuck on
the anon_vma lock. This leads to catastrophic failure for a benchmark
like AIM7, where the total number of processes can reach in the tens of
thousands. Real workloads are still a factor 10 less process intensive
than AIM7, but they are catching up.
This patch changes the way anon_vmas and VMAs are linked, which allows us
to associate multiple anon_vmas with a VMA. At fork time, each child
process gets its own anon_vmas, in which its COWed pages will be
instantiated. The parents' anon_vma is also linked to the VMA, because
non-COWed pages could be present in any of the children.
This reduces rmap scanning complexity to O(1) for the pages of the 1000
child processes, with O(N) complexity for at most 1/N pages in the system.
This reduces the average scanning cost in heavily forking workloads from
O(N) to 2.
The only real complexity in this patch stems from the fact that linking a
VMA to anon_vmas now involves memory allocations. This means vma_adjust
can fail, if it needs to attach a VMA to anon_vma structures. This in
turn means error handling needs to be added to the calling functions.
A second source of complexity is that, because there can be multiple
anon_vmas, the anon_vma linking in vma_adjust can no longer be done under
"the" anon_vma lock. To prevent the rmap code from walking up an
incomplete VMA, this patch introduces the VM_LOCK_RMAP VMA flag. This bit
flag uses the same slot as the NOMMU VM_MAPPED_COPY, with an ifdef in mm.h
to make sure it is impossible to compile a kernel that needs both symbolic
values for the same bitflag.
Some test results:
Without the anon_vma changes, when AIM7 hits around 9.7k users (on a test
box with 16GB RAM and not quite enough IO), the system ends up running
>99% in system time, with every CPU on the same anon_vma lock in the
pageout code.
With these changes, AIM7 hits the cross-over point around 29.7k users.
This happens with ~99% IO wait time, there never seems to be any spike in
system time. The anon_vma lock contention appears to be resolved.
[akpm@linux-foundation.org: cleanups]
Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-03-06 05:42:07 +08:00
|
|
|
if (vma_adjust(vma, new_start, old_end, vma->vm_pgoff, NULL))
|
|
|
|
return -ENOMEM;
|
2007-07-19 16:48:16 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* move the page tables downwards, on failure we rely on
|
|
|
|
* process cleanup to remove whatever mess we made.
|
|
|
|
*/
|
|
|
|
if (length != move_page_tables(vma, old_start,
|
|
|
|
vma, new_start, length))
|
|
|
|
return -ENOMEM;
|
|
|
|
|
|
|
|
lru_add_drain();
|
2011-05-25 08:11:45 +08:00
|
|
|
tlb_gather_mmu(&tlb, mm, 0);
|
2007-07-19 16:48:16 +08:00
|
|
|
if (new_end > old_start) {
|
|
|
|
/*
|
|
|
|
* when the old and new regions overlap clear from new_end.
|
|
|
|
*/
|
2011-05-25 08:11:45 +08:00
|
|
|
free_pgd_range(&tlb, new_end, old_end, new_end,
|
2007-07-19 16:48:16 +08:00
|
|
|
vma->vm_next ? vma->vm_next->vm_start : 0);
|
|
|
|
} else {
|
|
|
|
/*
|
|
|
|
* otherwise, clean from old_start; this is done to not touch
|
|
|
|
* the address space in [new_end, old_start) some architectures
|
|
|
|
* have constraints on va-space that make this illegal (IA64) -
|
|
|
|
* for the others its just a little faster.
|
|
|
|
*/
|
2011-05-25 08:11:45 +08:00
|
|
|
free_pgd_range(&tlb, old_start, old_end, new_end,
|
2007-07-19 16:48:16 +08:00
|
|
|
vma->vm_next ? vma->vm_next->vm_start : 0);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2011-05-25 08:11:45 +08:00
|
|
|
tlb_finish_mmu(&tlb, new_end, old_end);
|
2007-07-19 16:48:16 +08:00
|
|
|
|
|
|
|
/*
|
mm: change anon_vma linking to fix multi-process server scalability issue
The old anon_vma code can lead to scalability issues with heavily forking
workloads. Specifically, each anon_vma will be shared between the parent
process and all its child processes.
In a workload with 1000 child processes and a VMA with 1000 anonymous
pages per process that get COWed, this leads to a system with a million
anonymous pages in the same anon_vma, each of which is mapped in just one
of the 1000 processes. However, the current rmap code needs to walk them
all, leading to O(N) scanning complexity for each page.
This can result in systems where one CPU is walking the page tables of
1000 processes in page_referenced_one, while all other CPUs are stuck on
the anon_vma lock. This leads to catastrophic failure for a benchmark
like AIM7, where the total number of processes can reach in the tens of
thousands. Real workloads are still a factor 10 less process intensive
than AIM7, but they are catching up.
This patch changes the way anon_vmas and VMAs are linked, which allows us
to associate multiple anon_vmas with a VMA. At fork time, each child
process gets its own anon_vmas, in which its COWed pages will be
instantiated. The parents' anon_vma is also linked to the VMA, because
non-COWed pages could be present in any of the children.
This reduces rmap scanning complexity to O(1) for the pages of the 1000
child processes, with O(N) complexity for at most 1/N pages in the system.
This reduces the average scanning cost in heavily forking workloads from
O(N) to 2.
The only real complexity in this patch stems from the fact that linking a
VMA to anon_vmas now involves memory allocations. This means vma_adjust
can fail, if it needs to attach a VMA to anon_vma structures. This in
turn means error handling needs to be added to the calling functions.
A second source of complexity is that, because there can be multiple
anon_vmas, the anon_vma linking in vma_adjust can no longer be done under
"the" anon_vma lock. To prevent the rmap code from walking up an
incomplete VMA, this patch introduces the VM_LOCK_RMAP VMA flag. This bit
flag uses the same slot as the NOMMU VM_MAPPED_COPY, with an ifdef in mm.h
to make sure it is impossible to compile a kernel that needs both symbolic
values for the same bitflag.
Some test results:
Without the anon_vma changes, when AIM7 hits around 9.7k users (on a test
box with 16GB RAM and not quite enough IO), the system ends up running
>99% in system time, with every CPU on the same anon_vma lock in the
pageout code.
With these changes, AIM7 hits the cross-over point around 29.7k users.
This happens with ~99% IO wait time, there never seems to be any spike in
system time. The anon_vma lock contention appears to be resolved.
[akpm@linux-foundation.org: cleanups]
Signed-off-by: Rik van Riel <riel@redhat.com>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Larry Woodman <lwoodman@redhat.com>
Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
Cc: Minchan Kim <minchan.kim@gmail.com>
Cc: Andrea Arcangeli <aarcange@redhat.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-03-06 05:42:07 +08:00
|
|
|
* Shrink the vma to just the new range. Always succeeds.
|
2007-07-19 16:48:16 +08:00
|
|
|
*/
|
|
|
|
vma_adjust(vma, new_start, new_end, vma->vm_pgoff, NULL);
|
|
|
|
|
|
|
|
return 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2007-07-19 16:48:16 +08:00
|
|
|
/*
|
|
|
|
* Finalizes the stack vm_area_struct. The flags and permissions are updated,
|
|
|
|
* the stack is optionally relocated, and some extra space is added.
|
|
|
|
*/
|
2005-04-17 06:20:36 +08:00
|
|
|
int setup_arg_pages(struct linux_binprm *bprm,
|
|
|
|
unsigned long stack_top,
|
|
|
|
int executable_stack)
|
|
|
|
{
|
2007-07-19 16:48:16 +08:00
|
|
|
unsigned long ret;
|
|
|
|
unsigned long stack_shift;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct mm_struct *mm = current->mm;
|
2007-07-19 16:48:16 +08:00
|
|
|
struct vm_area_struct *vma = bprm->vma;
|
|
|
|
struct vm_area_struct *prev = NULL;
|
|
|
|
unsigned long vm_flags;
|
|
|
|
unsigned long stack_base;
|
2010-02-11 05:56:42 +08:00
|
|
|
unsigned long stack_size;
|
|
|
|
unsigned long stack_expand;
|
|
|
|
unsigned long rlim_stack;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
#ifdef CONFIG_STACK_GROWSUP
|
|
|
|
/* Limit stack size to 1GB */
|
2010-03-06 05:42:42 +08:00
|
|
|
stack_base = rlimit_max(RLIMIT_STACK);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (stack_base > (1 << 30))
|
|
|
|
stack_base = 1 << 30;
|
|
|
|
|
2007-07-19 16:48:16 +08:00
|
|
|
/* Make sure we didn't let the argument array grow too large. */
|
|
|
|
if (vma->vm_end - vma->vm_start > stack_base)
|
|
|
|
return -ENOMEM;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-07-19 16:48:16 +08:00
|
|
|
stack_base = PAGE_ALIGN(stack_top - stack_base);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-07-19 16:48:16 +08:00
|
|
|
stack_shift = vma->vm_start - stack_base;
|
|
|
|
mm->arg_start = bprm->p - stack_shift;
|
|
|
|
bprm->p = vma->vm_end - stack_shift;
|
2005-04-17 06:20:36 +08:00
|
|
|
#else
|
2007-07-19 16:48:16 +08:00
|
|
|
stack_top = arch_align_stack(stack_top);
|
|
|
|
stack_top = PAGE_ALIGN(stack_top);
|
2010-09-08 10:35:49 +08:00
|
|
|
|
|
|
|
if (unlikely(stack_top < mmap_min_addr) ||
|
|
|
|
unlikely(vma->vm_end - vma->vm_start >= stack_top - mmap_min_addr))
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2007-07-19 16:48:16 +08:00
|
|
|
stack_shift = vma->vm_end - stack_top;
|
|
|
|
|
|
|
|
bprm->p -= stack_shift;
|
2005-04-17 06:20:36 +08:00
|
|
|
mm->arg_start = bprm->p;
|
|
|
|
#endif
|
|
|
|
|
|
|
|
if (bprm->loader)
|
2007-07-19 16:48:16 +08:00
|
|
|
bprm->loader -= stack_shift;
|
|
|
|
bprm->exec -= stack_shift;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
down_write(&mm->mmap_sem);
|
2008-07-11 04:19:20 +08:00
|
|
|
vm_flags = VM_STACK_FLAGS;
|
2007-07-19 16:48:16 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Adjust stack execute permissions; explicitly enable for
|
|
|
|
* EXSTACK_ENABLE_X, disable for EXSTACK_DISABLE_X and leave alone
|
|
|
|
* (arch default) otherwise.
|
|
|
|
*/
|
|
|
|
if (unlikely(executable_stack == EXSTACK_ENABLE_X))
|
|
|
|
vm_flags |= VM_EXEC;
|
|
|
|
else if (executable_stack == EXSTACK_DISABLE_X)
|
|
|
|
vm_flags &= ~VM_EXEC;
|
|
|
|
vm_flags |= mm->def_flags;
|
2010-05-25 05:32:24 +08:00
|
|
|
vm_flags |= VM_STACK_INCOMPLETE_SETUP;
|
2007-07-19 16:48:16 +08:00
|
|
|
|
|
|
|
ret = mprotect_fixup(vma, &prev, vma->vm_start, vma->vm_end,
|
|
|
|
vm_flags);
|
|
|
|
if (ret)
|
|
|
|
goto out_unlock;
|
|
|
|
BUG_ON(prev != vma);
|
|
|
|
|
|
|
|
/* Move stack pages down in memory. */
|
|
|
|
if (stack_shift) {
|
|
|
|
ret = shift_arg_pages(vma, stack_shift);
|
2009-11-12 06:26:48 +08:00
|
|
|
if (ret)
|
|
|
|
goto out_unlock;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2010-05-25 05:32:24 +08:00
|
|
|
/* mprotect_fixup is overkill to remove the temporary stack flags */
|
|
|
|
vma->vm_flags &= ~VM_STACK_INCOMPLETE_SETUP;
|
|
|
|
|
2010-03-06 05:42:57 +08:00
|
|
|
stack_expand = 131072UL; /* randomly 32*4k (or 2*64k) pages */
|
2010-02-11 05:56:42 +08:00
|
|
|
stack_size = vma->vm_end - vma->vm_start;
|
|
|
|
/*
|
|
|
|
* Align this down to a page boundary as expand_stack
|
|
|
|
* will align it up.
|
|
|
|
*/
|
|
|
|
rlim_stack = rlimit(RLIMIT_STACK) & PAGE_MASK;
|
2007-07-19 16:48:16 +08:00
|
|
|
#ifdef CONFIG_STACK_GROWSUP
|
2010-02-11 05:56:42 +08:00
|
|
|
if (stack_size + stack_expand > rlim_stack)
|
|
|
|
stack_base = vma->vm_start + rlim_stack;
|
|
|
|
else
|
|
|
|
stack_base = vma->vm_end + stack_expand;
|
2007-07-19 16:48:16 +08:00
|
|
|
#else
|
2010-02-11 05:56:42 +08:00
|
|
|
if (stack_size + stack_expand > rlim_stack)
|
|
|
|
stack_base = vma->vm_end - rlim_stack;
|
|
|
|
else
|
|
|
|
stack_base = vma->vm_start - stack_expand;
|
2007-07-19 16:48:16 +08:00
|
|
|
#endif
|
2010-05-18 22:30:49 +08:00
|
|
|
current->mm->start_stack = bprm->p;
|
2007-07-19 16:48:16 +08:00
|
|
|
ret = expand_stack(vma, stack_base);
|
|
|
|
if (ret)
|
|
|
|
ret = -EFAULT;
|
|
|
|
|
|
|
|
out_unlock:
|
2005-04-17 06:20:36 +08:00
|
|
|
up_write(&mm->mmap_sem);
|
2009-11-12 06:26:48 +08:00
|
|
|
return ret;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(setup_arg_pages);
|
|
|
|
|
|
|
|
#endif /* CONFIG_MMU */
|
|
|
|
|
|
|
|
struct file *open_exec(const char *name)
|
|
|
|
{
|
|
|
|
struct file *file;
|
2008-05-19 13:53:34 +08:00
|
|
|
int err;
|
2011-02-24 06:44:09 +08:00
|
|
|
static const struct open_flags open_exec_flags = {
|
|
|
|
.open_flag = O_LARGEFILE | O_RDONLY | __FMODE_EXEC,
|
|
|
|
.acc_mode = MAY_EXEC | MAY_OPEN,
|
|
|
|
.intent = LOOKUP_OPEN
|
|
|
|
};
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-02-24 06:44:09 +08:00
|
|
|
file = do_filp_open(AT_FDCWD, name, &open_exec_flags, LOOKUP_FOLLOW);
|
2009-04-06 23:16:22 +08:00
|
|
|
if (IS_ERR(file))
|
2008-05-19 13:53:34 +08:00
|
|
|
goto out;
|
|
|
|
|
|
|
|
err = -EACCES;
|
2009-04-06 23:16:22 +08:00
|
|
|
if (!S_ISREG(file->f_path.dentry->d_inode->i_mode))
|
|
|
|
goto exit;
|
2008-05-19 13:53:34 +08:00
|
|
|
|
2009-04-06 23:16:22 +08:00
|
|
|
if (file->f_path.mnt->mnt_flags & MNT_NOEXEC)
|
|
|
|
goto exit;
|
2008-05-19 13:53:34 +08:00
|
|
|
|
2009-12-18 10:24:21 +08:00
|
|
|
fsnotify_open(file);
|
2008-12-18 02:53:20 +08:00
|
|
|
|
2008-05-19 13:53:34 +08:00
|
|
|
err = deny_write_access(file);
|
2009-04-06 23:16:22 +08:00
|
|
|
if (err)
|
|
|
|
goto exit;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-04-06 23:16:22 +08:00
|
|
|
out:
|
2008-05-19 13:53:34 +08:00
|
|
|
return file;
|
|
|
|
|
2009-04-06 23:16:22 +08:00
|
|
|
exit:
|
|
|
|
fput(file);
|
2008-05-19 13:53:34 +08:00
|
|
|
return ERR_PTR(err);
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
EXPORT_SYMBOL(open_exec);
|
|
|
|
|
2009-08-22 02:32:48 +08:00
|
|
|
int kernel_read(struct file *file, loff_t offset,
|
|
|
|
char *addr, unsigned long count)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
mm_segment_t old_fs;
|
|
|
|
loff_t pos = offset;
|
|
|
|
int result;
|
|
|
|
|
|
|
|
old_fs = get_fs();
|
|
|
|
set_fs(get_ds());
|
|
|
|
/* The cast to a user pointer is valid due to the set_fs() */
|
|
|
|
result = vfs_read(file, (void __user *)addr, count, &pos);
|
|
|
|
set_fs(old_fs);
|
|
|
|
return result;
|
|
|
|
}
|
|
|
|
|
|
|
|
EXPORT_SYMBOL(kernel_read);
|
|
|
|
|
|
|
|
static int exec_mmap(struct mm_struct *mm)
|
|
|
|
{
|
|
|
|
struct task_struct *tsk;
|
|
|
|
struct mm_struct * old_mm, *active_mm;
|
|
|
|
|
|
|
|
/* Notify parent that we're no longer interested in the old VM */
|
|
|
|
tsk = current;
|
|
|
|
old_mm = current->mm;
|
2010-03-06 05:41:40 +08:00
|
|
|
sync_mm_rss(tsk, old_mm);
|
2005-04-17 06:20:36 +08:00
|
|
|
mm_release(tsk, old_mm);
|
|
|
|
|
|
|
|
if (old_mm) {
|
|
|
|
/*
|
|
|
|
* Make sure that if there is a core dump in progress
|
|
|
|
* for the old mm, we get out and die instead of going
|
|
|
|
* through with the exec. We must hold mmap_sem around
|
2008-07-25 16:47:41 +08:00
|
|
|
* checking core_state and changing tsk->mm.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
|
|
|
down_read(&old_mm->mmap_sem);
|
2008-07-25 16:47:41 +08:00
|
|
|
if (unlikely(old_mm->core_state)) {
|
2005-04-17 06:20:36 +08:00
|
|
|
up_read(&old_mm->mmap_sem);
|
|
|
|
return -EINTR;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
task_lock(tsk);
|
|
|
|
active_mm = tsk->active_mm;
|
|
|
|
tsk->mm = mm;
|
|
|
|
tsk->active_mm = mm;
|
|
|
|
activate_mm(active_mm, mm);
|
2010-10-27 05:21:23 +08:00
|
|
|
if (old_mm && tsk->signal->oom_score_adj == OOM_SCORE_ADJ_MIN) {
|
|
|
|
atomic_dec(&old_mm->oom_disable_count);
|
|
|
|
atomic_inc(&tsk->mm->oom_disable_count);
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
task_unlock(tsk);
|
|
|
|
arch_pick_mmap_layout(mm);
|
|
|
|
if (old_mm) {
|
|
|
|
up_read(&old_mm->mmap_sem);
|
2006-04-01 07:13:38 +08:00
|
|
|
BUG_ON(active_mm != old_mm);
|
mm owner: fix race between swapoff and exit
There's a race between mm->owner assignment and swapoff, more easily
seen when task slab poisoning is turned on. The condition occurs when
try_to_unuse() runs in parallel with an exiting task. A similar race
can occur with callers of get_task_mm(), such as /proc/<pid>/<mmstats>
or ptrace or page migration.
CPU0 CPU1
try_to_unuse
looks at mm = task0->mm
increments mm->mm_users
task 0 exits
mm->owner needs to be updated, but no
new owner is found (mm_users > 1, but
no other task has task->mm = task0->mm)
mm_update_next_owner() leaves
mmput(mm) decrements mm->mm_users
task0 freed
dereferencing mm->owner fails
The fix is to notify the subsystem via mm_owner_changed callback(),
if no new owner is found, by specifying the new task as NULL.
Jiri Slaby:
mm->owner was set to NULL prior to calling cgroup_mm_owner_callbacks(), but
must be set after that, so as not to pass NULL as old owner causing oops.
Daisuke Nishimura:
mm_update_next_owner() may set mm->owner to NULL, but mem_cgroup_from_task()
and its callers need to take account of this situation to avoid oops.
Hugh Dickins:
Lockdep warning and hang below exec_mmap() when testing these patches.
exit_mm() up_reads mmap_sem before calling mm_update_next_owner(),
so exec_mmap() now needs to do the same. And with that repositioning,
there's now no point in mm_need_new_owner() allowing for NULL mm.
Reported-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Balbir Singh <balbir@linux.vnet.ibm.com>
Signed-off-by: Jiri Slaby <jirislaby@gmail.com>
Signed-off-by: Daisuke Nishimura <nishimura@mxp.nes.nec.co.jp>
Signed-off-by: Hugh Dickins <hugh@veritas.com>
Cc: KAMEZAWA Hiroyuki <kamezawa.hiroyu@jp.fujitsu.com>
Cc: Paul Menage <menage@google.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-09-29 06:09:31 +08:00
|
|
|
mm_update_next_owner(old_mm);
|
2005-04-17 06:20:36 +08:00
|
|
|
mmput(old_mm);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
mmdrop(active_mm);
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* This function makes sure the current process has its own signal table,
|
|
|
|
* so that flush_signal_handlers can later reset the handlers without
|
|
|
|
* disturbing other processes. (Other processes might share the signal
|
|
|
|
* table via the CLONE_SIGHAND option to clone().)
|
|
|
|
*/
|
2006-01-15 05:20:43 +08:00
|
|
|
static int de_thread(struct task_struct *tsk)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
struct signal_struct *sig = tsk->signal;
|
2007-10-17 14:27:22 +08:00
|
|
|
struct sighand_struct *oldsighand = tsk->sighand;
|
2005-04-17 06:20:36 +08:00
|
|
|
spinlock_t *lock = &oldsighand->siglock;
|
|
|
|
|
2006-09-27 16:51:13 +08:00
|
|
|
if (thread_group_empty(tsk))
|
2005-04-17 06:20:36 +08:00
|
|
|
goto no_thread_group;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Kill all other threads in the thread group.
|
|
|
|
*/
|
|
|
|
spin_lock_irq(lock);
|
2008-02-05 14:27:24 +08:00
|
|
|
if (signal_group_exit(sig)) {
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Another group action in progress, just
|
|
|
|
* return so that the signal is processed.
|
|
|
|
*/
|
|
|
|
spin_unlock_irq(lock);
|
|
|
|
return -EAGAIN;
|
|
|
|
}
|
2010-05-27 05:43:11 +08:00
|
|
|
|
2008-02-05 14:27:24 +08:00
|
|
|
sig->group_exit_task = tsk;
|
2010-05-27 05:43:11 +08:00
|
|
|
sig->notify_count = zap_other_threads(tsk);
|
|
|
|
if (!thread_group_leader(tsk))
|
|
|
|
sig->notify_count--;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2010-05-27 05:43:11 +08:00
|
|
|
while (sig->notify_count) {
|
2005-04-17 06:20:36 +08:00
|
|
|
__set_current_state(TASK_UNINTERRUPTIBLE);
|
|
|
|
spin_unlock_irq(lock);
|
|
|
|
schedule();
|
|
|
|
spin_lock_irq(lock);
|
|
|
|
}
|
|
|
|
spin_unlock_irq(lock);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* At this point all other threads have exited, all we have to
|
|
|
|
* do is to wait for the thread group leader to become inactive,
|
|
|
|
* and to assume its PID:
|
|
|
|
*/
|
2006-09-27 16:51:13 +08:00
|
|
|
if (!thread_group_leader(tsk)) {
|
2008-12-02 06:18:16 +08:00
|
|
|
struct task_struct *leader = tsk->group_leader;
|
2007-10-17 14:27:23 +08:00
|
|
|
|
2008-04-30 15:53:12 +08:00
|
|
|
sig->notify_count = -1; /* for exit_notify() */
|
2007-10-17 14:27:23 +08:00
|
|
|
for (;;) {
|
|
|
|
write_lock_irq(&tasklist_lock);
|
|
|
|
if (likely(leader->exit_state))
|
|
|
|
break;
|
|
|
|
__set_current_state(TASK_UNINTERRUPTIBLE);
|
|
|
|
write_unlock_irq(&tasklist_lock);
|
|
|
|
schedule();
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-04-11 13:54:16 +08:00
|
|
|
/*
|
|
|
|
* The only record we have of the real-time age of a
|
|
|
|
* process, regardless of execs it's done, is start_time.
|
|
|
|
* All the past CPU time is accumulated in signal_struct
|
|
|
|
* from sister threads now dead. But in this non-leader
|
|
|
|
* exec, nothing survives from the original leader thread,
|
|
|
|
* whose birth marks the true age of this process now.
|
|
|
|
* When we take on its identity by switching to its PID, we
|
|
|
|
* also take its birthdate (always earlier than our own).
|
|
|
|
*/
|
2006-09-27 16:51:13 +08:00
|
|
|
tsk->start_time = leader->start_time;
|
2006-04-11 13:54:16 +08:00
|
|
|
|
2007-10-19 14:40:18 +08:00
|
|
|
BUG_ON(!same_thread_group(leader, tsk));
|
|
|
|
BUG_ON(has_group_leader_pid(tsk));
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* An exec() starts a new thread group with the
|
|
|
|
* TGID of the previous thread group. Rehash the
|
|
|
|
* two threads with a switched PID, and release
|
|
|
|
* the former thread group leader:
|
|
|
|
*/
|
2006-03-29 08:11:03 +08:00
|
|
|
|
|
|
|
/* Become a process group leader with the old leader's pid.
|
2006-09-27 16:51:06 +08:00
|
|
|
* The old leader becomes a thread of the this thread group.
|
|
|
|
* Note: The old leader also uses this pid until release_task
|
2006-03-29 08:11:03 +08:00
|
|
|
* is called. Odd but simple and correct.
|
|
|
|
*/
|
2006-09-27 16:51:13 +08:00
|
|
|
detach_pid(tsk, PIDTYPE_PID);
|
|
|
|
tsk->pid = leader->pid;
|
2007-10-19 14:39:51 +08:00
|
|
|
attach_pid(tsk, PIDTYPE_PID, task_pid(leader));
|
2006-09-27 16:51:13 +08:00
|
|
|
transfer_pid(leader, tsk, PIDTYPE_PGID);
|
|
|
|
transfer_pid(leader, tsk, PIDTYPE_SID);
|
2009-12-18 07:27:15 +08:00
|
|
|
|
2006-09-27 16:51:13 +08:00
|
|
|
list_replace_rcu(&leader->tasks, &tsk->tasks);
|
2009-12-18 07:27:15 +08:00
|
|
|
list_replace_init(&leader->sibling, &tsk->sibling);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2006-09-27 16:51:13 +08:00
|
|
|
tsk->group_leader = tsk;
|
|
|
|
leader->group_leader = tsk;
|
2006-04-11 07:16:49 +08:00
|
|
|
|
2006-09-27 16:51:13 +08:00
|
|
|
tsk->exit_signal = SIGCHLD;
|
2011-06-23 05:10:26 +08:00
|
|
|
leader->exit_signal = -1;
|
2005-11-24 05:37:43 +08:00
|
|
|
|
|
|
|
BUG_ON(leader->exit_state != EXIT_ZOMBIE);
|
|
|
|
leader->exit_state = EXIT_DEAD;
|
ptrace: do_wait(traced_leader_killed_by_mt_exec) can block forever
Test-case:
void *tfunc(void *arg)
{
execvp("true", NULL);
return NULL;
}
int main(void)
{
int pid;
if (fork()) {
pthread_t t;
kill(getpid(), SIGSTOP);
pthread_create(&t, NULL, tfunc, NULL);
for (;;)
pause();
}
pid = getppid();
assert(ptrace(PTRACE_ATTACH, pid, 0,0) == 0);
while (wait(NULL) > 0)
ptrace(PTRACE_CONT, pid, 0,0);
return 0;
}
It is racy, exit_notify() does __wake_up_parent() too. But in the
likely case it triggers the problem: de_thread() does release_task()
and the old leader goes away without the notification, the tracer
sleeps in do_wait() without children/tracees.
Change de_thread() to do __wake_up_parent(traced_leader->parent).
Since it is already EXIT_DEAD we can do this without ptrace_unlink(),
EXIT_DEAD threads do not exist from do_wait's pov.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Tejun Heo <tj@kernel.org>
2011-07-22 02:00:43 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* We are going to release_task()->ptrace_unlink() silently,
|
|
|
|
* the tracer can sleep in do_wait(). EXIT_DEAD guarantees
|
|
|
|
* the tracer wont't block again waiting for this thread.
|
|
|
|
*/
|
|
|
|
if (unlikely(leader->ptrace))
|
|
|
|
__wake_up_parent(leader, leader->parent);
|
2005-04-17 06:20:36 +08:00
|
|
|
write_unlock_irq(&tasklist_lock);
|
2008-12-02 06:18:16 +08:00
|
|
|
|
|
|
|
release_task(leader);
|
2008-02-05 14:27:24 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-10-17 14:27:23 +08:00
|
|
|
sig->group_exit_task = NULL;
|
|
|
|
sig->notify_count = 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
no_thread_group:
|
getrusage: fill ru_maxrss value
Make ->ru_maxrss value in struct rusage filled accordingly to rss hiwater
mark. This struct is filled as a parameter to getrusage syscall.
->ru_maxrss value is set to KBs which is the way it is done in BSD
systems. /usr/bin/time (gnu time) application converts ->ru_maxrss to KBs
which seems to be incorrect behavior. Maintainer of this util was
notified by me with the patch which corrects it and cc'ed.
To make this happen we extend struct signal_struct by two fields. The
first one is ->maxrss which we use to store rss hiwater of the task. The
second one is ->cmaxrss which we use to store highest rss hiwater of all
task childs. These values are used in k_getrusage() to actually fill
->ru_maxrss. k_getrusage() uses current rss hiwater value directly if mm
struct exists.
Note:
exec() clear mm->hiwater_rss, but doesn't clear sig->maxrss.
it is intetionally behavior. *BSD getrusage have exec() inheriting.
test programs
========================================================
getrusage.c
===========
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <signal.h>
#include <sys/mman.h>
#include "common.h"
#define err(str) perror(str), exit(1)
int main(int argc, char** argv)
{
int status;
printf("allocate 100MB\n");
consume(100);
printf("testcase1: fork inherit? \n");
printf(" expect: initial.self ~= child.self\n");
show_rusage("initial");
if (__fork()) {
wait(&status);
} else {
show_rusage("fork child");
_exit(0);
}
printf("\n");
printf("testcase2: fork inherit? (cont.) \n");
printf(" expect: initial.children ~= 100MB, but child.children = 0\n");
show_rusage("initial");
if (__fork()) {
wait(&status);
} else {
show_rusage("child");
_exit(0);
}
printf("\n");
printf("testcase3: fork + malloc \n");
printf(" expect: child.self ~= initial.self + 50MB\n");
show_rusage("initial");
if (__fork()) {
wait(&status);
} else {
printf("allocate +50MB\n");
consume(50);
show_rusage("fork child");
_exit(0);
}
printf("\n");
printf("testcase4: grandchild maxrss\n");
printf(" expect: post_wait.children ~= 300MB\n");
show_rusage("initial");
if (__fork()) {
wait(&status);
show_rusage("post_wait");
} else {
system("./child -n 0 -g 300");
_exit(0);
}
printf("\n");
printf("testcase5: zombie\n");
printf(" expect: pre_wait ~= initial, IOW the zombie process is not accounted.\n");
printf(" post_wait ~= 400MB, IOW wait() collect child's max_rss. \n");
show_rusage("initial");
if (__fork()) {
sleep(1); /* children become zombie */
show_rusage("pre_wait");
wait(&status);
show_rusage("post_wait");
} else {
system("./child -n 400");
_exit(0);
}
printf("\n");
printf("testcase6: SIG_IGN\n");
printf(" expect: initial ~= after_zombie (child's 500MB alloc should be ignored).\n");
show_rusage("initial");
signal(SIGCHLD, SIG_IGN);
if (__fork()) {
sleep(1); /* children become zombie */
show_rusage("after_zombie");
} else {
system("./child -n 500");
_exit(0);
}
printf("\n");
signal(SIGCHLD, SIG_DFL);
printf("testcase7: exec (without fork) \n");
printf(" expect: initial ~= exec \n");
show_rusage("initial");
execl("./child", "child", "-v", NULL);
return 0;
}
child.c
=======
#include <sys/types.h>
#include <unistd.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/resource.h>
#include "common.h"
int main(int argc, char** argv)
{
int status;
int c;
long consume_size = 0;
long grandchild_consume_size = 0;
int show = 0;
while ((c = getopt(argc, argv, "n:g:v")) != -1) {
switch (c) {
case 'n':
consume_size = atol(optarg);
break;
case 'v':
show = 1;
break;
case 'g':
grandchild_consume_size = atol(optarg);
break;
default:
break;
}
}
if (show)
show_rusage("exec");
if (consume_size) {
printf("child alloc %ldMB\n", consume_size);
consume(consume_size);
}
if (grandchild_consume_size) {
if (fork()) {
wait(&status);
} else {
printf("grandchild alloc %ldMB\n", grandchild_consume_size);
consume(grandchild_consume_size);
exit(0);
}
}
return 0;
}
common.c
========
#include <stdio.h>
#include <stdlib.h>
#include <string.h>
#include <sys/types.h>
#include <sys/time.h>
#include <sys/resource.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <unistd.h>
#include <signal.h>
#include <sys/mman.h>
#include "common.h"
#define err(str) perror(str), exit(1)
void show_rusage(char *prefix)
{
int err, err2;
struct rusage rusage_self;
struct rusage rusage_children;
printf("%s: ", prefix);
err = getrusage(RUSAGE_SELF, &rusage_self);
if (!err)
printf("self %ld ", rusage_self.ru_maxrss);
err2 = getrusage(RUSAGE_CHILDREN, &rusage_children);
if (!err2)
printf("children %ld ", rusage_children.ru_maxrss);
printf("\n");
}
/* Some buggy OS need this worthless CPU waste. */
void make_pagefault(void)
{
void *addr;
int size = getpagesize();
int i;
for (i=0; i<1000; i++) {
addr = mmap(NULL, size, PROT_READ | PROT_WRITE, MAP_PRIVATE | MAP_ANON, -1, 0);
if (addr == MAP_FAILED)
err("make_pagefault");
memset(addr, 0, size);
munmap(addr, size);
}
}
void consume(int mega)
{
size_t sz = mega * 1024 * 1024;
void *ptr;
ptr = malloc(sz);
memset(ptr, 0, sz);
make_pagefault();
}
pid_t __fork(void)
{
pid_t pid;
pid = fork();
make_pagefault();
return pid;
}
common.h
========
void show_rusage(char *prefix);
void make_pagefault(void);
void consume(int mega);
pid_t __fork(void);
FreeBSD result (expected result)
========================================================
allocate 100MB
testcase1: fork inherit?
expect: initial.self ~= child.self
initial: self 103492 children 0
fork child: self 103540 children 0
testcase2: fork inherit? (cont.)
expect: initial.children ~= 100MB, but child.children = 0
initial: self 103540 children 103540
child: self 103564 children 0
testcase3: fork + malloc
expect: child.self ~= initial.self + 50MB
initial: self 103564 children 103564
allocate +50MB
fork child: self 154860 children 0
testcase4: grandchild maxrss
expect: post_wait.children ~= 300MB
initial: self 103564 children 154860
grandchild alloc 300MB
post_wait: self 103564 children 308720
testcase5: zombie
expect: pre_wait ~= initial, IOW the zombie process is not accounted.
post_wait ~= 400MB, IOW wait() collect child's max_rss.
initial: self 103564 children 308720
child alloc 400MB
pre_wait: self 103564 children 308720
post_wait: self 103564 children 411312
testcase6: SIG_IGN
expect: initial ~= after_zombie (child's 500MB alloc should be ignored).
initial: self 103564 children 411312
child alloc 500MB
after_zombie: self 103624 children 411312
testcase7: exec (without fork)
expect: initial ~= exec
initial: self 103624 children 411312
exec: self 103624 children 411312
Linux result (actual test result)
========================================================
allocate 100MB
testcase1: fork inherit?
expect: initial.self ~= child.self
initial: self 102848 children 0
fork child: self 102572 children 0
testcase2: fork inherit? (cont.)
expect: initial.children ~= 100MB, but child.children = 0
initial: self 102876 children 102644
child: self 102572 children 0
testcase3: fork + malloc
expect: child.self ~= initial.self + 50MB
initial: self 102876 children 102644
allocate +50MB
fork child: self 153804 children 0
testcase4: grandchild maxrss
expect: post_wait.children ~= 300MB
initial: self 102876 children 153864
grandchild alloc 300MB
post_wait: self 102876 children 307536
testcase5: zombie
expect: pre_wait ~= initial, IOW the zombie process is not accounted.
post_wait ~= 400MB, IOW wait() collect child's max_rss.
initial: self 102876 children 307536
child alloc 400MB
pre_wait: self 102876 children 307536
post_wait: self 102876 children 410076
testcase6: SIG_IGN
expect: initial ~= after_zombie (child's 500MB alloc should be ignored).
initial: self 102876 children 410076
child alloc 500MB
after_zombie: self 102880 children 410076
testcase7: exec (without fork)
expect: initial ~= exec
initial: self 102880 children 410076
exec: self 102880 children 410076
Signed-off-by: Jiri Pirko <jpirko@redhat.com>
Signed-off-by: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Oleg Nesterov <oleg@redhat.com>
Cc: Hugh Dickins <hugh.dickins@tiscali.co.uk>
Cc: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-09-23 07:44:10 +08:00
|
|
|
if (current->mm)
|
|
|
|
setmax_mm_hiwater_rss(&sig->maxrss, current->mm);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
exit_itimers(sig);
|
2008-05-27 00:55:42 +08:00
|
|
|
flush_itimer_signals();
|
2005-11-08 02:12:43 +08:00
|
|
|
|
2007-10-17 14:27:22 +08:00
|
|
|
if (atomic_read(&oldsighand->count) != 1) {
|
|
|
|
struct sighand_struct *newsighand;
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
2007-10-17 14:27:22 +08:00
|
|
|
* This ->sighand is shared with the CLONE_SIGHAND
|
|
|
|
* but not CLONE_THREAD task, switch to the new one.
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
2007-10-17 14:27:22 +08:00
|
|
|
newsighand = kmem_cache_alloc(sighand_cachep, GFP_KERNEL);
|
|
|
|
if (!newsighand)
|
|
|
|
return -ENOMEM;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
atomic_set(&newsighand->count, 1);
|
|
|
|
memcpy(newsighand->action, oldsighand->action,
|
|
|
|
sizeof(newsighand->action));
|
|
|
|
|
|
|
|
write_lock_irq(&tasklist_lock);
|
|
|
|
spin_lock(&oldsighand->siglock);
|
2006-09-27 16:51:13 +08:00
|
|
|
rcu_assign_pointer(tsk->sighand, newsighand);
|
2005-04-17 06:20:36 +08:00
|
|
|
spin_unlock(&oldsighand->siglock);
|
|
|
|
write_unlock_irq(&tasklist_lock);
|
|
|
|
|
signal/timer/event: signalfd core
This patch series implements the new signalfd() system call.
I took part of the original Linus code (and you know how badly it can be
broken :), and I added even more breakage ;) Signals are fetched from the same
signal queue used by the process, so signalfd will compete with standard
kernel delivery in dequeue_signal(). If you want to reliably fetch signals on
the signalfd file, you need to block them with sigprocmask(SIG_BLOCK). This
seems to be working fine on my Dual Opteron machine. I made a quick test
program for it:
http://www.xmailserver.org/signafd-test.c
The signalfd() system call implements signal delivery into a file descriptor
receiver. The signalfd file descriptor if created with the following API:
int signalfd(int ufd, const sigset_t *mask, size_t masksize);
The "ufd" parameter allows to change an existing signalfd sigmask, w/out going
to close/create cycle (Linus idea). Use "ufd" == -1 if you want a brand new
signalfd file.
The "mask" allows to specify the signal mask of signals that we are interested
in. The "masksize" parameter is the size of "mask".
The signalfd fd supports the poll(2) and read(2) system calls. The poll(2)
will return POLLIN when signals are available to be dequeued. As a direct
consequence of supporting the Linux poll subsystem, the signalfd fd can use
used together with epoll(2) too.
The read(2) system call will return a "struct signalfd_siginfo" structure in
the userspace supplied buffer. The return value is the number of bytes copied
in the supplied buffer, or -1 in case of error. The read(2) call can also
return 0, in case the sighand structure to which the signalfd was attached,
has been orphaned. The O_NONBLOCK flag is also supported, and read(2) will
return -EAGAIN in case no signal is available.
If the size of the buffer passed to read(2) is lower than sizeof(struct
signalfd_siginfo), -EINVAL is returned. A read from the signalfd can also
return -ERESTARTSYS in case a signal hits the process. The format of the
struct signalfd_siginfo is, and the valid fields depends of the (->code &
__SI_MASK) value, in the same way a struct siginfo would:
struct signalfd_siginfo {
__u32 signo; /* si_signo */
__s32 err; /* si_errno */
__s32 code; /* si_code */
__u32 pid; /* si_pid */
__u32 uid; /* si_uid */
__s32 fd; /* si_fd */
__u32 tid; /* si_fd */
__u32 band; /* si_band */
__u32 overrun; /* si_overrun */
__u32 trapno; /* si_trapno */
__s32 status; /* si_status */
__s32 svint; /* si_int */
__u64 svptr; /* si_ptr */
__u64 utime; /* si_utime */
__u64 stime; /* si_stime */
__u64 addr; /* si_addr */
};
[akpm@linux-foundation.org: fix signalfd_copyinfo() on i386]
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-11 13:23:13 +08:00
|
|
|
__cleanup_sighand(oldsighand);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2006-09-27 16:51:13 +08:00
|
|
|
BUG_ON(!thread_group_leader(tsk));
|
2005-04-17 06:20:36 +08:00
|
|
|
return 0;
|
|
|
|
}
|
2007-10-17 14:27:22 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* These functions flushes out all traces of the currently running executable
|
|
|
|
* so that a new one can be started
|
|
|
|
*/
|
2006-01-15 05:20:43 +08:00
|
|
|
static void flush_old_files(struct files_struct * files)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
long j = -1;
|
2005-09-10 04:04:10 +08:00
|
|
|
struct fdtable *fdt;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
spin_lock(&files->file_lock);
|
|
|
|
for (;;) {
|
|
|
|
unsigned long set, i;
|
|
|
|
|
|
|
|
j++;
|
|
|
|
i = j * __NFDBITS;
|
2005-09-10 04:04:10 +08:00
|
|
|
fdt = files_fdtable(files);
|
2006-12-10 18:21:12 +08:00
|
|
|
if (i >= fdt->max_fds)
|
2005-04-17 06:20:36 +08:00
|
|
|
break;
|
2005-09-10 04:04:10 +08:00
|
|
|
set = fdt->close_on_exec->fds_bits[j];
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!set)
|
|
|
|
continue;
|
2005-09-10 04:04:10 +08:00
|
|
|
fdt->close_on_exec->fds_bits[j] = 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
spin_unlock(&files->file_lock);
|
|
|
|
for ( ; set ; i++,set >>= 1) {
|
|
|
|
if (set & 1) {
|
|
|
|
sys_close(i);
|
|
|
|
}
|
|
|
|
}
|
|
|
|
spin_lock(&files->file_lock);
|
|
|
|
|
|
|
|
}
|
|
|
|
spin_unlock(&files->file_lock);
|
|
|
|
}
|
|
|
|
|
2008-02-05 14:27:21 +08:00
|
|
|
char *get_task_comm(char *buf, struct task_struct *tsk)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
/* buf must be at least sizeof(tsk->comm) in size */
|
|
|
|
task_lock(tsk);
|
|
|
|
strncpy(buf, tsk->comm, sizeof(tsk->comm));
|
|
|
|
task_unlock(tsk);
|
2008-02-05 14:27:21 +08:00
|
|
|
return buf;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2011-05-07 07:56:47 +08:00
|
|
|
EXPORT_SYMBOL_GPL(get_task_comm);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
void set_task_comm(struct task_struct *tsk, char *buf)
|
|
|
|
{
|
|
|
|
task_lock(tsk);
|
2009-12-15 10:00:05 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Threads may access current->comm without holding
|
|
|
|
* the task lock, so write the string carefully.
|
|
|
|
* Readers without a lock may see incomplete new
|
|
|
|
* names but are safe from non-terminating string reads.
|
|
|
|
*/
|
|
|
|
memset(tsk->comm, 0, TASK_COMM_LEN);
|
|
|
|
wmb();
|
2005-04-17 06:20:36 +08:00
|
|
|
strlcpy(tsk->comm, buf, sizeof(tsk->comm));
|
|
|
|
task_unlock(tsk);
|
perf: Do the big rename: Performance Counters -> Performance Events
Bye-bye Performance Counters, welcome Performance Events!
In the past few months the perfcounters subsystem has grown out its
initial role of counting hardware events, and has become (and is
becoming) a much broader generic event enumeration, reporting, logging,
monitoring, analysis facility.
Naming its core object 'perf_counter' and naming the subsystem
'perfcounters' has become more and more of a misnomer. With pending
code like hw-breakpoints support the 'counter' name is less and
less appropriate.
All in one, we've decided to rename the subsystem to 'performance
events' and to propagate this rename through all fields, variables
and API names. (in an ABI compatible fashion)
The word 'event' is also a bit shorter than 'counter' - which makes
it slightly more convenient to write/handle as well.
Thanks goes to Stephane Eranian who first observed this misnomer and
suggested a rename.
User-space tooling and ABI compatibility is not affected - this patch
should be function-invariant. (Also, defconfigs were not touched to
keep the size down.)
This patch has been generated via the following script:
FILES=$(find * -type f | grep -vE 'oprofile|[^K]config')
sed -i \
-e 's/PERF_EVENT_/PERF_RECORD_/g' \
-e 's/PERF_COUNTER/PERF_EVENT/g' \
-e 's/perf_counter/perf_event/g' \
-e 's/nb_counters/nb_events/g' \
-e 's/swcounter/swevent/g' \
-e 's/tpcounter_event/tp_event/g' \
$FILES
for N in $(find . -name perf_counter.[ch]); do
M=$(echo $N | sed 's/perf_counter/perf_event/g')
mv $N $M
done
FILES=$(find . -name perf_event.*)
sed -i \
-e 's/COUNTER_MASK/REG_MASK/g' \
-e 's/COUNTER/EVENT/g' \
-e 's/\<event\>/event_id/g' \
-e 's/counter/event/g' \
-e 's/Counter/Event/g' \
$FILES
... to keep it as correct as possible. This script can also be
used by anyone who has pending perfcounters patches - it converts
a Linux kernel tree over to the new naming. We tried to time this
change to the point in time where the amount of pending patches
is the smallest: the end of the merge window.
Namespace clashes were fixed up in a preparatory patch - and some
stylistic fallout will be fixed up in a subsequent patch.
( NOTE: 'counters' are still the proper terminology when we deal
with hardware registers - and these sed scripts are a bit
over-eager in renaming them. I've undone some of that, but
in case there's something left where 'counter' would be
better than 'event' we can undo that on an individual basis
instead of touching an otherwise nicely automated patch. )
Suggested-by: Stephane Eranian <eranian@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Reviewed-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <linux-arch@vger.kernel.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-21 18:02:48 +08:00
|
|
|
perf_event_comm(tsk);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
int flush_old_exec(struct linux_binprm * bprm)
|
|
|
|
{
|
Split 'flush_old_exec' into two functions
'flush_old_exec()' is the point of no return when doing an execve(), and
it is pretty badly misnamed. It doesn't just flush the old executable
environment, it also starts up the new one.
Which is very inconvenient for things like setting up the new
personality, because we want the new personality to affect the starting
of the new environment, but at the same time we do _not_ want the new
personality to take effect if flushing the old one fails.
As a result, the x86-64 '32-bit' personality is actually done using this
insane "I'm going to change the ABI, but I haven't done it yet" bit
(TIF_ABI_PENDING), with SET_PERSONALITY() not actually setting the
personality, but just the "pending" bit, so that "flush_thread()" can do
the actual personality magic.
This patch in no way changes any of that insanity, but it does split the
'flush_old_exec()' function up into a preparatory part that can fail
(still called flush_old_exec()), and a new part that will actually set
up the new exec environment (setup_new_exec()). All callers are changed
to trivially comply with the new world order.
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-01-29 14:14:42 +08:00
|
|
|
int retval;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Make sure we have a private signal table and that
|
|
|
|
* we are unassociated from the previous thread group.
|
|
|
|
*/
|
|
|
|
retval = de_thread(current);
|
|
|
|
if (retval)
|
|
|
|
goto out;
|
|
|
|
|
2008-04-29 16:01:36 +08:00
|
|
|
set_mm_exe_file(bprm->mm, bprm->file);
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Release all of the old mmap stuff
|
|
|
|
*/
|
2010-12-01 03:55:34 +08:00
|
|
|
acct_arg_size(bprm, 0);
|
2005-04-17 06:20:36 +08:00
|
|
|
retval = exec_mmap(bprm->mm);
|
|
|
|
if (retval)
|
2008-04-22 17:11:59 +08:00
|
|
|
goto out;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
bprm->mm = NULL; /* We're using it now */
|
2010-02-03 04:37:44 +08:00
|
|
|
|
2011-06-10 02:05:18 +08:00
|
|
|
set_fs(USER_DS);
|
2010-10-28 06:34:16 +08:00
|
|
|
current->flags &= ~(PF_RANDOMIZE | PF_KTHREAD);
|
2010-02-03 04:37:44 +08:00
|
|
|
flush_thread();
|
|
|
|
current->personality &= ~bprm->per_clear;
|
|
|
|
|
Split 'flush_old_exec' into two functions
'flush_old_exec()' is the point of no return when doing an execve(), and
it is pretty badly misnamed. It doesn't just flush the old executable
environment, it also starts up the new one.
Which is very inconvenient for things like setting up the new
personality, because we want the new personality to affect the starting
of the new environment, but at the same time we do _not_ want the new
personality to take effect if flushing the old one fails.
As a result, the x86-64 '32-bit' personality is actually done using this
insane "I'm going to change the ABI, but I haven't done it yet" bit
(TIF_ABI_PENDING), with SET_PERSONALITY() not actually setting the
personality, but just the "pending" bit, so that "flush_thread()" can do
the actual personality magic.
This patch in no way changes any of that insanity, but it does split the
'flush_old_exec()' function up into a preparatory part that can fail
(still called flush_old_exec()), and a new part that will actually set
up the new exec environment (setup_new_exec()). All callers are changed
to trivially comply with the new world order.
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-01-29 14:14:42 +08:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
out:
|
|
|
|
return retval;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(flush_old_exec);
|
|
|
|
|
2011-06-20 00:49:47 +08:00
|
|
|
void would_dump(struct linux_binprm *bprm, struct file *file)
|
|
|
|
{
|
|
|
|
if (inode_permission(file->f_path.dentry->d_inode, MAY_READ) < 0)
|
|
|
|
bprm->interp_flags |= BINPRM_FLAGS_ENFORCE_NONDUMP;
|
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(would_dump);
|
|
|
|
|
Split 'flush_old_exec' into two functions
'flush_old_exec()' is the point of no return when doing an execve(), and
it is pretty badly misnamed. It doesn't just flush the old executable
environment, it also starts up the new one.
Which is very inconvenient for things like setting up the new
personality, because we want the new personality to affect the starting
of the new environment, but at the same time we do _not_ want the new
personality to take effect if flushing the old one fails.
As a result, the x86-64 '32-bit' personality is actually done using this
insane "I'm going to change the ABI, but I haven't done it yet" bit
(TIF_ABI_PENDING), with SET_PERSONALITY() not actually setting the
personality, but just the "pending" bit, so that "flush_thread()" can do
the actual personality magic.
This patch in no way changes any of that insanity, but it does split the
'flush_old_exec()' function up into a preparatory part that can fail
(still called flush_old_exec()), and a new part that will actually set
up the new exec environment (setup_new_exec()). All callers are changed
to trivially comply with the new world order.
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-01-29 14:14:42 +08:00
|
|
|
void setup_new_exec(struct linux_binprm * bprm)
|
|
|
|
{
|
|
|
|
int i, ch;
|
2010-08-18 06:52:56 +08:00
|
|
|
const char *name;
|
Split 'flush_old_exec' into two functions
'flush_old_exec()' is the point of no return when doing an execve(), and
it is pretty badly misnamed. It doesn't just flush the old executable
environment, it also starts up the new one.
Which is very inconvenient for things like setting up the new
personality, because we want the new personality to affect the starting
of the new environment, but at the same time we do _not_ want the new
personality to take effect if flushing the old one fails.
As a result, the x86-64 '32-bit' personality is actually done using this
insane "I'm going to change the ABI, but I haven't done it yet" bit
(TIF_ABI_PENDING), with SET_PERSONALITY() not actually setting the
personality, but just the "pending" bit, so that "flush_thread()" can do
the actual personality magic.
This patch in no way changes any of that insanity, but it does split the
'flush_old_exec()' function up into a preparatory part that can fail
(still called flush_old_exec()), and a new part that will actually set
up the new exec environment (setup_new_exec()). All callers are changed
to trivially comply with the new world order.
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-01-29 14:14:42 +08:00
|
|
|
char tcomm[sizeof(current->comm)];
|
|
|
|
|
|
|
|
arch_pick_mmap_layout(current->mm);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* This is the point of no return */
|
|
|
|
current->sas_ss_sp = current->sas_ss_size = 0;
|
|
|
|
|
2008-11-14 07:39:05 +08:00
|
|
|
if (current_euid() == current_uid() && current_egid() == current_gid())
|
2007-07-19 16:48:27 +08:00
|
|
|
set_dumpable(current->mm, 1);
|
2005-06-23 15:09:43 +08:00
|
|
|
else
|
2007-07-19 16:48:27 +08:00
|
|
|
set_dumpable(current->mm, suid_dumpable);
|
2005-06-23 15:09:43 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
name = bprm->filename;
|
2005-05-06 07:16:12 +08:00
|
|
|
|
|
|
|
/* Copies the binary name from after last slash */
|
2005-04-17 06:20:36 +08:00
|
|
|
for (i=0; (ch = *(name++)) != '\0';) {
|
|
|
|
if (ch == '/')
|
2005-05-06 07:16:12 +08:00
|
|
|
i = 0; /* overwrite what we wrote */
|
2005-04-17 06:20:36 +08:00
|
|
|
else
|
|
|
|
if (i < (sizeof(tcomm) - 1))
|
|
|
|
tcomm[i++] = ch;
|
|
|
|
}
|
|
|
|
tcomm[i] = '\0';
|
|
|
|
set_task_comm(current, tcomm);
|
|
|
|
|
2006-03-01 08:59:19 +08:00
|
|
|
/* Set the new mm task size. We have to do that late because it may
|
|
|
|
* depend on TIF_32BIT which is only updated in flush_thread() on
|
|
|
|
* some architectures like powerpc
|
|
|
|
*/
|
|
|
|
current->mm->task_size = TASK_SIZE;
|
|
|
|
|
CRED: Make execve() take advantage of copy-on-write credentials
Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
The credential bits from struct linux_binprm are, for the most part,
replaced with a single credentials pointer (bprm->cred). This means that
all the creds can be calculated in advance and then applied at the point
of no return with no possibility of failure.
I would like to replace bprm->cap_effective with:
cap_isclear(bprm->cap_effective)
but this seems impossible due to special behaviour for processes of pid 1
(they always retain their parent's capability masks where normally they'd
be changed - see cap_bprm_set_creds()).
The following sequence of events now happens:
(a) At the start of do_execve, the current task's cred_exec_mutex is
locked to prevent PTRACE_ATTACH from obsoleting the calculation of
creds that we make.
(a) prepare_exec_creds() is then called to make a copy of the current
task's credentials and prepare it. This copy is then assigned to
bprm->cred.
This renders security_bprm_alloc() and security_bprm_free()
unnecessary, and so they've been removed.
(b) The determination of unsafe execution is now performed immediately
after (a) rather than later on in the code. The result is stored in
bprm->unsafe for future reference.
(c) prepare_binprm() is called, possibly multiple times.
(i) This applies the result of set[ug]id binaries to the new creds
attached to bprm->cred. Personality bit clearance is recorded,
but now deferred on the basis that the exec procedure may yet
fail.
(ii) This then calls the new security_bprm_set_creds(). This should
calculate the new LSM and capability credentials into *bprm->cred.
This folds together security_bprm_set() and parts of
security_bprm_apply_creds() (these two have been removed).
Anything that might fail must be done at this point.
(iii) bprm->cred_prepared is set to 1.
bprm->cred_prepared is 0 on the first pass of the security
calculations, and 1 on all subsequent passes. This allows SELinux
in (ii) to base its calculations only on the initial script and
not on the interpreter.
(d) flush_old_exec() is called to commit the task to execution. This
performs the following steps with regard to credentials:
(i) Clear pdeath_signal and set dumpable on certain circumstances that
may not be covered by commit_creds().
(ii) Clear any bits in current->personality that were deferred from
(c.i).
(e) install_exec_creds() [compute_creds() as was] is called to install the
new credentials. This performs the following steps with regard to
credentials:
(i) Calls security_bprm_committing_creds() to apply any security
requirements, such as flushing unauthorised files in SELinux, that
must be done before the credentials are changed.
This is made up of bits of security_bprm_apply_creds() and
security_bprm_post_apply_creds(), both of which have been removed.
This function is not allowed to fail; anything that might fail
must have been done in (c.ii).
(ii) Calls commit_creds() to apply the new credentials in a single
assignment (more or less). Possibly pdeath_signal and dumpable
should be part of struct creds.
(iii) Unlocks the task's cred_replace_mutex, thus allowing
PTRACE_ATTACH to take place.
(iv) Clears The bprm->cred pointer as the credentials it was holding
are now immutable.
(v) Calls security_bprm_committed_creds() to apply any security
alterations that must be done after the creds have been changed.
SELinux uses this to flush signals and signal handlers.
(f) If an error occurs before (d.i), bprm_free() will call abort_creds()
to destroy the proposed new credentials and will then unlock
cred_replace_mutex. No changes to the credentials will have been
made.
(2) LSM interface.
A number of functions have been changed, added or removed:
(*) security_bprm_alloc(), ->bprm_alloc_security()
(*) security_bprm_free(), ->bprm_free_security()
Removed in favour of preparing new credentials and modifying those.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
(*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()
Removed; split between security_bprm_set_creds(),
security_bprm_committing_creds() and security_bprm_committed_creds().
(*) security_bprm_set(), ->bprm_set_security()
Removed; folded into security_bprm_set_creds().
(*) security_bprm_set_creds(), ->bprm_set_creds()
New. The new credentials in bprm->creds should be checked and set up
as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the
second and subsequent calls.
(*) security_bprm_committing_creds(), ->bprm_committing_creds()
(*) security_bprm_committed_creds(), ->bprm_committed_creds()
New. Apply the security effects of the new credentials. This
includes closing unauthorised files in SELinux. This function may not
fail. When the former is called, the creds haven't yet been applied
to the process; when the latter is called, they have.
The former may access bprm->cred, the latter may not.
(3) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) The bprm_security_struct struct has been removed in favour of using
the credentials-under-construction approach.
(c) flush_unauthorized_files() now takes a cred pointer and passes it on
to inode_has_perm(), file_has_perm() and dentry_open().
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:24 +08:00
|
|
|
/* install the new credentials */
|
|
|
|
if (bprm->cred->uid != current_euid() ||
|
|
|
|
bprm->cred->gid != current_egid()) {
|
2007-08-18 03:47:58 +08:00
|
|
|
current->pdeath_signal = 0;
|
2011-06-20 00:49:47 +08:00
|
|
|
} else {
|
|
|
|
would_dump(bprm, bprm->file);
|
|
|
|
if (bprm->interp_flags & BINPRM_FLAGS_ENFORCE_NONDUMP)
|
|
|
|
set_dumpable(current->mm, suid_dumpable);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2008-12-16 20:40:44 +08:00
|
|
|
/*
|
|
|
|
* Flush performance counters when crossing a
|
|
|
|
* security domain:
|
|
|
|
*/
|
|
|
|
if (!get_dumpable(current->mm))
|
perf: Do the big rename: Performance Counters -> Performance Events
Bye-bye Performance Counters, welcome Performance Events!
In the past few months the perfcounters subsystem has grown out its
initial role of counting hardware events, and has become (and is
becoming) a much broader generic event enumeration, reporting, logging,
monitoring, analysis facility.
Naming its core object 'perf_counter' and naming the subsystem
'perfcounters' has become more and more of a misnomer. With pending
code like hw-breakpoints support the 'counter' name is less and
less appropriate.
All in one, we've decided to rename the subsystem to 'performance
events' and to propagate this rename through all fields, variables
and API names. (in an ABI compatible fashion)
The word 'event' is also a bit shorter than 'counter' - which makes
it slightly more convenient to write/handle as well.
Thanks goes to Stephane Eranian who first observed this misnomer and
suggested a rename.
User-space tooling and ABI compatibility is not affected - this patch
should be function-invariant. (Also, defconfigs were not touched to
keep the size down.)
This patch has been generated via the following script:
FILES=$(find * -type f | grep -vE 'oprofile|[^K]config')
sed -i \
-e 's/PERF_EVENT_/PERF_RECORD_/g' \
-e 's/PERF_COUNTER/PERF_EVENT/g' \
-e 's/perf_counter/perf_event/g' \
-e 's/nb_counters/nb_events/g' \
-e 's/swcounter/swevent/g' \
-e 's/tpcounter_event/tp_event/g' \
$FILES
for N in $(find . -name perf_counter.[ch]); do
M=$(echo $N | sed 's/perf_counter/perf_event/g')
mv $N $M
done
FILES=$(find . -name perf_event.*)
sed -i \
-e 's/COUNTER_MASK/REG_MASK/g' \
-e 's/COUNTER/EVENT/g' \
-e 's/\<event\>/event_id/g' \
-e 's/counter/event/g' \
-e 's/Counter/Event/g' \
$FILES
... to keep it as correct as possible. This script can also be
used by anyone who has pending perfcounters patches - it converts
a Linux kernel tree over to the new naming. We tried to time this
change to the point in time where the amount of pending patches
is the smallest: the end of the merge window.
Namespace clashes were fixed up in a preparatory patch - and some
stylistic fallout will be fixed up in a subsequent patch.
( NOTE: 'counters' are still the proper terminology when we deal
with hardware registers - and these sed scripts are a bit
over-eager in renaming them. I've undone some of that, but
in case there's something left where 'counter' would be
better than 'event' we can undo that on an individual basis
instead of touching an otherwise nicely automated patch. )
Suggested-by: Stephane Eranian <eranian@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Reviewed-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <linux-arch@vger.kernel.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-21 18:02:48 +08:00
|
|
|
perf_event_exit_task(current);
|
2008-12-16 20:40:44 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* An exec changes our domain. We are no longer part of the thread
|
|
|
|
group */
|
|
|
|
|
|
|
|
current->self_exec_id++;
|
|
|
|
|
|
|
|
flush_signal_handlers(current, 0);
|
|
|
|
flush_old_files(current->files);
|
|
|
|
}
|
Split 'flush_old_exec' into two functions
'flush_old_exec()' is the point of no return when doing an execve(), and
it is pretty badly misnamed. It doesn't just flush the old executable
environment, it also starts up the new one.
Which is very inconvenient for things like setting up the new
personality, because we want the new personality to affect the starting
of the new environment, but at the same time we do _not_ want the new
personality to take effect if flushing the old one fails.
As a result, the x86-64 '32-bit' personality is actually done using this
insane "I'm going to change the ABI, but I haven't done it yet" bit
(TIF_ABI_PENDING), with SET_PERSONALITY() not actually setting the
personality, but just the "pending" bit, so that "flush_thread()" can do
the actual personality magic.
This patch in no way changes any of that insanity, but it does split the
'flush_old_exec()' function up into a preparatory part that can fail
(still called flush_old_exec()), and a new part that will actually set
up the new exec environment (setup_new_exec()). All callers are changed
to trivially comply with the new world order.
Signed-off-by: H. Peter Anvin <hpa@zytor.com>
Cc: stable@kernel.org
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2010-01-29 14:14:42 +08:00
|
|
|
EXPORT_SYMBOL(setup_new_exec);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-09-06 02:17:13 +08:00
|
|
|
/*
|
|
|
|
* Prepare credentials and lock ->cred_guard_mutex.
|
|
|
|
* install_exec_creds() commits the new creds and drops the lock.
|
|
|
|
* Or, if exec fails before, free_bprm() should release ->cred and
|
|
|
|
* and unlock.
|
|
|
|
*/
|
|
|
|
int prepare_bprm_creds(struct linux_binprm *bprm)
|
|
|
|
{
|
2010-10-28 06:34:08 +08:00
|
|
|
if (mutex_lock_interruptible(¤t->signal->cred_guard_mutex))
|
2009-09-06 02:17:13 +08:00
|
|
|
return -ERESTARTNOINTR;
|
|
|
|
|
|
|
|
bprm->cred = prepare_exec_creds();
|
|
|
|
if (likely(bprm->cred))
|
|
|
|
return 0;
|
|
|
|
|
2010-10-28 06:34:08 +08:00
|
|
|
mutex_unlock(¤t->signal->cred_guard_mutex);
|
2009-09-06 02:17:13 +08:00
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
|
|
|
void free_bprm(struct linux_binprm *bprm)
|
|
|
|
{
|
|
|
|
free_arg_pages(bprm);
|
|
|
|
if (bprm->cred) {
|
2010-10-28 06:34:08 +08:00
|
|
|
mutex_unlock(¤t->signal->cred_guard_mutex);
|
2009-09-06 02:17:13 +08:00
|
|
|
abort_creds(bprm->cred);
|
|
|
|
}
|
|
|
|
kfree(bprm);
|
|
|
|
}
|
|
|
|
|
CRED: Make execve() take advantage of copy-on-write credentials
Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
The credential bits from struct linux_binprm are, for the most part,
replaced with a single credentials pointer (bprm->cred). This means that
all the creds can be calculated in advance and then applied at the point
of no return with no possibility of failure.
I would like to replace bprm->cap_effective with:
cap_isclear(bprm->cap_effective)
but this seems impossible due to special behaviour for processes of pid 1
(they always retain their parent's capability masks where normally they'd
be changed - see cap_bprm_set_creds()).
The following sequence of events now happens:
(a) At the start of do_execve, the current task's cred_exec_mutex is
locked to prevent PTRACE_ATTACH from obsoleting the calculation of
creds that we make.
(a) prepare_exec_creds() is then called to make a copy of the current
task's credentials and prepare it. This copy is then assigned to
bprm->cred.
This renders security_bprm_alloc() and security_bprm_free()
unnecessary, and so they've been removed.
(b) The determination of unsafe execution is now performed immediately
after (a) rather than later on in the code. The result is stored in
bprm->unsafe for future reference.
(c) prepare_binprm() is called, possibly multiple times.
(i) This applies the result of set[ug]id binaries to the new creds
attached to bprm->cred. Personality bit clearance is recorded,
but now deferred on the basis that the exec procedure may yet
fail.
(ii) This then calls the new security_bprm_set_creds(). This should
calculate the new LSM and capability credentials into *bprm->cred.
This folds together security_bprm_set() and parts of
security_bprm_apply_creds() (these two have been removed).
Anything that might fail must be done at this point.
(iii) bprm->cred_prepared is set to 1.
bprm->cred_prepared is 0 on the first pass of the security
calculations, and 1 on all subsequent passes. This allows SELinux
in (ii) to base its calculations only on the initial script and
not on the interpreter.
(d) flush_old_exec() is called to commit the task to execution. This
performs the following steps with regard to credentials:
(i) Clear pdeath_signal and set dumpable on certain circumstances that
may not be covered by commit_creds().
(ii) Clear any bits in current->personality that were deferred from
(c.i).
(e) install_exec_creds() [compute_creds() as was] is called to install the
new credentials. This performs the following steps with regard to
credentials:
(i) Calls security_bprm_committing_creds() to apply any security
requirements, such as flushing unauthorised files in SELinux, that
must be done before the credentials are changed.
This is made up of bits of security_bprm_apply_creds() and
security_bprm_post_apply_creds(), both of which have been removed.
This function is not allowed to fail; anything that might fail
must have been done in (c.ii).
(ii) Calls commit_creds() to apply the new credentials in a single
assignment (more or less). Possibly pdeath_signal and dumpable
should be part of struct creds.
(iii) Unlocks the task's cred_replace_mutex, thus allowing
PTRACE_ATTACH to take place.
(iv) Clears The bprm->cred pointer as the credentials it was holding
are now immutable.
(v) Calls security_bprm_committed_creds() to apply any security
alterations that must be done after the creds have been changed.
SELinux uses this to flush signals and signal handlers.
(f) If an error occurs before (d.i), bprm_free() will call abort_creds()
to destroy the proposed new credentials and will then unlock
cred_replace_mutex. No changes to the credentials will have been
made.
(2) LSM interface.
A number of functions have been changed, added or removed:
(*) security_bprm_alloc(), ->bprm_alloc_security()
(*) security_bprm_free(), ->bprm_free_security()
Removed in favour of preparing new credentials and modifying those.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
(*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()
Removed; split between security_bprm_set_creds(),
security_bprm_committing_creds() and security_bprm_committed_creds().
(*) security_bprm_set(), ->bprm_set_security()
Removed; folded into security_bprm_set_creds().
(*) security_bprm_set_creds(), ->bprm_set_creds()
New. The new credentials in bprm->creds should be checked and set up
as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the
second and subsequent calls.
(*) security_bprm_committing_creds(), ->bprm_committing_creds()
(*) security_bprm_committed_creds(), ->bprm_committed_creds()
New. Apply the security effects of the new credentials. This
includes closing unauthorised files in SELinux. This function may not
fail. When the former is called, the creds haven't yet been applied
to the process; when the latter is called, they have.
The former may access bprm->cred, the latter may not.
(3) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) The bprm_security_struct struct has been removed in favour of using
the credentials-under-construction approach.
(c) flush_unauthorized_files() now takes a cred pointer and passes it on
to inode_has_perm(), file_has_perm() and dentry_open().
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:24 +08:00
|
|
|
/*
|
|
|
|
* install the new credentials for this executable
|
|
|
|
*/
|
|
|
|
void install_exec_creds(struct linux_binprm *bprm)
|
|
|
|
{
|
|
|
|
security_bprm_committing_creds(bprm);
|
|
|
|
|
|
|
|
commit_creds(bprm->cred);
|
|
|
|
bprm->cred = NULL;
|
2009-09-06 02:17:13 +08:00
|
|
|
/*
|
|
|
|
* cred_guard_mutex must be held at least to this point to prevent
|
CRED: Make execve() take advantage of copy-on-write credentials
Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
The credential bits from struct linux_binprm are, for the most part,
replaced with a single credentials pointer (bprm->cred). This means that
all the creds can be calculated in advance and then applied at the point
of no return with no possibility of failure.
I would like to replace bprm->cap_effective with:
cap_isclear(bprm->cap_effective)
but this seems impossible due to special behaviour for processes of pid 1
(they always retain their parent's capability masks where normally they'd
be changed - see cap_bprm_set_creds()).
The following sequence of events now happens:
(a) At the start of do_execve, the current task's cred_exec_mutex is
locked to prevent PTRACE_ATTACH from obsoleting the calculation of
creds that we make.
(a) prepare_exec_creds() is then called to make a copy of the current
task's credentials and prepare it. This copy is then assigned to
bprm->cred.
This renders security_bprm_alloc() and security_bprm_free()
unnecessary, and so they've been removed.
(b) The determination of unsafe execution is now performed immediately
after (a) rather than later on in the code. The result is stored in
bprm->unsafe for future reference.
(c) prepare_binprm() is called, possibly multiple times.
(i) This applies the result of set[ug]id binaries to the new creds
attached to bprm->cred. Personality bit clearance is recorded,
but now deferred on the basis that the exec procedure may yet
fail.
(ii) This then calls the new security_bprm_set_creds(). This should
calculate the new LSM and capability credentials into *bprm->cred.
This folds together security_bprm_set() and parts of
security_bprm_apply_creds() (these two have been removed).
Anything that might fail must be done at this point.
(iii) bprm->cred_prepared is set to 1.
bprm->cred_prepared is 0 on the first pass of the security
calculations, and 1 on all subsequent passes. This allows SELinux
in (ii) to base its calculations only on the initial script and
not on the interpreter.
(d) flush_old_exec() is called to commit the task to execution. This
performs the following steps with regard to credentials:
(i) Clear pdeath_signal and set dumpable on certain circumstances that
may not be covered by commit_creds().
(ii) Clear any bits in current->personality that were deferred from
(c.i).
(e) install_exec_creds() [compute_creds() as was] is called to install the
new credentials. This performs the following steps with regard to
credentials:
(i) Calls security_bprm_committing_creds() to apply any security
requirements, such as flushing unauthorised files in SELinux, that
must be done before the credentials are changed.
This is made up of bits of security_bprm_apply_creds() and
security_bprm_post_apply_creds(), both of which have been removed.
This function is not allowed to fail; anything that might fail
must have been done in (c.ii).
(ii) Calls commit_creds() to apply the new credentials in a single
assignment (more or less). Possibly pdeath_signal and dumpable
should be part of struct creds.
(iii) Unlocks the task's cred_replace_mutex, thus allowing
PTRACE_ATTACH to take place.
(iv) Clears The bprm->cred pointer as the credentials it was holding
are now immutable.
(v) Calls security_bprm_committed_creds() to apply any security
alterations that must be done after the creds have been changed.
SELinux uses this to flush signals and signal handlers.
(f) If an error occurs before (d.i), bprm_free() will call abort_creds()
to destroy the proposed new credentials and will then unlock
cred_replace_mutex. No changes to the credentials will have been
made.
(2) LSM interface.
A number of functions have been changed, added or removed:
(*) security_bprm_alloc(), ->bprm_alloc_security()
(*) security_bprm_free(), ->bprm_free_security()
Removed in favour of preparing new credentials and modifying those.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
(*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()
Removed; split between security_bprm_set_creds(),
security_bprm_committing_creds() and security_bprm_committed_creds().
(*) security_bprm_set(), ->bprm_set_security()
Removed; folded into security_bprm_set_creds().
(*) security_bprm_set_creds(), ->bprm_set_creds()
New. The new credentials in bprm->creds should be checked and set up
as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the
second and subsequent calls.
(*) security_bprm_committing_creds(), ->bprm_committing_creds()
(*) security_bprm_committed_creds(), ->bprm_committed_creds()
New. Apply the security effects of the new credentials. This
includes closing unauthorised files in SELinux. This function may not
fail. When the former is called, the creds haven't yet been applied
to the process; when the latter is called, they have.
The former may access bprm->cred, the latter may not.
(3) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) The bprm_security_struct struct has been removed in favour of using
the credentials-under-construction approach.
(c) flush_unauthorized_files() now takes a cred pointer and passes it on
to inode_has_perm(), file_has_perm() and dentry_open().
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:24 +08:00
|
|
|
* ptrace_attach() from altering our determination of the task's
|
2009-09-06 02:17:13 +08:00
|
|
|
* credentials; any time after this it may be unlocked.
|
|
|
|
*/
|
CRED: Make execve() take advantage of copy-on-write credentials
Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
The credential bits from struct linux_binprm are, for the most part,
replaced with a single credentials pointer (bprm->cred). This means that
all the creds can be calculated in advance and then applied at the point
of no return with no possibility of failure.
I would like to replace bprm->cap_effective with:
cap_isclear(bprm->cap_effective)
but this seems impossible due to special behaviour for processes of pid 1
(they always retain their parent's capability masks where normally they'd
be changed - see cap_bprm_set_creds()).
The following sequence of events now happens:
(a) At the start of do_execve, the current task's cred_exec_mutex is
locked to prevent PTRACE_ATTACH from obsoleting the calculation of
creds that we make.
(a) prepare_exec_creds() is then called to make a copy of the current
task's credentials and prepare it. This copy is then assigned to
bprm->cred.
This renders security_bprm_alloc() and security_bprm_free()
unnecessary, and so they've been removed.
(b) The determination of unsafe execution is now performed immediately
after (a) rather than later on in the code. The result is stored in
bprm->unsafe for future reference.
(c) prepare_binprm() is called, possibly multiple times.
(i) This applies the result of set[ug]id binaries to the new creds
attached to bprm->cred. Personality bit clearance is recorded,
but now deferred on the basis that the exec procedure may yet
fail.
(ii) This then calls the new security_bprm_set_creds(). This should
calculate the new LSM and capability credentials into *bprm->cred.
This folds together security_bprm_set() and parts of
security_bprm_apply_creds() (these two have been removed).
Anything that might fail must be done at this point.
(iii) bprm->cred_prepared is set to 1.
bprm->cred_prepared is 0 on the first pass of the security
calculations, and 1 on all subsequent passes. This allows SELinux
in (ii) to base its calculations only on the initial script and
not on the interpreter.
(d) flush_old_exec() is called to commit the task to execution. This
performs the following steps with regard to credentials:
(i) Clear pdeath_signal and set dumpable on certain circumstances that
may not be covered by commit_creds().
(ii) Clear any bits in current->personality that were deferred from
(c.i).
(e) install_exec_creds() [compute_creds() as was] is called to install the
new credentials. This performs the following steps with regard to
credentials:
(i) Calls security_bprm_committing_creds() to apply any security
requirements, such as flushing unauthorised files in SELinux, that
must be done before the credentials are changed.
This is made up of bits of security_bprm_apply_creds() and
security_bprm_post_apply_creds(), both of which have been removed.
This function is not allowed to fail; anything that might fail
must have been done in (c.ii).
(ii) Calls commit_creds() to apply the new credentials in a single
assignment (more or less). Possibly pdeath_signal and dumpable
should be part of struct creds.
(iii) Unlocks the task's cred_replace_mutex, thus allowing
PTRACE_ATTACH to take place.
(iv) Clears The bprm->cred pointer as the credentials it was holding
are now immutable.
(v) Calls security_bprm_committed_creds() to apply any security
alterations that must be done after the creds have been changed.
SELinux uses this to flush signals and signal handlers.
(f) If an error occurs before (d.i), bprm_free() will call abort_creds()
to destroy the proposed new credentials and will then unlock
cred_replace_mutex. No changes to the credentials will have been
made.
(2) LSM interface.
A number of functions have been changed, added or removed:
(*) security_bprm_alloc(), ->bprm_alloc_security()
(*) security_bprm_free(), ->bprm_free_security()
Removed in favour of preparing new credentials and modifying those.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
(*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()
Removed; split between security_bprm_set_creds(),
security_bprm_committing_creds() and security_bprm_committed_creds().
(*) security_bprm_set(), ->bprm_set_security()
Removed; folded into security_bprm_set_creds().
(*) security_bprm_set_creds(), ->bprm_set_creds()
New. The new credentials in bprm->creds should be checked and set up
as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the
second and subsequent calls.
(*) security_bprm_committing_creds(), ->bprm_committing_creds()
(*) security_bprm_committed_creds(), ->bprm_committed_creds()
New. Apply the security effects of the new credentials. This
includes closing unauthorised files in SELinux. This function may not
fail. When the former is called, the creds haven't yet been applied
to the process; when the latter is called, they have.
The former may access bprm->cred, the latter may not.
(3) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) The bprm_security_struct struct has been removed in favour of using
the credentials-under-construction approach.
(c) flush_unauthorized_files() now takes a cred pointer and passes it on
to inode_has_perm(), file_has_perm() and dentry_open().
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:24 +08:00
|
|
|
security_bprm_committed_creds(bprm);
|
2010-10-28 06:34:08 +08:00
|
|
|
mutex_unlock(¤t->signal->cred_guard_mutex);
|
CRED: Make execve() take advantage of copy-on-write credentials
Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
The credential bits from struct linux_binprm are, for the most part,
replaced with a single credentials pointer (bprm->cred). This means that
all the creds can be calculated in advance and then applied at the point
of no return with no possibility of failure.
I would like to replace bprm->cap_effective with:
cap_isclear(bprm->cap_effective)
but this seems impossible due to special behaviour for processes of pid 1
(they always retain their parent's capability masks where normally they'd
be changed - see cap_bprm_set_creds()).
The following sequence of events now happens:
(a) At the start of do_execve, the current task's cred_exec_mutex is
locked to prevent PTRACE_ATTACH from obsoleting the calculation of
creds that we make.
(a) prepare_exec_creds() is then called to make a copy of the current
task's credentials and prepare it. This copy is then assigned to
bprm->cred.
This renders security_bprm_alloc() and security_bprm_free()
unnecessary, and so they've been removed.
(b) The determination of unsafe execution is now performed immediately
after (a) rather than later on in the code. The result is stored in
bprm->unsafe for future reference.
(c) prepare_binprm() is called, possibly multiple times.
(i) This applies the result of set[ug]id binaries to the new creds
attached to bprm->cred. Personality bit clearance is recorded,
but now deferred on the basis that the exec procedure may yet
fail.
(ii) This then calls the new security_bprm_set_creds(). This should
calculate the new LSM and capability credentials into *bprm->cred.
This folds together security_bprm_set() and parts of
security_bprm_apply_creds() (these two have been removed).
Anything that might fail must be done at this point.
(iii) bprm->cred_prepared is set to 1.
bprm->cred_prepared is 0 on the first pass of the security
calculations, and 1 on all subsequent passes. This allows SELinux
in (ii) to base its calculations only on the initial script and
not on the interpreter.
(d) flush_old_exec() is called to commit the task to execution. This
performs the following steps with regard to credentials:
(i) Clear pdeath_signal and set dumpable on certain circumstances that
may not be covered by commit_creds().
(ii) Clear any bits in current->personality that were deferred from
(c.i).
(e) install_exec_creds() [compute_creds() as was] is called to install the
new credentials. This performs the following steps with regard to
credentials:
(i) Calls security_bprm_committing_creds() to apply any security
requirements, such as flushing unauthorised files in SELinux, that
must be done before the credentials are changed.
This is made up of bits of security_bprm_apply_creds() and
security_bprm_post_apply_creds(), both of which have been removed.
This function is not allowed to fail; anything that might fail
must have been done in (c.ii).
(ii) Calls commit_creds() to apply the new credentials in a single
assignment (more or less). Possibly pdeath_signal and dumpable
should be part of struct creds.
(iii) Unlocks the task's cred_replace_mutex, thus allowing
PTRACE_ATTACH to take place.
(iv) Clears The bprm->cred pointer as the credentials it was holding
are now immutable.
(v) Calls security_bprm_committed_creds() to apply any security
alterations that must be done after the creds have been changed.
SELinux uses this to flush signals and signal handlers.
(f) If an error occurs before (d.i), bprm_free() will call abort_creds()
to destroy the proposed new credentials and will then unlock
cred_replace_mutex. No changes to the credentials will have been
made.
(2) LSM interface.
A number of functions have been changed, added or removed:
(*) security_bprm_alloc(), ->bprm_alloc_security()
(*) security_bprm_free(), ->bprm_free_security()
Removed in favour of preparing new credentials and modifying those.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
(*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()
Removed; split between security_bprm_set_creds(),
security_bprm_committing_creds() and security_bprm_committed_creds().
(*) security_bprm_set(), ->bprm_set_security()
Removed; folded into security_bprm_set_creds().
(*) security_bprm_set_creds(), ->bprm_set_creds()
New. The new credentials in bprm->creds should be checked and set up
as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the
second and subsequent calls.
(*) security_bprm_committing_creds(), ->bprm_committing_creds()
(*) security_bprm_committed_creds(), ->bprm_committed_creds()
New. Apply the security effects of the new credentials. This
includes closing unauthorised files in SELinux. This function may not
fail. When the former is called, the creds haven't yet been applied
to the process; when the latter is called, they have.
The former may access bprm->cred, the latter may not.
(3) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) The bprm_security_struct struct has been removed in favour of using
the credentials-under-construction approach.
(c) flush_unauthorized_files() now takes a cred pointer and passes it on
to inode_has_perm(), file_has_perm() and dentry_open().
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:24 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(install_exec_creds);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* determine how safe it is to execute the proposed program
|
2010-10-28 06:34:08 +08:00
|
|
|
* - the caller must hold ->cred_guard_mutex to protect against
|
CRED: Make execve() take advantage of copy-on-write credentials
Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
The credential bits from struct linux_binprm are, for the most part,
replaced with a single credentials pointer (bprm->cred). This means that
all the creds can be calculated in advance and then applied at the point
of no return with no possibility of failure.
I would like to replace bprm->cap_effective with:
cap_isclear(bprm->cap_effective)
but this seems impossible due to special behaviour for processes of pid 1
(they always retain their parent's capability masks where normally they'd
be changed - see cap_bprm_set_creds()).
The following sequence of events now happens:
(a) At the start of do_execve, the current task's cred_exec_mutex is
locked to prevent PTRACE_ATTACH from obsoleting the calculation of
creds that we make.
(a) prepare_exec_creds() is then called to make a copy of the current
task's credentials and prepare it. This copy is then assigned to
bprm->cred.
This renders security_bprm_alloc() and security_bprm_free()
unnecessary, and so they've been removed.
(b) The determination of unsafe execution is now performed immediately
after (a) rather than later on in the code. The result is stored in
bprm->unsafe for future reference.
(c) prepare_binprm() is called, possibly multiple times.
(i) This applies the result of set[ug]id binaries to the new creds
attached to bprm->cred. Personality bit clearance is recorded,
but now deferred on the basis that the exec procedure may yet
fail.
(ii) This then calls the new security_bprm_set_creds(). This should
calculate the new LSM and capability credentials into *bprm->cred.
This folds together security_bprm_set() and parts of
security_bprm_apply_creds() (these two have been removed).
Anything that might fail must be done at this point.
(iii) bprm->cred_prepared is set to 1.
bprm->cred_prepared is 0 on the first pass of the security
calculations, and 1 on all subsequent passes. This allows SELinux
in (ii) to base its calculations only on the initial script and
not on the interpreter.
(d) flush_old_exec() is called to commit the task to execution. This
performs the following steps with regard to credentials:
(i) Clear pdeath_signal and set dumpable on certain circumstances that
may not be covered by commit_creds().
(ii) Clear any bits in current->personality that were deferred from
(c.i).
(e) install_exec_creds() [compute_creds() as was] is called to install the
new credentials. This performs the following steps with regard to
credentials:
(i) Calls security_bprm_committing_creds() to apply any security
requirements, such as flushing unauthorised files in SELinux, that
must be done before the credentials are changed.
This is made up of bits of security_bprm_apply_creds() and
security_bprm_post_apply_creds(), both of which have been removed.
This function is not allowed to fail; anything that might fail
must have been done in (c.ii).
(ii) Calls commit_creds() to apply the new credentials in a single
assignment (more or less). Possibly pdeath_signal and dumpable
should be part of struct creds.
(iii) Unlocks the task's cred_replace_mutex, thus allowing
PTRACE_ATTACH to take place.
(iv) Clears The bprm->cred pointer as the credentials it was holding
are now immutable.
(v) Calls security_bprm_committed_creds() to apply any security
alterations that must be done after the creds have been changed.
SELinux uses this to flush signals and signal handlers.
(f) If an error occurs before (d.i), bprm_free() will call abort_creds()
to destroy the proposed new credentials and will then unlock
cred_replace_mutex. No changes to the credentials will have been
made.
(2) LSM interface.
A number of functions have been changed, added or removed:
(*) security_bprm_alloc(), ->bprm_alloc_security()
(*) security_bprm_free(), ->bprm_free_security()
Removed in favour of preparing new credentials and modifying those.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
(*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()
Removed; split between security_bprm_set_creds(),
security_bprm_committing_creds() and security_bprm_committed_creds().
(*) security_bprm_set(), ->bprm_set_security()
Removed; folded into security_bprm_set_creds().
(*) security_bprm_set_creds(), ->bprm_set_creds()
New. The new credentials in bprm->creds should be checked and set up
as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the
second and subsequent calls.
(*) security_bprm_committing_creds(), ->bprm_committing_creds()
(*) security_bprm_committed_creds(), ->bprm_committed_creds()
New. Apply the security effects of the new credentials. This
includes closing unauthorised files in SELinux. This function may not
fail. When the former is called, the creds haven't yet been applied
to the process; when the latter is called, they have.
The former may access bprm->cred, the latter may not.
(3) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) The bprm_security_struct struct has been removed in favour of using
the credentials-under-construction approach.
(c) flush_unauthorized_files() now takes a cred pointer and passes it on
to inode_has_perm(), file_has_perm() and dentry_open().
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:24 +08:00
|
|
|
* PTRACE_ATTACH
|
|
|
|
*/
|
2009-03-30 19:20:30 +08:00
|
|
|
int check_unsafe_exec(struct linux_binprm *bprm)
|
CRED: Make execve() take advantage of copy-on-write credentials
Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
The credential bits from struct linux_binprm are, for the most part,
replaced with a single credentials pointer (bprm->cred). This means that
all the creds can be calculated in advance and then applied at the point
of no return with no possibility of failure.
I would like to replace bprm->cap_effective with:
cap_isclear(bprm->cap_effective)
but this seems impossible due to special behaviour for processes of pid 1
(they always retain their parent's capability masks where normally they'd
be changed - see cap_bprm_set_creds()).
The following sequence of events now happens:
(a) At the start of do_execve, the current task's cred_exec_mutex is
locked to prevent PTRACE_ATTACH from obsoleting the calculation of
creds that we make.
(a) prepare_exec_creds() is then called to make a copy of the current
task's credentials and prepare it. This copy is then assigned to
bprm->cred.
This renders security_bprm_alloc() and security_bprm_free()
unnecessary, and so they've been removed.
(b) The determination of unsafe execution is now performed immediately
after (a) rather than later on in the code. The result is stored in
bprm->unsafe for future reference.
(c) prepare_binprm() is called, possibly multiple times.
(i) This applies the result of set[ug]id binaries to the new creds
attached to bprm->cred. Personality bit clearance is recorded,
but now deferred on the basis that the exec procedure may yet
fail.
(ii) This then calls the new security_bprm_set_creds(). This should
calculate the new LSM and capability credentials into *bprm->cred.
This folds together security_bprm_set() and parts of
security_bprm_apply_creds() (these two have been removed).
Anything that might fail must be done at this point.
(iii) bprm->cred_prepared is set to 1.
bprm->cred_prepared is 0 on the first pass of the security
calculations, and 1 on all subsequent passes. This allows SELinux
in (ii) to base its calculations only on the initial script and
not on the interpreter.
(d) flush_old_exec() is called to commit the task to execution. This
performs the following steps with regard to credentials:
(i) Clear pdeath_signal and set dumpable on certain circumstances that
may not be covered by commit_creds().
(ii) Clear any bits in current->personality that were deferred from
(c.i).
(e) install_exec_creds() [compute_creds() as was] is called to install the
new credentials. This performs the following steps with regard to
credentials:
(i) Calls security_bprm_committing_creds() to apply any security
requirements, such as flushing unauthorised files in SELinux, that
must be done before the credentials are changed.
This is made up of bits of security_bprm_apply_creds() and
security_bprm_post_apply_creds(), both of which have been removed.
This function is not allowed to fail; anything that might fail
must have been done in (c.ii).
(ii) Calls commit_creds() to apply the new credentials in a single
assignment (more or less). Possibly pdeath_signal and dumpable
should be part of struct creds.
(iii) Unlocks the task's cred_replace_mutex, thus allowing
PTRACE_ATTACH to take place.
(iv) Clears The bprm->cred pointer as the credentials it was holding
are now immutable.
(v) Calls security_bprm_committed_creds() to apply any security
alterations that must be done after the creds have been changed.
SELinux uses this to flush signals and signal handlers.
(f) If an error occurs before (d.i), bprm_free() will call abort_creds()
to destroy the proposed new credentials and will then unlock
cred_replace_mutex. No changes to the credentials will have been
made.
(2) LSM interface.
A number of functions have been changed, added or removed:
(*) security_bprm_alloc(), ->bprm_alloc_security()
(*) security_bprm_free(), ->bprm_free_security()
Removed in favour of preparing new credentials and modifying those.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
(*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()
Removed; split between security_bprm_set_creds(),
security_bprm_committing_creds() and security_bprm_committed_creds().
(*) security_bprm_set(), ->bprm_set_security()
Removed; folded into security_bprm_set_creds().
(*) security_bprm_set_creds(), ->bprm_set_creds()
New. The new credentials in bprm->creds should be checked and set up
as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the
second and subsequent calls.
(*) security_bprm_committing_creds(), ->bprm_committing_creds()
(*) security_bprm_committed_creds(), ->bprm_committed_creds()
New. Apply the security effects of the new credentials. This
includes closing unauthorised files in SELinux. This function may not
fail. When the former is called, the creds haven't yet been applied
to the process; when the latter is called, they have.
The former may access bprm->cred, the latter may not.
(3) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) The bprm_security_struct struct has been removed in favour of using
the credentials-under-construction approach.
(c) flush_unauthorized_files() now takes a cred pointer and passes it on
to inode_has_perm(), file_has_perm() and dentry_open().
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:24 +08:00
|
|
|
{
|
CRED: Fix SUID exec regression
The patch:
commit a6f76f23d297f70e2a6b3ec607f7aeeea9e37e8d
CRED: Make execve() take advantage of copy-on-write credentials
moved the place in which the 'safeness' of a SUID/SGID exec was performed to
before de_thread() was called. This means that LSM_UNSAFE_SHARE is now
calculated incorrectly. This flag is set if any of the usage counts for
fs_struct, files_struct and sighand_struct are greater than 1 at the time the
determination is made. All of which are true for threads created by the
pthread library.
However, since we wish to make the security calculation before irrevocably
damaging the process so that we can return it an error code in the case where
we decide we want to reject the exec request on this basis, we have to make the
determination before calling de_thread().
So, instead, we count up the number of threads (CLONE_THREAD) that are sharing
our fs_struct (CLONE_FS), files_struct (CLONE_FILES) and sighand_structs
(CLONE_SIGHAND/CLONE_THREAD) with us. These will be killed by de_thread() and
so can be discounted by check_unsafe_exec().
We do have to be careful because CLONE_THREAD does not imply FS or FILES.
We _assume_ that there will be no extra references to these structs held by the
threads we're going to kill.
This can be tested with the attached pair of programs. Build the two programs
using the Makefile supplied, and run ./test1 as a non-root user. If
successful, you should see something like:
[dhowells@andromeda tmp]$ ./test1
--TEST1--
uid=4043, euid=4043 suid=4043
exec ./test2
--TEST2--
uid=4043, euid=0 suid=0
SUCCESS - Correct effective user ID
and if unsuccessful, something like:
[dhowells@andromeda tmp]$ ./test1
--TEST1--
uid=4043, euid=4043 suid=4043
exec ./test2
--TEST2--
uid=4043, euid=4043 suid=4043
ERROR - Incorrect effective user ID!
The non-root user ID you see will depend on the user you run as.
[test1.c]
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>
static void *thread_func(void *arg)
{
while (1) {}
}
int main(int argc, char **argv)
{
pthread_t tid;
uid_t uid, euid, suid;
printf("--TEST1--\n");
getresuid(&uid, &euid, &suid);
printf("uid=%d, euid=%d suid=%d\n", uid, euid, suid);
if (pthread_create(&tid, NULL, thread_func, NULL) < 0) {
perror("pthread_create");
exit(1);
}
printf("exec ./test2\n");
execlp("./test2", "test2", NULL);
perror("./test2");
_exit(1);
}
[test2.c]
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
int main(int argc, char **argv)
{
uid_t uid, euid, suid;
getresuid(&uid, &euid, &suid);
printf("--TEST2--\n");
printf("uid=%d, euid=%d suid=%d\n", uid, euid, suid);
if (euid != 0) {
fprintf(stderr, "ERROR - Incorrect effective user ID!\n");
exit(1);
}
printf("SUCCESS - Correct effective user ID\n");
exit(0);
}
[Makefile]
CFLAGS = -D_GNU_SOURCE -Wall -Werror -Wunused
all: test1 test2
test1: test1.c
gcc $(CFLAGS) -o test1 test1.c -lpthread
test2: test2.c
gcc $(CFLAGS) -o test2 test2.c
sudo chown root.root test2
sudo chmod +s test2
Reported-by: David Smith <dsmith@redhat.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: David Smith <dsmith@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
2009-02-06 19:45:46 +08:00
|
|
|
struct task_struct *p = current, *t;
|
2009-03-30 19:35:18 +08:00
|
|
|
unsigned n_fs;
|
2009-03-30 19:20:30 +08:00
|
|
|
int res = 0;
|
CRED: Make execve() take advantage of copy-on-write credentials
Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
The credential bits from struct linux_binprm are, for the most part,
replaced with a single credentials pointer (bprm->cred). This means that
all the creds can be calculated in advance and then applied at the point
of no return with no possibility of failure.
I would like to replace bprm->cap_effective with:
cap_isclear(bprm->cap_effective)
but this seems impossible due to special behaviour for processes of pid 1
(they always retain their parent's capability masks where normally they'd
be changed - see cap_bprm_set_creds()).
The following sequence of events now happens:
(a) At the start of do_execve, the current task's cred_exec_mutex is
locked to prevent PTRACE_ATTACH from obsoleting the calculation of
creds that we make.
(a) prepare_exec_creds() is then called to make a copy of the current
task's credentials and prepare it. This copy is then assigned to
bprm->cred.
This renders security_bprm_alloc() and security_bprm_free()
unnecessary, and so they've been removed.
(b) The determination of unsafe execution is now performed immediately
after (a) rather than later on in the code. The result is stored in
bprm->unsafe for future reference.
(c) prepare_binprm() is called, possibly multiple times.
(i) This applies the result of set[ug]id binaries to the new creds
attached to bprm->cred. Personality bit clearance is recorded,
but now deferred on the basis that the exec procedure may yet
fail.
(ii) This then calls the new security_bprm_set_creds(). This should
calculate the new LSM and capability credentials into *bprm->cred.
This folds together security_bprm_set() and parts of
security_bprm_apply_creds() (these two have been removed).
Anything that might fail must be done at this point.
(iii) bprm->cred_prepared is set to 1.
bprm->cred_prepared is 0 on the first pass of the security
calculations, and 1 on all subsequent passes. This allows SELinux
in (ii) to base its calculations only on the initial script and
not on the interpreter.
(d) flush_old_exec() is called to commit the task to execution. This
performs the following steps with regard to credentials:
(i) Clear pdeath_signal and set dumpable on certain circumstances that
may not be covered by commit_creds().
(ii) Clear any bits in current->personality that were deferred from
(c.i).
(e) install_exec_creds() [compute_creds() as was] is called to install the
new credentials. This performs the following steps with regard to
credentials:
(i) Calls security_bprm_committing_creds() to apply any security
requirements, such as flushing unauthorised files in SELinux, that
must be done before the credentials are changed.
This is made up of bits of security_bprm_apply_creds() and
security_bprm_post_apply_creds(), both of which have been removed.
This function is not allowed to fail; anything that might fail
must have been done in (c.ii).
(ii) Calls commit_creds() to apply the new credentials in a single
assignment (more or less). Possibly pdeath_signal and dumpable
should be part of struct creds.
(iii) Unlocks the task's cred_replace_mutex, thus allowing
PTRACE_ATTACH to take place.
(iv) Clears The bprm->cred pointer as the credentials it was holding
are now immutable.
(v) Calls security_bprm_committed_creds() to apply any security
alterations that must be done after the creds have been changed.
SELinux uses this to flush signals and signal handlers.
(f) If an error occurs before (d.i), bprm_free() will call abort_creds()
to destroy the proposed new credentials and will then unlock
cred_replace_mutex. No changes to the credentials will have been
made.
(2) LSM interface.
A number of functions have been changed, added or removed:
(*) security_bprm_alloc(), ->bprm_alloc_security()
(*) security_bprm_free(), ->bprm_free_security()
Removed in favour of preparing new credentials and modifying those.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
(*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()
Removed; split between security_bprm_set_creds(),
security_bprm_committing_creds() and security_bprm_committed_creds().
(*) security_bprm_set(), ->bprm_set_security()
Removed; folded into security_bprm_set_creds().
(*) security_bprm_set_creds(), ->bprm_set_creds()
New. The new credentials in bprm->creds should be checked and set up
as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the
second and subsequent calls.
(*) security_bprm_committing_creds(), ->bprm_committing_creds()
(*) security_bprm_committed_creds(), ->bprm_committed_creds()
New. Apply the security effects of the new credentials. This
includes closing unauthorised files in SELinux. This function may not
fail. When the former is called, the creds haven't yet been applied
to the process; when the latter is called, they have.
The former may access bprm->cred, the latter may not.
(3) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) The bprm_security_struct struct has been removed in favour of using
the credentials-under-construction approach.
(c) flush_unauthorized_files() now takes a cred pointer and passes it on
to inode_has_perm(), file_has_perm() and dentry_open().
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:24 +08:00
|
|
|
|
2011-06-17 22:50:38 +08:00
|
|
|
if (p->ptrace) {
|
|
|
|
if (p->ptrace & PT_PTRACE_CAP)
|
|
|
|
bprm->unsafe |= LSM_UNSAFE_PTRACE_CAP;
|
|
|
|
else
|
|
|
|
bprm->unsafe |= LSM_UNSAFE_PTRACE;
|
|
|
|
}
|
CRED: Make execve() take advantage of copy-on-write credentials
Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
The credential bits from struct linux_binprm are, for the most part,
replaced with a single credentials pointer (bprm->cred). This means that
all the creds can be calculated in advance and then applied at the point
of no return with no possibility of failure.
I would like to replace bprm->cap_effective with:
cap_isclear(bprm->cap_effective)
but this seems impossible due to special behaviour for processes of pid 1
(they always retain their parent's capability masks where normally they'd
be changed - see cap_bprm_set_creds()).
The following sequence of events now happens:
(a) At the start of do_execve, the current task's cred_exec_mutex is
locked to prevent PTRACE_ATTACH from obsoleting the calculation of
creds that we make.
(a) prepare_exec_creds() is then called to make a copy of the current
task's credentials and prepare it. This copy is then assigned to
bprm->cred.
This renders security_bprm_alloc() and security_bprm_free()
unnecessary, and so they've been removed.
(b) The determination of unsafe execution is now performed immediately
after (a) rather than later on in the code. The result is stored in
bprm->unsafe for future reference.
(c) prepare_binprm() is called, possibly multiple times.
(i) This applies the result of set[ug]id binaries to the new creds
attached to bprm->cred. Personality bit clearance is recorded,
but now deferred on the basis that the exec procedure may yet
fail.
(ii) This then calls the new security_bprm_set_creds(). This should
calculate the new LSM and capability credentials into *bprm->cred.
This folds together security_bprm_set() and parts of
security_bprm_apply_creds() (these two have been removed).
Anything that might fail must be done at this point.
(iii) bprm->cred_prepared is set to 1.
bprm->cred_prepared is 0 on the first pass of the security
calculations, and 1 on all subsequent passes. This allows SELinux
in (ii) to base its calculations only on the initial script and
not on the interpreter.
(d) flush_old_exec() is called to commit the task to execution. This
performs the following steps with regard to credentials:
(i) Clear pdeath_signal and set dumpable on certain circumstances that
may not be covered by commit_creds().
(ii) Clear any bits in current->personality that were deferred from
(c.i).
(e) install_exec_creds() [compute_creds() as was] is called to install the
new credentials. This performs the following steps with regard to
credentials:
(i) Calls security_bprm_committing_creds() to apply any security
requirements, such as flushing unauthorised files in SELinux, that
must be done before the credentials are changed.
This is made up of bits of security_bprm_apply_creds() and
security_bprm_post_apply_creds(), both of which have been removed.
This function is not allowed to fail; anything that might fail
must have been done in (c.ii).
(ii) Calls commit_creds() to apply the new credentials in a single
assignment (more or less). Possibly pdeath_signal and dumpable
should be part of struct creds.
(iii) Unlocks the task's cred_replace_mutex, thus allowing
PTRACE_ATTACH to take place.
(iv) Clears The bprm->cred pointer as the credentials it was holding
are now immutable.
(v) Calls security_bprm_committed_creds() to apply any security
alterations that must be done after the creds have been changed.
SELinux uses this to flush signals and signal handlers.
(f) If an error occurs before (d.i), bprm_free() will call abort_creds()
to destroy the proposed new credentials and will then unlock
cred_replace_mutex. No changes to the credentials will have been
made.
(2) LSM interface.
A number of functions have been changed, added or removed:
(*) security_bprm_alloc(), ->bprm_alloc_security()
(*) security_bprm_free(), ->bprm_free_security()
Removed in favour of preparing new credentials and modifying those.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
(*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()
Removed; split between security_bprm_set_creds(),
security_bprm_committing_creds() and security_bprm_committed_creds().
(*) security_bprm_set(), ->bprm_set_security()
Removed; folded into security_bprm_set_creds().
(*) security_bprm_set_creds(), ->bprm_set_creds()
New. The new credentials in bprm->creds should be checked and set up
as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the
second and subsequent calls.
(*) security_bprm_committing_creds(), ->bprm_committing_creds()
(*) security_bprm_committed_creds(), ->bprm_committed_creds()
New. Apply the security effects of the new credentials. This
includes closing unauthorised files in SELinux. This function may not
fail. When the former is called, the creds haven't yet been applied
to the process; when the latter is called, they have.
The former may access bprm->cred, the latter may not.
(3) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) The bprm_security_struct struct has been removed in favour of using
the credentials-under-construction approach.
(c) flush_unauthorized_files() now takes a cred pointer and passes it on
to inode_has_perm(), file_has_perm() and dentry_open().
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:24 +08:00
|
|
|
|
CRED: Fix SUID exec regression
The patch:
commit a6f76f23d297f70e2a6b3ec607f7aeeea9e37e8d
CRED: Make execve() take advantage of copy-on-write credentials
moved the place in which the 'safeness' of a SUID/SGID exec was performed to
before de_thread() was called. This means that LSM_UNSAFE_SHARE is now
calculated incorrectly. This flag is set if any of the usage counts for
fs_struct, files_struct and sighand_struct are greater than 1 at the time the
determination is made. All of which are true for threads created by the
pthread library.
However, since we wish to make the security calculation before irrevocably
damaging the process so that we can return it an error code in the case where
we decide we want to reject the exec request on this basis, we have to make the
determination before calling de_thread().
So, instead, we count up the number of threads (CLONE_THREAD) that are sharing
our fs_struct (CLONE_FS), files_struct (CLONE_FILES) and sighand_structs
(CLONE_SIGHAND/CLONE_THREAD) with us. These will be killed by de_thread() and
so can be discounted by check_unsafe_exec().
We do have to be careful because CLONE_THREAD does not imply FS or FILES.
We _assume_ that there will be no extra references to these structs held by the
threads we're going to kill.
This can be tested with the attached pair of programs. Build the two programs
using the Makefile supplied, and run ./test1 as a non-root user. If
successful, you should see something like:
[dhowells@andromeda tmp]$ ./test1
--TEST1--
uid=4043, euid=4043 suid=4043
exec ./test2
--TEST2--
uid=4043, euid=0 suid=0
SUCCESS - Correct effective user ID
and if unsuccessful, something like:
[dhowells@andromeda tmp]$ ./test1
--TEST1--
uid=4043, euid=4043 suid=4043
exec ./test2
--TEST2--
uid=4043, euid=4043 suid=4043
ERROR - Incorrect effective user ID!
The non-root user ID you see will depend on the user you run as.
[test1.c]
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>
static void *thread_func(void *arg)
{
while (1) {}
}
int main(int argc, char **argv)
{
pthread_t tid;
uid_t uid, euid, suid;
printf("--TEST1--\n");
getresuid(&uid, &euid, &suid);
printf("uid=%d, euid=%d suid=%d\n", uid, euid, suid);
if (pthread_create(&tid, NULL, thread_func, NULL) < 0) {
perror("pthread_create");
exit(1);
}
printf("exec ./test2\n");
execlp("./test2", "test2", NULL);
perror("./test2");
_exit(1);
}
[test2.c]
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
int main(int argc, char **argv)
{
uid_t uid, euid, suid;
getresuid(&uid, &euid, &suid);
printf("--TEST2--\n");
printf("uid=%d, euid=%d suid=%d\n", uid, euid, suid);
if (euid != 0) {
fprintf(stderr, "ERROR - Incorrect effective user ID!\n");
exit(1);
}
printf("SUCCESS - Correct effective user ID\n");
exit(0);
}
[Makefile]
CFLAGS = -D_GNU_SOURCE -Wall -Werror -Wunused
all: test1 test2
test1: test1.c
gcc $(CFLAGS) -o test1 test1.c -lpthread
test2: test2.c
gcc $(CFLAGS) -o test2 test2.c
sudo chown root.root test2
sudo chmod +s test2
Reported-by: David Smith <dsmith@redhat.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: David Smith <dsmith@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
2009-02-06 19:45:46 +08:00
|
|
|
n_fs = 1;
|
2010-08-18 02:37:33 +08:00
|
|
|
spin_lock(&p->fs->lock);
|
2009-04-24 07:02:45 +08:00
|
|
|
rcu_read_lock();
|
CRED: Fix SUID exec regression
The patch:
commit a6f76f23d297f70e2a6b3ec607f7aeeea9e37e8d
CRED: Make execve() take advantage of copy-on-write credentials
moved the place in which the 'safeness' of a SUID/SGID exec was performed to
before de_thread() was called. This means that LSM_UNSAFE_SHARE is now
calculated incorrectly. This flag is set if any of the usage counts for
fs_struct, files_struct and sighand_struct are greater than 1 at the time the
determination is made. All of which are true for threads created by the
pthread library.
However, since we wish to make the security calculation before irrevocably
damaging the process so that we can return it an error code in the case where
we decide we want to reject the exec request on this basis, we have to make the
determination before calling de_thread().
So, instead, we count up the number of threads (CLONE_THREAD) that are sharing
our fs_struct (CLONE_FS), files_struct (CLONE_FILES) and sighand_structs
(CLONE_SIGHAND/CLONE_THREAD) with us. These will be killed by de_thread() and
so can be discounted by check_unsafe_exec().
We do have to be careful because CLONE_THREAD does not imply FS or FILES.
We _assume_ that there will be no extra references to these structs held by the
threads we're going to kill.
This can be tested with the attached pair of programs. Build the two programs
using the Makefile supplied, and run ./test1 as a non-root user. If
successful, you should see something like:
[dhowells@andromeda tmp]$ ./test1
--TEST1--
uid=4043, euid=4043 suid=4043
exec ./test2
--TEST2--
uid=4043, euid=0 suid=0
SUCCESS - Correct effective user ID
and if unsuccessful, something like:
[dhowells@andromeda tmp]$ ./test1
--TEST1--
uid=4043, euid=4043 suid=4043
exec ./test2
--TEST2--
uid=4043, euid=4043 suid=4043
ERROR - Incorrect effective user ID!
The non-root user ID you see will depend on the user you run as.
[test1.c]
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>
static void *thread_func(void *arg)
{
while (1) {}
}
int main(int argc, char **argv)
{
pthread_t tid;
uid_t uid, euid, suid;
printf("--TEST1--\n");
getresuid(&uid, &euid, &suid);
printf("uid=%d, euid=%d suid=%d\n", uid, euid, suid);
if (pthread_create(&tid, NULL, thread_func, NULL) < 0) {
perror("pthread_create");
exit(1);
}
printf("exec ./test2\n");
execlp("./test2", "test2", NULL);
perror("./test2");
_exit(1);
}
[test2.c]
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
int main(int argc, char **argv)
{
uid_t uid, euid, suid;
getresuid(&uid, &euid, &suid);
printf("--TEST2--\n");
printf("uid=%d, euid=%d suid=%d\n", uid, euid, suid);
if (euid != 0) {
fprintf(stderr, "ERROR - Incorrect effective user ID!\n");
exit(1);
}
printf("SUCCESS - Correct effective user ID\n");
exit(0);
}
[Makefile]
CFLAGS = -D_GNU_SOURCE -Wall -Werror -Wunused
all: test1 test2
test1: test1.c
gcc $(CFLAGS) -o test1 test1.c -lpthread
test2: test2.c
gcc $(CFLAGS) -o test2 test2.c
sudo chown root.root test2
sudo chmod +s test2
Reported-by: David Smith <dsmith@redhat.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: David Smith <dsmith@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
2009-02-06 19:45:46 +08:00
|
|
|
for (t = next_thread(p); t != p; t = next_thread(t)) {
|
|
|
|
if (t->fs == p->fs)
|
|
|
|
n_fs++;
|
|
|
|
}
|
2009-04-24 07:02:45 +08:00
|
|
|
rcu_read_unlock();
|
CRED: Fix SUID exec regression
The patch:
commit a6f76f23d297f70e2a6b3ec607f7aeeea9e37e8d
CRED: Make execve() take advantage of copy-on-write credentials
moved the place in which the 'safeness' of a SUID/SGID exec was performed to
before de_thread() was called. This means that LSM_UNSAFE_SHARE is now
calculated incorrectly. This flag is set if any of the usage counts for
fs_struct, files_struct and sighand_struct are greater than 1 at the time the
determination is made. All of which are true for threads created by the
pthread library.
However, since we wish to make the security calculation before irrevocably
damaging the process so that we can return it an error code in the case where
we decide we want to reject the exec request on this basis, we have to make the
determination before calling de_thread().
So, instead, we count up the number of threads (CLONE_THREAD) that are sharing
our fs_struct (CLONE_FS), files_struct (CLONE_FILES) and sighand_structs
(CLONE_SIGHAND/CLONE_THREAD) with us. These will be killed by de_thread() and
so can be discounted by check_unsafe_exec().
We do have to be careful because CLONE_THREAD does not imply FS or FILES.
We _assume_ that there will be no extra references to these structs held by the
threads we're going to kill.
This can be tested with the attached pair of programs. Build the two programs
using the Makefile supplied, and run ./test1 as a non-root user. If
successful, you should see something like:
[dhowells@andromeda tmp]$ ./test1
--TEST1--
uid=4043, euid=4043 suid=4043
exec ./test2
--TEST2--
uid=4043, euid=0 suid=0
SUCCESS - Correct effective user ID
and if unsuccessful, something like:
[dhowells@andromeda tmp]$ ./test1
--TEST1--
uid=4043, euid=4043 suid=4043
exec ./test2
--TEST2--
uid=4043, euid=4043 suid=4043
ERROR - Incorrect effective user ID!
The non-root user ID you see will depend on the user you run as.
[test1.c]
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <pthread.h>
static void *thread_func(void *arg)
{
while (1) {}
}
int main(int argc, char **argv)
{
pthread_t tid;
uid_t uid, euid, suid;
printf("--TEST1--\n");
getresuid(&uid, &euid, &suid);
printf("uid=%d, euid=%d suid=%d\n", uid, euid, suid);
if (pthread_create(&tid, NULL, thread_func, NULL) < 0) {
perror("pthread_create");
exit(1);
}
printf("exec ./test2\n");
execlp("./test2", "test2", NULL);
perror("./test2");
_exit(1);
}
[test2.c]
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
int main(int argc, char **argv)
{
uid_t uid, euid, suid;
getresuid(&uid, &euid, &suid);
printf("--TEST2--\n");
printf("uid=%d, euid=%d suid=%d\n", uid, euid, suid);
if (euid != 0) {
fprintf(stderr, "ERROR - Incorrect effective user ID!\n");
exit(1);
}
printf("SUCCESS - Correct effective user ID\n");
exit(0);
}
[Makefile]
CFLAGS = -D_GNU_SOURCE -Wall -Werror -Wunused
all: test1 test2
test1: test1.c
gcc $(CFLAGS) -o test1 test1.c -lpthread
test2: test2.c
gcc $(CFLAGS) -o test2 test2.c
sudo chown root.root test2
sudo chmod +s test2
Reported-by: David Smith <dsmith@redhat.com>
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: David Smith <dsmith@redhat.com>
Signed-off-by: James Morris <jmorris@namei.org>
2009-02-06 19:45:46 +08:00
|
|
|
|
2009-03-30 19:35:18 +08:00
|
|
|
if (p->fs->users > n_fs) {
|
CRED: Make execve() take advantage of copy-on-write credentials
Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
The credential bits from struct linux_binprm are, for the most part,
replaced with a single credentials pointer (bprm->cred). This means that
all the creds can be calculated in advance and then applied at the point
of no return with no possibility of failure.
I would like to replace bprm->cap_effective with:
cap_isclear(bprm->cap_effective)
but this seems impossible due to special behaviour for processes of pid 1
(they always retain their parent's capability masks where normally they'd
be changed - see cap_bprm_set_creds()).
The following sequence of events now happens:
(a) At the start of do_execve, the current task's cred_exec_mutex is
locked to prevent PTRACE_ATTACH from obsoleting the calculation of
creds that we make.
(a) prepare_exec_creds() is then called to make a copy of the current
task's credentials and prepare it. This copy is then assigned to
bprm->cred.
This renders security_bprm_alloc() and security_bprm_free()
unnecessary, and so they've been removed.
(b) The determination of unsafe execution is now performed immediately
after (a) rather than later on in the code. The result is stored in
bprm->unsafe for future reference.
(c) prepare_binprm() is called, possibly multiple times.
(i) This applies the result of set[ug]id binaries to the new creds
attached to bprm->cred. Personality bit clearance is recorded,
but now deferred on the basis that the exec procedure may yet
fail.
(ii) This then calls the new security_bprm_set_creds(). This should
calculate the new LSM and capability credentials into *bprm->cred.
This folds together security_bprm_set() and parts of
security_bprm_apply_creds() (these two have been removed).
Anything that might fail must be done at this point.
(iii) bprm->cred_prepared is set to 1.
bprm->cred_prepared is 0 on the first pass of the security
calculations, and 1 on all subsequent passes. This allows SELinux
in (ii) to base its calculations only on the initial script and
not on the interpreter.
(d) flush_old_exec() is called to commit the task to execution. This
performs the following steps with regard to credentials:
(i) Clear pdeath_signal and set dumpable on certain circumstances that
may not be covered by commit_creds().
(ii) Clear any bits in current->personality that were deferred from
(c.i).
(e) install_exec_creds() [compute_creds() as was] is called to install the
new credentials. This performs the following steps with regard to
credentials:
(i) Calls security_bprm_committing_creds() to apply any security
requirements, such as flushing unauthorised files in SELinux, that
must be done before the credentials are changed.
This is made up of bits of security_bprm_apply_creds() and
security_bprm_post_apply_creds(), both of which have been removed.
This function is not allowed to fail; anything that might fail
must have been done in (c.ii).
(ii) Calls commit_creds() to apply the new credentials in a single
assignment (more or less). Possibly pdeath_signal and dumpable
should be part of struct creds.
(iii) Unlocks the task's cred_replace_mutex, thus allowing
PTRACE_ATTACH to take place.
(iv) Clears The bprm->cred pointer as the credentials it was holding
are now immutable.
(v) Calls security_bprm_committed_creds() to apply any security
alterations that must be done after the creds have been changed.
SELinux uses this to flush signals and signal handlers.
(f) If an error occurs before (d.i), bprm_free() will call abort_creds()
to destroy the proposed new credentials and will then unlock
cred_replace_mutex. No changes to the credentials will have been
made.
(2) LSM interface.
A number of functions have been changed, added or removed:
(*) security_bprm_alloc(), ->bprm_alloc_security()
(*) security_bprm_free(), ->bprm_free_security()
Removed in favour of preparing new credentials and modifying those.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
(*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()
Removed; split between security_bprm_set_creds(),
security_bprm_committing_creds() and security_bprm_committed_creds().
(*) security_bprm_set(), ->bprm_set_security()
Removed; folded into security_bprm_set_creds().
(*) security_bprm_set_creds(), ->bprm_set_creds()
New. The new credentials in bprm->creds should be checked and set up
as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the
second and subsequent calls.
(*) security_bprm_committing_creds(), ->bprm_committing_creds()
(*) security_bprm_committed_creds(), ->bprm_committed_creds()
New. Apply the security effects of the new credentials. This
includes closing unauthorised files in SELinux. This function may not
fail. When the former is called, the creds haven't yet been applied
to the process; when the latter is called, they have.
The former may access bprm->cred, the latter may not.
(3) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) The bprm_security_struct struct has been removed in favour of using
the credentials-under-construction approach.
(c) flush_unauthorized_files() now takes a cred pointer and passes it on
to inode_has_perm(), file_has_perm() and dentry_open().
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:24 +08:00
|
|
|
bprm->unsafe |= LSM_UNSAFE_SHARE;
|
2009-03-30 19:20:30 +08:00
|
|
|
} else {
|
do_execve() must not clear fs->in_exec if it was set by another thread
If do_execve() fails after check_unsafe_exec(), it clears fs->in_exec
unconditionally. This is wrong if we race with our sub-thread which
also does do_execve:
Two threads T1 and T2 and another process P, all share the same
->fs.
T1 starts do_execve(BAD_FILE). It calls check_unsafe_exec(), since
->fs is shared, we set LSM_UNSAFE but not ->in_exec.
P exits and decrements fs->users.
T2 starts do_execve(), calls check_unsafe_exec(), now ->fs is not
shared, we set fs->in_exec.
T1 continues, open_exec(BAD_FILE) fails, we clear ->in_exec and
return to the user-space.
T1 does clone(CLONE_FS /* without CLONE_THREAD */).
T2 continues without LSM_UNSAFE_SHARE while ->fs is shared with
another process.
Change check_unsafe_exec() to return res = 1 if we set ->in_exec, and change
do_execve() to clear ->in_exec depending on res.
When do_execve() suceeds, it is safe to clear ->in_exec unconditionally.
It can be set only if we don't share ->fs with another process, and since
we already killed all sub-threads either ->in_exec == 0 or we are the
only user of this ->fs.
Also, we do not need fs->lock to clear fs->in_exec.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Roland McGrath <roland@redhat.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-24 07:01:56 +08:00
|
|
|
res = -EAGAIN;
|
|
|
|
if (!p->fs->in_exec) {
|
|
|
|
p->fs->in_exec = 1;
|
|
|
|
res = 1;
|
|
|
|
}
|
2009-03-30 19:20:30 +08:00
|
|
|
}
|
2010-08-18 02:37:33 +08:00
|
|
|
spin_unlock(&p->fs->lock);
|
2009-03-30 19:20:30 +08:00
|
|
|
|
|
|
|
return res;
|
CRED: Make execve() take advantage of copy-on-write credentials
Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
The credential bits from struct linux_binprm are, for the most part,
replaced with a single credentials pointer (bprm->cred). This means that
all the creds can be calculated in advance and then applied at the point
of no return with no possibility of failure.
I would like to replace bprm->cap_effective with:
cap_isclear(bprm->cap_effective)
but this seems impossible due to special behaviour for processes of pid 1
(they always retain their parent's capability masks where normally they'd
be changed - see cap_bprm_set_creds()).
The following sequence of events now happens:
(a) At the start of do_execve, the current task's cred_exec_mutex is
locked to prevent PTRACE_ATTACH from obsoleting the calculation of
creds that we make.
(a) prepare_exec_creds() is then called to make a copy of the current
task's credentials and prepare it. This copy is then assigned to
bprm->cred.
This renders security_bprm_alloc() and security_bprm_free()
unnecessary, and so they've been removed.
(b) The determination of unsafe execution is now performed immediately
after (a) rather than later on in the code. The result is stored in
bprm->unsafe for future reference.
(c) prepare_binprm() is called, possibly multiple times.
(i) This applies the result of set[ug]id binaries to the new creds
attached to bprm->cred. Personality bit clearance is recorded,
but now deferred on the basis that the exec procedure may yet
fail.
(ii) This then calls the new security_bprm_set_creds(). This should
calculate the new LSM and capability credentials into *bprm->cred.
This folds together security_bprm_set() and parts of
security_bprm_apply_creds() (these two have been removed).
Anything that might fail must be done at this point.
(iii) bprm->cred_prepared is set to 1.
bprm->cred_prepared is 0 on the first pass of the security
calculations, and 1 on all subsequent passes. This allows SELinux
in (ii) to base its calculations only on the initial script and
not on the interpreter.
(d) flush_old_exec() is called to commit the task to execution. This
performs the following steps with regard to credentials:
(i) Clear pdeath_signal and set dumpable on certain circumstances that
may not be covered by commit_creds().
(ii) Clear any bits in current->personality that were deferred from
(c.i).
(e) install_exec_creds() [compute_creds() as was] is called to install the
new credentials. This performs the following steps with regard to
credentials:
(i) Calls security_bprm_committing_creds() to apply any security
requirements, such as flushing unauthorised files in SELinux, that
must be done before the credentials are changed.
This is made up of bits of security_bprm_apply_creds() and
security_bprm_post_apply_creds(), both of which have been removed.
This function is not allowed to fail; anything that might fail
must have been done in (c.ii).
(ii) Calls commit_creds() to apply the new credentials in a single
assignment (more or less). Possibly pdeath_signal and dumpable
should be part of struct creds.
(iii) Unlocks the task's cred_replace_mutex, thus allowing
PTRACE_ATTACH to take place.
(iv) Clears The bprm->cred pointer as the credentials it was holding
are now immutable.
(v) Calls security_bprm_committed_creds() to apply any security
alterations that must be done after the creds have been changed.
SELinux uses this to flush signals and signal handlers.
(f) If an error occurs before (d.i), bprm_free() will call abort_creds()
to destroy the proposed new credentials and will then unlock
cred_replace_mutex. No changes to the credentials will have been
made.
(2) LSM interface.
A number of functions have been changed, added or removed:
(*) security_bprm_alloc(), ->bprm_alloc_security()
(*) security_bprm_free(), ->bprm_free_security()
Removed in favour of preparing new credentials and modifying those.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
(*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()
Removed; split between security_bprm_set_creds(),
security_bprm_committing_creds() and security_bprm_committed_creds().
(*) security_bprm_set(), ->bprm_set_security()
Removed; folded into security_bprm_set_creds().
(*) security_bprm_set_creds(), ->bprm_set_creds()
New. The new credentials in bprm->creds should be checked and set up
as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the
second and subsequent calls.
(*) security_bprm_committing_creds(), ->bprm_committing_creds()
(*) security_bprm_committed_creds(), ->bprm_committed_creds()
New. Apply the security effects of the new credentials. This
includes closing unauthorised files in SELinux. This function may not
fail. When the former is called, the creds haven't yet been applied
to the process; when the latter is called, they have.
The former may access bprm->cred, the latter may not.
(3) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) The bprm_security_struct struct has been removed in favour of using
the credentials-under-construction approach.
(c) flush_unauthorized_files() now takes a cred pointer and passes it on
to inode_has_perm(), file_has_perm() and dentry_open().
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:24 +08:00
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/*
|
|
|
|
* Fill the binprm structure from the inode.
|
|
|
|
* Check permissions, then read the first 128 (BINPRM_BUF_SIZE) bytes
|
CRED: Make execve() take advantage of copy-on-write credentials
Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
The credential bits from struct linux_binprm are, for the most part,
replaced with a single credentials pointer (bprm->cred). This means that
all the creds can be calculated in advance and then applied at the point
of no return with no possibility of failure.
I would like to replace bprm->cap_effective with:
cap_isclear(bprm->cap_effective)
but this seems impossible due to special behaviour for processes of pid 1
(they always retain their parent's capability masks where normally they'd
be changed - see cap_bprm_set_creds()).
The following sequence of events now happens:
(a) At the start of do_execve, the current task's cred_exec_mutex is
locked to prevent PTRACE_ATTACH from obsoleting the calculation of
creds that we make.
(a) prepare_exec_creds() is then called to make a copy of the current
task's credentials and prepare it. This copy is then assigned to
bprm->cred.
This renders security_bprm_alloc() and security_bprm_free()
unnecessary, and so they've been removed.
(b) The determination of unsafe execution is now performed immediately
after (a) rather than later on in the code. The result is stored in
bprm->unsafe for future reference.
(c) prepare_binprm() is called, possibly multiple times.
(i) This applies the result of set[ug]id binaries to the new creds
attached to bprm->cred. Personality bit clearance is recorded,
but now deferred on the basis that the exec procedure may yet
fail.
(ii) This then calls the new security_bprm_set_creds(). This should
calculate the new LSM and capability credentials into *bprm->cred.
This folds together security_bprm_set() and parts of
security_bprm_apply_creds() (these two have been removed).
Anything that might fail must be done at this point.
(iii) bprm->cred_prepared is set to 1.
bprm->cred_prepared is 0 on the first pass of the security
calculations, and 1 on all subsequent passes. This allows SELinux
in (ii) to base its calculations only on the initial script and
not on the interpreter.
(d) flush_old_exec() is called to commit the task to execution. This
performs the following steps with regard to credentials:
(i) Clear pdeath_signal and set dumpable on certain circumstances that
may not be covered by commit_creds().
(ii) Clear any bits in current->personality that were deferred from
(c.i).
(e) install_exec_creds() [compute_creds() as was] is called to install the
new credentials. This performs the following steps with regard to
credentials:
(i) Calls security_bprm_committing_creds() to apply any security
requirements, such as flushing unauthorised files in SELinux, that
must be done before the credentials are changed.
This is made up of bits of security_bprm_apply_creds() and
security_bprm_post_apply_creds(), both of which have been removed.
This function is not allowed to fail; anything that might fail
must have been done in (c.ii).
(ii) Calls commit_creds() to apply the new credentials in a single
assignment (more or less). Possibly pdeath_signal and dumpable
should be part of struct creds.
(iii) Unlocks the task's cred_replace_mutex, thus allowing
PTRACE_ATTACH to take place.
(iv) Clears The bprm->cred pointer as the credentials it was holding
are now immutable.
(v) Calls security_bprm_committed_creds() to apply any security
alterations that must be done after the creds have been changed.
SELinux uses this to flush signals and signal handlers.
(f) If an error occurs before (d.i), bprm_free() will call abort_creds()
to destroy the proposed new credentials and will then unlock
cred_replace_mutex. No changes to the credentials will have been
made.
(2) LSM interface.
A number of functions have been changed, added or removed:
(*) security_bprm_alloc(), ->bprm_alloc_security()
(*) security_bprm_free(), ->bprm_free_security()
Removed in favour of preparing new credentials and modifying those.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
(*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()
Removed; split between security_bprm_set_creds(),
security_bprm_committing_creds() and security_bprm_committed_creds().
(*) security_bprm_set(), ->bprm_set_security()
Removed; folded into security_bprm_set_creds().
(*) security_bprm_set_creds(), ->bprm_set_creds()
New. The new credentials in bprm->creds should be checked and set up
as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the
second and subsequent calls.
(*) security_bprm_committing_creds(), ->bprm_committing_creds()
(*) security_bprm_committed_creds(), ->bprm_committed_creds()
New. Apply the security effects of the new credentials. This
includes closing unauthorised files in SELinux. This function may not
fail. When the former is called, the creds haven't yet been applied
to the process; when the latter is called, they have.
The former may access bprm->cred, the latter may not.
(3) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) The bprm_security_struct struct has been removed in favour of using
the credentials-under-construction approach.
(c) flush_unauthorized_files() now takes a cred pointer and passes it on
to inode_has_perm(), file_has_perm() and dentry_open().
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:24 +08:00
|
|
|
*
|
|
|
|
* This may be called multiple times for binary chains (scripts for example).
|
2005-04-17 06:20:36 +08:00
|
|
|
*/
|
|
|
|
int prepare_binprm(struct linux_binprm *bprm)
|
|
|
|
{
|
CRED: Make execve() take advantage of copy-on-write credentials
Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
The credential bits from struct linux_binprm are, for the most part,
replaced with a single credentials pointer (bprm->cred). This means that
all the creds can be calculated in advance and then applied at the point
of no return with no possibility of failure.
I would like to replace bprm->cap_effective with:
cap_isclear(bprm->cap_effective)
but this seems impossible due to special behaviour for processes of pid 1
(they always retain their parent's capability masks where normally they'd
be changed - see cap_bprm_set_creds()).
The following sequence of events now happens:
(a) At the start of do_execve, the current task's cred_exec_mutex is
locked to prevent PTRACE_ATTACH from obsoleting the calculation of
creds that we make.
(a) prepare_exec_creds() is then called to make a copy of the current
task's credentials and prepare it. This copy is then assigned to
bprm->cred.
This renders security_bprm_alloc() and security_bprm_free()
unnecessary, and so they've been removed.
(b) The determination of unsafe execution is now performed immediately
after (a) rather than later on in the code. The result is stored in
bprm->unsafe for future reference.
(c) prepare_binprm() is called, possibly multiple times.
(i) This applies the result of set[ug]id binaries to the new creds
attached to bprm->cred. Personality bit clearance is recorded,
but now deferred on the basis that the exec procedure may yet
fail.
(ii) This then calls the new security_bprm_set_creds(). This should
calculate the new LSM and capability credentials into *bprm->cred.
This folds together security_bprm_set() and parts of
security_bprm_apply_creds() (these two have been removed).
Anything that might fail must be done at this point.
(iii) bprm->cred_prepared is set to 1.
bprm->cred_prepared is 0 on the first pass of the security
calculations, and 1 on all subsequent passes. This allows SELinux
in (ii) to base its calculations only on the initial script and
not on the interpreter.
(d) flush_old_exec() is called to commit the task to execution. This
performs the following steps with regard to credentials:
(i) Clear pdeath_signal and set dumpable on certain circumstances that
may not be covered by commit_creds().
(ii) Clear any bits in current->personality that were deferred from
(c.i).
(e) install_exec_creds() [compute_creds() as was] is called to install the
new credentials. This performs the following steps with regard to
credentials:
(i) Calls security_bprm_committing_creds() to apply any security
requirements, such as flushing unauthorised files in SELinux, that
must be done before the credentials are changed.
This is made up of bits of security_bprm_apply_creds() and
security_bprm_post_apply_creds(), both of which have been removed.
This function is not allowed to fail; anything that might fail
must have been done in (c.ii).
(ii) Calls commit_creds() to apply the new credentials in a single
assignment (more or less). Possibly pdeath_signal and dumpable
should be part of struct creds.
(iii) Unlocks the task's cred_replace_mutex, thus allowing
PTRACE_ATTACH to take place.
(iv) Clears The bprm->cred pointer as the credentials it was holding
are now immutable.
(v) Calls security_bprm_committed_creds() to apply any security
alterations that must be done after the creds have been changed.
SELinux uses this to flush signals and signal handlers.
(f) If an error occurs before (d.i), bprm_free() will call abort_creds()
to destroy the proposed new credentials and will then unlock
cred_replace_mutex. No changes to the credentials will have been
made.
(2) LSM interface.
A number of functions have been changed, added or removed:
(*) security_bprm_alloc(), ->bprm_alloc_security()
(*) security_bprm_free(), ->bprm_free_security()
Removed in favour of preparing new credentials and modifying those.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
(*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()
Removed; split between security_bprm_set_creds(),
security_bprm_committing_creds() and security_bprm_committed_creds().
(*) security_bprm_set(), ->bprm_set_security()
Removed; folded into security_bprm_set_creds().
(*) security_bprm_set_creds(), ->bprm_set_creds()
New. The new credentials in bprm->creds should be checked and set up
as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the
second and subsequent calls.
(*) security_bprm_committing_creds(), ->bprm_committing_creds()
(*) security_bprm_committed_creds(), ->bprm_committed_creds()
New. Apply the security effects of the new credentials. This
includes closing unauthorised files in SELinux. This function may not
fail. When the former is called, the creds haven't yet been applied
to the process; when the latter is called, they have.
The former may access bprm->cred, the latter may not.
(3) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) The bprm_security_struct struct has been removed in favour of using
the credentials-under-construction approach.
(c) flush_unauthorized_files() now takes a cred pointer and passes it on
to inode_has_perm(), file_has_perm() and dentry_open().
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:24 +08:00
|
|
|
umode_t mode;
|
2006-12-08 18:36:35 +08:00
|
|
|
struct inode * inode = bprm->file->f_path.dentry->d_inode;
|
2005-04-17 06:20:36 +08:00
|
|
|
int retval;
|
|
|
|
|
|
|
|
mode = inode->i_mode;
|
|
|
|
if (bprm->file->f_op == NULL)
|
|
|
|
return -EACCES;
|
|
|
|
|
CRED: Make execve() take advantage of copy-on-write credentials
Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
The credential bits from struct linux_binprm are, for the most part,
replaced with a single credentials pointer (bprm->cred). This means that
all the creds can be calculated in advance and then applied at the point
of no return with no possibility of failure.
I would like to replace bprm->cap_effective with:
cap_isclear(bprm->cap_effective)
but this seems impossible due to special behaviour for processes of pid 1
(they always retain their parent's capability masks where normally they'd
be changed - see cap_bprm_set_creds()).
The following sequence of events now happens:
(a) At the start of do_execve, the current task's cred_exec_mutex is
locked to prevent PTRACE_ATTACH from obsoleting the calculation of
creds that we make.
(a) prepare_exec_creds() is then called to make a copy of the current
task's credentials and prepare it. This copy is then assigned to
bprm->cred.
This renders security_bprm_alloc() and security_bprm_free()
unnecessary, and so they've been removed.
(b) The determination of unsafe execution is now performed immediately
after (a) rather than later on in the code. The result is stored in
bprm->unsafe for future reference.
(c) prepare_binprm() is called, possibly multiple times.
(i) This applies the result of set[ug]id binaries to the new creds
attached to bprm->cred. Personality bit clearance is recorded,
but now deferred on the basis that the exec procedure may yet
fail.
(ii) This then calls the new security_bprm_set_creds(). This should
calculate the new LSM and capability credentials into *bprm->cred.
This folds together security_bprm_set() and parts of
security_bprm_apply_creds() (these two have been removed).
Anything that might fail must be done at this point.
(iii) bprm->cred_prepared is set to 1.
bprm->cred_prepared is 0 on the first pass of the security
calculations, and 1 on all subsequent passes. This allows SELinux
in (ii) to base its calculations only on the initial script and
not on the interpreter.
(d) flush_old_exec() is called to commit the task to execution. This
performs the following steps with regard to credentials:
(i) Clear pdeath_signal and set dumpable on certain circumstances that
may not be covered by commit_creds().
(ii) Clear any bits in current->personality that were deferred from
(c.i).
(e) install_exec_creds() [compute_creds() as was] is called to install the
new credentials. This performs the following steps with regard to
credentials:
(i) Calls security_bprm_committing_creds() to apply any security
requirements, such as flushing unauthorised files in SELinux, that
must be done before the credentials are changed.
This is made up of bits of security_bprm_apply_creds() and
security_bprm_post_apply_creds(), both of which have been removed.
This function is not allowed to fail; anything that might fail
must have been done in (c.ii).
(ii) Calls commit_creds() to apply the new credentials in a single
assignment (more or less). Possibly pdeath_signal and dumpable
should be part of struct creds.
(iii) Unlocks the task's cred_replace_mutex, thus allowing
PTRACE_ATTACH to take place.
(iv) Clears The bprm->cred pointer as the credentials it was holding
are now immutable.
(v) Calls security_bprm_committed_creds() to apply any security
alterations that must be done after the creds have been changed.
SELinux uses this to flush signals and signal handlers.
(f) If an error occurs before (d.i), bprm_free() will call abort_creds()
to destroy the proposed new credentials and will then unlock
cred_replace_mutex. No changes to the credentials will have been
made.
(2) LSM interface.
A number of functions have been changed, added or removed:
(*) security_bprm_alloc(), ->bprm_alloc_security()
(*) security_bprm_free(), ->bprm_free_security()
Removed in favour of preparing new credentials and modifying those.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
(*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()
Removed; split between security_bprm_set_creds(),
security_bprm_committing_creds() and security_bprm_committed_creds().
(*) security_bprm_set(), ->bprm_set_security()
Removed; folded into security_bprm_set_creds().
(*) security_bprm_set_creds(), ->bprm_set_creds()
New. The new credentials in bprm->creds should be checked and set up
as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the
second and subsequent calls.
(*) security_bprm_committing_creds(), ->bprm_committing_creds()
(*) security_bprm_committed_creds(), ->bprm_committed_creds()
New. Apply the security effects of the new credentials. This
includes closing unauthorised files in SELinux. This function may not
fail. When the former is called, the creds haven't yet been applied
to the process; when the latter is called, they have.
The former may access bprm->cred, the latter may not.
(3) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) The bprm_security_struct struct has been removed in favour of using
the credentials-under-construction approach.
(c) flush_unauthorized_files() now takes a cred pointer and passes it on
to inode_has_perm(), file_has_perm() and dentry_open().
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:24 +08:00
|
|
|
/* clear any previous set[ug]id data from a previous binary */
|
|
|
|
bprm->cred->euid = current_euid();
|
|
|
|
bprm->cred->egid = current_egid();
|
2005-04-17 06:20:36 +08:00
|
|
|
|
CRED: Make execve() take advantage of copy-on-write credentials
Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
The credential bits from struct linux_binprm are, for the most part,
replaced with a single credentials pointer (bprm->cred). This means that
all the creds can be calculated in advance and then applied at the point
of no return with no possibility of failure.
I would like to replace bprm->cap_effective with:
cap_isclear(bprm->cap_effective)
but this seems impossible due to special behaviour for processes of pid 1
(they always retain their parent's capability masks where normally they'd
be changed - see cap_bprm_set_creds()).
The following sequence of events now happens:
(a) At the start of do_execve, the current task's cred_exec_mutex is
locked to prevent PTRACE_ATTACH from obsoleting the calculation of
creds that we make.
(a) prepare_exec_creds() is then called to make a copy of the current
task's credentials and prepare it. This copy is then assigned to
bprm->cred.
This renders security_bprm_alloc() and security_bprm_free()
unnecessary, and so they've been removed.
(b) The determination of unsafe execution is now performed immediately
after (a) rather than later on in the code. The result is stored in
bprm->unsafe for future reference.
(c) prepare_binprm() is called, possibly multiple times.
(i) This applies the result of set[ug]id binaries to the new creds
attached to bprm->cred. Personality bit clearance is recorded,
but now deferred on the basis that the exec procedure may yet
fail.
(ii) This then calls the new security_bprm_set_creds(). This should
calculate the new LSM and capability credentials into *bprm->cred.
This folds together security_bprm_set() and parts of
security_bprm_apply_creds() (these two have been removed).
Anything that might fail must be done at this point.
(iii) bprm->cred_prepared is set to 1.
bprm->cred_prepared is 0 on the first pass of the security
calculations, and 1 on all subsequent passes. This allows SELinux
in (ii) to base its calculations only on the initial script and
not on the interpreter.
(d) flush_old_exec() is called to commit the task to execution. This
performs the following steps with regard to credentials:
(i) Clear pdeath_signal and set dumpable on certain circumstances that
may not be covered by commit_creds().
(ii) Clear any bits in current->personality that were deferred from
(c.i).
(e) install_exec_creds() [compute_creds() as was] is called to install the
new credentials. This performs the following steps with regard to
credentials:
(i) Calls security_bprm_committing_creds() to apply any security
requirements, such as flushing unauthorised files in SELinux, that
must be done before the credentials are changed.
This is made up of bits of security_bprm_apply_creds() and
security_bprm_post_apply_creds(), both of which have been removed.
This function is not allowed to fail; anything that might fail
must have been done in (c.ii).
(ii) Calls commit_creds() to apply the new credentials in a single
assignment (more or less). Possibly pdeath_signal and dumpable
should be part of struct creds.
(iii) Unlocks the task's cred_replace_mutex, thus allowing
PTRACE_ATTACH to take place.
(iv) Clears The bprm->cred pointer as the credentials it was holding
are now immutable.
(v) Calls security_bprm_committed_creds() to apply any security
alterations that must be done after the creds have been changed.
SELinux uses this to flush signals and signal handlers.
(f) If an error occurs before (d.i), bprm_free() will call abort_creds()
to destroy the proposed new credentials and will then unlock
cred_replace_mutex. No changes to the credentials will have been
made.
(2) LSM interface.
A number of functions have been changed, added or removed:
(*) security_bprm_alloc(), ->bprm_alloc_security()
(*) security_bprm_free(), ->bprm_free_security()
Removed in favour of preparing new credentials and modifying those.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
(*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()
Removed; split between security_bprm_set_creds(),
security_bprm_committing_creds() and security_bprm_committed_creds().
(*) security_bprm_set(), ->bprm_set_security()
Removed; folded into security_bprm_set_creds().
(*) security_bprm_set_creds(), ->bprm_set_creds()
New. The new credentials in bprm->creds should be checked and set up
as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the
second and subsequent calls.
(*) security_bprm_committing_creds(), ->bprm_committing_creds()
(*) security_bprm_committed_creds(), ->bprm_committed_creds()
New. Apply the security effects of the new credentials. This
includes closing unauthorised files in SELinux. This function may not
fail. When the former is called, the creds haven't yet been applied
to the process; when the latter is called, they have.
The former may access bprm->cred, the latter may not.
(3) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) The bprm_security_struct struct has been removed in favour of using
the credentials-under-construction approach.
(c) flush_unauthorized_files() now takes a cred pointer and passes it on
to inode_has_perm(), file_has_perm() and dentry_open().
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:24 +08:00
|
|
|
if (!(bprm->file->f_path.mnt->mnt_flags & MNT_NOSUID)) {
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Set-uid? */
|
|
|
|
if (mode & S_ISUID) {
|
CRED: Make execve() take advantage of copy-on-write credentials
Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
The credential bits from struct linux_binprm are, for the most part,
replaced with a single credentials pointer (bprm->cred). This means that
all the creds can be calculated in advance and then applied at the point
of no return with no possibility of failure.
I would like to replace bprm->cap_effective with:
cap_isclear(bprm->cap_effective)
but this seems impossible due to special behaviour for processes of pid 1
(they always retain their parent's capability masks where normally they'd
be changed - see cap_bprm_set_creds()).
The following sequence of events now happens:
(a) At the start of do_execve, the current task's cred_exec_mutex is
locked to prevent PTRACE_ATTACH from obsoleting the calculation of
creds that we make.
(a) prepare_exec_creds() is then called to make a copy of the current
task's credentials and prepare it. This copy is then assigned to
bprm->cred.
This renders security_bprm_alloc() and security_bprm_free()
unnecessary, and so they've been removed.
(b) The determination of unsafe execution is now performed immediately
after (a) rather than later on in the code. The result is stored in
bprm->unsafe for future reference.
(c) prepare_binprm() is called, possibly multiple times.
(i) This applies the result of set[ug]id binaries to the new creds
attached to bprm->cred. Personality bit clearance is recorded,
but now deferred on the basis that the exec procedure may yet
fail.
(ii) This then calls the new security_bprm_set_creds(). This should
calculate the new LSM and capability credentials into *bprm->cred.
This folds together security_bprm_set() and parts of
security_bprm_apply_creds() (these two have been removed).
Anything that might fail must be done at this point.
(iii) bprm->cred_prepared is set to 1.
bprm->cred_prepared is 0 on the first pass of the security
calculations, and 1 on all subsequent passes. This allows SELinux
in (ii) to base its calculations only on the initial script and
not on the interpreter.
(d) flush_old_exec() is called to commit the task to execution. This
performs the following steps with regard to credentials:
(i) Clear pdeath_signal and set dumpable on certain circumstances that
may not be covered by commit_creds().
(ii) Clear any bits in current->personality that were deferred from
(c.i).
(e) install_exec_creds() [compute_creds() as was] is called to install the
new credentials. This performs the following steps with regard to
credentials:
(i) Calls security_bprm_committing_creds() to apply any security
requirements, such as flushing unauthorised files in SELinux, that
must be done before the credentials are changed.
This is made up of bits of security_bprm_apply_creds() and
security_bprm_post_apply_creds(), both of which have been removed.
This function is not allowed to fail; anything that might fail
must have been done in (c.ii).
(ii) Calls commit_creds() to apply the new credentials in a single
assignment (more or less). Possibly pdeath_signal and dumpable
should be part of struct creds.
(iii) Unlocks the task's cred_replace_mutex, thus allowing
PTRACE_ATTACH to take place.
(iv) Clears The bprm->cred pointer as the credentials it was holding
are now immutable.
(v) Calls security_bprm_committed_creds() to apply any security
alterations that must be done after the creds have been changed.
SELinux uses this to flush signals and signal handlers.
(f) If an error occurs before (d.i), bprm_free() will call abort_creds()
to destroy the proposed new credentials and will then unlock
cred_replace_mutex. No changes to the credentials will have been
made.
(2) LSM interface.
A number of functions have been changed, added or removed:
(*) security_bprm_alloc(), ->bprm_alloc_security()
(*) security_bprm_free(), ->bprm_free_security()
Removed in favour of preparing new credentials and modifying those.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
(*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()
Removed; split between security_bprm_set_creds(),
security_bprm_committing_creds() and security_bprm_committed_creds().
(*) security_bprm_set(), ->bprm_set_security()
Removed; folded into security_bprm_set_creds().
(*) security_bprm_set_creds(), ->bprm_set_creds()
New. The new credentials in bprm->creds should be checked and set up
as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the
second and subsequent calls.
(*) security_bprm_committing_creds(), ->bprm_committing_creds()
(*) security_bprm_committed_creds(), ->bprm_committed_creds()
New. Apply the security effects of the new credentials. This
includes closing unauthorised files in SELinux. This function may not
fail. When the former is called, the creds haven't yet been applied
to the process; when the latter is called, they have.
The former may access bprm->cred, the latter may not.
(3) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) The bprm_security_struct struct has been removed in favour of using
the credentials-under-construction approach.
(c) flush_unauthorized_files() now takes a cred pointer and passes it on
to inode_has_perm(), file_has_perm() and dentry_open().
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:24 +08:00
|
|
|
bprm->per_clear |= PER_CLEAR_ON_SETID;
|
|
|
|
bprm->cred->euid = inode->i_uid;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/* Set-gid? */
|
|
|
|
/*
|
|
|
|
* If setgid is set but no group execute bit then this
|
|
|
|
* is a candidate for mandatory locking, not a setgid
|
|
|
|
* executable.
|
|
|
|
*/
|
|
|
|
if ((mode & (S_ISGID | S_IXGRP)) == (S_ISGID | S_IXGRP)) {
|
CRED: Make execve() take advantage of copy-on-write credentials
Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
The credential bits from struct linux_binprm are, for the most part,
replaced with a single credentials pointer (bprm->cred). This means that
all the creds can be calculated in advance and then applied at the point
of no return with no possibility of failure.
I would like to replace bprm->cap_effective with:
cap_isclear(bprm->cap_effective)
but this seems impossible due to special behaviour for processes of pid 1
(they always retain their parent's capability masks where normally they'd
be changed - see cap_bprm_set_creds()).
The following sequence of events now happens:
(a) At the start of do_execve, the current task's cred_exec_mutex is
locked to prevent PTRACE_ATTACH from obsoleting the calculation of
creds that we make.
(a) prepare_exec_creds() is then called to make a copy of the current
task's credentials and prepare it. This copy is then assigned to
bprm->cred.
This renders security_bprm_alloc() and security_bprm_free()
unnecessary, and so they've been removed.
(b) The determination of unsafe execution is now performed immediately
after (a) rather than later on in the code. The result is stored in
bprm->unsafe for future reference.
(c) prepare_binprm() is called, possibly multiple times.
(i) This applies the result of set[ug]id binaries to the new creds
attached to bprm->cred. Personality bit clearance is recorded,
but now deferred on the basis that the exec procedure may yet
fail.
(ii) This then calls the new security_bprm_set_creds(). This should
calculate the new LSM and capability credentials into *bprm->cred.
This folds together security_bprm_set() and parts of
security_bprm_apply_creds() (these two have been removed).
Anything that might fail must be done at this point.
(iii) bprm->cred_prepared is set to 1.
bprm->cred_prepared is 0 on the first pass of the security
calculations, and 1 on all subsequent passes. This allows SELinux
in (ii) to base its calculations only on the initial script and
not on the interpreter.
(d) flush_old_exec() is called to commit the task to execution. This
performs the following steps with regard to credentials:
(i) Clear pdeath_signal and set dumpable on certain circumstances that
may not be covered by commit_creds().
(ii) Clear any bits in current->personality that were deferred from
(c.i).
(e) install_exec_creds() [compute_creds() as was] is called to install the
new credentials. This performs the following steps with regard to
credentials:
(i) Calls security_bprm_committing_creds() to apply any security
requirements, such as flushing unauthorised files in SELinux, that
must be done before the credentials are changed.
This is made up of bits of security_bprm_apply_creds() and
security_bprm_post_apply_creds(), both of which have been removed.
This function is not allowed to fail; anything that might fail
must have been done in (c.ii).
(ii) Calls commit_creds() to apply the new credentials in a single
assignment (more or less). Possibly pdeath_signal and dumpable
should be part of struct creds.
(iii) Unlocks the task's cred_replace_mutex, thus allowing
PTRACE_ATTACH to take place.
(iv) Clears The bprm->cred pointer as the credentials it was holding
are now immutable.
(v) Calls security_bprm_committed_creds() to apply any security
alterations that must be done after the creds have been changed.
SELinux uses this to flush signals and signal handlers.
(f) If an error occurs before (d.i), bprm_free() will call abort_creds()
to destroy the proposed new credentials and will then unlock
cred_replace_mutex. No changes to the credentials will have been
made.
(2) LSM interface.
A number of functions have been changed, added or removed:
(*) security_bprm_alloc(), ->bprm_alloc_security()
(*) security_bprm_free(), ->bprm_free_security()
Removed in favour of preparing new credentials and modifying those.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
(*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()
Removed; split between security_bprm_set_creds(),
security_bprm_committing_creds() and security_bprm_committed_creds().
(*) security_bprm_set(), ->bprm_set_security()
Removed; folded into security_bprm_set_creds().
(*) security_bprm_set_creds(), ->bprm_set_creds()
New. The new credentials in bprm->creds should be checked and set up
as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the
second and subsequent calls.
(*) security_bprm_committing_creds(), ->bprm_committing_creds()
(*) security_bprm_committed_creds(), ->bprm_committed_creds()
New. Apply the security effects of the new credentials. This
includes closing unauthorised files in SELinux. This function may not
fail. When the former is called, the creds haven't yet been applied
to the process; when the latter is called, they have.
The former may access bprm->cred, the latter may not.
(3) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) The bprm_security_struct struct has been removed in favour of using
the credentials-under-construction approach.
(c) flush_unauthorized_files() now takes a cred pointer and passes it on
to inode_has_perm(), file_has_perm() and dentry_open().
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:24 +08:00
|
|
|
bprm->per_clear |= PER_CLEAR_ON_SETID;
|
|
|
|
bprm->cred->egid = inode->i_gid;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/* fill in binprm security blob */
|
CRED: Make execve() take advantage of copy-on-write credentials
Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
The credential bits from struct linux_binprm are, for the most part,
replaced with a single credentials pointer (bprm->cred). This means that
all the creds can be calculated in advance and then applied at the point
of no return with no possibility of failure.
I would like to replace bprm->cap_effective with:
cap_isclear(bprm->cap_effective)
but this seems impossible due to special behaviour for processes of pid 1
(they always retain their parent's capability masks where normally they'd
be changed - see cap_bprm_set_creds()).
The following sequence of events now happens:
(a) At the start of do_execve, the current task's cred_exec_mutex is
locked to prevent PTRACE_ATTACH from obsoleting the calculation of
creds that we make.
(a) prepare_exec_creds() is then called to make a copy of the current
task's credentials and prepare it. This copy is then assigned to
bprm->cred.
This renders security_bprm_alloc() and security_bprm_free()
unnecessary, and so they've been removed.
(b) The determination of unsafe execution is now performed immediately
after (a) rather than later on in the code. The result is stored in
bprm->unsafe for future reference.
(c) prepare_binprm() is called, possibly multiple times.
(i) This applies the result of set[ug]id binaries to the new creds
attached to bprm->cred. Personality bit clearance is recorded,
but now deferred on the basis that the exec procedure may yet
fail.
(ii) This then calls the new security_bprm_set_creds(). This should
calculate the new LSM and capability credentials into *bprm->cred.
This folds together security_bprm_set() and parts of
security_bprm_apply_creds() (these two have been removed).
Anything that might fail must be done at this point.
(iii) bprm->cred_prepared is set to 1.
bprm->cred_prepared is 0 on the first pass of the security
calculations, and 1 on all subsequent passes. This allows SELinux
in (ii) to base its calculations only on the initial script and
not on the interpreter.
(d) flush_old_exec() is called to commit the task to execution. This
performs the following steps with regard to credentials:
(i) Clear pdeath_signal and set dumpable on certain circumstances that
may not be covered by commit_creds().
(ii) Clear any bits in current->personality that were deferred from
(c.i).
(e) install_exec_creds() [compute_creds() as was] is called to install the
new credentials. This performs the following steps with regard to
credentials:
(i) Calls security_bprm_committing_creds() to apply any security
requirements, such as flushing unauthorised files in SELinux, that
must be done before the credentials are changed.
This is made up of bits of security_bprm_apply_creds() and
security_bprm_post_apply_creds(), both of which have been removed.
This function is not allowed to fail; anything that might fail
must have been done in (c.ii).
(ii) Calls commit_creds() to apply the new credentials in a single
assignment (more or less). Possibly pdeath_signal and dumpable
should be part of struct creds.
(iii) Unlocks the task's cred_replace_mutex, thus allowing
PTRACE_ATTACH to take place.
(iv) Clears The bprm->cred pointer as the credentials it was holding
are now immutable.
(v) Calls security_bprm_committed_creds() to apply any security
alterations that must be done after the creds have been changed.
SELinux uses this to flush signals and signal handlers.
(f) If an error occurs before (d.i), bprm_free() will call abort_creds()
to destroy the proposed new credentials and will then unlock
cred_replace_mutex. No changes to the credentials will have been
made.
(2) LSM interface.
A number of functions have been changed, added or removed:
(*) security_bprm_alloc(), ->bprm_alloc_security()
(*) security_bprm_free(), ->bprm_free_security()
Removed in favour of preparing new credentials and modifying those.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
(*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()
Removed; split between security_bprm_set_creds(),
security_bprm_committing_creds() and security_bprm_committed_creds().
(*) security_bprm_set(), ->bprm_set_security()
Removed; folded into security_bprm_set_creds().
(*) security_bprm_set_creds(), ->bprm_set_creds()
New. The new credentials in bprm->creds should be checked and set up
as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the
second and subsequent calls.
(*) security_bprm_committing_creds(), ->bprm_committing_creds()
(*) security_bprm_committed_creds(), ->bprm_committed_creds()
New. Apply the security effects of the new credentials. This
includes closing unauthorised files in SELinux. This function may not
fail. When the former is called, the creds haven't yet been applied
to the process; when the latter is called, they have.
The former may access bprm->cred, the latter may not.
(3) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) The bprm_security_struct struct has been removed in favour of using
the credentials-under-construction approach.
(c) flush_unauthorized_files() now takes a cred pointer and passes it on
to inode_has_perm(), file_has_perm() and dentry_open().
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:24 +08:00
|
|
|
retval = security_bprm_set_creds(bprm);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (retval)
|
|
|
|
return retval;
|
CRED: Make execve() take advantage of copy-on-write credentials
Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
The credential bits from struct linux_binprm are, for the most part,
replaced with a single credentials pointer (bprm->cred). This means that
all the creds can be calculated in advance and then applied at the point
of no return with no possibility of failure.
I would like to replace bprm->cap_effective with:
cap_isclear(bprm->cap_effective)
but this seems impossible due to special behaviour for processes of pid 1
(they always retain their parent's capability masks where normally they'd
be changed - see cap_bprm_set_creds()).
The following sequence of events now happens:
(a) At the start of do_execve, the current task's cred_exec_mutex is
locked to prevent PTRACE_ATTACH from obsoleting the calculation of
creds that we make.
(a) prepare_exec_creds() is then called to make a copy of the current
task's credentials and prepare it. This copy is then assigned to
bprm->cred.
This renders security_bprm_alloc() and security_bprm_free()
unnecessary, and so they've been removed.
(b) The determination of unsafe execution is now performed immediately
after (a) rather than later on in the code. The result is stored in
bprm->unsafe for future reference.
(c) prepare_binprm() is called, possibly multiple times.
(i) This applies the result of set[ug]id binaries to the new creds
attached to bprm->cred. Personality bit clearance is recorded,
but now deferred on the basis that the exec procedure may yet
fail.
(ii) This then calls the new security_bprm_set_creds(). This should
calculate the new LSM and capability credentials into *bprm->cred.
This folds together security_bprm_set() and parts of
security_bprm_apply_creds() (these two have been removed).
Anything that might fail must be done at this point.
(iii) bprm->cred_prepared is set to 1.
bprm->cred_prepared is 0 on the first pass of the security
calculations, and 1 on all subsequent passes. This allows SELinux
in (ii) to base its calculations only on the initial script and
not on the interpreter.
(d) flush_old_exec() is called to commit the task to execution. This
performs the following steps with regard to credentials:
(i) Clear pdeath_signal and set dumpable on certain circumstances that
may not be covered by commit_creds().
(ii) Clear any bits in current->personality that were deferred from
(c.i).
(e) install_exec_creds() [compute_creds() as was] is called to install the
new credentials. This performs the following steps with regard to
credentials:
(i) Calls security_bprm_committing_creds() to apply any security
requirements, such as flushing unauthorised files in SELinux, that
must be done before the credentials are changed.
This is made up of bits of security_bprm_apply_creds() and
security_bprm_post_apply_creds(), both of which have been removed.
This function is not allowed to fail; anything that might fail
must have been done in (c.ii).
(ii) Calls commit_creds() to apply the new credentials in a single
assignment (more or less). Possibly pdeath_signal and dumpable
should be part of struct creds.
(iii) Unlocks the task's cred_replace_mutex, thus allowing
PTRACE_ATTACH to take place.
(iv) Clears The bprm->cred pointer as the credentials it was holding
are now immutable.
(v) Calls security_bprm_committed_creds() to apply any security
alterations that must be done after the creds have been changed.
SELinux uses this to flush signals and signal handlers.
(f) If an error occurs before (d.i), bprm_free() will call abort_creds()
to destroy the proposed new credentials and will then unlock
cred_replace_mutex. No changes to the credentials will have been
made.
(2) LSM interface.
A number of functions have been changed, added or removed:
(*) security_bprm_alloc(), ->bprm_alloc_security()
(*) security_bprm_free(), ->bprm_free_security()
Removed in favour of preparing new credentials and modifying those.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
(*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()
Removed; split between security_bprm_set_creds(),
security_bprm_committing_creds() and security_bprm_committed_creds().
(*) security_bprm_set(), ->bprm_set_security()
Removed; folded into security_bprm_set_creds().
(*) security_bprm_set_creds(), ->bprm_set_creds()
New. The new credentials in bprm->creds should be checked and set up
as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the
second and subsequent calls.
(*) security_bprm_committing_creds(), ->bprm_committing_creds()
(*) security_bprm_committed_creds(), ->bprm_committed_creds()
New. Apply the security effects of the new credentials. This
includes closing unauthorised files in SELinux. This function may not
fail. When the former is called, the creds haven't yet been applied
to the process; when the latter is called, they have.
The former may access bprm->cred, the latter may not.
(3) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) The bprm_security_struct struct has been removed in favour of using
the credentials-under-construction approach.
(c) flush_unauthorized_files() now takes a cred pointer and passes it on
to inode_has_perm(), file_has_perm() and dentry_open().
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:24 +08:00
|
|
|
bprm->cred_prepared = 1;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
CRED: Make execve() take advantage of copy-on-write credentials
Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
The credential bits from struct linux_binprm are, for the most part,
replaced with a single credentials pointer (bprm->cred). This means that
all the creds can be calculated in advance and then applied at the point
of no return with no possibility of failure.
I would like to replace bprm->cap_effective with:
cap_isclear(bprm->cap_effective)
but this seems impossible due to special behaviour for processes of pid 1
(they always retain their parent's capability masks where normally they'd
be changed - see cap_bprm_set_creds()).
The following sequence of events now happens:
(a) At the start of do_execve, the current task's cred_exec_mutex is
locked to prevent PTRACE_ATTACH from obsoleting the calculation of
creds that we make.
(a) prepare_exec_creds() is then called to make a copy of the current
task's credentials and prepare it. This copy is then assigned to
bprm->cred.
This renders security_bprm_alloc() and security_bprm_free()
unnecessary, and so they've been removed.
(b) The determination of unsafe execution is now performed immediately
after (a) rather than later on in the code. The result is stored in
bprm->unsafe for future reference.
(c) prepare_binprm() is called, possibly multiple times.
(i) This applies the result of set[ug]id binaries to the new creds
attached to bprm->cred. Personality bit clearance is recorded,
but now deferred on the basis that the exec procedure may yet
fail.
(ii) This then calls the new security_bprm_set_creds(). This should
calculate the new LSM and capability credentials into *bprm->cred.
This folds together security_bprm_set() and parts of
security_bprm_apply_creds() (these two have been removed).
Anything that might fail must be done at this point.
(iii) bprm->cred_prepared is set to 1.
bprm->cred_prepared is 0 on the first pass of the security
calculations, and 1 on all subsequent passes. This allows SELinux
in (ii) to base its calculations only on the initial script and
not on the interpreter.
(d) flush_old_exec() is called to commit the task to execution. This
performs the following steps with regard to credentials:
(i) Clear pdeath_signal and set dumpable on certain circumstances that
may not be covered by commit_creds().
(ii) Clear any bits in current->personality that were deferred from
(c.i).
(e) install_exec_creds() [compute_creds() as was] is called to install the
new credentials. This performs the following steps with regard to
credentials:
(i) Calls security_bprm_committing_creds() to apply any security
requirements, such as flushing unauthorised files in SELinux, that
must be done before the credentials are changed.
This is made up of bits of security_bprm_apply_creds() and
security_bprm_post_apply_creds(), both of which have been removed.
This function is not allowed to fail; anything that might fail
must have been done in (c.ii).
(ii) Calls commit_creds() to apply the new credentials in a single
assignment (more or less). Possibly pdeath_signal and dumpable
should be part of struct creds.
(iii) Unlocks the task's cred_replace_mutex, thus allowing
PTRACE_ATTACH to take place.
(iv) Clears The bprm->cred pointer as the credentials it was holding
are now immutable.
(v) Calls security_bprm_committed_creds() to apply any security
alterations that must be done after the creds have been changed.
SELinux uses this to flush signals and signal handlers.
(f) If an error occurs before (d.i), bprm_free() will call abort_creds()
to destroy the proposed new credentials and will then unlock
cred_replace_mutex. No changes to the credentials will have been
made.
(2) LSM interface.
A number of functions have been changed, added or removed:
(*) security_bprm_alloc(), ->bprm_alloc_security()
(*) security_bprm_free(), ->bprm_free_security()
Removed in favour of preparing new credentials and modifying those.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
(*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()
Removed; split between security_bprm_set_creds(),
security_bprm_committing_creds() and security_bprm_committed_creds().
(*) security_bprm_set(), ->bprm_set_security()
Removed; folded into security_bprm_set_creds().
(*) security_bprm_set_creds(), ->bprm_set_creds()
New. The new credentials in bprm->creds should be checked and set up
as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the
second and subsequent calls.
(*) security_bprm_committing_creds(), ->bprm_committing_creds()
(*) security_bprm_committed_creds(), ->bprm_committed_creds()
New. Apply the security effects of the new credentials. This
includes closing unauthorised files in SELinux. This function may not
fail. When the former is called, the creds haven't yet been applied
to the process; when the latter is called, they have.
The former may access bprm->cred, the latter may not.
(3) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) The bprm_security_struct struct has been removed in favour of using
the credentials-under-construction approach.
(c) flush_unauthorized_files() now takes a cred pointer and passes it on
to inode_has_perm(), file_has_perm() and dentry_open().
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:24 +08:00
|
|
|
memset(bprm->buf, 0, BINPRM_BUF_SIZE);
|
|
|
|
return kernel_read(bprm->file, 0, bprm->buf, BINPRM_BUF_SIZE);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
EXPORT_SYMBOL(prepare_binprm);
|
|
|
|
|
2007-05-08 15:25:16 +08:00
|
|
|
/*
|
|
|
|
* Arguments are '\0' separated strings found at the location bprm->p
|
|
|
|
* points to; chop off the first by relocating brpm->p to right after
|
|
|
|
* the first '\0' encountered.
|
|
|
|
*/
|
2007-07-19 16:48:16 +08:00
|
|
|
int remove_arg_zero(struct linux_binprm *bprm)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2007-07-19 16:48:16 +08:00
|
|
|
int ret = 0;
|
|
|
|
unsigned long offset;
|
|
|
|
char *kaddr;
|
|
|
|
struct page *page;
|
2007-05-08 15:25:16 +08:00
|
|
|
|
2007-07-19 16:48:16 +08:00
|
|
|
if (!bprm->argc)
|
|
|
|
return 0;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-07-19 16:48:16 +08:00
|
|
|
do {
|
|
|
|
offset = bprm->p & ~PAGE_MASK;
|
|
|
|
page = get_arg_page(bprm, bprm->p, 0);
|
|
|
|
if (!page) {
|
|
|
|
ret = -EFAULT;
|
|
|
|
goto out;
|
|
|
|
}
|
|
|
|
kaddr = kmap_atomic(page, KM_USER0);
|
2007-05-08 15:25:16 +08:00
|
|
|
|
2007-07-19 16:48:16 +08:00
|
|
|
for (; offset < PAGE_SIZE && kaddr[offset];
|
|
|
|
offset++, bprm->p++)
|
|
|
|
;
|
2007-05-08 15:25:16 +08:00
|
|
|
|
2007-07-19 16:48:16 +08:00
|
|
|
kunmap_atomic(kaddr, KM_USER0);
|
|
|
|
put_arg_page(page);
|
2007-05-08 15:25:16 +08:00
|
|
|
|
2007-07-19 16:48:16 +08:00
|
|
|
if (offset == PAGE_SIZE)
|
|
|
|
free_arg_page(bprm, (bprm->p >> PAGE_SHIFT) - 1);
|
|
|
|
} while (offset == PAGE_SIZE);
|
2007-05-08 15:25:16 +08:00
|
|
|
|
2007-07-19 16:48:16 +08:00
|
|
|
bprm->p++;
|
|
|
|
bprm->argc--;
|
|
|
|
ret = 0;
|
2007-05-08 15:25:16 +08:00
|
|
|
|
2007-07-19 16:48:16 +08:00
|
|
|
out:
|
|
|
|
return ret;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
EXPORT_SYMBOL(remove_arg_zero);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* cycle the list of binary formats handler, until one recognizes the image
|
|
|
|
*/
|
|
|
|
int search_binary_handler(struct linux_binprm *bprm,struct pt_regs *regs)
|
|
|
|
{
|
tracehook: exec double-reporting fix
The patch 6341c39 "tracehook: exec" introduced a small regression in
2.6.27 regarding binfmt_misc exec event reporting. Since the reporting
is now done in the common search_binary_handler() function, an exec
of a misc binary will result in two (or possibly multiple) exec events
being reported, instead of just a single one, because the misc handler
contains a recursive call to search_binary_handler.
To add to the confusion, if PTRACE_O_TRACEEXEC is not active, the multiple
SIGTRAP signals will in fact cause only a single ptrace intercept, as the
signals are not queued. However, if PTRACE_O_TRACEEXEC is on, the debugger
will actually see multiple ptrace intercepts (PTRACE_EVENT_EXEC).
The test program included below demonstrates the problem.
This change fixes the bug by calling tracehook_report_exec() only in the
outermost search_binary_handler() call (bprm->recursion_depth == 0).
The additional change to restore bprm->recursion_depth after each binfmt
load_binary call is actually superfluous for this bug, since we test the
value saved on entry to search_binary_handler(). But it keeps the use of
of the depth count to its most obvious expected meaning. Depending on what
binfmt handlers do in certain cases, there could have been false-positive
tests for recursion limits before this change.
/* Test program using PTRACE_O_TRACEEXEC.
This forks and exec's the first argument with the rest of the arguments,
while ptrace'ing. It expects to see one PTRACE_EVENT_EXEC stop and
then a successful exit, with no other signals or events in between.
Test for kernel doing two PTRACE_EVENT_EXEC stops for a binfmt_misc exec:
$ gcc -g traceexec.c -o traceexec
$ sudo sh -c 'echo :test:M::foobar::/bin/cat: > /proc/sys/fs/binfmt_misc/register'
$ echo 'foobar test' > ./foobar
$ chmod +x ./foobar
$ ./traceexec ./foobar; echo $?
==> good <==
foobar test
0
$
==> bad <==
foobar test
unexpected status 0x4057f != 0
3
$
*/
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/ptrace.h>
#include <unistd.h>
#include <signal.h>
#include <stdlib.h>
static void
wait_for (pid_t child, int expect)
{
int status;
pid_t p = wait (&status);
if (p != child)
{
perror ("wait");
exit (2);
}
if (status != expect)
{
fprintf (stderr, "unexpected status %#x != %#x\n", status, expect);
exit (3);
}
}
int
main (int argc, char **argv)
{
pid_t child = fork ();
if (child < 0)
{
perror ("fork");
return 127;
}
else if (child == 0)
{
ptrace (PTRACE_TRACEME);
raise (SIGUSR1);
execv (argv[1], &argv[1]);
perror ("execve");
_exit (127);
}
wait_for (child, W_STOPCODE (SIGUSR1));
if (ptrace (PTRACE_SETOPTIONS, child,
0L, (void *) (long) PTRACE_O_TRACEEXEC) != 0)
{
perror ("PTRACE_SETOPTIONS");
return 4;
}
if (ptrace (PTRACE_CONT, child, 0L, 0L) != 0)
{
perror ("PTRACE_CONT");
return 5;
}
wait_for (child, W_STOPCODE (SIGTRAP | (PTRACE_EVENT_EXEC << 8)));
if (ptrace (PTRACE_CONT, child, 0L, 0L) != 0)
{
perror ("PTRACE_CONT");
return 6;
}
wait_for (child, W_EXITCODE (0, 0));
return 0;
}
Reported-by: Arnd Bergmann <arnd@arndb.de>
CC: Ulrich Weigand <ulrich.weigand@de.ibm.com>
Signed-off-by: Roland McGrath <roland@redhat.com>
2008-12-10 11:36:38 +08:00
|
|
|
unsigned int depth = bprm->recursion_depth;
|
2005-04-17 06:20:36 +08:00
|
|
|
int try,retval;
|
|
|
|
struct linux_binfmt *fmt;
|
2011-06-29 10:13:39 +08:00
|
|
|
pid_t old_pid;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
retval = security_bprm_check(bprm);
|
|
|
|
if (retval)
|
|
|
|
return retval;
|
|
|
|
|
2006-04-27 02:04:08 +08:00
|
|
|
retval = audit_bprm(bprm);
|
|
|
|
if (retval)
|
|
|
|
return retval;
|
|
|
|
|
2011-06-29 10:13:39 +08:00
|
|
|
/* Need to fetch pid before load_binary changes it */
|
|
|
|
rcu_read_lock();
|
|
|
|
old_pid = task_pid_nr_ns(current, task_active_pid_ns(current->parent));
|
|
|
|
rcu_read_unlock();
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
retval = -ENOENT;
|
|
|
|
for (try=0; try<2; try++) {
|
|
|
|
read_lock(&binfmt_lock);
|
2007-10-17 14:26:03 +08:00
|
|
|
list_for_each_entry(fmt, &formats, lh) {
|
2005-04-17 06:20:36 +08:00
|
|
|
int (*fn)(struct linux_binprm *, struct pt_regs *) = fmt->load_binary;
|
|
|
|
if (!fn)
|
|
|
|
continue;
|
|
|
|
if (!try_module_get(fmt->module))
|
|
|
|
continue;
|
|
|
|
read_unlock(&binfmt_lock);
|
|
|
|
retval = fn(bprm, regs);
|
tracehook: exec double-reporting fix
The patch 6341c39 "tracehook: exec" introduced a small regression in
2.6.27 regarding binfmt_misc exec event reporting. Since the reporting
is now done in the common search_binary_handler() function, an exec
of a misc binary will result in two (or possibly multiple) exec events
being reported, instead of just a single one, because the misc handler
contains a recursive call to search_binary_handler.
To add to the confusion, if PTRACE_O_TRACEEXEC is not active, the multiple
SIGTRAP signals will in fact cause only a single ptrace intercept, as the
signals are not queued. However, if PTRACE_O_TRACEEXEC is on, the debugger
will actually see multiple ptrace intercepts (PTRACE_EVENT_EXEC).
The test program included below demonstrates the problem.
This change fixes the bug by calling tracehook_report_exec() only in the
outermost search_binary_handler() call (bprm->recursion_depth == 0).
The additional change to restore bprm->recursion_depth after each binfmt
load_binary call is actually superfluous for this bug, since we test the
value saved on entry to search_binary_handler(). But it keeps the use of
of the depth count to its most obvious expected meaning. Depending on what
binfmt handlers do in certain cases, there could have been false-positive
tests for recursion limits before this change.
/* Test program using PTRACE_O_TRACEEXEC.
This forks and exec's the first argument with the rest of the arguments,
while ptrace'ing. It expects to see one PTRACE_EVENT_EXEC stop and
then a successful exit, with no other signals or events in between.
Test for kernel doing two PTRACE_EVENT_EXEC stops for a binfmt_misc exec:
$ gcc -g traceexec.c -o traceexec
$ sudo sh -c 'echo :test:M::foobar::/bin/cat: > /proc/sys/fs/binfmt_misc/register'
$ echo 'foobar test' > ./foobar
$ chmod +x ./foobar
$ ./traceexec ./foobar; echo $?
==> good <==
foobar test
0
$
==> bad <==
foobar test
unexpected status 0x4057f != 0
3
$
*/
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/ptrace.h>
#include <unistd.h>
#include <signal.h>
#include <stdlib.h>
static void
wait_for (pid_t child, int expect)
{
int status;
pid_t p = wait (&status);
if (p != child)
{
perror ("wait");
exit (2);
}
if (status != expect)
{
fprintf (stderr, "unexpected status %#x != %#x\n", status, expect);
exit (3);
}
}
int
main (int argc, char **argv)
{
pid_t child = fork ();
if (child < 0)
{
perror ("fork");
return 127;
}
else if (child == 0)
{
ptrace (PTRACE_TRACEME);
raise (SIGUSR1);
execv (argv[1], &argv[1]);
perror ("execve");
_exit (127);
}
wait_for (child, W_STOPCODE (SIGUSR1));
if (ptrace (PTRACE_SETOPTIONS, child,
0L, (void *) (long) PTRACE_O_TRACEEXEC) != 0)
{
perror ("PTRACE_SETOPTIONS");
return 4;
}
if (ptrace (PTRACE_CONT, child, 0L, 0L) != 0)
{
perror ("PTRACE_CONT");
return 5;
}
wait_for (child, W_STOPCODE (SIGTRAP | (PTRACE_EVENT_EXEC << 8)));
if (ptrace (PTRACE_CONT, child, 0L, 0L) != 0)
{
perror ("PTRACE_CONT");
return 6;
}
wait_for (child, W_EXITCODE (0, 0));
return 0;
}
Reported-by: Arnd Bergmann <arnd@arndb.de>
CC: Ulrich Weigand <ulrich.weigand@de.ibm.com>
Signed-off-by: Roland McGrath <roland@redhat.com>
2008-12-10 11:36:38 +08:00
|
|
|
/*
|
|
|
|
* Restore the depth counter to its starting value
|
|
|
|
* in this call, so we don't have to rely on every
|
|
|
|
* load_binary function to restore it on return.
|
|
|
|
*/
|
|
|
|
bprm->recursion_depth = depth;
|
2005-04-17 06:20:36 +08:00
|
|
|
if (retval >= 0) {
|
tracehook: exec double-reporting fix
The patch 6341c39 "tracehook: exec" introduced a small regression in
2.6.27 regarding binfmt_misc exec event reporting. Since the reporting
is now done in the common search_binary_handler() function, an exec
of a misc binary will result in two (or possibly multiple) exec events
being reported, instead of just a single one, because the misc handler
contains a recursive call to search_binary_handler.
To add to the confusion, if PTRACE_O_TRACEEXEC is not active, the multiple
SIGTRAP signals will in fact cause only a single ptrace intercept, as the
signals are not queued. However, if PTRACE_O_TRACEEXEC is on, the debugger
will actually see multiple ptrace intercepts (PTRACE_EVENT_EXEC).
The test program included below demonstrates the problem.
This change fixes the bug by calling tracehook_report_exec() only in the
outermost search_binary_handler() call (bprm->recursion_depth == 0).
The additional change to restore bprm->recursion_depth after each binfmt
load_binary call is actually superfluous for this bug, since we test the
value saved on entry to search_binary_handler(). But it keeps the use of
of the depth count to its most obvious expected meaning. Depending on what
binfmt handlers do in certain cases, there could have been false-positive
tests for recursion limits before this change.
/* Test program using PTRACE_O_TRACEEXEC.
This forks and exec's the first argument with the rest of the arguments,
while ptrace'ing. It expects to see one PTRACE_EVENT_EXEC stop and
then a successful exit, with no other signals or events in between.
Test for kernel doing two PTRACE_EVENT_EXEC stops for a binfmt_misc exec:
$ gcc -g traceexec.c -o traceexec
$ sudo sh -c 'echo :test:M::foobar::/bin/cat: > /proc/sys/fs/binfmt_misc/register'
$ echo 'foobar test' > ./foobar
$ chmod +x ./foobar
$ ./traceexec ./foobar; echo $?
==> good <==
foobar test
0
$
==> bad <==
foobar test
unexpected status 0x4057f != 0
3
$
*/
#include <stdio.h>
#include <sys/types.h>
#include <sys/wait.h>
#include <sys/ptrace.h>
#include <unistd.h>
#include <signal.h>
#include <stdlib.h>
static void
wait_for (pid_t child, int expect)
{
int status;
pid_t p = wait (&status);
if (p != child)
{
perror ("wait");
exit (2);
}
if (status != expect)
{
fprintf (stderr, "unexpected status %#x != %#x\n", status, expect);
exit (3);
}
}
int
main (int argc, char **argv)
{
pid_t child = fork ();
if (child < 0)
{
perror ("fork");
return 127;
}
else if (child == 0)
{
ptrace (PTRACE_TRACEME);
raise (SIGUSR1);
execv (argv[1], &argv[1]);
perror ("execve");
_exit (127);
}
wait_for (child, W_STOPCODE (SIGUSR1));
if (ptrace (PTRACE_SETOPTIONS, child,
0L, (void *) (long) PTRACE_O_TRACEEXEC) != 0)
{
perror ("PTRACE_SETOPTIONS");
return 4;
}
if (ptrace (PTRACE_CONT, child, 0L, 0L) != 0)
{
perror ("PTRACE_CONT");
return 5;
}
wait_for (child, W_STOPCODE (SIGTRAP | (PTRACE_EVENT_EXEC << 8)));
if (ptrace (PTRACE_CONT, child, 0L, 0L) != 0)
{
perror ("PTRACE_CONT");
return 6;
}
wait_for (child, W_EXITCODE (0, 0));
return 0;
}
Reported-by: Arnd Bergmann <arnd@arndb.de>
CC: Ulrich Weigand <ulrich.weigand@de.ibm.com>
Signed-off-by: Roland McGrath <roland@redhat.com>
2008-12-10 11:36:38 +08:00
|
|
|
if (depth == 0)
|
2011-06-29 10:13:39 +08:00
|
|
|
ptrace_event(PTRACE_EVENT_EXEC,
|
|
|
|
old_pid);
|
2005-04-17 06:20:36 +08:00
|
|
|
put_binfmt(fmt);
|
|
|
|
allow_write_access(bprm->file);
|
|
|
|
if (bprm->file)
|
|
|
|
fput(bprm->file);
|
|
|
|
bprm->file = NULL;
|
|
|
|
current->did_exec = 1;
|
2005-11-07 16:59:16 +08:00
|
|
|
proc_exec_connector(current);
|
2005-04-17 06:20:36 +08:00
|
|
|
return retval;
|
|
|
|
}
|
|
|
|
read_lock(&binfmt_lock);
|
|
|
|
put_binfmt(fmt);
|
|
|
|
if (retval != -ENOEXEC || bprm->mm == NULL)
|
|
|
|
break;
|
|
|
|
if (!bprm->file) {
|
|
|
|
read_unlock(&binfmt_lock);
|
|
|
|
return retval;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
read_unlock(&binfmt_lock);
|
2011-07-27 07:08:42 +08:00
|
|
|
#ifdef CONFIG_MODULES
|
2005-04-17 06:20:36 +08:00
|
|
|
if (retval != -ENOEXEC || bprm->mm == NULL) {
|
|
|
|
break;
|
2008-07-09 16:28:40 +08:00
|
|
|
} else {
|
2005-04-17 06:20:36 +08:00
|
|
|
#define printable(c) (((c)=='\t') || ((c)=='\n') || (0x20<=(c) && (c)<=0x7e))
|
|
|
|
if (printable(bprm->buf[0]) &&
|
|
|
|
printable(bprm->buf[1]) &&
|
|
|
|
printable(bprm->buf[2]) &&
|
|
|
|
printable(bprm->buf[3]))
|
|
|
|
break; /* -ENOEXEC */
|
2011-07-27 07:08:41 +08:00
|
|
|
if (try)
|
|
|
|
break; /* -ENOEXEC */
|
2005-04-17 06:20:36 +08:00
|
|
|
request_module("binfmt-%04x", *(unsigned short *)(&bprm->buf[2]));
|
|
|
|
}
|
2011-07-27 07:08:42 +08:00
|
|
|
#else
|
|
|
|
break;
|
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
return retval;
|
|
|
|
}
|
|
|
|
|
|
|
|
EXPORT_SYMBOL(search_binary_handler);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* sys_execve() executes a new program.
|
|
|
|
*/
|
2011-03-07 01:02:37 +08:00
|
|
|
static int do_execve_common(const char *filename,
|
|
|
|
struct user_arg_ptr argv,
|
|
|
|
struct user_arg_ptr envp,
|
|
|
|
struct pt_regs *regs)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
struct linux_binprm *bprm;
|
|
|
|
struct file *file;
|
2008-04-22 17:31:30 +08:00
|
|
|
struct files_struct *displaced;
|
do_execve() must not clear fs->in_exec if it was set by another thread
If do_execve() fails after check_unsafe_exec(), it clears fs->in_exec
unconditionally. This is wrong if we race with our sub-thread which
also does do_execve:
Two threads T1 and T2 and another process P, all share the same
->fs.
T1 starts do_execve(BAD_FILE). It calls check_unsafe_exec(), since
->fs is shared, we set LSM_UNSAFE but not ->in_exec.
P exits and decrements fs->users.
T2 starts do_execve(), calls check_unsafe_exec(), now ->fs is not
shared, we set fs->in_exec.
T1 continues, open_exec(BAD_FILE) fails, we clear ->in_exec and
return to the user-space.
T1 does clone(CLONE_FS /* without CLONE_THREAD */).
T2 continues without LSM_UNSAFE_SHARE while ->fs is shared with
another process.
Change check_unsafe_exec() to return res = 1 if we set ->in_exec, and change
do_execve() to clear ->in_exec depending on res.
When do_execve() suceeds, it is safe to clear ->in_exec unconditionally.
It can be set only if we don't share ->fs with another process, and since
we already killed all sub-threads either ->in_exec == 0 or we are the
only user of this ->fs.
Also, we do not need fs->lock to clear fs->in_exec.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Roland McGrath <roland@redhat.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-24 07:01:56 +08:00
|
|
|
bool clear_in_exec;
|
2005-04-17 06:20:36 +08:00
|
|
|
int retval;
|
2011-08-08 23:02:04 +08:00
|
|
|
const struct cred *cred = current_cred();
|
|
|
|
|
|
|
|
/*
|
|
|
|
* We move the actual failure in case of RLIMIT_NPROC excess from
|
|
|
|
* set*uid() to execve() because too many poorly written programs
|
|
|
|
* don't check setuid() return code. Here we additionally recheck
|
|
|
|
* whether NPROC limit is still exceeded.
|
|
|
|
*/
|
|
|
|
if ((current->flags & PF_NPROC_EXCEEDED) &&
|
|
|
|
atomic_read(&cred->user->processes) > rlimit(RLIMIT_NPROC)) {
|
|
|
|
retval = -EAGAIN;
|
|
|
|
goto out_ret;
|
|
|
|
}
|
|
|
|
|
|
|
|
/* We're below the limit (still or again), so we don't want to make
|
|
|
|
* further execve() calls fail. */
|
|
|
|
current->flags &= ~PF_NPROC_EXCEEDED;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-04-22 17:31:30 +08:00
|
|
|
retval = unshare_files(&displaced);
|
2008-04-22 17:11:59 +08:00
|
|
|
if (retval)
|
|
|
|
goto out_ret;
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
retval = -ENOMEM;
|
2006-03-25 19:08:13 +08:00
|
|
|
bprm = kzalloc(sizeof(*bprm), GFP_KERNEL);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!bprm)
|
2008-04-22 17:11:59 +08:00
|
|
|
goto out_files;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-09-06 02:17:13 +08:00
|
|
|
retval = prepare_bprm_creds(bprm);
|
|
|
|
if (retval)
|
CRED: Make execve() take advantage of copy-on-write credentials
Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
The credential bits from struct linux_binprm are, for the most part,
replaced with a single credentials pointer (bprm->cred). This means that
all the creds can be calculated in advance and then applied at the point
of no return with no possibility of failure.
I would like to replace bprm->cap_effective with:
cap_isclear(bprm->cap_effective)
but this seems impossible due to special behaviour for processes of pid 1
(they always retain their parent's capability masks where normally they'd
be changed - see cap_bprm_set_creds()).
The following sequence of events now happens:
(a) At the start of do_execve, the current task's cred_exec_mutex is
locked to prevent PTRACE_ATTACH from obsoleting the calculation of
creds that we make.
(a) prepare_exec_creds() is then called to make a copy of the current
task's credentials and prepare it. This copy is then assigned to
bprm->cred.
This renders security_bprm_alloc() and security_bprm_free()
unnecessary, and so they've been removed.
(b) The determination of unsafe execution is now performed immediately
after (a) rather than later on in the code. The result is stored in
bprm->unsafe for future reference.
(c) prepare_binprm() is called, possibly multiple times.
(i) This applies the result of set[ug]id binaries to the new creds
attached to bprm->cred. Personality bit clearance is recorded,
but now deferred on the basis that the exec procedure may yet
fail.
(ii) This then calls the new security_bprm_set_creds(). This should
calculate the new LSM and capability credentials into *bprm->cred.
This folds together security_bprm_set() and parts of
security_bprm_apply_creds() (these two have been removed).
Anything that might fail must be done at this point.
(iii) bprm->cred_prepared is set to 1.
bprm->cred_prepared is 0 on the first pass of the security
calculations, and 1 on all subsequent passes. This allows SELinux
in (ii) to base its calculations only on the initial script and
not on the interpreter.
(d) flush_old_exec() is called to commit the task to execution. This
performs the following steps with regard to credentials:
(i) Clear pdeath_signal and set dumpable on certain circumstances that
may not be covered by commit_creds().
(ii) Clear any bits in current->personality that were deferred from
(c.i).
(e) install_exec_creds() [compute_creds() as was] is called to install the
new credentials. This performs the following steps with regard to
credentials:
(i) Calls security_bprm_committing_creds() to apply any security
requirements, such as flushing unauthorised files in SELinux, that
must be done before the credentials are changed.
This is made up of bits of security_bprm_apply_creds() and
security_bprm_post_apply_creds(), both of which have been removed.
This function is not allowed to fail; anything that might fail
must have been done in (c.ii).
(ii) Calls commit_creds() to apply the new credentials in a single
assignment (more or less). Possibly pdeath_signal and dumpable
should be part of struct creds.
(iii) Unlocks the task's cred_replace_mutex, thus allowing
PTRACE_ATTACH to take place.
(iv) Clears The bprm->cred pointer as the credentials it was holding
are now immutable.
(v) Calls security_bprm_committed_creds() to apply any security
alterations that must be done after the creds have been changed.
SELinux uses this to flush signals and signal handlers.
(f) If an error occurs before (d.i), bprm_free() will call abort_creds()
to destroy the proposed new credentials and will then unlock
cred_replace_mutex. No changes to the credentials will have been
made.
(2) LSM interface.
A number of functions have been changed, added or removed:
(*) security_bprm_alloc(), ->bprm_alloc_security()
(*) security_bprm_free(), ->bprm_free_security()
Removed in favour of preparing new credentials and modifying those.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
(*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()
Removed; split between security_bprm_set_creds(),
security_bprm_committing_creds() and security_bprm_committed_creds().
(*) security_bprm_set(), ->bprm_set_security()
Removed; folded into security_bprm_set_creds().
(*) security_bprm_set_creds(), ->bprm_set_creds()
New. The new credentials in bprm->creds should be checked and set up
as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the
second and subsequent calls.
(*) security_bprm_committing_creds(), ->bprm_committing_creds()
(*) security_bprm_committed_creds(), ->bprm_committed_creds()
New. Apply the security effects of the new credentials. This
includes closing unauthorised files in SELinux. This function may not
fail. When the former is called, the creds haven't yet been applied
to the process; when the latter is called, they have.
The former may access bprm->cred, the latter may not.
(3) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) The bprm_security_struct struct has been removed in favour of using
the credentials-under-construction approach.
(c) flush_unauthorized_files() now takes a cred pointer and passes it on
to inode_has_perm(), file_has_perm() and dentry_open().
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:24 +08:00
|
|
|
goto out_free;
|
2009-03-30 19:20:30 +08:00
|
|
|
|
|
|
|
retval = check_unsafe_exec(bprm);
|
do_execve() must not clear fs->in_exec if it was set by another thread
If do_execve() fails after check_unsafe_exec(), it clears fs->in_exec
unconditionally. This is wrong if we race with our sub-thread which
also does do_execve:
Two threads T1 and T2 and another process P, all share the same
->fs.
T1 starts do_execve(BAD_FILE). It calls check_unsafe_exec(), since
->fs is shared, we set LSM_UNSAFE but not ->in_exec.
P exits and decrements fs->users.
T2 starts do_execve(), calls check_unsafe_exec(), now ->fs is not
shared, we set fs->in_exec.
T1 continues, open_exec(BAD_FILE) fails, we clear ->in_exec and
return to the user-space.
T1 does clone(CLONE_FS /* without CLONE_THREAD */).
T2 continues without LSM_UNSAFE_SHARE while ->fs is shared with
another process.
Change check_unsafe_exec() to return res = 1 if we set ->in_exec, and change
do_execve() to clear ->in_exec depending on res.
When do_execve() suceeds, it is safe to clear ->in_exec unconditionally.
It can be set only if we don't share ->fs with another process, and since
we already killed all sub-threads either ->in_exec == 0 or we are the
only user of this ->fs.
Also, we do not need fs->lock to clear fs->in_exec.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Roland McGrath <roland@redhat.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-24 07:01:56 +08:00
|
|
|
if (retval < 0)
|
2009-09-06 02:17:13 +08:00
|
|
|
goto out_free;
|
do_execve() must not clear fs->in_exec if it was set by another thread
If do_execve() fails after check_unsafe_exec(), it clears fs->in_exec
unconditionally. This is wrong if we race with our sub-thread which
also does do_execve:
Two threads T1 and T2 and another process P, all share the same
->fs.
T1 starts do_execve(BAD_FILE). It calls check_unsafe_exec(), since
->fs is shared, we set LSM_UNSAFE but not ->in_exec.
P exits and decrements fs->users.
T2 starts do_execve(), calls check_unsafe_exec(), now ->fs is not
shared, we set fs->in_exec.
T1 continues, open_exec(BAD_FILE) fails, we clear ->in_exec and
return to the user-space.
T1 does clone(CLONE_FS /* without CLONE_THREAD */).
T2 continues without LSM_UNSAFE_SHARE while ->fs is shared with
another process.
Change check_unsafe_exec() to return res = 1 if we set ->in_exec, and change
do_execve() to clear ->in_exec depending on res.
When do_execve() suceeds, it is safe to clear ->in_exec unconditionally.
It can be set only if we don't share ->fs with another process, and since
we already killed all sub-threads either ->in_exec == 0 or we are the
only user of this ->fs.
Also, we do not need fs->lock to clear fs->in_exec.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Roland McGrath <roland@redhat.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-24 07:01:56 +08:00
|
|
|
clear_in_exec = retval;
|
2009-09-06 02:17:13 +08:00
|
|
|
current->in_execve = 1;
|
CRED: Make execve() take advantage of copy-on-write credentials
Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
The credential bits from struct linux_binprm are, for the most part,
replaced with a single credentials pointer (bprm->cred). This means that
all the creds can be calculated in advance and then applied at the point
of no return with no possibility of failure.
I would like to replace bprm->cap_effective with:
cap_isclear(bprm->cap_effective)
but this seems impossible due to special behaviour for processes of pid 1
(they always retain their parent's capability masks where normally they'd
be changed - see cap_bprm_set_creds()).
The following sequence of events now happens:
(a) At the start of do_execve, the current task's cred_exec_mutex is
locked to prevent PTRACE_ATTACH from obsoleting the calculation of
creds that we make.
(a) prepare_exec_creds() is then called to make a copy of the current
task's credentials and prepare it. This copy is then assigned to
bprm->cred.
This renders security_bprm_alloc() and security_bprm_free()
unnecessary, and so they've been removed.
(b) The determination of unsafe execution is now performed immediately
after (a) rather than later on in the code. The result is stored in
bprm->unsafe for future reference.
(c) prepare_binprm() is called, possibly multiple times.
(i) This applies the result of set[ug]id binaries to the new creds
attached to bprm->cred. Personality bit clearance is recorded,
but now deferred on the basis that the exec procedure may yet
fail.
(ii) This then calls the new security_bprm_set_creds(). This should
calculate the new LSM and capability credentials into *bprm->cred.
This folds together security_bprm_set() and parts of
security_bprm_apply_creds() (these two have been removed).
Anything that might fail must be done at this point.
(iii) bprm->cred_prepared is set to 1.
bprm->cred_prepared is 0 on the first pass of the security
calculations, and 1 on all subsequent passes. This allows SELinux
in (ii) to base its calculations only on the initial script and
not on the interpreter.
(d) flush_old_exec() is called to commit the task to execution. This
performs the following steps with regard to credentials:
(i) Clear pdeath_signal and set dumpable on certain circumstances that
may not be covered by commit_creds().
(ii) Clear any bits in current->personality that were deferred from
(c.i).
(e) install_exec_creds() [compute_creds() as was] is called to install the
new credentials. This performs the following steps with regard to
credentials:
(i) Calls security_bprm_committing_creds() to apply any security
requirements, such as flushing unauthorised files in SELinux, that
must be done before the credentials are changed.
This is made up of bits of security_bprm_apply_creds() and
security_bprm_post_apply_creds(), both of which have been removed.
This function is not allowed to fail; anything that might fail
must have been done in (c.ii).
(ii) Calls commit_creds() to apply the new credentials in a single
assignment (more or less). Possibly pdeath_signal and dumpable
should be part of struct creds.
(iii) Unlocks the task's cred_replace_mutex, thus allowing
PTRACE_ATTACH to take place.
(iv) Clears The bprm->cred pointer as the credentials it was holding
are now immutable.
(v) Calls security_bprm_committed_creds() to apply any security
alterations that must be done after the creds have been changed.
SELinux uses this to flush signals and signal handlers.
(f) If an error occurs before (d.i), bprm_free() will call abort_creds()
to destroy the proposed new credentials and will then unlock
cred_replace_mutex. No changes to the credentials will have been
made.
(2) LSM interface.
A number of functions have been changed, added or removed:
(*) security_bprm_alloc(), ->bprm_alloc_security()
(*) security_bprm_free(), ->bprm_free_security()
Removed in favour of preparing new credentials and modifying those.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
(*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()
Removed; split between security_bprm_set_creds(),
security_bprm_committing_creds() and security_bprm_committed_creds().
(*) security_bprm_set(), ->bprm_set_security()
Removed; folded into security_bprm_set_creds().
(*) security_bprm_set_creds(), ->bprm_set_creds()
New. The new credentials in bprm->creds should be checked and set up
as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the
second and subsequent calls.
(*) security_bprm_committing_creds(), ->bprm_committing_creds()
(*) security_bprm_committed_creds(), ->bprm_committed_creds()
New. Apply the security effects of the new credentials. This
includes closing unauthorised files in SELinux. This function may not
fail. When the former is called, the creds haven't yet been applied
to the process; when the latter is called, they have.
The former may access bprm->cred, the latter may not.
(3) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) The bprm_security_struct struct has been removed in favour of using
the credentials-under-construction approach.
(c) flush_unauthorized_files() now takes a cred pointer and passes it on
to inode_has_perm(), file_has_perm() and dentry_open().
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:24 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
file = open_exec(filename);
|
|
|
|
retval = PTR_ERR(file);
|
|
|
|
if (IS_ERR(file))
|
2009-03-30 19:20:30 +08:00
|
|
|
goto out_unmark;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
sched_exec();
|
|
|
|
|
|
|
|
bprm->file = file;
|
|
|
|
bprm->filename = filename;
|
|
|
|
bprm->interp = filename;
|
|
|
|
|
2007-07-19 16:48:16 +08:00
|
|
|
retval = bprm_mm_init(bprm);
|
|
|
|
if (retval)
|
|
|
|
goto out_file;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-07-19 16:48:16 +08:00
|
|
|
bprm->argc = count(argv, MAX_ARG_STRINGS);
|
2005-04-17 06:20:36 +08:00
|
|
|
if ((retval = bprm->argc) < 0)
|
CRED: Make execve() take advantage of copy-on-write credentials
Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
The credential bits from struct linux_binprm are, for the most part,
replaced with a single credentials pointer (bprm->cred). This means that
all the creds can be calculated in advance and then applied at the point
of no return with no possibility of failure.
I would like to replace bprm->cap_effective with:
cap_isclear(bprm->cap_effective)
but this seems impossible due to special behaviour for processes of pid 1
(they always retain their parent's capability masks where normally they'd
be changed - see cap_bprm_set_creds()).
The following sequence of events now happens:
(a) At the start of do_execve, the current task's cred_exec_mutex is
locked to prevent PTRACE_ATTACH from obsoleting the calculation of
creds that we make.
(a) prepare_exec_creds() is then called to make a copy of the current
task's credentials and prepare it. This copy is then assigned to
bprm->cred.
This renders security_bprm_alloc() and security_bprm_free()
unnecessary, and so they've been removed.
(b) The determination of unsafe execution is now performed immediately
after (a) rather than later on in the code. The result is stored in
bprm->unsafe for future reference.
(c) prepare_binprm() is called, possibly multiple times.
(i) This applies the result of set[ug]id binaries to the new creds
attached to bprm->cred. Personality bit clearance is recorded,
but now deferred on the basis that the exec procedure may yet
fail.
(ii) This then calls the new security_bprm_set_creds(). This should
calculate the new LSM and capability credentials into *bprm->cred.
This folds together security_bprm_set() and parts of
security_bprm_apply_creds() (these two have been removed).
Anything that might fail must be done at this point.
(iii) bprm->cred_prepared is set to 1.
bprm->cred_prepared is 0 on the first pass of the security
calculations, and 1 on all subsequent passes. This allows SELinux
in (ii) to base its calculations only on the initial script and
not on the interpreter.
(d) flush_old_exec() is called to commit the task to execution. This
performs the following steps with regard to credentials:
(i) Clear pdeath_signal and set dumpable on certain circumstances that
may not be covered by commit_creds().
(ii) Clear any bits in current->personality that were deferred from
(c.i).
(e) install_exec_creds() [compute_creds() as was] is called to install the
new credentials. This performs the following steps with regard to
credentials:
(i) Calls security_bprm_committing_creds() to apply any security
requirements, such as flushing unauthorised files in SELinux, that
must be done before the credentials are changed.
This is made up of bits of security_bprm_apply_creds() and
security_bprm_post_apply_creds(), both of which have been removed.
This function is not allowed to fail; anything that might fail
must have been done in (c.ii).
(ii) Calls commit_creds() to apply the new credentials in a single
assignment (more or less). Possibly pdeath_signal and dumpable
should be part of struct creds.
(iii) Unlocks the task's cred_replace_mutex, thus allowing
PTRACE_ATTACH to take place.
(iv) Clears The bprm->cred pointer as the credentials it was holding
are now immutable.
(v) Calls security_bprm_committed_creds() to apply any security
alterations that must be done after the creds have been changed.
SELinux uses this to flush signals and signal handlers.
(f) If an error occurs before (d.i), bprm_free() will call abort_creds()
to destroy the proposed new credentials and will then unlock
cred_replace_mutex. No changes to the credentials will have been
made.
(2) LSM interface.
A number of functions have been changed, added or removed:
(*) security_bprm_alloc(), ->bprm_alloc_security()
(*) security_bprm_free(), ->bprm_free_security()
Removed in favour of preparing new credentials and modifying those.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
(*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()
Removed; split between security_bprm_set_creds(),
security_bprm_committing_creds() and security_bprm_committed_creds().
(*) security_bprm_set(), ->bprm_set_security()
Removed; folded into security_bprm_set_creds().
(*) security_bprm_set_creds(), ->bprm_set_creds()
New. The new credentials in bprm->creds should be checked and set up
as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the
second and subsequent calls.
(*) security_bprm_committing_creds(), ->bprm_committing_creds()
(*) security_bprm_committed_creds(), ->bprm_committed_creds()
New. Apply the security effects of the new credentials. This
includes closing unauthorised files in SELinux. This function may not
fail. When the former is called, the creds haven't yet been applied
to the process; when the latter is called, they have.
The former may access bprm->cred, the latter may not.
(3) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) The bprm_security_struct struct has been removed in favour of using
the credentials-under-construction approach.
(c) flush_unauthorized_files() now takes a cred pointer and passes it on
to inode_has_perm(), file_has_perm() and dentry_open().
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:24 +08:00
|
|
|
goto out;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-07-19 16:48:16 +08:00
|
|
|
bprm->envc = count(envp, MAX_ARG_STRINGS);
|
2005-04-17 06:20:36 +08:00
|
|
|
if ((retval = bprm->envc) < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
retval = prepare_binprm(bprm);
|
|
|
|
if (retval < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
retval = copy_strings_kernel(1, &bprm->filename, bprm);
|
|
|
|
if (retval < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
bprm->exec = bprm->p;
|
|
|
|
retval = copy_strings(bprm->envc, envp, bprm);
|
|
|
|
if (retval < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
retval = copy_strings(bprm->argc, argv, bprm);
|
|
|
|
if (retval < 0)
|
|
|
|
goto out;
|
|
|
|
|
|
|
|
retval = search_binary_handler(bprm,regs);
|
CRED: Make execve() take advantage of copy-on-write credentials
Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
The credential bits from struct linux_binprm are, for the most part,
replaced with a single credentials pointer (bprm->cred). This means that
all the creds can be calculated in advance and then applied at the point
of no return with no possibility of failure.
I would like to replace bprm->cap_effective with:
cap_isclear(bprm->cap_effective)
but this seems impossible due to special behaviour for processes of pid 1
(they always retain their parent's capability masks where normally they'd
be changed - see cap_bprm_set_creds()).
The following sequence of events now happens:
(a) At the start of do_execve, the current task's cred_exec_mutex is
locked to prevent PTRACE_ATTACH from obsoleting the calculation of
creds that we make.
(a) prepare_exec_creds() is then called to make a copy of the current
task's credentials and prepare it. This copy is then assigned to
bprm->cred.
This renders security_bprm_alloc() and security_bprm_free()
unnecessary, and so they've been removed.
(b) The determination of unsafe execution is now performed immediately
after (a) rather than later on in the code. The result is stored in
bprm->unsafe for future reference.
(c) prepare_binprm() is called, possibly multiple times.
(i) This applies the result of set[ug]id binaries to the new creds
attached to bprm->cred. Personality bit clearance is recorded,
but now deferred on the basis that the exec procedure may yet
fail.
(ii) This then calls the new security_bprm_set_creds(). This should
calculate the new LSM and capability credentials into *bprm->cred.
This folds together security_bprm_set() and parts of
security_bprm_apply_creds() (these two have been removed).
Anything that might fail must be done at this point.
(iii) bprm->cred_prepared is set to 1.
bprm->cred_prepared is 0 on the first pass of the security
calculations, and 1 on all subsequent passes. This allows SELinux
in (ii) to base its calculations only on the initial script and
not on the interpreter.
(d) flush_old_exec() is called to commit the task to execution. This
performs the following steps with regard to credentials:
(i) Clear pdeath_signal and set dumpable on certain circumstances that
may not be covered by commit_creds().
(ii) Clear any bits in current->personality that were deferred from
(c.i).
(e) install_exec_creds() [compute_creds() as was] is called to install the
new credentials. This performs the following steps with regard to
credentials:
(i) Calls security_bprm_committing_creds() to apply any security
requirements, such as flushing unauthorised files in SELinux, that
must be done before the credentials are changed.
This is made up of bits of security_bprm_apply_creds() and
security_bprm_post_apply_creds(), both of which have been removed.
This function is not allowed to fail; anything that might fail
must have been done in (c.ii).
(ii) Calls commit_creds() to apply the new credentials in a single
assignment (more or less). Possibly pdeath_signal and dumpable
should be part of struct creds.
(iii) Unlocks the task's cred_replace_mutex, thus allowing
PTRACE_ATTACH to take place.
(iv) Clears The bprm->cred pointer as the credentials it was holding
are now immutable.
(v) Calls security_bprm_committed_creds() to apply any security
alterations that must be done after the creds have been changed.
SELinux uses this to flush signals and signal handlers.
(f) If an error occurs before (d.i), bprm_free() will call abort_creds()
to destroy the proposed new credentials and will then unlock
cred_replace_mutex. No changes to the credentials will have been
made.
(2) LSM interface.
A number of functions have been changed, added or removed:
(*) security_bprm_alloc(), ->bprm_alloc_security()
(*) security_bprm_free(), ->bprm_free_security()
Removed in favour of preparing new credentials and modifying those.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
(*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()
Removed; split between security_bprm_set_creds(),
security_bprm_committing_creds() and security_bprm_committed_creds().
(*) security_bprm_set(), ->bprm_set_security()
Removed; folded into security_bprm_set_creds().
(*) security_bprm_set_creds(), ->bprm_set_creds()
New. The new credentials in bprm->creds should be checked and set up
as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the
second and subsequent calls.
(*) security_bprm_committing_creds(), ->bprm_committing_creds()
(*) security_bprm_committed_creds(), ->bprm_committed_creds()
New. Apply the security effects of the new credentials. This
includes closing unauthorised files in SELinux. This function may not
fail. When the former is called, the creds haven't yet been applied
to the process; when the latter is called, they have.
The former may access bprm->cred, the latter may not.
(3) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) The bprm_security_struct struct has been removed in favour of using
the credentials-under-construction approach.
(c) flush_unauthorized_files() now takes a cred pointer and passes it on
to inode_has_perm(), file_has_perm() and dentry_open().
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:24 +08:00
|
|
|
if (retval < 0)
|
|
|
|
goto out;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
CRED: Make execve() take advantage of copy-on-write credentials
Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
The credential bits from struct linux_binprm are, for the most part,
replaced with a single credentials pointer (bprm->cred). This means that
all the creds can be calculated in advance and then applied at the point
of no return with no possibility of failure.
I would like to replace bprm->cap_effective with:
cap_isclear(bprm->cap_effective)
but this seems impossible due to special behaviour for processes of pid 1
(they always retain their parent's capability masks where normally they'd
be changed - see cap_bprm_set_creds()).
The following sequence of events now happens:
(a) At the start of do_execve, the current task's cred_exec_mutex is
locked to prevent PTRACE_ATTACH from obsoleting the calculation of
creds that we make.
(a) prepare_exec_creds() is then called to make a copy of the current
task's credentials and prepare it. This copy is then assigned to
bprm->cred.
This renders security_bprm_alloc() and security_bprm_free()
unnecessary, and so they've been removed.
(b) The determination of unsafe execution is now performed immediately
after (a) rather than later on in the code. The result is stored in
bprm->unsafe for future reference.
(c) prepare_binprm() is called, possibly multiple times.
(i) This applies the result of set[ug]id binaries to the new creds
attached to bprm->cred. Personality bit clearance is recorded,
but now deferred on the basis that the exec procedure may yet
fail.
(ii) This then calls the new security_bprm_set_creds(). This should
calculate the new LSM and capability credentials into *bprm->cred.
This folds together security_bprm_set() and parts of
security_bprm_apply_creds() (these two have been removed).
Anything that might fail must be done at this point.
(iii) bprm->cred_prepared is set to 1.
bprm->cred_prepared is 0 on the first pass of the security
calculations, and 1 on all subsequent passes. This allows SELinux
in (ii) to base its calculations only on the initial script and
not on the interpreter.
(d) flush_old_exec() is called to commit the task to execution. This
performs the following steps with regard to credentials:
(i) Clear pdeath_signal and set dumpable on certain circumstances that
may not be covered by commit_creds().
(ii) Clear any bits in current->personality that were deferred from
(c.i).
(e) install_exec_creds() [compute_creds() as was] is called to install the
new credentials. This performs the following steps with regard to
credentials:
(i) Calls security_bprm_committing_creds() to apply any security
requirements, such as flushing unauthorised files in SELinux, that
must be done before the credentials are changed.
This is made up of bits of security_bprm_apply_creds() and
security_bprm_post_apply_creds(), both of which have been removed.
This function is not allowed to fail; anything that might fail
must have been done in (c.ii).
(ii) Calls commit_creds() to apply the new credentials in a single
assignment (more or less). Possibly pdeath_signal and dumpable
should be part of struct creds.
(iii) Unlocks the task's cred_replace_mutex, thus allowing
PTRACE_ATTACH to take place.
(iv) Clears The bprm->cred pointer as the credentials it was holding
are now immutable.
(v) Calls security_bprm_committed_creds() to apply any security
alterations that must be done after the creds have been changed.
SELinux uses this to flush signals and signal handlers.
(f) If an error occurs before (d.i), bprm_free() will call abort_creds()
to destroy the proposed new credentials and will then unlock
cred_replace_mutex. No changes to the credentials will have been
made.
(2) LSM interface.
A number of functions have been changed, added or removed:
(*) security_bprm_alloc(), ->bprm_alloc_security()
(*) security_bprm_free(), ->bprm_free_security()
Removed in favour of preparing new credentials and modifying those.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
(*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()
Removed; split between security_bprm_set_creds(),
security_bprm_committing_creds() and security_bprm_committed_creds().
(*) security_bprm_set(), ->bprm_set_security()
Removed; folded into security_bprm_set_creds().
(*) security_bprm_set_creds(), ->bprm_set_creds()
New. The new credentials in bprm->creds should be checked and set up
as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the
second and subsequent calls.
(*) security_bprm_committing_creds(), ->bprm_committing_creds()
(*) security_bprm_committed_creds(), ->bprm_committed_creds()
New. Apply the security effects of the new credentials. This
includes closing unauthorised files in SELinux. This function may not
fail. When the former is called, the creds haven't yet been applied
to the process; when the latter is called, they have.
The former may access bprm->cred, the latter may not.
(3) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) The bprm_security_struct struct has been removed in favour of using
the credentials-under-construction approach.
(c) flush_unauthorized_files() now takes a cred pointer and passes it on
to inode_has_perm(), file_has_perm() and dentry_open().
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:24 +08:00
|
|
|
/* execve succeeded */
|
2009-03-30 19:20:30 +08:00
|
|
|
current->fs->in_exec = 0;
|
2009-02-05 16:18:11 +08:00
|
|
|
current->in_execve = 0;
|
CRED: Make execve() take advantage of copy-on-write credentials
Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
The credential bits from struct linux_binprm are, for the most part,
replaced with a single credentials pointer (bprm->cred). This means that
all the creds can be calculated in advance and then applied at the point
of no return with no possibility of failure.
I would like to replace bprm->cap_effective with:
cap_isclear(bprm->cap_effective)
but this seems impossible due to special behaviour for processes of pid 1
(they always retain their parent's capability masks where normally they'd
be changed - see cap_bprm_set_creds()).
The following sequence of events now happens:
(a) At the start of do_execve, the current task's cred_exec_mutex is
locked to prevent PTRACE_ATTACH from obsoleting the calculation of
creds that we make.
(a) prepare_exec_creds() is then called to make a copy of the current
task's credentials and prepare it. This copy is then assigned to
bprm->cred.
This renders security_bprm_alloc() and security_bprm_free()
unnecessary, and so they've been removed.
(b) The determination of unsafe execution is now performed immediately
after (a) rather than later on in the code. The result is stored in
bprm->unsafe for future reference.
(c) prepare_binprm() is called, possibly multiple times.
(i) This applies the result of set[ug]id binaries to the new creds
attached to bprm->cred. Personality bit clearance is recorded,
but now deferred on the basis that the exec procedure may yet
fail.
(ii) This then calls the new security_bprm_set_creds(). This should
calculate the new LSM and capability credentials into *bprm->cred.
This folds together security_bprm_set() and parts of
security_bprm_apply_creds() (these two have been removed).
Anything that might fail must be done at this point.
(iii) bprm->cred_prepared is set to 1.
bprm->cred_prepared is 0 on the first pass of the security
calculations, and 1 on all subsequent passes. This allows SELinux
in (ii) to base its calculations only on the initial script and
not on the interpreter.
(d) flush_old_exec() is called to commit the task to execution. This
performs the following steps with regard to credentials:
(i) Clear pdeath_signal and set dumpable on certain circumstances that
may not be covered by commit_creds().
(ii) Clear any bits in current->personality that were deferred from
(c.i).
(e) install_exec_creds() [compute_creds() as was] is called to install the
new credentials. This performs the following steps with regard to
credentials:
(i) Calls security_bprm_committing_creds() to apply any security
requirements, such as flushing unauthorised files in SELinux, that
must be done before the credentials are changed.
This is made up of bits of security_bprm_apply_creds() and
security_bprm_post_apply_creds(), both of which have been removed.
This function is not allowed to fail; anything that might fail
must have been done in (c.ii).
(ii) Calls commit_creds() to apply the new credentials in a single
assignment (more or less). Possibly pdeath_signal and dumpable
should be part of struct creds.
(iii) Unlocks the task's cred_replace_mutex, thus allowing
PTRACE_ATTACH to take place.
(iv) Clears The bprm->cred pointer as the credentials it was holding
are now immutable.
(v) Calls security_bprm_committed_creds() to apply any security
alterations that must be done after the creds have been changed.
SELinux uses this to flush signals and signal handlers.
(f) If an error occurs before (d.i), bprm_free() will call abort_creds()
to destroy the proposed new credentials and will then unlock
cred_replace_mutex. No changes to the credentials will have been
made.
(2) LSM interface.
A number of functions have been changed, added or removed:
(*) security_bprm_alloc(), ->bprm_alloc_security()
(*) security_bprm_free(), ->bprm_free_security()
Removed in favour of preparing new credentials and modifying those.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
(*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()
Removed; split between security_bprm_set_creds(),
security_bprm_committing_creds() and security_bprm_committed_creds().
(*) security_bprm_set(), ->bprm_set_security()
Removed; folded into security_bprm_set_creds().
(*) security_bprm_set_creds(), ->bprm_set_creds()
New. The new credentials in bprm->creds should be checked and set up
as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the
second and subsequent calls.
(*) security_bprm_committing_creds(), ->bprm_committing_creds()
(*) security_bprm_committed_creds(), ->bprm_committed_creds()
New. Apply the security effects of the new credentials. This
includes closing unauthorised files in SELinux. This function may not
fail. When the former is called, the creds haven't yet been applied
to the process; when the latter is called, they have.
The former may access bprm->cred, the latter may not.
(3) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) The bprm_security_struct struct has been removed in favour of using
the credentials-under-construction approach.
(c) flush_unauthorized_files() now takes a cred pointer and passes it on
to inode_has_perm(), file_has_perm() and dentry_open().
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:24 +08:00
|
|
|
acct_update_integrals(current);
|
|
|
|
free_bprm(bprm);
|
|
|
|
if (displaced)
|
|
|
|
put_files_struct(displaced);
|
|
|
|
return retval;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
CRED: Make execve() take advantage of copy-on-write credentials
Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
The credential bits from struct linux_binprm are, for the most part,
replaced with a single credentials pointer (bprm->cred). This means that
all the creds can be calculated in advance and then applied at the point
of no return with no possibility of failure.
I would like to replace bprm->cap_effective with:
cap_isclear(bprm->cap_effective)
but this seems impossible due to special behaviour for processes of pid 1
(they always retain their parent's capability masks where normally they'd
be changed - see cap_bprm_set_creds()).
The following sequence of events now happens:
(a) At the start of do_execve, the current task's cred_exec_mutex is
locked to prevent PTRACE_ATTACH from obsoleting the calculation of
creds that we make.
(a) prepare_exec_creds() is then called to make a copy of the current
task's credentials and prepare it. This copy is then assigned to
bprm->cred.
This renders security_bprm_alloc() and security_bprm_free()
unnecessary, and so they've been removed.
(b) The determination of unsafe execution is now performed immediately
after (a) rather than later on in the code. The result is stored in
bprm->unsafe for future reference.
(c) prepare_binprm() is called, possibly multiple times.
(i) This applies the result of set[ug]id binaries to the new creds
attached to bprm->cred. Personality bit clearance is recorded,
but now deferred on the basis that the exec procedure may yet
fail.
(ii) This then calls the new security_bprm_set_creds(). This should
calculate the new LSM and capability credentials into *bprm->cred.
This folds together security_bprm_set() and parts of
security_bprm_apply_creds() (these two have been removed).
Anything that might fail must be done at this point.
(iii) bprm->cred_prepared is set to 1.
bprm->cred_prepared is 0 on the first pass of the security
calculations, and 1 on all subsequent passes. This allows SELinux
in (ii) to base its calculations only on the initial script and
not on the interpreter.
(d) flush_old_exec() is called to commit the task to execution. This
performs the following steps with regard to credentials:
(i) Clear pdeath_signal and set dumpable on certain circumstances that
may not be covered by commit_creds().
(ii) Clear any bits in current->personality that were deferred from
(c.i).
(e) install_exec_creds() [compute_creds() as was] is called to install the
new credentials. This performs the following steps with regard to
credentials:
(i) Calls security_bprm_committing_creds() to apply any security
requirements, such as flushing unauthorised files in SELinux, that
must be done before the credentials are changed.
This is made up of bits of security_bprm_apply_creds() and
security_bprm_post_apply_creds(), both of which have been removed.
This function is not allowed to fail; anything that might fail
must have been done in (c.ii).
(ii) Calls commit_creds() to apply the new credentials in a single
assignment (more or less). Possibly pdeath_signal and dumpable
should be part of struct creds.
(iii) Unlocks the task's cred_replace_mutex, thus allowing
PTRACE_ATTACH to take place.
(iv) Clears The bprm->cred pointer as the credentials it was holding
are now immutable.
(v) Calls security_bprm_committed_creds() to apply any security
alterations that must be done after the creds have been changed.
SELinux uses this to flush signals and signal handlers.
(f) If an error occurs before (d.i), bprm_free() will call abort_creds()
to destroy the proposed new credentials and will then unlock
cred_replace_mutex. No changes to the credentials will have been
made.
(2) LSM interface.
A number of functions have been changed, added or removed:
(*) security_bprm_alloc(), ->bprm_alloc_security()
(*) security_bprm_free(), ->bprm_free_security()
Removed in favour of preparing new credentials and modifying those.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
(*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()
Removed; split between security_bprm_set_creds(),
security_bprm_committing_creds() and security_bprm_committed_creds().
(*) security_bprm_set(), ->bprm_set_security()
Removed; folded into security_bprm_set_creds().
(*) security_bprm_set_creds(), ->bprm_set_creds()
New. The new credentials in bprm->creds should be checked and set up
as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the
second and subsequent calls.
(*) security_bprm_committing_creds(), ->bprm_committing_creds()
(*) security_bprm_committed_creds(), ->bprm_committed_creds()
New. Apply the security effects of the new credentials. This
includes closing unauthorised files in SELinux. This function may not
fail. When the former is called, the creds haven't yet been applied
to the process; when the latter is called, they have.
The former may access bprm->cred, the latter may not.
(3) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) The bprm_security_struct struct has been removed in favour of using
the credentials-under-construction approach.
(c) flush_unauthorized_files() now takes a cred pointer and passes it on
to inode_has_perm(), file_has_perm() and dentry_open().
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:24 +08:00
|
|
|
out:
|
2010-12-01 03:55:34 +08:00
|
|
|
if (bprm->mm) {
|
|
|
|
acct_arg_size(bprm, 0);
|
|
|
|
mmput(bprm->mm);
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
out_file:
|
|
|
|
if (bprm->file) {
|
|
|
|
allow_write_access(bprm->file);
|
|
|
|
fput(bprm->file);
|
|
|
|
}
|
CRED: Make execve() take advantage of copy-on-write credentials
Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
The credential bits from struct linux_binprm are, for the most part,
replaced with a single credentials pointer (bprm->cred). This means that
all the creds can be calculated in advance and then applied at the point
of no return with no possibility of failure.
I would like to replace bprm->cap_effective with:
cap_isclear(bprm->cap_effective)
but this seems impossible due to special behaviour for processes of pid 1
(they always retain their parent's capability masks where normally they'd
be changed - see cap_bprm_set_creds()).
The following sequence of events now happens:
(a) At the start of do_execve, the current task's cred_exec_mutex is
locked to prevent PTRACE_ATTACH from obsoleting the calculation of
creds that we make.
(a) prepare_exec_creds() is then called to make a copy of the current
task's credentials and prepare it. This copy is then assigned to
bprm->cred.
This renders security_bprm_alloc() and security_bprm_free()
unnecessary, and so they've been removed.
(b) The determination of unsafe execution is now performed immediately
after (a) rather than later on in the code. The result is stored in
bprm->unsafe for future reference.
(c) prepare_binprm() is called, possibly multiple times.
(i) This applies the result of set[ug]id binaries to the new creds
attached to bprm->cred. Personality bit clearance is recorded,
but now deferred on the basis that the exec procedure may yet
fail.
(ii) This then calls the new security_bprm_set_creds(). This should
calculate the new LSM and capability credentials into *bprm->cred.
This folds together security_bprm_set() and parts of
security_bprm_apply_creds() (these two have been removed).
Anything that might fail must be done at this point.
(iii) bprm->cred_prepared is set to 1.
bprm->cred_prepared is 0 on the first pass of the security
calculations, and 1 on all subsequent passes. This allows SELinux
in (ii) to base its calculations only on the initial script and
not on the interpreter.
(d) flush_old_exec() is called to commit the task to execution. This
performs the following steps with regard to credentials:
(i) Clear pdeath_signal and set dumpable on certain circumstances that
may not be covered by commit_creds().
(ii) Clear any bits in current->personality that were deferred from
(c.i).
(e) install_exec_creds() [compute_creds() as was] is called to install the
new credentials. This performs the following steps with regard to
credentials:
(i) Calls security_bprm_committing_creds() to apply any security
requirements, such as flushing unauthorised files in SELinux, that
must be done before the credentials are changed.
This is made up of bits of security_bprm_apply_creds() and
security_bprm_post_apply_creds(), both of which have been removed.
This function is not allowed to fail; anything that might fail
must have been done in (c.ii).
(ii) Calls commit_creds() to apply the new credentials in a single
assignment (more or less). Possibly pdeath_signal and dumpable
should be part of struct creds.
(iii) Unlocks the task's cred_replace_mutex, thus allowing
PTRACE_ATTACH to take place.
(iv) Clears The bprm->cred pointer as the credentials it was holding
are now immutable.
(v) Calls security_bprm_committed_creds() to apply any security
alterations that must be done after the creds have been changed.
SELinux uses this to flush signals and signal handlers.
(f) If an error occurs before (d.i), bprm_free() will call abort_creds()
to destroy the proposed new credentials and will then unlock
cred_replace_mutex. No changes to the credentials will have been
made.
(2) LSM interface.
A number of functions have been changed, added or removed:
(*) security_bprm_alloc(), ->bprm_alloc_security()
(*) security_bprm_free(), ->bprm_free_security()
Removed in favour of preparing new credentials and modifying those.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
(*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()
Removed; split between security_bprm_set_creds(),
security_bprm_committing_creds() and security_bprm_committed_creds().
(*) security_bprm_set(), ->bprm_set_security()
Removed; folded into security_bprm_set_creds().
(*) security_bprm_set_creds(), ->bprm_set_creds()
New. The new credentials in bprm->creds should be checked and set up
as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the
second and subsequent calls.
(*) security_bprm_committing_creds(), ->bprm_committing_creds()
(*) security_bprm_committed_creds(), ->bprm_committed_creds()
New. Apply the security effects of the new credentials. This
includes closing unauthorised files in SELinux. This function may not
fail. When the former is called, the creds haven't yet been applied
to the process; when the latter is called, they have.
The former may access bprm->cred, the latter may not.
(3) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) The bprm_security_struct struct has been removed in favour of using
the credentials-under-construction approach.
(c) flush_unauthorized_files() now takes a cred pointer and passes it on
to inode_has_perm(), file_has_perm() and dentry_open().
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:24 +08:00
|
|
|
|
2009-03-30 19:20:30 +08:00
|
|
|
out_unmark:
|
do_execve() must not clear fs->in_exec if it was set by another thread
If do_execve() fails after check_unsafe_exec(), it clears fs->in_exec
unconditionally. This is wrong if we race with our sub-thread which
also does do_execve:
Two threads T1 and T2 and another process P, all share the same
->fs.
T1 starts do_execve(BAD_FILE). It calls check_unsafe_exec(), since
->fs is shared, we set LSM_UNSAFE but not ->in_exec.
P exits and decrements fs->users.
T2 starts do_execve(), calls check_unsafe_exec(), now ->fs is not
shared, we set fs->in_exec.
T1 continues, open_exec(BAD_FILE) fails, we clear ->in_exec and
return to the user-space.
T1 does clone(CLONE_FS /* without CLONE_THREAD */).
T2 continues without LSM_UNSAFE_SHARE while ->fs is shared with
another process.
Change check_unsafe_exec() to return res = 1 if we set ->in_exec, and change
do_execve() to clear ->in_exec depending on res.
When do_execve() suceeds, it is safe to clear ->in_exec unconditionally.
It can be set only if we don't share ->fs with another process, and since
we already killed all sub-threads either ->in_exec == 0 or we are the
only user of this ->fs.
Also, we do not need fs->lock to clear fs->in_exec.
Signed-off-by: Oleg Nesterov <oleg@redhat.com>
Acked-by: Roland McGrath <roland@redhat.com>
Acked-by: Hugh Dickins <hugh@veritas.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2009-04-24 07:01:56 +08:00
|
|
|
if (clear_in_exec)
|
|
|
|
current->fs->in_exec = 0;
|
2009-02-05 16:18:11 +08:00
|
|
|
current->in_execve = 0;
|
CRED: Make execve() take advantage of copy-on-write credentials
Make execve() take advantage of copy-on-write credentials, allowing it to set
up the credentials in advance, and then commit the whole lot after the point
of no return.
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
The credential bits from struct linux_binprm are, for the most part,
replaced with a single credentials pointer (bprm->cred). This means that
all the creds can be calculated in advance and then applied at the point
of no return with no possibility of failure.
I would like to replace bprm->cap_effective with:
cap_isclear(bprm->cap_effective)
but this seems impossible due to special behaviour for processes of pid 1
(they always retain their parent's capability masks where normally they'd
be changed - see cap_bprm_set_creds()).
The following sequence of events now happens:
(a) At the start of do_execve, the current task's cred_exec_mutex is
locked to prevent PTRACE_ATTACH from obsoleting the calculation of
creds that we make.
(a) prepare_exec_creds() is then called to make a copy of the current
task's credentials and prepare it. This copy is then assigned to
bprm->cred.
This renders security_bprm_alloc() and security_bprm_free()
unnecessary, and so they've been removed.
(b) The determination of unsafe execution is now performed immediately
after (a) rather than later on in the code. The result is stored in
bprm->unsafe for future reference.
(c) prepare_binprm() is called, possibly multiple times.
(i) This applies the result of set[ug]id binaries to the new creds
attached to bprm->cred. Personality bit clearance is recorded,
but now deferred on the basis that the exec procedure may yet
fail.
(ii) This then calls the new security_bprm_set_creds(). This should
calculate the new LSM and capability credentials into *bprm->cred.
This folds together security_bprm_set() and parts of
security_bprm_apply_creds() (these two have been removed).
Anything that might fail must be done at this point.
(iii) bprm->cred_prepared is set to 1.
bprm->cred_prepared is 0 on the first pass of the security
calculations, and 1 on all subsequent passes. This allows SELinux
in (ii) to base its calculations only on the initial script and
not on the interpreter.
(d) flush_old_exec() is called to commit the task to execution. This
performs the following steps with regard to credentials:
(i) Clear pdeath_signal and set dumpable on certain circumstances that
may not be covered by commit_creds().
(ii) Clear any bits in current->personality that were deferred from
(c.i).
(e) install_exec_creds() [compute_creds() as was] is called to install the
new credentials. This performs the following steps with regard to
credentials:
(i) Calls security_bprm_committing_creds() to apply any security
requirements, such as flushing unauthorised files in SELinux, that
must be done before the credentials are changed.
This is made up of bits of security_bprm_apply_creds() and
security_bprm_post_apply_creds(), both of which have been removed.
This function is not allowed to fail; anything that might fail
must have been done in (c.ii).
(ii) Calls commit_creds() to apply the new credentials in a single
assignment (more or less). Possibly pdeath_signal and dumpable
should be part of struct creds.
(iii) Unlocks the task's cred_replace_mutex, thus allowing
PTRACE_ATTACH to take place.
(iv) Clears The bprm->cred pointer as the credentials it was holding
are now immutable.
(v) Calls security_bprm_committed_creds() to apply any security
alterations that must be done after the creds have been changed.
SELinux uses this to flush signals and signal handlers.
(f) If an error occurs before (d.i), bprm_free() will call abort_creds()
to destroy the proposed new credentials and will then unlock
cred_replace_mutex. No changes to the credentials will have been
made.
(2) LSM interface.
A number of functions have been changed, added or removed:
(*) security_bprm_alloc(), ->bprm_alloc_security()
(*) security_bprm_free(), ->bprm_free_security()
Removed in favour of preparing new credentials and modifying those.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
(*) security_bprm_post_apply_creds(), ->bprm_post_apply_creds()
Removed; split between security_bprm_set_creds(),
security_bprm_committing_creds() and security_bprm_committed_creds().
(*) security_bprm_set(), ->bprm_set_security()
Removed; folded into security_bprm_set_creds().
(*) security_bprm_set_creds(), ->bprm_set_creds()
New. The new credentials in bprm->creds should be checked and set up
as appropriate. bprm->cred_prepared is 0 on the first call, 1 on the
second and subsequent calls.
(*) security_bprm_committing_creds(), ->bprm_committing_creds()
(*) security_bprm_committed_creds(), ->bprm_committed_creds()
New. Apply the security effects of the new credentials. This
includes closing unauthorised files in SELinux. This function may not
fail. When the former is called, the creds haven't yet been applied
to the process; when the latter is called, they have.
The former may access bprm->cred, the latter may not.
(3) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) The bprm_security_struct struct has been removed in favour of using
the credentials-under-construction approach.
(c) flush_unauthorized_files() now takes a cred pointer and passes it on
to inode_has_perm(), file_has_perm() and dentry_open().
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Acked-by: Serge Hallyn <serue@us.ibm.com>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:24 +08:00
|
|
|
|
|
|
|
out_free:
|
2008-05-11 04:38:25 +08:00
|
|
|
free_bprm(bprm);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-04-22 17:11:59 +08:00
|
|
|
out_files:
|
2008-04-22 17:31:30 +08:00
|
|
|
if (displaced)
|
|
|
|
reset_files_struct(displaced);
|
2005-04-17 06:20:36 +08:00
|
|
|
out_ret:
|
|
|
|
return retval;
|
|
|
|
}
|
|
|
|
|
2011-03-07 01:02:37 +08:00
|
|
|
int do_execve(const char *filename,
|
|
|
|
const char __user *const __user *__argv,
|
|
|
|
const char __user *const __user *__envp,
|
|
|
|
struct pt_regs *regs)
|
|
|
|
{
|
2011-03-07 01:02:54 +08:00
|
|
|
struct user_arg_ptr argv = { .ptr.native = __argv };
|
|
|
|
struct user_arg_ptr envp = { .ptr.native = __envp };
|
|
|
|
return do_execve_common(filename, argv, envp, regs);
|
|
|
|
}
|
|
|
|
|
|
|
|
#ifdef CONFIG_COMPAT
|
|
|
|
int compat_do_execve(char *filename,
|
|
|
|
compat_uptr_t __user *__argv,
|
|
|
|
compat_uptr_t __user *__envp,
|
|
|
|
struct pt_regs *regs)
|
|
|
|
{
|
|
|
|
struct user_arg_ptr argv = {
|
|
|
|
.is_compat = true,
|
|
|
|
.ptr.compat = __argv,
|
|
|
|
};
|
|
|
|
struct user_arg_ptr envp = {
|
|
|
|
.is_compat = true,
|
|
|
|
.ptr.compat = __envp,
|
|
|
|
};
|
2011-03-07 01:02:37 +08:00
|
|
|
return do_execve_common(filename, argv, envp, regs);
|
|
|
|
}
|
2011-03-07 01:02:54 +08:00
|
|
|
#endif
|
2011-03-07 01:02:37 +08:00
|
|
|
|
2009-09-24 06:56:59 +08:00
|
|
|
void set_binfmt(struct linux_binfmt *new)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2009-09-24 06:57:41 +08:00
|
|
|
struct mm_struct *mm = current->mm;
|
|
|
|
|
|
|
|
if (mm->binfmt)
|
|
|
|
module_put(mm->binfmt->module);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-09-24 06:57:41 +08:00
|
|
|
mm->binfmt = new;
|
2009-09-24 06:56:59 +08:00
|
|
|
if (new)
|
|
|
|
__module_get(new->module);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
EXPORT_SYMBOL(set_binfmt);
|
|
|
|
|
2010-10-28 06:34:08 +08:00
|
|
|
static int expand_corename(struct core_name *cn)
|
|
|
|
{
|
|
|
|
char *old_corename = cn->corename;
|
|
|
|
|
|
|
|
cn->size = CORENAME_MAX_SIZE * atomic_inc_return(&call_count);
|
|
|
|
cn->corename = krealloc(old_corename, cn->size, GFP_KERNEL);
|
|
|
|
|
|
|
|
if (!cn->corename) {
|
|
|
|
kfree(old_corename);
|
|
|
|
return -ENOMEM;
|
|
|
|
}
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
static int cn_printf(struct core_name *cn, const char *fmt, ...)
|
|
|
|
{
|
|
|
|
char *cur;
|
|
|
|
int need;
|
|
|
|
int ret;
|
|
|
|
va_list arg;
|
|
|
|
|
|
|
|
va_start(arg, fmt);
|
|
|
|
need = vsnprintf(NULL, 0, fmt, arg);
|
|
|
|
va_end(arg);
|
|
|
|
|
|
|
|
if (likely(need < cn->size - cn->used - 1))
|
|
|
|
goto out_printf;
|
|
|
|
|
|
|
|
ret = expand_corename(cn);
|
|
|
|
if (ret)
|
|
|
|
goto expand_fail;
|
|
|
|
|
|
|
|
out_printf:
|
|
|
|
cur = cn->corename + cn->used;
|
|
|
|
va_start(arg, fmt);
|
|
|
|
vsnprintf(cur, need + 1, fmt, arg);
|
|
|
|
va_end(arg);
|
|
|
|
cn->used += need;
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
expand_fail:
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2011-07-27 07:08:33 +08:00
|
|
|
static void cn_escape(char *str)
|
|
|
|
{
|
|
|
|
for (; *str; str++)
|
|
|
|
if (*str == '/')
|
|
|
|
*str = '!';
|
|
|
|
}
|
|
|
|
|
2011-05-27 07:25:46 +08:00
|
|
|
static int cn_print_exe_file(struct core_name *cn)
|
|
|
|
{
|
|
|
|
struct file *exe_file;
|
2011-07-27 07:08:33 +08:00
|
|
|
char *pathbuf, *path;
|
2011-05-27 07:25:46 +08:00
|
|
|
int ret;
|
|
|
|
|
|
|
|
exe_file = get_mm_exe_file(current->mm);
|
2011-07-27 07:08:33 +08:00
|
|
|
if (!exe_file) {
|
|
|
|
char *commstart = cn->corename + cn->used;
|
|
|
|
ret = cn_printf(cn, "%s (path unknown)", current->comm);
|
|
|
|
cn_escape(commstart);
|
|
|
|
return ret;
|
|
|
|
}
|
2011-05-27 07:25:46 +08:00
|
|
|
|
|
|
|
pathbuf = kmalloc(PATH_MAX, GFP_TEMPORARY);
|
|
|
|
if (!pathbuf) {
|
|
|
|
ret = -ENOMEM;
|
|
|
|
goto put_exe_file;
|
|
|
|
}
|
|
|
|
|
|
|
|
path = d_path(&exe_file->f_path, pathbuf, PATH_MAX);
|
|
|
|
if (IS_ERR(path)) {
|
|
|
|
ret = PTR_ERR(path);
|
|
|
|
goto free_buf;
|
|
|
|
}
|
|
|
|
|
2011-07-27 07:08:33 +08:00
|
|
|
cn_escape(path);
|
2011-05-27 07:25:46 +08:00
|
|
|
|
|
|
|
ret = cn_printf(cn, "%s", path);
|
|
|
|
|
|
|
|
free_buf:
|
|
|
|
kfree(pathbuf);
|
|
|
|
put_exe_file:
|
|
|
|
fput(exe_file);
|
|
|
|
return ret;
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* format_corename will inspect the pattern parameter, and output a
|
|
|
|
* name into corename, which must have space for at least
|
|
|
|
* CORENAME_MAX_SIZE bytes plus one byte for the zero terminator.
|
|
|
|
*/
|
2010-10-28 06:34:08 +08:00
|
|
|
static int format_corename(struct core_name *cn, long signr)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2008-11-14 07:39:18 +08:00
|
|
|
const struct cred *cred = current_cred();
|
2008-07-25 16:47:47 +08:00
|
|
|
const char *pat_ptr = core_pattern;
|
|
|
|
int ispipe = (*pat_ptr == '|');
|
2005-04-17 06:20:36 +08:00
|
|
|
int pid_in_pattern = 0;
|
2010-10-28 06:34:08 +08:00
|
|
|
int err = 0;
|
|
|
|
|
|
|
|
cn->size = CORENAME_MAX_SIZE * atomic_read(&call_count);
|
|
|
|
cn->corename = kmalloc(cn->size, GFP_KERNEL);
|
|
|
|
cn->used = 0;
|
|
|
|
|
|
|
|
if (!cn->corename)
|
|
|
|
return -ENOMEM;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/* Repeat as long as we have more pattern to process and more output
|
|
|
|
space */
|
|
|
|
while (*pat_ptr) {
|
|
|
|
if (*pat_ptr != '%') {
|
2010-10-28 06:34:08 +08:00
|
|
|
if (*pat_ptr == 0)
|
2005-04-17 06:20:36 +08:00
|
|
|
goto out;
|
2010-10-28 06:34:08 +08:00
|
|
|
err = cn_printf(cn, "%c", *pat_ptr++);
|
2005-04-17 06:20:36 +08:00
|
|
|
} else {
|
|
|
|
switch (*++pat_ptr) {
|
2010-10-28 06:34:08 +08:00
|
|
|
/* single % at the end, drop that */
|
2005-04-17 06:20:36 +08:00
|
|
|
case 0:
|
|
|
|
goto out;
|
|
|
|
/* Double percent, output one percent */
|
|
|
|
case '%':
|
2010-10-28 06:34:08 +08:00
|
|
|
err = cn_printf(cn, "%c", '%');
|
2005-04-17 06:20:36 +08:00
|
|
|
break;
|
|
|
|
/* pid */
|
|
|
|
case 'p':
|
|
|
|
pid_in_pattern = 1;
|
2010-10-28 06:34:08 +08:00
|
|
|
err = cn_printf(cn, "%d",
|
|
|
|
task_tgid_vnr(current));
|
2005-04-17 06:20:36 +08:00
|
|
|
break;
|
|
|
|
/* uid */
|
|
|
|
case 'u':
|
2010-10-28 06:34:08 +08:00
|
|
|
err = cn_printf(cn, "%d", cred->uid);
|
2005-04-17 06:20:36 +08:00
|
|
|
break;
|
|
|
|
/* gid */
|
|
|
|
case 'g':
|
2010-10-28 06:34:08 +08:00
|
|
|
err = cn_printf(cn, "%d", cred->gid);
|
2005-04-17 06:20:36 +08:00
|
|
|
break;
|
|
|
|
/* signal that caused the coredump */
|
|
|
|
case 's':
|
2010-10-28 06:34:08 +08:00
|
|
|
err = cn_printf(cn, "%ld", signr);
|
2005-04-17 06:20:36 +08:00
|
|
|
break;
|
|
|
|
/* UNIX time of coredump */
|
|
|
|
case 't': {
|
|
|
|
struct timeval tv;
|
|
|
|
do_gettimeofday(&tv);
|
2010-10-28 06:34:08 +08:00
|
|
|
err = cn_printf(cn, "%lu", tv.tv_sec);
|
2005-04-17 06:20:36 +08:00
|
|
|
break;
|
|
|
|
}
|
|
|
|
/* hostname */
|
2011-07-27 07:08:33 +08:00
|
|
|
case 'h': {
|
|
|
|
char *namestart = cn->corename + cn->used;
|
2005-04-17 06:20:36 +08:00
|
|
|
down_read(&uts_sem);
|
2010-10-28 06:34:08 +08:00
|
|
|
err = cn_printf(cn, "%s",
|
|
|
|
utsname()->nodename);
|
2005-04-17 06:20:36 +08:00
|
|
|
up_read(&uts_sem);
|
2011-07-27 07:08:33 +08:00
|
|
|
cn_escape(namestart);
|
2005-04-17 06:20:36 +08:00
|
|
|
break;
|
2011-07-27 07:08:33 +08:00
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
/* executable */
|
2011-07-27 07:08:33 +08:00
|
|
|
case 'e': {
|
|
|
|
char *commstart = cn->corename + cn->used;
|
2010-10-28 06:34:08 +08:00
|
|
|
err = cn_printf(cn, "%s", current->comm);
|
2011-07-27 07:08:33 +08:00
|
|
|
cn_escape(commstart);
|
2005-04-17 06:20:36 +08:00
|
|
|
break;
|
2011-07-27 07:08:33 +08:00
|
|
|
}
|
2011-05-27 07:25:46 +08:00
|
|
|
case 'E':
|
|
|
|
err = cn_print_exe_file(cn);
|
|
|
|
break;
|
2007-10-17 14:26:35 +08:00
|
|
|
/* core limit size */
|
|
|
|
case 'c':
|
2010-10-28 06:34:08 +08:00
|
|
|
err = cn_printf(cn, "%lu",
|
|
|
|
rlimit(RLIMIT_CORE));
|
2007-10-17 14:26:35 +08:00
|
|
|
break;
|
2005-04-17 06:20:36 +08:00
|
|
|
default:
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
++pat_ptr;
|
|
|
|
}
|
2010-10-28 06:34:08 +08:00
|
|
|
|
|
|
|
if (err)
|
|
|
|
return err;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2010-10-28 06:34:08 +08:00
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
/* Backward compatibility with core_uses_pid:
|
|
|
|
*
|
|
|
|
* If core_pattern does not include a %p (as is the default)
|
|
|
|
* and core_uses_pid is set, then .%pid will be appended to
|
2007-04-17 13:53:13 +08:00
|
|
|
* the filename. Do not do this for piped commands. */
|
2008-10-19 11:28:22 +08:00
|
|
|
if (!ispipe && !pid_in_pattern && core_uses_pid) {
|
2010-10-28 06:34:08 +08:00
|
|
|
err = cn_printf(cn, ".%d", task_tgid_vnr(current));
|
|
|
|
if (err)
|
|
|
|
return err;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2007-04-17 13:53:13 +08:00
|
|
|
out:
|
|
|
|
return ispipe;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2010-03-06 05:44:14 +08:00
|
|
|
static int zap_process(struct task_struct *start, int exit_code)
|
2006-06-26 15:26:05 +08:00
|
|
|
{
|
|
|
|
struct task_struct *t;
|
2008-07-25 16:47:42 +08:00
|
|
|
int nr = 0;
|
2006-06-26 15:26:06 +08:00
|
|
|
|
2006-06-26 15:26:07 +08:00
|
|
|
start->signal->flags = SIGNAL_GROUP_EXIT;
|
2010-03-06 05:44:14 +08:00
|
|
|
start->signal->group_exit_code = exit_code;
|
2006-06-26 15:26:07 +08:00
|
|
|
start->signal->group_stop_count = 0;
|
2006-06-26 15:26:05 +08:00
|
|
|
|
|
|
|
t = start;
|
|
|
|
do {
|
2011-06-02 17:14:00 +08:00
|
|
|
task_clear_jobctl_pending(t, JOBCTL_PENDING_MASK);
|
2006-06-26 15:26:05 +08:00
|
|
|
if (t != current && t->mm) {
|
2006-06-26 15:26:06 +08:00
|
|
|
sigaddset(&t->pending.signal, SIGKILL);
|
|
|
|
signal_wake_up(t, 1);
|
2008-07-25 16:47:42 +08:00
|
|
|
nr++;
|
2006-06-26 15:26:05 +08:00
|
|
|
}
|
2008-07-25 16:47:31 +08:00
|
|
|
} while_each_thread(start, t);
|
2008-07-25 16:47:42 +08:00
|
|
|
|
|
|
|
return nr;
|
2006-06-26 15:26:05 +08:00
|
|
|
}
|
|
|
|
|
2006-06-26 15:26:08 +08:00
|
|
|
static inline int zap_threads(struct task_struct *tsk, struct mm_struct *mm,
|
2008-07-25 16:47:42 +08:00
|
|
|
struct core_state *core_state, int exit_code)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
|
|
|
struct task_struct *g, *p;
|
2006-06-26 15:26:09 +08:00
|
|
|
unsigned long flags;
|
2008-07-25 16:47:42 +08:00
|
|
|
int nr = -EAGAIN;
|
2006-06-26 15:26:08 +08:00
|
|
|
|
|
|
|
spin_lock_irq(&tsk->sighand->siglock);
|
2008-02-05 14:27:24 +08:00
|
|
|
if (!signal_group_exit(tsk->signal)) {
|
2008-07-25 16:47:42 +08:00
|
|
|
mm->core_state = core_state;
|
2010-03-06 05:44:14 +08:00
|
|
|
nr = zap_process(tsk, exit_code);
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2006-06-26 15:26:08 +08:00
|
|
|
spin_unlock_irq(&tsk->sighand->siglock);
|
2008-07-25 16:47:42 +08:00
|
|
|
if (unlikely(nr < 0))
|
|
|
|
return nr;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-07-25 16:47:42 +08:00
|
|
|
if (atomic_read(&mm->mm_users) == nr + 1)
|
2006-06-26 15:26:09 +08:00
|
|
|
goto done;
|
2008-07-25 16:47:31 +08:00
|
|
|
/*
|
|
|
|
* We should find and kill all tasks which use this mm, and we should
|
2008-07-25 16:47:41 +08:00
|
|
|
* count them correctly into ->nr_threads. We don't take tasklist
|
2008-07-25 16:47:31 +08:00
|
|
|
* lock, but this is safe wrt:
|
|
|
|
*
|
|
|
|
* fork:
|
|
|
|
* None of sub-threads can fork after zap_process(leader). All
|
|
|
|
* processes which were created before this point should be
|
|
|
|
* visible to zap_threads() because copy_process() adds the new
|
|
|
|
* process to the tail of init_task.tasks list, and lock/unlock
|
|
|
|
* of ->siglock provides a memory barrier.
|
|
|
|
*
|
|
|
|
* do_exit:
|
|
|
|
* The caller holds mm->mmap_sem. This means that the task which
|
|
|
|
* uses this mm can't pass exit_mm(), so it can't exit or clear
|
|
|
|
* its ->mm.
|
|
|
|
*
|
|
|
|
* de_thread:
|
|
|
|
* It does list_replace_rcu(&leader->tasks, ¤t->tasks),
|
|
|
|
* we must see either old or new leader, this does not matter.
|
|
|
|
* However, it can change p->sighand, so lock_task_sighand(p)
|
|
|
|
* must be used. Since p->mm != NULL and we hold ->mmap_sem
|
|
|
|
* it can't fail.
|
|
|
|
*
|
|
|
|
* Note also that "g" can be the old leader with ->mm == NULL
|
|
|
|
* and already unhashed and thus removed from ->thread_group.
|
|
|
|
* This is OK, __unhash_process()->list_del_rcu() does not
|
|
|
|
* clear the ->next pointer, we will find the new leader via
|
|
|
|
* next_thread().
|
|
|
|
*/
|
2006-06-26 15:26:08 +08:00
|
|
|
rcu_read_lock();
|
2006-06-26 15:26:05 +08:00
|
|
|
for_each_process(g) {
|
2006-06-26 15:26:09 +08:00
|
|
|
if (g == tsk->group_leader)
|
|
|
|
continue;
|
2008-07-25 16:47:39 +08:00
|
|
|
if (g->flags & PF_KTHREAD)
|
|
|
|
continue;
|
2006-06-26 15:26:05 +08:00
|
|
|
p = g;
|
|
|
|
do {
|
|
|
|
if (p->mm) {
|
2008-07-25 16:47:39 +08:00
|
|
|
if (unlikely(p->mm == mm)) {
|
2006-06-26 15:26:09 +08:00
|
|
|
lock_task_sighand(p, &flags);
|
2010-03-06 05:44:14 +08:00
|
|
|
nr += zap_process(p, exit_code);
|
2006-06-26 15:26:09 +08:00
|
|
|
unlock_task_sighand(p, &flags);
|
|
|
|
}
|
2006-06-26 15:26:05 +08:00
|
|
|
break;
|
|
|
|
}
|
2008-07-25 16:47:31 +08:00
|
|
|
} while_each_thread(g, p);
|
2006-06-26 15:26:05 +08:00
|
|
|
}
|
2006-06-26 15:26:08 +08:00
|
|
|
rcu_read_unlock();
|
2006-06-26 15:26:09 +08:00
|
|
|
done:
|
2008-07-25 16:47:42 +08:00
|
|
|
atomic_set(&core_state->nr_threads, nr);
|
2008-07-25 16:47:42 +08:00
|
|
|
return nr;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2008-07-25 16:47:43 +08:00
|
|
|
static int coredump_wait(int exit_code, struct core_state *core_state)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2006-06-26 15:26:08 +08:00
|
|
|
struct task_struct *tsk = current;
|
|
|
|
struct mm_struct *mm = tsk->mm;
|
|
|
|
struct completion *vfork_done;
|
2010-05-27 05:43:08 +08:00
|
|
|
int core_waiters = -EBUSY;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2008-07-25 16:47:43 +08:00
|
|
|
init_completion(&core_state->startup);
|
2008-07-25 16:47:44 +08:00
|
|
|
core_state->dumper.task = tsk;
|
|
|
|
core_state->dumper.next = NULL;
|
2010-05-27 05:43:08 +08:00
|
|
|
|
|
|
|
down_write(&mm->mmap_sem);
|
|
|
|
if (!mm->core_state)
|
|
|
|
core_waiters = zap_threads(tsk, mm, core_state, exit_code);
|
2005-10-31 07:02:47 +08:00
|
|
|
up_write(&mm->mmap_sem);
|
|
|
|
|
2006-06-26 15:26:08 +08:00
|
|
|
if (unlikely(core_waiters < 0))
|
|
|
|
goto fail;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Make sure nobody is waiting for us to release the VM,
|
|
|
|
* otherwise we can deadlock when we wait on each other
|
|
|
|
*/
|
|
|
|
vfork_done = tsk->vfork_done;
|
|
|
|
if (vfork_done) {
|
|
|
|
tsk->vfork_done = NULL;
|
|
|
|
complete(vfork_done);
|
|
|
|
}
|
|
|
|
|
2005-10-31 07:02:47 +08:00
|
|
|
if (core_waiters)
|
2008-07-25 16:47:43 +08:00
|
|
|
wait_for_completion(&core_state->startup);
|
2006-06-26 15:26:08 +08:00
|
|
|
fail:
|
|
|
|
return core_waiters;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
|
|
|
|
2008-07-25 16:47:46 +08:00
|
|
|
static void coredump_finish(struct mm_struct *mm)
|
|
|
|
{
|
|
|
|
struct core_thread *curr, *next;
|
|
|
|
struct task_struct *task;
|
|
|
|
|
|
|
|
next = mm->core_state->dumper.next;
|
|
|
|
while ((curr = next) != NULL) {
|
|
|
|
next = curr->next;
|
|
|
|
task = curr->task;
|
|
|
|
/*
|
|
|
|
* see exit_mm(), curr->task must not see
|
|
|
|
* ->task == NULL before we read ->next.
|
|
|
|
*/
|
|
|
|
smp_mb();
|
|
|
|
curr->task = NULL;
|
|
|
|
wake_up_process(task);
|
|
|
|
}
|
|
|
|
|
|
|
|
mm->core_state = NULL;
|
|
|
|
}
|
|
|
|
|
2007-07-19 16:48:27 +08:00
|
|
|
/*
|
|
|
|
* set_dumpable converts traditional three-value dumpable to two flags and
|
|
|
|
* stores them into mm->flags. It modifies lower two bits of mm->flags, but
|
|
|
|
* these bits are not changed atomically. So get_dumpable can observe the
|
|
|
|
* intermediate state. To avoid doing unexpected behavior, get get_dumpable
|
|
|
|
* return either old dumpable or new one by paying attention to the order of
|
|
|
|
* modifying the bits.
|
|
|
|
*
|
|
|
|
* dumpable | mm->flags (binary)
|
|
|
|
* old new | initial interim final
|
|
|
|
* ---------+-----------------------
|
|
|
|
* 0 1 | 00 01 01
|
|
|
|
* 0 2 | 00 10(*) 11
|
|
|
|
* 1 0 | 01 00 00
|
|
|
|
* 1 2 | 01 11 11
|
|
|
|
* 2 0 | 11 10(*) 00
|
|
|
|
* 2 1 | 11 11 01
|
|
|
|
*
|
|
|
|
* (*) get_dumpable regards interim value of 10 as 11.
|
|
|
|
*/
|
|
|
|
void set_dumpable(struct mm_struct *mm, int value)
|
|
|
|
{
|
|
|
|
switch (value) {
|
|
|
|
case 0:
|
|
|
|
clear_bit(MMF_DUMPABLE, &mm->flags);
|
|
|
|
smp_wmb();
|
|
|
|
clear_bit(MMF_DUMP_SECURELY, &mm->flags);
|
|
|
|
break;
|
|
|
|
case 1:
|
|
|
|
set_bit(MMF_DUMPABLE, &mm->flags);
|
|
|
|
smp_wmb();
|
|
|
|
clear_bit(MMF_DUMP_SECURELY, &mm->flags);
|
|
|
|
break;
|
|
|
|
case 2:
|
|
|
|
set_bit(MMF_DUMP_SECURELY, &mm->flags);
|
|
|
|
smp_wmb();
|
|
|
|
set_bit(MMF_DUMPABLE, &mm->flags);
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
2010-03-06 05:44:12 +08:00
|
|
|
static int __get_dumpable(unsigned long mm_flags)
|
2007-07-19 16:48:27 +08:00
|
|
|
{
|
|
|
|
int ret;
|
|
|
|
|
2010-03-06 05:44:12 +08:00
|
|
|
ret = mm_flags & MMF_DUMPABLE_MASK;
|
2007-07-19 16:48:27 +08:00
|
|
|
return (ret >= 2) ? 2 : ret;
|
|
|
|
}
|
|
|
|
|
2010-03-06 05:44:12 +08:00
|
|
|
int get_dumpable(struct mm_struct *mm)
|
|
|
|
{
|
|
|
|
return __get_dumpable(mm->flags);
|
|
|
|
}
|
|
|
|
|
2009-09-24 06:56:58 +08:00
|
|
|
static void wait_for_dump_helpers(struct file *file)
|
|
|
|
{
|
|
|
|
struct pipe_inode_info *pipe;
|
|
|
|
|
|
|
|
pipe = file->f_path.dentry->d_inode->i_pipe;
|
|
|
|
|
|
|
|
pipe_lock(pipe);
|
|
|
|
pipe->readers++;
|
|
|
|
pipe->writers--;
|
|
|
|
|
|
|
|
while ((pipe->readers > 1) && (!signal_pending(current))) {
|
|
|
|
wake_up_interruptible_sync(&pipe->wait);
|
|
|
|
kill_fasync(&pipe->fasync_readers, SIGIO, POLL_IN);
|
|
|
|
pipe_wait(pipe);
|
|
|
|
}
|
|
|
|
|
|
|
|
pipe->readers--;
|
|
|
|
pipe->writers++;
|
|
|
|
pipe_unlock(pipe);
|
|
|
|
|
|
|
|
}
|
|
|
|
|
|
|
|
|
2010-05-27 05:42:59 +08:00
|
|
|
/*
|
2011-02-25 00:46:49 +08:00
|
|
|
* umh_pipe_setup
|
2010-05-27 05:42:59 +08:00
|
|
|
* helper function to customize the process used
|
|
|
|
* to collect the core in userspace. Specifically
|
|
|
|
* it sets up a pipe and installs it as fd 0 (stdin)
|
|
|
|
* for the process. Returns 0 on success, or
|
|
|
|
* PTR_ERR on failure.
|
|
|
|
* Note that it also sets the core limit to 1. This
|
|
|
|
* is a special value that we use to trap recursive
|
|
|
|
* core dumps
|
|
|
|
*/
|
2011-06-17 18:25:59 +08:00
|
|
|
static int umh_pipe_setup(struct subprocess_info *info, struct cred *new)
|
2010-05-27 05:42:59 +08:00
|
|
|
{
|
|
|
|
struct file *rp, *wp;
|
|
|
|
struct fdtable *fdt;
|
|
|
|
struct coredump_params *cp = (struct coredump_params *)info->data;
|
|
|
|
struct files_struct *cf = current->files;
|
|
|
|
|
|
|
|
wp = create_write_pipe(0);
|
|
|
|
if (IS_ERR(wp))
|
|
|
|
return PTR_ERR(wp);
|
|
|
|
|
|
|
|
rp = create_read_pipe(wp, 0);
|
|
|
|
if (IS_ERR(rp)) {
|
|
|
|
free_write_pipe(wp);
|
|
|
|
return PTR_ERR(rp);
|
|
|
|
}
|
|
|
|
|
|
|
|
cp->file = wp;
|
|
|
|
|
|
|
|
sys_close(0);
|
|
|
|
fd_install(0, rp);
|
|
|
|
spin_lock(&cf->file_lock);
|
|
|
|
fdt = files_fdtable(cf);
|
|
|
|
FD_SET(0, fdt->open_fds);
|
|
|
|
FD_CLR(0, fdt->close_on_exec);
|
|
|
|
spin_unlock(&cf->file_lock);
|
|
|
|
|
|
|
|
/* and disallow core files too */
|
|
|
|
current->signal->rlim[RLIMIT_CORE] = (struct rlimit){1, 1};
|
|
|
|
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2009-01-07 06:42:48 +08:00
|
|
|
void do_coredump(long signr, int exit_code, struct pt_regs *regs)
|
2005-04-17 06:20:36 +08:00
|
|
|
{
|
2008-07-25 16:47:43 +08:00
|
|
|
struct core_state core_state;
|
2010-10-28 06:34:08 +08:00
|
|
|
struct core_name cn;
|
2005-04-17 06:20:36 +08:00
|
|
|
struct mm_struct *mm = current->mm;
|
|
|
|
struct linux_binfmt * binfmt;
|
CRED: Inaugurate COW credentials
Inaugurate copy-on-write credentials management. This uses RCU to manage the
credentials pointer in the task_struct with respect to accesses by other tasks.
A process may only modify its own credentials, and so does not need locking to
access or modify its own credentials.
A mutex (cred_replace_mutex) is added to the task_struct to control the effect
of PTRACE_ATTACHED on credential calculations, particularly with respect to
execve().
With this patch, the contents of an active credentials struct may not be
changed directly; rather a new set of credentials must be prepared, modified
and committed using something like the following sequence of events:
struct cred *new = prepare_creds();
int ret = blah(new);
if (ret < 0) {
abort_creds(new);
return ret;
}
return commit_creds(new);
There are some exceptions to this rule: the keyrings pointed to by the active
credentials may be instantiated - keyrings violate the COW rule as managing
COW keyrings is tricky, given that it is possible for a task to directly alter
the keys in a keyring in use by another task.
To help enforce this, various pointers to sets of credentials, such as those in
the task_struct, are declared const. The purpose of this is compile-time
discouragement of altering credentials through those pointers. Once a set of
credentials has been made public through one of these pointers, it may not be
modified, except under special circumstances:
(1) Its reference count may incremented and decremented.
(2) The keyrings to which it points may be modified, but not replaced.
The only safe way to modify anything else is to create a replacement and commit
using the functions described in Documentation/credentials.txt (which will be
added by a later patch).
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
This now prepares and commits credentials in various places in the
security code rather than altering the current creds directly.
(2) Temporary credential overrides.
do_coredump() and sys_faccessat() now prepare their own credentials and
temporarily override the ones currently on the acting thread, whilst
preventing interference from other threads by holding cred_replace_mutex
on the thread being dumped.
This will be replaced in a future patch by something that hands down the
credentials directly to the functions being called, rather than altering
the task's objective credentials.
(3) LSM interface.
A number of functions have been changed, added or removed:
(*) security_capset_check(), ->capset_check()
(*) security_capset_set(), ->capset_set()
Removed in favour of security_capset().
(*) security_capset(), ->capset()
New. This is passed a pointer to the new creds, a pointer to the old
creds and the proposed capability sets. It should fill in the new
creds or return an error. All pointers, barring the pointer to the
new creds, are now const.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
Changed; now returns a value, which will cause the process to be
killed if it's an error.
(*) security_task_alloc(), ->task_alloc_security()
Removed in favour of security_prepare_creds().
(*) security_cred_free(), ->cred_free()
New. Free security data attached to cred->security.
(*) security_prepare_creds(), ->cred_prepare()
New. Duplicate any security data attached to cred->security.
(*) security_commit_creds(), ->cred_commit()
New. Apply any security effects for the upcoming installation of new
security by commit_creds().
(*) security_task_post_setuid(), ->task_post_setuid()
Removed in favour of security_task_fix_setuid().
(*) security_task_fix_setuid(), ->task_fix_setuid()
Fix up the proposed new credentials for setuid(). This is used by
cap_set_fix_setuid() to implicitly adjust capabilities in line with
setuid() changes. Changes are made to the new credentials, rather
than the task itself as in security_task_post_setuid().
(*) security_task_reparent_to_init(), ->task_reparent_to_init()
Removed. Instead the task being reparented to init is referred
directly to init's credentials.
NOTE! This results in the loss of some state: SELinux's osid no
longer records the sid of the thread that forked it.
(*) security_key_alloc(), ->key_alloc()
(*) security_key_permission(), ->key_permission()
Changed. These now take cred pointers rather than task pointers to
refer to the security context.
(4) sys_capset().
This has been simplified and uses less locking. The LSM functions it
calls have been merged.
(5) reparent_to_kthreadd().
This gives the current thread the same credentials as init by simply using
commit_thread() to point that way.
(6) __sigqueue_alloc() and switch_uid()
__sigqueue_alloc() can't stop the target task from changing its creds
beneath it, so this function gets a reference to the currently applicable
user_struct which it then passes into the sigqueue struct it returns if
successful.
switch_uid() is now called from commit_creds(), and possibly should be
folded into that. commit_creds() should take care of protecting
__sigqueue_alloc().
(7) [sg]et[ug]id() and co and [sg]et_current_groups.
The set functions now all use prepare_creds(), commit_creds() and
abort_creds() to build and check a new set of credentials before applying
it.
security_task_set[ug]id() is called inside the prepared section. This
guarantees that nothing else will affect the creds until we've finished.
The calling of set_dumpable() has been moved into commit_creds().
Much of the functionality of set_user() has been moved into
commit_creds().
The get functions all simply access the data directly.
(8) security_task_prctl() and cap_task_prctl().
security_task_prctl() has been modified to return -ENOSYS if it doesn't
want to handle a function, or otherwise return the return value directly
rather than through an argument.
Additionally, cap_task_prctl() now prepares a new set of credentials, even
if it doesn't end up using it.
(9) Keyrings.
A number of changes have been made to the keyrings code:
(a) switch_uid_keyring(), copy_keys(), exit_keys() and suid_keys() have
all been dropped and built in to the credentials functions directly.
They may want separating out again later.
(b) key_alloc() and search_process_keyrings() now take a cred pointer
rather than a task pointer to specify the security context.
(c) copy_creds() gives a new thread within the same thread group a new
thread keyring if its parent had one, otherwise it discards the thread
keyring.
(d) The authorisation key now points directly to the credentials to extend
the search into rather pointing to the task that carries them.
(e) Installing thread, process or session keyrings causes a new set of
credentials to be created, even though it's not strictly necessary for
process or session keyrings (they're shared).
(10) Usermode helper.
The usermode helper code now carries a cred struct pointer in its
subprocess_info struct instead of a new session keyring pointer. This set
of credentials is derived from init_cred and installed on the new process
after it has been cloned.
call_usermodehelper_setup() allocates the new credentials and
call_usermodehelper_freeinfo() discards them if they haven't been used. A
special cred function (prepare_usermodeinfo_creds()) is provided
specifically for call_usermodehelper_setup() to call.
call_usermodehelper_setkeys() adjusts the credentials to sport the
supplied keyring as the new session keyring.
(11) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) selinux_setprocattr() no longer does its check for whether the
current ptracer can access processes with the new SID inside the lock
that covers getting the ptracer's SID. Whilst this lock ensures that
the check is done with the ptracer pinned, the result is only valid
until the lock is released, so there's no point doing it inside the
lock.
(12) is_single_threaded().
This function has been extracted from selinux_setprocattr() and put into
a file of its own in the lib/ directory as join_session_keyring() now
wants to use it too.
The code in SELinux just checked to see whether a task shared mm_structs
with other tasks (CLONE_VM), but that isn't good enough. We really want
to know if they're part of the same thread group (CLONE_THREAD).
(13) nfsd.
The NFS server daemon now has to use the COW credentials to set the
credentials it is going to use. It really needs to pass the credentials
down to the functions it calls, but it can't do that until other patches
in this series have been applied.
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:23 +08:00
|
|
|
const struct cred *old_cred;
|
|
|
|
struct cred *cred;
|
2005-04-17 06:20:36 +08:00
|
|
|
int retval = 0;
|
2005-06-23 15:09:43 +08:00
|
|
|
int flag = 0;
|
2010-05-27 05:43:06 +08:00
|
|
|
int ispipe;
|
2009-09-24 06:56:56 +08:00
|
|
|
static atomic_t core_dump_count = ATOMIC_INIT(0);
|
2009-12-18 07:27:16 +08:00
|
|
|
struct coredump_params cprm = {
|
|
|
|
.signr = signr,
|
|
|
|
.regs = regs,
|
2010-03-06 05:42:42 +08:00
|
|
|
.limit = rlimit(RLIMIT_CORE),
|
2010-03-06 05:44:12 +08:00
|
|
|
/*
|
|
|
|
* We must use the same mm->flags while dumping core to avoid
|
|
|
|
* inconsistency of bit flags, since this flag is not protected
|
|
|
|
* by any locks.
|
|
|
|
*/
|
|
|
|
.mm_flags = mm->flags,
|
2009-12-18 07:27:16 +08:00
|
|
|
};
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2007-04-19 22:28:21 +08:00
|
|
|
audit_core_dumps(signr);
|
|
|
|
|
2009-09-24 06:57:41 +08:00
|
|
|
binfmt = mm->binfmt;
|
2005-04-17 06:20:36 +08:00
|
|
|
if (!binfmt || !binfmt->core_dump)
|
|
|
|
goto fail;
|
2010-05-27 05:43:08 +08:00
|
|
|
if (!__get_dumpable(cprm.mm_flags))
|
|
|
|
goto fail;
|
CRED: Inaugurate COW credentials
Inaugurate copy-on-write credentials management. This uses RCU to manage the
credentials pointer in the task_struct with respect to accesses by other tasks.
A process may only modify its own credentials, and so does not need locking to
access or modify its own credentials.
A mutex (cred_replace_mutex) is added to the task_struct to control the effect
of PTRACE_ATTACHED on credential calculations, particularly with respect to
execve().
With this patch, the contents of an active credentials struct may not be
changed directly; rather a new set of credentials must be prepared, modified
and committed using something like the following sequence of events:
struct cred *new = prepare_creds();
int ret = blah(new);
if (ret < 0) {
abort_creds(new);
return ret;
}
return commit_creds(new);
There are some exceptions to this rule: the keyrings pointed to by the active
credentials may be instantiated - keyrings violate the COW rule as managing
COW keyrings is tricky, given that it is possible for a task to directly alter
the keys in a keyring in use by another task.
To help enforce this, various pointers to sets of credentials, such as those in
the task_struct, are declared const. The purpose of this is compile-time
discouragement of altering credentials through those pointers. Once a set of
credentials has been made public through one of these pointers, it may not be
modified, except under special circumstances:
(1) Its reference count may incremented and decremented.
(2) The keyrings to which it points may be modified, but not replaced.
The only safe way to modify anything else is to create a replacement and commit
using the functions described in Documentation/credentials.txt (which will be
added by a later patch).
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
This now prepares and commits credentials in various places in the
security code rather than altering the current creds directly.
(2) Temporary credential overrides.
do_coredump() and sys_faccessat() now prepare their own credentials and
temporarily override the ones currently on the acting thread, whilst
preventing interference from other threads by holding cred_replace_mutex
on the thread being dumped.
This will be replaced in a future patch by something that hands down the
credentials directly to the functions being called, rather than altering
the task's objective credentials.
(3) LSM interface.
A number of functions have been changed, added or removed:
(*) security_capset_check(), ->capset_check()
(*) security_capset_set(), ->capset_set()
Removed in favour of security_capset().
(*) security_capset(), ->capset()
New. This is passed a pointer to the new creds, a pointer to the old
creds and the proposed capability sets. It should fill in the new
creds or return an error. All pointers, barring the pointer to the
new creds, are now const.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
Changed; now returns a value, which will cause the process to be
killed if it's an error.
(*) security_task_alloc(), ->task_alloc_security()
Removed in favour of security_prepare_creds().
(*) security_cred_free(), ->cred_free()
New. Free security data attached to cred->security.
(*) security_prepare_creds(), ->cred_prepare()
New. Duplicate any security data attached to cred->security.
(*) security_commit_creds(), ->cred_commit()
New. Apply any security effects for the upcoming installation of new
security by commit_creds().
(*) security_task_post_setuid(), ->task_post_setuid()
Removed in favour of security_task_fix_setuid().
(*) security_task_fix_setuid(), ->task_fix_setuid()
Fix up the proposed new credentials for setuid(). This is used by
cap_set_fix_setuid() to implicitly adjust capabilities in line with
setuid() changes. Changes are made to the new credentials, rather
than the task itself as in security_task_post_setuid().
(*) security_task_reparent_to_init(), ->task_reparent_to_init()
Removed. Instead the task being reparented to init is referred
directly to init's credentials.
NOTE! This results in the loss of some state: SELinux's osid no
longer records the sid of the thread that forked it.
(*) security_key_alloc(), ->key_alloc()
(*) security_key_permission(), ->key_permission()
Changed. These now take cred pointers rather than task pointers to
refer to the security context.
(4) sys_capset().
This has been simplified and uses less locking. The LSM functions it
calls have been merged.
(5) reparent_to_kthreadd().
This gives the current thread the same credentials as init by simply using
commit_thread() to point that way.
(6) __sigqueue_alloc() and switch_uid()
__sigqueue_alloc() can't stop the target task from changing its creds
beneath it, so this function gets a reference to the currently applicable
user_struct which it then passes into the sigqueue struct it returns if
successful.
switch_uid() is now called from commit_creds(), and possibly should be
folded into that. commit_creds() should take care of protecting
__sigqueue_alloc().
(7) [sg]et[ug]id() and co and [sg]et_current_groups.
The set functions now all use prepare_creds(), commit_creds() and
abort_creds() to build and check a new set of credentials before applying
it.
security_task_set[ug]id() is called inside the prepared section. This
guarantees that nothing else will affect the creds until we've finished.
The calling of set_dumpable() has been moved into commit_creds().
Much of the functionality of set_user() has been moved into
commit_creds().
The get functions all simply access the data directly.
(8) security_task_prctl() and cap_task_prctl().
security_task_prctl() has been modified to return -ENOSYS if it doesn't
want to handle a function, or otherwise return the return value directly
rather than through an argument.
Additionally, cap_task_prctl() now prepares a new set of credentials, even
if it doesn't end up using it.
(9) Keyrings.
A number of changes have been made to the keyrings code:
(a) switch_uid_keyring(), copy_keys(), exit_keys() and suid_keys() have
all been dropped and built in to the credentials functions directly.
They may want separating out again later.
(b) key_alloc() and search_process_keyrings() now take a cred pointer
rather than a task pointer to specify the security context.
(c) copy_creds() gives a new thread within the same thread group a new
thread keyring if its parent had one, otherwise it discards the thread
keyring.
(d) The authorisation key now points directly to the credentials to extend
the search into rather pointing to the task that carries them.
(e) Installing thread, process or session keyrings causes a new set of
credentials to be created, even though it's not strictly necessary for
process or session keyrings (they're shared).
(10) Usermode helper.
The usermode helper code now carries a cred struct pointer in its
subprocess_info struct instead of a new session keyring pointer. This set
of credentials is derived from init_cred and installed on the new process
after it has been cloned.
call_usermodehelper_setup() allocates the new credentials and
call_usermodehelper_freeinfo() discards them if they haven't been used. A
special cred function (prepare_usermodeinfo_creds()) is provided
specifically for call_usermodehelper_setup() to call.
call_usermodehelper_setkeys() adjusts the credentials to sport the
supplied keyring as the new session keyring.
(11) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) selinux_setprocattr() no longer does its check for whether the
current ptracer can access processes with the new SID inside the lock
that covers getting the ptracer's SID. Whilst this lock ensures that
the check is done with the ptracer pinned, the result is only valid
until the lock is released, so there's no point doing it inside the
lock.
(12) is_single_threaded().
This function has been extracted from selinux_setprocattr() and put into
a file of its own in the lib/ directory as join_session_keyring() now
wants to use it too.
The code in SELinux just checked to see whether a task shared mm_structs
with other tasks (CLONE_VM), but that isn't good enough. We really want
to know if they're part of the same thread group (CLONE_THREAD).
(13) nfsd.
The NFS server daemon now has to use the COW credentials to set the
credentials it is going to use. It really needs to pass the credentials
down to the functions it calls, but it can't do that until other patches
in this series have been applied.
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:23 +08:00
|
|
|
|
|
|
|
cred = prepare_creds();
|
2010-05-27 05:43:07 +08:00
|
|
|
if (!cred)
|
CRED: Inaugurate COW credentials
Inaugurate copy-on-write credentials management. This uses RCU to manage the
credentials pointer in the task_struct with respect to accesses by other tasks.
A process may only modify its own credentials, and so does not need locking to
access or modify its own credentials.
A mutex (cred_replace_mutex) is added to the task_struct to control the effect
of PTRACE_ATTACHED on credential calculations, particularly with respect to
execve().
With this patch, the contents of an active credentials struct may not be
changed directly; rather a new set of credentials must be prepared, modified
and committed using something like the following sequence of events:
struct cred *new = prepare_creds();
int ret = blah(new);
if (ret < 0) {
abort_creds(new);
return ret;
}
return commit_creds(new);
There are some exceptions to this rule: the keyrings pointed to by the active
credentials may be instantiated - keyrings violate the COW rule as managing
COW keyrings is tricky, given that it is possible for a task to directly alter
the keys in a keyring in use by another task.
To help enforce this, various pointers to sets of credentials, such as those in
the task_struct, are declared const. The purpose of this is compile-time
discouragement of altering credentials through those pointers. Once a set of
credentials has been made public through one of these pointers, it may not be
modified, except under special circumstances:
(1) Its reference count may incremented and decremented.
(2) The keyrings to which it points may be modified, but not replaced.
The only safe way to modify anything else is to create a replacement and commit
using the functions described in Documentation/credentials.txt (which will be
added by a later patch).
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
This now prepares and commits credentials in various places in the
security code rather than altering the current creds directly.
(2) Temporary credential overrides.
do_coredump() and sys_faccessat() now prepare their own credentials and
temporarily override the ones currently on the acting thread, whilst
preventing interference from other threads by holding cred_replace_mutex
on the thread being dumped.
This will be replaced in a future patch by something that hands down the
credentials directly to the functions being called, rather than altering
the task's objective credentials.
(3) LSM interface.
A number of functions have been changed, added or removed:
(*) security_capset_check(), ->capset_check()
(*) security_capset_set(), ->capset_set()
Removed in favour of security_capset().
(*) security_capset(), ->capset()
New. This is passed a pointer to the new creds, a pointer to the old
creds and the proposed capability sets. It should fill in the new
creds or return an error. All pointers, barring the pointer to the
new creds, are now const.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
Changed; now returns a value, which will cause the process to be
killed if it's an error.
(*) security_task_alloc(), ->task_alloc_security()
Removed in favour of security_prepare_creds().
(*) security_cred_free(), ->cred_free()
New. Free security data attached to cred->security.
(*) security_prepare_creds(), ->cred_prepare()
New. Duplicate any security data attached to cred->security.
(*) security_commit_creds(), ->cred_commit()
New. Apply any security effects for the upcoming installation of new
security by commit_creds().
(*) security_task_post_setuid(), ->task_post_setuid()
Removed in favour of security_task_fix_setuid().
(*) security_task_fix_setuid(), ->task_fix_setuid()
Fix up the proposed new credentials for setuid(). This is used by
cap_set_fix_setuid() to implicitly adjust capabilities in line with
setuid() changes. Changes are made to the new credentials, rather
than the task itself as in security_task_post_setuid().
(*) security_task_reparent_to_init(), ->task_reparent_to_init()
Removed. Instead the task being reparented to init is referred
directly to init's credentials.
NOTE! This results in the loss of some state: SELinux's osid no
longer records the sid of the thread that forked it.
(*) security_key_alloc(), ->key_alloc()
(*) security_key_permission(), ->key_permission()
Changed. These now take cred pointers rather than task pointers to
refer to the security context.
(4) sys_capset().
This has been simplified and uses less locking. The LSM functions it
calls have been merged.
(5) reparent_to_kthreadd().
This gives the current thread the same credentials as init by simply using
commit_thread() to point that way.
(6) __sigqueue_alloc() and switch_uid()
__sigqueue_alloc() can't stop the target task from changing its creds
beneath it, so this function gets a reference to the currently applicable
user_struct which it then passes into the sigqueue struct it returns if
successful.
switch_uid() is now called from commit_creds(), and possibly should be
folded into that. commit_creds() should take care of protecting
__sigqueue_alloc().
(7) [sg]et[ug]id() and co and [sg]et_current_groups.
The set functions now all use prepare_creds(), commit_creds() and
abort_creds() to build and check a new set of credentials before applying
it.
security_task_set[ug]id() is called inside the prepared section. This
guarantees that nothing else will affect the creds until we've finished.
The calling of set_dumpable() has been moved into commit_creds().
Much of the functionality of set_user() has been moved into
commit_creds().
The get functions all simply access the data directly.
(8) security_task_prctl() and cap_task_prctl().
security_task_prctl() has been modified to return -ENOSYS if it doesn't
want to handle a function, or otherwise return the return value directly
rather than through an argument.
Additionally, cap_task_prctl() now prepares a new set of credentials, even
if it doesn't end up using it.
(9) Keyrings.
A number of changes have been made to the keyrings code:
(a) switch_uid_keyring(), copy_keys(), exit_keys() and suid_keys() have
all been dropped and built in to the credentials functions directly.
They may want separating out again later.
(b) key_alloc() and search_process_keyrings() now take a cred pointer
rather than a task pointer to specify the security context.
(c) copy_creds() gives a new thread within the same thread group a new
thread keyring if its parent had one, otherwise it discards the thread
keyring.
(d) The authorisation key now points directly to the credentials to extend
the search into rather pointing to the task that carries them.
(e) Installing thread, process or session keyrings causes a new set of
credentials to be created, even though it's not strictly necessary for
process or session keyrings (they're shared).
(10) Usermode helper.
The usermode helper code now carries a cred struct pointer in its
subprocess_info struct instead of a new session keyring pointer. This set
of credentials is derived from init_cred and installed on the new process
after it has been cloned.
call_usermodehelper_setup() allocates the new credentials and
call_usermodehelper_freeinfo() discards them if they haven't been used. A
special cred function (prepare_usermodeinfo_creds()) is provided
specifically for call_usermodehelper_setup() to call.
call_usermodehelper_setkeys() adjusts the credentials to sport the
supplied keyring as the new session keyring.
(11) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) selinux_setprocattr() no longer does its check for whether the
current ptracer can access processes with the new SID inside the lock
that covers getting the ptracer's SID. Whilst this lock ensures that
the check is done with the ptracer pinned, the result is only valid
until the lock is released, so there's no point doing it inside the
lock.
(12) is_single_threaded().
This function has been extracted from selinux_setprocattr() and put into
a file of its own in the lib/ directory as join_session_keyring() now
wants to use it too.
The code in SELinux just checked to see whether a task shared mm_structs
with other tasks (CLONE_VM), but that isn't good enough. We really want
to know if they're part of the same thread group (CLONE_THREAD).
(13) nfsd.
The NFS server daemon now has to use the COW credentials to set the
credentials it is going to use. It really needs to pass the credentials
down to the functions it calls, but it can't do that until other patches
in this series have been applied.
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:23 +08:00
|
|
|
goto fail;
|
2005-06-23 15:09:43 +08:00
|
|
|
/*
|
|
|
|
* We cannot trust fsuid as being the "true" uid of the
|
|
|
|
* process nor do we know its entire history. We only know it
|
|
|
|
* was tainted so we dump it as root in mode 2.
|
|
|
|
*/
|
2010-03-06 05:44:12 +08:00
|
|
|
if (__get_dumpable(cprm.mm_flags) == 2) {
|
|
|
|
/* Setuid core dump mode */
|
2005-06-23 15:09:43 +08:00
|
|
|
flag = O_EXCL; /* Stop rewrite attacks */
|
CRED: Inaugurate COW credentials
Inaugurate copy-on-write credentials management. This uses RCU to manage the
credentials pointer in the task_struct with respect to accesses by other tasks.
A process may only modify its own credentials, and so does not need locking to
access or modify its own credentials.
A mutex (cred_replace_mutex) is added to the task_struct to control the effect
of PTRACE_ATTACHED on credential calculations, particularly with respect to
execve().
With this patch, the contents of an active credentials struct may not be
changed directly; rather a new set of credentials must be prepared, modified
and committed using something like the following sequence of events:
struct cred *new = prepare_creds();
int ret = blah(new);
if (ret < 0) {
abort_creds(new);
return ret;
}
return commit_creds(new);
There are some exceptions to this rule: the keyrings pointed to by the active
credentials may be instantiated - keyrings violate the COW rule as managing
COW keyrings is tricky, given that it is possible for a task to directly alter
the keys in a keyring in use by another task.
To help enforce this, various pointers to sets of credentials, such as those in
the task_struct, are declared const. The purpose of this is compile-time
discouragement of altering credentials through those pointers. Once a set of
credentials has been made public through one of these pointers, it may not be
modified, except under special circumstances:
(1) Its reference count may incremented and decremented.
(2) The keyrings to which it points may be modified, but not replaced.
The only safe way to modify anything else is to create a replacement and commit
using the functions described in Documentation/credentials.txt (which will be
added by a later patch).
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
This now prepares and commits credentials in various places in the
security code rather than altering the current creds directly.
(2) Temporary credential overrides.
do_coredump() and sys_faccessat() now prepare their own credentials and
temporarily override the ones currently on the acting thread, whilst
preventing interference from other threads by holding cred_replace_mutex
on the thread being dumped.
This will be replaced in a future patch by something that hands down the
credentials directly to the functions being called, rather than altering
the task's objective credentials.
(3) LSM interface.
A number of functions have been changed, added or removed:
(*) security_capset_check(), ->capset_check()
(*) security_capset_set(), ->capset_set()
Removed in favour of security_capset().
(*) security_capset(), ->capset()
New. This is passed a pointer to the new creds, a pointer to the old
creds and the proposed capability sets. It should fill in the new
creds or return an error. All pointers, barring the pointer to the
new creds, are now const.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
Changed; now returns a value, which will cause the process to be
killed if it's an error.
(*) security_task_alloc(), ->task_alloc_security()
Removed in favour of security_prepare_creds().
(*) security_cred_free(), ->cred_free()
New. Free security data attached to cred->security.
(*) security_prepare_creds(), ->cred_prepare()
New. Duplicate any security data attached to cred->security.
(*) security_commit_creds(), ->cred_commit()
New. Apply any security effects for the upcoming installation of new
security by commit_creds().
(*) security_task_post_setuid(), ->task_post_setuid()
Removed in favour of security_task_fix_setuid().
(*) security_task_fix_setuid(), ->task_fix_setuid()
Fix up the proposed new credentials for setuid(). This is used by
cap_set_fix_setuid() to implicitly adjust capabilities in line with
setuid() changes. Changes are made to the new credentials, rather
than the task itself as in security_task_post_setuid().
(*) security_task_reparent_to_init(), ->task_reparent_to_init()
Removed. Instead the task being reparented to init is referred
directly to init's credentials.
NOTE! This results in the loss of some state: SELinux's osid no
longer records the sid of the thread that forked it.
(*) security_key_alloc(), ->key_alloc()
(*) security_key_permission(), ->key_permission()
Changed. These now take cred pointers rather than task pointers to
refer to the security context.
(4) sys_capset().
This has been simplified and uses less locking. The LSM functions it
calls have been merged.
(5) reparent_to_kthreadd().
This gives the current thread the same credentials as init by simply using
commit_thread() to point that way.
(6) __sigqueue_alloc() and switch_uid()
__sigqueue_alloc() can't stop the target task from changing its creds
beneath it, so this function gets a reference to the currently applicable
user_struct which it then passes into the sigqueue struct it returns if
successful.
switch_uid() is now called from commit_creds(), and possibly should be
folded into that. commit_creds() should take care of protecting
__sigqueue_alloc().
(7) [sg]et[ug]id() and co and [sg]et_current_groups.
The set functions now all use prepare_creds(), commit_creds() and
abort_creds() to build and check a new set of credentials before applying
it.
security_task_set[ug]id() is called inside the prepared section. This
guarantees that nothing else will affect the creds until we've finished.
The calling of set_dumpable() has been moved into commit_creds().
Much of the functionality of set_user() has been moved into
commit_creds().
The get functions all simply access the data directly.
(8) security_task_prctl() and cap_task_prctl().
security_task_prctl() has been modified to return -ENOSYS if it doesn't
want to handle a function, or otherwise return the return value directly
rather than through an argument.
Additionally, cap_task_prctl() now prepares a new set of credentials, even
if it doesn't end up using it.
(9) Keyrings.
A number of changes have been made to the keyrings code:
(a) switch_uid_keyring(), copy_keys(), exit_keys() and suid_keys() have
all been dropped and built in to the credentials functions directly.
They may want separating out again later.
(b) key_alloc() and search_process_keyrings() now take a cred pointer
rather than a task pointer to specify the security context.
(c) copy_creds() gives a new thread within the same thread group a new
thread keyring if its parent had one, otherwise it discards the thread
keyring.
(d) The authorisation key now points directly to the credentials to extend
the search into rather pointing to the task that carries them.
(e) Installing thread, process or session keyrings causes a new set of
credentials to be created, even though it's not strictly necessary for
process or session keyrings (they're shared).
(10) Usermode helper.
The usermode helper code now carries a cred struct pointer in its
subprocess_info struct instead of a new session keyring pointer. This set
of credentials is derived from init_cred and installed on the new process
after it has been cloned.
call_usermodehelper_setup() allocates the new credentials and
call_usermodehelper_freeinfo() discards them if they haven't been used. A
special cred function (prepare_usermodeinfo_creds()) is provided
specifically for call_usermodehelper_setup() to call.
call_usermodehelper_setkeys() adjusts the credentials to sport the
supplied keyring as the new session keyring.
(11) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) selinux_setprocattr() no longer does its check for whether the
current ptracer can access processes with the new SID inside the lock
that covers getting the ptracer's SID. Whilst this lock ensures that
the check is done with the ptracer pinned, the result is only valid
until the lock is released, so there's no point doing it inside the
lock.
(12) is_single_threaded().
This function has been extracted from selinux_setprocattr() and put into
a file of its own in the lib/ directory as join_session_keyring() now
wants to use it too.
The code in SELinux just checked to see whether a task shared mm_structs
with other tasks (CLONE_VM), but that isn't good enough. We really want
to know if they're part of the same thread group (CLONE_THREAD).
(13) nfsd.
The NFS server daemon now has to use the COW credentials to set the
credentials it is going to use. It really needs to pass the credentials
down to the functions it calls, but it can't do that until other patches
in this series have been applied.
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:23 +08:00
|
|
|
cred->fsuid = 0; /* Dump root private */
|
2005-06-23 15:09:43 +08:00
|
|
|
}
|
2005-10-31 07:02:54 +08:00
|
|
|
|
2008-07-25 16:47:43 +08:00
|
|
|
retval = coredump_wait(exit_code, &core_state);
|
2010-05-27 05:43:07 +08:00
|
|
|
if (retval < 0)
|
|
|
|
goto fail_creds;
|
CRED: Inaugurate COW credentials
Inaugurate copy-on-write credentials management. This uses RCU to manage the
credentials pointer in the task_struct with respect to accesses by other tasks.
A process may only modify its own credentials, and so does not need locking to
access or modify its own credentials.
A mutex (cred_replace_mutex) is added to the task_struct to control the effect
of PTRACE_ATTACHED on credential calculations, particularly with respect to
execve().
With this patch, the contents of an active credentials struct may not be
changed directly; rather a new set of credentials must be prepared, modified
and committed using something like the following sequence of events:
struct cred *new = prepare_creds();
int ret = blah(new);
if (ret < 0) {
abort_creds(new);
return ret;
}
return commit_creds(new);
There are some exceptions to this rule: the keyrings pointed to by the active
credentials may be instantiated - keyrings violate the COW rule as managing
COW keyrings is tricky, given that it is possible for a task to directly alter
the keys in a keyring in use by another task.
To help enforce this, various pointers to sets of credentials, such as those in
the task_struct, are declared const. The purpose of this is compile-time
discouragement of altering credentials through those pointers. Once a set of
credentials has been made public through one of these pointers, it may not be
modified, except under special circumstances:
(1) Its reference count may incremented and decremented.
(2) The keyrings to which it points may be modified, but not replaced.
The only safe way to modify anything else is to create a replacement and commit
using the functions described in Documentation/credentials.txt (which will be
added by a later patch).
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
This now prepares and commits credentials in various places in the
security code rather than altering the current creds directly.
(2) Temporary credential overrides.
do_coredump() and sys_faccessat() now prepare their own credentials and
temporarily override the ones currently on the acting thread, whilst
preventing interference from other threads by holding cred_replace_mutex
on the thread being dumped.
This will be replaced in a future patch by something that hands down the
credentials directly to the functions being called, rather than altering
the task's objective credentials.
(3) LSM interface.
A number of functions have been changed, added or removed:
(*) security_capset_check(), ->capset_check()
(*) security_capset_set(), ->capset_set()
Removed in favour of security_capset().
(*) security_capset(), ->capset()
New. This is passed a pointer to the new creds, a pointer to the old
creds and the proposed capability sets. It should fill in the new
creds or return an error. All pointers, barring the pointer to the
new creds, are now const.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
Changed; now returns a value, which will cause the process to be
killed if it's an error.
(*) security_task_alloc(), ->task_alloc_security()
Removed in favour of security_prepare_creds().
(*) security_cred_free(), ->cred_free()
New. Free security data attached to cred->security.
(*) security_prepare_creds(), ->cred_prepare()
New. Duplicate any security data attached to cred->security.
(*) security_commit_creds(), ->cred_commit()
New. Apply any security effects for the upcoming installation of new
security by commit_creds().
(*) security_task_post_setuid(), ->task_post_setuid()
Removed in favour of security_task_fix_setuid().
(*) security_task_fix_setuid(), ->task_fix_setuid()
Fix up the proposed new credentials for setuid(). This is used by
cap_set_fix_setuid() to implicitly adjust capabilities in line with
setuid() changes. Changes are made to the new credentials, rather
than the task itself as in security_task_post_setuid().
(*) security_task_reparent_to_init(), ->task_reparent_to_init()
Removed. Instead the task being reparented to init is referred
directly to init's credentials.
NOTE! This results in the loss of some state: SELinux's osid no
longer records the sid of the thread that forked it.
(*) security_key_alloc(), ->key_alloc()
(*) security_key_permission(), ->key_permission()
Changed. These now take cred pointers rather than task pointers to
refer to the security context.
(4) sys_capset().
This has been simplified and uses less locking. The LSM functions it
calls have been merged.
(5) reparent_to_kthreadd().
This gives the current thread the same credentials as init by simply using
commit_thread() to point that way.
(6) __sigqueue_alloc() and switch_uid()
__sigqueue_alloc() can't stop the target task from changing its creds
beneath it, so this function gets a reference to the currently applicable
user_struct which it then passes into the sigqueue struct it returns if
successful.
switch_uid() is now called from commit_creds(), and possibly should be
folded into that. commit_creds() should take care of protecting
__sigqueue_alloc().
(7) [sg]et[ug]id() and co and [sg]et_current_groups.
The set functions now all use prepare_creds(), commit_creds() and
abort_creds() to build and check a new set of credentials before applying
it.
security_task_set[ug]id() is called inside the prepared section. This
guarantees that nothing else will affect the creds until we've finished.
The calling of set_dumpable() has been moved into commit_creds().
Much of the functionality of set_user() has been moved into
commit_creds().
The get functions all simply access the data directly.
(8) security_task_prctl() and cap_task_prctl().
security_task_prctl() has been modified to return -ENOSYS if it doesn't
want to handle a function, or otherwise return the return value directly
rather than through an argument.
Additionally, cap_task_prctl() now prepares a new set of credentials, even
if it doesn't end up using it.
(9) Keyrings.
A number of changes have been made to the keyrings code:
(a) switch_uid_keyring(), copy_keys(), exit_keys() and suid_keys() have
all been dropped and built in to the credentials functions directly.
They may want separating out again later.
(b) key_alloc() and search_process_keyrings() now take a cred pointer
rather than a task pointer to specify the security context.
(c) copy_creds() gives a new thread within the same thread group a new
thread keyring if its parent had one, otherwise it discards the thread
keyring.
(d) The authorisation key now points directly to the credentials to extend
the search into rather pointing to the task that carries them.
(e) Installing thread, process or session keyrings causes a new set of
credentials to be created, even though it's not strictly necessary for
process or session keyrings (they're shared).
(10) Usermode helper.
The usermode helper code now carries a cred struct pointer in its
subprocess_info struct instead of a new session keyring pointer. This set
of credentials is derived from init_cred and installed on the new process
after it has been cloned.
call_usermodehelper_setup() allocates the new credentials and
call_usermodehelper_freeinfo() discards them if they haven't been used. A
special cred function (prepare_usermodeinfo_creds()) is provided
specifically for call_usermodehelper_setup() to call.
call_usermodehelper_setkeys() adjusts the credentials to sport the
supplied keyring as the new session keyring.
(11) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) selinux_setprocattr() no longer does its check for whether the
current ptracer can access processes with the new SID inside the lock
that covers getting the ptracer's SID. Whilst this lock ensures that
the check is done with the ptracer pinned, the result is only valid
until the lock is released, so there's no point doing it inside the
lock.
(12) is_single_threaded().
This function has been extracted from selinux_setprocattr() and put into
a file of its own in the lib/ directory as join_session_keyring() now
wants to use it too.
The code in SELinux just checked to see whether a task shared mm_structs
with other tasks (CLONE_VM), but that isn't good enough. We really want
to know if they're part of the same thread group (CLONE_THREAD).
(13) nfsd.
The NFS server daemon now has to use the COW credentials to set the
credentials it is going to use. It really needs to pass the credentials
down to the functions it calls, but it can't do that until other patches
in this series have been applied.
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:23 +08:00
|
|
|
|
|
|
|
old_cred = override_creds(cred);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Clear any false indication of pending signals that might
|
|
|
|
* be seen by the filesystem code called to write the core file.
|
|
|
|
*/
|
|
|
|
clear_thread_flag(TIF_SIGPENDING);
|
|
|
|
|
2010-10-28 06:34:08 +08:00
|
|
|
ispipe = format_corename(&cn, signr);
|
|
|
|
|
2007-04-17 13:53:13 +08:00
|
|
|
if (ispipe) {
|
2010-05-27 05:43:06 +08:00
|
|
|
int dump_count;
|
|
|
|
char **helper_argv;
|
|
|
|
|
2011-07-27 07:08:34 +08:00
|
|
|
if (ispipe < 0) {
|
|
|
|
printk(KERN_WARNING "format_corename failed\n");
|
|
|
|
printk(KERN_WARNING "Aborting core\n");
|
|
|
|
goto fail_corename;
|
|
|
|
}
|
|
|
|
|
2010-05-27 05:42:59 +08:00
|
|
|
if (cprm.limit == 1) {
|
2009-09-24 06:56:54 +08:00
|
|
|
/*
|
|
|
|
* Normally core limits are irrelevant to pipes, since
|
|
|
|
* we're not writing to the file system, but we use
|
2010-05-27 05:42:59 +08:00
|
|
|
* cprm.limit of 1 here as a speacial value. Any
|
|
|
|
* non-1 limit gets set to RLIM_INFINITY below, but
|
2009-09-24 06:56:54 +08:00
|
|
|
* a limit of 0 skips the dump. This is a consistent
|
|
|
|
* way to catch recursive crashes. We can still crash
|
2010-05-27 05:42:59 +08:00
|
|
|
* if the core_pattern binary sets RLIM_CORE = !1
|
2009-09-24 06:56:54 +08:00
|
|
|
* but it runs as root, and can do lots of stupid things
|
|
|
|
* Note that we use task_tgid_vnr here to grab the pid
|
|
|
|
* of the process group leader. That way we get the
|
|
|
|
* right pid if a thread in a multi-threaded
|
|
|
|
* core_pattern process dies.
|
|
|
|
*/
|
|
|
|
printk(KERN_WARNING
|
2010-05-27 05:42:59 +08:00
|
|
|
"Process %d(%s) has RLIMIT_CORE set to 1\n",
|
2009-09-24 06:56:54 +08:00
|
|
|
task_tgid_vnr(current), current->comm);
|
|
|
|
printk(KERN_WARNING "Aborting core\n");
|
|
|
|
goto fail_unlock;
|
|
|
|
}
|
2010-05-27 05:43:06 +08:00
|
|
|
cprm.limit = RLIM_INFINITY;
|
2009-09-24 06:56:54 +08:00
|
|
|
|
2009-09-24 06:56:56 +08:00
|
|
|
dump_count = atomic_inc_return(&core_dump_count);
|
|
|
|
if (core_pipe_limit && (core_pipe_limit < dump_count)) {
|
|
|
|
printk(KERN_WARNING "Pid %d(%s) over core_pipe_limit\n",
|
|
|
|
task_tgid_vnr(current), current->comm);
|
|
|
|
printk(KERN_WARNING "Skipping core dump\n");
|
|
|
|
goto fail_dropcount;
|
|
|
|
}
|
|
|
|
|
2010-10-28 06:34:08 +08:00
|
|
|
helper_argv = argv_split(GFP_KERNEL, cn.corename+1, NULL);
|
2009-01-07 06:41:11 +08:00
|
|
|
if (!helper_argv) {
|
|
|
|
printk(KERN_WARNING "%s failed to allocate memory\n",
|
|
|
|
__func__);
|
2009-09-24 06:56:56 +08:00
|
|
|
goto fail_dropcount;
|
2009-01-07 06:41:11 +08:00
|
|
|
}
|
2007-10-17 14:26:36 +08:00
|
|
|
|
2010-05-27 05:43:06 +08:00
|
|
|
retval = call_usermodehelper_fns(helper_argv[0], helper_argv,
|
|
|
|
NULL, UMH_WAIT_EXEC, umh_pipe_setup,
|
|
|
|
NULL, &cprm);
|
|
|
|
argv_free(helper_argv);
|
|
|
|
if (retval) {
|
2006-10-01 14:29:28 +08:00
|
|
|
printk(KERN_INFO "Core dump to %s pipe failed\n",
|
2010-10-28 06:34:08 +08:00
|
|
|
cn.corename);
|
2010-05-27 05:43:06 +08:00
|
|
|
goto close_fail;
|
2006-10-01 14:29:28 +08:00
|
|
|
}
|
2010-05-27 05:43:05 +08:00
|
|
|
} else {
|
|
|
|
struct inode *inode;
|
|
|
|
|
|
|
|
if (cprm.limit < binfmt->min_coredump)
|
|
|
|
goto fail_unlock;
|
|
|
|
|
2010-10-28 06:34:08 +08:00
|
|
|
cprm.file = filp_open(cn.corename,
|
2006-12-07 12:40:39 +08:00
|
|
|
O_CREAT | 2 | O_NOFOLLOW | O_LARGEFILE | flag,
|
|
|
|
0600);
|
2010-05-27 05:43:05 +08:00
|
|
|
if (IS_ERR(cprm.file))
|
|
|
|
goto fail_unlock;
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2010-05-27 05:43:05 +08:00
|
|
|
inode = cprm.file->f_path.dentry->d_inode;
|
|
|
|
if (inode->i_nlink > 1)
|
|
|
|
goto close_fail;
|
|
|
|
if (d_unhashed(cprm.file->f_path.dentry))
|
|
|
|
goto close_fail;
|
|
|
|
/*
|
|
|
|
* AK: actually i see no reason to not allow this for named
|
|
|
|
* pipes etc, but keep the previous behaviour for now.
|
|
|
|
*/
|
|
|
|
if (!S_ISREG(inode->i_mode))
|
|
|
|
goto close_fail;
|
|
|
|
/*
|
|
|
|
* Dont allow local users get cute and trick others to coredump
|
|
|
|
* into their pre-created files.
|
|
|
|
*/
|
|
|
|
if (inode->i_uid != current_fsuid())
|
|
|
|
goto close_fail;
|
|
|
|
if (!cprm.file->f_op || !cprm.file->f_op->write)
|
|
|
|
goto close_fail;
|
|
|
|
if (do_truncate(cprm.file->f_path.dentry, 0, 0, cprm.file))
|
|
|
|
goto close_fail;
|
|
|
|
}
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2010-05-27 05:43:05 +08:00
|
|
|
retval = binfmt->core_dump(&cprm);
|
2005-04-17 06:20:36 +08:00
|
|
|
if (retval)
|
|
|
|
current->signal->group_exit_code |= 0x80;
|
2010-05-27 05:43:06 +08:00
|
|
|
|
2009-09-24 06:56:58 +08:00
|
|
|
if (ispipe && core_pipe_limit)
|
2009-12-18 07:27:16 +08:00
|
|
|
wait_for_dump_helpers(cprm.file);
|
2010-05-27 05:43:06 +08:00
|
|
|
close_fail:
|
|
|
|
if (cprm.file)
|
|
|
|
filp_close(cprm.file, NULL);
|
2009-09-24 06:56:56 +08:00
|
|
|
fail_dropcount:
|
2010-05-27 05:43:06 +08:00
|
|
|
if (ispipe)
|
2009-09-24 06:56:56 +08:00
|
|
|
atomic_dec(&core_dump_count);
|
2005-04-17 06:20:36 +08:00
|
|
|
fail_unlock:
|
2010-10-28 06:34:08 +08:00
|
|
|
kfree(cn.corename);
|
|
|
|
fail_corename:
|
2010-05-27 05:43:07 +08:00
|
|
|
coredump_finish(mm);
|
CRED: Inaugurate COW credentials
Inaugurate copy-on-write credentials management. This uses RCU to manage the
credentials pointer in the task_struct with respect to accesses by other tasks.
A process may only modify its own credentials, and so does not need locking to
access or modify its own credentials.
A mutex (cred_replace_mutex) is added to the task_struct to control the effect
of PTRACE_ATTACHED on credential calculations, particularly with respect to
execve().
With this patch, the contents of an active credentials struct may not be
changed directly; rather a new set of credentials must be prepared, modified
and committed using something like the following sequence of events:
struct cred *new = prepare_creds();
int ret = blah(new);
if (ret < 0) {
abort_creds(new);
return ret;
}
return commit_creds(new);
There are some exceptions to this rule: the keyrings pointed to by the active
credentials may be instantiated - keyrings violate the COW rule as managing
COW keyrings is tricky, given that it is possible for a task to directly alter
the keys in a keyring in use by another task.
To help enforce this, various pointers to sets of credentials, such as those in
the task_struct, are declared const. The purpose of this is compile-time
discouragement of altering credentials through those pointers. Once a set of
credentials has been made public through one of these pointers, it may not be
modified, except under special circumstances:
(1) Its reference count may incremented and decremented.
(2) The keyrings to which it points may be modified, but not replaced.
The only safe way to modify anything else is to create a replacement and commit
using the functions described in Documentation/credentials.txt (which will be
added by a later patch).
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
This now prepares and commits credentials in various places in the
security code rather than altering the current creds directly.
(2) Temporary credential overrides.
do_coredump() and sys_faccessat() now prepare their own credentials and
temporarily override the ones currently on the acting thread, whilst
preventing interference from other threads by holding cred_replace_mutex
on the thread being dumped.
This will be replaced in a future patch by something that hands down the
credentials directly to the functions being called, rather than altering
the task's objective credentials.
(3) LSM interface.
A number of functions have been changed, added or removed:
(*) security_capset_check(), ->capset_check()
(*) security_capset_set(), ->capset_set()
Removed in favour of security_capset().
(*) security_capset(), ->capset()
New. This is passed a pointer to the new creds, a pointer to the old
creds and the proposed capability sets. It should fill in the new
creds or return an error. All pointers, barring the pointer to the
new creds, are now const.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
Changed; now returns a value, which will cause the process to be
killed if it's an error.
(*) security_task_alloc(), ->task_alloc_security()
Removed in favour of security_prepare_creds().
(*) security_cred_free(), ->cred_free()
New. Free security data attached to cred->security.
(*) security_prepare_creds(), ->cred_prepare()
New. Duplicate any security data attached to cred->security.
(*) security_commit_creds(), ->cred_commit()
New. Apply any security effects for the upcoming installation of new
security by commit_creds().
(*) security_task_post_setuid(), ->task_post_setuid()
Removed in favour of security_task_fix_setuid().
(*) security_task_fix_setuid(), ->task_fix_setuid()
Fix up the proposed new credentials for setuid(). This is used by
cap_set_fix_setuid() to implicitly adjust capabilities in line with
setuid() changes. Changes are made to the new credentials, rather
than the task itself as in security_task_post_setuid().
(*) security_task_reparent_to_init(), ->task_reparent_to_init()
Removed. Instead the task being reparented to init is referred
directly to init's credentials.
NOTE! This results in the loss of some state: SELinux's osid no
longer records the sid of the thread that forked it.
(*) security_key_alloc(), ->key_alloc()
(*) security_key_permission(), ->key_permission()
Changed. These now take cred pointers rather than task pointers to
refer to the security context.
(4) sys_capset().
This has been simplified and uses less locking. The LSM functions it
calls have been merged.
(5) reparent_to_kthreadd().
This gives the current thread the same credentials as init by simply using
commit_thread() to point that way.
(6) __sigqueue_alloc() and switch_uid()
__sigqueue_alloc() can't stop the target task from changing its creds
beneath it, so this function gets a reference to the currently applicable
user_struct which it then passes into the sigqueue struct it returns if
successful.
switch_uid() is now called from commit_creds(), and possibly should be
folded into that. commit_creds() should take care of protecting
__sigqueue_alloc().
(7) [sg]et[ug]id() and co and [sg]et_current_groups.
The set functions now all use prepare_creds(), commit_creds() and
abort_creds() to build and check a new set of credentials before applying
it.
security_task_set[ug]id() is called inside the prepared section. This
guarantees that nothing else will affect the creds until we've finished.
The calling of set_dumpable() has been moved into commit_creds().
Much of the functionality of set_user() has been moved into
commit_creds().
The get functions all simply access the data directly.
(8) security_task_prctl() and cap_task_prctl().
security_task_prctl() has been modified to return -ENOSYS if it doesn't
want to handle a function, or otherwise return the return value directly
rather than through an argument.
Additionally, cap_task_prctl() now prepares a new set of credentials, even
if it doesn't end up using it.
(9) Keyrings.
A number of changes have been made to the keyrings code:
(a) switch_uid_keyring(), copy_keys(), exit_keys() and suid_keys() have
all been dropped and built in to the credentials functions directly.
They may want separating out again later.
(b) key_alloc() and search_process_keyrings() now take a cred pointer
rather than a task pointer to specify the security context.
(c) copy_creds() gives a new thread within the same thread group a new
thread keyring if its parent had one, otherwise it discards the thread
keyring.
(d) The authorisation key now points directly to the credentials to extend
the search into rather pointing to the task that carries them.
(e) Installing thread, process or session keyrings causes a new set of
credentials to be created, even though it's not strictly necessary for
process or session keyrings (they're shared).
(10) Usermode helper.
The usermode helper code now carries a cred struct pointer in its
subprocess_info struct instead of a new session keyring pointer. This set
of credentials is derived from init_cred and installed on the new process
after it has been cloned.
call_usermodehelper_setup() allocates the new credentials and
call_usermodehelper_freeinfo() discards them if they haven't been used. A
special cred function (prepare_usermodeinfo_creds()) is provided
specifically for call_usermodehelper_setup() to call.
call_usermodehelper_setkeys() adjusts the credentials to sport the
supplied keyring as the new session keyring.
(11) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) selinux_setprocattr() no longer does its check for whether the
current ptracer can access processes with the new SID inside the lock
that covers getting the ptracer's SID. Whilst this lock ensures that
the check is done with the ptracer pinned, the result is only valid
until the lock is released, so there's no point doing it inside the
lock.
(12) is_single_threaded().
This function has been extracted from selinux_setprocattr() and put into
a file of its own in the lib/ directory as join_session_keyring() now
wants to use it too.
The code in SELinux just checked to see whether a task shared mm_structs
with other tasks (CLONE_VM), but that isn't good enough. We really want
to know if they're part of the same thread group (CLONE_THREAD).
(13) nfsd.
The NFS server daemon now has to use the COW credentials to set the
credentials it is going to use. It really needs to pass the credentials
down to the functions it calls, but it can't do that until other patches
in this series have been applied.
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:23 +08:00
|
|
|
revert_creds(old_cred);
|
2010-05-27 05:43:07 +08:00
|
|
|
fail_creds:
|
CRED: Inaugurate COW credentials
Inaugurate copy-on-write credentials management. This uses RCU to manage the
credentials pointer in the task_struct with respect to accesses by other tasks.
A process may only modify its own credentials, and so does not need locking to
access or modify its own credentials.
A mutex (cred_replace_mutex) is added to the task_struct to control the effect
of PTRACE_ATTACHED on credential calculations, particularly with respect to
execve().
With this patch, the contents of an active credentials struct may not be
changed directly; rather a new set of credentials must be prepared, modified
and committed using something like the following sequence of events:
struct cred *new = prepare_creds();
int ret = blah(new);
if (ret < 0) {
abort_creds(new);
return ret;
}
return commit_creds(new);
There are some exceptions to this rule: the keyrings pointed to by the active
credentials may be instantiated - keyrings violate the COW rule as managing
COW keyrings is tricky, given that it is possible for a task to directly alter
the keys in a keyring in use by another task.
To help enforce this, various pointers to sets of credentials, such as those in
the task_struct, are declared const. The purpose of this is compile-time
discouragement of altering credentials through those pointers. Once a set of
credentials has been made public through one of these pointers, it may not be
modified, except under special circumstances:
(1) Its reference count may incremented and decremented.
(2) The keyrings to which it points may be modified, but not replaced.
The only safe way to modify anything else is to create a replacement and commit
using the functions described in Documentation/credentials.txt (which will be
added by a later patch).
This patch and the preceding patches have been tested with the LTP SELinux
testsuite.
This patch makes several logical sets of alteration:
(1) execve().
This now prepares and commits credentials in various places in the
security code rather than altering the current creds directly.
(2) Temporary credential overrides.
do_coredump() and sys_faccessat() now prepare their own credentials and
temporarily override the ones currently on the acting thread, whilst
preventing interference from other threads by holding cred_replace_mutex
on the thread being dumped.
This will be replaced in a future patch by something that hands down the
credentials directly to the functions being called, rather than altering
the task's objective credentials.
(3) LSM interface.
A number of functions have been changed, added or removed:
(*) security_capset_check(), ->capset_check()
(*) security_capset_set(), ->capset_set()
Removed in favour of security_capset().
(*) security_capset(), ->capset()
New. This is passed a pointer to the new creds, a pointer to the old
creds and the proposed capability sets. It should fill in the new
creds or return an error. All pointers, barring the pointer to the
new creds, are now const.
(*) security_bprm_apply_creds(), ->bprm_apply_creds()
Changed; now returns a value, which will cause the process to be
killed if it's an error.
(*) security_task_alloc(), ->task_alloc_security()
Removed in favour of security_prepare_creds().
(*) security_cred_free(), ->cred_free()
New. Free security data attached to cred->security.
(*) security_prepare_creds(), ->cred_prepare()
New. Duplicate any security data attached to cred->security.
(*) security_commit_creds(), ->cred_commit()
New. Apply any security effects for the upcoming installation of new
security by commit_creds().
(*) security_task_post_setuid(), ->task_post_setuid()
Removed in favour of security_task_fix_setuid().
(*) security_task_fix_setuid(), ->task_fix_setuid()
Fix up the proposed new credentials for setuid(). This is used by
cap_set_fix_setuid() to implicitly adjust capabilities in line with
setuid() changes. Changes are made to the new credentials, rather
than the task itself as in security_task_post_setuid().
(*) security_task_reparent_to_init(), ->task_reparent_to_init()
Removed. Instead the task being reparented to init is referred
directly to init's credentials.
NOTE! This results in the loss of some state: SELinux's osid no
longer records the sid of the thread that forked it.
(*) security_key_alloc(), ->key_alloc()
(*) security_key_permission(), ->key_permission()
Changed. These now take cred pointers rather than task pointers to
refer to the security context.
(4) sys_capset().
This has been simplified and uses less locking. The LSM functions it
calls have been merged.
(5) reparent_to_kthreadd().
This gives the current thread the same credentials as init by simply using
commit_thread() to point that way.
(6) __sigqueue_alloc() and switch_uid()
__sigqueue_alloc() can't stop the target task from changing its creds
beneath it, so this function gets a reference to the currently applicable
user_struct which it then passes into the sigqueue struct it returns if
successful.
switch_uid() is now called from commit_creds(), and possibly should be
folded into that. commit_creds() should take care of protecting
__sigqueue_alloc().
(7) [sg]et[ug]id() and co and [sg]et_current_groups.
The set functions now all use prepare_creds(), commit_creds() and
abort_creds() to build and check a new set of credentials before applying
it.
security_task_set[ug]id() is called inside the prepared section. This
guarantees that nothing else will affect the creds until we've finished.
The calling of set_dumpable() has been moved into commit_creds().
Much of the functionality of set_user() has been moved into
commit_creds().
The get functions all simply access the data directly.
(8) security_task_prctl() and cap_task_prctl().
security_task_prctl() has been modified to return -ENOSYS if it doesn't
want to handle a function, or otherwise return the return value directly
rather than through an argument.
Additionally, cap_task_prctl() now prepares a new set of credentials, even
if it doesn't end up using it.
(9) Keyrings.
A number of changes have been made to the keyrings code:
(a) switch_uid_keyring(), copy_keys(), exit_keys() and suid_keys() have
all been dropped and built in to the credentials functions directly.
They may want separating out again later.
(b) key_alloc() and search_process_keyrings() now take a cred pointer
rather than a task pointer to specify the security context.
(c) copy_creds() gives a new thread within the same thread group a new
thread keyring if its parent had one, otherwise it discards the thread
keyring.
(d) The authorisation key now points directly to the credentials to extend
the search into rather pointing to the task that carries them.
(e) Installing thread, process or session keyrings causes a new set of
credentials to be created, even though it's not strictly necessary for
process or session keyrings (they're shared).
(10) Usermode helper.
The usermode helper code now carries a cred struct pointer in its
subprocess_info struct instead of a new session keyring pointer. This set
of credentials is derived from init_cred and installed on the new process
after it has been cloned.
call_usermodehelper_setup() allocates the new credentials and
call_usermodehelper_freeinfo() discards them if they haven't been used. A
special cred function (prepare_usermodeinfo_creds()) is provided
specifically for call_usermodehelper_setup() to call.
call_usermodehelper_setkeys() adjusts the credentials to sport the
supplied keyring as the new session keyring.
(11) SELinux.
SELinux has a number of changes, in addition to those to support the LSM
interface changes mentioned above:
(a) selinux_setprocattr() no longer does its check for whether the
current ptracer can access processes with the new SID inside the lock
that covers getting the ptracer's SID. Whilst this lock ensures that
the check is done with the ptracer pinned, the result is only valid
until the lock is released, so there's no point doing it inside the
lock.
(12) is_single_threaded().
This function has been extracted from selinux_setprocattr() and put into
a file of its own in the lib/ directory as join_session_keyring() now
wants to use it too.
The code in SELinux just checked to see whether a task shared mm_structs
with other tasks (CLONE_VM), but that isn't good enough. We really want
to know if they're part of the same thread group (CLONE_THREAD).
(13) nfsd.
The NFS server daemon now has to use the COW credentials to set the
credentials it is going to use. It really needs to pass the credentials
down to the functions it calls, but it can't do that until other patches
in this series have been applied.
Signed-off-by: David Howells <dhowells@redhat.com>
Acked-by: James Morris <jmorris@namei.org>
Signed-off-by: James Morris <jmorris@namei.org>
2008-11-14 07:39:23 +08:00
|
|
|
put_cred(cred);
|
2005-04-17 06:20:36 +08:00
|
|
|
fail:
|
2009-01-07 06:42:48 +08:00
|
|
|
return;
|
2005-04-17 06:20:36 +08:00
|
|
|
}
|
2010-10-15 05:32:06 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Core dumping helper functions. These are the only things you should
|
|
|
|
* do on a core-file: use only these functions to write out all the
|
|
|
|
* necessary info.
|
|
|
|
*/
|
|
|
|
int dump_write(struct file *file, const void *addr, int nr)
|
|
|
|
{
|
|
|
|
return access_ok(VERIFY_READ, addr, nr) && file->f_op->write(file, addr, nr, &file->f_pos) == nr;
|
|
|
|
}
|
2010-10-15 10:15:28 +08:00
|
|
|
EXPORT_SYMBOL(dump_write);
|
2010-10-15 05:32:06 +08:00
|
|
|
|
|
|
|
int dump_seek(struct file *file, loff_t off)
|
|
|
|
{
|
|
|
|
int ret = 1;
|
|
|
|
|
|
|
|
if (file->f_op->llseek && file->f_op->llseek != no_llseek) {
|
|
|
|
if (file->f_op->llseek(file, off, SEEK_CUR) < 0)
|
|
|
|
return 0;
|
|
|
|
} else {
|
|
|
|
char *buf = (char *)get_zeroed_page(GFP_KERNEL);
|
|
|
|
|
|
|
|
if (!buf)
|
|
|
|
return 0;
|
|
|
|
while (off > 0) {
|
|
|
|
unsigned long n = off;
|
|
|
|
|
|
|
|
if (n > PAGE_SIZE)
|
|
|
|
n = PAGE_SIZE;
|
|
|
|
if (!dump_write(file, buf, n)) {
|
|
|
|
ret = 0;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
off -= n;
|
|
|
|
}
|
|
|
|
free_page((unsigned long)buf);
|
|
|
|
}
|
|
|
|
return ret;
|
|
|
|
}
|
2010-10-15 10:15:28 +08:00
|
|
|
EXPORT_SYMBOL(dump_seek);
|