/*
 * syscalls.h - Linux syscall interfaces (non-arch-specific)
 *
 * Copyright (c) 2004 Randy Dunlap
 * Copyright (c) 2004 Open Source Development Labs
 *
 * This file is released under the GPLv2.
 * See the file COPYING for more details.
 */

#ifndef _LINUX_SYSCALLS_H
#define _LINUX_SYSCALLS_H

struct epoll_event;
struct iattr;
struct inode;
struct iocb;
struct io_event;
struct iovec;
struct itimerspec;
struct itimerval;
struct kexec_segment;
struct linux_dirent;
struct linux_dirent64;
struct list_head;
struct mmap_arg_struct;
struct msgbuf;
struct user_msghdr;
struct mmsghdr;
struct msqid_ds;
struct new_utsname;
struct nfsctl_arg;
struct __old_kernel_stat;
struct oldold_utsname;
struct old_utsname;
struct pollfd;
struct rlimit;
struct rlimit64;
struct rusage;
struct sched_param;
struct sched_attr;
struct sel_arg_struct;
struct semaphore;
struct sembuf;
struct shmid_ds;
struct sockaddr;
struct stat;
struct stat64;
struct statfs;
struct statfs64;
struct statx;
struct __sysctl_args;
struct sysinfo;
struct timespec;
struct timeval;
struct timex;
struct timezone;
struct tms;
struct utimbuf;
struct mq_attr;
struct compat_stat;
struct compat_timeval;
struct robust_list_head;
struct getcpu_cache;
struct old_linux_dirent;
struct perf_event_attr;
struct file_handle;
struct sigaltstack;
union bpf_attr;

#include <linux/types.h>
#include <linux/aio_abi.h>
#include <linux/capability.h>
#include <linux/signal.h>
#include <linux/list.h>
#include <linux/bug.h>
#include <linux/sem.h>
#include <asm/siginfo.h>
#include <linux/unistd.h>
#include <linux/quota.h>
#include <linux/key.h>
#include <trace/syscall.h>

/*
 * __MAP - apply a macro to syscall arguments
 * __MAP(n, m, t1, a1, t2, a2, ..., tn, an) will expand to
 *    m(t1, a1), m(t2, a2), ..., m(tn, an)
 * The first argument must be equal to the amount of type/name
 * pairs given.  Note that this list of pairs (i.e. the arguments
 * of __MAP starting at the third one) is in the same format as
 * for SYSCALL_DEFINE<n>/COMPAT_SYSCALL_DEFINE<n>
 */
#define __MAP0(m,...)
#define __MAP1(m,t,a) m(t,a)
#define __MAP2(m,t,a,...) m(t,a), __MAP1(m,__VA_ARGS__)
#define __MAP3(m,t,a,...) m(t,a), __MAP2(m,__VA_ARGS__)
#define __MAP4(m,t,a,...) m(t,a), __MAP3(m,__VA_ARGS__)
#define __MAP5(m,t,a,...) m(t,a), __MAP4(m,__VA_ARGS__)
#define __MAP6(m,t,a,...) m(t,a), __MAP5(m,__VA_ARGS__)
#define __MAP(n,...) __MAP##n(__VA_ARGS__)
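
/*
 * For example, __MAP(2, __SC_DECL, int, fd, char __user *, buf)
 * expands to:
 *
 *	int fd, char __user * buf
 *
 * i.e. the macro given as the second argument is applied to each
 * type/name pair in turn.
 */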

#define __SC_DECL(t, a)	t a
#define __TYPE_AS(t, v)	__same_type((__force t)0, v)
#define __TYPE_IS_L(t)	(__TYPE_AS(t, 0L))
#define __TYPE_IS_UL(t)	(__TYPE_AS(t, 0UL))
#define __TYPE_IS_LL(t) (__TYPE_AS(t, 0LL) || __TYPE_AS(t, 0ULL))
#define __SC_LONG(t, a) __typeof(__builtin_choose_expr(__TYPE_IS_LL(t), 0LL, 0L)) a
#define __SC_CAST(t, a)	(__force t) a
#define __SC_ARGS(t, a)	a
#define __SC_TEST(t, a) (void)BUILD_BUG_ON_ZERO(!__TYPE_IS_LL(t) && sizeof(t) > sizeof(long))
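
/*
 * For example, __SC_LONG(int, fd) declares "long fd" while
 * __SC_LONG(loff_t, offset) declares "long long offset", and
 * __SC_CAST(int, fd) restores the real type as "(__force int) fd".
 * This lets the wrapper in __SYSCALL_DEFINEx() below take every
 * argument as a (long) long, and __SC_TEST() makes the build fail
 * for any argument type that is wider than a register but is not
 * explicitly a (long) long.
 */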

#ifdef CONFIG_FTRACE_SYSCALLS
#define __SC_STR_ADECL(t, a)	#a
#define __SC_STR_TDECL(t, a)	#t

extern struct trace_event_class event_class_syscall_enter;
extern struct trace_event_class event_class_syscall_exit;
extern struct trace_event_functions enter_syscall_print_funcs;
extern struct trace_event_functions exit_syscall_print_funcs;

#define SYSCALL_TRACE_ENTER_EVENT(sname)				\
	static struct syscall_metadata __syscall_meta_##sname;		\
	static struct trace_event_call __used				\
	  event_enter_##sname = {					\
		.class			= &event_class_syscall_enter,	\
		{							\
			.name		= "sys_enter"#sname,		\
		},							\
		.event.funcs		= &enter_syscall_print_funcs,	\
		.data			= (void *)&__syscall_meta_##sname,\
		.flags			= TRACE_EVENT_FL_CAP_ANY,	\
	};								\
	static struct trace_event_call __used				\
__attribute__((section("_ftrace_events"))) \
|
2011-01-26 16:49:00 +08:00
|
|
|
*__event_enter_##sname = &event_enter_##sname;
|
2009-08-11 04:52:47 +08:00
|
|
|
|
|
|
|
#define SYSCALL_TRACE_EXIT_EVENT(sname) \
	static struct syscall_metadata __syscall_meta_##sname;		\
	static struct trace_event_call __used				\
	  event_exit_##sname = {					\
		.class			= &event_class_syscall_exit,	\
		{							\
			.name		= "sys_exit"#sname,		\
		},							\
		.event.funcs		= &exit_syscall_print_funcs,	\
		.data			= (void *)&__syscall_meta_##sname,\
		.flags			= TRACE_EVENT_FL_CAP_ANY,	\
	};								\
	static struct trace_event_call __used				\
__attribute__((section("_ftrace_events"))) \
|
2011-01-26 16:49:00 +08:00
|
|
|
*__event_exit_##sname = &event_exit_##sname;
|
2009-08-11 04:52:47 +08:00
|
|
|
|
2013-03-06 04:36:40 +08:00
|
|
|
#define SYSCALL_METADATA(sname, nb, ...) \
	static const char *types_##sname[] = {			\
		__MAP(nb,__SC_STR_TDECL,__VA_ARGS__)		\
	};							\
	static const char *args_##sname[] = {			\
		__MAP(nb,__SC_STR_ADECL,__VA_ARGS__)		\
	};							\
	SYSCALL_TRACE_ENTER_EVENT(sname);			\
	SYSCALL_TRACE_EXIT_EVENT(sname);			\
	static struct syscall_metadata __used			\
	  __syscall_meta_##sname = {				\
		.name		= "sys"#sname,			\
		.syscall_nr	= -1,	/* Filled in at boot */	\
		.nb_args	= nb,				\
		.types		= nb ? types_##sname : NULL,	\
		.args		= nb ? args_##sname : NULL,	\
		.enter_event	= &event_enter_##sname,		\
		.exit_event	= &event_exit_##sname,		\
		.enter_fields	= LIST_HEAD_INIT(__syscall_meta_##sname.enter_fields), \
	};							\
	static struct syscall_metadata __used			\
	  __attribute__((section("__syscalls_metadata")))	\
	 *__p_syscall_meta_##sname = &__syscall_meta_##sname;

static inline int is_syscall_trace_event(struct trace_event_call *tp_event)
{
	return tp_event->class == &event_class_syscall_enter ||
	       tp_event->class == &event_class_syscall_exit;
}

#else
#define SYSCALL_METADATA(sname, nb, ...)

static inline int is_syscall_trace_event(struct trace_event_call *tp_event)
{
	return 0;
}
#endif
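
/*
 * As a rough sketch (assuming CONFIG_FTRACE_SYSCALLS is set),
 * SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf, size_t, count)
 * invokes SYSCALL_METADATA(_read, 3, ...), which emits approximately:
 *
 *	static const char *types__read[] = { "unsigned int", "char __user *", "size_t" };
 *	static const char *args__read[]  = { "fd", "buf", "count" };
 *	SYSCALL_TRACE_ENTER_EVENT(_read);
 *	SYSCALL_TRACE_EXIT_EVENT(_read);
 *	static struct syscall_metadata __used __syscall_meta__read = {
 *		.name = "sys_read", .syscall_nr = -1, .nb_args = 3, ...
 *	};
 *
 * so each syscall gets enter/exit trace events plus a metadata record
 * whose address is placed in the __syscalls_metadata section.
 */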

#define SYSCALL_DEFINE0(sname)					\
	SYSCALL_METADATA(_##sname, 0);				\
	asmlinkage long sys_##sname(void)

#define SYSCALL_DEFINE1(name, ...) SYSCALL_DEFINEx(1, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE2(name, ...) SYSCALL_DEFINEx(2, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE3(name, ...) SYSCALL_DEFINEx(3, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE4(name, ...) SYSCALL_DEFINEx(4, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE5(name, ...) SYSCALL_DEFINEx(5, _##name, __VA_ARGS__)
#define SYSCALL_DEFINE6(name, ...) SYSCALL_DEFINEx(6, _##name, __VA_ARGS__)

#define SYSCALL_DEFINE_MAXARGS	6
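
/*
 * For illustration (a sketch, not a definition from this file): a
 * three-argument syscall is implemented in its .c file as
 *
 *	SYSCALL_DEFINE3(read, unsigned int, fd, char __user *, buf,
 *			size_t, count)
 *	{
 *		...
 *	}
 *
 * which expands through SYSCALL_DEFINEx() below into the sys_read()
 * entry point declared alongside the other sys_*() prototypes.
 */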

#define SYSCALL_DEFINEx(x, sname, ...)				\
	SYSCALL_METADATA(sname, x, __VA_ARGS__)			\
	__SYSCALL_DEFINEx(x, sname, __VA_ARGS__)

#define __PROTECT(...) asmlinkage_protect(__VA_ARGS__)

#define __SYSCALL_DEFINEx(x, name, ...)					\
	asmlinkage long sys##name(__MAP(x,__SC_DECL,__VA_ARGS__))	\
		__attribute__((alias(__stringify(SyS##name))));		\
	static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__));	\
	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__));	\
	asmlinkage long SyS##name(__MAP(x,__SC_LONG,__VA_ARGS__))	\
	{								\
		long ret = SYSC##name(__MAP(x,__SC_CAST,__VA_ARGS__));	\
		__MAP(x,__SC_TEST,__VA_ARGS__);				\
		__PROTECT(x, ret,__MAP(x,__SC_ARGS,__VA_ARGS__));	\
		return ret;						\
	}								\
	static inline long SYSC##name(__MAP(x,__SC_DECL,__VA_ARGS__))
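
/*
 * As a rough sketch (metadata generation and __SC_TEST omitted),
 * SYSCALL_DEFINE1(close, unsigned int, fd) expands to approximately:
 *
 *	asmlinkage long sys_close(unsigned int fd)
 *		__attribute__((alias("SyS_close")));
 *	static inline long SYSC_close(unsigned int fd);
 *	asmlinkage long SyS_close(long fd);
 *	asmlinkage long SyS_close(long fd)
 *	{
 *		long ret = SYSC_close((unsigned int) fd);
 *		__PROTECT(1, ret, fd);
 *		return ret;
 *	}
 *	static inline long SYSC_close(unsigned int fd)
 *
 * The body that follows the macro becomes SYSC_close(); the exported
 * SyS_close() wrapper takes each argument as a long and casts it back,
 * so narrow values are always correctly sign- or zero-extended.
 */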

/*
 * Called before coming back to user-mode. Returning to user-mode with an
 * address limit different than USER_DS can allow to overwrite kernel memory.
 */
static inline void addr_limit_user_check(void)
{
#ifdef TIF_FSCHECK
	if (!test_thread_flag(TIF_FSCHECK))
		return;
#endif

	if (CHECK_DATA_CORRUPTION(!segment_eq(get_fs(), USER_DS),
				  "Invalid address limit on user-mode return"))
		force_sig(SIGKILL, current);

#ifdef TIF_FSCHECK
	clear_thread_flag(TIF_FSCHECK);
#endif
}

asmlinkage long sys_time(time_t __user *tloc);
asmlinkage long sys_stime(time_t __user *tptr);
asmlinkage long sys_gettimeofday(struct timeval __user *tv,
				struct timezone __user *tz);
asmlinkage long sys_settimeofday(struct timeval __user *tv,
				struct timezone __user *tz);
asmlinkage long sys_adjtimex(struct timex __user *txc_p);

asmlinkage long sys_times(struct tms __user *tbuf);

asmlinkage long sys_gettid(void);
asmlinkage long sys_nanosleep(struct timespec __user *rqtp, struct timespec __user *rmtp);
asmlinkage long sys_alarm(unsigned int seconds);
asmlinkage long sys_getpid(void);
asmlinkage long sys_getppid(void);
asmlinkage long sys_getuid(void);
asmlinkage long sys_geteuid(void);
asmlinkage long sys_getgid(void);
asmlinkage long sys_getegid(void);
asmlinkage long sys_getresuid(uid_t __user *ruid, uid_t __user *euid, uid_t __user *suid);
asmlinkage long sys_getresgid(gid_t __user *rgid, gid_t __user *egid, gid_t __user *sgid);
asmlinkage long sys_getpgid(pid_t pid);
asmlinkage long sys_getpgrp(void);
asmlinkage long sys_getsid(pid_t pid);
asmlinkage long sys_getgroups(int gidsetsize, gid_t __user *grouplist);

asmlinkage long sys_setregid(gid_t rgid, gid_t egid);
asmlinkage long sys_setgid(gid_t gid);
asmlinkage long sys_setreuid(uid_t ruid, uid_t euid);
asmlinkage long sys_setuid(uid_t uid);
asmlinkage long sys_setresuid(uid_t ruid, uid_t euid, uid_t suid);
asmlinkage long sys_setresgid(gid_t rgid, gid_t egid, gid_t sgid);
asmlinkage long sys_setfsuid(uid_t uid);
asmlinkage long sys_setfsgid(gid_t gid);
asmlinkage long sys_setpgid(pid_t pid, pid_t pgid);
asmlinkage long sys_setsid(void);
asmlinkage long sys_setgroups(int gidsetsize, gid_t __user *grouplist);

asmlinkage long sys_acct(const char __user *name);
asmlinkage long sys_capget(cap_user_header_t header,
				cap_user_data_t dataptr);
asmlinkage long sys_capset(cap_user_header_t header,
				const cap_user_data_t data);
asmlinkage long sys_personality(unsigned int personality);

asmlinkage long sys_sigpending(old_sigset_t __user *uset);
asmlinkage long sys_sigprocmask(int how, old_sigset_t __user *set,
				old_sigset_t __user *oset);
asmlinkage long sys_sigaltstack(const struct sigaltstack __user *uss,
				struct sigaltstack __user *uoss);

asmlinkage long sys_getitimer(int which, struct itimerval __user *value);
asmlinkage long sys_setitimer(int which,
				struct itimerval __user *value,
				struct itimerval __user *ovalue);
asmlinkage long sys_timer_create(clockid_t which_clock,
				 struct sigevent __user *timer_event_spec,
				 timer_t __user * created_timer_id);
asmlinkage long sys_timer_gettime(timer_t timer_id,
				struct itimerspec __user *setting);
asmlinkage long sys_timer_getoverrun(timer_t timer_id);
asmlinkage long sys_timer_settime(timer_t timer_id, int flags,
				const struct itimerspec __user *new_setting,
				struct itimerspec __user *old_setting);
asmlinkage long sys_timer_delete(timer_t timer_id);
asmlinkage long sys_clock_settime(clockid_t which_clock,
				const struct timespec __user *tp);
asmlinkage long sys_clock_gettime(clockid_t which_clock,
				struct timespec __user *tp);
asmlinkage long sys_clock_adjtime(clockid_t which_clock,
				struct timex __user *tx);
asmlinkage long sys_clock_getres(clockid_t which_clock,
				struct timespec __user *tp);
asmlinkage long sys_clock_nanosleep(clockid_t which_clock, int flags,
				const struct timespec __user *rqtp,
				struct timespec __user *rmtp);

asmlinkage long sys_nice(int increment);
asmlinkage long sys_sched_setscheduler(pid_t pid, int policy,
					struct sched_param __user *param);
asmlinkage long sys_sched_setparam(pid_t pid,
					struct sched_param __user *param);
asmlinkage long sys_sched_setattr(pid_t pid,
					struct sched_attr __user *attr,
					unsigned int flags);
asmlinkage long sys_sched_getscheduler(pid_t pid);
asmlinkage long sys_sched_getparam(pid_t pid,
					struct sched_param __user *param);
asmlinkage long sys_sched_getattr(pid_t pid,
					struct sched_attr __user *attr,
					unsigned int size,
					unsigned int flags);
asmlinkage long sys_sched_setaffinity(pid_t pid, unsigned int len,
					unsigned long __user *user_mask_ptr);
asmlinkage long sys_sched_getaffinity(pid_t pid, unsigned int len,
					unsigned long __user *user_mask_ptr);
asmlinkage long sys_sched_yield(void);
asmlinkage long sys_sched_get_priority_max(int policy);
asmlinkage long sys_sched_get_priority_min(int policy);
asmlinkage long sys_sched_rr_get_interval(pid_t pid,
					struct timespec __user *interval);
asmlinkage long sys_setpriority(int which, int who, int niceval);
asmlinkage long sys_getpriority(int which, int who);

asmlinkage long sys_shutdown(int, int);
asmlinkage long sys_reboot(int magic1, int magic2, unsigned int cmd,
				void __user *arg);
asmlinkage long sys_restart_syscall(void);
asmlinkage long sys_kexec_load(unsigned long entry, unsigned long nr_segments,
				struct kexec_segment __user *segments,
				unsigned long flags);
asmlinkage long sys_kexec_file_load(int kernel_fd, int initrd_fd,
				    unsigned long cmdline_len,
				    const char __user *cmdline_ptr,
				    unsigned long flags);

asmlinkage long sys_exit(int error_code);
asmlinkage long sys_exit_group(int error_code);
asmlinkage long sys_wait4(pid_t pid, int __user *stat_addr,
				int options, struct rusage __user *ru);
asmlinkage long sys_waitid(int which, pid_t pid,
			   struct siginfo __user *infop,
			   int options, struct rusage __user *ru);
asmlinkage long sys_waitpid(pid_t pid, int __user *stat_addr, int options);
asmlinkage long sys_set_tid_address(int __user *tidptr);
|
[PATCH] pi-futex: futex code cleanups
We are pleased to announce "lightweight userspace priority inheritance" (PI)
support for futexes. The following patchset and glibc patch implements it,
ontop of the robust-futexes patchset which is included in 2.6.16-mm1.
We are calling it lightweight for 3 reasons:
- in the user-space fastpath a PI-enabled futex involves no kernel work
(or any other PI complexity) at all. No registration, no extra kernel
calls - just pure fast atomic ops in userspace.
- in the slowpath (in the lock-contention case), the system call and
scheduling pattern is in fact better than that of normal futexes, due to
the 'integrated' nature of FUTEX_LOCK_PI. [more about that further down]
- the in-kernel PI implementation is streamlined around the mutex
abstraction, with strict rules that keep the implementation relatively
simple: only a single owner may own a lock (i.e. no read-write lock
support), only the owner may unlock a lock, no recursive locking, etc.
Priority Inheritance - why, oh why???
-------------------------------------
Many of you heard the horror stories about the evil PI code circling Linux for
years, which makes no real sense at all and is only used by buggy applications
and which has horrible overhead. Some of you have dreaded this very moment,
when someone actually submits working PI code ;-)
So why would we like to see PI support for futexes?
We'd like to see it done purely for technological reasons. We dont think it's
a buggy concept, we think it's useful functionality to offer to applications,
which functionality cannot be achieved in other ways. We also think it's the
right thing to do, and we think we've got the right arguments and the right
numbers to prove that. We also believe that we can address all the
counter-arguments as well. For these reasons (and the reasons outlined below)
we are submitting this patch-set for upstream kernel inclusion.
What are the benefits of PI?
The short reply:
----------------
User-space PI helps achieving/improving determinism for user-space
applications. In the best-case, it can help achieve determinism and
well-bound latencies. Even in the worst-case, PI will improve the statistical
distribution of locking related application delays.
The longer reply:
-----------------
Firstly, sharing locks between multiple tasks is a common programming
technique that often cannot be replaced with lockless algorithms. As we can
see it in the kernel [which is a quite complex program in itself], lockless
structures are rather the exception than the norm - the current ratio of
lockless vs. locky code for shared data structures is somewhere between 1:10
and 1:100. Lockless is hard, and the complexity of lockless algorithms often
endangers to ability to do robust reviews of said code. I.e. critical RT
apps often choose lock structures to protect critical data structures, instead
of lockless algorithms. Furthermore, there are cases (like shared hardware,
or other resource limits) where lockless access is mathematically impossible.
Media players (such as Jack) are an example of reasonable application design
with multiple tasks (with multiple priority levels) sharing short-held locks:
for example, a highprio audio playback thread is combined with medium-prio
construct-audio-data threads and low-prio display-colory-stuff threads. Add
video and decoding to the mix and we've got even more priority levels.
So once we accept that synchronization objects (locks) are an unavoidable fact
of life, and once we accept that multi-task userspace apps have a very fair
expectation of being able to use locks, we've got to think about how to offer
the option of a deterministic locking implementation to user-space.
Most of the technical counter-arguments against doing priority inheritance
only apply to kernel-space locks. But user-space locks are different, there
we cannot disable interrupts or make the task non-preemptible in a critical
section, so the 'use spinlocks' argument does not apply (user-space spinlocks
have the same priority inversion problems as other user-space locking
constructs). Fact is, pretty much the only technique that currently enables
good determinism for userspace locks (such as futex-based pthread mutexes) is
priority inheritance:
Currently (without PI), if a high-prio and a low-prio task shares a lock [this
is a quite common scenario for most non-trivial RT applications], even if all
critical sections are coded carefully to be deterministic (i.e. all critical
sections are short in duration and only execute a limited number of
instructions), the kernel cannot guarantee any deterministic execution of the
high-prio task: any medium-priority task could preempt the low-prio task while
it holds the shared lock and executes the critical section, and could delay it
indefinitely.
Implementation:
---------------
As mentioned before, the userspace fastpath of PI-enabled pthread mutexes
involves no kernel work at all - they behave quite similarly to normal
futex-based locks: a 0 value means unlocked, and a value==TID means locked.
(This is the same method as used by list-based robust futexes.) Userspace uses
atomic ops to lock/unlock these mutexes without entering the kernel.
To handle the slowpath, we have added two new futex ops:
FUTEX_LOCK_PI
FUTEX_UNLOCK_PI
If the lock-acquire fastpath fails, [i.e. an atomic transition from 0 to TID
fails], then FUTEX_LOCK_PI is called. The kernel does all the remaining work:
if there is no futex-queue attached to the futex address yet then the code
looks up the task that owns the futex [it has put its own TID into the futex
value], and attaches a 'PI state' structure to the futex-queue. The pi_state
includes an rt-mutex, which is a PI-aware, kernel-based synchronization
object. The 'other' task is made the owner of the rt-mutex, and the
FUTEX_WAITERS bit is atomically set in the futex value. Then this task tries
to lock the rt-mutex, on which it blocks. Once it returns, it has the mutex
acquired, and it sets the futex value to its own TID and returns. Userspace
has no other work to perform - it now owns the lock, and futex value contains
FUTEX_WAITERS|TID.
If the unlock side fastpath succeeds, [i.e. userspace manages to do a TID ->
0 atomic transition of the futex value], then no kernel work is triggered.
If the unlock fastpath fails (because the FUTEX_WAITERS bit is set), then
FUTEX_UNLOCK_PI is called, and the kernel unlocks the futex on the behalf of
userspace - and it also unlocks the attached pi_state->rt_mutex and thus wakes
up any potential waiters.
Note that under this approach, contrary to other PI-futex approaches, there is
no prior 'registration' of a PI-futex. [which is not quite possible anyway,
due to existing ABI properties of pthread mutexes.]
Also, under this scheme, 'robustness' and 'PI' are two orthogonal properties
of futexes, and all four combinations are possible: futex, robust-futex,
PI-futex, robust+PI-futex.
glibc support:
--------------
Ulrich Drepper and Jakub Jelinek have written glibc support for PI-futexes
(and robust futexes), enabling robust and PI (PTHREAD_PRIO_INHERIT) POSIX
mutexes. (PTHREAD_PRIO_PROTECT support will be added later on too, no
additional kernel changes are needed for that). [NOTE: The glibc patch is
obviously inofficial and unsupported without matching upstream kernel
functionality.]
the patch-queue and the glibc patch can also be downloaded from:
http://redhat.com/~mingo/PI-futex-patches/
Many thanks go to the people who helped us create this kernel feature: Steven
Rostedt, Esben Nielsen, Benedikt Spranger, Daniel Walker, John Cooper, Arjan
van de Ven, Oleg Nesterov and others. Credits for related prior projects go
to Dirk Grambow, Inaky Perez-Gonzalez, Bill Huey and many others.
Clean up the futex code, before adding more features to it:
- use u32 as the futex field type - that's the ABI
- use __user and pointers to u32 instead of unsigned long
- code style / comment style cleanups
- rename the hash-bucket variable from 'bh' to 'hb'.
I checked the pre and post futex.o object files to make sure this
patch has no code effects.
Signed-off-by: Ingo Molnar <mingo@elte.hu>
Signed-off-by: Thomas Gleixner <tglx@linutronix.de>
Signed-off-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Ulrich Drepper <drepper@redhat.com>
Cc: Jakub Jelinek <jakub@redhat.com>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-27 17:54:47 +08:00
|
|
|
asmlinkage long sys_futex(u32 __user *uaddr, int op, u32 val,
|
2005-04-17 06:20:36 +08:00
|
|
|
struct timespec __user *utime, u32 __user *uaddr2,
|
2006-06-27 17:54:47 +08:00
|
|
|
u32 val3);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
asmlinkage long sys_init_module(void __user *umod, unsigned long len,
|
|
|
|
const char __user *uargs);
|
|
|
|
asmlinkage long sys_delete_module(const char __user *name_user,
|
|
|
|
unsigned int flags);
|
|
|
|
|
2012-12-26 05:04:12 +08:00
|
|
|
#ifdef CONFIG_OLD_SIGSUSPEND
|
|
|
|
asmlinkage long sys_sigsuspend(old_sigset_t mask);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
#ifdef CONFIG_OLD_SIGSUSPEND3
|
|
|
|
asmlinkage long sys_sigsuspend(int unused1, int unused2, old_sigset_t mask);
|
|
|
|
#endif
|
|
|
|
|
2012-11-25 15:30:08 +08:00
|
|
|
asmlinkage long sys_rt_sigsuspend(sigset_t __user *unewset, size_t sigsetsize);
|
|
|
|
|
2012-12-26 08:09:45 +08:00
|
|
|
#ifdef CONFIG_OLD_SIGACTION
|
|
|
|
asmlinkage long sys_sigaction(int, const struct old_sigaction __user *,
|
|
|
|
struct old_sigaction __user *);
|
|
|
|
#endif
|
|
|
|
|
2012-11-26 12:12:10 +08:00
|
|
|
#ifndef CONFIG_ODD_RT_SIGACTION
|
|
|
|
asmlinkage long sys_rt_sigaction(int,
|
|
|
|
const struct sigaction __user *,
|
|
|
|
struct sigaction __user *,
|
|
|
|
size_t);
|
|
|
|
#endif
|
2005-04-17 06:20:36 +08:00
|
|
|
asmlinkage long sys_rt_sigprocmask(int how, sigset_t __user *set,
|
|
|
|
sigset_t __user *oset, size_t sigsetsize);
|
|
|
|
asmlinkage long sys_rt_sigpending(sigset_t __user *set, size_t sigsetsize);
|
|
|
|
asmlinkage long sys_rt_sigtimedwait(const sigset_t __user *uthese,
|
|
|
|
siginfo_t __user *uinfo,
|
|
|
|
const struct timespec __user *uts,
|
|
|
|
size_t sigsetsize);
|
2009-07-07 16:23:43 +08:00
|
|
|
asmlinkage long sys_rt_tgsigqueueinfo(pid_t tgid, pid_t pid, int sig,
|
|
|
|
siginfo_t __user *uinfo);
|
2016-05-21 08:00:30 +08:00
|
|
|
asmlinkage long sys_kill(pid_t pid, int sig);
|
|
|
|
asmlinkage long sys_tgkill(pid_t tgid, pid_t pid, int sig);
|
|
|
|
asmlinkage long sys_tkill(pid_t pid, int sig);
|
|
|
|
asmlinkage long sys_rt_sigqueueinfo(pid_t pid, int sig, siginfo_t __user *uinfo);
|
2005-04-17 06:20:36 +08:00
|
|
|
asmlinkage long sys_sgetmask(void);
|
|
|
|
asmlinkage long sys_ssetmask(int newmask);
|
2009-01-14 21:13:54 +08:00
|
|
|
asmlinkage long sys_signal(int sig, __sighandler_t handler);
|
2005-04-17 06:20:36 +08:00
|
|
|
asmlinkage long sys_pause(void);
|
|
|
|
|
|
|
|
asmlinkage long sys_sync(void);
|
|
|
|
asmlinkage long sys_fsync(unsigned int fd);
|
|
|
|
asmlinkage long sys_fdatasync(unsigned int fd);
|
|
|
|
asmlinkage long sys_bdflush(int func, long data);
|
|
|
|
asmlinkage long sys_mount(char __user *dev_name, char __user *dir_name,
|
|
|
|
char __user *type, unsigned long flags,
|
|
|
|
void __user *data);
|
|
|
|
asmlinkage long sys_umount(char __user *name, int flags);
|
|
|
|
asmlinkage long sys_oldumount(char __user *name);
|
2009-09-23 23:49:55 +08:00
|
|
|
asmlinkage long sys_truncate(const char __user *path, long length);
|
2005-04-17 06:20:36 +08:00
|
|
|
asmlinkage long sys_ftruncate(unsigned int fd, unsigned long length);
|
2010-08-11 18:26:22 +08:00
|
|
|
asmlinkage long sys_stat(const char __user *filename,
|
2005-04-17 06:20:36 +08:00
|
|
|
struct __old_kernel_stat __user *statbuf);
|
|
|
|
asmlinkage long sys_statfs(const char __user * path,
|
|
|
|
struct statfs __user *buf);
|
|
|
|
asmlinkage long sys_statfs64(const char __user *path, size_t sz,
|
|
|
|
struct statfs64 __user *buf);
|
|
|
|
asmlinkage long sys_fstatfs(unsigned int fd, struct statfs __user *buf);
|
|
|
|
asmlinkage long sys_fstatfs64(unsigned int fd, size_t sz,
|
|
|
|
struct statfs64 __user *buf);
|
2010-08-11 18:26:22 +08:00
|
|
|
asmlinkage long sys_lstat(const char __user *filename,
|
2005-04-17 06:20:36 +08:00
|
|
|
struct __old_kernel_stat __user *statbuf);
|
|
|
|
asmlinkage long sys_fstat(unsigned int fd,
|
|
|
|
struct __old_kernel_stat __user *statbuf);
|
2010-08-11 18:26:22 +08:00
|
|
|
asmlinkage long sys_newstat(const char __user *filename,
|
2005-04-17 06:20:36 +08:00
|
|
|
struct stat __user *statbuf);
|
2010-08-11 18:26:22 +08:00
|
|
|
asmlinkage long sys_newlstat(const char __user *filename,
|
2005-04-17 06:20:36 +08:00
|
|
|
struct stat __user *statbuf);
|
|
|
|
asmlinkage long sys_newfstat(unsigned int fd, struct stat __user *statbuf);
|
|
|
|
asmlinkage long sys_ustat(unsigned dev, struct ustat __user *ubuf);
|
2015-01-07 00:52:38 +08:00
|
|
|
#if defined(__ARCH_WANT_STAT64) || defined(__ARCH_WANT_COMPAT_STAT64)
|
2010-08-11 18:26:22 +08:00
|
|
|
asmlinkage long sys_stat64(const char __user *filename,
|
2005-04-17 06:20:36 +08:00
|
|
|
struct stat64 __user *statbuf);
|
|
|
|
asmlinkage long sys_fstat64(unsigned long fd, struct stat64 __user *statbuf);
|
2010-08-11 18:26:22 +08:00
|
|
|
asmlinkage long sys_lstat64(const char __user *filename,
|
2005-04-17 06:20:36 +08:00
|
|
|
struct stat64 __user *statbuf);
|
2015-01-07 00:52:38 +08:00
|
|
|
asmlinkage long sys_fstatat64(int dfd, const char __user *filename,
|
|
|
|
struct stat64 __user *statbuf, int flag);
|
|
|
|
#endif
|
|
|
|
#if BITS_PER_LONG == 32
|
2005-04-17 06:20:36 +08:00
|
|
|
asmlinkage long sys_truncate64(const char __user *path, loff_t length);
|
|
|
|
asmlinkage long sys_ftruncate64(unsigned int fd, loff_t length);
|
|
|
|
#endif
|
|
|
|
|
2008-04-29 15:59:41 +08:00
|
|
|
asmlinkage long sys_setxattr(const char __user *path, const char __user *name,
|
|
|
|
const void __user *value, size_t size, int flags);
|
|
|
|
asmlinkage long sys_lsetxattr(const char __user *path, const char __user *name,
|
|
|
|
const void __user *value, size_t size, int flags);
|
|
|
|
asmlinkage long sys_fsetxattr(int fd, const char __user *name,
|
|
|
|
const void __user *value, size_t size, int flags);
|
2009-01-14 21:13:54 +08:00
|
|
|
asmlinkage long sys_getxattr(const char __user *path, const char __user *name,
|
|
|
|
void __user *value, size_t size);
|
|
|
|
asmlinkage long sys_lgetxattr(const char __user *path, const char __user *name,
|
|
|
|
void __user *value, size_t size);
|
|
|
|
asmlinkage long sys_fgetxattr(int fd, const char __user *name,
|
|
|
|
void __user *value, size_t size);
|
|
|
|
asmlinkage long sys_listxattr(const char __user *path, char __user *list,
|
|
|
|
size_t size);
|
|
|
|
asmlinkage long sys_llistxattr(const char __user *path, char __user *list,
|
|
|
|
size_t size);
|
|
|
|
asmlinkage long sys_flistxattr(int fd, char __user *list, size_t size);
|
2008-04-29 15:59:41 +08:00
|
|
|
asmlinkage long sys_removexattr(const char __user *path,
|
|
|
|
const char __user *name);
|
|
|
|
asmlinkage long sys_lremovexattr(const char __user *path,
|
|
|
|
const char __user *name);
|
|
|
|
asmlinkage long sys_fremovexattr(int fd, const char __user *name);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2009-01-14 21:13:54 +08:00
|
|
|
asmlinkage long sys_brk(unsigned long brk);
|
2005-04-17 06:20:36 +08:00
|
|
|
asmlinkage long sys_mprotect(unsigned long start, size_t len,
|
|
|
|
unsigned long prot);
|
2009-01-14 21:13:54 +08:00
|
|
|
asmlinkage long sys_mremap(unsigned long addr,
|
|
|
|
unsigned long old_len, unsigned long new_len,
|
|
|
|
unsigned long flags, unsigned long new_addr);
|
2005-04-17 06:20:36 +08:00
|
|
|
asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size,
|
|
|
|
unsigned long prot, unsigned long pgoff,
|
|
|
|
unsigned long flags);
|
|
|
|
asmlinkage long sys_msync(unsigned long start, size_t len, int flags);
|
|
|
|
asmlinkage long sys_fadvise64(int fd, loff_t offset, size_t len, int advice);
|
|
|
|
asmlinkage long sys_fadvise64_64(int fd, loff_t offset, loff_t len, int advice);
|
|
|
|
asmlinkage long sys_munmap(unsigned long addr, size_t len);
|
|
|
|
asmlinkage long sys_mlock(unsigned long start, size_t len);
|
|
|
|
asmlinkage long sys_munlock(unsigned long start, size_t len);
|
|
|
|
asmlinkage long sys_mlockall(int flags);
|
|
|
|
asmlinkage long sys_munlockall(void);
|
|
|
|
asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior);
|
|
|
|
asmlinkage long sys_mincore(unsigned long start, size_t len,
|
|
|
|
unsigned char __user * vec);
|
|
|
|
|
|
|
|
asmlinkage long sys_pivot_root(const char __user *new_root,
|
|
|
|
const char __user *put_old);
|
|
|
|
asmlinkage long sys_chroot(const char __user *filename);
|
2011-07-26 05:32:17 +08:00
|
|
|
asmlinkage long sys_mknod(const char __user *filename, umode_t mode,
|
2005-04-17 06:20:36 +08:00
|
|
|
unsigned dev);
|
|
|
|
asmlinkage long sys_link(const char __user *oldname,
|
|
|
|
const char __user *newname);
|
|
|
|
asmlinkage long sys_symlink(const char __user *old, const char __user *new);
|
|
|
|
asmlinkage long sys_unlink(const char __user *pathname);
|
|
|
|
asmlinkage long sys_rename(const char __user *oldname,
|
|
|
|
const char __user *newname);
|
2011-07-26 16:22:01 +08:00
|
|
|
asmlinkage long sys_chmod(const char __user *filename, umode_t mode);
|
|
|
|
asmlinkage long sys_fchmod(unsigned int fd, umode_t mode);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
asmlinkage long sys_fcntl(unsigned int fd, unsigned int cmd, unsigned long arg);
|
|
|
|
#if BITS_PER_LONG == 32
|
|
|
|
asmlinkage long sys_fcntl64(unsigned int fd,
|
|
|
|
unsigned int cmd, unsigned long arg);
|
|
|
|
#endif
|
2009-06-17 06:33:49 +08:00
|
|
|
asmlinkage long sys_pipe(int __user *fildes);
|
2009-05-13 04:19:37 +08:00
|
|
|
asmlinkage long sys_pipe2(int __user *fildes, int flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
asmlinkage long sys_dup(unsigned int fildes);
|
|
|
|
asmlinkage long sys_dup2(unsigned int oldfd, unsigned int newfd);
|
2008-07-24 12:29:29 +08:00
|
|
|
asmlinkage long sys_dup3(unsigned int oldfd, unsigned int newfd, int flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
asmlinkage long sys_ioperm(unsigned long from, unsigned long num, int on);
|
|
|
|
asmlinkage long sys_ioctl(unsigned int fd, unsigned int cmd,
|
|
|
|
unsigned long arg);
|
|
|
|
asmlinkage long sys_flock(unsigned int fd, unsigned int cmd);
|
|
|
|
asmlinkage long sys_io_setup(unsigned nr_reqs, aio_context_t __user *ctx);
|
|
|
|
asmlinkage long sys_io_destroy(aio_context_t ctx);
|
|
|
|
asmlinkage long sys_io_getevents(aio_context_t ctx_id,
|
|
|
|
long min_nr,
|
|
|
|
long nr,
|
|
|
|
struct io_event __user *events,
|
|
|
|
struct timespec __user *timeout);
|
|
|
|
asmlinkage long sys_io_submit(aio_context_t, long,
|
|
|
|
struct iocb __user * __user *);
|
|
|
|
asmlinkage long sys_io_cancel(aio_context_t ctx_id, struct iocb __user *iocb,
|
|
|
|
struct io_event __user *result);
|
2009-01-14 21:13:54 +08:00
|
|
|
asmlinkage long sys_sendfile(int out_fd, int in_fd,
|
|
|
|
off_t __user *offset, size_t count);
|
|
|
|
asmlinkage long sys_sendfile64(int out_fd, int in_fd,
|
|
|
|
loff_t __user *offset, size_t count);
|
2005-04-17 06:20:36 +08:00
|
|
|
asmlinkage long sys_readlink(const char __user *path,
|
|
|
|
char __user *buf, int bufsiz);
|
2011-11-22 03:59:34 +08:00
|
|
|
asmlinkage long sys_creat(const char __user *pathname, umode_t mode);
|
2005-04-17 06:20:36 +08:00
|
|
|
asmlinkage long sys_open(const char __user *filename,
|
2011-11-22 03:59:34 +08:00
|
|
|
int flags, umode_t mode);
|
2005-04-17 06:20:36 +08:00
|
|
|
asmlinkage long sys_close(unsigned int fd);
|
|
|
|
asmlinkage long sys_access(const char __user *filename, int mode);
|
|
|
|
asmlinkage long sys_vhangup(void);
|
|
|
|
asmlinkage long sys_chown(const char __user *filename,
|
|
|
|
uid_t user, gid_t group);
|
|
|
|
asmlinkage long sys_lchown(const char __user *filename,
|
|
|
|
uid_t user, gid_t group);
|
|
|
|
asmlinkage long sys_fchown(unsigned int fd, uid_t user, gid_t group);
|
2015-11-20 19:12:21 +08:00
|
|
|
#ifdef CONFIG_HAVE_UID16
|
2005-04-17 06:20:36 +08:00
|
|
|
asmlinkage long sys_chown16(const char __user *filename,
|
|
|
|
old_uid_t user, old_gid_t group);
|
|
|
|
asmlinkage long sys_lchown16(const char __user *filename,
|
|
|
|
old_uid_t user, old_gid_t group);
|
|
|
|
asmlinkage long sys_fchown16(unsigned int fd, old_uid_t user, old_gid_t group);
|
|
|
|
asmlinkage long sys_setregid16(old_gid_t rgid, old_gid_t egid);
|
|
|
|
asmlinkage long sys_setgid16(old_gid_t gid);
|
|
|
|
asmlinkage long sys_setreuid16(old_uid_t ruid, old_uid_t euid);
|
|
|
|
asmlinkage long sys_setuid16(old_uid_t uid);
|
|
|
|
asmlinkage long sys_setresuid16(old_uid_t ruid, old_uid_t euid, old_uid_t suid);
|
|
|
|
asmlinkage long sys_getresuid16(old_uid_t __user *ruid,
|
|
|
|
old_uid_t __user *euid, old_uid_t __user *suid);
|
|
|
|
asmlinkage long sys_setresgid16(old_gid_t rgid, old_gid_t egid, old_gid_t sgid);
|
|
|
|
asmlinkage long sys_getresgid16(old_gid_t __user *rgid,
|
|
|
|
old_gid_t __user *egid, old_gid_t __user *sgid);
|
|
|
|
asmlinkage long sys_setfsuid16(old_uid_t uid);
|
|
|
|
asmlinkage long sys_setfsgid16(old_gid_t gid);
|
|
|
|
asmlinkage long sys_getgroups16(int gidsetsize, old_gid_t __user *grouplist);
|
|
|
|
asmlinkage long sys_setgroups16(int gidsetsize, old_gid_t __user *grouplist);
|
|
|
|
asmlinkage long sys_getuid16(void);
|
|
|
|
asmlinkage long sys_geteuid16(void);
|
|
|
|
asmlinkage long sys_getgid16(void);
|
|
|
|
asmlinkage long sys_getegid16(void);
|
|
|
|
#endif
|
|
|
|
|
|
|
|
asmlinkage long sys_utime(char __user *filename,
|
|
|
|
struct utimbuf __user *times);
|
|
|
|
asmlinkage long sys_utimes(char __user *filename,
|
|
|
|
struct timeval __user *utimes);
|
2009-01-14 21:13:54 +08:00
|
|
|
asmlinkage long sys_lseek(unsigned int fd, off_t offset,
|
2012-12-18 07:59:39 +08:00
|
|
|
unsigned int whence);
|
2005-04-17 06:20:36 +08:00
|
|
|
asmlinkage long sys_llseek(unsigned int fd, unsigned long offset_high,
|
|
|
|
unsigned long offset_low, loff_t __user *result,
|
2012-12-18 07:59:39 +08:00
|
|
|
unsigned int whence);
|
2009-01-14 21:13:54 +08:00
|
|
|
asmlinkage long sys_read(unsigned int fd, char __user *buf, size_t count);
|
|
|
|
asmlinkage long sys_readahead(int fd, loff_t offset, size_t count);
|
|
|
|
asmlinkage long sys_readv(unsigned long fd,
|
|
|
|
const struct iovec __user *vec,
|
|
|
|
unsigned long vlen);
|
|
|
|
asmlinkage long sys_write(unsigned int fd, const char __user *buf,
|
|
|
|
size_t count);
|
|
|
|
asmlinkage long sys_writev(unsigned long fd,
|
|
|
|
const struct iovec __user *vec,
|
|
|
|
unsigned long vlen);
|
|
|
|
asmlinkage long sys_pread64(unsigned int fd, char __user *buf,
|
|
|
|
size_t count, loff_t pos);
|
|
|
|
asmlinkage long sys_pwrite64(unsigned int fd, const char __user *buf,
|
|
|
|
size_t count, loff_t pos);
|
2009-04-03 07:59:23 +08:00
|
|
|
asmlinkage long sys_preadv(unsigned long fd, const struct iovec __user *vec,
|
Make non-compat preadv/pwritev use native register size
Instead of always splitting the file offset into 32-bit 'high' and 'low'
parts, just split it into the largest natural word-size - which in C
terms is 'unsigned long'.
This allows 64-bit architectures to avoid the unnecessary 32-bit
shifting and masking for native format (while the compat interfaces will
obviously always have to do it).
This also changes the order of 'high' and 'low' to be "low first". Why?
Because when we have it like this, the 64-bit system calls now don't use
the "pos_high" argument at all, and it makes more sense for the native
system call to simply match the user-mode prototype.
This results in a much more natural calling convention, and allows the
compiler to generate much more straightforward code. On x86-64, we now
generate
testq %rcx, %rcx # pos_l
js .L122 #,
movq %rcx, -48(%rbp) # pos_l, pos
from the C source
loff_t pos = pos_from_hilo(pos_h, pos_l);
...
if (pos < 0)
return -EINVAL;
and the 'pos_h' register isn't even touched. It used to generate code
like
mov %r8d, %r8d # pos_low, pos_low
salq $32, %rcx #, tmp71
movq %r8, %rax # pos_low, pos.386
orq %rcx, %rax # tmp71, pos.386
js .L122 #,
movq %rax, -48(%rbp) # pos.386, pos
which isn't _that_ horrible, but it does show how the natural word size
is just a more sensible interface (same arguments will hold in the user
level glibc wrapper function, of course, so the kernel side is just half
of the equation!)
Note: in all cases the user code wrapper can again be the same. You can
just do
#define HALF_BITS (sizeof(unsigned long)*4)
__syscall(PWRITEV, fd, iov, count, offset, (offset >> HALF_BITS) >> HALF_BITS);
or something like that. That way the user mode wrapper will also be
nicely passing in a zero (it won't actually have to do the shifts, the
compiler will understand what is going on) for the last argument.
And that is a good idea, even if nobody will necessarily ever care: if
we ever do move to a 128-bit lloff_t, this particular system call might
be left alone. Of course, that will be the least of our worries if we
really ever need to care, so this may not be worth really caring about.
[ Fixed for lost 'loff_t' cast noticed by Andrew Morton ]
Acked-by: Gerd Hoffmann <kraxel@redhat.com>
Cc: H. Peter Anvin <hpa@zytor.com>
Cc: Andrew Morton <akpm@linux-foundation.org>
Cc: linux-api@vger.kernel.org
Cc: linux-arch@vger.kernel.org
Cc: Ingo Molnar <mingo@elte.hu>
Cc: Ralf Baechle <ralf@linux-mips.org>
Cc: Al Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
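For illustration, the kernel-side reassembly can be sketched like this
(modeled on the pos_from_hilo() mentioned above; the shift is split in two
so that no single shift equals the type width, which would be undefined
behaviour):

static inline loff_t pos_from_hilo(unsigned long high, unsigned long low)
{
#define HALF_BITS (sizeof(unsigned long) * 4)
        /* 32-bit: two 16-bit shifts place 'high' in the upper word.
         * 64-bit: two 32-bit shifts push 'high' out entirely, so only
         * the full-width 'low' argument matters - the "pos_h isn't
         * even touched" property described above. */
        return (((loff_t)high << HALF_BITS) << HALF_BITS) | low;
}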
2009-04-03 23:03:22 +08:00
|
|
|
unsigned long vlen, unsigned long pos_l, unsigned long pos_h);
|
2016-03-03 23:03:59 +08:00
|
|
|
asmlinkage long sys_preadv2(unsigned long fd, const struct iovec __user *vec,
|
|
|
|
unsigned long vlen, unsigned long pos_l, unsigned long pos_h,
|
2017-07-07 00:58:37 +08:00
|
|
|
rwf_t flags);
|
2009-04-03 07:59:23 +08:00
|
|
|
asmlinkage long sys_pwritev(unsigned long fd, const struct iovec __user *vec,
|
2009-04-03 23:03:22 +08:00
|
|
|
unsigned long vlen, unsigned long pos_l, unsigned long pos_h);
|
2016-03-03 23:03:59 +08:00
|
|
|
asmlinkage long sys_pwritev2(unsigned long fd, const struct iovec __user *vec,
|
|
|
|
unsigned long vlen, unsigned long pos_l, unsigned long pos_h,
|
2017-07-07 00:58:37 +08:00
|
|
|
rwf_t flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
asmlinkage long sys_getcwd(char __user *buf, unsigned long size);
|
2011-11-22 03:59:34 +08:00
|
|
|
asmlinkage long sys_mkdir(const char __user *pathname, umode_t mode);
|
2005-04-17 06:20:36 +08:00
|
|
|
asmlinkage long sys_chdir(const char __user *filename);
|
|
|
|
asmlinkage long sys_fchdir(unsigned int fd);
|
|
|
|
asmlinkage long sys_rmdir(const char __user *pathname);
|
|
|
|
asmlinkage long sys_lookup_dcookie(u64 cookie64, char __user *buf, size_t len);
|
|
|
|
asmlinkage long sys_quotactl(unsigned int cmd, const char __user *special,
|
|
|
|
qid_t id, void __user *addr);
|
|
|
|
asmlinkage long sys_getdents(unsigned int fd,
|
|
|
|
struct linux_dirent __user *dirent,
|
|
|
|
unsigned int count);
|
|
|
|
asmlinkage long sys_getdents64(unsigned int fd,
|
|
|
|
struct linux_dirent64 __user *dirent,
|
|
|
|
unsigned int count);
|
|
|
|
|
|
|
|
asmlinkage long sys_setsockopt(int fd, int level, int optname,
|
|
|
|
char __user *optval, int optlen);
|
|
|
|
asmlinkage long sys_getsockopt(int fd, int level, int optname,
|
|
|
|
char __user *optval, int __user *optlen);
|
|
|
|
asmlinkage long sys_bind(int, struct sockaddr __user *, int);
|
|
|
|
asmlinkage long sys_connect(int, struct sockaddr __user *, int);
|
|
|
|
asmlinkage long sys_accept(int, struct sockaddr __user *, int __user *);
|
reintroduce accept4
Introduce a new accept4() system call. The addition of this system call
matches analogous changes in 2.6.27 (dup3(), eventfd2(), signalfd4(),
inotify_init1(), epoll_create1(), pipe2()) which added new system calls
that differed from analogous traditional system calls in adding a flags
argument that can be used to access additional functionality.
The accept4() system call is exactly the same as accept(), except that
it adds a flags bit-mask argument. Two flags are initially implemented.
(Most of the new system calls in 2.6.27 also had both of these flags.)
SOCK_CLOEXEC causes the close-on-exec (FD_CLOEXEC) flag to be enabled
for the new file descriptor returned by accept4(). This is a useful
security feature to avoid leaking information in a multithreaded
program where one thread is doing an accept() at the same time as
another thread is doing a fork() plus exec(). More details here:
http://udrepper.livejournal.com/20407.html ("Secure File Descriptor Handling",
Ulrich Drepper).
The other flag is SOCK_NONBLOCK, which causes the O_NONBLOCK flag
to be enabled on the new open file description created by accept4().
(This flag is merely a convenience, saving the use of additional calls
fcntl(F_GETFL) and fcntl(F_SETFL) to achieve the same result.)
Here's a test program. Works on x86-32. Should work on x86-64, but
I (mtk) don't have a system to hand to test with.
It tests accept4() with each of the four possible combinations of
SOCK_CLOEXEC and SOCK_NONBLOCK set/clear in 'flags', and verifies
that the appropriate flags are set on the file descriptor/open file
description returned by accept4().
I tested Ulrich's patch in this thread by applying against 2.6.28-rc2,
and it passes according to my test program.
/* test_accept4.c
Copyright (C) 2008, Linux Foundation, written by Michael Kerrisk
<mtk.manpages@gmail.com>
Licensed under the GNU GPLv2 or later.
*/
#define _GNU_SOURCE
#include <unistd.h>
#include <sys/syscall.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <stdlib.h>
#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#define PORT_NUM 33333
#define die(msg) do { perror(msg); exit(EXIT_FAILURE); } while (0)
/**********************************************************************/
/* The following is what we need until glibc gets a wrapper for
accept4() */
/* Flags for socket(), socketpair(), accept4() */
#ifndef SOCK_CLOEXEC
#define SOCK_CLOEXEC O_CLOEXEC
#endif
#ifndef SOCK_NONBLOCK
#define SOCK_NONBLOCK O_NONBLOCK
#endif
#ifdef __x86_64__
#define SYS_accept4 288
#elif __i386__
#define USE_SOCKETCALL 1
#define SYS_ACCEPT4 18
#else
#error "Sorry -- don't know the syscall # on this architecture"
#endif
static int
accept4(int fd, struct sockaddr *sockaddr, socklen_t *addrlen, int flags)
{
printf("Calling accept4(): flags = %x", flags);
if (flags != 0) {
printf(" (");
if (flags & SOCK_CLOEXEC)
printf("SOCK_CLOEXEC");
if ((flags & SOCK_CLOEXEC) && (flags & SOCK_NONBLOCK))
printf(" ");
if (flags & SOCK_NONBLOCK)
printf("SOCK_NONBLOCK");
printf(")");
}
printf("\n");
#if USE_SOCKETCALL
long args[6];
args[0] = fd;
args[1] = (long) sockaddr;
args[2] = (long) addrlen;
args[3] = flags;
return syscall(SYS_socketcall, SYS_ACCEPT4, args);
#else
return syscall(SYS_accept4, fd, sockaddr, addrlen, flags);
#endif
}
/**********************************************************************/
static int
do_test(int lfd, struct sockaddr_in *conn_addr,
int closeonexec_flag, int nonblock_flag)
{
int connfd, acceptfd;
int fdf, flf, fdf_pass, flf_pass;
struct sockaddr_in claddr;
socklen_t addrlen;
printf("=======================================\n");
connfd = socket(AF_INET, SOCK_STREAM, 0);
if (connfd == -1)
die("socket");
if (connect(connfd, (struct sockaddr *) conn_addr,
sizeof(struct sockaddr_in)) == -1)
die("connect");
addrlen = sizeof(struct sockaddr_in);
acceptfd = accept4(lfd, (struct sockaddr *) &claddr, &addrlen,
closeonexec_flag | nonblock_flag);
if (acceptfd == -1) {
perror("accept4()");
close(connfd);
return 0;
}
fdf = fcntl(acceptfd, F_GETFD);
if (fdf == -1)
die("fcntl:F_GETFD");
fdf_pass = ((fdf & FD_CLOEXEC) != 0) ==
((closeonexec_flag & SOCK_CLOEXEC) != 0);
printf("Close-on-exec flag is %sset (%s); ",
(fdf & FD_CLOEXEC) ? "" : "not ",
fdf_pass ? "OK" : "failed");
flf = fcntl(acceptfd, F_GETFL);
if (flf == -1)
die("fcntl:F_GETFD");
flf_pass = ((flf & O_NONBLOCK) != 0) ==
((nonblock_flag & SOCK_NONBLOCK) !=0);
printf("nonblock flag is %sset (%s)\n",
(flf & O_NONBLOCK) ? "" : "not ",
flf_pass ? "OK" : "failed");
close(acceptfd);
close(connfd);
printf("Test result: %s\n", (fdf_pass && flf_pass) ? "PASS" : "FAIL");
return fdf_pass && flf_pass;
}
static int
create_listening_socket(int port_num)
{
struct sockaddr_in svaddr;
int lfd;
int optval;
memset(&svaddr, 0, sizeof(struct sockaddr_in));
svaddr.sin_family = AF_INET;
svaddr.sin_addr.s_addr = htonl(INADDR_ANY);
svaddr.sin_port = htons(port_num);
lfd = socket(AF_INET, SOCK_STREAM, 0);
if (lfd == -1)
die("socket");
optval = 1;
if (setsockopt(lfd, SOL_SOCKET, SO_REUSEADDR, &optval,
sizeof(optval)) == -1)
die("setsockopt");
if (bind(lfd, (struct sockaddr *) &svaddr,
sizeof(struct sockaddr_in)) == -1)
die("bind");
if (listen(lfd, 5) == -1)
die("listen");
return lfd;
}
int
main(int argc, char *argv[])
{
struct sockaddr_in conn_addr;
int lfd;
int port_num;
int passed;
passed = 1;
port_num = (argc > 1) ? atoi(argv[1]) : PORT_NUM;
memset(&conn_addr, 0, sizeof(struct sockaddr_in));
conn_addr.sin_family = AF_INET;
conn_addr.sin_addr.s_addr = htonl(INADDR_LOOPBACK);
conn_addr.sin_port = htons(port_num);
lfd = create_listening_socket(port_num);
if (!do_test(lfd, &conn_addr, 0, 0))
passed = 0;
if (!do_test(lfd, &conn_addr, SOCK_CLOEXEC, 0))
passed = 0;
if (!do_test(lfd, &conn_addr, 0, SOCK_NONBLOCK))
passed = 0;
if (!do_test(lfd, &conn_addr, SOCK_CLOEXEC, SOCK_NONBLOCK))
passed = 0;
close(lfd);
exit(passed ? EXIT_SUCCESS : EXIT_FAILURE);
}
[mtk.manpages@gmail.com: rewrote changelog, updated test program]
Signed-off-by: Ulrich Drepper <drepper@redhat.com>
Tested-by: Michael Kerrisk <mtk.manpages@gmail.com>
Acked-by: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: <linux-api@vger.kernel.org>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-11-20 07:36:14 +08:00
|
|
|
asmlinkage long sys_accept4(int, struct sockaddr __user *, int __user *, int);
|
2005-04-17 06:20:36 +08:00
|
|
|
asmlinkage long sys_getsockname(int, struct sockaddr __user *, int __user *);
|
|
|
|
asmlinkage long sys_getpeername(int, struct sockaddr __user *, int __user *);
|
|
|
|
asmlinkage long sys_send(int, void __user *, size_t, unsigned);
|
|
|
|
asmlinkage long sys_sendto(int, void __user *, size_t, unsigned,
|
|
|
|
struct sockaddr __user *, int);
|
2014-04-07 02:03:05 +08:00
|
|
|
asmlinkage long sys_sendmsg(int fd, struct user_msghdr __user *msg, unsigned flags);
|
2011-05-03 04:21:35 +08:00
|
|
|
asmlinkage long sys_sendmmsg(int fd, struct mmsghdr __user *msg,
|
|
|
|
unsigned int vlen, unsigned flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
asmlinkage long sys_recv(int, void __user *, size_t, unsigned);
|
|
|
|
asmlinkage long sys_recvfrom(int, void __user *, size_t, unsigned,
|
|
|
|
struct sockaddr __user *, int __user *);
|
2014-04-07 02:03:05 +08:00
|
|
|
asmlinkage long sys_recvmsg(int fd, struct user_msghdr __user *msg, unsigned flags);
|
2009-10-13 14:40:10 +08:00
|
|
|
asmlinkage long sys_recvmmsg(int fd, struct mmsghdr __user *msg,
|
|
|
|
unsigned int vlen, unsigned flags,
|
|
|
|
struct timespec __user *timeout);
|
2005-04-17 06:20:36 +08:00
|
|
|
asmlinkage long sys_socket(int, int, int);
|
|
|
|
asmlinkage long sys_socketpair(int, int, int, int __user *);
|
|
|
|
asmlinkage long sys_socketcall(int call, unsigned long __user *args);
|
|
|
|
asmlinkage long sys_listen(int, int);
|
|
|
|
asmlinkage long sys_poll(struct pollfd __user *ufds, unsigned int nfds,
|
sys_poll: fix incorrect type for 'timeout' parameter
The 'poll()' system call timeout parameter is supposed to be 'int', not
'long'.
Now, the reason this matters is that right now 32-bit compat mode is
broken on at least x86-64, because the 32-bit code just calls
'sys_poll()' directly on x86-64, and the 32-bit argument will have been
zero-extended, turning a signed 'int' into a large unsigned 'long'
value.
We could just introduce a 'compat_sys_poll()' function for this, and
that may eventually be what we have to do, but since the actual standard
poll() semantics is *supposed* to be 'int', and since at least on x86-64
glibc sign-extends the argument before invoking the system call (so
nobody can actually use a 64-bit timeout value in user space _anyway_,
even in 64-bit binaries), the simpler solution would seem to be to just
fix the definition of the system call to match what it should have been
from the very start.
If it turns out that somebody somehow circumvents the user-level libc
64-bit sign extension and actually uses a large unsigned 64-bit timeout
despite that not being how poll() is supposed to work, we will need to
do the compat_sys_poll() approach.
Reported-by: Thomas Meyer <thomas@m3y3r.de>
Acked-by: Eric Dumazet <eric.dumazet@gmail.com>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
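A standalone demonstration of the zero- vs. sign-extension difference on a
64-bit machine (illustrative, not part of the patch):

#include <stdio.h>

int main(void)
{
        int timeout32 = -1;     /* userspace passes -1 for "infinite" */

        /* what a 64-bit kernel saw with a 'long' parameter and a
         * zero-extended 32-bit register: a huge positive timeout */
        long zero_extended = (unsigned int)timeout32;

        /* what it sees once the parameter is 'int' (or after glibc's
         * sign extension): the intended -1 */
        long sign_extended = (int)timeout32;

        printf("zero-extended: %ld\n", zero_extended); /* 4294967295 */
        printf("sign-extended: %ld\n", sign_extended); /* -1 */
        return 0;
}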
2012-02-22 09:24:20 +08:00
|
|
|
int timeout);
|
2005-04-17 06:20:36 +08:00
|
|
|
asmlinkage long sys_select(int n, fd_set __user *inp, fd_set __user *outp,
|
|
|
|
fd_set __user *exp, struct timeval __user *tvp);
|
2010-03-11 07:21:13 +08:00
|
|
|
asmlinkage long sys_old_select(struct sel_arg_struct __user *arg);
|
2005-04-17 06:20:36 +08:00
|
|
|
asmlinkage long sys_epoll_create(int size);
|
2008-07-24 12:29:43 +08:00
|
|
|
asmlinkage long sys_epoll_create1(int flags);
|
2005-04-17 06:20:36 +08:00
|
|
|
asmlinkage long sys_epoll_ctl(int epfd, int op, int fd,
|
|
|
|
struct epoll_event __user *event);
|
|
|
|
asmlinkage long sys_epoll_wait(int epfd, struct epoll_event __user *events,
|
|
|
|
int maxevents, int timeout);
|
2006-10-11 16:21:44 +08:00
|
|
|
asmlinkage long sys_epoll_pwait(int epfd, struct epoll_event __user *events,
|
|
|
|
int maxevents, int timeout,
|
|
|
|
const sigset_t __user *sigmask,
|
|
|
|
size_t sigsetsize);
|
2005-04-17 06:20:36 +08:00
|
|
|
asmlinkage long sys_gethostname(char __user *name, int len);
|
|
|
|
asmlinkage long sys_sethostname(char __user *name, int len);
|
|
|
|
asmlinkage long sys_setdomainname(char __user *name, int len);
|
|
|
|
asmlinkage long sys_newuname(struct new_utsname __user *name);
|
2010-03-11 07:21:21 +08:00
|
|
|
asmlinkage long sys_uname(struct old_utsname __user *);
|
|
|
|
asmlinkage long sys_olduname(struct oldold_utsname __user *);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
asmlinkage long sys_getrlimit(unsigned int resource,
|
|
|
|
struct rlimit __user *rlim);
|
2017-05-27 10:04:29 +08:00
|
|
|
#ifdef __ARCH_WANT_SYS_OLD_GETRLIMIT
|
2005-04-17 06:20:36 +08:00
|
|
|
asmlinkage long sys_old_getrlimit(unsigned int resource, struct rlimit __user *rlim);
|
|
|
|
#endif
|
|
|
|
asmlinkage long sys_setrlimit(unsigned int resource,
|
|
|
|
struct rlimit __user *rlim);
|
2010-05-05 00:03:50 +08:00
|
|
|
asmlinkage long sys_prlimit64(pid_t pid, unsigned int resource,
|
|
|
|
const struct rlimit64 __user *new_rlim,
|
|
|
|
struct rlimit64 __user *old_rlim);
|
2005-04-17 06:20:36 +08:00
|
|
|
asmlinkage long sys_getrusage(int who, struct rusage __user *ru);
|
|
|
|
asmlinkage long sys_umask(int mask);
|
|
|
|
|
|
|
|
asmlinkage long sys_msgget(key_t key, int msgflg);
|
|
|
|
asmlinkage long sys_msgsnd(int msqid, struct msgbuf __user *msgp,
|
|
|
|
size_t msgsz, int msgflg);
|
|
|
|
asmlinkage long sys_msgrcv(int msqid, struct msgbuf __user *msgp,
|
|
|
|
size_t msgsz, long msgtyp, int msgflg);
|
|
|
|
asmlinkage long sys_msgctl(int msqid, int cmd, struct msqid_ds __user *buf);
|
|
|
|
|
|
|
|
asmlinkage long sys_semget(key_t key, int nsems, int semflg);
|
|
|
|
asmlinkage long sys_semop(int semid, struct sembuf __user *sops,
|
|
|
|
unsigned nsops);
|
2013-03-06 04:04:55 +08:00
|
|
|
asmlinkage long sys_semctl(int semid, int semnum, int cmd, unsigned long arg);
|
2005-04-17 06:20:36 +08:00
|
|
|
asmlinkage long sys_semtimedop(int semid, struct sembuf __user *sops,
|
|
|
|
unsigned nsops,
|
|
|
|
const struct timespec __user *timeout);
|
2005-05-01 23:59:12 +08:00
|
|
|
asmlinkage long sys_shmat(int shmid, char __user *shmaddr, int shmflg);
|
2005-04-17 06:20:36 +08:00
|
|
|
asmlinkage long sys_shmget(key_t key, size_t size, int flag);
|
|
|
|
asmlinkage long sys_shmdt(char __user *shmaddr);
|
|
|
|
asmlinkage long sys_shmctl(int shmid, int cmd, struct shmid_ds __user *buf);
|
2010-03-23 04:12:33 +08:00
|
|
|
asmlinkage long sys_ipc(unsigned int call, int first, unsigned long second,
|
2010-03-11 07:21:18 +08:00
|
|
|
unsigned long third, void __user *ptr, long fifth);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
2011-07-26 17:26:10 +08:00
|
|
|
asmlinkage long sys_mq_open(const char __user *name, int oflag, umode_t mode, struct mq_attr __user *attr);
|
2005-04-17 06:20:36 +08:00
|
|
|
asmlinkage long sys_mq_unlink(const char __user *name);
|
|
|
|
asmlinkage long sys_mq_timedsend(mqd_t mqdes, const char __user *msg_ptr, size_t msg_len, unsigned int msg_prio, const struct timespec __user *abs_timeout);
|
2009-01-14 21:13:54 +08:00
|
|
|
asmlinkage long sys_mq_timedreceive(mqd_t mqdes, char __user *msg_ptr, size_t msg_len, unsigned int __user *msg_prio, const struct timespec __user *abs_timeout);
|
2005-04-17 06:20:36 +08:00
|
|
|
asmlinkage long sys_mq_notify(mqd_t mqdes, const struct sigevent __user *notification);
|
|
|
|
asmlinkage long sys_mq_getsetattr(mqd_t mqdes, const struct mq_attr __user *mqstat, struct mq_attr __user *omqstat);
|
|
|
|
|
|
|
|
asmlinkage long sys_pciconfig_iobase(long which, unsigned long bus, unsigned long devfn);
|
|
|
|
asmlinkage long sys_pciconfig_read(unsigned long bus, unsigned long dfn,
|
|
|
|
unsigned long off, unsigned long len,
|
|
|
|
void __user *buf);
|
|
|
|
asmlinkage long sys_pciconfig_write(unsigned long bus, unsigned long dfn,
|
|
|
|
unsigned long off, unsigned long len,
|
|
|
|
void __user *buf);
|
|
|
|
|
|
|
|
asmlinkage long sys_prctl(int option, unsigned long arg2, unsigned long arg3,
|
|
|
|
unsigned long arg4, unsigned long arg5);
|
|
|
|
asmlinkage long sys_swapon(const char __user *specialfile, int swap_flags);
|
|
|
|
asmlinkage long sys_swapoff(const char __user *specialfile);
|
|
|
|
asmlinkage long sys_sysctl(struct __sysctl_args __user *args);
|
|
|
|
asmlinkage long sys_sysinfo(struct sysinfo __user *info);
|
|
|
|
asmlinkage long sys_sysfs(int option,
|
|
|
|
unsigned long arg1, unsigned long arg2);
|
|
|
|
asmlinkage long sys_syslog(int type, char __user *buf, int len);
|
|
|
|
asmlinkage long sys_uselib(const char __user *library);
|
|
|
|
asmlinkage long sys_ni_syscall(void);
|
2010-10-28 06:33:45 +08:00
|
|
|
asmlinkage long sys_ptrace(long request, long pid, unsigned long addr,
|
|
|
|
unsigned long data);
|
2005-04-17 06:20:36 +08:00
|
|
|
|
|
|
|
asmlinkage long sys_add_key(const char __user *_type,
|
|
|
|
const char __user *_description,
|
|
|
|
const void __user *_payload,
|
|
|
|
size_t plen,
|
|
|
|
key_serial_t destringid);
|
|
|
|
|
|
|
|
asmlinkage long sys_request_key(const char __user *_type,
|
|
|
|
const char __user *_description,
|
|
|
|
const char __user *_callout_info,
|
|
|
|
key_serial_t destringid);
|
|
|
|
|
|
|
|
asmlinkage long sys_keyctl(int cmd, unsigned long arg2, unsigned long arg3,
|
|
|
|
unsigned long arg4, unsigned long arg5);
|
|
|
|
|
2005-07-08 08:56:13 +08:00
|
|
|
asmlinkage long sys_ioprio_set(int which, int who, int ioprio);
|
|
|
|
asmlinkage long sys_ioprio_get(int which, int who);
|
2014-06-05 07:07:58 +08:00
|
|
|
asmlinkage long sys_set_mempolicy(int mode, const unsigned long __user *nmask,
|
2006-01-19 09:43:04 +08:00
|
|
|
unsigned long maxnode);
|
2006-01-08 17:00:51 +08:00
|
|
|
asmlinkage long sys_migrate_pages(pid_t pid, unsigned long maxnode,
|
2006-01-19 09:43:04 +08:00
|
|
|
const unsigned long __user *from,
|
|
|
|
const unsigned long __user *to);
|
2006-06-23 17:03:55 +08:00
|
|
|
asmlinkage long sys_move_pages(pid_t pid, unsigned long nr_pages,
|
|
|
|
const void __user * __user *pages,
|
|
|
|
const int __user *nodes,
|
|
|
|
int __user *status,
|
|
|
|
int flags);
|
2006-01-19 09:43:04 +08:00
|
|
|
asmlinkage long sys_mbind(unsigned long start, unsigned long len,
|
|
|
|
unsigned long mode,
|
2014-06-05 07:07:57 +08:00
|
|
|
const unsigned long __user *nmask,
|
2006-01-19 09:43:04 +08:00
|
|
|
unsigned long maxnode,
|
|
|
|
unsigned flags);
|
|
|
|
asmlinkage long sys_get_mempolicy(int __user *policy,
|
|
|
|
unsigned long __user *nmask,
|
|
|
|
unsigned long maxnode,
|
|
|
|
unsigned long addr, unsigned long flags);
|
|
|
|
|
|
|
|
asmlinkage long sys_inotify_init(void);
|
2008-07-24 12:29:32 +08:00
|
|
|
asmlinkage long sys_inotify_init1(int flags);
|
2006-01-19 09:43:04 +08:00
|
|
|
asmlinkage long sys_inotify_add_watch(int fd, const char __user *path,
|
|
|
|
u32 mask);
|
2009-01-05 20:19:16 +08:00
|
|
|
asmlinkage long sys_inotify_rm_watch(int fd, __s32 wd);
|
2005-07-08 08:56:13 +08:00
|
|
|
|
2005-11-16 04:53:48 +08:00
|
|
|
asmlinkage long sys_spu_run(int fd, __u32 __user *unpc,
|
|
|
|
__u32 __user *ustatus);
|
|
|
|
asmlinkage long sys_spu_create(const char __user *name,
|
2011-07-27 04:50:23 +08:00
|
|
|
unsigned int flags, umode_t mode, int fd);
|
2005-11-16 04:53:48 +08:00
|
|
|
|
2011-07-26 05:32:17 +08:00
|
|
|
asmlinkage long sys_mknodat(int dfd, const char __user * filename, umode_t mode,
|
2006-02-01 19:04:33 +08:00
|
|
|
unsigned dev);
|
2011-11-22 03:59:34 +08:00
|
|
|
asmlinkage long sys_mkdirat(int dfd, const char __user * pathname, umode_t mode);
|
2006-02-01 19:04:33 +08:00
|
|
|
asmlinkage long sys_unlinkat(int dfd, const char __user * pathname, int flag);
|
|
|
|
asmlinkage long sys_symlinkat(const char __user * oldname,
|
|
|
|
int newdfd, const char __user * newname);
|
|
|
|
asmlinkage long sys_linkat(int olddfd, const char __user *oldname,
|
2006-02-25 05:04:21 +08:00
|
|
|
int newdfd, const char __user *newname, int flags);
|
2006-02-01 19:04:33 +08:00
|
|
|
asmlinkage long sys_renameat(int olddfd, const char __user * oldname,
|
|
|
|
int newdfd, const char __user * newname);
|
2014-04-08 18:55:46 +08:00
|
|
|
asmlinkage long sys_renameat2(int olddfd, const char __user *oldname,
|
|
|
|
int newdfd, const char __user *newname,
|
|
|
|
unsigned int flags);
|
2010-08-11 18:26:22 +08:00
|
|
|
asmlinkage long sys_futimesat(int dfd, const char __user *filename,
|
2006-02-01 19:04:33 +08:00
|
|
|
struct timeval __user *utimes);
|
|
|
|
asmlinkage long sys_faccessat(int dfd, const char __user *filename, int mode);
|
|
|
|
asmlinkage long sys_fchmodat(int dfd, const char __user * filename,
|
2011-07-26 16:22:01 +08:00
|
|
|
umode_t mode);
|
2006-02-01 19:04:33 +08:00
|
|
|
asmlinkage long sys_fchownat(int dfd, const char __user *filename, uid_t user,
|
|
|
|
gid_t group, int flag);
|
|
|
|
asmlinkage long sys_openat(int dfd, const char __user *filename, int flags,
|
2011-11-22 03:59:34 +08:00
|
|
|
umode_t mode);
|
2010-08-11 18:26:22 +08:00
|
|
|
asmlinkage long sys_newfstatat(int dfd, const char __user *filename,
|
2006-02-01 19:04:33 +08:00
|
|
|
struct stat __user *statbuf, int flag);
|
|
|
|
asmlinkage long sys_readlinkat(int dfd, const char __user *path, char __user *buf,
|
|
|
|
int bufsiz);
|
2010-08-11 18:26:22 +08:00
|
|
|
asmlinkage long sys_utimensat(int dfd, const char __user *filename,
|
2007-05-09 17:32:35 +08:00
|
|
|
struct timespec __user *utimes, int flags);
|
2006-03-24 19:15:08 +08:00
|
|
|
asmlinkage long sys_unshare(unsigned long unshare_flags);
|
2006-04-10 21:18:58 +08:00
|
|
|
|
|
|
|
asmlinkage long sys_splice(int fd_in, loff_t __user *off_in,
|
|
|
|
int fd_out, loff_t __user *off_out,
|
|
|
|
size_t len, unsigned int flags);
|
|
|
|
|
2006-04-26 16:59:21 +08:00
|
|
|
asmlinkage long sys_vmsplice(int fd, const struct iovec __user *iov,
|
|
|
|
unsigned long nr_segs, unsigned int flags);
|
|
|
|
|
2006-04-11 21:51:17 +08:00
|
|
|
asmlinkage long sys_tee(int fdin, int fdout, size_t len, unsigned int flags);
|
|
|
|
|
2006-03-31 18:30:42 +08:00
|
|
|
asmlinkage long sys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
|
2006-04-11 13:53:57 +08:00
|
|
|
unsigned int flags);
|
Introduce fixed sys_sync_file_range2() syscall, implement on PowerPC and ARM
Not all the world is an i386. Many architectures need 64-bit arguments to be
aligned in suitable pairs of registers, and the original
sys_sync_file_range(int, loff_t, loff_t, int) was therefore wasting an
argument register for padding after the first integer. Since we don't
normally have more than 6 arguments for system calls, that left no room for
the final argument on some architectures.
Fix this by introducing sys_sync_file_range2(int, int, loff_t, loff_t) which
all fits nicely. In fact, ARM already had that, but called it
sys_arm_sync_file_range. Move it to fs/sync.c and rename it, then implement
the needed compatibility routine. And stop the missing syscall check from
bitching about the absence of sys_sync_file_range() if we've implemented
sys_sync_file_range2() instead.
Tested on PPC32 and with 32-bit and 64-bit userspace on PPC64.
Signed-off-by: David Woodhouse <dwmw2@infradead.org>
Acked-by: Russell King <rmk+kernel@arm.linux.org.uk>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Paul Mackerras <paulus@samba.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
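An illustrative register layout for a 32-bit ABI that wants 64-bit values in
even/odd register pairs (ARM-style register names; exact conventions differ
per architecture):

/*
 * sys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
 *                     unsigned int flags):
 *
 *      r0      fd
 *      r1      (padding so 'offset' starts on an even register)
 *      r2:r3   offset
 *      r4:r5   nbytes
 *       ?      flags           <- seventh slot: no register left
 *
 * sys_sync_file_range2(int fd, unsigned int flags, loff_t offset,
 *                      loff_t nbytes) needs no padding:
 *
 *      r0      fd
 *      r1      flags
 *      r2:r3   offset
 *      r4:r5   nbytes
 */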
2007-06-28 05:10:09 +08:00
|
|
|
asmlinkage long sys_sync_file_range2(int fd, unsigned int flags,
|
|
|
|
loff_t offset, loff_t nbytes);
|
2006-05-23 22:46:40 +08:00
|
|
|
asmlinkage long sys_get_robust_list(int pid,
|
2006-10-11 05:46:07 +08:00
|
|
|
struct robust_list_head __user * __user *head_ptr,
|
2006-05-23 22:46:40 +08:00
|
|
|
size_t __user *len_ptr);
|
|
|
|
asmlinkage long sys_set_robust_list(struct robust_list_head __user *head,
|
|
|
|
size_t len);
|
2006-09-29 16:58:35 +08:00
|
|
|
asmlinkage long sys_getcpu(unsigned __user *cpu, unsigned __user *node, struct getcpu_cache __user *cache);
|
signal/timer/event: signalfd core
This patch series implements the new signalfd() system call.
I took part of the original Linus code (and you know how badly it can be
broken :), and I added even more breakage ;) Signals are fetched from the same
signal queue used by the process, so signalfd will compete with standard
kernel delivery in dequeue_signal(). If you want to reliably fetch signals on
the signalfd file, you need to block them with sigprocmask(SIG_BLOCK). This
seems to be working fine on my Dual Opteron machine. I made a quick test
program for it:
http://www.xmailserver.org/signafd-test.c
The signalfd() system call implements signal delivery into a file descriptor
receiver. The signalfd file descriptor is created with the following API:
int signalfd(int ufd, const sigset_t *mask, size_t masksize);
The "ufd" parameter allows changing the sigmask of an existing signalfd without
going through a close/create cycle (Linus' idea). Use "ufd" == -1 if you want a
brand new signalfd file.
The "mask" parameter specifies the signal mask of signals that we are interested
in. The "masksize" parameter is the size of "mask".
The signalfd fd supports the poll(2) and read(2) system calls. The poll(2)
will return POLLIN when signals are available to be dequeued. As a direct
consequence of supporting the Linux poll subsystem, the signalfd fd can be
used together with epoll(2) too.
The read(2) system call will return a "struct signalfd_siginfo" structure in
the userspace supplied buffer. The return value is the number of bytes copied
in the supplied buffer, or -1 in case of error. The read(2) call can also
return 0, in case the sighand structure to which the signalfd was attached
has been orphaned. The O_NONBLOCK flag is also supported, and read(2) will
return -EAGAIN in case no signal is available.
If the size of the buffer passed to read(2) is lower than sizeof(struct
signalfd_siginfo), -EINVAL is returned. A read from the signalfd can also
return -ERESTARTSYS in case a signal hits the process. The format of
struct signalfd_siginfo is shown below; the valid fields depend on the
(->code & __SI_MASK) value, in the same way they would for a struct siginfo:
struct signalfd_siginfo {
__u32 signo; /* si_signo */
__s32 err; /* si_errno */
__s32 code; /* si_code */
__u32 pid; /* si_pid */
__u32 uid; /* si_uid */
__s32 fd; /* si_fd */
__u32 tid; /* si_tid */
__u32 band; /* si_band */
__u32 overrun; /* si_overrun */
__u32 trapno; /* si_trapno */
__s32 status; /* si_status */
__s32 svint; /* si_int */
__u64 svptr; /* si_ptr */
__u64 utime; /* si_utime */
__u64 stime; /* si_stime */
__u64 addr; /* si_addr */
};
[akpm@linux-foundation.org: fix signalfd_copyinfo() on i386]
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-11 13:23:13 +08:00
|
|
|
asmlinkage long sys_signalfd(int ufd, sigset_t __user *user_mask, size_t sizemask);
|
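For illustration, a minimal usage sketch assuming the glibc signalfd() wrapper (three arguments, with a flags parameter, mapping onto the raw signalfd/signalfd4 syscalls declared here); as the commit message above notes, the signal must be blocked first or normal delivery will race with the fd:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#include <signal.h>
#include <stdio.h>
#include <sys/signalfd.h>
#include <unistd.h>

int main(void)
{
        sigset_t mask;
        struct signalfd_siginfo ssi;
        int sfd;

        sigemptyset(&mask);
        sigaddset(&mask, SIGUSR1);
        /* Block normal delivery so the signal stays queued for the fd. */
        if (sigprocmask(SIG_BLOCK, &mask, NULL) == -1)
                return 1;
        sfd = signalfd(-1, &mask, 0);   /* -1: create a brand new signalfd */
        if (sfd == -1)
                return 1;
        kill(getpid(), SIGUSR1);
        if (read(sfd, &ssi, sizeof(ssi)) != sizeof(ssi))
                return 1;
        printf("got signal %u from pid %u\n", ssi.ssi_signo, ssi.ssi_pid);
        close(sfd);
        return 0;
}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~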
flag parameters: signalfd
This patch adds the new signalfd4 syscall. It extends the old signalfd
syscall by one parameter which is meant to hold a flag value. In this
patch the only flag supported is SFD_CLOEXEC, which causes the close-on-exec
flag for the returned file descriptor to be set.
A new name SFD_CLOEXEC is introduced which in this implementation must
have the same value as O_CLOEXEC.
The following test must be adjusted for architectures other than x86 and
x86-64 and in case the syscall numbers changed.
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#include <fcntl.h>
#include <signal.h>
#include <stdio.h>
#include <unistd.h>
#include <sys/syscall.h>
#ifndef __NR_signalfd4
# ifdef __x86_64__
# define __NR_signalfd4 289
# elif defined __i386__
# define __NR_signalfd4 327
# else
# error "need __NR_signalfd4"
# endif
#endif
#define SFD_CLOEXEC O_CLOEXEC
int
main (void)
{
sigset_t ss;
sigemptyset (&ss);
sigaddset (&ss, SIGUSR1);
int fd = syscall (__NR_signalfd4, -1, &ss, 8, 0);
if (fd == -1)
{
puts ("signalfd4(0) failed");
return 1;
}
int coe = fcntl (fd, F_GETFD);
if (coe == -1)
{
puts ("fcntl failed");
return 1;
}
if (coe & FD_CLOEXEC)
{
puts ("signalfd4(0) set close-on-exec flag");
return 1;
}
close (fd);
fd = syscall (__NR_signalfd4, -1, &ss, 8, SFD_CLOEXEC);
if (fd == -1)
{
puts ("signalfd4(SFD_CLOEXEC) failed");
return 1;
}
coe = fcntl (fd, F_GETFD);
if (coe == -1)
{
puts ("fcntl failed");
return 1;
}
if ((coe & FD_CLOEXEC) == 0)
{
puts ("signalfd4(SFD_CLOEXEC) does not set close-on-exec flag");
return 1;
}
close (fd);
puts ("OK");
return 0;
}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
[akpm@linux-foundation.org: add sys_ni stub]
Signed-off-by: Ulrich Drepper <drepper@redhat.com>
Acked-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Michael Kerrisk <mtk.manpages@googlemail.com>
Cc: <linux-arch@vger.kernel.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-07-24 12:29:24 +08:00
|
|
|
asmlinkage long sys_signalfd4(int ufd, sigset_t __user *user_mask, size_t sizemask, int flags);
|
timerfd: new timerfd API
This is the new timerfd API as it is implemented by the following patch:
int timerfd_create(int clockid, int flags);
int timerfd_settime(int ufd, int flags,
                    const struct itimerspec *utmr,
                    struct itimerspec *otmr);
int timerfd_gettime(int ufd, struct itimerspec *otmr);
The timerfd_create() API creates an un-programmed timerfd fd. The "clockid"
parameter can be either CLOCK_MONOTONIC or CLOCK_REALTIME.
The timerfd_settime() API gives new settings to the timerfd fd, optionally
retrieving the previous expiration time (in case the "otmr" parameter is not
NULL).
The time value specified in "utmr" is absolute if the TFD_TIMER_ABSTIME bit
is set in the "flags" parameter. Otherwise it's a relative time.
The timerfd_gettime() API returns the next expiration time of the timer, or
{0, 0} if the timerfd has not been set yet.
Like the previous timerfd API implementation, read(2) and poll(2) are
supported (with the same interface). Here's a simple test program I used to
exercise the new timerfd APIs:
http://www.xmailserver.org/timerfd-test2.c
[akpm@linux-foundation.org: coding-style cleanups]
[akpm@linux-foundation.org: fix ia64 build]
[akpm@linux-foundation.org: fix m68k build]
[akpm@linux-foundation.org: fix mips build]
[akpm@linux-foundation.org: fix alpha, arm, blackfin, cris, m68k, s390, sparc and sparc64 builds]
[heiko.carstens@de.ibm.com: fix s390]
[akpm@linux-foundation.org: fix powerpc build]
[akpm@linux-foundation.org: fix sparc64 more]
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Davide Libenzi <davidel@xmailserver.org>
Cc: Michael Kerrisk <mtk-manpages@gmx.net>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Signed-off-by: Heiko Carstens <heiko.carstens@de.ibm.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2008-02-05 14:27:26 +08:00
|
|
|
asmlinkage long sys_timerfd_create(int clockid, int flags);
|
|
|
|
asmlinkage long sys_timerfd_settime(int ufd, int flags,
|
|
|
|
const struct itimerspec __user *utmr,
|
|
|
|
struct itimerspec __user *otmr);
|
|
|
|
asmlinkage long sys_timerfd_gettime(int ufd, struct itimerspec __user *otmr);
|
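A minimal usage sketch of the three calls above, assuming the glibc wrappers from <sys/timerfd.h>:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#include <stdint.h>
#include <stdio.h>
#include <sys/timerfd.h>
#include <unistd.h>

int main(void)
{
        struct itimerspec its = {
                .it_value    = { .tv_sec = 1 },  /* first expiry in 1s */
                .it_interval = { .tv_sec = 1 },  /* then every second */
        };
        uint64_t expirations;
        int tfd = timerfd_create(CLOCK_MONOTONIC, 0);

        if (tfd == -1)
                return 1;
        /* Relative arming: no TFD_TIMER_ABSTIME in flags. */
        if (timerfd_settime(tfd, 0, &its, NULL) == -1)
                return 1;
        /* read(2) blocks until expiry and yields the expiration count. */
        if (read(tfd, &expirations, sizeof(expirations)) == sizeof(expirations))
                printf("timer fired %llu time(s)\n",
                       (unsigned long long)expirations);
        close(tfd);
        return 0;
}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~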
signal/timer/event: eventfd core
This is a very simple and light file descriptor, that can be used as event
wait/dispatch by userspace (both wait and dispatch) and by the kernel
(dispatch only). It can be used instead of pipe(2) in all cases where those
would simply be used to signal events. Its kernel overhead is much lower
than a pipe's, and it does not consume two fds. When used in the kernel, it can
offer an fd-bridge to enable, for example, functionalities like KAIO or
syslets/threadlets to signal to an fd the completion of certain operations.
But more generally, an eventfd can be used by the kernel to signal readiness,
in a POSIX poll/select way, of interfaces that would otherwise be incompatible
with it. The API is:
int eventfd(unsigned int count);
The eventfd API accepts an initial "count" parameter, and returns an eventfd
fd. It supports poll(2) (POLLIN, POLLOUT, POLLERR), read(2) and write(2).
The POLLIN flag is raised when the internal counter is greater than zero.
The POLLOUT flag is raised when at least a value of "1" can be written to the
internal counter.
The POLLERR flag is raised when an overflow in the counter value is detected.
The write(2) operation can never overflow the counter, since it blocks (unless
O_NONBLOCK is set, in which case -EAGAIN is returned).
But the eventfd_signal() function can do it, since it's supposed to not sleep
during its operation.
The read(2) function reads the __u64 counter value, and resets the internal
value to zero. If the value read is equal to (__u64) -1, an overflow happened
on the internal counter (due to 2^64 eventfd_signal() posts that have never
been retired - unlikely, but possible).
The write(2) call writes an __u64 count value, and adds it to the current
counter. The eventfd fd supports O_NONBLOCK also.
On the kernel side, we have:
struct file *eventfd_fget(int fd);
int eventfd_signal(struct file *file, unsigned int n);
The eventfd_fget() should be called to get a struct file* from an eventfd fd
(this is an fget() + check of f_op being an eventfd fops pointer).
The kernel can then call eventfd_signal() every time it wants to post an event
to userspace. The eventfd_signal() function can be called from any context.
An eventfd() simple test and bench is available here:
http://www.xmailserver.org/eventfd-bench.c
This is the eventfd-based version of pipetest-4 (pipe(2) based):
http://www.xmailserver.org/pipetest-4.c
Not that performance matters much in the eventfd case, but eventfd-bench
shows almost double the performance of pipetest-4.
[akpm@linux-foundation.org: fix i386 build]
[akpm@linux-foundation.org: add sys_eventfd to sys_ni.c]
Signed-off-by: Davide Libenzi <davidel@xmailserver.org>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2007-05-11 13:23:19 +08:00
|
|
|
asmlinkage long sys_eventfd(unsigned int count);
|
2008-07-24 12:29:25 +08:00
|
|
|
asmlinkage long sys_eventfd2(unsigned int count, int flags);
|
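A minimal counter round-trip, assuming the glibc eventfd() wrapper from <sys/eventfd.h> (which is expected to use sys_eventfd2 where available so that flags can be honoured):
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#include <stdint.h>
#include <stdio.h>
#include <sys/eventfd.h>
#include <unistd.h>

int main(void)
{
        uint64_t val = 3;
        int efd = eventfd(0, 0);        /* initial counter value of 0 */

        if (efd == -1)
                return 1;
        write(efd, &val, sizeof(val));  /* counter += 3 */
        val = 4;
        write(efd, &val, sizeof(val));  /* counter += 4, now 7 */
        /* read(2) returns the counter value and resets it to zero. */
        if (read(efd, &val, sizeof(val)) == sizeof(val))
                printf("counter was %llu\n", (unsigned long long)val);
        close(efd);
        return 0;
}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~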
shm: add memfd_create() syscall
memfd_create() is similar to mmap(MAP_ANON), but returns a file-descriptor
that you can pass to mmap(). It can support sealing and avoids any
connection to user-visible mount-points. Thus, it's not subject to quotas
on mounted file-systems, and can be used like malloc()'ed memory, but with
a file-descriptor to it.
memfd_create() returns the raw shmem file, so calls like ftruncate() can
be used to modify the underlying inode. Also calls like fstat() will
return proper information and mark the file as a regular file. If you want
sealing, you can specify MFD_ALLOW_SEALING. Otherwise, sealing is not
supported (like on all other regular files).
Compared to O_TMPFILE, it does not require a tmpfs mount-point and is not
subject to a filesystem size limit. It is still properly accounted to
memcg limits, though, and to the same overcommit or no-overcommit
accounting as all user memory.
Signed-off-by: David Herrmann <dh.herrmann@gmail.com>
Acked-by: Hugh Dickins <hughd@google.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Ryan Lortie <desrt@desrt.ca>
Cc: Lennart Poettering <lennart@poettering.net>
Cc: Daniel Mack <zonque@gmail.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-08-09 05:25:29 +08:00
|
|
|
asmlinkage long sys_memfd_create(const char __user *uname_ptr, unsigned int flags);
|
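A minimal sealing sketch, assuming a libc that exposes the memfd_create() wrapper and the F_ADD_SEALS fcntl constants (otherwise the raw syscall number would be needed):
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>
#include <sys/mman.h>
#include <unistd.h>

int main(void)
{
        const char msg[] = "hello";
        int fd = memfd_create("demo", MFD_ALLOW_SEALING);

        if (fd == -1)
                return 1;
        write(fd, msg, sizeof(msg));
        /* Seal against further writes and against adding more seals;
         * subsequent write(2) calls on the file now fail with EPERM. */
        if (fcntl(fd, F_ADD_SEALS, F_SEAL_WRITE | F_SEAL_SEAL) == -1)
                perror("F_ADD_SEALS");
        close(fd);
        return 0;
}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~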
2015-09-05 06:46:58 +08:00
|
|
|
asmlinkage long sys_userfaultfd(int flags);
|
sys_fallocate() implementation on i386, x86_64 and powerpc
fallocate() is a new system call being proposed here which will allow
applications to preallocate space to any file(s) in a file system.
Each file system implementation that wants to use this feature will need
to support an inode operation called ->fallocate().
Applications can use this feature to avoid fragmentation to a certain
level and thus get faster access speed. With preallocation, applications
also get a guarantee of space for particular file(s) - even if the
system later becomes full.
Currently, glibc provides an interface called posix_fallocate() which
can be used for a similar purpose. While this has the advantage of working
on all file systems, it is quite slow (since it writes zeroes to
each block that has to be preallocated). Without a doubt, file systems
can do this more efficiently within the kernel, by implementing
the proposed fallocate() system call. It is expected that
posix_fallocate() will be modified to call this new system call first
and, in case the kernel/filesystem does not implement it, should fall
back to the current implementation of writing zeroes to the new blocks.
ToDos:
1. Implementation on other architectures (other than i386, x86_64,
and ppc). Patches for s390(x) and ia64 are already available from
previous posts, but it was decided that they should be added later
once fallocate is in the mainline. Hence not including those patches
in this take.
2. Changes to glibc,
a) to support fallocate() system call
b) to make posix_fallocate() and posix_fallocate64() call fallocate()
Signed-off-by: Amit Arora <aarora@in.ibm.com>
2007-07-18 09:42:44 +08:00
|
|
|
asmlinkage long sys_fallocate(int fd, int mode, loff_t offset, loff_t len);
|
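A minimal preallocation sketch using the glibc fallocate() wrapper from <fcntl.h> (under _GNU_SOURCE):
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#define _GNU_SOURCE
#include <fcntl.h>
#include <unistd.h>

int main(void)
{
        int fd = open("prealloc.dat", O_CREAT | O_WRONLY, 0644);

        if (fd == -1)
                return 1;
        /* Reserve 16 MiB of disk space up front; mode 0 also extends
         * the file size (FALLOC_FL_KEEP_SIZE would not). */
        if (fallocate(fd, 0, 0, 16 << 20) == -1)
                return 1;
        close(fd);
        return 0;
}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~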
2009-01-14 21:13:55 +08:00
|
|
|
asmlinkage long sys_old_readdir(unsigned int, struct old_linux_dirent __user *, unsigned int);
|
2009-01-14 21:14:34 +08:00
|
|
|
asmlinkage long sys_pselect6(int, fd_set __user *, fd_set __user *,
|
|
|
|
fd_set __user *, struct timespec __user *,
|
|
|
|
void __user *);
|
|
|
|
asmlinkage long sys_ppoll(struct pollfd __user *, unsigned int,
|
|
|
|
struct timespec __user *, const sigset_t __user *,
|
|
|
|
size_t);
|
2010-05-27 21:41:40 +08:00
|
|
|
asmlinkage long sys_fanotify_init(unsigned int flags, unsigned int event_f_flags);
|
2009-12-18 10:24:26 +08:00
|
|
|
asmlinkage long sys_fanotify_mark(int fanotify_fd, unsigned int flags,
|
|
|
|
u64 mask, int fd,
|
|
|
|
const char __user *pathname);
|
2011-03-11 03:31:30 +08:00
|
|
|
asmlinkage long sys_syncfs(int fd);
|
2006-02-01 19:04:33 +08:00
|
|
|
|
2012-11-29 12:04:26 +08:00
|
|
|
asmlinkage long sys_fork(void);
|
|
|
|
asmlinkage long sys_vfork(void);
|
|
|
|
#ifdef CONFIG_CLONE_BACKWARDS
|
clone: support passing tls argument via C rather than pt_regs magic
clone has some of the quirkiest syscall handling in the kernel, with a
pile of special cases, historical curiosities, and architecture-specific
calling conventions. In particular, clone with CLONE_SETTLS accepts a
parameter "tls" that the C entry point completely ignores and some
assembly entry points overwrite; instead, the low-level arch-specific
code pulls the tls parameter out of the arch-specific register captured
as part of pt_regs on entry to the kernel. That's a massive hack, and
it makes the arch-specific code only work when called via the specific
existing syscall entry points; because of this hack, any new clone-like
system call would have to accept an identical tls argument in exactly
the same arch-specific position, rather than providing a unified system
call entry point across architectures.
The first patch allows architectures to handle the tls argument via
normal C parameter passing, if they opt in by selecting
HAVE_COPY_THREAD_TLS. The second patch makes 32-bit and 64-bit x86 opt
into this.
These two patches came out of the clone4 series, which isn't ready for
this merge window, but these first two cleanup patches were entirely
uncontroversial and have acks. I'd like to go ahead and submit these
two so that other architectures can begin building on top of this and
opting into HAVE_COPY_THREAD_TLS. However, I'm also happy to wait and
send these through the next merge window (along with v3 of clone4) if
anyone would prefer that.
This patch (of 2):
clone with CLONE_SETTLS accepts an argument to set the thread-local
storage area for the new thread. sys_clone declares an int argument
tls_val in the appropriate point in the argument list (based on the
various CLONE_BACKWARDS variants), but doesn't actually use or pass along
that argument. Instead, sys_clone calls do_fork, which calls
copy_process, which calls the arch-specific copy_thread, and copy_thread
pulls the corresponding syscall argument out of the pt_regs captured at
kernel entry (knowing what argument of clone that architecture passes tls
in).
Apart from being awful and inscrutable, that also only works because only
one code path into copy_thread can pass the CLONE_SETTLS flag, and that
code path comes from sys_clone with its architecture-specific
argument-passing order. This prevents introducing a new version of the
clone system call without propagating the same architecture-specific
position of the tls argument.
However, there's no reason to pull the argument out of pt_regs when
sys_clone could just pass it down via C function call arguments.
Introduce a new CONFIG_HAVE_COPY_THREAD_TLS for architectures to opt into,
and a new copy_thread_tls that accepts the tls parameter as an additional
unsigned long (syscall-argument-sized) argument. Change sys_clone's tls
argument to an unsigned long (which does not change the ABI), and pass
that down to copy_thread_tls.
Architectures that don't opt into copy_thread_tls will continue to ignore
the C argument to sys_clone in favor of the pt_regs captured at kernel
entry, and thus will be unable to introduce new versions of the clone
syscall.
Patch co-authored by Josh Triplett and Thiago Macieira.
Signed-off-by: Josh Triplett <josh@joshtriplett.org>
Acked-by: Andy Lutomirski <luto@kernel.org>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Thiago Macieira <thiago.macieira@intel.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-06-26 06:01:19 +08:00
|
|
|
asmlinkage long sys_clone(unsigned long, unsigned long, int __user *, unsigned long,
|
2012-11-29 12:04:26 +08:00
|
|
|
int __user *);
|
|
|
|
#else
|
2013-08-14 07:00:53 +08:00
|
|
|
#ifdef CONFIG_CLONE_BACKWARDS3
|
|
|
|
asmlinkage long sys_clone(unsigned long, unsigned long, int, int __user *,
|
2015-06-26 06:01:19 +08:00
|
|
|
int __user *, unsigned long);
|
2013-08-14 07:00:53 +08:00
|
|
|
#else
|
2012-11-29 12:04:26 +08:00
|
|
|
asmlinkage long sys_clone(unsigned long, unsigned long, int __user *,
|
2015-06-26 06:01:19 +08:00
|
|
|
int __user *, unsigned long);
|
2012-11-29 12:04:26 +08:00
|
|
|
#endif
|
2013-08-14 07:00:53 +08:00
|
|
|
#endif
|
2012-11-29 12:04:26 +08:00
|
|
|
|
2012-10-21 01:32:30 +08:00
|
|
|
asmlinkage long sys_execve(const char __user *filename,
|
|
|
|
const char __user *const __user *argv,
|
|
|
|
const char __user *const __user *envp);
|
2008-12-10 19:33:23 +08:00
|
|
|
|
perf: Do the big rename: Performance Counters -> Performance Events
Bye-bye Performance Counters, welcome Performance Events!
In the past few months the perfcounters subsystem has outgrown its
initial role of counting hardware events, and has become (and is
becoming) a much broader generic event enumeration, reporting, logging,
monitoring, analysis facility.
Naming its core object 'perf_counter' and naming the subsystem
'perfcounters' has become more and more of a misnomer. With pending
code like hw-breakpoints support the 'counter' name is less and
less appropriate.
All in one, we've decided to rename the subsystem to 'performance
events' and to propagate this rename through all fields, variables
and API names. (in an ABI compatible fashion)
The word 'event' is also a bit shorter than 'counter' - which makes
it slightly more convenient to write/handle as well.
Thanks goes to Stephane Eranian who first observed this misnomer and
suggested a rename.
User-space tooling and ABI compatibility is not affected - this patch
should be function-invariant. (Also, defconfigs were not touched to
keep the size down.)
This patch has been generated via the following script:
FILES=$(find * -type f | grep -vE 'oprofile|[^K]config')
sed -i \
-e 's/PERF_EVENT_/PERF_RECORD_/g' \
-e 's/PERF_COUNTER/PERF_EVENT/g' \
-e 's/perf_counter/perf_event/g' \
-e 's/nb_counters/nb_events/g' \
-e 's/swcounter/swevent/g' \
-e 's/tpcounter_event/tp_event/g' \
$FILES
for N in $(find . -name perf_counter.[ch]); do
M=$(echo $N | sed 's/perf_counter/perf_event/g')
mv $N $M
done
FILES=$(find . -name perf_event.*)
sed -i \
-e 's/COUNTER_MASK/REG_MASK/g' \
-e 's/COUNTER/EVENT/g' \
-e 's/\<event\>/event_id/g' \
-e 's/counter/event/g' \
-e 's/Counter/Event/g' \
$FILES
... to keep it as correct as possible. This script can also be
used by anyone who has pending perfcounters patches - it converts
a Linux kernel tree over to the new naming. We tried to time this
change to the point in time where the amount of pending patches
is the smallest: the end of the merge window.
Namespace clashes were fixed up in a preparatory patch - and some
stylistic fallout will be fixed up in a subsequent patch.
( NOTE: 'counters' are still the proper terminology when we deal
with hardware registers - and these sed scripts are a bit
over-eager in renaming them. I've undone some of that, but
in case there's something left where 'counter' would be
better than 'event' we can undo that on an individual basis
instead of touching an otherwise nicely automated patch. )
Suggested-by: Stephane Eranian <eranian@google.com>
Acked-by: Peter Zijlstra <a.p.zijlstra@chello.nl>
Acked-by: Paul Mackerras <paulus@samba.org>
Reviewed-by: Arjan van de Ven <arjan@linux.intel.com>
Cc: Mike Galbraith <efault@gmx.de>
Cc: Arnaldo Carvalho de Melo <acme@redhat.com>
Cc: Frederic Weisbecker <fweisbec@gmail.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Benjamin Herrenschmidt <benh@kernel.crashing.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Kyle McMartin <kyle@mcmartin.ca>
Cc: Martin Schwidefsky <schwidefsky@de.ibm.com>
Cc: "David S. Miller" <davem@davemloft.net>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: <linux-arch@vger.kernel.org>
LKML-Reference: <new-submission>
Signed-off-by: Ingo Molnar <mingo@elte.hu>
2009-09-21 18:02:48 +08:00
|
|
|
asmlinkage long sys_perf_event_open(
|
|
|
|
struct perf_event_attr __user *attr_uptr,
|
2009-03-04 17:36:51 +08:00
|
|
|
pid_t pid, int cpu, int group_fd, unsigned long flags);
|
2009-12-01 06:37:04 +08:00
|
|
|
|
|
|
|
asmlinkage long sys_mmap_pgoff(unsigned long addr, unsigned long len,
|
|
|
|
unsigned long prot, unsigned long flags,
|
|
|
|
unsigned long fd, unsigned long pgoff);
|
2010-03-11 07:21:15 +08:00
|
|
|
asmlinkage long sys_old_mmap(struct mmap_arg_struct __user *arg);
|
2011-01-29 21:13:26 +08:00
|
|
|
asmlinkage long sys_name_to_handle_at(int dfd, const char __user *name,
|
|
|
|
struct file_handle __user *handle,
|
|
|
|
int __user *mnt_id, int flag);
|
2011-01-29 21:13:26 +08:00
|
|
|
asmlinkage long sys_open_by_handle_at(int mountdirfd,
|
|
|
|
struct file_handle __user *handle,
|
|
|
|
int flags);
|
2011-05-12 05:06:58 +08:00
|
|
|
asmlinkage long sys_setns(int fd, int nstype);
|
2011-11-01 08:06:39 +08:00
|
|
|
asmlinkage long sys_process_vm_readv(pid_t pid,
|
|
|
|
const struct iovec __user *lvec,
|
|
|
|
unsigned long liovcnt,
|
|
|
|
const struct iovec __user *rvec,
|
|
|
|
unsigned long riovcnt,
|
|
|
|
unsigned long flags);
|
|
|
|
asmlinkage long sys_process_vm_writev(pid_t pid,
|
|
|
|
const struct iovec __user *lvec,
|
|
|
|
unsigned long liovcnt,
|
|
|
|
const struct iovec __user *rvec,
|
|
|
|
unsigned long riovcnt,
|
|
|
|
unsigned long flags);
|
|
|
|
|
2012-06-01 07:26:44 +08:00
|
|
|
asmlinkage long sys_kcmp(pid_t pid1, pid_t pid2, int type,
|
|
|
|
unsigned long idx1, unsigned long idx2);
|
2012-10-22 15:39:41 +08:00
|
|
|
asmlinkage long sys_finit_module(int fd, const char __user *uargs, int flags);
|
2014-06-26 07:08:24 +08:00
|
|
|
asmlinkage long sys_seccomp(unsigned int op, unsigned int flags,
|
|
|
|
const char __user *uargs);
|
random: introduce getrandom(2) system call
The getrandom(2) system call was requested by the LibreSSL Portable
developers. It is analogous to the getentropy(2) system call in
OpenBSD.
The rationale of this system call is to provide resilience against
file descriptor exhaustion attacks, where the attacker consumes all
available file descriptors, forcing the use of the fallback code where
/dev/[u]random is not available. Since the fallback code is often not
well-tested, it is better to eliminate this potential failure mode
entirely.
The other feature provided by this new system call is the ability to
request randomness from the /dev/urandom entropy pool, but to block
until at least 128 bits of entropy have been accumulated in the
/dev/urandom entropy pool. Historically, the emphasis in the
/dev/urandom development has been to ensure that urandom pool is
initialized as quickly as possible after system boot, and preferably
before the init scripts start execution.
This is because changing /dev/urandom reads to block represents an
interface change that could potentially break userspace which is not
acceptable. In practice, on most x86 desktop and server systems, the
entropy pool can be initialized before it is needed (and
in modern kernels, we will printk a warning message if not). However,
on an embedded system, this may not be the case. And so with this new
interface, we can provide the functionality of blocking until the
urandom pool has been initialized. Any userspace program which uses
this new functionality must take care to ensure that, if it is used
during the boot process, it will not cause the init scripts or
other portions of the system startup to hang indefinitely.
SYNOPSIS
#include <linux/random.h>
int getrandom(void *buf, size_t buflen, unsigned int flags);
DESCRIPTION
The system call getrandom() fills the buffer pointed to by buf
with up to buflen random bytes which can be used to seed user
space random number generators (i.e., DRBG's) or for other
cryptographic uses. It should not be used for Monte Carlo
simulations or other programs/algorithms which are doing
probabilistic sampling.
If the GRND_RANDOM flags bit is set, then draw from the
/dev/random pool instead of the /dev/urandom pool. The
/dev/random pool is limited based on the entropy that can be
obtained from environmental noise, so if there is insufficient
entropy, the requested number of bytes may not be returned.
If there is no entropy available at all, getrandom(2) will
either block, or return an error with errno set to EAGAIN if
the GRND_NONBLOCK bit is set in flags.
If the GRND_RANDOM bit is not set, then the /dev/urandom pool
will be used. Unlike using read(2) to fetch data from
/dev/urandom, if the urandom pool has not been sufficiently
initialized, getrandom(2) will block (or return -1 with the
errno set to EAGAIN if the GRND_NONBLOCK bit is set in flags).
The getentropy(2) system call in OpenBSD can be emulated using
the following function:
int getentropy(void *buf, size_t buflen)
{
        int ret;

        if (buflen > 256)
                goto failure;
        ret = getrandom(buf, buflen, 0);
        if (ret < 0)
                return ret;
        if (ret == buflen)
                return 0;
failure:
        errno = EIO;
        return -1;
}
RETURN VALUE
On success, the number of bytes that were filled in the buf is
returned. This may not be all the bytes requested by the
caller via buflen if insufficient entropy was present in the
/dev/random pool, or if the system call was interrupted by a
signal.
On error, -1 is returned, and errno is set appropriately.
ERRORS
EINVAL An invalid flag was passed to getrandom(2)
EFAULT buf is outside the accessible address space.
EAGAIN The requested entropy was not available, and
getrandom(2) would have blocked if the
GRND_NONBLOCK flag was not set.
EINTR While blocked waiting for entropy, the call was
interrupted by a signal handler; see the description
of how interrupted read(2) calls on "slow" devices
are handled with and without the SA_RESTART flag
in the signal(7) man page.
NOTES
For small requests (buflen <= 256) getrandom(2) will not
return EINTR when reading from the urandom pool once the
entropy pool has been initialized, and it will return all of
the bytes that have been requested. This is the recommended
way to use getrandom(2), and is designed for compatibility
with OpenBSD's getentropy() system call.
However, if you are using GRND_RANDOM, then getrandom(2) may
block until the entropy accounting determines that sufficient
environmental noise has been gathered such that getrandom(2)
will be operating as a NRBG instead of a DRBG for those people
who are working in the NIST SP 800-90 regime. Since it may
block for a long time, these guarantees do *not* apply. The
user may want to interrupt a hanging process using a signal,
so blocking until all of the requested bytes are returned
would be unfriendly.
For this reason, the user of getrandom(2) MUST always check
the return value, in case it returns some error, or if fewer
bytes than requested were returned. In the case of
!GRND_RANDOM and a small request, the latter should never
happen, but careful userspace code (and all crypto code
should be careful) should check for this anyway!
Finally, unless you are doing long-term key generation (and
perhaps not even then), you probably shouldn't be using
GRND_RANDOM. The cryptographic algorithms used for
/dev/urandom are quite conservative, and so should be
sufficient for all purposes. The disadvantages of GRND_RANDOM
are that it can block, and the increased complexity required to
deal with partially fulfilled getrandom(2) requests.
Signed-off-by: Theodore Ts'o <tytso@mit.edu>
Reviewed-by: Zach Brown <zab@zabbo.net>
2014-07-17 16:13:05 +08:00
|
|
|
asmlinkage long sys_getrandom(char __user *buf, size_t count,
|
|
|
|
unsigned int flags);
|
2014-09-26 15:16:58 +08:00
|
|
|
asmlinkage long sys_bpf(int cmd, union bpf_attr *attr, unsigned int size);
|
syscalls: implement execveat() system call
This patchset adds execveat(2) for x86, and is derived from Meredydd
Luff's patch from Sept 2012 (https://lkml.org/lkml/2012/9/11/528).
The primary aim of adding an execveat syscall is to allow an
implementation of fexecve(3) that does not rely on the /proc filesystem,
at least for executables (rather than scripts). The current glibc version
of fexecve(3) is implemented via /proc, which causes problems in sandboxed
or otherwise restricted environments.
Given the desire for a /proc-free fexecve() implementation, HPA suggested
(https://lkml.org/lkml/2006/7/11/556) that an execveat(2) syscall would be
an appropriate generalization.
Also, having a new syscall means that it can take a flags argument without
back-compatibility concerns. The current implementation just defines the
AT_EMPTY_PATH and AT_SYMLINK_NOFOLLOW flags, but other flags could be
added in future -- for example, flags for new namespaces (as suggested at
https://lkml.org/lkml/2006/7/11/474).
Related history:
- https://lkml.org/lkml/2006/12/27/123 is an example of someone
realizing that fexecve() is likely to fail in a chroot environment.
- http://bugs.debian.org/cgi-bin/bugreport.cgi?bug=514043 covered
documenting the /proc requirement of fexecve(3) in its manpage, to
"prevent other people from wasting their time".
- https://bugzilla.redhat.com/show_bug.cgi?id=241609 described a
problem where a process that did setuid() could not fexecve()
because it no longer had access to /proc/self/fd; this has since
been fixed.
This patch (of 4):
Add a new execveat(2) system call. execveat() is to execve() as openat()
is to open(): it takes a file descriptor that refers to a directory, and
resolves the filename relative to that.
In addition, if the filename is empty and AT_EMPTY_PATH is specified,
execveat() executes the file to which the file descriptor refers. This
replicates the functionality of fexecve(), which is a system call in other
UNIXen, but in Linux glibc it depends on opening "/proc/self/fd/<fd>" (and
so relies on /proc being mounted).
The filename fed to the executed program as argv[0] (or the name of the
script fed to a script interpreter) will be of the form "/dev/fd/<fd>"
(for an empty filename) or "/dev/fd/<fd>/<filename>", effectively
reflecting how the executable was found. This does however mean that
execution of a script in a /proc-less environment won't work; also, script
execution via an O_CLOEXEC file descriptor fails (as the file will not be
accessible after exec).
Based on patches by Meredydd Luff.
Signed-off-by: David Drysdale <drysdale@google.com>
Cc: Meredydd Luff <meredydd@senatehouse.org>
Cc: Shuah Khan <shuah.kh@samsung.com>
Cc: "Eric W. Biederman" <ebiederm@xmission.com>
Cc: Andy Lutomirski <luto@amacapital.net>
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: "H. Peter Anvin" <hpa@zytor.com>
Cc: Kees Cook <keescook@chromium.org>
Cc: Arnd Bergmann <arnd@arndb.de>
Cc: Rich Felker <dalias@aerifal.cx>
Cc: Christoph Hellwig <hch@infradead.org>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-12-13 08:57:29 +08:00
|
|
|
|
|
|
|
asmlinkage long sys_execveat(int dfd, const char __user *filename,
|
|
|
|
const char __user *const __user *argv,
|
|
|
|
const char __user *const __user *envp, int flags);
|
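A minimal fexecve(3)-style sketch built on the raw syscall; __NR_execveat being defined for the target architecture is an assumption here, and fd_exec() is just a hypothetical helper name:
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/syscall.h>
#include <unistd.h>

/* Hypothetical helper: execute the program a file descriptor refers
 * to, without touching /proc (an fexecve(3)-style operation). */
static int fd_exec(int fd, char *const argv[], char *const envp[])
{
        return syscall(__NR_execveat, fd, "", argv, envp, AT_EMPTY_PATH);
}

int main(void)
{
        char *argv[] = { "echo", "hello from execveat", NULL };
        char *envp[] = { NULL };
        /* O_CLOEXEC is fine for binaries; as noted above, script
         * execution through an O_CLOEXEC descriptor would fail. */
        int fd = open("/bin/echo", O_PATH | O_CLOEXEC);

        if (fd == -1)
                return 1;
        fd_exec(fd, argv, envp);
        return 1;       /* only reached if execveat failed */
}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~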
|
|
|
|
sys_membarrier(): system-wide memory barrier (generic, x86)
Here is an implementation of a new system call, sys_membarrier(), which
executes a memory barrier on all threads running on the system. It is
implemented by calling synchronize_sched(). It can be used to
distribute the cost of user-space memory barriers asymmetrically by
transforming pairs of memory barriers into pairs consisting of
sys_membarrier() and a compiler barrier. For synchronization primitives
that distinguish between read-side and write-side (e.g. userspace RCU
[1], rwlocks), the read-side can be accelerated significantly by moving
the bulk of the memory barrier overhead to the write-side.
The existing applications of which I am aware that would be improved by
this system call are as follows:
* Through Userspace RCU library (http://urcu.so)
- DNS server (Knot DNS) https://www.knot-dns.cz/
- Network sniffer (http://netsniff-ng.org/)
- Distributed object storage (https://sheepdog.github.io/sheepdog/)
- User-space tracing (http://lttng.org)
- Network storage system (https://www.gluster.org/)
- Virtual routers (https://events.linuxfoundation.org/sites/events/files/slides/DPDK_RCU_0MQ.pdf)
- Financial software (https://lkml.org/lkml/2015/3/23/189)
Those projects use RCU in userspace to increase read-side speed and
scalability compared to locking. Especially in the case of RCU used by
libraries, sys_membarrier can speed up the read-side by moving the bulk of
the memory barrier cost to synchronize_rcu().
* Direct users of sys_membarrier
- core dotnet garbage collector (https://github.com/dotnet/coreclr/issues/198)
Microsoft core dotnet GC developers are planning to use the mprotect()
side-effect of issuing memory barriers through IPIs as a way to implement
Windows FlushProcessWriteBuffers() on Linux. They are referring to
sys_membarrier in their github thread, specifically stating that
sys_membarrier() is what they are looking for.
To explain the benefit of this scheme, let's introduce two example threads:
Thread A (non-frequent, e.g. executing liburcu synchronize_rcu())
Thread B (frequent, e.g. executing liburcu
rcu_read_lock()/rcu_read_unlock())
In a scheme where all smp_mb() in thread A are ordering memory accesses
with respect to smp_mb() present in Thread B, we can change each
smp_mb() within Thread A into calls to sys_membarrier() and each
smp_mb() within Thread B into compiler barriers "barrier()".
Before the change, we had, for each smp_mb() pair:
Thread A                    Thread B
previous mem accesses       previous mem accesses
smp_mb()                    smp_mb()
following mem accesses      following mem accesses
After the change, these pairs become:
Thread A                    Thread B
prev mem accesses           prev mem accesses
sys_membarrier()            barrier()
follow mem accesses         follow mem accesses
As we can see, there are two possible scenarios: either Thread B memory
accesses do not happen concurrently with Thread A accesses (1), or they
do (2).
1) Non-concurrent Thread A vs Thread B accesses:
Thread A                    Thread B
prev mem accesses
sys_membarrier()
follow mem accesses
                            prev mem accesses
                            barrier()
                            follow mem accesses
In this case, thread B accesses will be weakly ordered. This is OK,
because at that point, thread A is not particularly interested in
ordering them with respect to its own accesses.
2) Concurrent Thread A vs Thread B accesses
Thread A                    Thread B
prev mem accesses           prev mem accesses
sys_membarrier()            barrier()
follow mem accesses         follow mem accesses
In this case, thread B accesses, which are ensured to be in program
order thanks to the compiler barrier, will be "upgraded" to full
smp_mb() by synchronize_sched().
* Benchmarks
On Intel Xeon E5405 (8 cores)
(one thread is calling sys_membarrier, the other 7 threads are busy
looping)
1000 non-expedited sys_membarrier calls in 33s = 33 milliseconds/call.
* User-space user of this system call: Userspace RCU library
Both the signal-based and the sys_membarrier userspace RCU schemes
permit us to remove the memory barrier from the userspace RCU
rcu_read_lock() and rcu_read_unlock() primitives, thus significantly
accelerating them. These memory barriers are replaced by compiler
barriers on the read-side, and all matching memory barriers on the
write-side are turned into an invocation of a memory barrier on all
active threads in the process. By letting the kernel perform this
synchronization rather than dumbly sending a signal to every thread of
the process (as we currently do), we diminish the number of unnecessary wake
ups and only issue the memory barriers on active threads. Non-running
threads do not need to execute such a barrier anyway, because these are
implied by the scheduler context switches.
Results in liburcu:
Operations in 10s, 6 readers, 2 writers:
memory barriers in reader: 1701557485 reads, 2202847 writes
signal-based scheme: 9830061167 reads, 6700 writes
sys_membarrier: 9952759104 reads, 425 writes
sys_membarrier (dyn. check): 7970328887 reads, 425 writes
The dynamic sys_membarrier availability check adds some overhead to
the read-side compared to the signal-based scheme, but besides that,
sys_membarrier slightly outperforms the signal-based scheme. However,
this non-expedited sys_membarrier implementation has a much slower grace
period than signal and memory barrier schemes.
Besides diminishing the number of wake-ups, one major advantage of the
membarrier system call over the signal-based scheme is that it does not
need to reserve a signal. This plays much more nicely with libraries,
and with processes injected into for tracing purposes, for which we
cannot expect that signals will be unused by the application.
An expedited version of this system call can be added later on to speed
up the grace period. Its implementation will likely depend on reading
the cpu_curr()->mm without holding each CPU's rq lock.
This patch adds the system call to x86 and to asm-generic.
[1] http://urcu.so
membarrier(2) man page:
MEMBARRIER(2) Linux Programmer's Manual MEMBARRIER(2)
NAME
membarrier - issue memory barriers on a set of threads
SYNOPSIS
#include <linux/membarrier.h>
int membarrier(int cmd, int flags);
DESCRIPTION
The cmd argument is one of the following:
MEMBARRIER_CMD_QUERY
Query the set of supported commands. It returns a bitmask of
supported commands.
MEMBARRIER_CMD_SHARED
Execute a memory barrier on all threads running on the system.
Upon return from system call, the caller thread is ensured that
all running threads have passed through a state where all memory
accesses to user-space addresses match program order between
entry to and return from the system call (non-running threads
are de facto in such a state). This covers threads from all
processes running on the system. This command returns 0.
The flags argument needs to be 0; it is reserved for future extensions.
All memory accesses performed in program order from each targeted
thread are guaranteed to be ordered with respect to sys_membarrier(). If
we use the semantic "barrier()" to represent a compiler barrier forcing
memory accesses to be performed in program order across the barrier,
and smp_mb() to represent explicit memory barriers forcing full memory
ordering across the barrier, we have the following ordering table for
each pair of barrier(), sys_membarrier() and smp_mb():
The pair ordering is detailed as (O: ordered, X: not ordered):
                         barrier()   smp_mb()   sys_membarrier()
barrier()                    X           X              O
smp_mb()                     X           O              O
sys_membarrier()             O           O              O
RETURN VALUE
On success, these system calls return zero. On error, -1 is returned,
and errno is set appropriately. For a given command, with flags
argument set to 0, this system call is guaranteed to always return the
same value until reboot.
ERRORS
ENOSYS System call is not implemented.
EINVAL Invalid arguments.
Linux 2015-04-15 MEMBARRIER(2)
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Nicholas Miell <nmiell@comcast.net>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Alan Cox <gnomes@lxorguk.ukuu.org.uk>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Pranith Kumar <bobby.prani@gmail.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-09-12 04:07:39 +08:00
|
|
|
asmlinkage long sys_membarrier(int cmd, int flags);
|
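A minimal query-then-barrier sketch using the raw syscall (no libc wrapper is assumed; __NR_membarrier must be defined for the target architecture):
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#include <linux/membarrier.h>
#include <stdio.h>
#include <sys/syscall.h>
#include <unistd.h>

static int membarrier(int cmd, int flags)
{
        return syscall(__NR_membarrier, cmd, flags);
}

int main(void)
{
        int supported = membarrier(MEMBARRIER_CMD_QUERY, 0);

        if (supported == -1)
                return 1;       /* e.g. ENOSYS on an older kernel */
        if (supported & MEMBARRIER_CMD_SHARED) {
                /* Full memory barrier on every running thread. */
                membarrier(MEMBARRIER_CMD_SHARED, 0);
        }
        printf("supported commands: 0x%x\n", supported);
        return 0;
}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~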
2015-11-11 05:53:30 +08:00
|
|
|
asmlinkage long sys_copy_file_range(int fd_in, loff_t __user *off_in,
|
|
|
|
int fd_out, loff_t __user *off_out,
|
|
|
|
size_t len, unsigned int flags);
|
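A minimal copy loop via the raw syscall (assuming __NR_copy_file_range is defined; NULL offset pointers make the kernel use and advance the files' own offsets):
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#define _GNU_SOURCE
#include <fcntl.h>
#include <sys/syscall.h>
#include <unistd.h>

int main(void)
{
        int in = open("src.dat", O_RDONLY);
        int out = open("dst.dat", O_CREAT | O_WRONLY | O_TRUNC, 0644);
        long n;

        if (in == -1 || out == -1)
                return 1;
        /* The kernel may copy less than asked, so loop until EOF (0). */
        do {
                n = syscall(__NR_copy_file_range, in, NULL, out, NULL,
                            (size_t)(1 << 20), 0u);
        } while (n > 0);
        close(in);
        close(out);
        return n == 0 ? 0 : 1;
}
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~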
sys_membarrier(): system-wide memory barrier (generic, x86)
Here is an implementation of a new system call, sys_membarrier(), which
executes a memory barrier on all threads running on the system. It is
implemented by calling synchronize_sched(). It can be used to
distribute the cost of user-space memory barriers asymmetrically by
transforming pairs of memory barriers into pairs consisting of
sys_membarrier() and a compiler barrier. For synchronization primitives
that distinguish between read-side and write-side (e.g. userspace RCU
[1], rwlocks), the read-side can be accelerated significantly by moving
the bulk of the memory barrier overhead to the write-side.
The existing applications of which I am aware that would be improved by
this system call are as follows:
* Through Userspace RCU library (http://urcu.so)
- DNS server (Knot DNS) https://www.knot-dns.cz/
- Network sniffer (http://netsniff-ng.org/)
- Distributed object storage (https://sheepdog.github.io/sheepdog/)
- User-space tracing (http://lttng.org)
- Network storage system (https://www.gluster.org/)
- Virtual routers (https://events.linuxfoundation.org/sites/events/files/slides/DPDK_RCU_0MQ.pdf)
- Financial software (https://lkml.org/lkml/2015/3/23/189)
Those projects use RCU in userspace to increase read-side speed and
scalability compared to locking. Especially in the case of RCU used by
libraries, sys_membarrier can speed up the read-side by moving the bulk of
the memory barrier cost to synchronize_rcu().
* Direct users of sys_membarrier
- core dotnet garbage collector (https://github.com/dotnet/coreclr/issues/198)
Microsoft core dotnet GC developers are planning to use the mprotect()
side-effect of issuing memory barriers through IPIs as a way to implement
Windows FlushProcessWriteBuffers() on Linux. They are referring to
sys_membarrier in their github thread, specifically stating that
sys_membarrier() is what they are looking for.
To explain the benefit of this scheme, let's introduce two example threads:
Thread A (non-frequent, e.g. executing liburcu synchronize_rcu())
Thread B (frequent, e.g. executing liburcu
rcu_read_lock()/rcu_read_unlock())
In a scheme where all smp_mb() in thread A are ordering memory accesses
with respect to smp_mb() present in Thread B, we can change each
smp_mb() within Thread A into calls to sys_membarrier() and each
smp_mb() within Thread B into compiler barriers "barrier()".
Before the change, we had, for each smp_mb() pairs:
Thread A Thread B
previous mem accesses previous mem accesses
smp_mb() smp_mb()
following mem accesses following mem accesses
After the change, these pairs become:
Thread A Thread B
prev mem accesses prev mem accesses
sys_membarrier() barrier()
follow mem accesses follow mem accesses
As we can see, there are two possible scenarios: either Thread B memory
accesses do not happen concurrently with Thread A accesses (1), or they
do (2).
1) Non-concurrent Thread A vs Thread B accesses:
Thread A Thread B
prev mem accesses
sys_membarrier()
follow mem accesses
prev mem accesses
barrier()
follow mem accesses
In this case, thread B accesses will be weakly ordered. This is OK,
because at that point, thread A is not particularly interested in
ordering them with respect to its own accesses.
2) Concurrent Thread A vs Thread B accesses
Thread A Thread B
prev mem accesses prev mem accesses
sys_membarrier() barrier()
follow mem accesses follow mem accesses
In this case, thread B accesses, which are ensured to be in program
order thanks to the compiler barrier, will be "upgraded" to full
smp_mb() by synchronize_sched().
* Benchmarks
On Intel Xeon E5405 (8 cores)
(one thread is calling sys_membarrier, the other 7 threads are busy
looping)
1000 non-expedited sys_membarrier calls in 33s =3D 33 milliseconds/call.
* User-space user of this system call: Userspace RCU library
Both the signal-based and the sys_membarrier userspace RCU schemes
permit us to remove the memory barrier from the userspace RCU
rcu_read_lock() and rcu_read_unlock() primitives, thus significantly
accelerating them. These memory barriers are replaced by compiler
barriers on the read-side, and all matching memory barriers on the
write-side are turned into an invocation of a memory barrier on all
active threads in the process. By letting the kernel perform this
synchronization rather than dumbly sending a signal to every process
threads (as we currently do), we diminish the number of unnecessary wake
ups and only issue the memory barriers on active threads. Non-running
threads do not need to execute such barrier anyway, because these are
implied by the scheduler context switches.
Results in liburcu:
Operations in 10s, 6 readers, 2 writers:
memory barriers in reader: 1701557485 reads, 2202847 writes
signal-based scheme: 9830061167 reads, 6700 writes
sys_membarrier: 9952759104 reads, 425 writes
sys_membarrier (dyn. check): 7970328887 reads, 425 writes
The dynamic sys_membarrier availability check adds some overhead to
the read-side compared to the signal-based scheme, but besides that,
sys_membarrier slightly outperforms the signal-based scheme. However,
this non-expedited sys_membarrier implementation has a much slower grace
period than signal and memory barrier schemes.
Besides diminishing the number of wake-ups, one major advantage of the
membarrier system call over the signal-based scheme is that it does not
need to reserve a signal. This plays much more nicely with libraries,
and with processes injected into for tracing purposes, for which we
cannot expect that signals will be unused by the application.
An expedited version of this system call can be added later on to speed
up the grace period. Its implementation will likely depend on reading
the cpu_curr()->mm without holding each CPU's rq lock.
This patch adds the system call to x86 and to asm-generic.
[1] http://urcu.so
membarrier(2) man page:
MEMBARRIER(2) Linux Programmer's Manual MEMBARRIER(2)
NAME
membarrier - issue memory barriers on a set of threads
SYNOPSIS
#include <linux/membarrier.h>
int membarrier(int cmd, int flags);
DESCRIPTION
The cmd argument is one of the following:
MEMBARRIER_CMD_QUERY
Query the set of supported commands. It returns a bitmask of
supported commands.
MEMBARRIER_CMD_SHARED
Execute a memory barrier on all threads running on the system.
Upon return from system call, the caller thread is ensured that
all running threads have passed through a state where all memory
accesses to user-space addresses match program order between
entry to and return from the system call (non-running threads
are de facto in such a state). This covers threads from all pro=E2=80=90
cesses running on the system. This command returns 0.
The flags argument needs to be 0. For future extensions.
All memory accesses performed in program order from each targeted
thread is guaranteed to be ordered with respect to sys_membarrier(). If
we use the semantic "barrier()" to represent a compiler barrier forcing
memory accesses to be performed in program order across the barrier,
and smp_mb() to represent explicit memory barriers forcing full memory
ordering across the barrier, we have the following ordering table for
each pair of barrier(), sys_membarrier() and smp_mb():
The pair ordering is detailed as (O: ordered, X: not ordered):
                 barrier()   smp_mb()   sys_membarrier()
barrier()            X           X             O
smp_mb()             X           O             O
sys_membarrier()     O           O             O
RETURN VALUE
On success, this system call returns zero. On error, -1 is returned,
and errno is set appropriately. For a given command, with the flags
argument set to 0, this system call is guaranteed to always return
the same value until reboot.
ERRORS
ENOSYS System call is not implemented.
EINVAL Invalid arguments.
Linux 2015-04-15 MEMBARRIER(2)
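For illustration only (not part of this patch), a minimal userspace
invocation of the call documented above might look as follows. It
assumes a kernel with this patch applied, the MEMBARRIER_CMD_*
constants from <linux/membarrier.h>, and __NR_membarrier from
<sys/syscall.h>; the membarrier() wrapper is our own, since libc
provides none.

	#include <linux/membarrier.h>
	#include <sys/syscall.h>
	#include <unistd.h>
	#include <stdio.h>

	static int membarrier(int cmd, int flags)
	{
		return syscall(__NR_membarrier, cmd, flags);
	}

	int main(void)
	{
		int mask = membarrier(MEMBARRIER_CMD_QUERY, 0);

		if (mask < 0 || !(mask & MEMBARRIER_CMD_SHARED)) {
			fprintf(stderr, "MEMBARRIER_CMD_SHARED unsupported\n");
			return 1;
		}

		/* Write-side "heavy" barrier: all running threads pass
		 * through a full memory barrier before this returns. */
		if (membarrier(MEMBARRIER_CMD_SHARED, 0) < 0) {
			perror("membarrier");
			return 1;
		}
		return 0;
	}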
Signed-off-by: Mathieu Desnoyers <mathieu.desnoyers@efficios.com>
Reviewed-by: Paul E. McKenney <paulmck@linux.vnet.ibm.com>
Reviewed-by: Josh Triplett <josh@joshtriplett.org>
Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com>
Cc: Steven Rostedt <rostedt@goodmis.org>
Cc: Nicholas Miell <nmiell@comcast.net>
Cc: Ingo Molnar <mingo@redhat.com>
Cc: Alan Cox <gnomes@lxorguk.ukuu.org.uk>
Cc: Lai Jiangshan <laijs@cn.fujitsu.com>
Cc: Stephen Hemminger <stephen@networkplumber.org>
Cc: Thomas Gleixner <tglx@linutronix.de>
Cc: Peter Zijlstra <peterz@infradead.org>
Cc: David Howells <dhowells@redhat.com>
Cc: Pranith Kumar <bobby.prani@gmail.com>
Cc: Michael Kerrisk <mtk.manpages@gmail.com>
Cc: Shuah Khan <shuahkh@osg.samsung.com>
Signed-off-by: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-09-12 04:07:39 +08:00
|
|
|
|
2015-11-06 10:51:33 +08:00
|
|
|
asmlinkage long sys_mlock2(unsigned long start, size_t len, int flags);
|
|
|
|
|
2016-07-30 00:30:18 +08:00
|
|
|
asmlinkage long sys_pkey_mprotect(unsigned long start, size_t len,
|
|
|
|
unsigned long prot, int pkey);
|
|
|
|
asmlinkage long sys_pkey_alloc(unsigned long flags, unsigned long init_val);
|
|
|
|
asmlinkage long sys_pkey_free(int pkey);
|
statx: Add a system call to make enhanced file info available
Add a system call to make extended file information available, including
file creation and some attribute flags where available through the
underlying filesystem.
The getattr inode operation is altered to take two additional arguments: a
u32 request_mask and an unsigned int flags that indicate the
synchronisation mode. This change is propagated to the vfs_getattr*()
functions.
Functions like vfs_stat() are now inline wrappers around new functions
vfs_statx() and vfs_statx_fd() to reduce stack usage.
========
OVERVIEW
========
The idea was initially proposed as a set of xattrs that could be retrieved
with getxattr(), but the general preference proved to be for a new syscall
with an extended stat structure.
A number of requests were gathered for features to be included. The
following have been included:
(1) Make the fields a consistent size on all arches and make them large.
(2) Spare space, request flags and information flags are provided for
future expansion.
(3) Better support for the y2038 problem [Arnd Bergmann] (tv_sec is an
__s64).
(4) Creation time: The SMB protocol carries the creation time, which could
be exported by Samba, which will in turn help CIFS make use of
FS-Cache as that can be used for coherency data (stx_btime).
This is also specified in NFSv4 as a recommended attribute and could
be exported by NFSD [Steve French].
(5) Lightweight stat: Ask for just those details of interest, and allow a
netfs (such as NFS) to approximate anything not of interest, possibly
without going to the server [Trond Myklebust, Ulrich Drepper, Andreas
Dilger] (AT_STATX_DONT_SYNC).
(6) Heavyweight stat: Force a netfs to go to the server, even if it thinks
its cached attributes are up to date [Trond Myklebust]
(AT_STATX_FORCE_SYNC).
And the following have been left out for future extension:
(7) Data version number: Could be used by userspace NFS servers [Aneesh
Kumar].
Can also be used to modify fill_post_wcc() in NFSD which retrieves
i_version directly, but has just called vfs_getattr(). It could get
it from the kstat struct if it used vfs_xgetattr() instead.
(There's disagreement on the exact semantics of a single field, since
not all filesystems do this the same way).
(8) BSD stat compatibility: Including more fields from the BSD stat such
as creation time (st_btime) and inode generation number (st_gen)
[Jeremy Allison, Bernd Schubert].
(9) Inode generation number: Useful for FUSE and userspace NFS servers
[Bernd Schubert].
(This was asked for but later deemed unnecessary with the
open-by-handle capability available and caused disagreement as to
whether it's a security hole or not).
(10) Extra coherency data may be useful in making backups [Andreas Dilger].
(No particular data were offered, but things like last backup
timestamp, the data version number and the DOS archive bit would come
into this category).
(11) Allow the filesystem to indicate what it can/cannot provide: A
filesystem can now say it doesn't support a standard stat feature if
that isn't available, so if, for instance, inode numbers or UIDs don't
exist or are fabricated locally...
(This requires a separate system call - I have an fsinfo() call idea
for this).
(12) Store a 16-byte volume ID in the superblock that can be returned in
struct xstat [Steve French].
(Deferred to fsinfo).
(13) Include granularity fields in the time data to indicate the
granularity of each of the times (NFSv4 time_delta) [Steve French].
(Deferred to fsinfo).
(14) FS_IOC_GETFLAGS value. These could be translated to BSD's st_flags.
Note that the Linux IOC flags are a mess and filesystems such as Ext4
define flags that aren't in linux/fs.h, so translation in the kernel
may be a necessity (or, possibly, we provide the filesystem type too).
(Some attributes are made available in stx_attributes, but the general
feeling was that the IOC flags were too ext[234]-specific and shouldn't
be exposed through statx this way).
(15) Mask of features available on file (eg: ACLs, seclabel) [Brad Boyer,
Michael Kerrisk].
(Deferred, probably to fsinfo. Finding out if there's an ACL or
seclabel might require extra filesystem operations).
(16) Femtosecond-resolution timestamps [Dave Chinner].
(A __reserved field has been left in the statx_timestamp struct for
this - if there proves to be a need).
(17) A set multiple attributes syscall to go with this.
===============
NEW SYSTEM CALL
===============
The new system call is:
int ret = statx(int dfd,
const char *filename,
unsigned int flags,
unsigned int mask,
struct statx *buffer);
The dfd, filename and flags parameters indicate the file to query, in a
similar way to fstatat(). There is no equivalent of lstat() as that can be
emulated with statx() by passing AT_SYMLINK_NOFOLLOW in flags. There is
also no equivalent of fstat() as that can be emulated by passing a NULL
filename to statx() with the fd of interest in dfd.
Whether or not statx() synchronises the attributes with the backing store
can be controlled by OR'ing a value into the flags argument (this typically
only affects network filesystems):
(1) AT_STATX_SYNC_AS_STAT tells statx() to behave as stat() does in this
respect.
(2) AT_STATX_FORCE_SYNC will require a network filesystem to synchronise
its attributes with the server - which might require data writeback to
occur to get the timestamps correct.
(3) AT_STATX_DONT_SYNC will suppress synchronisation with the server in a
network filesystem. The resulting values should be considered
approximate.
mask is a bitmask indicating the fields in struct statx that are of
interest to the caller. The user should set this to STATX_BASIC_STATS to
get the basic set returned by stat(). It should be noted that asking for
more information may entail extra I/O operations.
buffer points to the destination for the data. This must be 256 bytes in
size.
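As an illustration of the calling convention (again, not part of the
patch), a direct userspace invocation via syscall(2) might look like
this; it assumes __NR_statx is wired up for the architecture and that
<linux/stat.h> exports struct statx and the STATX_* constants:

	#include <linux/stat.h>
	#include <sys/syscall.h>
	#include <fcntl.h>
	#include <unistd.h>
	#include <stdio.h>

	int main(int argc, char *argv[])
	{
		struct statx stx;

		if (argc != 2) {
			fprintf(stderr, "usage: %s <path>\n", argv[0]);
			return 1;
		}

		/* flags == 0 is AT_STATX_SYNC_AS_STAT: behave as stat(). */
		if (syscall(__NR_statx, AT_FDCWD, argv[1], 0,
			    STATX_BASIC_STATS, &stx) == -1) {
			perror("statx");
			return 1;
		}

		printf("size=%llu links=%u mode=%o\n",
		       (unsigned long long)stx.stx_size, stx.stx_nlink,
		       stx.stx_mode);
		return 0;
	}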
======================
MAIN ATTRIBUTES RECORD
======================
The following structures are defined in which to return the main attribute
set:
struct statx_timestamp {
__s64 tv_sec;
__s32 tv_nsec;
__s32 __reserved;
};
struct statx {
__u32 stx_mask;
__u32 stx_blksize;
__u64 stx_attributes;
__u32 stx_nlink;
__u32 stx_uid;
__u32 stx_gid;
__u16 stx_mode;
__u16 __spare0[1];
__u64 stx_ino;
__u64 stx_size;
__u64 stx_blocks;
__u64 __spare1[1];
struct statx_timestamp stx_atime;
struct statx_timestamp stx_btime;
struct statx_timestamp stx_ctime;
struct statx_timestamp stx_mtime;
__u32 stx_rdev_major;
__u32 stx_rdev_minor;
__u32 stx_dev_major;
__u32 stx_dev_minor;
__u64 __spare2[14];
};
The defined bits in request_mask and stx_mask are:
STATX_TYPE Want/got stx_mode & S_IFMT
STATX_MODE Want/got stx_mode & ~S_IFMT
STATX_NLINK Want/got stx_nlink
STATX_UID Want/got stx_uid
STATX_GID Want/got stx_gid
STATX_ATIME Want/got stx_atime{,_ns}
STATX_MTIME Want/got stx_mtime{,_ns}
STATX_CTIME Want/got stx_ctime{,_ns}
STATX_INO Want/got stx_ino
STATX_SIZE Want/got stx_size
STATX_BLOCKS Want/got stx_blocks
STATX_BASIC_STATS [The stuff in the normal stat struct]
STATX_BTIME Want/got stx_btime{,_ns}
STATX_ALL [All currently available stuff]
stx_btime is the file creation time, stx_mask is a bitmask indicating the
data provided, and the __spare*[] arrays are where as-yet undefined fields can be
placed.
Time fields are structures with separate seconds and nanoseconds fields
plus a reserved field in case we want to add even finer resolution. Note
that times will be negative if before 1970; in such a case, the nanosecond
fields will also be negative if not zero.
The bits defined in the stx_attributes field convey information about a
file, how it is accessed, where it is and what it does. The following
attributes map to FS_*_FL flags and are the same numerical value:
STATX_ATTR_COMPRESSED File is compressed by the fs
STATX_ATTR_IMMUTABLE File is marked immutable
STATX_ATTR_APPEND File is append-only
STATX_ATTR_NODUMP File is not to be dumped
STATX_ATTR_ENCRYPTED File requires key to decrypt in fs
Within the kernel, the supported flags are listed by:
KSTAT_ATTR_FS_IOC_FLAGS
[Are any other IOC flags of sufficient general interest to be exposed
through this interface?]
New flags include:
STATX_ATTR_AUTOMOUNT Object is an automount trigger
These are for the use of GUI tools that might want to mark files specially,
depending on what they are.
Fields in struct statx come in a number of classes:
(0) stx_dev_*, stx_blksize.
These are local system information and are always available.
(1) stx_mode, stx_nlink, stx_uid, stx_gid, stx_[amc]time, stx_ino,
stx_size, stx_blocks.
These will be returned whether the caller asks for them or not. The
corresponding bits in stx_mask will be set to indicate whether they
actually have valid values.
If the caller didn't ask for them, then they may be approximated. For
example, NFS won't waste any time updating them from the server,
except as a byproduct of updating something that was requested.
If the values don't actually exist for the underlying object (such as
UID or GID on a DOS file), then the bit won't be set in the stx_mask,
even if the caller asked for the value. In such a case, the returned
value will be a fabrication (see the mask-checking sketch after this
list).
Note that there are instances where the type might not be valid, for
instance Windows reparse points.
(2) stx_rdev_*.
This will be set only if stx_mode indicates we're looking at a
blockdev or a chardev; otherwise it will be 0.
(3) stx_btime.
Similar to (1), except this will be set to 0 if it doesn't exist.
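Since a clear bit in stx_mask may mean a fabricated or absent value,
callers should test the bit before trusting the field. A sketch of
that pattern for the creation time (using only the types and constants
defined above; the helper name is ours):

	static int statx_birth_time(const struct statx *stx,
				    struct statx_timestamp *btime)
	{
		/* Class (3): if STATX_BTIME is clear, the filesystem has
		 * no creation time; stx_btime is then 0, not meaningful. */
		if (!(stx->stx_mask & STATX_BTIME))
			return -1;
		*btime = stx->stx_btime;
		return 0;
	}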
=======
TESTING
=======
The following test program can be used to test the statx system call:
samples/statx/test-statx.c
Just compile and run, passing it paths to the files you want to examine.
The file is built automatically if CONFIG_SAMPLES is enabled.
Here's some example output. Firstly, an NFS directory that crosses to
another FSID. Note that the AUTOMOUNT attribute is set because transiting
this directory will cause d_automount to be invoked by the VFS.
[root@andromeda ~]# /tmp/test-statx -A /warthog/data
statx(/warthog/data) = 0
results=7ff
Size: 4096 Blocks: 8 IO Block: 1048576 directory
Device: 00:26 Inode: 1703937 Links: 125
Access: (3777/drwxrwxrwx) Uid: 0 Gid: 4041
Access: 2016-11-24 09:02:12.219699527+0000
Modify: 2016-11-17 10:44:36.225653653+0000
Change: 2016-11-17 10:44:36.225653653+0000
Attributes: 0000000000001000 (-------- -------- -------- -------- -------- -------- ---m---- --------)
Secondly, the result of automounting on that directory.
[root@andromeda ~]# /tmp/test-statx /warthog/data
statx(/warthog/data) = 0
results=7ff
Size: 4096 Blocks: 8 IO Block: 1048576 directory
Device: 00:27 Inode: 2 Links: 125
Access: (3777/drwxrwxrwx) Uid: 0 Gid: 4041
Access: 2016-11-24 09:02:12.219699527+0000
Modify: 2016-11-17 10:44:36.225653653+0000
Change: 2016-11-17 10:44:36.225653653+0000
Signed-off-by: David Howells <dhowells@redhat.com>
Signed-off-by: Al Viro <viro@zeniv.linux.org.uk>
2017-02-01 00:46:22 +08:00
|
|
|
asmlinkage long sys_statx(int dfd, const char __user *path, unsigned flags,
|
|
|
|
unsigned mask, struct statx __user *buffer);
|
2016-07-30 00:30:18 +08:00
|
|
|
|
2018-03-11 18:34:25 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Kernel code should not call syscalls (i.e., sys_xyzyyz()) directly.
|
|
|
|
* Instead, use one of the functions which work equivalently, such as
|
|
|
|
* the ksys_xyzyyz() functions prototyped below.
|
|
|
|
*/
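As a sketch of the conversion this comment asks for (hypothetical
caller, not code from this header), an in-kernel user would switch
from the syscall entry point to its ksys_ equivalent:

	static void flush_before_poweroff(void)
	{
		/*
		 * Previously: sys_sync(). Same semantics via the
		 * helper, without going through the syscall entry path.
		 */
		ksys_sync();
	}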
|
|
|
|
|
2018-03-11 18:34:39 +08:00
|
|
|
int ksys_mount(char __user *dev_name, char __user *dir_name, char __user *type,
|
|
|
|
unsigned long flags, void __user *data);
|
2018-03-11 18:34:40 +08:00
|
|
|
int ksys_umount(char __user *name, int flags);
|
fs: add ksys_dup{,3}() helper; remove in-kernel calls to sys_dup{,3}()
Using ksys_dup() and ksys_dup3() as helper functions allows us to
avoid the in-kernel calls to the sys_dup() and sys_dup3() syscalls.
The ksys_ prefix denotes that these functions are meant as a drop-in
replacement for the syscalls. In particular, they use the same
calling convention as sys_dup{,3}().
In the near future, the fs-external callers of ksys_dup{,3}() should be
converted to call do_dup2() directly. Then, ksys_dup{,3}() can be moved
within sys_dup{,3}() again.
This patch is part of a series which removes in-kernel calls to syscalls.
On this basis, the syscall entry path can be streamlined. For details, see
http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net
Cc: Alexander Viro <viro@zeniv.linux.org.uk>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
2018-03-11 18:34:40 +08:00
|
|
|
int ksys_dup(unsigned int fildes);
|
2018-03-11 18:34:41 +08:00
|
|
|
int ksys_chroot(const char __user *filename);
|
2018-03-11 18:34:41 +08:00
|
|
|
ssize_t ksys_write(unsigned int fd, const char __user *buf, size_t count);
|
2018-03-11 18:34:46 +08:00
|
|
|
int ksys_chdir(const char __user *filename);
|
2018-03-11 18:34:53 +08:00
|
|
|
int ksys_fchmod(unsigned int fd, umode_t mode);
|
fs: add do_fchownat(), ksys_fchown() helpers and ksys_{,l}chown() wrappers
Using the fs-internal do_fchownat() wrapper allows us to get rid of
fs-internal calls to the sys_fchownat() syscall.
Introducing the ksys_fchown() helper and the ksys_{,l}chown() wrappers
allows us to avoid the in-kernel calls to the sys_{,l,f}chown() syscalls.
The ksys_ prefix denotes that these functions are meant as a drop-in
replacement for the syscalls. In particular, they use the same calling
convention as sys_{,l,f}chown().
This patch is part of a series which removes in-kernel calls to syscalls.
On this basis, the syscall entry path can be streamlined. For details, see
http://lkml.kernel.org/r/20180325162527.GA17492@light.dominikbrodowski.net
Cc: Al Viro <viro@zeniv.linux.org.uk>
Cc: Andrew Morton <akpm@linux-foundation.org>
Signed-off-by: Dominik Brodowski <linux@dominikbrodowski.net>
2018-03-11 18:34:55 +08:00
|
|
|
int ksys_fchown(unsigned int fd, uid_t user, gid_t group);
|
2018-03-14 04:34:04 +08:00
|
|
|
int ksys_getdents64(unsigned int fd, struct linux_dirent64 __user *dirent,
|
|
|
|
unsigned int count);
|
2018-03-14 04:43:59 +08:00
|
|
|
int ksys_ioctl(unsigned int fd, unsigned int cmd, unsigned long arg);
|
2018-03-14 04:51:17 +08:00
|
|
|
off_t ksys_lseek(unsigned int fd, off_t offset, unsigned int whence);
|
2018-03-14 04:56:26 +08:00
|
|
|
ssize_t ksys_read(unsigned int fd, char __user *buf, size_t count);
|
2018-03-15 05:35:11 +08:00
|
|
|
void ksys_sync(void);
|
2018-03-11 18:34:42 +08:00
|
|
|
int ksys_unshare(unsigned long unshare_flags);
|
2018-03-16 19:36:06 +08:00
|
|
|
int ksys_setsid(void);
|
2018-03-11 18:34:47 +08:00
|
|
|
int ksys_sync_file_range(int fd, loff_t offset, loff_t nbytes,
|
|
|
|
unsigned int flags);
|
2018-03-20 00:38:31 +08:00
|
|
|
ssize_t ksys_pread64(unsigned int fd, char __user *buf, size_t count,
|
|
|
|
loff_t pos);
|
|
|
|
ssize_t ksys_pwrite64(unsigned int fd, const char __user *buf,
|
|
|
|
size_t count, loff_t pos);
|
2018-03-20 00:46:32 +08:00
|
|
|
int ksys_fallocate(int fd, int mode, loff_t offset, loff_t len);
|
2018-03-11 18:34:39 +08:00
|
|
|
|
2018-03-11 18:34:47 +08:00
|
|
|
/*
|
|
|
|
* The following kernel syscall equivalents are just wrappers to fs-internal
|
|
|
|
* functions. Therefore, provide stubs to be inlined at the callsites.
|
|
|
|
*/
|
|
|
|
extern long do_unlinkat(int dfd, struct filename *name);
|
|
|
|
|
|
|
|
static inline long ksys_unlink(const char __user *pathname)
|
|
|
|
{
|
|
|
|
return do_unlinkat(AT_FDCWD, getname(pathname));
|
|
|
|
}
|
|
|
|
|
2018-03-11 18:34:48 +08:00
|
|
|
extern long do_rmdir(int dfd, const char __user *pathname);
|
|
|
|
|
|
|
|
static inline long ksys_rmdir(const char __user *pathname)
|
|
|
|
{
|
|
|
|
return do_rmdir(AT_FDCWD, pathname);
|
|
|
|
}
|
|
|
|
|
2018-03-11 18:34:49 +08:00
|
|
|
extern long do_mkdirat(int dfd, const char __user *pathname, umode_t mode);
|
|
|
|
|
|
|
|
static inline long ksys_mkdir(const char __user *pathname, umode_t mode)
|
|
|
|
{
|
|
|
|
return do_mkdirat(AT_FDCWD, pathname, mode);
|
|
|
|
}
|
|
|
|
|
2018-03-11 18:34:49 +08:00
|
|
|
extern long do_symlinkat(const char __user *oldname, int newdfd,
|
|
|
|
const char __user *newname);
|
|
|
|
|
|
|
|
static inline long ksys_symlink(const char __user *oldname,
|
|
|
|
const char __user *newname)
|
|
|
|
{
|
|
|
|
return do_symlinkat(oldname, AT_FDCWD, newname);
|
|
|
|
}
|
|
|
|
|
2018-03-11 18:34:50 +08:00
|
|
|
extern long do_mknodat(int dfd, const char __user *filename, umode_t mode,
|
|
|
|
unsigned int dev);
|
|
|
|
|
|
|
|
static inline long ksys_mknod(const char __user *filename, umode_t mode,
|
|
|
|
unsigned int dev)
|
|
|
|
{
|
|
|
|
return do_mknodat(AT_FDCWD, filename, mode, dev);
|
|
|
|
}
|
|
|
|
|
2018-03-11 18:34:53 +08:00
|
|
|
extern int do_linkat(int olddfd, const char __user *oldname, int newdfd,
|
|
|
|
const char __user *newname, int flags);
|
|
|
|
|
|
|
|
static inline long ksys_link(const char __user *oldname,
|
|
|
|
const char __user *newname)
|
|
|
|
{
|
|
|
|
return do_linkat(AT_FDCWD, oldname, AT_FDCWD, newname, 0);
|
|
|
|
}
|
|
|
|
|
2018-03-11 18:34:53 +08:00
|
|
|
extern int do_fchmodat(int dfd, const char __user *filename, umode_t mode);
|
|
|
|
|
|
|
|
static inline int ksys_chmod(const char __user *filename, umode_t mode)
|
|
|
|
{
|
|
|
|
return do_fchmodat(AT_FDCWD, filename, mode);
|
|
|
|
}
|
|
|
|
|
2018-03-11 18:34:54 +08:00
|
|
|
extern long do_faccessat(int dfd, const char __user *filename, int mode);
|
|
|
|
|
|
|
|
static inline long ksys_access(const char __user *filename, int mode)
|
|
|
|
{
|
|
|
|
return do_faccessat(AT_FDCWD, filename, mode);
|
|
|
|
}
|
|
|
|
|
2018-03-11 18:34:55 +08:00
|
|
|
extern int do_fchownat(int dfd, const char __user *filename, uid_t user,
|
|
|
|
gid_t group, int flag);
|
|
|
|
|
|
|
|
static inline long ksys_chown(const char __user *filename, uid_t user,
|
|
|
|
gid_t group)
|
|
|
|
{
|
|
|
|
return do_fchownat(AT_FDCWD, filename, user, group, 0);
|
|
|
|
}
|
|
|
|
|
|
|
|
static inline long ksys_lchown(const char __user *filename, uid_t user,
|
|
|
|
gid_t group)
|
|
|
|
{
|
|
|
|
return do_fchownat(AT_FDCWD, filename, user, group,
|
|
|
|
AT_SYMLINK_NOFOLLOW);
|
|
|
|
}
|
|
|
|
|
2018-03-11 18:34:54 +08:00
|
|
|
extern long do_sys_ftruncate(unsigned int fd, loff_t length, int small);
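/* Our reading of do_sys_ftruncate() in fs/open.c (an assumption, not
 * stated here): a non-zero 'small' selects the legacy non-LFS variant,
 * which rejects lengths beyond the non-large-file limit unless the file
 * was opened with O_LARGEFILE. ksys_ftruncate() below passes 1. */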
|
|
|
|
|
|
|
|
static inline long ksys_ftruncate(unsigned int fd, unsigned long length)
|
|
|
|
{
|
|
|
|
return do_sys_ftruncate(fd, length, 1);
|
|
|
|
}
|
|
|
|
|
2018-03-11 18:34:55 +08:00
|
|
|
extern int __close_fd(struct files_struct *files, unsigned int fd);
|
|
|
|
|
|
|
|
/*
|
|
|
|
* In contrast to sys_close(), this stub does not check whether the syscall
|
|
|
|
* should or should not be restarted, but returns the raw error codes from
|
|
|
|
* __close_fd().
|
|
|
|
*/
|
|
|
|
static inline int ksys_close(unsigned int fd)
|
|
|
|
{
|
|
|
|
return __close_fd(current->files, fd);
|
|
|
|
}
|
|
|
|
|
2018-03-11 18:34:56 +08:00
|
|
|
extern long do_sys_open(int dfd, const char __user *filename, int flags,
|
|
|
|
umode_t mode);
|
|
|
|
|
|
|
|
static inline long ksys_open(const char __user *filename, int flags,
|
|
|
|
umode_t mode)
|
|
|
|
{
|
|
|
|
if (force_o_largefile())
|
|
|
|
flags |= O_LARGEFILE;
|
|
|
|
return do_sys_open(AT_FDCWD, filename, flags, mode);
|
|
|
|
}
|
|
|
|
|
2018-03-20 00:32:11 +08:00
|
|
|
extern long do_sys_truncate(const char __user *pathname, loff_t length);
|
|
|
|
|
|
|
|
static inline long ksys_truncate(const char __user *pathname, loff_t length)
|
|
|
|
{
|
|
|
|
return do_sys_truncate(pathname, length);
|
|
|
|
}
|
|
|
|
|
2005-04-17 06:20:36 +08:00
|
|
|
#endif
|