2017-11-01 22:09:13 +08:00
|
|
|
/* SPDX-License-Identifier: ((GPL-2.0 WITH Linux-syscall-note) OR BSD-2-Clause) */
|
[PATCH] FUSE - core
This patch adds FUSE core.
This contains the following files:
o inode.c
- superblock operations (alloc_inode, destroy_inode, read_inode,
clear_inode, put_super, show_options)
- registers FUSE filesystem
o fuse_i.h
- private header file
Requirements
============
The most important difference between ordinary filesystems and FUSE is
the fact, that the filesystem data/metadata is provided by a userspace
process run with the privileges of the mount "owner" instead of the
kernel, or some remote entity usually running with elevated
privileges.
The security implication of this is that a non-privileged user must
not be able to use this capability to compromise the system. Obvious
requirements arising from this are:
- mount owner should not be able to get elevated privileges with the
help of the mounted filesystem
- mount owner should not be able to induce undesired behavior in
other users' or the super user's processes
- mount owner should not get illegitimate access to information from
other users' and the super user's processes
These are currently ensured with the following constraints:
1) mount is only allowed to directory or file which the mount owner
can modify without limitation (write access + no sticky bit for
directories)
2) nosuid,nodev mount options are forced
3) any process running with fsuid different from the owner is denied
all access to the filesystem
1) and 2) are ensured by the "fusermount" mount utility which is a
setuid root application doing the actual mount operation.
3) is ensured by a check in the permission() method in kernel
I started thinking about doing 3) in a different way because Christoph
H. made a big deal out of it, saying that FUSE is unacceptable into
mainline in this form.
The suggested use of private namespaces would be OK, but in their
current form have many limitations that make their use impractical (as
discussed in this thread).
Suggested improvements that would address these limitations:
- implement shared subtrees
- allow a process to join an existing namespace (make namespaces
first-class objects)
- implement the namespace creation/joining in a PAM module
With all that in place the check of owner against current->fsuid may
be removed from the FUSE kernel module, without compromising the
security requirements.
Suid programs still raise interesting questions, since they get access even
to the private namespace causing some information leak (exact
order/timing of filesystem operations performed), giving some
ptrace-like capabilities to unprivileged users. BTW this problem is
not strictly limited to the namespace approach, since suid programs
setting fsuid and accessing users' files will succeed with the current
approach too.
Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-10 04:10:26 +08:00
|
|
|
/*
|
2013-02-07 18:58:12 +08:00
|
|
|
This file defines the kernel interface of FUSE
|
2008-12-02 02:14:02 +08:00
|
|
|
Copyright (C) 2001-2008 Miklos Szeredi <miklos@szeredi.hu>
|
[PATCH] FUSE - core
This patch adds FUSE core.
This contains the following files:
o inode.c
- superblock operations (alloc_inode, destroy_inode, read_inode,
clear_inode, put_super, show_options)
- registers FUSE filesystem
o fuse_i.h
- private header file
Requirements
============
The most important difference between ordinary filesystems and FUSE is
the fact, that the filesystem data/metadata is provided by a userspace
process run with the privileges of the mount "owner" instead of the
kernel, or some remote entity usually running with elevated
privileges.
The security implication of this is that a non-privileged user must
not be able to use this capability to compromise the system. Obvious
requirements arising from this are:
- mount owner should not be able to get elevated privileges with the
help of the mounted filesystem
- mount owner should not be able to induce undesired behavior in
other users' or the super user's processes
- mount owner should not get illegitimate access to information from
other users' and the super user's processes
These are currently ensured with the following constraints:
1) mount is only allowed to directory or file which the mount owner
can modify without limitation (write access + no sticky bit for
directories)
2) nosuid,nodev mount options are forced
3) any process running with fsuid different from the owner is denied
all access to the filesystem
1) and 2) are ensured by the "fusermount" mount utility which is a
setuid root application doing the actual mount operation.
3) is ensured by a check in the permission() method in kernel
I started thinking about doing 3) in a different way because Christoph
H. made a big deal out of it, saying that FUSE is unacceptable into
mainline in this form.
The suggested use of private namespaces would be OK, but in their
current form have many limitations that make their use impractical (as
discussed in this thread).
Suggested improvements that would address these limitations:
- implement shared subtrees
- allow a process to join an existing namespace (make namespaces
first-class objects)
- implement the namespace creation/joining in a PAM module
With all that in place the check of owner against current->fsuid may
be removed from the FUSE kernel module, without compromising the
security requirements.
Suid programs still raise interesting questions, since they get access even
to the private namespace causing some information leak (exact
order/timing of filesystem operations performed), giving some
ptrace-like capabilities to unprivileged users. BTW this problem is
not strictly limited to the namespace approach, since suid programs
setting fsuid and accessing users' files will succeed with the current
approach too.
Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-10 04:10:26 +08:00
|
|
|
|
|
|
|
This program can be distributed under the terms of the GNU GPL.
|
|
|
|
See the file COPYING.
|
2013-02-07 18:58:12 +08:00
|
|
|
|
|
|
|
This -- and only this -- header file may also be distributed under
|
|
|
|
the terms of the BSD Licence as follows:
|
|
|
|
|
|
|
|
Copyright (C) 2001-2007 Miklos Szeredi. All rights reserved.
|
|
|
|
|
|
|
|
Redistribution and use in source and binary forms, with or without
|
|
|
|
modification, are permitted provided that the following conditions
|
|
|
|
are met:
|
|
|
|
1. Redistributions of source code must retain the above copyright
|
|
|
|
notice, this list of conditions and the following disclaimer.
|
|
|
|
2. Redistributions in binary form must reproduce the above copyright
|
|
|
|
notice, this list of conditions and the following disclaimer in the
|
|
|
|
documentation and/or other materials provided with the distribution.
|
|
|
|
|
|
|
|
THIS SOFTWARE IS PROVIDED BY AUTHOR AND CONTRIBUTORS ``AS IS'' AND
|
|
|
|
ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
|
|
|
|
IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
|
|
|
|
ARE DISCLAIMED. IN NO EVENT SHALL AUTHOR OR CONTRIBUTORS BE LIABLE
|
|
|
|
FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL
|
|
|
|
DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS
|
|
|
|
OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION)
|
|
|
|
HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT
|
|
|
|
LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY
|
|
|
|
OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF
|
|
|
|
SUCH DAMAGE.
|
[PATCH] FUSE - core
This patch adds FUSE core.
This contains the following files:
o inode.c
- superblock operations (alloc_inode, destroy_inode, read_inode,
clear_inode, put_super, show_options)
- registers FUSE filesystem
o fuse_i.h
- private header file
Requirements
============
The most important difference between ordinary filesystems and FUSE is
the fact, that the filesystem data/metadata is provided by a userspace
process run with the privileges of the mount "owner" instead of the
kernel, or some remote entity usually running with elevated
privileges.
The security implication of this is that a non-privileged user must
not be able to use this capability to compromise the system. Obvious
requirements arising from this are:
- mount owner should not be able to get elevated privileges with the
help of the mounted filesystem
- mount owner should not be able to induce undesired behavior in
other users' or the super user's processes
- mount owner should not get illegitimate access to information from
other users' and the super user's processes
These are currently ensured with the following constraints:
1) mount is only allowed to directory or file which the mount owner
can modify without limitation (write access + no sticky bit for
directories)
2) nosuid,nodev mount options are forced
3) any process running with fsuid different from the owner is denied
all access to the filesystem
1) and 2) are ensured by the "fusermount" mount utility which is a
setuid root application doing the actual mount operation.
3) is ensured by a check in the permission() method in kernel
I started thinking about doing 3) in a different way because Christoph
H. made a big deal out of it, saying that FUSE is unacceptable into
mainline in this form.
The suggested use of private namespaces would be OK, but in their
current form have many limitations that make their use impractical (as
discussed in this thread).
Suggested improvements that would address these limitations:
- implement shared subtrees
- allow a process to join an existing namespace (make namespaces
first-class objects)
- implement the namespace creation/joining in a PAM module
With all that in place the check of owner against current->fsuid may
be removed from the FUSE kernel module, without compromising the
security requirements.
Suid programs still raise interesting questions, since they get access even
to the private namespace causing some information leak (exact
order/timing of filesystem operations performed), giving some
ptrace-like capabilities to unprivileged users. BTW this problem is
not strictly limited to the namespace approach, since suid programs
setting fsuid and accessing users' files will succeed with the current
approach too.
Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-10 04:10:26 +08:00
|
|
|
*/
|
|
|
|
|
2007-10-18 18:06:59 +08:00
|
|
|
/*
|
|
|
|
* This file defines the kernel interface of FUSE
|
|
|
|
*
|
|
|
|
* Protocol changelog:
|
|
|
|
*
|
|
|
|
* 7.9:
|
|
|
|
* - new fuse_getattr_in input argument of GETATTR
|
2007-10-18 18:07:02 +08:00
|
|
|
* - add lk_flags in fuse_lk_in
|
2007-10-18 18:07:04 +08:00
|
|
|
* - add lock_owner field to fuse_setattr_in, fuse_read_in and fuse_write_in
|
2007-10-18 18:07:05 +08:00
|
|
|
* - add blksize field to fuse_attr
|
2007-11-29 08:22:00 +08:00
|
|
|
* - add file flags field to fuse_read_in and fuse_write_in
|
2008-10-16 22:08:57 +08:00
|
|
|
*
|
|
|
|
* 7.10
|
|
|
|
* - add nonseekable open flag
|
2008-12-02 02:14:02 +08:00
|
|
|
*
|
|
|
|
* 7.11
|
|
|
|
* - add IOCTL message
|
|
|
|
* - add unsolicited notification support
|
|
|
|
* - add POLL message and NOTIFY_POLL notification
|
2009-07-01 02:12:23 +08:00
|
|
|
*
|
|
|
|
* 7.12
|
|
|
|
* - add umask flag to input argument of open, mknod and mkdir
|
2009-05-31 23:13:57 +08:00
|
|
|
* - add notification messages for invalidation of inodes and
|
|
|
|
* directory entries
|
2009-07-02 08:28:41 +08:00
|
|
|
*
|
|
|
|
* 7.13
|
|
|
|
* - make max number of background requests and congestion threshold
|
|
|
|
* tunables
|
fuse: support splice() writing to fuse device
Allow userspace filesystem implementation to use splice() to write to
the fuse device. The semantics of using splice() are:
1) buffer the message header and data in a temporary pipe
2) with a *single* splice() call move the message from the temporary pipe
to the fuse device
The READ reply message has the most interesting use for this, since
now the data from an arbitrary file descriptor (which could be a
regular file, a block device or a socket) can be transferred into the
fuse device without having to go through a userspace buffer. It will
also allow zero copy moving of pages.
One caveat is that the protocol on the fuse device requires the length
of the whole message to be written into the header. But the length of
the data transferred into the temporary pipe may not be known in
advance. The current library implementation works around this by
using vmsplice to write the header and modifying the header after
splicing the data into the pipe (error handling omitted):
struct fuse_out_header out;
iov.iov_base = &out;
iov.iov_len = sizeof(struct fuse_out_header);
vmsplice(pip[1], &iov, 1, 0);
len = splice(input_fd, input_offset, pip[1], NULL, len, 0);
/* retrospectively modify the header: */
out.len = len + sizeof(struct fuse_out_header);
splice(pip[0], NULL, fuse_chan_fd(req->ch), NULL, out.len, flags);
This works since vmsplice only saves a pointer to the data, it does
not copy the data itself.
Since pipes are currently limited to 16 pages and messages need to be
spliced atomically, the length of the data is limited to 15 pages (or
60kB for 4k pages).
Signed-off-by: Miklos Szeredi <mszeredi@suse.cz>
2010-05-25 21:06:06 +08:00
|
|
|
*
|
|
|
|
* 7.14
|
|
|
|
* - add splice support to fuse device
|
2010-07-12 20:41:40 +08:00
|
|
|
*
|
|
|
|
* 7.15
|
|
|
|
* - add store notify
|
2010-07-12 20:41:40 +08:00
|
|
|
* - add retrieve notify
|
2010-12-08 03:16:56 +08:00
|
|
|
*
|
|
|
|
* 7.16
|
|
|
|
* - add BATCH_FORGET request
|
2010-12-08 03:16:56 +08:00
|
|
|
* - FUSE_IOCTL_UNRESTRICTED shall now return with array of 'struct
|
|
|
|
* fuse_ioctl_iovec' instead of ambiguous 'struct iovec'
|
|
|
|
* - add FUSE_IOCTL_32BIT flag
|
2011-08-08 22:08:08 +08:00
|
|
|
*
|
|
|
|
* 7.17
|
|
|
|
* - add FUSE_FLOCK_LOCKS and FUSE_RELEASE_FLOCK_UNLOCK
|
2011-12-13 18:58:49 +08:00
|
|
|
*
|
|
|
|
* 7.18
|
|
|
|
* - add FUSE_IOCTL_DIR flag
|
2011-12-07 04:50:06 +08:00
|
|
|
* - add FUSE_NOTIFY_DELETE
|
2012-04-23 09:45:24 +08:00
|
|
|
*
|
|
|
|
* 7.19
|
|
|
|
* - add FUSE_FALLOCATE
|
2012-07-17 03:23:48 +08:00
|
|
|
*
|
|
|
|
* 7.20
|
|
|
|
* - add FUSE_AUTO_INVAL_DATA
|
2013-02-01 00:08:11 +08:00
|
|
|
*
|
|
|
|
* 7.21
|
|
|
|
* - add FUSE_READDIRPLUS
|
2013-02-04 23:14:32 +08:00
|
|
|
* - send the requested events in POLL request
|
2013-05-01 20:37:21 +08:00
|
|
|
*
|
|
|
|
* 7.22
|
|
|
|
* - add FUSE_ASYNC_DIO
|
2013-10-10 21:12:18 +08:00
|
|
|
*
|
|
|
|
* 7.23
|
|
|
|
* - add FUSE_WRITEBACK_CACHE
|
2014-04-28 20:19:23 +08:00
|
|
|
* - add time_gran to fuse_init_out
|
|
|
|
* - add reserved space to fuse_init_out
|
2014-04-28 20:19:24 +08:00
|
|
|
* - add FATTR_CTIME
|
|
|
|
* - add ctime and ctimensec to fuse_setattr_in
|
2014-04-28 22:43:44 +08:00
|
|
|
* - add FUSE_RENAME2 request
|
2014-07-22 22:37:43 +08:00
|
|
|
* - add FUSE_NO_OPEN_SUPPORT flag
|
2015-07-01 02:10:22 +08:00
|
|
|
*
|
|
|
|
* 7.24
|
|
|
|
* - add FUSE_LSEEK for SEEK_HOLE and SEEK_DATA support
|
2016-06-30 19:10:49 +08:00
|
|
|
*
|
|
|
|
* 7.25
|
|
|
|
* - add FUSE_PARALLEL_DIROPS
|
2016-10-01 13:32:32 +08:00
|
|
|
*
|
|
|
|
* 7.26
|
|
|
|
* - add FUSE_HANDLE_KILLPRIV
|
2016-08-29 21:46:37 +08:00
|
|
|
* - add FUSE_POSIX_ACL
|
2017-11-10 04:23:35 +08:00
|
|
|
*
|
|
|
|
* 7.27
|
|
|
|
* - add FUSE_ABORT_ERROR
|
2018-08-21 20:36:31 +08:00
|
|
|
*
|
|
|
|
* 7.28
|
|
|
|
* - add FUSE_COPY_FILE_RANGE
|
2018-09-28 22:43:23 +08:00
|
|
|
* - add FOPEN_CACHE_DIR
|
fuse: add max_pages to init_out
Replace FUSE_MAX_PAGES_PER_REQ with the configurable parameter max_pages to
improve performance.
Old RFC with detailed description of the problem and many fixes by Mitsuo
Hayasaka (mitsuo.hayasaka.hu@hitachi.com):
- https://lkml.org/lkml/2012/7/5/136
We've encountered performance degradation and fixed it on a big and complex
virtual environment.
Environment to reproduce degradation and improvement:
1. Add lag to user mode FUSE
Add nanosleep(&(struct timespec){ 0, 1000 }, NULL); to xmp_write_buf in
passthrough_fh.c
2. patch UM fuse with configurable max_pages parameter. The patch will be
provided later.
3. run test script and perform test on tmpfs
fuse_test()
{
cd /tmp
mkdir -p fusemnt
passthrough_fh -o max_pages=$1 /tmp/fusemnt
grep fuse /proc/self/mounts
dd conv=fdatasync oflag=dsync if=/dev/zero of=fusemnt/tmp/tmp \
count=1K bs=1M 2>&1 | grep -v records
rm fusemnt/tmp/tmp
killall passthrough_fh
}
Test results:
passthrough_fh /tmp/fusemnt fuse.passthrough_fh \
rw,nosuid,nodev,relatime,user_id=0,group_id=0 0 0
1073741824 bytes (1.1 GB) copied, 1.73867 s, 618 MB/s
passthrough_fh /tmp/fusemnt fuse.passthrough_fh \
rw,nosuid,nodev,relatime,user_id=0,group_id=0,max_pages=256 0 0
1073741824 bytes (1.1 GB) copied, 1.15643 s, 928 MB/s
Obviously with bigger lag the difference between 'before' and 'after'
will be more significant.
Mitsuo Hayasaka, in 2012 (https://lkml.org/lkml/2012/7/5/136),
observed improvement from 400-550 to 520-740.
Signed-off-by: Constantine Shulyupin <const@MakeLinux.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2018-09-06 20:37:06 +08:00
|
|
|
* - add FUSE_MAX_PAGES, add max_pages to init_out
|
2018-10-11 23:17:00 +08:00
|
|
|
* - add FUSE_CACHE_SYMLINKS
|
2019-01-08 08:53:17 +08:00
|
|
|
*
|
|
|
|
* 7.29
|
|
|
|
* - add FUSE_NO_OPENDIR_SUPPORT flag
|
fuse: allow filesystems to have precise control over data cache
On networked filesystems file data can be changed externally. FUSE
provides notification messages for filesystem to inform kernel that
metadata or data region of a file needs to be invalidated in local page
cache. That provides the basis for filesystem implementations to invalidate
kernel cache explicitly based on observed filesystem-specific events.
FUSE has also "automatic" invalidation mode(*) when the kernel
automatically invalidates data cache of a file if it sees mtime change. It
also automatically invalidates whole data cache of a file if it sees file
size being changed.
The automatic mode has corresponding capability - FUSE_AUTO_INVAL_DATA.
However, due to probably historical reason, that capability controls only
whether mtime change should be resulting in automatic invalidation or
not. A change in file size always results in invalidating whole data cache
of a file regardless of whether FUSE_AUTO_INVAL_DATA was negotiated(+).
The filesystem I write[1] represents data arrays stored in networked
database as local files suitable for mmap. It is read-only filesystem -
changes to data are committed externally via database interfaces and the
filesystem only glues data into contiguous file streams suitable for mmap
and traditional array processing. The files are big - starting from
hundreds gigabytes and more. The files change regularly, and frequently by
data being appended to their end. The size of files thus changes
frequently.
If a file was accessed locally and some part of its data got into page
cache, we want that data to stay cached unless there is memory pressure, or
unless corresponding part of the file was actually changed. However current
FUSE behaviour - when it sees file size change - is to invalidate the whole
file. The data cache of the file is thus completely lost even on small size
change, and despite that the filesystem server is careful to accurately
translate database changes into FUSE invalidation messages to kernel.
Let's fix it: if a filesystem, through new FUSE_EXPLICIT_INVAL_DATA
capability, indicates to kernel that it is fully responsible for data cache
invalidation, then the kernel won't invalidate files data cache on size
change and only truncate that cache to new size in case the size decreased.
(*) see 72d0d248ca "fuse: add FUSE_AUTO_INVAL_DATA init flag",
eed2179efe "fuse: invalidate inode mapping if mtime changes"
(+) in writeback mode the kernel does not invalidate data cache on file
size change, but neither does it allow the filesystem to set the size due to
external event (see 8373200b12 "fuse: Trust kernel i_size only")
[1] https://lab.nexedi.com/kirr/wendelin.core/blob/a50f1d9f/wcfs/wcfs.go#L20
Signed-off-by: Kirill Smelkov <kirr@nexedi.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2019-03-27 19:14:15 +08:00
|
|
|
*
|
|
|
|
* 7.30
|
|
|
|
* - add FUSE_EXPLICIT_INVAL_DATA
|
2007-10-18 18:06:59 +08:00
|
|
|
*/
|
[PATCH] FUSE - core
This patch adds FUSE core.
This contains the following files:
o inode.c
- superblock operations (alloc_inode, destroy_inode, read_inode,
clear_inode, put_super, show_options)
- registers FUSE filesystem
o fuse_i.h
- private header file
Requirements
============
The most important difference between ordinary filesystems and FUSE is
the fact, that the filesystem data/metadata is provided by a userspace
process run with the privileges of the mount "owner" instead of the
kernel, or some remote entity usually running with elevated
privileges.
The security implication of this is that a non-privileged user must
not be able to use this capability to compromise the system. Obvious
requirements arising from this are:
- mount owner should not be able to get elevated privileges with the
help of the mounted filesystem
- mount owner should not be able to induce undesired behavior in
other users' or the super user's processes
- mount owner should not get illegitimate access to information from
other users' and the super user's processes
These are currently ensured with the following constraints:
1) mount is only allowed to directory or file which the mount owner
can modify without limitation (write access + no sticky bit for
directories)
2) nosuid,nodev mount options are forced
3) any process running with fsuid different from the owner is denied
all access to the filesystem
1) and 2) are ensured by the "fusermount" mount utility which is a
setuid root application doing the actual mount operation.
3) is ensured by a check in the permission() method in kernel
I started thinking about doing 3) in a different way because Christoph
H. made a big deal out of it, saying that FUSE is unacceptable into
mainline in this form.
The suggested use of private namespaces would be OK, but in their
current form have many limitations that make their use impractical (as
discussed in this thread).
Suggested improvements that would address these limitations:
- implement shared subtrees
- allow a process to join an existing namespace (make namespaces
first-class objects)
- implement the namespace creation/joining in a PAM module
With all that in place the check of owner against current->fsuid may
be removed from the FUSE kernel module, without compromising the
security requirements.
Suid programs still raise interesting questions, since they get access even
to the private namespace causing some information leak (exact
order/timing of filesystem operations performed), giving some
ptrace-like capabilities to unprivileged users. BTW this problem is
not strictly limited to the namespace approach, since suid programs
setting fsuid and accessing users' files will succeed with the current
approach too.
Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-10 04:10:26 +08:00
|
|
|
|
2008-10-16 22:08:57 +08:00
|
|
|
#ifndef _LINUX_FUSE_H
|
|
|
|
#define _LINUX_FUSE_H
|
|
|
|
|
2013-04-17 18:30:40 +08:00
|
|
|
#ifdef __KERNEL__
|
2008-12-02 02:14:02 +08:00
|
|
|
#include <linux/types.h>
|
2013-02-07 18:58:12 +08:00
|
|
|
#else
|
|
|
|
#include <stdint.h>
|
|
|
|
#endif
|
[PATCH] FUSE - core
This patch adds FUSE core.
This contains the following files:
o inode.c
- superblock operations (alloc_inode, destroy_inode, read_inode,
clear_inode, put_super, show_options)
- registers FUSE filesystem
o fuse_i.h
- private header file
Requirements
============
The most important difference between ordinary filesystems and FUSE is
the fact, that the filesystem data/metadata is provided by a userspace
process run with the privileges of the mount "owner" instead of the
kernel, or some remote entity usually running with elevated
privileges.
The security implication of this is that a non-privileged user must
not be able to use this capability to compromise the system. Obvious
requirements arising from this are:
- mount owner should not be able to get elevated privileges with the
help of the mounted filesystem
- mount owner should not be able to induce undesired behavior in
other users' or the super user's processes
- mount owner should not get illegitimate access to information from
other users' and the super user's processes
These are currently ensured with the following constraints:
1) mount is only allowed to directory or file which the mount owner
can modify without limitation (write access + no sticky bit for
directories)
2) nosuid,nodev mount options are forced
3) any process running with fsuid different from the owner is denied
all access to the filesystem
1) and 2) are ensured by the "fusermount" mount utility which is a
setuid root application doing the actual mount operation.
3) is ensured by a check in the permission() method in kernel
I started thinking about doing 3) in a different way because Christoph
H. made a big deal out of it, saying that FUSE is unacceptable into
mainline in this form.
The suggested use of private namespaces would be OK, but in their
current form have many limitations that make their use impractical (as
discussed in this thread).
Suggested improvements that would address these limitations:
- implement shared subtrees
- allow a process to join an existing namespace (make namespaces
first-class objects)
- implement the namespace creation/joining in a PAM module
With all that in place the check of owner against current->fsuid may
be removed from the FUSE kernel module, without compromising the
security requirements.
Suid programs still raise interesting questions, since they get access even
to the private namespace causing some information leak (exact
order/timing of filesystem operations performed), giving some
ptrace-like capabilities to unprivileged users. BTW this problem is
not strictly limited to the namespace approach, since suid programs
setting fsuid and accessing users' files will succeed with the current
approach too.
Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-10 04:10:26 +08:00
|
|
|
|
2009-07-09 00:17:58 +08:00
|
|
|
/*
|
|
|
|
* Version negotiation:
|
|
|
|
*
|
|
|
|
* Both the kernel and userspace send the version they support in the
|
|
|
|
* INIT request and reply respectively.
|
|
|
|
*
|
|
|
|
* If the major versions match then both shall use the smallest
|
|
|
|
* of the two minor versions for communication.
|
|
|
|
*
|
|
|
|
* If the kernel supports a larger major version, then userspace shall
|
|
|
|
* reply with the major version it supports, ignore the rest of the
|
|
|
|
* INIT message and expect a new INIT message from the kernel with a
|
|
|
|
* matching major version.
|
|
|
|
*
|
|
|
|
* If the library supports a larger major version, then it shall fall
|
|
|
|
* back to the major protocol version sent by the kernel for
|
|
|
|
* communication and reply with that major version (and an arbitrary
|
|
|
|
* supported minor version).
|
|
|
|
*/
|
|
|
|
|
[PATCH] FUSE - core
This patch adds FUSE core.
This contains the following files:
o inode.c
- superblock operations (alloc_inode, destroy_inode, read_inode,
clear_inode, put_super, show_options)
- registers FUSE filesystem
o fuse_i.h
- private header file
Requirements
============
The most important difference between ordinary filesystems and FUSE is
the fact, that the filesystem data/metadata is provided by a userspace
process run with the privileges of the mount "owner" instead of the
kernel, or some remote entity usually running with elevated
privileges.
The security implication of this is that a non-privileged user must
not be able to use this capability to compromise the system. Obvious
requirements arising from this are:
- mount owner should not be able to get elevated privileges with the
help of the mounted filesystem
- mount owner should not be able to induce undesired behavior in
other users' or the super user's processes
- mount owner should not get illegitimate access to information from
other users' and the super user's processes
These are currently ensured with the following constraints:
1) mount is only allowed to directory or file which the mount owner
can modify without limitation (write access + no sticky bit for
directories)
2) nosuid,nodev mount options are forced
3) any process running with fsuid different from the owner is denied
all access to the filesystem
1) and 2) are ensured by the "fusermount" mount utility which is a
setuid root application doing the actual mount operation.
3) is ensured by a check in the permission() method in kernel
I started thinking about doing 3) in a different way because Christoph
H. made a big deal out of it, saying that FUSE is unacceptable into
mainline in this form.
The suggested use of private namespaces would be OK, but in their
current form have many limitations that make their use impractical (as
discussed in this thread).
Suggested improvements that would address these limitations:
- implement shared subtrees
- allow a process to join an existing namespace (make namespaces
first-class objects)
- implement the namespace creation/joining in a PAM module
With all that in place the check of owner against current->fsuid may
be removed from the FUSE kernel module, without compromising the
security requirements.
Suid programs still raise interesting questions, since they get access even
to the private namespace causing some information leak (exact
order/timing of filesystem operations performed), giving some
ptrace-like capabilities to unprivileged users. BTW this problem is
not strictly limited to the namespace approach, since suid programs
setting fsuid and accessing users' files will succeed with the current
approach too.
Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-10 04:10:26 +08:00
|
|
|
/**
 * Version number of this interface.
 *
 * NOTE(review): this appears to be the *major* protocol number -- the
 * protocol changelog earlier in this file numbers every revision as 7.x,
 * and the version negotiation rules earlier in this file compare major
 * versions during INIT. Confirm against the minor-version definition,
 * which is not visible in this part of the file.
 */
|
2005-09-10 04:10:29 +08:00
|
|
|
#define FUSE_KERNEL_VERSION 7
|
[PATCH] FUSE - core
This patch adds FUSE core.
This contains the following files:
o inode.c
- superblock operations (alloc_inode, destroy_inode, read_inode,
clear_inode, put_super, show_options)
- registers FUSE filesystem
o fuse_i.h
- private header file
Requirements
============
The most important difference between ordinary filesystems and FUSE is
the fact, that the filesystem data/metadata is provided by a userspace
process run with the privileges of the mount "owner" instead of the
kernel, or some remote entity usually running with elevated
privileges.
The security implication of this is that a non-privileged user must
not be able to use this capability to compromise the system. Obvious
requirements arising from this are:
- mount owner should not be able to get elevated privileges with the
help of the mounted filesystem
- mount owner should not be able to induce undesired behavior in
other users' or the super user's processes
- mount owner should not get illegitimate access to information from
other users' and the super user's processes
These are currently ensured with the following constraints:
1) mount is only allowed to directory or file which the mount owner
can modify without limitation (write access + no sticky bit for
directories)
2) nosuid,nodev mount options are forced
3) any process running with fsuid different from the owner is denied
all access to the filesystem
1) and 2) are ensured by the "fusermount" mount utility which is a
setuid root application doing the actual mount operation.
3) is ensured by a check in the permission() method in kernel
I started thinking about doing 3) in a different way because Christoph
H. made a big deal out of it, saying that FUSE is unacceptable into
mainline in this form.
The suggested use of private namespaces would be OK, but in their
current form have many limitations that make their use impractical (as
discussed in this thread).
Suggested improvements that would address these limitations:
- implement shared subtrees
- allow a process to join an existing namespace (make namespaces
first-class objects)
- implement the namespace creation/joining in a PAM module
With all that in place the check of owner against current->fsuid may
be removed from the FUSE kernel module, without compromising the
security requirements.
Suid programs still raise interesting questions, since they get access even
to the private namespace causing some information leak (exact
order/timing of filesystem operations performed), giving some
ptrace-like capabilities to unprivileged users. BTW this problem is
not strictly limited to the namespace approach, since suid programs
setting fsuid and accessing users' files will succeed with the current
approach too.
Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-10 04:10:26 +08:00
|
|
|
|
|
|
|
/** Minor version number of this interface */
|
fuse: allow filesystems to have precise control over data cache
On networked filesystems file data can be changed externally. FUSE
provides notification messages for filesystem to inform kernel that
metadata or data region of a file needs to be invalidated in local page
cache. That provides the basis for filesystem implementations to invalidate
kernel cache explicitly based on observed filesystem-specific events.
FUSE has also "automatic" invalidation mode(*) when the kernel
automatically invalidates data cache of a file if it sees mtime change. It
also automatically invalidates whole data cache of a file if it sees file
size being changed.
The automatic mode has corresponding capability - FUSE_AUTO_INVAL_DATA.
However, due to probably historical reason, that capability controls only
whether mtime change should be resulting in automatic invalidation or
not. A change in file size always results in invalidating whole data cache
of a file regardless of whether FUSE_AUTO_INVAL_DATA was negotiated(+).
The filesystem I write[1] represents data arrays stored in networked
database as local files suitable for mmap. It is read-only filesystem -
changes to data are committed externally via database interfaces and the
filesystem only glues data into contiguous file streams suitable for mmap
and traditional array processing. The files are big - starting from
hundreds gigabytes and more. The files change regularly, and frequently by
data being appended to their end. The size of files thus changes
frequently.
If a file was accessed locally and some part of its data got into page
cache, we want that data to stay cached unless there is memory pressure, or
unless corresponding part of the file was actually changed. However current
FUSE behaviour - when it sees file size change - is to invalidate the whole
file. The data cache of the file is thus completely lost even on small size
change, and despite that the filesystem server is careful to accurately
translate database changes into FUSE invalidation messages to kernel.
Let's fix it: if a filesystem, through new FUSE_EXPLICIT_INVAL_DATA
capability, indicates to kernel that it is fully responsible for data cache
invalidation, then the kernel won't invalidate files data cache on size
change and only truncate that cache to new size in case the size decreased.
(*) see 72d0d248ca "fuse: add FUSE_AUTO_INVAL_DATA init flag",
eed2179efe "fuse: invalidate inode mapping if mtime changes"
(+) in writeback mode the kernel does not invalidate data cache on file
size change, but neither it allows the filesystem to set the size due to
external event (see 8373200b12 "fuse: Trust kernel i_size only")
[1] https://lab.nexedi.com/kirr/wendelin.core/blob/a50f1d9f/wcfs/wcfs.go#L20
Signed-off-by: Kirill Smelkov <kirr@nexedi.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2019-03-27 19:14:15 +08:00
|
|
|
/* Minor half of the interface version; the major half is FUSE_KERNEL_VERSION. */
#define FUSE_KERNEL_MINOR_VERSION 30
|
[PATCH] FUSE - core
This patch adds FUSE core.
This contains the following files:
o inode.c
- superblock operations (alloc_inode, destroy_inode, read_inode,
clear_inode, put_super, show_options)
- registers FUSE filesystem
o fuse_i.h
- private header file
Requirements
============
The most important difference between ordinary filesystems and FUSE is
the fact, that the filesystem data/metadata is provided by a userspace
process run with the privileges of the mount "owner" instead of the
kernel, or some remote entity usually running with elevated
privileges.
The security implication of this is that a non-privileged user must
not be able to use this capability to compromise the system. Obvious
requirements arising from this are:
- mount owner should not be able to get elevated privileges with the
help of the mounted filesystem
- mount owner should not be able to induce undesired behavior in
other users' or the super user's processes
- mount owner should not get illegitimate access to information from
other users' and the super user's processes
These are currently ensured with the following constraints:
1) mount is only allowed to directory or file which the mount owner
can modify without limitation (write access + no sticky bit for
directories)
2) nosuid,nodev mount options are forced
3) any process running with fsuid different from the owner is denied
all access to the filesystem
1) and 2) are ensured by the "fusermount" mount utility which is a
setuid root application doing the actual mount operation.
3) is ensured by a check in the permission() method in kernel
I started thinking about doing 3) in a different way because Christoph
H. made a big deal out of it, saying that FUSE is unacceptable into
mainline in this form.
The suggested use of private namespaces would be OK, but in their
current form have many limitations that make their use impractical (as
discussed in this thread).
Suggested improvements that would address these limitations:
- implement shared subtrees
- allow a process to join an existing namespace (make namespaces
first-class objects)
- implement the namespace creation/joining in a PAM module
With all that in place the check of owner against current->fsuid may
be removed from the FUSE kernel module, without compromising the
security requirements.
Suid programs still raise interesting questions, since they get access even
to the private namespace causing some information leak (exact
order/timing of filesystem operations performed), giving some
ptrace-like capabilities to unprivileged users. BTW this problem is
not strictly limited to the namespace approach, since suid programs
setting fsuid and accessing users' files will succeed with the current
approach too.
Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-10 04:10:26 +08:00
|
|
|
|
|
|
|
/** The node ID of the root inode */
#define FUSE_ROOT_ID 1
|
|
|
|
|
2005-09-10 04:10:32 +08:00
|
|
|
/* Make sure all structures are padded to 64bit boundary, so 32bit
   userspace works under 64bit kernels */
|
|
|
|
|
[PATCH] FUSE - core
This patch adds FUSE core.
This contains the following files:
o inode.c
- superblock operations (alloc_inode, destroy_inode, read_inode,
clear_inode, put_super, show_options)
- registers FUSE filesystem
o fuse_i.h
- private header file
Requirements
============
The most important difference between ordinary filesystems and FUSE is
the fact, that the filesystem data/metadata is provided by a userspace
process run with the privileges of the mount "owner" instead of the
kernel, or some remote entity usually running with elevated
privileges.
The security implication of this is that a non-privileged user must
not be able to use this capability to compromise the system. Obvious
requirements arising from this are:
- mount owner should not be able to get elevated privileges with the
help of the mounted filesystem
- mount owner should not be able to induce undesired behavior in
other users' or the super user's processes
- mount owner should not get illegitimate access to information from
other users' and the super user's processes
These are currently ensured with the following constraints:
1) mount is only allowed to directory or file which the mount owner
can modify without limitation (write access + no sticky bit for
directories)
2) nosuid,nodev mount options are forced
3) any process running with fsuid different from the owner is denied
all access to the filesystem
1) and 2) are ensured by the "fusermount" mount utility which is a
setuid root application doing the actual mount operation.
3) is ensured by a check in the permission() method in kernel
I started thinking about doing 3) in a different way because Christoph
H. made a big deal out of it, saying that FUSE is unacceptable into
mainline in this form.
The suggested use of private namespaces would be OK, but in their
current form have many limitations that make their use impractical (as
discussed in this thread).
Suggested improvements that would address these limitations:
- implement shared subtrees
- allow a process to join an existing namespace (make namespaces
first-class objects)
- implement the namespace creation/joining in a PAM module
With all that in place the check of owner against current->fsuid may
be removed from the FUSE kernel module, without compromising the
security requirements.
Suid programs still raise interesting questions, since they get access even
to the private namespace causing some information leak (exact
order/timing of filesystem operations performed), giving some
ptrace-like capabilities to unprivileged users. BTW this problem is
not strictly limited to the namespace approach, since suid programs
setting fsuid and accessing users' files will succeed with the current
approach too.
Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-10 04:10:26 +08:00
|
|
|
/*
 * Attribute record built only from fixed-width members: six 64-bit
 * fields followed by ten 32-bit fields (88 bytes in total), so the
 * layout does not depend on the word size of the compiling side.
 */
struct fuse_attr {
|
2013-04-17 18:30:40 +08:00
|
|
|
uint64_t ino;
|
|
|
|
uint64_t size;
|
|
|
|
uint64_t blocks;
|
|
|
|
uint64_t atime;
|
|
|
|
uint64_t mtime;
|
|
|
|
uint64_t ctime;
|
|
|
|
uint32_t atimensec;
|
|
|
|
uint32_t mtimensec;
|
|
|
|
uint32_t ctimensec;
|
|
|
|
uint32_t mode;
|
|
|
|
uint32_t nlink;
|
|
|
|
uint32_t uid;
|
|
|
|
uint32_t gid;
|
|
|
|
uint32_t rdev;
|
|
|
|
uint32_t blksize;
|
|
|
|
/* Explicit filler: rounds the nine preceding 32-bit members up to a multiple of 8 bytes. */
uint32_t padding;
|
[PATCH] FUSE - core
This patch adds FUSE core.
This contains the following files:
o inode.c
- superblock operations (alloc_inode, destroy_inode, read_inode,
clear_inode, put_super, show_options)
- registers FUSE filesystem
o fuse_i.h
- private header file
Requirements
============
The most important difference between orinary filesystems and FUSE is
the fact, that the filesystem data/metadata is provided by a userspace
process run with the privileges of the mount "owner" instead of the
kernel, or some remote entity usually running with elevated
privileges.
The security implication of this is that a non-privileged user must
not be able to use this capability to compromise the system. Obvious
requirements arising from this are:
- mount owner should not be able to get elevated privileges with the
help of the mounted filesystem
- mount owner should not be able to induce undesired behavior in
other users' or the super user's processes
- mount owner should not get illegitimate access to information from
other users' and the super user's processes
These are currently ensured with the following constraints:
1) mount is only allowed to directory or file which the mount owner
can modify without limitation (write access + no sticky bit for
directories)
2) nosuid,nodev mount options are forced
3) any process running with fsuid different from the owner is denied
all access to the filesystem
1) and 2) are ensured by the "fusermount" mount utility which is a
setuid root application doing the actual mount operation.
3) is ensured by a check in the permission() method in kernel
I started thinking about doing 3) in a different way because Christoph
H. made a big deal out of it, saying that FUSE is unacceptable into
mainline in this form.
The suggested use of private namespaces would be OK, but in their
current form have many limitations that make their use impractical (as
discussed in this thread).
Suggested improvements that would address these limitations:
- implement shared subtrees
- allow a process to join an existing namespace (make namespaces
first-class objects)
- implement the namespace creation/joining in a PAM module
With all that in place the check of owner against current->fsuid may
be removed from the FUSE kernel module, without compromising the
security requirements.
Suid programs still interesting questions, since they get access even
to the private namespace causing some information leak (exact
order/timing of filesystem operations performed), giving some
ptrace-like capabilities to unprivileged users. BTW this problem is
not strictly limited to the namespace approach, since suid programs
setting fsuid and accessing users' files will succeed with the current
approach too.
Signed-off-by: Miklos Szeredi <miklos@szeredi.hu>
Signed-off-by: Andrew Morton <akpm@osdl.org>
Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2005-09-10 04:10:26 +08:00
|
|
|
};
|
|
|
|
|
2005-09-10 04:10:28 +08:00
|
|
|
/*
 * Filesystem statistics record.
 *
 * Layout: five 64-bit members, three 32-bit members, one explicit 32-bit
 * padding word and six spare 32-bit words -- 80 bytes in total, which
 * keeps the structure size a multiple of 8 bytes.
 */
struct fuse_kstatfs {
	uint64_t	blocks;
	uint64_t	bfree;
	uint64_t	bavail;
	uint64_t	files;
	uint64_t	ffree;
	uint32_t	bsize;
	uint32_t	namelen;
	uint32_t	frsize;
	uint32_t	padding;	/* explicit filler up to an 8-byte boundary */
	uint32_t	spare[6];
};
|
|
|
|
|
2006-06-25 20:48:52 +08:00
|
|
|
/*
 * File lock description: two 64-bit members followed by two 32-bit
 * members (24 bytes, a multiple of 8, so no filler member is needed).
 */
struct fuse_file_lock {
	uint64_t	start;
	uint64_t	end;
	uint32_t	type;
	uint32_t	pid; /* tgid */
};
|
|
|
|
|
2006-02-01 19:04:40 +08:00
|
|
|
/**
 * Bitmasks for fuse_setattr_in.valid
 *
 * Every FATTR_* constant is a distinct single bit (bits 0 through 10).
 */
#define FATTR_MODE	(1 << 0)
#define FATTR_UID	(1 << 1)
#define FATTR_GID	(1 << 2)
#define FATTR_SIZE	(1 << 3)
#define FATTR_ATIME	(1 << 4)
#define FATTR_MTIME	(1 << 5)
#define FATTR_FH	(1 << 6)
#define FATTR_ATIME_NOW	(1 << 7)
#define FATTR_MTIME_NOW	(1 << 8)
#define FATTR_LOCKOWNER	(1 << 9)
#define FATTR_CTIME	(1 << 10)
|
2005-09-10 04:10:29 +08:00
|
|
|
|
2005-09-10 04:10:37 +08:00
|
|
|
/**
 * Flags returned by the OPEN request
 *
 * FOPEN_DIRECT_IO: bypass page cache for this open file
 * FOPEN_KEEP_CACHE: don't invalidate the data cache on open
 * FOPEN_NONSEEKABLE: the file is not seekable
 * FOPEN_CACHE_DIR: allow caching this directory
 */
#define FOPEN_DIRECT_IO		(1 << 0)
#define FOPEN_KEEP_CACHE	(1 << 1)
#define FOPEN_NONSEEKABLE	(1 << 2)
#define FOPEN_CACHE_DIR		(1 << 3)
|
2005-09-10 04:10:37 +08:00
|
|
|
|
2006-02-01 19:04:40 +08:00
|
|
|
/**
|
|
|
|
* INIT request/reply flags
|
2008-07-25 16:49:02 +08:00
|
|
|
*
|
2012-07-18 22:09:40 +08:00
|
|
|
* FUSE_ASYNC_READ: asynchronous read requests
|
2011-08-08 22:08:08 +08:00
|
|
|
* FUSE_POSIX_LOCKS: remote locking for POSIX file locks
|
2012-07-18 22:09:40 +08:00
|
|
|
* FUSE_FILE_OPS: kernel sends file handle for fstat, etc... (not yet supported)
|
|
|
|
* FUSE_ATOMIC_O_TRUNC: handles the O_TRUNC open flag in the filesystem
|
2008-07-25 16:49:02 +08:00
|
|
|
* FUSE_EXPORT_SUPPORT: filesystem handles lookups of "." and ".."
|
2012-07-18 22:09:40 +08:00
|
|
|
* FUSE_BIG_WRITES: filesystem can handle write size larger than 4kB
|
2009-07-01 02:12:23 +08:00
|
|
|
* FUSE_DONT_MASK: don't apply umask to file mode on create operations
|
2012-07-18 22:09:40 +08:00
|
|
|
* FUSE_SPLICE_WRITE: kernel supports splice write on the device
|
|
|
|
* FUSE_SPLICE_MOVE: kernel supports splice move on the device
|
|
|
|
* FUSE_SPLICE_READ: kernel supports splice read on the device
|
2011-08-08 22:08:08 +08:00
|
|
|
* FUSE_FLOCK_LOCKS: remote locking for BSD style file locks
|
2012-07-18 22:09:40 +08:00
|
|
|
* FUSE_HAS_IOCTL_DIR: kernel supports ioctl on directories
|
2012-07-17 03:23:48 +08:00
|
|
|
* FUSE_AUTO_INVAL_DATA: automatically invalidate cached pages
|
2013-02-07 06:29:01 +08:00
|
|
|
* FUSE_DO_READDIRPLUS: do READDIRPLUS (READDIR+LOOKUP in one)
|
|
|
|
* FUSE_READDIRPLUS_AUTO: adaptive readdirplus
|
2013-05-01 20:37:21 +08:00
|
|
|
* FUSE_ASYNC_DIO: asynchronous direct I/O submission
|
2013-10-10 21:12:18 +08:00
|
|
|
* FUSE_WRITEBACK_CACHE: use writeback cache for buffered writes
|
2014-07-22 22:37:43 +08:00
|
|
|
* FUSE_NO_OPEN_SUPPORT: kernel supports zero-message opens
|
2016-06-30 19:10:49 +08:00
|
|
|
* FUSE_PARALLEL_DIROPS: allow parallel lookups and readdir
|
2016-10-01 13:32:32 +08:00
|
|
|
* FUSE_HANDLE_KILLPRIV: fs handles killing suid/sgid/cap on write/chown/trunc
|
2016-08-29 21:46:37 +08:00
|
|
|
* FUSE_POSIX_ACL: filesystem supports posix acls
|
2017-11-10 04:23:35 +08:00
|
|
|
* FUSE_ABORT_ERROR: reading the device after abort returns ECONNABORTED
|
fuse: add max_pages to init_out
Replace FUSE_MAX_PAGES_PER_REQ with the configurable parameter max_pages to
improve performance.
Old RFC with detailed description of the problem and many fixes by Mitsuo
Hayasaka (mitsuo.hayasaka.hu@hitachi.com):
- https://lkml.org/lkml/2012/7/5/136
We've encountered performance degradation and fixed it on a big and complex
virtual environment.
Environment to reproduce degradation and improvement:
1. Add lag to user mode FUSE
Add nanosleep(&(struct timespec){ 0, 1000 }, NULL); to xmp_write_buf in
passthrough_fh.c
2. patch UM fuse with configurable max_pages parameter. The patch will be
provided later.
3. run test script and perform test on tmpfs
fuse_test()
{
cd /tmp
mkdir -p fusemnt
passthrough_fh -o max_pages=$1 /tmp/fusemnt
grep fuse /proc/self/mounts
dd conv=fdatasync oflag=dsync if=/dev/zero of=fusemnt/tmp/tmp \
count=1K bs=1M 2>&1 | grep -v records
rm fusemnt/tmp/tmp
killall passthrough_fh
}
Test results:
passthrough_fh /tmp/fusemnt fuse.passthrough_fh \
rw,nosuid,nodev,relatime,user_id=0,group_id=0 0 0
1073741824 bytes (1.1 GB) copied, 1.73867 s, 618 MB/s
passthrough_fh /tmp/fusemnt fuse.passthrough_fh \
rw,nosuid,nodev,relatime,user_id=0,group_id=0,max_pages=256 0 0
1073741824 bytes (1.1 GB) copied, 1.15643 s, 928 MB/s
Obviously with bigger lag the difference between 'before' and 'after'
will be more significant.
Mitsuo Hayasaka, in 2012 (https://lkml.org/lkml/2012/7/5/136),
observed improvement from 400-550 to 520-740.
Signed-off-by: Constantine Shulyupin <const@MakeLinux.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2018-09-06 20:37:06 +08:00
|
|
|
* FUSE_MAX_PAGES: init_out.max_pages contains the max number of req pages
|
2018-10-11 23:17:00 +08:00
|
|
|
* FUSE_CACHE_SYMLINKS: cache READLINK responses
|
2019-01-08 08:53:17 +08:00
|
|
|
* FUSE_NO_OPENDIR_SUPPORT: kernel supports zero-message opendir
|
fuse: allow filesystems to have precise control over data cache
On networked filesystems file data can be changed externally. FUSE
provides notification messages for filesystem to inform kernel that
metadata or data region of a file needs to be invalidated in local page
cache. That provides the basis for filesystem implementations to invalidate
kernel cache explicitly based on observed filesystem-specific events.
FUSE has also "automatic" invalidation mode(*) when the kernel
automatically invalidates data cache of a file if it sees mtime change. It
also automatically invalidates whole data cache of a file if it sees file
size being changed.
The automatic mode has corresponding capability - FUSE_AUTO_INVAL_DATA.
However, due to probably historical reason, that capability controls only
whether mtime change should be resulting in automatic invalidation or
not. A change in file size always results in invalidating whole data cache
of a file regardless of whether FUSE_AUTO_INVAL_DATA was negotiated(+).
The filesystem I write[1] represents data arrays stored in networked
database as local files suitable for mmap. It is read-only filesystem -
changes to data are committed externally via database interfaces and the
filesystem only glues data into contiguous file streams suitable for mmap
and traditional array processing. The files are big - starting from
hundreds gigabytes and more. The files change regularly, and frequently by
data being appended to their end. The size of files thus changes
frequently.
If a file was accessed locally and some part of its data got into page
cache, we want that data to stay cached unless there is memory pressure, or
unless corresponding part of the file was actually changed. However current
FUSE behaviour - when it sees file size change - is to invalidate the whole
file. The data cache of the file is thus completely lost even on small size
change, and despite that the filesystem server is careful to accurately
translate database changes into FUSE invalidation messages to kernel.
Let's fix it: if a filesystem, through new FUSE_EXPLICIT_INVAL_DATA
capability, indicates to kernel that it is fully responsible for data cache
invalidation, then the kernel won't invalidate files data cache on size
change and only truncate that cache to new size in case the size decreased.
(*) see 72d0d248ca "fuse: add FUSE_AUTO_INVAL_DATA init flag",
eed2179efe "fuse: invalidate inode mapping if mtime changes"
(+) in writeback mode the kernel does not invalidate data cache on file
size change, but neither it allows the filesystem to set the size due to
external event (see 8373200b12 "fuse: Trust kernel i_size only")
[1] https://lab.nexedi.com/kirr/wendelin.core/blob/a50f1d9f/wcfs/wcfs.go#L20
Signed-off-by: Kirill Smelkov <kirr@nexedi.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2019-03-27 19:14:15 +08:00
|
|
|
* FUSE_EXPLICIT_INVAL_DATA: only invalidate cached pages on explicit request
|
2006-02-01 19:04:40 +08:00
|
|
|
*/
|
|
|
|
/* INIT request/reply flag bits 0 through 21; one distinct bit per flag. */
#define FUSE_ASYNC_READ		(1 << 0)
#define FUSE_POSIX_LOCKS	(1 << 1)
#define FUSE_FILE_OPS		(1 << 2)
#define FUSE_ATOMIC_O_TRUNC	(1 << 3)
#define FUSE_EXPORT_SUPPORT	(1 << 4)
#define FUSE_BIG_WRITES		(1 << 5)
#define FUSE_DONT_MASK		(1 << 6)
#define FUSE_SPLICE_WRITE	(1 << 7)
#define FUSE_SPLICE_MOVE	(1 << 8)
#define FUSE_SPLICE_READ	(1 << 9)
#define FUSE_FLOCK_LOCKS	(1 << 10)
#define FUSE_HAS_IOCTL_DIR	(1 << 11)
#define FUSE_AUTO_INVAL_DATA	(1 << 12)
#define FUSE_DO_READDIRPLUS	(1 << 13)
#define FUSE_READDIRPLUS_AUTO	(1 << 14)
#define FUSE_ASYNC_DIO		(1 << 15)
#define FUSE_WRITEBACK_CACHE	(1 << 16)
#define FUSE_NO_OPEN_SUPPORT	(1 << 17)
#define FUSE_PARALLEL_DIROPS	(1 << 18)
#define FUSE_HANDLE_KILLPRIV	(1 << 19)
#define FUSE_POSIX_ACL		(1 << 20)
#define FUSE_ABORT_ERROR	(1 << 21)
|
fuse: add max_pages to init_out
Replace FUSE_MAX_PAGES_PER_REQ with the configurable parameter max_pages to
improve performance.
Old RFC with detailed description of the problem and many fixes by Mitsuo
Hayasaka (mitsuo.hayasaka.hu@hitachi.com):
- https://lkml.org/lkml/2012/7/5/136
We've encountered performance degradation and fixed it on a big and complex
virtual environment.
Environment to reproduce degradation and improvement:
1. Add lag to user mode FUSE
Add nanosleep(&(struct timespec){ 0, 1000 }, NULL); to xmp_write_buf in
passthrough_fh.c
2. patch UM fuse with configurable max_pages parameter. The patch will be
provided later.
3. run test script and perform test on tmpfs
fuse_test()
{
cd /tmp
mkdir -p fusemnt
passthrough_fh -o max_pages=$1 /tmp/fusemnt
grep fuse /proc/self/mounts
dd conv=fdatasync oflag=dsync if=/dev/zero of=fusemnt/tmp/tmp \
count=1K bs=1M 2>&1 | grep -v records
rm fusemnt/tmp/tmp
killall passthrough_fh
}
Test results:
passthrough_fh /tmp/fusemnt fuse.passthrough_fh \
rw,nosuid,nodev,relatime,user_id=0,group_id=0 0 0
1073741824 bytes (1.1 GB) copied, 1.73867 s, 618 MB/s
passthrough_fh /tmp/fusemnt fuse.passthrough_fh \
rw,nosuid,nodev,relatime,user_id=0,group_id=0,max_pages=256 0 0
1073741824 bytes (1.1 GB) copied, 1.15643 s, 928 MB/s
Obviously with bigger lag the difference between 'before' and 'after'
will be more significant.
Mitsuo Hayasaka, in 2012 (https://lkml.org/lkml/2012/7/5/136),
observed improvement from 400-550 to 520-740.
Signed-off-by: Constantine Shulyupin <const@MakeLinux.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2018-09-06 20:37:06 +08:00
|
|
|
/* INIT request/reply flag bits 22 through 24. */
#define FUSE_MAX_PAGES		(1 << 22)
#define FUSE_CACHE_SYMLINKS	(1 << 23)
#define FUSE_NO_OPENDIR_SUPPORT	(1 << 24)
|
fuse: allow filesystems to have precise control over data cache
On networked filesystems file data can be changed externally. FUSE
provides notification messages for filesystem to inform kernel that
metadata or data region of a file needs to be invalidated in local page
cache. That provides the basis for filesystem implementations to invalidate
kernel cache explicitly based on observed filesystem-specific events.
FUSE has also "automatic" invalidation mode(*) when the kernel
automatically invalidates data cache of a file if it sees mtime change. It
also automatically invalidates whole data cache of a file if it sees file
size being changed.
The automatic mode has corresponding capability - FUSE_AUTO_INVAL_DATA.
However, due to probably historical reason, that capability controls only
whether mtime change should be resulting in automatic invalidation or
not. A change in file size always results in invalidating whole data cache
of a file regardless of whether FUSE_AUTO_INVAL_DATA was negotiated(+).
The filesystem I write[1] represents data arrays stored in networked
database as local files suitable for mmap. It is read-only filesystem -
changes to data are committed externally via database interfaces and the
filesystem only glues data into contiguous file streams suitable for mmap
and traditional array processing. The files are big - starting from
hundreds gigabytes and more. The files change regularly, and frequently by
data being appended to their end. The size of files thus changes
frequently.
If a file was accessed locally and some part of its data got into page
cache, we want that data to stay cached unless there is memory pressure, or
unless corresponding part of the file was actually changed. However current
FUSE behaviour - when it sees file size change - is to invalidate the whole
file. The data cache of the file is thus completely lost even on small size
change, and despite that the filesystem server is careful to accurately
translate database changes into FUSE invalidation messages to kernel.
Let's fix it: if a filesystem, through new FUSE_EXPLICIT_INVAL_DATA
capability, indicates to kernel that it is fully responsible for data cache
invalidation, then the kernel won't invalidate files data cache on size
change and only truncate that cache to new size in case the size decreased.
(*) see 72d0d248ca "fuse: add FUSE_AUTO_INVAL_DATA init flag",
eed2179efe "fuse: invalidate inode mapping if mtime changes"
(+) in writeback mode the kernel does not invalidate data cache on file
size change, but neither it allows the filesystem to set the size due to
external event (see 8373200b12 "fuse: Trust kernel i_size only")
[1] https://lab.nexedi.com/kirr/wendelin.core/blob/a50f1d9f/wcfs/wcfs.go#L20
Signed-off-by: Kirill Smelkov <kirr@nexedi.com>
Signed-off-by: Miklos Szeredi <mszeredi@redhat.com>
2019-03-27 19:14:15 +08:00
|
|
|
/* INIT request/reply flag, bit 25: only invalidate cached pages on explicit request. */
#define FUSE_EXPLICIT_INVAL_DATA (1 << 25)
|
2006-02-01 19:04:40 +08:00
|
|
|
|
2009-04-14 09:54:54 +08:00
|
|
|
/**
 * CUSE INIT request/reply flags
 *
 * CUSE_UNRESTRICTED_IOCTL:  use unrestricted ioctl
 */
#define CUSE_UNRESTRICTED_IOCTL	(1 << 0)
|
|
|
|
|
2006-12-07 12:35:38 +08:00
|
|
|
/**
 * Release flags
 */
#define FUSE_RELEASE_FLUSH		(1 << 0)
#define FUSE_RELEASE_FLOCK_UNLOCK	(1 << 1)
|
2006-12-07 12:35:38 +08:00
|
|
|
|
2007-10-18 18:06:59 +08:00
|
|
|
/**
 * Getattr flags
 */
#define FUSE_GETATTR_FH		(1 << 0)
|
|
|
|
|
2007-10-18 18:07:02 +08:00
|
|
|
/**
 * Lock flags
 */
#define FUSE_LK_FLOCK		(1 << 0)
|
|
|
|
|
2007-10-18 18:07:03 +08:00
|
|
|
/**
 * WRITE flags
 *
 * FUSE_WRITE_CACHE: delayed write from page cache, file handle is guessed
 * FUSE_WRITE_LOCKOWNER: lock_owner field is valid
 */
#define FUSE_WRITE_CACHE	(1 << 0)
#define FUSE_WRITE_LOCKOWNER	(1 << 1)
|
|
|
|
|
|
|
|
/**
 * Read flags
 *
 * Only bit 1 is assigned in this group; no read flag uses bit 0.
 */
#define FUSE_READ_LOCKOWNER	(1 << 1)
|
2007-10-18 18:07:03 +08:00
|
|
|
|
2008-11-26 19:03:55 +08:00
|
|
|
/**
 * Ioctl flags
 *
 * FUSE_IOCTL_COMPAT: 32bit compat ioctl on 64bit machine
 * FUSE_IOCTL_UNRESTRICTED: not restricted to well-formed ioctls, retry allowed
 * FUSE_IOCTL_RETRY: retry with new iovecs
 * FUSE_IOCTL_32BIT: 32bit ioctl
 * FUSE_IOCTL_DIR: is a directory
 *
 * FUSE_IOCTL_MAX_IOV: maximum of in_iovecs + out_iovecs
 */
#define FUSE_IOCTL_COMPAT	(1 << 0)
#define FUSE_IOCTL_UNRESTRICTED	(1 << 1)
#define FUSE_IOCTL_RETRY	(1 << 2)
#define FUSE_IOCTL_32BIT	(1 << 3)
#define FUSE_IOCTL_DIR		(1 << 4)

#define FUSE_IOCTL_MAX_IOV	256
|
|
|
|
|
2008-11-26 19:03:55 +08:00
|
|
|
/**
 * Poll flags
 *
 * FUSE_POLL_SCHEDULE_NOTIFY: request poll notify
 *
 * NOTE(review): presumably carried in fuse_poll_in.flags - confirm.
 */
#define FUSE_POLL_SCHEDULE_NOTIFY (1 << 0)
|
|
|
|
|
2005-09-10 04:10:27 +08:00
|
|
|
/*
 * Operation codes.
 *
 * Every value is assigned explicitly. The values 7 and 19 are not assigned
 * in this enum, and CUSE_INIT sits far above the FUSE range at 4096.
 * NOTE(review): presumably matched against fuse_in_header.opcode - confirm.
 */
enum fuse_opcode {
	FUSE_LOOKUP		= 1,
	FUSE_FORGET		= 2,  /* no reply */
	FUSE_GETATTR		= 3,
	FUSE_SETATTR		= 4,
	FUSE_READLINK		= 5,
	FUSE_SYMLINK		= 6,
	FUSE_MKNOD		= 8,
	FUSE_MKDIR		= 9,
	FUSE_UNLINK		= 10,
	FUSE_RMDIR		= 11,
	FUSE_RENAME		= 12,
	FUSE_LINK		= 13,
	FUSE_OPEN		= 14,
	FUSE_READ		= 15,
	FUSE_WRITE		= 16,
	FUSE_STATFS		= 17,
	FUSE_RELEASE		= 18,
	FUSE_FSYNC		= 20,
	FUSE_SETXATTR		= 21,
	FUSE_GETXATTR		= 22,
	FUSE_LISTXATTR		= 23,
	FUSE_REMOVEXATTR	= 24,
	FUSE_FLUSH		= 25,
	FUSE_INIT		= 26,
	FUSE_OPENDIR		= 27,
	FUSE_READDIR		= 28,
	FUSE_RELEASEDIR		= 29,
	FUSE_FSYNCDIR		= 30,
	FUSE_GETLK		= 31,
	FUSE_SETLK		= 32,
	FUSE_SETLKW		= 33,
	FUSE_ACCESS		= 34,
	FUSE_CREATE		= 35,
	FUSE_INTERRUPT		= 36,
	FUSE_BMAP		= 37,
	FUSE_DESTROY		= 38,
	FUSE_IOCTL		= 39,
	FUSE_POLL		= 40,
	FUSE_NOTIFY_REPLY	= 41,
	FUSE_BATCH_FORGET	= 42,
	FUSE_FALLOCATE		= 43,
	FUSE_READDIRPLUS	= 44,
	FUSE_RENAME2		= 45,
	FUSE_LSEEK		= 46,
	FUSE_COPY_FILE_RANGE	= 47,

	/* CUSE specific operations */
	CUSE_INIT		= 4096,
};
|
|
|
|
|
2008-11-26 19:03:55 +08:00
|
|
|
/*
 * Notification codes.
 *
 * Codes 1..6 are assigned explicitly. FUSE_NOTIFY_CODE_MAX has no
 * initializer, so it is one past the last explicit code (7 here) and moves
 * automatically when a new code is added before it.
 */
enum fuse_notify_code {
	FUSE_NOTIFY_POLL	= 1,
	FUSE_NOTIFY_INVAL_INODE	= 2,
	FUSE_NOTIFY_INVAL_ENTRY	= 3,
	FUSE_NOTIFY_STORE	= 4,
	FUSE_NOTIFY_RETRIEVE	= 5,
	FUSE_NOTIFY_DELETE	= 6,
	FUSE_NOTIFY_CODE_MAX,
};
|
|
|
|
|
2006-01-06 16:19:40 +08:00
|
|
|
/*
 * The read buffer is required to be at least 8k (8192 bytes), but may be
 * much larger.
 */
#define FUSE_MIN_READ_BUFFER 8192
|
2005-09-10 04:10:28 +08:00
|
|
|
|
2007-10-18 18:07:05 +08:00
|
|
|
#define FUSE_COMPAT_ENTRY_OUT_SIZE 120
|
|
|
|
|
2005-09-10 04:10:28 +08:00
|
|
|
struct fuse_entry_out {
|
2013-04-17 18:30:40 +08:00
|
|
|
uint64_t nodeid; /* Inode ID */
|
|
|
|
uint64_t generation; /* Inode generation: nodeid:gen must
|
|
|
|
be unique for the fs's lifetime */
|
|
|
|
uint64_t entry_valid; /* Cache timeout for the name */
|
|
|
|
uint64_t attr_valid; /* Cache timeout for the attributes */
|
|
|
|
uint32_t entry_valid_nsec;
|
|
|
|
uint32_t attr_valid_nsec;
|
2005-09-10 04:10:28 +08:00
|
|
|
struct fuse_attr attr;
|
|
|
|
};
|
|
|
|
|
|
|
|
/*
 * Single 64-bit field; 8 bytes total.
 * NOTE(review): presumably the body of a FUSE_FORGET request, with
 * 'nlookup' the number of lookups to drop - confirm.
 */
struct fuse_forget_in {
	uint64_t	nlookup;
};
|
|
|
|
|
2010-12-08 03:16:56 +08:00
|
|
|
/*
 * One (nodeid, nlookup) pair; 16 bytes total.
 * NOTE(review): presumably one element of the array following
 * struct fuse_batch_forget_in - confirm.
 */
struct fuse_forget_one {
	uint64_t	nodeid;
	uint64_t	nlookup;
};
|
|
|
|
|
|
|
|
/*
 * A 32-bit count plus a 32-bit 'dummy' filler member; 8 bytes total.
 * NOTE(review): presumably 'count' is the number of struct fuse_forget_one
 * records that follow in a FUSE_BATCH_FORGET request - confirm.
 */
struct fuse_batch_forget_in {
	uint32_t	count;
	uint32_t	dummy;
};
|
|
|
|
|
2007-10-18 18:06:59 +08:00
|
|
|
/*
 * Flags word, a 32-bit 'dummy' filler member, and a 64-bit file handle;
 * 16 bytes total.
 * NOTE(review): presumably getattr_flags takes the FUSE_GETATTR_* bits
 * - confirm.
 */
struct fuse_getattr_in {
	uint32_t	getattr_flags;
	uint32_t	dummy;
	uint64_t	fh;
};
|
|
|
|
|
2007-10-18 18:07:05 +08:00
|
|
|
#define FUSE_COMPAT_ATTR_OUT_SIZE 96
|
|
|
|
|
2005-09-10 04:10:28 +08:00
|
|
|
struct fuse_attr_out {
|
2013-04-17 18:30:40 +08:00
|
|
|
uint64_t attr_valid; /* Cache timeout for the attributes */
|
|
|
|
uint32_t attr_valid_nsec;
|
|
|
|
uint32_t dummy;
|
2005-09-10 04:10:28 +08:00
|
|
|
struct fuse_attr attr;
|
|
|
|
};
|
|
|
|
|
2009-07-01 02:12:23 +08:00
|
|
|
/*
 * Equals the combined size of the first two members of struct
 * fuse_mknod_in (mode, rdev).
 * NOTE(review): presumably the size of the structure before umask/padding
 * were added - confirm.
 */
#define FUSE_COMPAT_MKNOD_IN_SIZE 8

/* Four 32-bit members; 16 bytes total. */
struct fuse_mknod_in {
	uint32_t	mode;
	uint32_t	rdev;
	uint32_t	umask;
	uint32_t	padding;
};
|
|
|
|
|
|
|
|
/* Two 32-bit members (mode, umask); 8 bytes total. */
struct fuse_mkdir_in {
	uint32_t	mode;
	uint32_t	umask;
};
|
|
|
|
|
|
|
|
/*
 * Single 64-bit member; 8 bytes total.
 * NOTE(review): presumably 'newdir' is the node ID of the destination
 * directory - confirm.
 */
struct fuse_rename_in {
	uint64_t	newdir;
};
|
|
|
|
|
2014-04-28 22:43:44 +08:00
|
|
|
/*
 * Same leading 'newdir' member as struct fuse_rename_in, followed by a
 * 32-bit flags word and explicit padding; 16 bytes total.
 */
struct fuse_rename2_in {
	uint64_t	newdir;
	uint32_t	flags;
	uint32_t	padding;
};
|
|
|
|
|
2005-09-10 04:10:29 +08:00
|
|
|
/*
 * Single 64-bit member; 8 bytes total.
 * NOTE(review): presumably 'oldnodeid' is the node being linked - confirm.
 */
struct fuse_link_in {
	uint64_t	oldnodeid;
};
|
|
|
|
|
|
|
|
/*
 * Attribute-change request; 88 bytes total.
 *
 * Times are split into a 64-bit whole part (atime/mtime/ctime) and a
 * 32-bit *nsec part. 'unused4' and 'unused5' are placeholder members.
 * NOTE(review): presumably 'valid' is a bitmask selecting which of the
 * other members are meaningful - confirm.
 */
struct fuse_setattr_in {
	uint32_t	valid;
	uint32_t	padding;
	uint64_t	fh;
	uint64_t	size;
	uint64_t	lock_owner;
	uint64_t	atime;
	uint64_t	mtime;
	uint64_t	ctime;
	uint32_t	atimensec;
	uint32_t	mtimensec;
	uint32_t	ctimensec;
	uint32_t	mode;
	uint32_t	unused4;
	uint32_t	uid;
	uint32_t	gid;
	uint32_t	unused5;
};
|
|
|
|
|
2005-09-10 04:10:30 +08:00
|
|
|
/* 32-bit flags word plus a 32-bit 'unused' member; 8 bytes total. */
struct fuse_open_in {
	uint32_t	flags;
	uint32_t	unused;
};
|
|
|
|
|
|
|
|
/* Four 32-bit members (flags, mode, umask, padding); 16 bytes total. */
struct fuse_create_in {
	uint32_t	flags;
	uint32_t	mode;
	uint32_t	umask;
	uint32_t	padding;
};
|
|
|
|
|
|
|
|
/*
 * 64-bit file handle, 32-bit open_flags, explicit padding; 16 bytes total.
 */
struct fuse_open_out {
	uint64_t	fh;
	uint32_t	open_flags;
	uint32_t	padding;
};
|
|
|
|
|
|
|
|
/*
 * File handle, two 32-bit flag words, and a lock owner; 24 bytes total.
 * NOTE(review): presumably release_flags takes the FUSE_RELEASE_* bits
 * - confirm.
 */
struct fuse_release_in {
	uint64_t	fh;
	uint32_t	flags;
	uint32_t	release_flags;
	uint64_t	lock_owner;
};
|
|
|
|
|
|
|
|
/*
 * File handle, a 32-bit 'unused' member, explicit padding, and a lock
 * owner; 24 bytes total (same member offsets as struct fuse_release_in).
 */
struct fuse_flush_in {
	uint64_t	fh;
	uint32_t	unused;
	uint32_t	padding;
	uint64_t	lock_owner;
};
|
|
|
|
|
|
|
|
/*
 * Read request; 40 bytes total. The member layout mirrors struct
 * fuse_write_in, with read_flags in place of write_flags.
 * NOTE(review): presumably read_flags takes the FUSE_READ_* bits - confirm.
 */
struct fuse_read_in {
	uint64_t	fh;
	uint64_t	offset;
	uint32_t	size;
	uint32_t	read_flags;
	uint64_t	lock_owner;
	uint32_t	flags;
	uint32_t	padding;
};
|
|
|
|
|
2007-10-18 18:07:04 +08:00
|
|
|
/*
 * Equals the combined size of the first four members of struct
 * fuse_write_in (fh, offset, size, write_flags).
 * NOTE(review): presumably the size of the structure before lock_owner
 * and flags were added - confirm.
 */
#define FUSE_COMPAT_WRITE_IN_SIZE 24

/*
 * Write request; 40 bytes total.
 * NOTE(review): presumably write_flags takes the FUSE_WRITE_* bits, and
 * the data to write follows the structure - confirm.
 */
struct fuse_write_in {
	uint64_t	fh;
	uint64_t	offset;
	uint32_t	size;
	uint32_t	write_flags;
	uint64_t	lock_owner;
	uint32_t	flags;
	uint32_t	padding;
};
|
|
|
|
|
|
|
|
/*
 * 32-bit size plus explicit padding; 8 bytes total.
 * NOTE(review): presumably 'size' is the number of bytes written - confirm.
 */
struct fuse_write_out {
	uint32_t	size;
	uint32_t	padding;
};
|
|
|
|
|
2006-01-06 16:19:37 +08:00
|
|
|
#define FUSE_COMPAT_STATFS_SIZE 48
|
|
|
|
|
2005-09-10 04:10:28 +08:00
|
|
|
struct fuse_statfs_out {
|
|
|
|
struct fuse_kstatfs st;
|
|
|
|
};
|
|
|
|
|
2005-09-10 04:10:30 +08:00
|
|
|
/* File handle, 32-bit fsync_flags, explicit padding; 16 bytes total. */
struct fuse_fsync_in {
	uint64_t	fh;
	uint32_t	fsync_flags;
	uint32_t	padding;
};
|
|
|
|
|
2005-09-10 04:10:31 +08:00
|
|
|
/*
 * Two 32-bit members (size, flags); 8 bytes total.
 * NOTE(review): presumably 'size' is the length of the attribute value
 * that accompanies the request - confirm.
 */
struct fuse_setxattr_in {
	uint32_t	size;
	uint32_t	flags;
};
|
|
|
|
|
|
|
|
/* 32-bit size plus explicit padding; 8 bytes total. */
struct fuse_getxattr_in {
	uint32_t	size;
	uint32_t	padding;
};
|
|
|
|
|
|
|
|
/*
 * 32-bit size plus explicit padding; 8 bytes total (same layout as
 * struct fuse_getxattr_in).
 */
struct fuse_getxattr_out {
	uint32_t	size;
	uint32_t	padding;
};
|
|
|
|
|
2006-06-25 20:48:52 +08:00
|
|
|
struct fuse_lk_in {
|
2013-04-17 18:30:40 +08:00
|
|
|
uint64_t fh;
|
|
|
|
uint64_t owner;
|
2006-06-25 20:48:52 +08:00
|
|
|
struct fuse_file_lock lk;
|
2013-04-17 18:30:40 +08:00
|
|
|
uint32_t lk_flags;
|
|
|
|
uint32_t padding;
|
2006-06-25 20:48:52 +08:00
|
|
|
};
|
|
|
|
|
|
|
|
struct fuse_lk_out {
|
|
|
|
struct fuse_file_lock lk;
|
|
|
|
};
|
|
|
|
|
2005-11-07 16:59:50 +08:00
|
|
|
/* 32-bit mask plus explicit padding; 8 bytes total. */
struct fuse_access_in {
	uint32_t	mask;
	uint32_t	padding;
};
|
|
|
|
|
2006-01-06 16:19:41 +08:00
|
|
|
/*
 * Four 32-bit members; 16 bytes total. The first four members of
 * struct fuse_init_out have the same names and order.
 */
struct fuse_init_in {
	uint32_t	major;
	uint32_t	minor;
	uint32_t	max_readahead;
	uint32_t	flags;
};
|
|
|
|
|
2014-04-28 20:19:23 +08:00
|
|
|
/*
 * Truncated sizes of struct fuse_init_out:
 *   FUSE_COMPAT_INIT_OUT_SIZE    (8)  covers major and minor only;
 *   FUSE_COMPAT_22_INIT_OUT_SIZE (24) covers everything up to and
 *                                     including max_write.
 * NOTE(review): presumably these are the reply sizes used by older
 * protocol revisions - confirm.
 */
#define FUSE_COMPAT_INIT_OUT_SIZE 8
#define FUSE_COMPAT_22_INIT_OUT_SIZE 24

/*
 * Init reply; 64 bytes total, of which the trailing 32 bytes are the
 * 'unused' array.
 */
struct fuse_init_out {
	uint32_t	major;
	uint32_t	minor;
	uint32_t	max_readahead;
	uint32_t	flags;
	uint16_t	max_background;
	uint16_t	congestion_threshold;
	uint32_t	max_write;
	uint32_t	time_gran;
	uint16_t	max_pages;	/* configurable pages-per-request limit;
					   per the change that added it, replaces
					   the fixed FUSE_MAX_PAGES_PER_REQ */
	uint16_t	padding;
	uint32_t	unused[8];
};
|
|
|
|
|
2009-04-14 09:54:54 +08:00
|
|
|
/*
 * NOTE(review): presumably the upper bound, in bytes, on the device-info
 * data exchanged during CUSE_INIT - confirm against the users of this macro.
 */
#define CUSE_INIT_INFO_MAX 4096
|
|
|
|
|
|
|
|
/*
 * Four 32-bit members; 16 bytes total.
 * NOTE(review): presumably 'flags' takes the CUSE_* init flags - confirm.
 */
struct cuse_init_in {
	uint32_t	major;
	uint32_t	minor;
	uint32_t	unused;
	uint32_t	flags;
};
|
|
|
|
|
|
|
|
/*
 * CUSE init reply; 72 bytes total, of which the trailing 40 bytes are the
 * 'spare' array. The first four members match struct cuse_init_in.
 */
struct cuse_init_out {
	uint32_t	major;
	uint32_t	minor;
	uint32_t	unused;
	uint32_t	flags;
	uint32_t	max_read;
	uint32_t	max_write;
	uint32_t	dev_major;		/* chardev major */
	uint32_t	dev_minor;		/* chardev minor */
	uint32_t	spare[10];
};
|
|
|
|
|
2006-06-25 20:48:54 +08:00
|
|
|
/*
 * Single 64-bit member; 8 bytes total.
 * NOTE(review): presumably 'unique' identifies the request being
 * interrupted (compare fuse_in_header.unique) - confirm.
 */
struct fuse_interrupt_in {
	uint64_t	unique;
};
|
|
|
|
|
2006-12-07 12:35:51 +08:00
|
|
|
/* 64-bit block, 32-bit blocksize, explicit padding; 16 bytes total. */
struct fuse_bmap_in {
	uint64_t	block;
	uint32_t	blocksize;
	uint32_t	padding;
};
|
|
|
|
|
|
|
|
/* Single 64-bit block number; 8 bytes total. */
struct fuse_bmap_out {
	uint64_t	block;
};
|
|
|
|
|
2008-11-26 19:03:55 +08:00
|
|
|
/*
 * Ioctl request; 32 bytes total.
 * NOTE(review): presumably 'flags' takes the FUSE_IOCTL_* bits and
 * in_size/out_size are byte counts - confirm.
 */
struct fuse_ioctl_in {
	uint64_t	fh;
	uint32_t	flags;
	uint32_t	cmd;
	uint64_t	arg;
	uint32_t	in_size;
	uint32_t	out_size;
};
|
|
|
|
|
2010-12-08 03:16:56 +08:00
|
|
|
/*
 * A (base, len) pair with both members fixed at 64 bits; 16 bytes total
 * regardless of the native pointer width.
 */
struct fuse_ioctl_iovec {
	uint64_t	base;
	uint64_t	len;
};
|
|
|
|
|
2008-11-26 19:03:55 +08:00
|
|
|
/*
 * Ioctl reply; 16 bytes total. 'result' is the only signed member.
 * NOTE(review): presumably in_iovs/out_iovs count struct fuse_ioctl_iovec
 * entries, bounded by FUSE_IOCTL_MAX_IOV - confirm.
 */
struct fuse_ioctl_out {
	int32_t		result;
	uint32_t	flags;
	uint32_t	in_iovs;
	uint32_t	out_iovs;
};
|
|
|
|
|
2008-11-26 19:03:55 +08:00
|
|
|
/*
 * Poll request; 24 bytes total.
 * NOTE(review): presumably 'kh' is the handle later echoed in
 * struct fuse_notify_poll_wakeup_out, and 'flags' takes the FUSE_POLL_*
 * bits - confirm.
 */
struct fuse_poll_in {
	uint64_t	fh;
	uint64_t	kh;
	uint32_t	flags;
	uint32_t	events;
};
|
|
|
|
|
|
|
|
/* 32-bit revents plus explicit padding; 8 bytes total. */
struct fuse_poll_out {
	uint32_t	revents;
	uint32_t	padding;
};
|
|
|
|
|
|
|
|
/*
 * Single 64-bit member; 8 bytes total.
 * NOTE(review): presumably the body of a FUSE_NOTIFY_POLL notification,
 * with 'kh' taken from an earlier fuse_poll_in.kh - confirm.
 */
struct fuse_notify_poll_wakeup_out {
	uint64_t	kh;
};
|
|
|
|
|
2012-04-23 09:45:24 +08:00
|
|
|
/*
 * File handle, 64-bit offset and length, 32-bit mode, explicit padding;
 * 32 bytes total.
 */
struct fuse_fallocate_in {
	uint64_t	fh;
	uint64_t	offset;
	uint64_t	length;
	uint32_t	mode;
	uint32_t	padding;
};
|
|
|
|
|
2005-09-10 04:10:27 +08:00
|
|
|
/*
 * Request header; 40 bytes total.
 * NOTE(review): presumably 'len' is the total request length including
 * this header, 'opcode' is an enum fuse_opcode value, and 'unique' is
 * echoed back in fuse_out_header.unique - confirm.
 */
struct fuse_in_header {
	uint32_t	len;
	uint32_t	opcode;
	uint64_t	unique;
	uint64_t	nodeid;
	uint32_t	uid;
	uint32_t	gid;
	uint32_t	pid;
	uint32_t	padding;
};
|
|
|
|
|
|
|
|
/*
 * Reply header; 16 bytes total. 'error' is signed.
 * NOTE(review): presumably 'error' holds zero or a negated errno value and
 * 'unique' matches the request's fuse_in_header.unique - confirm.
 */
struct fuse_out_header {
	uint32_t	len;
	int32_t		error;
	uint64_t	unique;
};
|
|
|
|
|
2005-09-10 04:10:28 +08:00
|
|
|
/*
 * Directory entry record: a 24-byte fixed part followed by 'namelen' bytes
 * of name in a flexible array member.
 * NOTE(review): FUSE_DIRENT_SIZE() adds exactly 'namelen' bytes, so the
 * name is presumably not NUL-terminated in the record - confirm.
 */
struct fuse_dirent {
	uint64_t	ino;
	uint64_t	off;
	uint32_t	namelen;
	uint32_t	type;
	char name[];
};

/* Byte offset of the name within a record (the size of the fixed part). */
#define FUSE_NAME_OFFSET offsetof(struct fuse_dirent, name)
/* Round x up to the next multiple of sizeof(uint64_t). */
#define FUSE_DIRENT_ALIGN(x) \
	(((x) + sizeof(uint64_t) - 1) & ~(sizeof(uint64_t) - 1))
/* Total size of record *d: fixed part plus name, rounded up to 8 bytes. */
#define FUSE_DIRENT_SIZE(d) \
	FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET + (d)->namelen)
|
2008-10-16 22:08:57 +08:00
|
|
|
|
2012-08-19 20:53:23 +08:00
|
|
|
struct fuse_direntplus {
|
|
|
|
struct fuse_entry_out entry_out;
|
|
|
|
struct fuse_dirent dirent;
|
|
|
|
};
|
|
|
|
|
|
|
|
#define FUSE_NAME_OFFSET_DIRENTPLUS \
|
|
|
|
offsetof(struct fuse_direntplus, dirent.name)
|
|
|
|
#define FUSE_DIRENTPLUS_SIZE(d) \
|
|
|
|
FUSE_DIRENT_ALIGN(FUSE_NAME_OFFSET_DIRENTPLUS + (d)->dirent.namelen)
|
|
|
|
|
2009-05-31 23:13:57 +08:00
|
|
|
/*
 * Inode number plus a signed 64-bit offset and length; 24 bytes total.
 * NOTE(review): presumably the body of FUSE_NOTIFY_INVAL_INODE; the
 * meaning of negative off/len values is not visible here - confirm.
 */
struct fuse_notify_inval_inode_out {
	uint64_t	ino;
	int64_t		off;
	int64_t		len;
};
|
|
|
|
|
|
|
|
/*
 * Parent node, 32-bit name length, explicit padding; 16 bytes total.
 * NOTE(review): presumably the name itself follows the structure - confirm.
 */
struct fuse_notify_inval_entry_out {
	uint64_t	parent;
	uint32_t	namelen;
	uint32_t	padding;
};
|
|
|
|
|
2011-12-07 04:50:06 +08:00
|
|
|
/*
 * Parent and child nodes, 32-bit name length, explicit padding; 24 bytes
 * total.
 * NOTE(review): presumably the name itself follows the structure - confirm.
 */
struct fuse_notify_delete_out {
	uint64_t	parent;
	uint64_t	child;
	uint32_t	namelen;
	uint32_t	padding;
};
|
|
|
|
|
2010-07-12 20:41:40 +08:00
|
|
|
/*
 * Node, 64-bit offset, 32-bit size, explicit padding; 24 bytes total.
 * NOTE(review): presumably 'size' bytes of data follow the structure
 * - confirm.
 */
struct fuse_notify_store_out {
	uint64_t	nodeid;
	uint64_t	offset;
	uint32_t	size;
	uint32_t	padding;
};
|
|
|
|
|
2010-07-12 20:41:40 +08:00
|
|
|
/*
 * Same members as struct fuse_notify_store_out, preceded by a 64-bit
 * 'notify_unique'; 32 bytes total.
 */
struct fuse_notify_retrieve_out {
	uint64_t	notify_unique;
	uint64_t	nodeid;
	uint64_t	offset;
	uint32_t	size;
	uint32_t	padding;
};
|
|
|
|
|
|
|
|
/*
 * Matches the size of fuse_write_in (40 bytes). 'offset' and 'size' sit at
 * the same byte offsets (8 and 16) as in that structure; the dummy members
 * fill the remaining positions.
 */
struct fuse_notify_retrieve_in {
	uint64_t	dummy1;
	uint64_t	offset;
	uint32_t	size;
	uint32_t	dummy2;
	uint64_t	dummy3;
	uint64_t	dummy4;
};
|
|
|
|
|
2015-07-01 22:26:08 +08:00
|
|
|
/*
 * Device ioctls:
 *
 * FUSE_DEV_IOC_CLONE is built with _IOR() using type 229, number 0 and a
 * uint32_t argument. _IOR itself comes from the platform ioctl headers.
 */
#define FUSE_DEV_IOC_CLONE	_IOR(229, 0, uint32_t)
|
|
|
|
|
2015-07-01 02:10:22 +08:00
|
|
|
/*
 * File handle, 64-bit offset, 32-bit whence, explicit padding; 24 bytes
 * total.
 */
struct fuse_lseek_in {
	uint64_t	fh;
	uint64_t	offset;
	uint32_t	whence;
	uint32_t	padding;
};
|
|
|
|
|
|
|
|
/* Single 64-bit offset; 8 bytes total. */
struct fuse_lseek_out {
	uint64_t	offset;
};
|
|
|
|
|
2018-08-21 20:36:31 +08:00
|
|
|
/*
 * Seven 64-bit members; 56 bytes total. Only the destination carries an
 * explicit node ID (nodeid_out) here.
 * NOTE(review): presumably the source node is identified by
 * fuse_in_header.nodeid - confirm.
 */
struct fuse_copy_file_range_in {
	uint64_t	fh_in;
	uint64_t	off_in;
	uint64_t	nodeid_out;
	uint64_t	fh_out;
	uint64_t	off_out;
	uint64_t	len;
	uint64_t	flags;
};
|
|
|
|
|
2008-10-16 22:08:57 +08:00
|
|
|
#endif /* _LINUX_FUSE_H */
|