OpenCloudOS-Kernel/fs/hostfs/hostfs_kern.c

1010 lines
21 KiB
C
Raw Normal View History

/*
* Copyright (C) 2000 - 2007 Jeff Dike (jdike@{addtoit,linux.intel}.com)
* Licensed under the GPL
*
* Ported the filesystem routines to 2.5.
* 2003-02-10 Petr Baudis <pasky@ucw.cz>
*/
#include <linux/fs.h>
#include <linux/magic.h>
#include <linux/module.h>
#include <linux/mm.h>
#include <linux/pagemap.h>
#include <linux/statfs.h>
include cleanup: Update gfp.h and slab.h includes to prepare for breaking implicit slab.h inclusion from percpu.h percpu.h is included by sched.h and module.h and thus ends up being included when building most .c files. percpu.h includes slab.h which in turn includes gfp.h making everything defined by the two files universally available and complicating inclusion dependencies. percpu.h -> slab.h dependency is about to be removed. Prepare for this change by updating users of gfp and slab facilities include those headers directly instead of assuming availability. As this conversion needs to touch large number of source files, the following script is used as the basis of conversion. http://userweb.kernel.org/~tj/misc/slabh-sweep.py The script does the followings. * Scan files for gfp and slab usages and update includes such that only the necessary includes are there. ie. if only gfp is used, gfp.h, if slab is used, slab.h. * When the script inserts a new include, it looks at the include blocks and try to put the new include such that its order conforms to its surrounding. It's put in the include block which contains core kernel includes, in the same order that the rest are ordered - alphabetical, Christmas tree, rev-Xmas-tree or at the end if there doesn't seem to be any matching order. * If the script can't find a place to put a new include (mostly because the file doesn't have fitting include block), it prints out an error message indicating which .h file needs to be added to the file. The conversion was done in the following steps. 1. The initial automatic conversion of all .c files updated slightly over 4000 files, deleting around 700 includes and adding ~480 gfp.h and ~3000 slab.h inclusions. The script emitted errors for ~400 files. 2. Each error was manually checked. Some didn't need the inclusion, some needed manual addition while adding it to implementation .h or embedding .c file was more appropriate for others. 
This step added inclusions to around 150 files. 3. The script was run again and the output was compared to the edits from #2 to make sure no file was left behind. 4. Several build tests were done and a couple of problems were fixed. e.g. lib/decompress_*.c used malloc/free() wrappers around slab APIs requiring slab.h to be added manually. 5. The script was run on all .h files but without automatically editing them as sprinkling gfp.h and slab.h inclusions around .h files could easily lead to inclusion dependency hell. Most gfp.h inclusion directives were ignored as stuff from gfp.h was usually wildly available and often used in preprocessor macros. Each slab.h inclusion directive was examined and added manually as necessary. 6. percpu.h was updated not to include slab.h. 7. Build test were done on the following configurations and failures were fixed. CONFIG_GCOV_KERNEL was turned off for all tests (as my distributed build env didn't work with gcov compiles) and a few more options had to be turned off depending on archs to make things build (like ipr on powerpc/64 which failed due to missing writeq). * x86 and x86_64 UP and SMP allmodconfig and a custom test config. * powerpc and powerpc64 SMP allmodconfig * sparc and sparc64 SMP allmodconfig * ia64 SMP allmodconfig * s390 SMP allmodconfig * alpha SMP allmodconfig * um on x86_64 SMP allmodconfig 8. percpu.h modifications were reverted so that it could be applied as a separate patch and serve as bisection point. Given the fact that I had only a couple of failures from tests on step 6, I'm fairly confident about the coverage of this conversion patch. If there is a breakage, it's likely to be something in one of the arch headers which should be easily discoverable easily on most builds of the specific arch. Signed-off-by: Tejun Heo <tj@kernel.org> Guess-its-ok-by: Christoph Lameter <cl@linux-foundation.org> Cc: Ingo Molnar <mingo@redhat.com> Cc: Lee Schermerhorn <Lee.Schermerhorn@hp.com>
2010-03-24 16:04:11 +08:00
#include <linux/slab.h>
#include <linux/seq_file.h>
#include <linux/mount.h>
#include <linux/namei.h>
#include "hostfs.h"
#include <init.h>
#include <kern.h>
/*
 * Per-inode hostfs state.  The VFS inode is embedded in this structure so
 * that HOSTFS_I() can get back to it from a struct inode pointer.
 */
struct hostfs_inode_info {
/* Descriptor handed to the host file helpers; -1 while nothing is open */
int fd;
/* FMODE_READ/FMODE_WRITE bits that fd has been opened with so far */
fmode_t mode;
struct inode vfs_inode;
/* Taken by hostfs_open() while it examines and updates fd and mode */
struct mutex open_mutex;
};
/*
 * Return the hostfs_inode_info that embeds @inode as its vfs_inode member.
 */
static inline struct hostfs_inode_info *HOSTFS_I(struct inode *inode)
{
	return container_of(inode, struct hostfs_inode_info, vfs_inode);
}
/* Map an open struct file to the hostfs state of its inode */
#define FILE_HOSTFS_I(file) HOSTFS_I(file_inode(file))
/* Slab cache that struct hostfs_inode_info objects are allocated from */
static struct kmem_cache *hostfs_inode_cache;
/* Changed in hostfs_args before the kernel starts running */
static char *root_ino = "";
static int append = 0;
/* Forward declarations of the inode operation tables */
static const struct inode_operations hostfs_iops;
static const struct inode_operations hostfs_dir_iops;
static const struct inode_operations hostfs_link_iops;
#ifndef MODULE
/*
 * Parse the value of the "hostfs=" option: "<root dir>[,<flag>]...".
 *
 * @options is split in place at each comma.  A non-empty first field
 * becomes root_ino; every later non-empty field is treated as a flag.
 * "append" sets the append global, anything else is reported and ignored.
 * @add is not used.  Always returns 0.
 */
static int __init hostfs_args(char *options, int *add)
{
	char *rest;
	char *flag;

	/* Cut the root directory field off the front of the string. */
	rest = strchr(options, ',');
	if (rest != NULL)
		*rest++ = '\0';
	if (*options != '\0')
		root_ino = options;

	/* Walk the remaining comma-separated flags, skipping empty ones. */
	for (flag = rest; flag != NULL; flag = rest) {
		rest = strchr(flag, ',');
		if (rest != NULL)
			*rest++ = '\0';

		if (*flag == '\0')
			continue;

		if (strcmp(flag, "append") == 0)
			append = 1;
		else
			printf("hostfs_args - unsupported option - %s\n",
			       flag);
	}
	return 0;
}
/*
 * Tie the "hostfs=" option string to hostfs_args() together with its help
 * text (presumably consumed by UML's command line handling).
 */
__uml_setup("hostfs=", hostfs_args,
"hostfs=<root dir>,<flags>,...\n"
" This is used to set hostfs parameters. The root directory argument\n"
" is used to confine all hostfs mounts to within the specified directory\n"
" tree on the host. If this isn't specified, then a user inside UML can\n"
" mount anything on the host that's accessible to the user that's running\n"
" it.\n"
" The only flag currently supported is 'append', which specifies that all\n"
" files opened by hostfs will be opened in append mode.\n\n"
);
#endif
/*
 * Turn @dentry into a path string inside @name, a PATH_MAX byte buffer:
 * the string stored in the superblock's s_fs_info followed by the dentry's
 * path within the filesystem.
 *
 * Returns @name on success.  On failure the buffer is released with
 * __putname() and NULL is returned, so the caller must not use or free
 * @name afterwards.
 */
static char *__dentry_name(struct dentry *dentry, char *name)
{
	char *p = dentry_path_raw(dentry, name, PATH_MAX);
	char *root = dentry->d_sb->s_fs_info;
	size_t len = strlen(root);

	if (IS_ERR(p)) {
		__putname(name);
		return NULL;
	}

	/*
	 * This function relies on the fact that dentry_path_raw() will place
	 * the path name at the end of the provided buffer.
	 */
	BUG_ON(p + strlen(p) + 1 != name + PATH_MAX);

	/* The prefix has to fit into the gap in front of the path. */
	if (len > p - name) {
		__putname(name);
		return NULL;
	}

	/*
	 * Copy the prefix without a terminating NUL: when it fills the gap
	 * exactly, a terminator would land on the first byte of the path.
	 * Then slide the path (terminator included) down behind the prefix;
	 * source and destination may overlap, hence memmove().
	 */
	memcpy(name, root, len);
	memmove(name + len, p, name + PATH_MAX - p);

	return name;
}
/*
 * Allocate a name buffer with __getname() and fill it via __dentry_name().
 * Returns the buffer, or NULL when either step fails.
 */
static char *dentry_name(struct dentry *dentry)
{
	char *buf = __getname();

	return buf ? __dentry_name(dentry, buf) : NULL;
}
/*
 * Build the name of @ino from the dentry returned by d_find_alias().
 * Returns NULL when there is no such dentry or the name cannot be built.
 */
static char *inode_name(struct inode *ino)
{
	struct dentry *alias = d_find_alias(ino);
	char *path;

	if (alias == NULL)
		return NULL;

	path = dentry_name(alias);
	/* Drop the reference d_find_alias() handed out. */
	dput(alias);
	return path;
}
/*
 * Read the target of the symlink named by @link.
 *
 * A target starting with '/' is returned unchanged.  Any other target is
 * appended to @link cut off just after its last '/', or returned unchanged
 * when @link contains no '/'.  Note that @link is truncated in place in
 * that case.
 *
 * Returns a kmalloc'ed string the caller has to kfree(), or an ERR_PTR():
 * -ENOMEM on allocation failure, -E2BIG when the target fills the whole
 * PATH_MAX buffer, or the negative value from hostfs_do_readlink().
 */
static char *follow_link(char *link)
{
	char *target, *joined, *slash;
	int len;

	target = kmalloc(PATH_MAX, GFP_KERNEL);
	if (target == NULL)
		return ERR_PTR(-ENOMEM);

	len = hostfs_do_readlink(link, target, PATH_MAX);
	if (len == PATH_MAX)
		len = -E2BIG;
	if (len < 0)
		goto fail;

	if (target[0] == '/')
		return target;

	slash = strrchr(link, '/');
	if (slash == NULL)
		return target;

	/* Keep only the directory part of the link's own path. */
	slash[1] = '\0';

	joined = kasprintf(GFP_KERNEL, "%s%s", link, target);
	if (joined == NULL) {
		len = -ENOMEM;
		goto fail;
	}

	kfree(target);
	return joined;

fail:
	kfree(target);
	return ERR_PTR(len);
}
/*
 * Get a fresh inode for @sb from new_inode().
 * Returns the inode, or ERR_PTR(-ENOMEM) when new_inode() returns NULL.
 */
static struct inode *hostfs_iget(struct super_block *sb)
{
	struct inode *inode = new_inode(sb);

	return inode ? inode : ERR_PTR(-ENOMEM);
}
/*
 * Fill @sf through do_statfs(), using the string stored in the
 * superblock's s_fs_info.  Returns 0, or the non-zero value do_statfs()
 * returned.
 */
static int hostfs_statfs(struct dentry *dentry, struct kstatfs *sf)
{
	/*
	 * do_statfs uses struct statfs64 internally, but the linux kernel
	 * struct statfs still has 32-bit versions for most of these fields,
	 * so the counters are collected in long long temporaries and
	 * converted on assignment below.
	 */
	long long blocks, blocks_free, blocks_avail, files, files_free;
	int ret;

	ret = do_statfs(dentry->d_sb->s_fs_info, &sf->f_bsize, &blocks,
			&blocks_free, &blocks_avail, &files, &files_free,
			&sf->f_fsid, sizeof(sf->f_fsid), &sf->f_namelen);
	if (ret != 0)
		return ret;

	sf->f_blocks = blocks;
	sf->f_bfree = blocks_free;
	sf->f_bavail = blocks_avail;
	sf->f_files = files;
	sf->f_ffree = files_free;
	sf->f_type = HOSTFS_SUPER_MAGIC;
	return 0;
}
/*
 * super_operations.alloc_inode: allocate a hostfs_inode_info from
 * hostfs_inode_cache and return its embedded VFS inode, or NULL when the
 * allocation fails.  The new object starts with no descriptor (fd == -1)
 * and no open mode bits.
 */
static struct inode *hostfs_alloc_inode(struct super_block *sb)
{
	struct hostfs_inode_info *info =
		kmem_cache_alloc(hostfs_inode_cache, GFP_KERNEL_ACCOUNT);

	if (!info)
		return NULL;

	info->fd = -1;
	info->mode = 0;
	inode_init_once(&info->vfs_inode);
	mutex_init(&info->open_mutex);

	return &info->vfs_inode;
}
static void hostfs_evict_inode(struct inode *inode)
{
mm + fs: store shadow entries in page cache Reclaim will be leaving shadow entries in the page cache radix tree upon evicting the real page. As those pages are found from the LRU, an iput() can lead to the inode being freed concurrently. At this point, reclaim must no longer install shadow pages because the inode freeing code needs to ensure the page tree is really empty. Add an address_space flag, AS_EXITING, that the inode freeing code sets under the tree lock before doing the final truncate. Reclaim will check for this flag before installing shadow pages. Signed-off-by: Johannes Weiner <hannes@cmpxchg.org> Reviewed-by: Rik van Riel <riel@redhat.com> Reviewed-by: Minchan Kim <minchan@kernel.org> Cc: Andrea Arcangeli <aarcange@redhat.com> Cc: Bob Liu <bob.liu@oracle.com> Cc: Christoph Hellwig <hch@infradead.org> Cc: Dave Chinner <david@fromorbit.com> Cc: Greg Thelen <gthelen@google.com> Cc: Hugh Dickins <hughd@google.com> Cc: Jan Kara <jack@suse.cz> Cc: KOSAKI Motohiro <kosaki.motohiro@jp.fujitsu.com> Cc: Luigi Semenzato <semenzato@google.com> Cc: Mel Gorman <mgorman@suse.de> Cc: Metin Doslu <metin@citusdata.com> Cc: Michel Lespinasse <walken@google.com> Cc: Ozgun Erdogan <ozgun@citusdata.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Roman Gushchin <klamm@yandex-team.ru> Cc: Ryan Mallon <rmallon@gmail.com> Cc: Tejun Heo <tj@kernel.org> Cc: Vlastimil Babka <vbabka@suse.cz> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2014-04-04 05:47:49 +08:00
truncate_inode_pages_final(&inode->i_data);
clear_inode(inode);
if (HOSTFS_I(inode)->fd != -1) {
close_file(&HOSTFS_I(inode)->fd);
HOSTFS_I(inode)->fd = -1;
}
}
static void hostfs_free_inode(struct inode *inode)
{
kmem_cache_free(hostfs_inode_cache, HOSTFS_I(inode));
}
2011-01-07 14:49:49 +08:00
static int hostfs_show_options(struct seq_file *seq, struct dentry *root)
{
const char *root_path = root->d_sb->s_fs_info;
size_t offset = strlen(root_ino) + 1;
if (strlen(root_path) > offset)
fs: create and use seq_show_option for escaping Many file systems that implement the show_options hook fail to correctly escape their output which could lead to unescaped characters (e.g. new lines) leaking into /proc/mounts and /proc/[pid]/mountinfo files. This could lead to confusion, spoofed entries (resulting in things like systemd issuing false d-bus "mount" notifications), and who knows what else. This looks like it would only be the root user stepping on themselves, but it's possible weird things could happen in containers or in other situations with delegated mount privileges. Here's an example using overlay with setuid fusermount trusting the contents of /proc/mounts (via the /etc/mtab symlink). Imagine the use of "sudo" is something more sneaky: $ BASE="ovl" $ MNT="$BASE/mnt" $ LOW="$BASE/lower" $ UP="$BASE/upper" $ WORK="$BASE/work/ 0 0 none /proc fuse.pwn user_id=1000" $ mkdir -p "$LOW" "$UP" "$WORK" $ sudo mount -t overlay -o "lowerdir=$LOW,upperdir=$UP,workdir=$WORK" none /mnt $ cat /proc/mounts none /root/ovl/mnt overlay rw,relatime,lowerdir=ovl/lower,upperdir=ovl/upper,workdir=ovl/work/ 0 0 none /proc fuse.pwn user_id=1000 0 0 $ fusermount -u /proc $ cat /proc/mounts cat: /proc/mounts: No such file or directory This fixes the problem by adding new seq_show_option and seq_show_option_n helpers, and updating the vulnerable show_option handlers to use them as needed. Some, like SELinux, need to be open coded due to unusual existing escape mechanisms. [akpm@linux-foundation.org: add lost chunk, per Kees] [keescook@chromium.org: seq_show_option should be using const parameters] Signed-off-by: Kees Cook <keescook@chromium.org> Acked-by: Serge Hallyn <serge.hallyn@canonical.com> Acked-by: Jan Kara <jack@suse.com> Acked-by: Paul Moore <paul@paul-moore.com> Cc: J. R. 
Okajima <hooanon05g@gmail.com> Signed-off-by: Kees Cook <keescook@chromium.org> Cc: <stable@vger.kernel.org> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2015-09-05 06:44:57 +08:00
seq_show_option(seq, root_path + offset, NULL);
if (append)
seq_puts(seq, ",append");
return 0;
}
/* Superblock operations: inode allocation/teardown, statfs and mount options */
static const struct super_operations hostfs_sbops = {
.alloc_inode = hostfs_alloc_inode,
.free_inode = hostfs_free_inode,
.evict_inode = hostfs_evict_inode,
.statfs = hostfs_statfs,
.show_options = hostfs_show_options,
};
/*
 * Directory iteration: open the directory named by the file's dentry,
 * seek to ctx->pos and feed entries to dir_emit() until read_dir() runs
 * out or dir_emit() refuses one.  ctx->pos only advances past entries
 * that were accepted.
 *
 * Returns 0, -ENOMEM when the name cannot be built, or the negated error
 * reported by open_dir().
 */
static int hostfs_readdir(struct file *file, struct dir_context *ctx)
{
	unsigned long long next, ino;
	unsigned int type;
	int error, len;
	char *name;
	void *dir;

	name = dentry_name(file->f_path.dentry);
	if (!name)
		return -ENOMEM;

	dir = open_dir(name, &error);
	__putname(name);
	if (!dir)
		return -error;

	next = ctx->pos;
	seek_dir(dir, next);

	for (;;) {
		name = read_dir(dir, &next, &ino, &len, &type);
		if (!name || !dir_emit(ctx, name, len, ino, type))
			break;
		ctx->pos = next;
	}

	close_dir(dir);
	return 0;
}
/*
 * Open handler used by both the file and the directory operation tables.
 *
 * Each inode keeps a single descriptor together with the FMODE_READ and
 * FMODE_WRITE bits it was opened with.  If those bits already cover what
 * @file asks for there is nothing to do; otherwise a new descriptor is
 * opened with the union of the old and new bits and installed under
 * open_mutex.
 *
 * Returns 0 on success, -ENOMEM when the name cannot be built, or the
 * negative value returned by open_file() or replace_file().
 */
static int hostfs_open(struct inode *ino, struct file *file)
{
char *name;
fmode_t mode;
int err;
int r, w, fd;
mode = file->f_mode & (FMODE_READ | FMODE_WRITE);
/* Unlocked fast path: the current descriptor already grants this access */
if ((mode & HOSTFS_I(ino)->mode) == mode)
return 0;
/* Ask for everything the inode already had plus what is newly needed */
mode |= HOSTFS_I(ino)->mode;
retry:
r = w = 0;
if (mode & FMODE_READ)
r = 1;
/* A write request always asks for read access as well */
if (mode & FMODE_WRITE)
r = w = 1;
name = dentry_name(file_dentry(file));
if (name == NULL)
return -ENOMEM;
/* The open itself happens before open_mutex is taken */
fd = open_file(name, r, w, append);
__putname(name);
if (fd < 0)
return fd;
mutex_lock(&HOSTFS_I(ino)->open_mutex);
/* somebody else had handled it first? */
if ((mode & HOSTFS_I(ino)->mode) == mode) {
mutex_unlock(&HOSTFS_I(ino)->open_mutex);
close_file(&fd);
return 0;
}
/*
 * The inode gained bits we did not ask for while we were unlocked:
 * throw this descriptor away and reopen with the larger set.
 */
if ((mode | HOSTFS_I(ino)->mode) != mode) {
mode |= HOSTFS_I(ino)->mode;
mutex_unlock(&HOSTFS_I(ino)->open_mutex);
close_file(&fd);
goto retry;
}
if (HOSTFS_I(ino)->fd == -1) {
HOSTFS_I(ino)->fd = fd;
} else {
/*
 * NOTE(review): replace_file() presumably makes the inode's existing
 * descriptor refer to the newly opened file -- confirm in the host
 * side helper.  The temporary fd is closed whether or not it succeeds.
 */
err = replace_file(fd, HOSTFS_I(ino)->fd);
close_file(&fd);
if (err < 0) {
mutex_unlock(&HOSTFS_I(ino)->open_mutex);
return err;
}
}
/* Record the new mode only once the descriptor is in place */
HOSTFS_I(ino)->mode = mode;
mutex_unlock(&HOSTFS_I(ino)->open_mutex);
return 0;
}
/*
 * Release handler for regular files: write out the inode's page cache and
 * wait for it.  The result of that writeback is not propagated; 0 is
 * always returned.
 */
static int hostfs_file_release(struct inode *inode, struct file *file)
{
	struct address_space *mapping = inode->i_mapping;

	filemap_write_and_wait(mapping);

	return 0;
}
/*
 * fsync handler: write back and wait on the range [@start, @end] of
 * @file, then call fsync_file() on the inode's descriptor with the inode
 * lock held.  Returns the first non-zero result, or 0.
 */
static int hostfs_fsync(struct file *file, loff_t start, loff_t end,
			int datasync)
{
	struct inode *inode = file->f_mapping->host;
	int err = file_write_and_wait_range(file, start, end);

	if (err == 0) {
		inode_lock(inode);
		err = fsync_file(HOSTFS_I(inode)->fd, datasync);
		inode_unlock(inode);
	}

	return err;
}
/*
 * File operations for regular files: data access goes through the generic
 * page cache helpers; open, release and fsync are hostfs specific.
 */
static const struct file_operations hostfs_file_fops = {
.llseek = generic_file_llseek,
.splice_read = generic_file_splice_read,
.splice_write = iter_file_splice_write,
.read_iter = generic_file_read_iter,
.write_iter = generic_file_write_iter,
.mmap = generic_file_mmap,
.open = hostfs_open,
.release = hostfs_file_release,
.fsync = hostfs_fsync,
};
/*
 * File operations for directories: entries come from hostfs_readdir();
 * open and fsync are shared with regular files.
 */
static const struct file_operations hostfs_dir_fops = {
.llseek = generic_file_llseek,
.iterate_shared = hostfs_readdir,
.read = generic_read_dir,
.open = hostfs_open,
.fsync = hostfs_fsync,
};
static int hostfs_writepage(struct page *page, struct writeback_control *wbc)
{
struct address_space *mapping = page->mapping;
struct inode *inode = mapping->host;
char *buffer;
loff_t base = page_offset(page);
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. 
Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
int count = PAGE_SIZE;
int end_index = inode->i_size >> PAGE_SHIFT;
int err;
if (page->index >= end_index)
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. 
Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
count = inode->i_size & (PAGE_SIZE-1);
buffer = kmap(page);
err = write_file(HOSTFS_I(inode)->fd, &base, buffer, count);
if (err != count) {
ClearPageUptodate(page);
goto out;
}
if (base > inode->i_size)
inode->i_size = base;
if (PageError(page))
ClearPageError(page);
err = 0;
out:
kunmap(page);
unlock_page(page);
return err;
}
static int hostfs_readpage(struct file *file, struct page *page)
{
char *buffer;
loff_t start = page_offset(page);
int bytes_read, ret = 0;
buffer = kmap(page);
bytes_read = read_file(FILE_HOSTFS_I(file)->fd, &start, buffer,
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. 
Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
PAGE_SIZE);
if (bytes_read < 0) {
ClearPageUptodate(page);
SetPageError(page);
ret = bytes_read;
goto out;
}
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. 
Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
memset(buffer + bytes_read, 0, PAGE_SIZE - bytes_read);
ClearPageError(page);
SetPageUptodate(page);
out:
flush_dcache_page(page);
kunmap(page);
unlock_page(page);
return ret;
}
/*
 * write_begin: obtain the page cache page that covers @pos from
 * grab_cache_page_write_begin() and hand it back through @pagep.
 * @flags is passed on unchanged; @file, @len and @fsdata are not used.
 *
 * Returns 0, or -ENOMEM when no page could be obtained.
 */
static int hostfs_write_begin(struct file *file, struct address_space *mapping,
			      loff_t pos, unsigned len, unsigned flags,
			      struct page **pagep, void **fsdata)
{
	pgoff_t index = pos >> PAGE_SHIFT;

	*pagep = grab_cache_page_write_begin(mapping, index, flags);
	if (!*pagep)
		return -ENOMEM;

	return 0;
}
static int hostfs_write_end(struct file *file, struct address_space *mapping,
loff_t pos, unsigned len, unsigned copied,
struct page *page, void *fsdata)
{
struct inode *inode = mapping->host;
void *buffer;
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. 
Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
unsigned from = pos & (PAGE_SIZE - 1);
int err;
buffer = kmap(page);
err = write_file(FILE_HOSTFS_I(file)->fd, &pos, buffer + from, copied);
kunmap(page);
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. 
Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
if (!PageUptodate(page) && err == PAGE_SIZE)
SetPageUptodate(page);
/*
* If err > 0, write_file has added err to pos, so we are comparing
* i_size against the last byte written.
*/
if (err > 0 && (pos > inode->i_size))
inode->i_size = pos;
unlock_page(page);
mm, fs: get rid of PAGE_CACHE_* and page_cache_{get,release} macros PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} macros were introduced *long* time ago with promise that one day it will be possible to implement page cache with bigger chunks than PAGE_SIZE. This promise never materialized. And unlikely will. We have many places where PAGE_CACHE_SIZE assumed to be equal to PAGE_SIZE. And it's constant source of confusion on whether PAGE_CACHE_* or PAGE_* constant should be used in a particular case, especially on the border between fs and mm. Global switching to PAGE_CACHE_SIZE != PAGE_SIZE would cause to much breakage to be doable. Let's stop pretending that pages in page cache are special. They are not. The changes are pretty straight-forward: - <foo> << (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - <foo> >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) -> <foo>; - PAGE_CACHE_{SIZE,SHIFT,MASK,ALIGN} -> PAGE_{SIZE,SHIFT,MASK,ALIGN}; - page_cache_get() -> get_page(); - page_cache_release() -> put_page(); This patch contains automated changes generated with coccinelle using script below. For some reason, coccinelle doesn't patch header files. I've called spatch for them manually. The only adjustment after coccinelle is revert of changes to PAGE_CAHCE_ALIGN definition: we are going to drop it later. There are few places in the code where coccinelle didn't reach. I'll fix them manually in a separate patch. Comments and documentation also will be addressed with the separate patch. virtual patch @@ expression E; @@ - E << (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ expression E; @@ - E >> (PAGE_CACHE_SHIFT - PAGE_SHIFT) + E @@ @@ - PAGE_CACHE_SHIFT + PAGE_SHIFT @@ @@ - PAGE_CACHE_SIZE + PAGE_SIZE @@ @@ - PAGE_CACHE_MASK + PAGE_MASK @@ expression E; @@ - PAGE_CACHE_ALIGN(E) + PAGE_ALIGN(E) @@ expression E; @@ - page_cache_get(E) + get_page(E) @@ expression E; @@ - page_cache_release(E) + put_page(E) Signed-off-by: Kirill A. 
Shutemov <kirill.shutemov@linux.intel.com> Acked-by: Michal Hocko <mhocko@suse.com> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
2016-04-01 20:29:47 +08:00
put_page(page);
return err;
}
/*
 * Page-cache callbacks for hostfs; read_name() attaches this table to the
 * mapping of regular files.
 */
static const struct address_space_operations hostfs_aops = {
	.readpage	= hostfs_readpage,
	.writepage	= hostfs_writepage,
	.write_begin	= hostfs_write_begin,
	.write_end	= hostfs_write_end,
	.set_page_dirty	= __set_page_dirty_nobuffers,
};
static int read_name(struct inode *ino, char *name)
{
dev_t rdev;
struct hostfs_stat st;
int err = stat_file(name, &st, -1);
if (err)
return err;
/* Reencode maj and min with the kernel encoding.*/
rdev = MKDEV(st.maj, st.min);
switch (st.mode & S_IFMT) {
case S_IFLNK:
ino->i_op = &hostfs_link_iops;
break;
case S_IFDIR:
ino->i_op = &hostfs_dir_iops;
ino->i_fop = &hostfs_dir_fops;
break;
case S_IFCHR:
case S_IFBLK:
case S_IFIFO:
case S_IFSOCK:
init_special_inode(ino, st.mode & S_IFMT, rdev);
ino->i_op = &hostfs_iops;
break;
case S_IFREG:
ino->i_op = &hostfs_iops;
ino->i_fop = &hostfs_file_fops;
ino->i_mapping->a_ops = &hostfs_aops;
break;
default:
return -EIO;
}
ino->i_ino = st.ino;
ino->i_mode = st.mode;
set_nlink(ino, st.nlink);
i_uid_write(ino, st.uid);
i_gid_write(ino, st.gid);
ino->i_atime = (struct timespec64){ st.atime.tv_sec, st.atime.tv_nsec };
ino->i_mtime = (struct timespec64){ st.mtime.tv_sec, st.mtime.tv_nsec };
ino->i_ctime = (struct timespec64){ st.ctime.tv_sec, st.ctime.tv_nsec };
ino->i_size = st.size;
ino->i_blocks = st.blocks;
return 0;
}
/*
 * ->create(): create a regular file on the host and bind it to @dentry.
 *
 * A fresh inode is obtained from hostfs_iget(), the file is created with
 * file_create() using the permission bits of @mode (mode & 0777), and the
 * inode is then populated by read_name().  On success the host descriptor is
 * stored in the inode with read/write mode and the dentry is instantiated.
 *
 * @mnt_userns and @excl are not used here.
 *
 * Returns 0 on success or a negative errno.
 */
static int hostfs_create(struct user_namespace *mnt_userns, struct inode *dir,
			 struct dentry *dentry, umode_t mode, bool excl)
{
	struct inode *inode;
	char *name;
	int error, fd;

	inode = hostfs_iget(dir->i_sb);
	if (IS_ERR(inode)) {
		error = PTR_ERR(inode);
		goto out;
	}

	error = -ENOMEM;
	name = dentry_name(dentry);
	if (name == NULL)
		goto out_put;

	fd = file_create(name, mode & 0777);
	if (fd < 0)
		error = fd;
	else
		error = read_name(inode, name);

	__putname(name);
	if (error)
		goto out_close;

	HOSTFS_I(inode)->fd = fd;
	HOSTFS_I(inode)->mode = FMODE_READ | FMODE_WRITE;
	d_instantiate(dentry, inode);
	return 0;

 out_close:
	/*
	 * The descriptor is only stored in the inode on success, so if
	 * file_create() worked but read_name() failed it must be closed here
	 * or it is leaked.
	 * NOTE(review): close_file() takes a pointer to the fd; it is the
	 * hostfs helper declared in hostfs.h - confirm it is in scope.
	 */
	if (fd >= 0)
		close_file(&fd);
 out_put:
	iput(inode);
 out:
	return error;
}
/*
 * ->lookup(): resolve @dentry inside directory @ino.
 *
 * Allocates an inode with hostfs_iget() and fills it via read_name().  If
 * that fails the inode is dropped; -ENOENT is turned into a NULL inode, any
 * other error into an ERR_PTR.  Whatever results (inode, NULL or error
 * pointer) is passed to d_splice_alias(), whose return value is returned.
 *
 * @flags is not used here.
 */
static struct dentry *hostfs_lookup(struct inode *ino, struct dentry *dentry,
				    unsigned int flags)
{
	struct inode *inode = hostfs_iget(ino->i_sb);
	char *path;
	int rc;

	if (!IS_ERR(inode)) {
		path = dentry_name(dentry);
		if (path == NULL) {
			rc = -ENOMEM;
		} else {
			rc = read_name(inode, path);
			__putname(path);
		}

		if (rc != 0) {
			iput(inode);
			if (rc == -ENOENT)
				inode = NULL;
			else
				inode = ERR_PTR(rc);
		}
	}

	return d_splice_alias(inode, dentry);
}
/*
 * ->link(): build the host paths of @from and @to and pass them to
 * link_file() as (to, from).
 *
 * @ino is not used here.  Returns -ENOMEM if either path cannot be built,
 * otherwise the result of link_file().
 */
static int hostfs_link(struct dentry *to, struct inode *ino,
		       struct dentry *from)
{
	char *from_name, *to_name;
	int err = -ENOMEM;

	from_name = dentry_name(from);
	if (from_name == NULL)
		return -ENOMEM;

	to_name = dentry_name(to);
	if (to_name != NULL)
		err = link_file(to_name, from_name);

	__putname(from_name);
	if (to_name != NULL)
		__putname(to_name);

	return err;
}
/*
 * ->unlink(): remove the host file behind @dentry via unlink_file().
 *
 * Refused with -EPERM while the file-level `append` setting is non-zero.
 * @ino is not used here.  Returns -ENOMEM if the path cannot be built,
 * otherwise the result of unlink_file().
 */
static int hostfs_unlink(struct inode *ino, struct dentry *dentry)
{
	char *path;
	int rc;

	if (append)
		return -EPERM;

	path = dentry_name(dentry);
	if (path == NULL)
		return -ENOMEM;

	rc = unlink_file(path);
	__putname(path);

	return rc;
}
/*
 * ->symlink(): pass the host path of @dentry and the target string @to to
 * make_symlink().
 *
 * @mnt_userns and @ino are not used here.  Returns -ENOMEM if the path
 * cannot be built, otherwise the result of make_symlink().
 */
static int hostfs_symlink(struct user_namespace *mnt_userns, struct inode *ino,
			  struct dentry *dentry, const char *to)
{
	char *path;
	int rc;

	path = dentry_name(dentry);
	if (path == NULL)
		return -ENOMEM;

	rc = make_symlink(path, to);
	__putname(path);

	return rc;
}
/*
 * ->mkdir(): pass the host path of @dentry and @mode to do_mkdir().
 *
 * @mnt_userns and @ino are not used here.  Returns -ENOMEM if the path
 * cannot be built, otherwise the result of do_mkdir().
 */
static int hostfs_mkdir(struct user_namespace *mnt_userns, struct inode *ino,
			struct dentry *dentry, umode_t mode)
{
	char *path;
	int rc;

	path = dentry_name(dentry);
	if (path == NULL)
		return -ENOMEM;

	rc = do_mkdir(path, mode);
	__putname(path);

	return rc;
}
/*
 * ->rmdir(): pass the host path of @dentry to hostfs_do_rmdir().
 *
 * @ino is not used here.  Returns -ENOMEM if the path cannot be built,
 * otherwise the result of hostfs_do_rmdir().
 */
static int hostfs_rmdir(struct inode *ino, struct dentry *dentry)
{
	char *path;
	int rc;

	path = dentry_name(dentry);
	if (path == NULL)
		return -ENOMEM;

	rc = hostfs_do_rmdir(path);
	__putname(path);

	return rc;
}
/*
 * ->mknod(): create a node on the host and bind it to @dentry.
 *
 * A fresh inode is obtained from hostfs_iget(), do_mknod() is called with
 * @mode and the major/minor numbers of @dev, and the inode is then populated
 * from the host via read_name() before the dentry is instantiated.
 *
 * @mnt_userns is not used here.
 *
 * Returns 0 on success or a negative errno.  On every failure path the path
 * buffer and the inode reference are released.
 */
static int hostfs_mknod(struct user_namespace *mnt_userns, struct inode *dir,
			struct dentry *dentry, umode_t mode, dev_t dev)
{
	struct inode *inode;
	char *name;
	int err;

	inode = hostfs_iget(dir->i_sb);
	if (IS_ERR(inode)) {
		err = PTR_ERR(inode);
		goto out;
	}

	err = -ENOMEM;
	name = dentry_name(dentry);
	if (name == NULL)
		goto out_put;

	err = do_mknod(name, mode, MAJOR(dev), MINOR(dev));
	if (err)
		goto out_free;

	err = read_name(inode, name);
	/* the path buffer is not needed past this point on either outcome */
	__putname(name);
	if (err)
		goto out_put;

	d_instantiate(dentry, inode);
	return 0;

 out_free:
	__putname(name);
 out_put:
	iput(inode);
 out:
	return err;
}
/*
 * ->rename(): rename @old_dentry to @new_dentry on the host.
 *
 * Only RENAME_NOREPLACE and RENAME_EXCHANGE are accepted in @flags; any
 * other bit yields -EINVAL.  With no flags rename_file() is used, otherwise
 * rename2_file() receives the flags.
 *
 * @mnt_userns, @old_dir and @new_dir are not used here.  Returns -ENOMEM if
 * either path cannot be built, otherwise the result of the rename helper.
 */
static int hostfs_rename2(struct user_namespace *mnt_userns,
			  struct inode *old_dir, struct dentry *old_dentry,
			  struct inode *new_dir, struct dentry *new_dentry,
			  unsigned int flags)
{
	char *src, *dst;
	int rc;

	if (flags & ~(RENAME_NOREPLACE | RENAME_EXCHANGE))
		return -EINVAL;

	src = dentry_name(old_dentry);
	if (src == NULL)
		return -ENOMEM;

	dst = dentry_name(new_dentry);
	if (dst == NULL) {
		__putname(src);
		return -ENOMEM;
	}

	rc = flags ? rename2_file(src, dst, flags) : rename_file(src, dst);

	__putname(src);
	__putname(dst);

	return rc;
}
/*
 * ->permission(): check the access requested in @desired on @ino.
 *
 * Returns -ECHILD when MAY_NOT_BLOCK is set.  For character/block devices,
 * FIFOs and sockets the access_file() check is skipped; for everything else
 * access_file() is asked about the read/write/execute bits derived from
 * @desired.  If that step reports no error, generic_permission() makes the
 * final decision.
 *
 * @mnt_userns is not used here; init_user_ns is passed on instead.
 */
static int hostfs_permission(struct user_namespace *mnt_userns,
			     struct inode *ino, int desired)
{
	int want_r, want_w, want_x, rc;
	char *path;

	if (desired & MAY_NOT_BLOCK)
		return -ECHILD;

	want_r = (desired & MAY_READ) ? 1 : 0;
	want_w = (desired & MAY_WRITE) ? 1 : 0;
	want_x = (desired & MAY_EXEC) ? 1 : 0;

	path = inode_name(ino);
	if (path == NULL)
		return -ENOMEM;

	if (S_ISCHR(ino->i_mode) || S_ISBLK(ino->i_mode) ||
	    S_ISFIFO(ino->i_mode) || S_ISSOCK(ino->i_mode))
		rc = 0;
	else
		rc = access_file(path, want_r, want_w, want_x);
	__putname(path);

	if (rc)
		return rc;

	return generic_permission(&init_user_ns, ino, desired);
}
/*
 * ->setattr(): apply the attribute changes in @attr to the host file and
 * then to the in-core inode.
 *
 * After setattr_prepare() accepts the request, each requested ATTR_* item is
 * translated into the matching HOSTFS_ATTR_* bit and value of a
 * struct hostfs_iattr, which is handed to set_attr() together with the host
 * path and the inode's host descriptor.  While the file-level `append`
 * setting is non-zero, size changes are dropped from the request.
 *
 * On success a size change is applied with truncate_setsize(), the remaining
 * attributes are copied with setattr_copy(), and the inode is marked dirty.
 *
 * @mnt_userns is not used here; init_user_ns is passed on instead.
 *
 * Returns 0 on success or a negative errno.
 */
static int hostfs_setattr(struct user_namespace *mnt_userns,
			  struct dentry *dentry, struct iattr *attr)
{
	struct inode *inode = d_inode(dentry);
	int fd = HOSTFS_I(inode)->fd;
	struct hostfs_iattr attrs;
	unsigned int valid;
	char *path;
	int rc;

	rc = setattr_prepare(&init_user_ns, dentry, attr);
	if (rc)
		return rc;

	if (append)
		attr->ia_valid &= ~ATTR_SIZE;

	/* snapshot taken after the append masking above */
	valid = attr->ia_valid;

	attrs.ia_valid = 0;

	if (valid & ATTR_MODE) {
		attrs.ia_valid |= HOSTFS_ATTR_MODE;
		attrs.ia_mode = attr->ia_mode;
	}

	if (valid & ATTR_UID) {
		attrs.ia_valid |= HOSTFS_ATTR_UID;
		attrs.ia_uid = from_kuid(&init_user_ns, attr->ia_uid);
	}

	if (valid & ATTR_GID) {
		attrs.ia_valid |= HOSTFS_ATTR_GID;
		attrs.ia_gid = from_kgid(&init_user_ns, attr->ia_gid);
	}

	if (valid & ATTR_SIZE) {
		attrs.ia_valid |= HOSTFS_ATTR_SIZE;
		attrs.ia_size = attr->ia_size;
	}

	if (valid & ATTR_ATIME) {
		attrs.ia_valid |= HOSTFS_ATTR_ATIME;
		attrs.ia_atime = (struct hostfs_timespec)
			{ attr->ia_atime.tv_sec, attr->ia_atime.tv_nsec };
	}

	if (valid & ATTR_MTIME) {
		attrs.ia_valid |= HOSTFS_ATTR_MTIME;
		attrs.ia_mtime = (struct hostfs_timespec)
			{ attr->ia_mtime.tv_sec, attr->ia_mtime.tv_nsec };
	}

	if (valid & ATTR_CTIME) {
		attrs.ia_valid |= HOSTFS_ATTR_CTIME;
		attrs.ia_ctime = (struct hostfs_timespec)
			{ attr->ia_ctime.tv_sec, attr->ia_ctime.tv_nsec };
	}

	if (valid & ATTR_ATIME_SET)
		attrs.ia_valid |= HOSTFS_ATTR_ATIME_SET;

	if (valid & ATTR_MTIME_SET)
		attrs.ia_valid |= HOSTFS_ATTR_MTIME_SET;

	path = dentry_name(dentry);
	if (path == NULL)
		return -ENOMEM;

	rc = set_attr(path, &attrs, fd);
	__putname(path);
	if (rc)
		return rc;

	/* the host accepted the change: mirror it in the in-core inode */
	if ((valid & ATTR_SIZE) && attr->ia_size != i_size_read(inode))
		truncate_setsize(inode, attr->ia_size);

	setattr_copy(&init_user_ns, inode, attr);
	mark_inode_dirty(inode);

	return 0;
}
/*
 * Inode operations that read_name() attaches to regular files and to
 * special files (devices, FIFOs, sockets).
 */
static const struct inode_operations hostfs_iops = {
	.setattr	= hostfs_setattr,
	.permission	= hostfs_permission,
};
/* Inode operations that read_name() attaches to directories. */
static const struct inode_operations hostfs_dir_iops = {
	/* name resolution */
	.lookup		= hostfs_lookup,

	/* entry creation */
	.create		= hostfs_create,
	.mkdir		= hostfs_mkdir,
	.mknod		= hostfs_mknod,
	.symlink	= hostfs_symlink,
	.link		= hostfs_link,

	/* entry removal and renaming */
	.unlink		= hostfs_unlink,
	.rmdir		= hostfs_rmdir,
	.rename		= hostfs_rename2,

	/* attributes and access checks */
	.setattr	= hostfs_setattr,
	.permission	= hostfs_permission,
};
/*
 * ->get_link(): read the target of the symlink behind @dentry.
 *
 * A PATH_MAX buffer is allocated and filled by hostfs_do_readlink().  A
 * result of exactly PATH_MAX is reported as -E2BIG.  On success the buffer is
 * returned and registered with @done so that kfree_link() is invoked on it
 * later; on failure it is freed here and an ERR_PTR is returned.
 *
 * A NULL @dentry yields ERR_PTR(-ECHILD).  @inode is not used here.
 */
static const char *hostfs_get_link(struct dentry *dentry,
				   struct inode *inode,
				   struct delayed_call *done)
{
	char *target, *path;
	int rc;

	if (!dentry)
		return ERR_PTR(-ECHILD);

	target = kmalloc(PATH_MAX, GFP_KERNEL);
	if (!target)
		return ERR_PTR(-ENOMEM);

	path = dentry_name(dentry);
	if (!path) {
		rc = -ENOMEM;
	} else {
		rc = hostfs_do_readlink(path, target, PATH_MAX);
		if (rc == PATH_MAX)
			rc = -E2BIG;
		__putname(path);
	}

	if (rc < 0) {
		kfree(target);
		return ERR_PTR(rc);
	}

	set_delayed_call(done, kfree_link, target);
	return target;
}
/* Inode operations that read_name() attaches to symbolic links. */
static const struct inode_operations hostfs_link_iops = {
.get_link = hostfs_get_link,
};
/*
 * Fill in a new hostfs superblock; passed to mount_nodev() by
 * hostfs_read_sb().
 *
 * @d is the mount data, treated as the requested sub-path (NULL means the
 * empty string).  The host root path is "<root_ino>/<requested path>" and is
 * stored in sb->s_fs_info.  The root inode is populated from that path via
 * read_name(); if it is a symlink, the link is followed once and the inode is
 * re-read from the resolved name.
 *
 * @silent is not used here.
 *
 * Returns 0 on success or a negative errno.  The path stored in
 * sb->s_fs_info is not freed here on failure.
 */
static int hostfs_fill_sb_common(struct super_block *sb, void *d, int silent)
{
	char *req_root = d;
	char *host_root_path;
	struct inode *root_inode;
	int err;

	sb->s_magic = HOSTFS_SUPER_MAGIC;
	sb->s_blocksize = 1024;
	sb->s_blocksize_bits = 10;
	sb->s_maxbytes = MAX_LFS_FILESIZE;
	sb->s_op = &hostfs_sbops;
	sb->s_d_op = &simple_dentry_operations;

	/* NULL is printed as '(null)' by printf(): avoid that. */
	if (req_root == NULL)
		req_root = "";

	host_root_path = kasprintf(GFP_KERNEL, "%s/%s", root_ino, req_root);
	sb->s_fs_info = host_root_path;
	if (host_root_path == NULL)
		return -ENOMEM;

	root_inode = new_inode(sb);
	if (!root_inode)
		return -ENOMEM;

	err = read_name(root_inode, host_root_path);
	if (!err && S_ISLNK(root_inode->i_mode)) {
		char *resolved = follow_link(host_root_path);

		if (IS_ERR(resolved)) {
			err = PTR_ERR(resolved);
		} else {
			err = read_name(root_inode, resolved);
			kfree(resolved);
		}
	}
	if (err) {
		iput(root_inode);
		return err;
	}

	/* no iput() on this failure path, matching the original flow */
	sb->s_root = d_make_root(root_inode);
	if (sb->s_root == NULL)
		return -ENOMEM;

	return 0;
}
static struct dentry *hostfs_read_sb(struct file_system_type *type,
[PATCH] VFS: Permit filesystem to override root dentry on mount Extend the get_sb() filesystem operation to take an extra argument that permits the VFS to pass in the target vfsmount that defines the mountpoint. The filesystem is then required to manually set the superblock and root dentry pointers. For most filesystems, this should be done with simple_set_mnt() which will set the superblock pointer and then set the root dentry to the superblock's s_root (as per the old default behaviour). The get_sb() op now returns an integer as there's now no need to return the superblock pointer. This patch permits a superblock to be implicitly shared amongst several mount points, such as can be done with NFS to avoid potential inode aliasing. In such a case, simple_set_mnt() would not be called, and instead the mnt_root and mnt_sb would be set directly. The patch also makes the following changes: (*) the get_sb_*() convenience functions in the core kernel now take a vfsmount pointer argument and return an integer, so most filesystems have to change very little. (*) If one of the convenience function is not used, then get_sb() should normally call simple_set_mnt() to instantiate the vfsmount. This will always return 0, and so can be tail-called from get_sb(). (*) generic_shutdown_super() now calls shrink_dcache_sb() to clean up the dcache upon superblock destruction rather than shrink_dcache_anon(). This is required because the superblock may now have multiple trees that aren't actually bound to s_root, but that still need to be cleaned up. The currently called functions assume that the whole tree is rooted at s_root, and that anonymous dentries are not the roots of trees which results in dentries being left unculled. 
However, with the way NFS superblock sharing are currently set to be implemented, these assumptions are violated: the root of the filesystem is simply a dummy dentry and inode (the real inode for '/' may well be inaccessible), and all the vfsmounts are rooted on anonymous[*] dentries with child trees. [*] Anonymous until discovered from another tree. (*) The documentation has been adjusted, including the additional bit of changing ext2_* into foo_* in the documentation. [akpm@osdl.org: convert ipath_fs, do other stuff] Signed-off-by: David Howells <dhowells@redhat.com> Acked-by: Al Viro <viro@zeniv.linux.org.uk> Cc: Nathan Scott <nathans@sgi.com> Cc: Roland Dreier <rolandd@cisco.com> Signed-off-by: Andrew Morton <akpm@osdl.org> Signed-off-by: Linus Torvalds <torvalds@osdl.org>
2006-06-23 17:02:57 +08:00
int flags, const char *dev_name,
void *data)
{
return mount_nodev(type, flags, data, hostfs_fill_sb_common);
}
/*
 * ->kill_sb(): shut the superblock down with kill_anon_super() and then free
 * s_fs_info, which hostfs_fill_sb_common() set to the kasprintf()-allocated
 * host root path.
 */
static void hostfs_kill_sb(struct super_block *s)
{
kill_anon_super(s);
/* freed only after the generic teardown above has run */
kfree(s->s_fs_info);
}
/* Filesystem type descriptor registered by init_hostfs(). */
static struct file_system_type hostfs_type = {
	.name		= "hostfs",
	.owner		= THIS_MODULE,
	.fs_flags	= 0,
	.mount		= hostfs_read_sb,
	.kill_sb	= hostfs_kill_sb,
};
MODULE_ALIAS_FS("hostfs");
/*
 * Module init: create the slab cache for struct hostfs_inode_info and
 * register the filesystem type.
 *
 * Returns 0 on success, -ENOMEM if the cache cannot be created, or the error
 * from register_filesystem().  If registration fails the cache is destroyed
 * again so that it is not leaked.
 */
static int __init init_hostfs(void)
{
	int err;

	hostfs_inode_cache = KMEM_CACHE(hostfs_inode_info, 0);
	if (!hostfs_inode_cache)
		return -ENOMEM;

	err = register_filesystem(&hostfs_type);
	if (err)
		kmem_cache_destroy(hostfs_inode_cache);

	return err;
}
/*
 * Module exit: unregister the filesystem type, then destroy the inode slab
 * cache created by init_hostfs().
 */
static void __exit exit_hostfs(void)
{
unregister_filesystem(&hostfs_type);
kmem_cache_destroy(hostfs_inode_cache);
}
/* Module entry/exit hooks and license declaration. */
module_init(init_hostfs)
module_exit(exit_hostfs)
MODULE_LICENSE("GPL");