From f4cc1c3810a0382ff76a4e119a21b90b84dbe195 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Nov 2016 22:37:03 -0500 Subject: [PATCH 01/10] remove a bogus claim about namespace_sem being held by callers of mnt_alloc_id() Hadn't been true for quite a while Signed-off-by: Al Viro --- fs/namespace.c | 4 ---- 1 file changed, 4 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index e6c234b1a645..a1a04dd1ebfc 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -96,10 +96,6 @@ static inline struct hlist_head *mp_hash(struct dentry *dentry) return &mountpoint_hashtable[tmp & mp_hash_mask]; } -/* - * allocation is serialized by namespace_sem, but we need the spinlock to - * serialize with freeing. - */ static int mnt_alloc_id(struct mount *mnt) { int res; From 066715d3fde4834cbbec88d12ca277c4185b9303 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sat, 19 Nov 2016 23:23:18 -0500 Subject: [PATCH 02/10] clone_private_mount() doesn't need to touch namespace_sem not for CL_PRIVATE clone_mnt() Signed-off-by: Al Viro --- fs/namespace.c | 2 -- 1 file changed, 2 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index a1a04dd1ebfc..ec726ae00579 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1795,9 +1795,7 @@ struct vfsmount *clone_private_mount(struct path *path) if (IS_MNT_UNBINDABLE(old_mnt)) return ERR_PTR(-EINVAL); - down_read(&namespace_sem); new_mnt = clone_mnt(old_mnt, path->dentry, CL_PRIVATE); - up_read(&namespace_sem); if (IS_ERR(new_mnt)) return ERR_CAST(new_mnt); From 5235d448c48e1f5a4a34bf90d412775cb75ffb32 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Sun, 20 Nov 2016 19:33:09 -0500 Subject: [PATCH 03/10] reorganize do_make_slave() Make sure that clone_mnt() never returns a mount with MNT_SHARED in flags, but without a valid ->mnt_group_id. That allows to demystify do_make_slave() quite a bit, among other things. Signed-off-by: Al Viro --- fs/namespace.c | 2 ++ fs/pnode.c | 76 ++++++++++++++++++++++++-------------------------- 2 files changed, 39 insertions(+), 39 deletions(-) diff --git a/fs/namespace.c b/fs/namespace.c index ec726ae00579..141d5776c70e 100644 --- a/fs/namespace.c +++ b/fs/namespace.c @@ -1030,6 +1030,8 @@ static struct mount *clone_mnt(struct mount *old, struct dentry *root, if (IS_MNT_SLAVE(old)) list_add(&mnt->mnt_slave, &old->mnt_slave); mnt->mnt_master = old->mnt_master; + } else { + CLEAR_MNT_SHARED(mnt); } if (flag & CL_MAKE_SHARED) set_mnt_shared(mnt); diff --git a/fs/pnode.c b/fs/pnode.c index 234a9ac49958..06a793f4ae38 100644 --- a/fs/pnode.c +++ b/fs/pnode.c @@ -67,49 +67,47 @@ int get_dominating_id(struct mount *mnt, const struct path *root) static int do_make_slave(struct mount *mnt) { - struct mount *peer_mnt = mnt, *master = mnt->mnt_master; - struct mount *slave_mnt; + struct mount *master, *slave_mnt; - /* - * slave 'mnt' to a peer mount that has the - * same root dentry. If none is available then - * slave it to anything that is available. - */ - while ((peer_mnt = next_peer(peer_mnt)) != mnt && - peer_mnt->mnt.mnt_root != mnt->mnt.mnt_root) ; - - if (peer_mnt == mnt) { - peer_mnt = next_peer(mnt); - if (peer_mnt == mnt) - peer_mnt = NULL; - } - if (mnt->mnt_group_id && IS_MNT_SHARED(mnt) && - list_empty(&mnt->mnt_share)) - mnt_release_group_id(mnt); - - list_del_init(&mnt->mnt_share); - mnt->mnt_group_id = 0; - - if (peer_mnt) - master = peer_mnt; - - if (master) { - list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave) - slave_mnt->mnt_master = master; - list_move(&mnt->mnt_slave, &master->mnt_slave_list); - list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev); - INIT_LIST_HEAD(&mnt->mnt_slave_list); - } else { - struct list_head *p = &mnt->mnt_slave_list; - while (!list_empty(p)) { - slave_mnt = list_first_entry(p, - struct mount, mnt_slave); - list_del_init(&slave_mnt->mnt_slave); - slave_mnt->mnt_master = NULL; + if (list_empty(&mnt->mnt_share)) { + if (IS_MNT_SHARED(mnt)) { + mnt_release_group_id(mnt); + CLEAR_MNT_SHARED(mnt); } + master = mnt->mnt_master; + if (!master) { + struct list_head *p = &mnt->mnt_slave_list; + while (!list_empty(p)) { + slave_mnt = list_first_entry(p, + struct mount, mnt_slave); + list_del_init(&slave_mnt->mnt_slave); + slave_mnt->mnt_master = NULL; + } + return 0; + } + } else { + struct mount *m; + /* + * slave 'mnt' to a peer mount that has the + * same root dentry. If none is available then + * slave it to anything that is available. + */ + for (m = master = next_peer(mnt); m != mnt; m = next_peer(m)) { + if (m->mnt.mnt_root == mnt->mnt.mnt_root) { + master = m; + break; + } + } + list_del_init(&mnt->mnt_share); + mnt->mnt_group_id = 0; + CLEAR_MNT_SHARED(mnt); } + list_for_each_entry(slave_mnt, &mnt->mnt_slave_list, mnt_slave) + slave_mnt->mnt_master = master; + list_move(&mnt->mnt_slave, &master->mnt_slave_list); + list_splice(&mnt->mnt_slave_list, master->mnt_slave_list.prev); + INIT_LIST_HEAD(&mnt->mnt_slave_list); mnt->mnt_master = master; - CLEAR_MNT_SHARED(mnt); return 0; } From c00d2c7e89880036f288a764599b2b8b87c0a364 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Tue, 20 Dec 2016 07:04:57 -0500 Subject: [PATCH 04/10] move aio compat to fs/aio.c ... and fix the minor buglet in compat io_submit() - native one kills ioctx as cleanup when put_user() fails. Get rid of bogus compat_... in !CONFIG_AIO case, while we are at it - they should simply fail with ENOSYS, same as for native counterparts. Signed-off-by: Al Viro --- fs/aio.c | 97 ++++++++++++++++++++++++++++++++++++++++++++- fs/compat.c | 75 ----------------------------------- include/linux/aio.h | 5 --- kernel/sys_ni.c | 3 ++ 4 files changed, 98 insertions(+), 82 deletions(-) diff --git a/fs/aio.c b/fs/aio.c index 8edf253484af..8c79e1a53af9 100644 --- a/fs/aio.c +++ b/fs/aio.c @@ -1367,6 +1367,39 @@ out: return ret; } +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_events, u32 __user *, ctx32p) +{ + struct kioctx *ioctx = NULL; + unsigned long ctx; + long ret; + + ret = get_user(ctx, ctx32p); + if (unlikely(ret)) + goto out; + + ret = -EINVAL; + if (unlikely(ctx || nr_events == 0)) { + pr_debug("EINVAL: ctx %lu nr_events %u\n", + ctx, nr_events); + goto out; + } + + ioctx = ioctx_alloc(nr_events); + ret = PTR_ERR(ioctx); + if (!IS_ERR(ioctx)) { + /* truncating is ok because it's a user address */ + ret = put_user((u32)ioctx->user_id, ctx32p); + if (ret) + kill_ioctx(current->mm, ioctx, NULL); + percpu_ref_put(&ioctx->users); + } + +out: + return ret; +} +#endif + /* sys_io_destroy: * Destroy the aio_context specified. May cancel any outstanding * AIOs and block on completion. Will fail with -ENOSYS if not @@ -1591,8 +1624,8 @@ out_put_req: return ret; } -long do_io_submit(aio_context_t ctx_id, long nr, - struct iocb __user *__user *iocbpp, bool compat) +static long do_io_submit(aio_context_t ctx_id, long nr, + struct iocb __user *__user *iocbpp, bool compat) { struct kioctx *ctx; long ret = 0; @@ -1662,6 +1695,44 @@ SYSCALL_DEFINE3(io_submit, aio_context_t, ctx_id, long, nr, return do_io_submit(ctx_id, nr, iocbpp, 0); } +#ifdef CONFIG_COMPAT +static inline long +copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64) +{ + compat_uptr_t uptr; + int i; + + for (i = 0; i < nr; ++i) { + if (get_user(uptr, ptr32 + i)) + return -EFAULT; + if (put_user(compat_ptr(uptr), ptr64 + i)) + return -EFAULT; + } + return 0; +} + +#define MAX_AIO_SUBMITS (PAGE_SIZE/sizeof(struct iocb *)) + +COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, + int, nr, u32 __user *, iocb) +{ + struct iocb __user * __user *iocb64; + long ret; + + if (unlikely(nr < 0)) + return -EINVAL; + + if (nr > MAX_AIO_SUBMITS) + nr = MAX_AIO_SUBMITS; + + iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64)); + ret = copy_iocb(nr, iocb, iocb64); + if (!ret) + ret = do_io_submit(ctx_id, nr, iocb64, 1); + return ret; +} +#endif + /* lookup_kiocb * Finds a given iocb for cancellation. */ @@ -1761,3 +1832,25 @@ SYSCALL_DEFINE5(io_getevents, aio_context_t, ctx_id, } return ret; } + +#ifdef CONFIG_COMPAT +COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id, + compat_long_t, min_nr, + compat_long_t, nr, + struct io_event __user *, events, + struct compat_timespec __user *, timeout) +{ + struct timespec t; + struct timespec __user *ut = NULL; + + if (timeout) { + if (compat_get_timespec(&t, timeout)) + return -EFAULT; + + ut = compat_alloc_user_space(sizeof(*ut)); + if (copy_to_user(ut, &t, sizeof(t))) + return -EFAULT; + } + return sys_io_getevents(ctx_id, min_nr, nr, events, ut); +} +#endif diff --git a/fs/compat.c b/fs/compat.c index 543b48c29ac3..3f4908c28698 100644 --- a/fs/compat.c +++ b/fs/compat.c @@ -487,45 +487,6 @@ COMPAT_SYSCALL_DEFINE3(fcntl, unsigned int, fd, unsigned int, cmd, return compat_sys_fcntl64(fd, cmd, arg); } -COMPAT_SYSCALL_DEFINE2(io_setup, unsigned, nr_reqs, u32 __user *, ctx32p) -{ - long ret; - aio_context_t ctx64; - - mm_segment_t oldfs = get_fs(); - if (unlikely(get_user(ctx64, ctx32p))) - return -EFAULT; - - set_fs(KERNEL_DS); - /* The __user pointer cast is valid because of the set_fs() */ - ret = sys_io_setup(nr_reqs, (aio_context_t __user *) &ctx64); - set_fs(oldfs); - /* truncating is ok because it's a user address */ - if (!ret) - ret = put_user((u32) ctx64, ctx32p); - return ret; -} - -COMPAT_SYSCALL_DEFINE5(io_getevents, compat_aio_context_t, ctx_id, - compat_long_t, min_nr, - compat_long_t, nr, - struct io_event __user *, events, - struct compat_timespec __user *, timeout) -{ - struct timespec t; - struct timespec __user *ut = NULL; - - if (timeout) { - if (compat_get_timespec(&t, timeout)) - return -EFAULT; - - ut = compat_alloc_user_space(sizeof(*ut)); - if (copy_to_user(ut, &t, sizeof(t)) ) - return -EFAULT; - } - return sys_io_getevents(ctx_id, min_nr, nr, events, ut); -} - /* A write operation does a read from user space and vice versa */ #define vrfy_dir(type) ((type) == READ ? VERIFY_WRITE : VERIFY_READ) @@ -602,42 +563,6 @@ out: return ret; } -static inline long -copy_iocb(long nr, u32 __user *ptr32, struct iocb __user * __user *ptr64) -{ - compat_uptr_t uptr; - int i; - - for (i = 0; i < nr; ++i) { - if (get_user(uptr, ptr32 + i)) - return -EFAULT; - if (put_user(compat_ptr(uptr), ptr64 + i)) - return -EFAULT; - } - return 0; -} - -#define MAX_AIO_SUBMITS (PAGE_SIZE/sizeof(struct iocb *)) - -COMPAT_SYSCALL_DEFINE3(io_submit, compat_aio_context_t, ctx_id, - int, nr, u32 __user *, iocb) -{ - struct iocb __user * __user *iocb64; - long ret; - - if (unlikely(nr < 0)) - return -EINVAL; - - if (nr > MAX_AIO_SUBMITS) - nr = MAX_AIO_SUBMITS; - - iocb64 = compat_alloc_user_space(nr * sizeof(*iocb64)); - ret = copy_iocb(nr, iocb, iocb64); - if (!ret) - ret = do_io_submit(ctx_id, nr, iocb64, 1); - return ret; -} - struct compat_ncp_mount_data { compat_int_t version; compat_uint_t ncp_fd; diff --git a/include/linux/aio.h b/include/linux/aio.h index 9eb42dbc5582..fdd0a343f455 100644 --- a/include/linux/aio.h +++ b/include/linux/aio.h @@ -14,14 +14,9 @@ typedef int (kiocb_cancel_fn)(struct kiocb *); /* prototypes */ #ifdef CONFIG_AIO extern void exit_aio(struct mm_struct *mm); -extern long do_io_submit(aio_context_t ctx_id, long nr, - struct iocb __user *__user *iocbpp, bool compat); void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel); #else static inline void exit_aio(struct mm_struct *mm) { } -static inline long do_io_submit(aio_context_t ctx_id, long nr, - struct iocb __user * __user *iocbpp, - bool compat) { return 0; } static inline void kiocb_set_cancel_fn(struct kiocb *req, kiocb_cancel_fn *cancel) { } #endif /* CONFIG_AIO */ diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c index 635482e60ca3..8acef8576ce9 100644 --- a/kernel/sys_ni.c +++ b/kernel/sys_ni.c @@ -150,6 +150,9 @@ cond_syscall(sys_io_destroy); cond_syscall(sys_io_submit); cond_syscall(sys_io_cancel); cond_syscall(sys_io_getevents); +cond_syscall(compat_sys_io_setup); +cond_syscall(compat_sys_io_submit); +cond_syscall(compat_sys_io_getevents); cond_syscall(sys_sysfs); cond_syscall(sys_syslog); cond_syscall(sys_process_vm_readv); From 33844e665104b169a3a7732bdcddb40e4f82b335 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Wed, 21 Dec 2016 21:55:02 -0500 Subject: [PATCH 05/10] [iov_iter] fix iterate_all_kinds() on empty iterators Problem similar to ones dealt with in "fold checks into iterate_and_advance()" and followups, except that in this case we really want to do nothing when asked for zero-length operation - unlike zero-length iterate_and_advance(), zero-length iterate_all_kinds() has no side effects, and callers are simpler that way. That got exposed when copy_from_iter_full() had been used by tipc, which builds an msghdr with zero payload and (now) feeds it to a primitive based on iterate_all_kinds() instead of iterate_and_advance(). Reported-by: Jon Maloy Tested-by: Jon Maloy Signed-off-by: Al Viro --- lib/iov_iter.c | 55 ++++++++++++++++++++++++-------------------------- 1 file changed, 26 insertions(+), 29 deletions(-) diff --git a/lib/iov_iter.c b/lib/iov_iter.c index 228892dabba6..25f572303801 100644 --- a/lib/iov_iter.c +++ b/lib/iov_iter.c @@ -73,19 +73,21 @@ } #define iterate_all_kinds(i, n, v, I, B, K) { \ - size_t skip = i->iov_offset; \ - if (unlikely(i->type & ITER_BVEC)) { \ - struct bio_vec v; \ - struct bvec_iter __bi; \ - iterate_bvec(i, n, v, __bi, skip, (B)) \ - } else if (unlikely(i->type & ITER_KVEC)) { \ - const struct kvec *kvec; \ - struct kvec v; \ - iterate_kvec(i, n, v, kvec, skip, (K)) \ - } else { \ - const struct iovec *iov; \ - struct iovec v; \ - iterate_iovec(i, n, v, iov, skip, (I)) \ + if (likely(n)) { \ + size_t skip = i->iov_offset; \ + if (unlikely(i->type & ITER_BVEC)) { \ + struct bio_vec v; \ + struct bvec_iter __bi; \ + iterate_bvec(i, n, v, __bi, skip, (B)) \ + } else if (unlikely(i->type & ITER_KVEC)) { \ + const struct kvec *kvec; \ + struct kvec v; \ + iterate_kvec(i, n, v, kvec, skip, (K)) \ + } else { \ + const struct iovec *iov; \ + struct iovec v; \ + iterate_iovec(i, n, v, iov, skip, (I)) \ + } \ } \ } @@ -576,7 +578,7 @@ bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i) WARN_ON(1); return false; } - if (unlikely(i->count < bytes)) \ + if (unlikely(i->count < bytes)) return false; iterate_all_kinds(i, bytes, v, ({ @@ -620,7 +622,7 @@ bool copy_from_iter_full_nocache(void *addr, size_t bytes, struct iov_iter *i) WARN_ON(1); return false; } - if (unlikely(i->count < bytes)) \ + if (unlikely(i->count < bytes)) return false; iterate_all_kinds(i, bytes, v, ({ if (__copy_from_user_nocache((to += v.iov_len) - v.iov_len, @@ -837,11 +839,8 @@ unsigned long iov_iter_alignment(const struct iov_iter *i) unsigned long res = 0; size_t size = i->count; - if (!size) - return 0; - if (unlikely(i->type & ITER_PIPE)) { - if (i->iov_offset && allocated(&i->pipe->bufs[i->idx])) + if (size && i->iov_offset && allocated(&i->pipe->bufs[i->idx])) return size | i->iov_offset; return size; } @@ -856,10 +855,8 @@ EXPORT_SYMBOL(iov_iter_alignment); unsigned long iov_iter_gap_alignment(const struct iov_iter *i) { - unsigned long res = 0; + unsigned long res = 0; size_t size = i->count; - if (!size) - return 0; if (unlikely(i->type & ITER_PIPE)) { WARN_ON(1); @@ -874,7 +871,7 @@ unsigned long iov_iter_gap_alignment(const struct iov_iter *i) (res |= (!res ? 0 : (unsigned long)v.iov_base) | (size != v.iov_len ? size : 0)) ); - return res; + return res; } EXPORT_SYMBOL(iov_iter_gap_alignment); @@ -908,6 +905,9 @@ static ssize_t pipe_get_pages(struct iov_iter *i, size_t capacity; int idx; + if (!maxsize) + return 0; + if (!sanity(i)) return -EFAULT; @@ -926,9 +926,6 @@ ssize_t iov_iter_get_pages(struct iov_iter *i, if (maxsize > i->count) maxsize = i->count; - if (!maxsize) - return 0; - if (unlikely(i->type & ITER_PIPE)) return pipe_get_pages(i, pages, maxsize, maxpages, start); iterate_all_kinds(i, maxsize, v, ({ @@ -975,6 +972,9 @@ static ssize_t pipe_get_pages_alloc(struct iov_iter *i, int idx; int npages; + if (!maxsize) + return 0; + if (!sanity(i)) return -EFAULT; @@ -1006,9 +1006,6 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i, if (maxsize > i->count) maxsize = i->count; - if (!maxsize) - return 0; - if (unlikely(i->type & ITER_PIPE)) return pipe_get_pages_alloc(i, pages, maxsize, start); iterate_all_kinds(i, maxsize, v, ({ From 22725ce4e4a00fbc37694e25dc5c8acef8ad1c28 Mon Sep 17 00:00:00 2001 From: "Darrick J. Wong" Date: Mon, 19 Dec 2016 15:13:26 -0800 Subject: [PATCH 06/10] vfs: fix isize/pos/len checks for reflink & dedupe Strengthen the checking of pos/len vs. i_size, clarify the return values for the clone prep function, and remove pointless code. Reviewed-by: Christoph Hellwig Signed-off-by: Darrick J. Wong Signed-off-by: Al Viro --- fs/ocfs2/refcounttree.c | 2 +- fs/read_write.c | 18 +++++++++++------- fs/xfs/xfs_reflink.c | 2 +- 3 files changed, 13 insertions(+), 9 deletions(-) diff --git a/fs/ocfs2/refcounttree.c b/fs/ocfs2/refcounttree.c index d171d2c53f7f..f8933cb53d68 100644 --- a/fs/ocfs2/refcounttree.c +++ b/fs/ocfs2/refcounttree.c @@ -4834,7 +4834,7 @@ int ocfs2_reflink_remap_range(struct file *file_in, ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out, &len, is_dedupe); - if (ret || len == 0) + if (ret <= 0) goto out_unlock; /* Lock out changes to the allocation maps and remap. */ diff --git a/fs/read_write.c b/fs/read_write.c index da6de12b5c46..7537b6b6b5a2 100644 --- a/fs/read_write.c +++ b/fs/read_write.c @@ -1669,6 +1669,9 @@ static int clone_verify_area(struct file *file, loff_t pos, u64 len, bool write) * Check that the two inodes are eligible for cloning, the ranges make * sense, and then flush all dirty data. Caller must ensure that the * inodes have been locked against any other modifications. + * + * Returns: 0 for "nothing to clone", 1 for "something to clone", or + * the usual negative error code. */ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, struct inode *inode_out, loff_t pos_out, @@ -1695,17 +1698,15 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, /* Are we going all the way to the end? */ isize = i_size_read(inode_in); - if (isize == 0) { - *len = 0; + if (isize == 0) return 0; - } /* Zero length dedupe exits immediately; reflink goes to EOF. */ if (*len == 0) { - if (is_dedupe) { - *len = 0; + if (is_dedupe || pos_in == isize) return 0; - } + if (pos_in > isize) + return -EINVAL; *len = isize - pos_in; } @@ -1769,7 +1770,7 @@ int vfs_clone_file_prep_inodes(struct inode *inode_in, loff_t pos_in, return -EBADE; } - return 0; + return 1; } EXPORT_SYMBOL(vfs_clone_file_prep_inodes); @@ -1955,6 +1956,9 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same) goto out; ret = 0; + if (off + len > i_size_read(src)) + return -EINVAL; + /* pre-format output fields to sane values */ for (i = 0; i < count; i++) { same->info[i].bytes_deduped = 0ULL; diff --git a/fs/xfs/xfs_reflink.c b/fs/xfs/xfs_reflink.c index aca2d4bd4303..07593a362cd0 100644 --- a/fs/xfs/xfs_reflink.c +++ b/fs/xfs/xfs_reflink.c @@ -1161,7 +1161,7 @@ xfs_reflink_remap_range( ret = vfs_clone_file_prep_inodes(inode_in, pos_in, inode_out, pos_out, &len, is_dedupe); - if (ret || len == 0) + if (ret <= 0) goto out_unlock; trace_xfs_reflink_remap_range(src, pos_in, len, dest, pos_out); From e522751d605d99a81508e58390a8f51ee96fb662 Mon Sep 17 00:00:00 2001 From: Tomasz Majchrzak Date: Tue, 29 Nov 2016 15:18:20 +0100 Subject: [PATCH 07/10] seq_file: reset iterator to first record for zero offset If kernfs file is empty on a first read, successive read operations using the same file descriptor will return no data, even when data is available. Default kernfs 'seq_next' implementation advances iterator position even when next object is not there. Kernfs 'seq_start' for following requests will not return iterator as position is already on the second object. This defect doesn't allow to monitor badblocks sysfs files from MD raid. They are initially empty but if data appears at some stage, userspace is not able to read it. Signed-off-by: Tomasz Majchrzak Signed-off-by: Miklos Szeredi Signed-off-by: Al Viro --- fs/seq_file.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/fs/seq_file.c b/fs/seq_file.c index 368bfb92b115..a11f271800ef 100644 --- a/fs/seq_file.c +++ b/fs/seq_file.c @@ -190,6 +190,13 @@ ssize_t seq_read(struct file *file, char __user *buf, size_t size, loff_t *ppos) */ m->version = file->f_version; + /* + * if request is to read from zero offset, reset iterator to first + * record as it might have been already advanced by previous requests + */ + if (*ppos == 0) + m->index = 0; + /* Don't assume *ppos is where we left it */ if (unlikely(*ppos != m->read_pos)) { while ((err = traverse(m, *ppos)) == -EAGAIN) From 613cc2b6f272c1a8ad33aefa21cad77af23139f7 Mon Sep 17 00:00:00 2001 From: Aleksa Sarai Date: Wed, 21 Dec 2016 16:26:24 +1100 Subject: [PATCH 08/10] fs: exec: apply CLOEXEC before changing dumpable task flags If you have a process that has set itself to be non-dumpable, and it then undergoes exec(2), any CLOEXEC file descriptors it has open are "exposed" during a race window between the dumpable flags of the process being reset for exec(2) and CLOEXEC being applied to the file descriptors. This can be exploited by a process by attempting to access /proc//fd/... during this window, without requiring CAP_SYS_PTRACE. The race in question is after set_dumpable has been (for get_link, though the trace is basically the same for readlink): [vfs] -> proc_pid_link_inode_operations.get_link -> proc_pid_get_link -> proc_fd_access_allowed -> ptrace_may_access(task, PTRACE_MODE_READ_FSCREDS); Which will return 0, during the race window and CLOEXEC file descriptors will still be open during this window because do_close_on_exec has not been called yet. As a result, the ordering of these calls should be reversed to avoid this race window. This is of particular concern to container runtimes, where joining a PID namespace with file descriptors referring to the host filesystem can result in security issues (since PRCTL_SET_DUMPABLE doesn't protect against access of CLOEXEC file descriptors -- file descriptors which may reference filesystem objects the container shouldn't have access to). Cc: dev@opencontainers.org Cc: # v3.2+ Reported-by: Michael Crosby Signed-off-by: Aleksa Sarai Signed-off-by: Al Viro --- fs/exec.c | 10 ++++++++-- 1 file changed, 8 insertions(+), 2 deletions(-) diff --git a/fs/exec.c b/fs/exec.c index 8112eacf10f3..eadbf5069c38 100644 --- a/fs/exec.c +++ b/fs/exec.c @@ -19,7 +19,7 @@ * current->executable is only used by the procfs. This allows a dispatch * table to check for several different types of binary formats. We keep * trying until we recognize the file or we run out of supported binary - * formats. + * formats. */ #include @@ -1268,6 +1268,13 @@ int flush_old_exec(struct linux_binprm * bprm) flush_thread(); current->personality &= ~bprm->per_clear; + /* + * We have to apply CLOEXEC before we change whether the process is + * dumpable (in setup_new_exec) to avoid a race with a process in userspace + * trying to access the should-be-closed file descriptors of a process + * undergoing exec(2). + */ + do_close_on_exec(current->files); return 0; out: @@ -1330,7 +1337,6 @@ void setup_new_exec(struct linux_binprm * bprm) group */ current->self_exec_id++; flush_signal_handlers(current, 0); - do_close_on_exec(current->files); } EXPORT_SYMBOL(setup_new_exec); From f698cccbc89e33cda4795a375e47daaa3689485e Mon Sep 17 00:00:00 2001 From: Jeff Layton Date: Tue, 20 Dec 2016 10:56:28 -0500 Subject: [PATCH 09/10] ufs: fix function declaration for ufs_truncate_blocks sparse says: fs/ufs/inode.c:1195:6: warning: symbol 'ufs_truncate_blocks' was not declared. Should it be static? Note that the forward declaration in the file is already marked static. Signed-off-by: Jeff Layton Signed-off-by: Al Viro --- fs/ufs/inode.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ufs/inode.c b/fs/ufs/inode.c index 45ceb94e89e4..1bc0bd6a9848 100644 --- a/fs/ufs/inode.c +++ b/fs/ufs/inode.c @@ -1191,7 +1191,7 @@ out: return err; } -void ufs_truncate_blocks(struct inode *inode) +static void ufs_truncate_blocks(struct inode *inode) { if (!(S_ISREG(inode->i_mode) || S_ISDIR(inode->i_mode) || S_ISLNK(inode->i_mode))) From 128394eff343fc6d2f32172f03e24829539c5835 Mon Sep 17 00:00:00 2001 From: Al Viro Date: Fri, 16 Dec 2016 13:42:06 -0500 Subject: [PATCH 10/10] sg_write()/bsg_write() is not fit to be called under KERNEL_DS Both damn things interpret userland pointers embedded into the payload; worse, they are actually traversing those. Leaving aside the bad API design, this is very much _not_ safe to call with KERNEL_DS. Bail out early if that happens. Cc: stable@vger.kernel.org Signed-off-by: Al Viro --- block/bsg.c | 3 +++ drivers/scsi/sg.c | 3 +++ 2 files changed, 6 insertions(+) diff --git a/block/bsg.c b/block/bsg.c index 8a05a404ae70..a57046de2f07 100644 --- a/block/bsg.c +++ b/block/bsg.c @@ -655,6 +655,9 @@ bsg_write(struct file *file, const char __user *buf, size_t count, loff_t *ppos) dprintk("%s: write %Zd bytes\n", bd->name, count); + if (unlikely(segment_eq(get_fs(), KERNEL_DS))) + return -EINVAL; + bsg_set_block(bd, file); bytes_written = 0; diff --git a/drivers/scsi/sg.c b/drivers/scsi/sg.c index 070332eb41f3..dbe5b4b95df0 100644 --- a/drivers/scsi/sg.c +++ b/drivers/scsi/sg.c @@ -581,6 +581,9 @@ sg_write(struct file *filp, const char __user *buf, size_t count, loff_t * ppos) sg_io_hdr_t *hp; unsigned char cmnd[SG_MAX_CDB_SIZE]; + if (unlikely(segment_eq(get_fs(), KERNEL_DS))) + return -EINVAL; + if ((!(sfp = (Sg_fd *) filp->private_data)) || (!(sdp = sfp->parentdp))) return -ENXIO; SCSI_LOG_TIMEOUT(3, sg_printk(KERN_INFO, sdp,