iov_iter work, part 1 - isolated cleanups and optimizations.
One of the goals is to reduce the overhead of using ->read_iter() and ->write_iter() instead of ->read()/->write(); new_sync_{read,write}() has a surprising amount of overhead, in particular inside iocb_flags(). That's why the beginning of the series is in this pile; it's not directly iov_iter-related, but it's a part of the same work... Signed-off-by: Al Viro <viro@zeniv.linux.org.uk> -----BEGIN PGP SIGNATURE----- iHUEABYIAB0WIQQqUNBr3gm4hGXdBJlZ7Krx/gZQ6wUCYurGOQAKCRBZ7Krx/gZQ 6ysyAP91lvBfMRepcxpd9kvtuzWkU8A3rfSziZZteEHANB9Q7QEAiPn2a2OjWkcZ uAyUWfCkHCNx+dSMkEvUgR5okQ0exAM= =9UCV -----END PGP SIGNATURE----- Merge tag 'pull-work.iov_iter-base' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs Pull vfs iov_iter updates from Al Viro: "Part 1 - isolated cleanups and optimizations. One of the goals is to reduce the overhead of using ->read_iter() and ->write_iter() instead of ->read()/->write(). new_sync_{read,write}() has a surprising amount of overhead, in particular inside iocb_flags(). That's why the beginning of the series is in this pile; it's not directly iov_iter-related, but it's a part of the same work..." * tag 'pull-work.iov_iter-base' of git://git.kernel.org/pub/scm/linux/kernel/git/viro/vfs: first_iovec_segment(): just return address iov_iter: massage calling conventions for first_{iovec,bvec}_segment() iov_iter: first_{iovec,bvec}_segment() - simplify a bit iov_iter: lift dealing with maxpages out of first_{iovec,bvec}_segment() iov_iter_get_pages{,_alloc}(): cap the maxsize with MAX_RW_COUNT iov_iter_bvec_advance(): don't bother with bvec_iter copy_page_{to,from}_iter(): switch iovec variants to generic keep iocb_flags() result cached in struct file iocb: delay evaluation of IS_SYNC(...) until we want to check IOCB_DSYNC struct file: use anonymous union member for rcuhead and llist btrfs: use IOMAP_DIO_NOSYNC teach iomap_dio_rw() to suppress dsync No need of likely/unlikely on calls of check_copy_size()
This commit is contained in:
commit
5264406cdb
|
@ -348,7 +348,7 @@ copy_mc_to_kernel(void *to, const void *from, unsigned long size)
|
|||
static inline unsigned long __must_check
|
||||
copy_mc_to_user(void __user *to, const void *from, unsigned long n)
|
||||
{
|
||||
if (likely(check_copy_size(from, n, true))) {
|
||||
if (check_copy_size(from, n, true)) {
|
||||
if (access_ok(to, n)) {
|
||||
allow_write_to_user(to, n);
|
||||
n = copy_mc_generic((void *)to, from, n);
|
||||
|
|
|
@ -39,7 +39,7 @@ _copy_from_user_key(void *to, const void __user *from, unsigned long n, unsigned
|
|||
static __always_inline unsigned long __must_check
|
||||
copy_from_user_key(void *to, const void __user *from, unsigned long n, unsigned long key)
|
||||
{
|
||||
if (likely(check_copy_size(to, n, false)))
|
||||
if (check_copy_size(to, n, false))
|
||||
n = _copy_from_user_key(to, from, n, key);
|
||||
return n;
|
||||
}
|
||||
|
@ -50,7 +50,7 @@ _copy_to_user_key(void __user *to, const void *from, unsigned long n, unsigned l
|
|||
static __always_inline unsigned long __must_check
|
||||
copy_to_user_key(void __user *to, const void *from, unsigned long n, unsigned long key)
|
||||
{
|
||||
if (likely(check_copy_size(from, n, true)))
|
||||
if (check_copy_size(from, n, true))
|
||||
n = _copy_to_user_key(to, from, n, key);
|
||||
return n;
|
||||
}
|
||||
|
|
|
@ -37,7 +37,7 @@ static blk_opf_t dio_bio_write_op(struct kiocb *iocb)
|
|||
blk_opf_t opf = REQ_OP_WRITE | REQ_SYNC | REQ_IDLE;
|
||||
|
||||
/* avoid the need for a I/O completion work item */
|
||||
if (iocb->ki_flags & IOCB_DSYNC)
|
||||
if (iocb_is_dsync(iocb))
|
||||
opf |= REQ_FUA;
|
||||
return opf;
|
||||
}
|
||||
|
|
|
@ -112,7 +112,7 @@ static ssize_t nvmet_file_submit_bvec(struct nvmet_req *req, loff_t pos,
|
|||
|
||||
iocb->ki_pos = pos;
|
||||
iocb->ki_filp = req->ns->file;
|
||||
iocb->ki_flags = ki_flags | iocb_flags(req->ns->file);
|
||||
iocb->ki_flags = ki_flags | iocb->ki_filp->f_iocb_flags;
|
||||
|
||||
return call_iter(iocb, &iter);
|
||||
}
|
||||
|
|
2
fs/aio.c
2
fs/aio.c
|
@ -1475,7 +1475,7 @@ static int aio_prep_rw(struct kiocb *req, const struct iocb *iocb)
|
|||
req->ki_complete = aio_complete_rw;
|
||||
req->private = NULL;
|
||||
req->ki_pos = iocb->aio_offset;
|
||||
req->ki_flags = iocb_flags(req->ki_filp);
|
||||
req->ki_flags = req->ki_filp->f_iocb_flags;
|
||||
if (iocb->aio_flags & IOCB_FLAG_RESFD)
|
||||
req->ki_flags |= IOCB_EVENTFD;
|
||||
if (iocb->aio_flags & IOCB_FLAG_IOPRIO) {
|
||||
|
|
|
@ -1848,7 +1848,6 @@ static ssize_t check_direct_IO(struct btrfs_fs_info *fs_info,
|
|||
|
||||
static ssize_t btrfs_direct_write(struct kiocb *iocb, struct iov_iter *from)
|
||||
{
|
||||
const bool is_sync_write = (iocb->ki_flags & IOCB_DSYNC);
|
||||
struct file *file = iocb->ki_filp;
|
||||
struct inode *inode = file_inode(file);
|
||||
struct btrfs_fs_info *fs_info = btrfs_sb(inode->i_sb);
|
||||
|
@ -1901,15 +1900,6 @@ relock:
|
|||
goto buffered;
|
||||
}
|
||||
|
||||
/*
|
||||
* We remove IOCB_DSYNC so that we don't deadlock when iomap_dio_rw()
|
||||
* calls generic_write_sync() (through iomap_dio_complete()), because
|
||||
* that results in calling fsync (btrfs_sync_file()) which will try to
|
||||
* lock the inode in exclusive/write mode.
|
||||
*/
|
||||
if (is_sync_write)
|
||||
iocb->ki_flags &= ~IOCB_DSYNC;
|
||||
|
||||
/*
|
||||
* The iov_iter can be mapped to the same file range we are writing to.
|
||||
* If that's the case, then we will deadlock in the iomap code, because
|
||||
|
@ -1964,13 +1954,6 @@ again:
|
|||
|
||||
btrfs_inode_unlock(inode, ilock_flags);
|
||||
|
||||
/*
|
||||
* Add back IOCB_DSYNC. Our caller, btrfs_file_write_iter(), will do
|
||||
* the fsync (call generic_write_sync()).
|
||||
*/
|
||||
if (is_sync_write)
|
||||
iocb->ki_flags |= IOCB_DSYNC;
|
||||
|
||||
/* If 'err' is -ENOTBLK then it means we must fallback to buffered IO. */
|
||||
if ((err < 0 && err != -ENOTBLK) || !iov_iter_count(from))
|
||||
goto out;
|
||||
|
@ -2038,7 +2021,7 @@ ssize_t btrfs_do_write_iter(struct kiocb *iocb, struct iov_iter *from,
|
|||
struct file *file = iocb->ki_filp;
|
||||
struct btrfs_inode *inode = BTRFS_I(file_inode(file));
|
||||
ssize_t num_written, num_sync;
|
||||
const bool sync = iocb->ki_flags & IOCB_DSYNC;
|
||||
const bool sync = iocb_is_dsync(iocb);
|
||||
|
||||
/*
|
||||
* If the fs flips readonly due to some impossible error, although we
|
||||
|
|
|
@ -8165,7 +8165,8 @@ ssize_t btrfs_dio_rw(struct kiocb *iocb, struct iov_iter *iter, size_t done_befo
|
|||
struct btrfs_dio_data data;
|
||||
|
||||
return iomap_dio_rw(iocb, iter, &btrfs_dio_iomap_ops, &btrfs_dio_ops,
|
||||
IOMAP_DIO_PARTIAL, &data, done_before);
|
||||
IOMAP_DIO_PARTIAL | IOMAP_DIO_NOSYNC,
|
||||
&data, done_before);
|
||||
}
|
||||
|
||||
static int btrfs_fiemap(struct inode *inode, struct fiemap_extent_info *fieinfo,
|
||||
|
|
|
@ -1216,7 +1216,7 @@ ssize_t __blockdev_direct_IO(struct kiocb *iocb, struct inode *inode,
|
|||
*/
|
||||
if (dio->is_async && iov_iter_rw(iter) == WRITE) {
|
||||
retval = 0;
|
||||
if (iocb->ki_flags & IOCB_DSYNC)
|
||||
if (iocb_is_dsync(iocb))
|
||||
retval = dio_set_defer_completion(dio);
|
||||
else if (!dio->inode->i_sb->s_dio_done_wq) {
|
||||
/*
|
||||
|
|
|
@ -78,6 +78,7 @@ static int setfl(int fd, struct file * filp, unsigned long arg)
|
|||
}
|
||||
spin_lock(&filp->f_lock);
|
||||
filp->f_flags = (arg & SETFL_MASK) | (filp->f_flags & ~SETFL_MASK);
|
||||
filp->f_iocb_flags = iocb_flags(filp);
|
||||
spin_unlock(&filp->f_lock);
|
||||
|
||||
out:
|
||||
|
|
|
@ -45,7 +45,7 @@ static struct percpu_counter nr_files __cacheline_aligned_in_smp;
|
|||
|
||||
static void file_free_rcu(struct rcu_head *head)
|
||||
{
|
||||
struct file *f = container_of(head, struct file, f_u.fu_rcuhead);
|
||||
struct file *f = container_of(head, struct file, f_rcuhead);
|
||||
|
||||
put_cred(f->f_cred);
|
||||
kmem_cache_free(filp_cachep, f);
|
||||
|
@ -56,7 +56,7 @@ static inline void file_free(struct file *f)
|
|||
security_file_free(f);
|
||||
if (!(f->f_mode & FMODE_NOACCOUNT))
|
||||
percpu_counter_dec(&nr_files);
|
||||
call_rcu(&f->f_u.fu_rcuhead, file_free_rcu);
|
||||
call_rcu(&f->f_rcuhead, file_free_rcu);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -142,7 +142,7 @@ static struct file *__alloc_file(int flags, const struct cred *cred)
|
|||
f->f_cred = get_cred(cred);
|
||||
error = security_file_alloc(f);
|
||||
if (unlikely(error)) {
|
||||
file_free_rcu(&f->f_u.fu_rcuhead);
|
||||
file_free_rcu(&f->f_rcuhead);
|
||||
return ERR_PTR(error);
|
||||
}
|
||||
|
||||
|
@ -243,6 +243,7 @@ static struct file *alloc_file(const struct path *path, int flags,
|
|||
if ((file->f_mode & FMODE_WRITE) &&
|
||||
likely(fop->write || fop->write_iter))
|
||||
file->f_mode |= FMODE_CAN_WRITE;
|
||||
file->f_iocb_flags = iocb_flags(file);
|
||||
file->f_mode |= FMODE_OPENED;
|
||||
file->f_op = fop;
|
||||
if ((file->f_mode & (FMODE_READ | FMODE_WRITE)) == FMODE_READ)
|
||||
|
@ -343,13 +344,13 @@ static void delayed_fput(struct work_struct *unused)
|
|||
struct llist_node *node = llist_del_all(&delayed_fput_list);
|
||||
struct file *f, *t;
|
||||
|
||||
llist_for_each_entry_safe(f, t, node, f_u.fu_llist)
|
||||
llist_for_each_entry_safe(f, t, node, f_llist)
|
||||
__fput(f);
|
||||
}
|
||||
|
||||
static void ____fput(struct callback_head *work)
|
||||
{
|
||||
__fput(container_of(work, struct file, f_u.fu_rcuhead));
|
||||
__fput(container_of(work, struct file, f_rcuhead));
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -376,8 +377,8 @@ void fput(struct file *file)
|
|||
struct task_struct *task = current;
|
||||
|
||||
if (likely(!in_interrupt() && !(task->flags & PF_KTHREAD))) {
|
||||
init_task_work(&file->f_u.fu_rcuhead, ____fput);
|
||||
if (!task_work_add(task, &file->f_u.fu_rcuhead, TWA_RESUME))
|
||||
init_task_work(&file->f_rcuhead, ____fput);
|
||||
if (!task_work_add(task, &file->f_rcuhead, TWA_RESUME))
|
||||
return;
|
||||
/*
|
||||
* After this task has run exit_task_work(),
|
||||
|
@ -386,7 +387,7 @@ void fput(struct file *file)
|
|||
*/
|
||||
}
|
||||
|
||||
if (llist_add(&file->f_u.fu_llist, &delayed_fput_list))
|
||||
if (llist_add(&file->f_llist, &delayed_fput_list))
|
||||
schedule_delayed_work(&delayed_fput_work, 1);
|
||||
}
|
||||
}
|
||||
|
|
|
@ -1042,7 +1042,7 @@ static unsigned int fuse_write_flags(struct kiocb *iocb)
|
|||
{
|
||||
unsigned int flags = iocb->ki_filp->f_flags;
|
||||
|
||||
if (iocb->ki_flags & IOCB_DSYNC)
|
||||
if (iocb_is_dsync(iocb))
|
||||
flags |= O_DSYNC;
|
||||
if (iocb->ki_flags & IOCB_SYNC)
|
||||
flags |= O_SYNC;
|
||||
|
|
|
@ -548,17 +548,18 @@ __iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
|
|||
}
|
||||
|
||||
/* for data sync or sync, we need sync completion processing */
|
||||
if (iocb->ki_flags & IOCB_DSYNC)
|
||||
if (iocb_is_dsync(iocb) && !(dio_flags & IOMAP_DIO_NOSYNC)) {
|
||||
dio->flags |= IOMAP_DIO_NEED_SYNC;
|
||||
|
||||
/*
|
||||
* For datasync only writes, we optimistically try using FUA for
|
||||
* this IO. Any non-FUA write that occurs will clear this flag,
|
||||
* hence we know before completion whether a cache flush is
|
||||
* necessary.
|
||||
*/
|
||||
if ((iocb->ki_flags & (IOCB_DSYNC | IOCB_SYNC)) == IOCB_DSYNC)
|
||||
dio->flags |= IOMAP_DIO_WRITE_FUA;
|
||||
/*
|
||||
* For datasync only writes, we optimistically try
|
||||
* using FUA for this IO. Any non-FUA write that
|
||||
* occurs will clear this flag, hence we know before
|
||||
* completion whether a cache flush is necessary.
|
||||
*/
|
||||
if (!(iocb->ki_flags & IOCB_SYNC))
|
||||
dio->flags |= IOMAP_DIO_WRITE_FUA;
|
||||
}
|
||||
}
|
||||
|
||||
if (dio_flags & IOMAP_DIO_OVERWRITE_ONLY) {
|
||||
|
|
|
@ -894,6 +894,7 @@ static int do_dentry_open(struct file *f,
|
|||
f->f_mode |= FMODE_CAN_ODIRECT;
|
||||
|
||||
f->f_flags &= ~(O_CREAT | O_EXCL | O_NOCTTY | O_TRUNC);
|
||||
f->f_iocb_flags = iocb_flags(f);
|
||||
|
||||
file_ra_state_init(&f->f_ra, f->f_mapping->host->i_mapping);
|
||||
|
||||
|
|
|
@ -779,7 +779,7 @@ static ssize_t zonefs_file_dio_append(struct kiocb *iocb, struct iov_iter *from)
|
|||
REQ_OP_ZONE_APPEND | REQ_SYNC | REQ_IDLE, GFP_NOFS);
|
||||
bio->bi_iter.bi_sector = zi->i_zsector;
|
||||
bio->bi_ioprio = iocb->ki_ioprio;
|
||||
if (iocb->ki_flags & IOCB_DSYNC)
|
||||
if (iocb_is_dsync(iocb))
|
||||
bio->bi_opf |= REQ_FUA;
|
||||
|
||||
ret = bio_iov_iter_get_pages(bio, from);
|
||||
|
|
|
@ -943,9 +943,10 @@ static inline int ra_has_index(struct file_ra_state *ra, pgoff_t index)
|
|||
|
||||
struct file {
|
||||
union {
|
||||
struct llist_node fu_llist;
|
||||
struct rcu_head fu_rcuhead;
|
||||
} f_u;
|
||||
struct llist_node f_llist;
|
||||
struct rcu_head f_rcuhead;
|
||||
unsigned int f_iocb_flags;
|
||||
};
|
||||
struct path f_path;
|
||||
struct inode *f_inode; /* cached value */
|
||||
const struct file_operations *f_op;
|
||||
|
@ -2328,13 +2329,11 @@ static inline bool HAS_UNMAPPED_ID(struct user_namespace *mnt_userns,
|
|||
!vfsgid_valid(i_gid_into_vfsgid(mnt_userns, inode));
|
||||
}
|
||||
|
||||
static inline int iocb_flags(struct file *file);
|
||||
|
||||
static inline void init_sync_kiocb(struct kiocb *kiocb, struct file *filp)
|
||||
{
|
||||
*kiocb = (struct kiocb) {
|
||||
.ki_filp = filp,
|
||||
.ki_flags = iocb_flags(filp),
|
||||
.ki_flags = filp->f_iocb_flags,
|
||||
.ki_ioprio = get_current_ioprio(),
|
||||
};
|
||||
}
|
||||
|
@ -2850,6 +2849,12 @@ extern int vfs_fsync(struct file *file, int datasync);
|
|||
extern int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
|
||||
unsigned int flags);
|
||||
|
||||
static inline bool iocb_is_dsync(const struct kiocb *iocb)
|
||||
{
|
||||
return (iocb->ki_flags & IOCB_DSYNC) ||
|
||||
IS_SYNC(iocb->ki_filp->f_mapping->host);
|
||||
}
|
||||
|
||||
/*
|
||||
* Sync the bytes written if this was a synchronous write. Expect ki_pos
|
||||
* to already be updated for the write, and will return either the amount
|
||||
|
@ -2857,7 +2862,7 @@ extern int sync_file_range(struct file *file, loff_t offset, loff_t nbytes,
|
|||
*/
|
||||
static inline ssize_t generic_write_sync(struct kiocb *iocb, ssize_t count)
|
||||
{
|
||||
if (iocb->ki_flags & IOCB_DSYNC) {
|
||||
if (iocb_is_dsync(iocb)) {
|
||||
int ret = vfs_fsync_range(iocb->ki_filp,
|
||||
iocb->ki_pos - count, iocb->ki_pos - 1,
|
||||
(iocb->ki_flags & IOCB_SYNC) ? 0 : 1);
|
||||
|
@ -3380,7 +3385,7 @@ static inline int iocb_flags(struct file *file)
|
|||
res |= IOCB_APPEND;
|
||||
if (file->f_flags & O_DIRECT)
|
||||
res |= IOCB_DIRECT;
|
||||
if ((file->f_flags & O_DSYNC) || IS_SYNC(file->f_mapping->host))
|
||||
if (file->f_flags & O_DSYNC)
|
||||
res |= IOCB_DSYNC;
|
||||
if (file->f_flags & __O_SYNC)
|
||||
res |= IOCB_SYNC;
|
||||
|
|
|
@ -347,6 +347,12 @@ struct iomap_dio_ops {
|
|||
*/
|
||||
#define IOMAP_DIO_PARTIAL (1 << 2)
|
||||
|
||||
/*
|
||||
* The caller will sync the write if needed; do not sync it within
|
||||
* iomap_dio_rw. Overrides IOMAP_DIO_FORCE_WAIT.
|
||||
*/
|
||||
#define IOMAP_DIO_NOSYNC (1 << 3)
|
||||
|
||||
ssize_t iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
|
||||
const struct iomap_ops *ops, const struct iomap_dio_ops *dops,
|
||||
unsigned int dio_flags, void *private, size_t done_before);
|
||||
|
|
|
@ -148,7 +148,7 @@ _copy_to_user(void __user *, const void *, unsigned long);
|
|||
static __always_inline unsigned long __must_check
|
||||
copy_from_user(void *to, const void __user *from, unsigned long n)
|
||||
{
|
||||
if (likely(check_copy_size(to, n, false)))
|
||||
if (check_copy_size(to, n, false))
|
||||
n = _copy_from_user(to, from, n);
|
||||
return n;
|
||||
}
|
||||
|
@ -156,7 +156,7 @@ copy_from_user(void *to, const void __user *from, unsigned long n)
|
|||
static __always_inline unsigned long __must_check
|
||||
copy_to_user(void __user *to, const void *from, unsigned long n)
|
||||
{
|
||||
if (likely(check_copy_size(from, n, true)))
|
||||
if (check_copy_size(from, n, true))
|
||||
n = _copy_to_user(to, from, n);
|
||||
return n;
|
||||
}
|
||||
|
|
|
@ -156,19 +156,17 @@ static inline size_t copy_folio_to_iter(struct folio *folio, size_t offset,
|
|||
static __always_inline __must_check
|
||||
size_t copy_to_iter(const void *addr, size_t bytes, struct iov_iter *i)
|
||||
{
|
||||
if (unlikely(!check_copy_size(addr, bytes, true)))
|
||||
return 0;
|
||||
else
|
||||
if (check_copy_size(addr, bytes, true))
|
||||
return _copy_to_iter(addr, bytes, i);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static __always_inline __must_check
|
||||
size_t copy_from_iter(void *addr, size_t bytes, struct iov_iter *i)
|
||||
{
|
||||
if (unlikely(!check_copy_size(addr, bytes, false)))
|
||||
return 0;
|
||||
else
|
||||
if (check_copy_size(addr, bytes, false))
|
||||
return _copy_from_iter(addr, bytes, i);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static __always_inline __must_check
|
||||
|
@ -184,10 +182,9 @@ bool copy_from_iter_full(void *addr, size_t bytes, struct iov_iter *i)
|
|||
static __always_inline __must_check
|
||||
size_t copy_from_iter_nocache(void *addr, size_t bytes, struct iov_iter *i)
|
||||
{
|
||||
if (unlikely(!check_copy_size(addr, bytes, false)))
|
||||
return 0;
|
||||
else
|
||||
if (check_copy_size(addr, bytes, false))
|
||||
return _copy_from_iter_nocache(addr, bytes, i);
|
||||
return 0;
|
||||
}
|
||||
|
||||
static __always_inline __must_check
|
||||
|
|
|
@ -661,7 +661,7 @@ static int io_rw_init_file(struct io_kiocb *req, fmode_t mode)
|
|||
if (!io_req_ffs_set(req))
|
||||
req->flags |= io_file_get_flags(file) << REQ_F_SUPPORT_NOWAIT_BIT;
|
||||
|
||||
kiocb->ki_flags = iocb_flags(file);
|
||||
kiocb->ki_flags = file->f_iocb_flags;
|
||||
ret = kiocb_set_rw_flags(kiocb, rw->flags);
|
||||
if (unlikely(ret))
|
||||
return ret;
|
||||
|
|
283
lib/iov_iter.c
283
lib/iov_iter.c
|
@ -168,174 +168,6 @@ static int copyin(void *to, const void __user *from, size_t n)
|
|||
return n;
|
||||
}
|
||||
|
||||
static size_t copy_page_to_iter_iovec(struct page *page, size_t offset, size_t bytes,
|
||||
struct iov_iter *i)
|
||||
{
|
||||
size_t skip, copy, left, wanted;
|
||||
const struct iovec *iov;
|
||||
char __user *buf;
|
||||
void *kaddr, *from;
|
||||
|
||||
if (unlikely(bytes > i->count))
|
||||
bytes = i->count;
|
||||
|
||||
if (unlikely(!bytes))
|
||||
return 0;
|
||||
|
||||
might_fault();
|
||||
wanted = bytes;
|
||||
iov = i->iov;
|
||||
skip = i->iov_offset;
|
||||
buf = iov->iov_base + skip;
|
||||
copy = min(bytes, iov->iov_len - skip);
|
||||
|
||||
if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_writeable(buf, copy)) {
|
||||
kaddr = kmap_atomic(page);
|
||||
from = kaddr + offset;
|
||||
|
||||
/* first chunk, usually the only one */
|
||||
left = copyout(buf, from, copy);
|
||||
copy -= left;
|
||||
skip += copy;
|
||||
from += copy;
|
||||
bytes -= copy;
|
||||
|
||||
while (unlikely(!left && bytes)) {
|
||||
iov++;
|
||||
buf = iov->iov_base;
|
||||
copy = min(bytes, iov->iov_len);
|
||||
left = copyout(buf, from, copy);
|
||||
copy -= left;
|
||||
skip = copy;
|
||||
from += copy;
|
||||
bytes -= copy;
|
||||
}
|
||||
if (likely(!bytes)) {
|
||||
kunmap_atomic(kaddr);
|
||||
goto done;
|
||||
}
|
||||
offset = from - kaddr;
|
||||
buf += copy;
|
||||
kunmap_atomic(kaddr);
|
||||
copy = min(bytes, iov->iov_len - skip);
|
||||
}
|
||||
/* Too bad - revert to non-atomic kmap */
|
||||
|
||||
kaddr = kmap(page);
|
||||
from = kaddr + offset;
|
||||
left = copyout(buf, from, copy);
|
||||
copy -= left;
|
||||
skip += copy;
|
||||
from += copy;
|
||||
bytes -= copy;
|
||||
while (unlikely(!left && bytes)) {
|
||||
iov++;
|
||||
buf = iov->iov_base;
|
||||
copy = min(bytes, iov->iov_len);
|
||||
left = copyout(buf, from, copy);
|
||||
copy -= left;
|
||||
skip = copy;
|
||||
from += copy;
|
||||
bytes -= copy;
|
||||
}
|
||||
kunmap(page);
|
||||
|
||||
done:
|
||||
if (skip == iov->iov_len) {
|
||||
iov++;
|
||||
skip = 0;
|
||||
}
|
||||
i->count -= wanted - bytes;
|
||||
i->nr_segs -= iov - i->iov;
|
||||
i->iov = iov;
|
||||
i->iov_offset = skip;
|
||||
return wanted - bytes;
|
||||
}
|
||||
|
||||
static size_t copy_page_from_iter_iovec(struct page *page, size_t offset, size_t bytes,
|
||||
struct iov_iter *i)
|
||||
{
|
||||
size_t skip, copy, left, wanted;
|
||||
const struct iovec *iov;
|
||||
char __user *buf;
|
||||
void *kaddr, *to;
|
||||
|
||||
if (unlikely(bytes > i->count))
|
||||
bytes = i->count;
|
||||
|
||||
if (unlikely(!bytes))
|
||||
return 0;
|
||||
|
||||
might_fault();
|
||||
wanted = bytes;
|
||||
iov = i->iov;
|
||||
skip = i->iov_offset;
|
||||
buf = iov->iov_base + skip;
|
||||
copy = min(bytes, iov->iov_len - skip);
|
||||
|
||||
if (IS_ENABLED(CONFIG_HIGHMEM) && !fault_in_readable(buf, copy)) {
|
||||
kaddr = kmap_atomic(page);
|
||||
to = kaddr + offset;
|
||||
|
||||
/* first chunk, usually the only one */
|
||||
left = copyin(to, buf, copy);
|
||||
copy -= left;
|
||||
skip += copy;
|
||||
to += copy;
|
||||
bytes -= copy;
|
||||
|
||||
while (unlikely(!left && bytes)) {
|
||||
iov++;
|
||||
buf = iov->iov_base;
|
||||
copy = min(bytes, iov->iov_len);
|
||||
left = copyin(to, buf, copy);
|
||||
copy -= left;
|
||||
skip = copy;
|
||||
to += copy;
|
||||
bytes -= copy;
|
||||
}
|
||||
if (likely(!bytes)) {
|
||||
kunmap_atomic(kaddr);
|
||||
goto done;
|
||||
}
|
||||
offset = to - kaddr;
|
||||
buf += copy;
|
||||
kunmap_atomic(kaddr);
|
||||
copy = min(bytes, iov->iov_len - skip);
|
||||
}
|
||||
/* Too bad - revert to non-atomic kmap */
|
||||
|
||||
kaddr = kmap(page);
|
||||
to = kaddr + offset;
|
||||
left = copyin(to, buf, copy);
|
||||
copy -= left;
|
||||
skip += copy;
|
||||
to += copy;
|
||||
bytes -= copy;
|
||||
while (unlikely(!left && bytes)) {
|
||||
iov++;
|
||||
buf = iov->iov_base;
|
||||
copy = min(bytes, iov->iov_len);
|
||||
left = copyin(to, buf, copy);
|
||||
copy -= left;
|
||||
skip = copy;
|
||||
to += copy;
|
||||
bytes -= copy;
|
||||
}
|
||||
kunmap(page);
|
||||
|
||||
done:
|
||||
if (skip == iov->iov_len) {
|
||||
iov++;
|
||||
skip = 0;
|
||||
}
|
||||
i->count -= wanted - bytes;
|
||||
i->nr_segs -= iov - i->iov;
|
||||
i->iov = iov;
|
||||
i->iov_offset = skip;
|
||||
return wanted - bytes;
|
||||
}
|
||||
|
||||
#ifdef PIPE_PARANOIA
|
||||
static bool sanity(const struct iov_iter *i)
|
||||
{
|
||||
|
@ -848,24 +680,14 @@ static inline bool page_copy_sane(struct page *page, size_t offset, size_t n)
|
|||
static size_t __copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
|
||||
struct iov_iter *i)
|
||||
{
|
||||
if (likely(iter_is_iovec(i)))
|
||||
return copy_page_to_iter_iovec(page, offset, bytes, i);
|
||||
if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) {
|
||||
if (unlikely(iov_iter_is_pipe(i))) {
|
||||
return copy_page_to_iter_pipe(page, offset, bytes, i);
|
||||
} else {
|
||||
void *kaddr = kmap_local_page(page);
|
||||
size_t wanted = _copy_to_iter(kaddr + offset, bytes, i);
|
||||
kunmap_local(kaddr);
|
||||
return wanted;
|
||||
}
|
||||
if (iov_iter_is_pipe(i))
|
||||
return copy_page_to_iter_pipe(page, offset, bytes, i);
|
||||
if (unlikely(iov_iter_is_discard(i))) {
|
||||
if (unlikely(i->count < bytes))
|
||||
bytes = i->count;
|
||||
i->count -= bytes;
|
||||
return bytes;
|
||||
}
|
||||
WARN_ON(1);
|
||||
return 0;
|
||||
}
|
||||
|
||||
size_t copy_page_to_iter(struct page *page, size_t offset, size_t bytes,
|
||||
|
@ -896,17 +718,12 @@ EXPORT_SYMBOL(copy_page_to_iter);
|
|||
size_t copy_page_from_iter(struct page *page, size_t offset, size_t bytes,
|
||||
struct iov_iter *i)
|
||||
{
|
||||
if (unlikely(!page_copy_sane(page, offset, bytes)))
|
||||
return 0;
|
||||
if (likely(iter_is_iovec(i)))
|
||||
return copy_page_from_iter_iovec(page, offset, bytes, i);
|
||||
if (iov_iter_is_bvec(i) || iov_iter_is_kvec(i) || iov_iter_is_xarray(i)) {
|
||||
if (page_copy_sane(page, offset, bytes)) {
|
||||
void *kaddr = kmap_local_page(page);
|
||||
size_t wanted = _copy_from_iter(kaddr + offset, bytes, i);
|
||||
kunmap_local(kaddr);
|
||||
return wanted;
|
||||
}
|
||||
WARN_ON(1);
|
||||
return 0;
|
||||
}
|
||||
EXPORT_SYMBOL(copy_page_from_iter);
|
||||
|
@ -1029,17 +846,22 @@ static void pipe_advance(struct iov_iter *i, size_t size)
|
|||
|
||||
static void iov_iter_bvec_advance(struct iov_iter *i, size_t size)
|
||||
{
|
||||
struct bvec_iter bi;
|
||||
const struct bio_vec *bvec, *end;
|
||||
|
||||
bi.bi_size = i->count;
|
||||
bi.bi_bvec_done = i->iov_offset;
|
||||
bi.bi_idx = 0;
|
||||
bvec_iter_advance(i->bvec, &bi, size);
|
||||
if (!i->count)
|
||||
return;
|
||||
i->count -= size;
|
||||
|
||||
i->bvec += bi.bi_idx;
|
||||
i->nr_segs -= bi.bi_idx;
|
||||
i->count = bi.bi_size;
|
||||
i->iov_offset = bi.bi_bvec_done;
|
||||
size += i->iov_offset;
|
||||
|
||||
for (bvec = i->bvec, end = bvec + i->nr_segs; bvec < end; bvec++) {
|
||||
if (likely(size < bvec->bv_len))
|
||||
break;
|
||||
size -= bvec->bv_len;
|
||||
}
|
||||
i->iov_offset = size;
|
||||
i->nr_segs -= bvec - i->bvec;
|
||||
i->bvec = bvec;
|
||||
}
|
||||
|
||||
static void iov_iter_iovec_advance(struct iov_iter *i, size_t size)
|
||||
|
@ -1557,47 +1379,36 @@ static ssize_t iter_xarray_get_pages(struct iov_iter *i,
|
|||
}
|
||||
|
||||
/* must be done on non-empty ITER_IOVEC one */
|
||||
static unsigned long first_iovec_segment(const struct iov_iter *i,
|
||||
size_t *size, size_t *start,
|
||||
size_t maxsize, unsigned maxpages)
|
||||
static unsigned long first_iovec_segment(const struct iov_iter *i, size_t *size)
|
||||
{
|
||||
size_t skip;
|
||||
long k;
|
||||
|
||||
for (k = 0, skip = i->iov_offset; k < i->nr_segs; k++, skip = 0) {
|
||||
unsigned long addr = (unsigned long)i->iov[k].iov_base + skip;
|
||||
size_t len = i->iov[k].iov_len - skip;
|
||||
|
||||
if (unlikely(!len))
|
||||
continue;
|
||||
if (len > maxsize)
|
||||
len = maxsize;
|
||||
len += (*start = addr % PAGE_SIZE);
|
||||
if (len > maxpages * PAGE_SIZE)
|
||||
len = maxpages * PAGE_SIZE;
|
||||
*size = len;
|
||||
return addr & PAGE_MASK;
|
||||
if (*size > len)
|
||||
*size = len;
|
||||
return (unsigned long)i->iov[k].iov_base + skip;
|
||||
}
|
||||
BUG(); // if it had been empty, we wouldn't get called
|
||||
}
|
||||
|
||||
/* must be done on non-empty ITER_BVEC one */
|
||||
static struct page *first_bvec_segment(const struct iov_iter *i,
|
||||
size_t *size, size_t *start,
|
||||
size_t maxsize, unsigned maxpages)
|
||||
size_t *size, size_t *start)
|
||||
{
|
||||
struct page *page;
|
||||
size_t skip = i->iov_offset, len;
|
||||
|
||||
len = i->bvec->bv_len - skip;
|
||||
if (len > maxsize)
|
||||
len = maxsize;
|
||||
if (*size > len)
|
||||
*size = len;
|
||||
skip += i->bvec->bv_offset;
|
||||
page = i->bvec->bv_page + skip / PAGE_SIZE;
|
||||
len += (*start = skip % PAGE_SIZE);
|
||||
if (len > maxpages * PAGE_SIZE)
|
||||
len = maxpages * PAGE_SIZE;
|
||||
*size = len;
|
||||
*start = skip % PAGE_SIZE;
|
||||
return page;
|
||||
}
|
||||
|
||||
|
@ -1605,13 +1416,14 @@ ssize_t iov_iter_get_pages(struct iov_iter *i,
|
|||
struct page **pages, size_t maxsize, unsigned maxpages,
|
||||
size_t *start)
|
||||
{
|
||||
size_t len;
|
||||
int n, res;
|
||||
|
||||
if (maxsize > i->count)
|
||||
maxsize = i->count;
|
||||
if (!maxsize)
|
||||
return 0;
|
||||
if (maxsize > MAX_RW_COUNT)
|
||||
maxsize = MAX_RW_COUNT;
|
||||
|
||||
if (likely(iter_is_iovec(i))) {
|
||||
unsigned int gup_flags = 0;
|
||||
|
@ -1622,21 +1434,27 @@ ssize_t iov_iter_get_pages(struct iov_iter *i,
|
|||
if (i->nofault)
|
||||
gup_flags |= FOLL_NOFAULT;
|
||||
|
||||
addr = first_iovec_segment(i, &len, start, maxsize, maxpages);
|
||||
n = DIV_ROUND_UP(len, PAGE_SIZE);
|
||||
addr = first_iovec_segment(i, &maxsize);
|
||||
*start = addr % PAGE_SIZE;
|
||||
addr &= PAGE_MASK;
|
||||
n = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
|
||||
if (n > maxpages)
|
||||
n = maxpages;
|
||||
res = get_user_pages_fast(addr, n, gup_flags, pages);
|
||||
if (unlikely(res <= 0))
|
||||
return res;
|
||||
return (res == n ? len : res * PAGE_SIZE) - *start;
|
||||
return min_t(size_t, maxsize, res * PAGE_SIZE - *start);
|
||||
}
|
||||
if (iov_iter_is_bvec(i)) {
|
||||
struct page *page;
|
||||
|
||||
page = first_bvec_segment(i, &len, start, maxsize, maxpages);
|
||||
n = DIV_ROUND_UP(len, PAGE_SIZE);
|
||||
while (n--)
|
||||
page = first_bvec_segment(i, &maxsize, start);
|
||||
n = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
|
||||
if (n > maxpages)
|
||||
n = maxpages;
|
||||
for (int k = 0; k < n; k++)
|
||||
get_page(*pages++ = page++);
|
||||
return len - *start;
|
||||
return min_t(size_t, maxsize, n * PAGE_SIZE - *start);
|
||||
}
|
||||
if (iov_iter_is_pipe(i))
|
||||
return pipe_get_pages(i, pages, maxsize, maxpages, start);
|
||||
|
@ -1725,13 +1543,14 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
|
|||
size_t *start)
|
||||
{
|
||||
struct page **p;
|
||||
size_t len;
|
||||
int n, res;
|
||||
|
||||
if (maxsize > i->count)
|
||||
maxsize = i->count;
|
||||
if (!maxsize)
|
||||
return 0;
|
||||
if (maxsize > MAX_RW_COUNT)
|
||||
maxsize = MAX_RW_COUNT;
|
||||
|
||||
if (likely(iter_is_iovec(i))) {
|
||||
unsigned int gup_flags = 0;
|
||||
|
@ -1742,8 +1561,10 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
|
|||
if (i->nofault)
|
||||
gup_flags |= FOLL_NOFAULT;
|
||||
|
||||
addr = first_iovec_segment(i, &len, start, maxsize, ~0U);
|
||||
n = DIV_ROUND_UP(len, PAGE_SIZE);
|
||||
addr = first_iovec_segment(i, &maxsize);
|
||||
*start = addr % PAGE_SIZE;
|
||||
addr &= PAGE_MASK;
|
||||
n = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
|
||||
p = get_pages_array(n);
|
||||
if (!p)
|
||||
return -ENOMEM;
|
||||
|
@ -1754,19 +1575,19 @@ ssize_t iov_iter_get_pages_alloc(struct iov_iter *i,
|
|||
return res;
|
||||
}
|
||||
*pages = p;
|
||||
return (res == n ? len : res * PAGE_SIZE) - *start;
|
||||
return min_t(size_t, maxsize, res * PAGE_SIZE - *start);
|
||||
}
|
||||
if (iov_iter_is_bvec(i)) {
|
||||
struct page *page;
|
||||
|
||||
page = first_bvec_segment(i, &len, start, maxsize, ~0U);
|
||||
n = DIV_ROUND_UP(len, PAGE_SIZE);
|
||||
page = first_bvec_segment(i, &maxsize, start);
|
||||
n = DIV_ROUND_UP(maxsize + *start, PAGE_SIZE);
|
||||
*pages = p = get_pages_array(n);
|
||||
if (!p)
|
||||
return -ENOMEM;
|
||||
while (n--)
|
||||
for (int k = 0; k < n; k++)
|
||||
get_page(*p++ = page++);
|
||||
return len - *start;
|
||||
return min_t(size_t, maxsize, n * PAGE_SIZE - *start);
|
||||
}
|
||||
if (iov_iter_is_pipe(i))
|
||||
return pipe_get_pages_alloc(i, pages, maxsize, start);
|
||||
|
|
Loading…
Reference in New Issue