Changes since last update:
- Numerous corruption fixes for copy on write - Numerous corruption fixes for blocksize < pagesize writes - Don't miscalculate AG reservations for small final AGs - Fix page cache truncation to work properly for reflink and extent shifting - Fix use-after-free when retrying failed inode/dquot buffer logging - Fix corruptions seen when using copy_file_range in directio mode -----BEGIN PGP SIGNATURE----- iQIzBAABCgAdFiEEUzaAxoMeQq6m2jMV+H93GTRKtOsFAlv1oroACgkQ+H93GTRK tOu+1RAAnteFwaq3WLDYmSrMia/4m52sxvatxlCCSjCGdvfvuZozTwbTdB6FFfuc Ql6Z6F2Lx1sHDNJvwBCsO8qPB0qOhjSnBI/wPe2kz/NETGYNp88vHX7OZvkPVONl jDaCWTcu0BWNiOGi17uTY8sBa8u1izbo5F+pEQIyUjoCgUc9JB2di9dVnUJ0byrh wZjrmPD95ojqOozqppXfFQ0QIbozpTXR3kyU9S0EhHmbnWJZ9t08Iuhd2LjOoDB4 cUFG/1qDXuFvALyM8m1mA7xSBZpA/glFgNeAtmz53aIOZ9Tl8w8cLJJBRx5AqUDU bpBU1y08Bm3OIw+uiTMkiPkCQRMDgtiHKlPxuiKqlsNY0KqYgwWlWcbU/OTvHly8 In+CnbEBqLejKyEIz3nEQ4YimfvHbAlC/3V+nx2qO45hvTXA5lEIGAbBmiLW0ni8 6eBXGeIjKAw0zYOoXC4OuKIiHlQh7AHJB25i9xJTzknRI9jqwZFGkxgdl33Vrq8W gTnfgOhMX2dGmcPrgMgtu+aiBwKf+GJv94/2EJwligExnWXQSsQmGCwKl7ysoE1g iU/MhJT5IYYP/TDqldkahUPSwD2FN4UFtzNfpeDX3H6kxM1R41l+aerdu64UPNji G98U+cWyyUmbu9ziLyREM/XyWz4UhNAz7lRId3ryeu8GPUm2AoY= =TiLQ -----END PGP SIGNATURE----- Merge tag 'xfs-4.20-fixes-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux Pull xfs fixes from Darrick Wong: "Dave and I have continued our work fixing corruption problems that can be found when running long-term burn-in exercisers on xfs. Here are some patches fixing most of the problems, but there will likely be more. :/ - Numerous corruption fixes for copy on write - Numerous corruption fixes for blocksize < pagesize writes - Don't miscalculate AG reservations for small final AGs - Fix page cache truncation to work properly for reflink and extent shifting - Fix use-after-free when retrying failed inode/dquot buffer logging - Fix corruptions seen when using copy_file_range in directio mode" * tag 'xfs-4.20-fixes-2' of git://git.kernel.org/pub/scm/fs/xfs/xfs-linux: iomap: readpages doesn't zero page tail beyond EOF vfs: vfs_dedupe_file_range() doesn't return EOPNOTSUPP iomap: dio data corruption and spurious errors when pipes fill iomap: sub-block dio needs to zeroout beyond EOF iomap: FUA is wrong for DIO O_DSYNC writes into unwritten extents xfs: delalloc -> unwritten COW fork allocation can go wrong xfs: flush removing page cache in xfs_reflink_remap_prep xfs: extent shifting doesn't fully invalidate page cache xfs: finobt AG reserves don't consider last AG can be a runt xfs: fix transient reference count error in xfs_buf_resubmit_failed_buffers xfs: uncached buffer tracing needs to print bno xfs: make xfs_file_remap_range() static xfs: fix shared extent data corruption due to missing cow reservation
This commit is contained in:
commit
abe72ff413
53
fs/iomap.c
53
fs/iomap.c
|
@ -142,13 +142,14 @@ static void
|
|||
iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
|
||||
loff_t *pos, loff_t length, unsigned *offp, unsigned *lenp)
|
||||
{
|
||||
loff_t orig_pos = *pos;
|
||||
loff_t isize = i_size_read(inode);
|
||||
unsigned block_bits = inode->i_blkbits;
|
||||
unsigned block_size = (1 << block_bits);
|
||||
unsigned poff = offset_in_page(*pos);
|
||||
unsigned plen = min_t(loff_t, PAGE_SIZE - poff, length);
|
||||
unsigned first = poff >> block_bits;
|
||||
unsigned last = (poff + plen - 1) >> block_bits;
|
||||
unsigned end = offset_in_page(i_size_read(inode)) >> block_bits;
|
||||
|
||||
/*
|
||||
* If the block size is smaller than the page size we need to check the
|
||||
|
@ -183,8 +184,12 @@ iomap_adjust_read_range(struct inode *inode, struct iomap_page *iop,
|
|||
* handle both halves separately so that we properly zero data in the
|
||||
* page cache for blocks that are entirely outside of i_size.
|
||||
*/
|
||||
if (first <= end && last > end)
|
||||
plen -= (last - end) * block_size;
|
||||
if (orig_pos <= isize && orig_pos + length > isize) {
|
||||
unsigned end = offset_in_page(isize - 1) >> block_bits;
|
||||
|
||||
if (first <= end && last > end)
|
||||
plen -= (last - end) * block_size;
|
||||
}
|
||||
|
||||
*offp = poff;
|
||||
*lenp = plen;
|
||||
|
@ -1580,7 +1585,7 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
|
|||
struct bio *bio;
|
||||
bool need_zeroout = false;
|
||||
bool use_fua = false;
|
||||
int nr_pages, ret;
|
||||
int nr_pages, ret = 0;
|
||||
size_t copied = 0;
|
||||
|
||||
if ((pos | length | align) & ((1 << blkbits) - 1))
|
||||
|
@ -1596,12 +1601,13 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
|
|||
|
||||
if (iomap->flags & IOMAP_F_NEW) {
|
||||
need_zeroout = true;
|
||||
} else {
|
||||
} else if (iomap->type == IOMAP_MAPPED) {
|
||||
/*
|
||||
* Use a FUA write if we need datasync semantics, this
|
||||
* is a pure data IO that doesn't require any metadata
|
||||
* updates and the underlying device supports FUA. This
|
||||
* allows us to avoid cache flushes on IO completion.
|
||||
* Use a FUA write if we need datasync semantics, this is a pure
|
||||
* data IO that doesn't require any metadata updates (including
|
||||
* after IO completion such as unwritten extent conversion) and
|
||||
* the underlying device supports FUA. This allows us to avoid
|
||||
* cache flushes on IO completion.
|
||||
*/
|
||||
if (!(iomap->flags & (IOMAP_F_SHARED|IOMAP_F_DIRTY)) &&
|
||||
(dio->flags & IOMAP_DIO_WRITE_FUA) &&
|
||||
|
@ -1644,8 +1650,14 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
|
|||
|
||||
ret = bio_iov_iter_get_pages(bio, &iter);
|
||||
if (unlikely(ret)) {
|
||||
/*
|
||||
* We have to stop part way through an IO. We must fall
|
||||
* through to the sub-block tail zeroing here, otherwise
|
||||
* this short IO may expose stale data in the tail of
|
||||
* the block we haven't written data to.
|
||||
*/
|
||||
bio_put(bio);
|
||||
return copied ? copied : ret;
|
||||
goto zero_tail;
|
||||
}
|
||||
|
||||
n = bio->bi_iter.bi_size;
|
||||
|
@ -1676,13 +1688,21 @@ iomap_dio_bio_actor(struct inode *inode, loff_t pos, loff_t length,
|
|||
dio->submit.cookie = submit_bio(bio);
|
||||
} while (nr_pages);
|
||||
|
||||
if (need_zeroout) {
|
||||
/*
|
||||
* We need to zeroout the tail of a sub-block write if the extent type
|
||||
* requires zeroing or the write extends beyond EOF. If we don't zero
|
||||
* the block tail in the latter case, we can expose stale data via mmap
|
||||
* reads of the EOF block.
|
||||
*/
|
||||
zero_tail:
|
||||
if (need_zeroout ||
|
||||
((dio->flags & IOMAP_DIO_WRITE) && pos >= i_size_read(inode))) {
|
||||
/* zero out from the end of the write to the end of the block */
|
||||
pad = pos & (fs_block_size - 1);
|
||||
if (pad)
|
||||
iomap_dio_zero(dio, iomap, pos, fs_block_size - pad);
|
||||
}
|
||||
return copied;
|
||||
return copied ? copied : ret;
|
||||
}
|
||||
|
||||
static loff_t
|
||||
|
@ -1857,6 +1877,15 @@ iomap_dio_rw(struct kiocb *iocb, struct iov_iter *iter,
|
|||
dio->wait_for_completion = true;
|
||||
ret = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Splicing to pipes can fail on a full pipe. We have to
|
||||
* swallow this to make it look like a short IO
|
||||
* otherwise the higher splice layers will completely
|
||||
* mishandle the error and stop moving data.
|
||||
*/
|
||||
if (ret == -EFAULT)
|
||||
ret = 0;
|
||||
break;
|
||||
}
|
||||
pos += ret;
|
||||
|
|
|
@ -2094,17 +2094,18 @@ int vfs_dedupe_file_range(struct file *file, struct file_dedupe_range *same)
|
|||
off = same->src_offset;
|
||||
len = same->src_length;
|
||||
|
||||
ret = -EISDIR;
|
||||
if (S_ISDIR(src->i_mode))
|
||||
goto out;
|
||||
return -EISDIR;
|
||||
|
||||
ret = -EINVAL;
|
||||
if (!S_ISREG(src->i_mode))
|
||||
goto out;
|
||||
return -EINVAL;
|
||||
|
||||
if (!file->f_op->remap_file_range)
|
||||
return -EOPNOTSUPP;
|
||||
|
||||
ret = remap_verify_area(file, off, len, false);
|
||||
if (ret < 0)
|
||||
goto out;
|
||||
return ret;
|
||||
ret = 0;
|
||||
|
||||
if (off + len > i_size_read(src))
|
||||
|
@ -2147,10 +2148,8 @@ next_fdput:
|
|||
fdput(dst_fd);
|
||||
next_loop:
|
||||
if (fatal_signal_pending(current))
|
||||
goto out;
|
||||
break;
|
||||
}
|
||||
|
||||
out:
|
||||
return ret;
|
||||
}
|
||||
EXPORT_SYMBOL(vfs_dedupe_file_range);
|
||||
|
|
|
@ -1694,10 +1694,13 @@ xfs_bmap_add_extent_delay_real(
|
|||
case BMAP_LEFT_FILLING | BMAP_RIGHT_FILLING | BMAP_RIGHT_CONTIG:
|
||||
/*
|
||||
* Filling in all of a previously delayed allocation extent.
|
||||
* The right neighbor is contiguous, the left is not.
|
||||
* The right neighbor is contiguous, the left is not. Take care
|
||||
* with delay -> unwritten extent allocation here because the
|
||||
* delalloc record we are overwriting is always written.
|
||||
*/
|
||||
PREV.br_startblock = new->br_startblock;
|
||||
PREV.br_blockcount += RIGHT.br_blockcount;
|
||||
PREV.br_state = new->br_state;
|
||||
|
||||
xfs_iext_next(ifp, &bma->icur);
|
||||
xfs_iext_remove(bma->ip, &bma->icur, state);
|
||||
|
|
|
@ -538,15 +538,18 @@ xfs_inobt_rec_check_count(
|
|||
|
||||
static xfs_extlen_t
|
||||
xfs_inobt_max_size(
|
||||
struct xfs_mount *mp)
|
||||
struct xfs_mount *mp,
|
||||
xfs_agnumber_t agno)
|
||||
{
|
||||
xfs_agblock_t agblocks = xfs_ag_block_count(mp, agno);
|
||||
|
||||
/* Bail out if we're uninitialized, which can happen in mkfs. */
|
||||
if (mp->m_inobt_mxr[0] == 0)
|
||||
return 0;
|
||||
|
||||
return xfs_btree_calc_size(mp->m_inobt_mnr,
|
||||
(uint64_t)mp->m_sb.sb_agblocks * mp->m_sb.sb_inopblock /
|
||||
XFS_INODES_PER_CHUNK);
|
||||
(uint64_t)agblocks * mp->m_sb.sb_inopblock /
|
||||
XFS_INODES_PER_CHUNK);
|
||||
}
|
||||
|
||||
static int
|
||||
|
@ -594,7 +597,7 @@ xfs_finobt_calc_reserves(
|
|||
if (error)
|
||||
return error;
|
||||
|
||||
*ask += xfs_inobt_max_size(mp);
|
||||
*ask += xfs_inobt_max_size(mp, agno);
|
||||
*used += tree_len;
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -1042,7 +1042,7 @@ out_trans_cancel:
|
|||
goto out_unlock;
|
||||
}
|
||||
|
||||
static int
|
||||
int
|
||||
xfs_flush_unmap_range(
|
||||
struct xfs_inode *ip,
|
||||
xfs_off_t offset,
|
||||
|
@ -1195,13 +1195,7 @@ xfs_prepare_shift(
|
|||
* Writeback and invalidate cache for the remainder of the file as we're
|
||||
* about to shift down every extent from offset to EOF.
|
||||
*/
|
||||
error = filemap_write_and_wait_range(VFS_I(ip)->i_mapping, offset, -1);
|
||||
if (error)
|
||||
return error;
|
||||
error = invalidate_inode_pages2_range(VFS_I(ip)->i_mapping,
|
||||
offset >> PAGE_SHIFT, -1);
|
||||
if (error)
|
||||
return error;
|
||||
error = xfs_flush_unmap_range(ip, offset, XFS_ISIZE(ip));
|
||||
|
||||
/*
|
||||
* Clean out anything hanging around in the cow fork now that
|
||||
|
|
|
@ -80,4 +80,7 @@ int xfs_bmap_count_blocks(struct xfs_trans *tp, struct xfs_inode *ip,
|
|||
int whichfork, xfs_extnum_t *nextents,
|
||||
xfs_filblks_t *count);
|
||||
|
||||
int xfs_flush_unmap_range(struct xfs_inode *ip, xfs_off_t offset,
|
||||
xfs_off_t len);
|
||||
|
||||
#endif /* __XFS_BMAP_UTIL_H__ */
|
||||
|
|
|
@ -1233,9 +1233,23 @@ xfs_buf_iodone(
|
|||
}
|
||||
|
||||
/*
|
||||
* Requeue a failed buffer for writeback
|
||||
* Requeue a failed buffer for writeback.
|
||||
*
|
||||
* Return true if the buffer has been re-queued properly, false otherwise
|
||||
* We clear the log item failed state here as well, but we have to be careful
|
||||
* about reference counts because the only active reference counts on the buffer
|
||||
* may be the failed log items. Hence if we clear the log item failed state
|
||||
* before queuing the buffer for IO we can release all active references to
|
||||
* the buffer and free it, leading to use after free problems in
|
||||
* xfs_buf_delwri_queue. It makes no difference to the buffer or log items which
|
||||
* order we process them in - the buffer is locked, and we own the buffer list
|
||||
* so nothing on them is going to change while we are performing this action.
|
||||
*
|
||||
* Hence we can safely queue the buffer for IO before we clear the failed log
|
||||
* item state, therefore always having an active reference to the buffer and
|
||||
* avoiding the transient zero-reference state that leads to use-after-free.
|
||||
*
|
||||
* Return true if the buffer was added to the buffer list, false if it was
|
||||
* already on the buffer list.
|
||||
*/
|
||||
bool
|
||||
xfs_buf_resubmit_failed_buffers(
|
||||
|
@ -1243,16 +1257,16 @@ xfs_buf_resubmit_failed_buffers(
|
|||
struct list_head *buffer_list)
|
||||
{
|
||||
struct xfs_log_item *lip;
|
||||
bool ret;
|
||||
|
||||
ret = xfs_buf_delwri_queue(bp, buffer_list);
|
||||
|
||||
/*
|
||||
* Clear XFS_LI_FAILED flag from all items before resubmit
|
||||
*
|
||||
* XFS_LI_FAILED set/clear is protected by ail_lock, caller this
|
||||
* XFS_LI_FAILED set/clear is protected by ail_lock, caller of this
|
||||
* function already have it acquired
|
||||
*/
|
||||
list_for_each_entry(lip, &bp->b_li_list, li_bio_list)
|
||||
xfs_clear_li_failed(lip);
|
||||
|
||||
/* Add this buffer back to the delayed write list */
|
||||
return xfs_buf_delwri_queue(bp, buffer_list);
|
||||
return ret;
|
||||
}
|
||||
|
|
|
@ -920,7 +920,7 @@ out_unlock:
|
|||
}
|
||||
|
||||
|
||||
loff_t
|
||||
STATIC loff_t
|
||||
xfs_file_remap_range(
|
||||
struct file *file_in,
|
||||
loff_t pos_in,
|
||||
|
|
|
@ -296,6 +296,7 @@ xfs_reflink_reserve_cow(
|
|||
if (error)
|
||||
return error;
|
||||
|
||||
xfs_trim_extent(imap, got.br_startoff, got.br_blockcount);
|
||||
trace_xfs_reflink_cow_alloc(ip, &got);
|
||||
return 0;
|
||||
}
|
||||
|
@ -1351,10 +1352,19 @@ xfs_reflink_remap_prep(
|
|||
if (ret)
|
||||
goto out_unlock;
|
||||
|
||||
/* Zap any page cache for the destination file's range. */
|
||||
truncate_inode_pages_range(&inode_out->i_data,
|
||||
round_down(pos_out, PAGE_SIZE),
|
||||
round_up(pos_out + *len, PAGE_SIZE) - 1);
|
||||
/*
|
||||
* If pos_out > EOF, we may have dirtied blocks between EOF and
|
||||
* pos_out. In that case, we need to extend the flush and unmap to cover
|
||||
* from EOF to the end of the copy length.
|
||||
*/
|
||||
if (pos_out > XFS_ISIZE(dest)) {
|
||||
loff_t flen = *len + (pos_out - XFS_ISIZE(dest));
|
||||
ret = xfs_flush_unmap_range(dest, XFS_ISIZE(dest), flen);
|
||||
} else {
|
||||
ret = xfs_flush_unmap_range(dest, pos_out, *len);
|
||||
}
|
||||
if (ret)
|
||||
goto out_unlock;
|
||||
|
||||
return 1;
|
||||
out_unlock:
|
||||
|
|
|
@ -280,7 +280,10 @@ DECLARE_EVENT_CLASS(xfs_buf_class,
|
|||
),
|
||||
TP_fast_assign(
|
||||
__entry->dev = bp->b_target->bt_dev;
|
||||
__entry->bno = bp->b_bn;
|
||||
if (bp->b_bn == XFS_BUF_DADDR_NULL)
|
||||
__entry->bno = bp->b_maps[0].bm_bn;
|
||||
else
|
||||
__entry->bno = bp->b_bn;
|
||||
__entry->nblks = bp->b_length;
|
||||
__entry->hold = atomic_read(&bp->b_hold);
|
||||
__entry->pincount = atomic_read(&bp->b_pin_count);
|
||||
|
|
Loading…
Reference in New Issue