// SPDX-License-Identifier: GPL-2.0
/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * Copyright (c) 2013 Red Hat, Inc.
 * All Rights Reserved.
 */
|
|
|
|
#include "xfs.h"
|
|
|
|
#include "xfs_fs.h"
|
2019-06-29 10:25:35 +08:00
|
|
|
#include "xfs_shared.h"
|
2013-10-23 07:51:50 +08:00
|
|
|
#include "xfs_format.h"
|
2013-10-23 07:50:10 +08:00
|
|
|
#include "xfs_log_format.h"
|
|
|
|
#include "xfs_trans_resv.h"
|
2013-08-12 18:49:36 +08:00
|
|
|
#include "xfs_mount.h"
|
|
|
|
#include "xfs_inode.h"
|
2013-08-12 18:49:37 +08:00
|
|
|
#include "xfs_dir2.h"
|
2013-08-12 18:49:36 +08:00
|
|
|
#include "xfs_dir2_priv.h"
|
|
|
|
#include "xfs_trace.h"
|
|
|
|
#include "xfs_bmap.h"
|
2013-10-23 07:50:10 +08:00
|
|
|
#include "xfs_trans.h"
|
2013-08-12 18:49:36 +08:00
|
|
|
|
2013-08-12 18:50:09 +08:00
|
|
|
/*
 * Directory file type support functions
 */
|
|
|
|
static unsigned char xfs_dir3_filetype_table[] = {
|
|
|
|
DT_UNKNOWN, DT_REG, DT_DIR, DT_CHR, DT_BLK,
|
|
|
|
DT_FIFO, DT_SOCK, DT_LNK, DT_WHT,
|
|
|
|
};
|
|
|
|
|
2017-10-18 12:37:44 +08:00
|
|
|
unsigned char
|
2013-08-12 18:50:09 +08:00
|
|
|
xfs_dir3_get_dtype(
|
|
|
|
struct xfs_mount *mp,
|
2017-06-17 02:00:05 +08:00
|
|
|
uint8_t filetype)
|
2013-08-12 18:50:09 +08:00
|
|
|
{
|
|
|
|
if (!xfs_sb_version_hasftype(&mp->m_sb))
|
|
|
|
return DT_UNKNOWN;
|
|
|
|
|
|
|
|
if (filetype >= XFS_DIR3_FT_MAX)
|
|
|
|
return DT_UNKNOWN;
|
|
|
|
|
|
|
|
return xfs_dir3_filetype_table[filetype];
|
|
|
|
}
|
|
|
|
|
2013-08-12 18:49:36 +08:00
|
|
|
STATIC int
|
|
|
|
xfs_dir2_sf_getdents(
|
2014-06-06 13:20:32 +08:00
|
|
|
struct xfs_da_args *args,
|
2013-08-12 18:49:36 +08:00
|
|
|
struct dir_context *ctx)
|
|
|
|
{
|
|
|
|
int i; /* shortform entry number */
|
2014-06-06 13:20:32 +08:00
|
|
|
struct xfs_inode *dp = args->dp; /* incore directory inode */
|
2013-08-12 18:49:36 +08:00
|
|
|
xfs_dir2_dataptr_t off; /* current entry's offset */
|
|
|
|
xfs_dir2_sf_entry_t *sfep; /* shortform directory entry */
|
|
|
|
xfs_dir2_sf_hdr_t *sfp; /* shortform structure */
|
|
|
|
xfs_dir2_dataptr_t dot_offset;
|
|
|
|
xfs_dir2_dataptr_t dotdot_offset;
|
|
|
|
xfs_ino_t ino;
|
2014-06-06 13:20:32 +08:00
|
|
|
struct xfs_da_geometry *geo = args->geo;
|
2013-08-12 18:49:36 +08:00
|
|
|
|
|
|
|
ASSERT(dp->i_df.if_flags & XFS_IFINLINE);
|
|
|
|
ASSERT(dp->i_df.if_bytes == dp->i_d.di_size);
|
|
|
|
ASSERT(dp->i_df.if_u1.if_data != NULL);
|
|
|
|
|
|
|
|
sfp = (xfs_dir2_sf_hdr_t *)dp->i_df.if_u1.if_data;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* If the block number in the offset is out of range, we're done.
|
|
|
|
*/
|
2014-06-06 13:11:18 +08:00
|
|
|
if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk)
|
2013-08-12 18:49:36 +08:00
|
|
|
return 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Precalculate offsets for . and .. as we will always need them.
|
|
|
|
*
|
|
|
|
* XXX(hch): the second argument is sometimes 0 and sometimes
|
2014-06-06 13:11:18 +08:00
|
|
|
* geo->datablk
|
2013-08-12 18:49:36 +08:00
|
|
|
*/
|
2014-06-06 13:11:18 +08:00
|
|
|
dot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
|
2013-10-30 06:15:02 +08:00
|
|
|
dp->d_ops->data_dot_offset);
|
2014-06-06 13:11:18 +08:00
|
|
|
dotdot_offset = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
|
2013-10-30 06:15:02 +08:00
|
|
|
dp->d_ops->data_dotdot_offset);
|
2013-08-12 18:49:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Put . entry unless we're starting past it.
|
|
|
|
*/
|
|
|
|
if (ctx->pos <= dot_offset) {
|
|
|
|
ctx->pos = dot_offset & 0x7fffffff;
|
|
|
|
if (!dir_emit(ctx, ".", 1, dp->i_ino, DT_DIR))
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Put .. entry unless we're starting past it.
|
|
|
|
*/
|
|
|
|
if (ctx->pos <= dotdot_offset) {
|
2013-10-29 19:11:47 +08:00
|
|
|
ino = dp->d_ops->sf_get_parent_ino(sfp);
|
2013-08-12 18:49:36 +08:00
|
|
|
ctx->pos = dotdot_offset & 0x7fffffff;
|
|
|
|
if (!dir_emit(ctx, "..", 2, ino, DT_DIR))
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Loop while there are more entries and put'ing works.
|
|
|
|
*/
|
|
|
|
sfep = xfs_dir2_sf_firstentry(sfp);
|
|
|
|
for (i = 0; i < sfp->count; i++) {
|
2017-06-17 02:00:05 +08:00
|
|
|
uint8_t filetype;
|
2013-08-12 18:50:09 +08:00
|
|
|
|
2014-06-06 13:11:18 +08:00
|
|
|
off = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
|
2013-08-12 18:49:36 +08:00
|
|
|
xfs_dir2_sf_get_offset(sfep));
|
|
|
|
|
|
|
|
if (ctx->pos > off) {
|
2013-10-29 19:11:46 +08:00
|
|
|
sfep = dp->d_ops->sf_nextentry(sfp, sfep);
|
2013-08-12 18:49:36 +08:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
2013-10-29 19:11:47 +08:00
|
|
|
ino = dp->d_ops->sf_get_ino(sfp, sfep);
|
|
|
|
filetype = dp->d_ops->sf_get_ftype(sfep);
|
2013-08-12 18:49:36 +08:00
|
|
|
ctx->pos = off & 0x7fffffff;
|
2013-08-12 18:50:09 +08:00
|
|
|
if (!dir_emit(ctx, (char *)sfep->name, sfep->namelen, ino,
|
2014-06-06 13:20:32 +08:00
|
|
|
xfs_dir3_get_dtype(dp->i_mount, filetype)))
|
2013-08-12 18:49:36 +08:00
|
|
|
return 0;
|
2013-10-29 19:11:46 +08:00
|
|
|
sfep = dp->d_ops->sf_nextentry(sfp, sfep);
|
2013-08-12 18:49:36 +08:00
|
|
|
}
|
|
|
|
|
2014-06-06 13:11:18 +08:00
|
|
|
ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) &
|
2014-06-06 13:20:32 +08:00
|
|
|
0x7fffffff;
|
2013-08-12 18:49:36 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Readdir for block directories.
|
|
|
|
*/
|
|
|
|
STATIC int
|
|
|
|
xfs_dir2_block_getdents(
|
2014-06-06 13:20:32 +08:00
|
|
|
struct xfs_da_args *args,
|
2013-08-12 18:49:36 +08:00
|
|
|
struct dir_context *ctx)
|
|
|
|
{
|
2014-06-06 13:20:32 +08:00
|
|
|
struct xfs_inode *dp = args->dp; /* incore directory inode */
|
2013-08-12 18:49:36 +08:00
|
|
|
xfs_dir2_data_hdr_t *hdr; /* block header */
|
|
|
|
struct xfs_buf *bp; /* buffer for block */
|
|
|
|
xfs_dir2_data_entry_t *dep; /* block data entry */
|
|
|
|
xfs_dir2_data_unused_t *dup; /* block unused entry */
|
|
|
|
char *endptr; /* end of the data entries */
|
|
|
|
int error; /* error return value */
|
|
|
|
char *ptr; /* current data entry */
|
|
|
|
int wantoff; /* starting block offset */
|
|
|
|
xfs_off_t cook;
|
2014-06-06 13:20:32 +08:00
|
|
|
struct xfs_da_geometry *geo = args->geo;
|
xfs: stop holding ILOCK over filldir callbacks
The recent change to the readdir locking made in 40194ec ("xfs:
reinstate the ilock in xfs_readdir") for CXFS directory sanity was
probably the wrong thing to do. Deep in the readdir code we
can take page faults in the filldir callback, and so taking a page
fault while holding an inode ilock creates a new set of locking
issues that lockdep warns all over the place about.
The locking order for regular inodes w.r.t. page faults is io_lock
-> pagefault -> mmap_sem -> ilock. The directory readdir code now
triggers ilock -> page fault -> mmap_sem. While we cannot deadlock
at this point, it inverts all the locking patterns that lockdep
normally sees on XFS inodes, and so triggers lockdep. We worked
around this with commit 93a8614 ("xfs: fix directory inode iolock
lockdep false positive"), but that then just moved the lockdep
warning to deeper in the page fault path and triggered on security
inode locks. Fixing the shmem issue there just moved the lockdep
reports somewhere else, and now we are getting false positives from
filesystem freezing annotations getting confused.
Further, if we enter memory reclaim in a readdir path, we now get
lockdep warning about potential deadlocks because the ilock is held
when we enter reclaim. This, again, is different to a regular file
in that we never allow memory reclaim to run while holding the ilock
for regular files. Hence lockdep now throws
ilock->kmalloc->reclaim->ilock warnings.
Basically, the problem is that the ilock is being used to protect
the directory data and the inode metadata, whereas for a regular
file the iolock protects the data and the ilock protects the
metadata. From the VFS perspective, the i_mutex serialises all
accesses to the directory data, and so not holding the ilock for
readdir doesn't matter. The issue is that CXFS doesn't access
directory data via the VFS, so it has no "data serialisaton"
mechanism. Hence we need to hold the IOLOCK in the correct places to
provide this low level directory data access serialisation.
The ilock can then be used just when the extent list needs to be
read, just like we do for regular files. The directory modification
code can take the iolock exclusive when the ilock is also taken,
and this then ensures that readdir is correct excluded while
modifications are in progress.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2015-08-19 08:33:00 +08:00
|
|
|
int lock_mode;
|
2013-08-12 18:49:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If the block number in the offset is out of range, we're done.
|
|
|
|
*/
|
2014-06-06 13:11:18 +08:00
|
|
|
if (xfs_dir2_dataptr_to_db(geo, ctx->pos) > geo->datablk)
|
2013-08-12 18:49:36 +08:00
|
|
|
return 0;
|
|
|
|
|
xfs: stop holding ILOCK over filldir callbacks
The recent change to the readdir locking made in 40194ec ("xfs:
reinstate the ilock in xfs_readdir") for CXFS directory sanity was
probably the wrong thing to do. Deep in the readdir code we
can take page faults in the filldir callback, and so taking a page
fault while holding an inode ilock creates a new set of locking
issues that lockdep warns all over the place about.
The locking order for regular inodes w.r.t. page faults is io_lock
-> pagefault -> mmap_sem -> ilock. The directory readdir code now
triggers ilock -> page fault -> mmap_sem. While we cannot deadlock
at this point, it inverts all the locking patterns that lockdep
normally sees on XFS inodes, and so triggers lockdep. We worked
around this with commit 93a8614 ("xfs: fix directory inode iolock
lockdep false positive"), but that then just moved the lockdep
warning to deeper in the page fault path and triggered on security
inode locks. Fixing the shmem issue there just moved the lockdep
reports somewhere else, and now we are getting false positives from
filesystem freezing annotations getting confused.
Further, if we enter memory reclaim in a readdir path, we now get
lockdep warning about potential deadlocks because the ilock is held
when we enter reclaim. This, again, is different to a regular file
in that we never allow memory reclaim to run while holding the ilock
for regular files. Hence lockdep now throws
ilock->kmalloc->reclaim->ilock warnings.
Basically, the problem is that the ilock is being used to protect
the directory data and the inode metadata, whereas for a regular
file the iolock protects the data and the ilock protects the
metadata. From the VFS perspective, the i_mutex serialises all
accesses to the directory data, and so not holding the ilock for
readdir doesn't matter. The issue is that CXFS doesn't access
directory data via the VFS, so it has no "data serialisaton"
mechanism. Hence we need to hold the IOLOCK in the correct places to
provide this low level directory data access serialisation.
The ilock can then be used just when the extent list needs to be
read, just like we do for regular files. The directory modification
code can take the iolock exclusive when the ilock is also taken,
and this then ensures that readdir is correct excluded while
modifications are in progress.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2015-08-19 08:33:00 +08:00
|
|
|
lock_mode = xfs_ilock_data_map_shared(dp);
|
2017-06-17 02:00:14 +08:00
|
|
|
error = xfs_dir3_block_read(args->trans, dp, &bp);
|
xfs: stop holding ILOCK over filldir callbacks
The recent change to the readdir locking made in 40194ec ("xfs:
reinstate the ilock in xfs_readdir") for CXFS directory sanity was
probably the wrong thing to do. Deep in the readdir code we
can take page faults in the filldir callback, and so taking a page
fault while holding an inode ilock creates a new set of locking
issues that lockdep warns all over the place about.
The locking order for regular inodes w.r.t. page faults is io_lock
-> pagefault -> mmap_sem -> ilock. The directory readdir code now
triggers ilock -> page fault -> mmap_sem. While we cannot deadlock
at this point, it inverts all the locking patterns that lockdep
normally sees on XFS inodes, and so triggers lockdep. We worked
around this with commit 93a8614 ("xfs: fix directory inode iolock
lockdep false positive"), but that then just moved the lockdep
warning to deeper in the page fault path and triggered on security
inode locks. Fixing the shmem issue there just moved the lockdep
reports somewhere else, and now we are getting false positives from
filesystem freezing annotations getting confused.
Further, if we enter memory reclaim in a readdir path, we now get
lockdep warning about potential deadlocks because the ilock is held
when we enter reclaim. This, again, is different to a regular file
in that we never allow memory reclaim to run while holding the ilock
for regular files. Hence lockdep now throws
ilock->kmalloc->reclaim->ilock warnings.
Basically, the problem is that the ilock is being used to protect
the directory data and the inode metadata, whereas for a regular
file the iolock protects the data and the ilock protects the
metadata. From the VFS perspective, the i_mutex serialises all
accesses to the directory data, and so not holding the ilock for
readdir doesn't matter. The issue is that CXFS doesn't access
directory data via the VFS, so it has no "data serialisaton"
mechanism. Hence we need to hold the IOLOCK in the correct places to
provide this low level directory data access serialisation.
The ilock can then be used just when the extent list needs to be
read, just like we do for regular files. The directory modification
code can take the iolock exclusive when the ilock is also taken,
and this then ensures that readdir is correct excluded while
modifications are in progress.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2015-08-19 08:33:00 +08:00
|
|
|
xfs_iunlock(dp, lock_mode);
|
2013-08-12 18:49:36 +08:00
|
|
|
if (error)
|
|
|
|
return error;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Extract the byte offset we start at from the seek pointer.
|
|
|
|
* We'll skip entries before this.
|
|
|
|
*/
|
2014-06-06 13:08:18 +08:00
|
|
|
wantoff = xfs_dir2_dataptr_to_off(geo, ctx->pos);
|
2013-08-12 18:49:36 +08:00
|
|
|
hdr = bp->b_addr;
|
|
|
|
xfs_dir3_data_check(dp, bp);
|
|
|
|
/*
|
|
|
|
* Set up values for the loop.
|
|
|
|
*/
|
2013-10-29 19:11:49 +08:00
|
|
|
ptr = (char *)dp->d_ops->data_entry_p(hdr);
|
2018-01-17 10:54:12 +08:00
|
|
|
endptr = xfs_dir3_data_endp(geo, hdr);
|
2013-08-12 18:49:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Loop over the data portion of the block.
|
|
|
|
* Each object is a real entry (dep) or an unused one (dup).
|
|
|
|
*/
|
|
|
|
while (ptr < endptr) {
|
2017-06-17 02:00:05 +08:00
|
|
|
uint8_t filetype;
|
2013-08-12 18:50:09 +08:00
|
|
|
|
2013-08-12 18:49:36 +08:00
|
|
|
dup = (xfs_dir2_data_unused_t *)ptr;
|
|
|
|
/*
|
|
|
|
* Unused, skip it.
|
|
|
|
*/
|
|
|
|
if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
|
|
|
|
ptr += be16_to_cpu(dup->length);
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
dep = (xfs_dir2_data_entry_t *)ptr;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Bump pointer for the next iteration.
|
|
|
|
*/
|
2013-10-29 19:11:48 +08:00
|
|
|
ptr += dp->d_ops->data_entsize(dep->namelen);
|
2013-08-12 18:49:36 +08:00
|
|
|
/*
|
|
|
|
* The entry is before the desired starting point, skip it.
|
|
|
|
*/
|
|
|
|
if ((char *)dep - (char *)hdr < wantoff)
|
|
|
|
continue;
|
|
|
|
|
2014-06-06 13:11:18 +08:00
|
|
|
cook = xfs_dir2_db_off_to_dataptr(geo, geo->datablk,
|
2013-08-12 18:49:36 +08:00
|
|
|
(char *)dep - (char *)hdr);
|
|
|
|
|
|
|
|
ctx->pos = cook & 0x7fffffff;
|
2013-10-29 19:11:48 +08:00
|
|
|
filetype = dp->d_ops->data_get_ftype(dep);
|
2013-08-12 18:49:36 +08:00
|
|
|
/*
|
|
|
|
* If it didn't fit, set the final offset to here & return.
|
|
|
|
*/
|
|
|
|
if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
|
2013-08-12 18:50:09 +08:00
|
|
|
be64_to_cpu(dep->inumber),
|
2014-06-06 13:20:32 +08:00
|
|
|
xfs_dir3_get_dtype(dp->i_mount, filetype))) {
|
2017-06-17 02:00:14 +08:00
|
|
|
xfs_trans_brelse(args->trans, bp);
|
2013-08-12 18:49:36 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Reached the end of the block.
|
|
|
|
* Set the offset to a non-existent block 1 and return.
|
|
|
|
*/
|
2014-06-06 13:11:18 +08:00
|
|
|
ctx->pos = xfs_dir2_db_off_to_dataptr(geo, geo->datablk + 1, 0) &
|
2014-06-06 13:20:32 +08:00
|
|
|
0x7fffffff;
|
2017-06-17 02:00:14 +08:00
|
|
|
xfs_trans_brelse(args->trans, bp);
|
2013-08-12 18:49:36 +08:00
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
|
2017-06-15 12:23:05 +08:00
|
|
|
/*
|
|
|
|
* Read a directory block and initiate readahead for blocks beyond that.
|
|
|
|
* We maintain a sliding readahead window of the remaining space in the
|
|
|
|
* buffer rounded up to the nearest block.
|
|
|
|
*/
|
2013-08-12 18:49:36 +08:00
|
|
|
STATIC int
|
|
|
|
xfs_dir2_leaf_readbuf(
|
2014-06-06 13:20:32 +08:00
|
|
|
struct xfs_da_args *args,
|
2013-08-12 18:49:36 +08:00
|
|
|
size_t bufsize,
|
2017-06-15 12:23:05 +08:00
|
|
|
xfs_dir2_off_t *cur_off,
|
|
|
|
xfs_dablk_t *ra_blk,
|
|
|
|
struct xfs_buf **bpp)
|
2013-08-12 18:49:36 +08:00
|
|
|
{
|
2014-06-06 13:20:32 +08:00
|
|
|
struct xfs_inode *dp = args->dp;
|
2016-05-18 22:17:26 +08:00
|
|
|
struct xfs_buf *bp = NULL;
|
2017-06-15 12:23:05 +08:00
|
|
|
struct xfs_da_geometry *geo = args->geo;
|
|
|
|
struct xfs_ifork *ifp = XFS_IFORK_PTR(dp, XFS_DATA_FORK);
|
|
|
|
struct xfs_bmbt_irec map;
|
2013-08-12 18:49:36 +08:00
|
|
|
struct blk_plug plug;
|
2017-06-15 12:23:05 +08:00
|
|
|
xfs_dir2_off_t new_off;
|
|
|
|
xfs_dablk_t next_ra;
|
|
|
|
xfs_dablk_t map_off;
|
|
|
|
xfs_dablk_t last_da;
|
2017-11-04 01:34:43 +08:00
|
|
|
struct xfs_iext_cursor icur;
|
2017-06-15 12:23:05 +08:00
|
|
|
int ra_want;
|
2013-08-12 18:49:36 +08:00
|
|
|
int error = 0;
|
|
|
|
|
2017-06-15 12:23:05 +08:00
|
|
|
if (!(ifp->if_flags & XFS_IFEXTENTS)) {
|
|
|
|
error = xfs_iread_extents(args->trans, dp, XFS_DATA_FORK);
|
2013-08-12 18:49:36 +08:00
|
|
|
if (error)
|
2017-06-15 12:23:05 +08:00
|
|
|
goto out;
|
2013-08-12 18:49:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
2017-06-15 12:23:05 +08:00
|
|
|
* Look for mapped directory blocks at or above the current offset.
|
|
|
|
* Truncate down to the nearest directory block to start the scanning
|
|
|
|
* operation.
|
2013-08-12 18:49:36 +08:00
|
|
|
*/
|
2017-06-15 12:23:05 +08:00
|
|
|
last_da = xfs_dir2_byte_to_da(geo, XFS_DIR2_LEAF_OFFSET);
|
|
|
|
map_off = xfs_dir2_db_to_da(geo, xfs_dir2_byte_to_db(geo, *cur_off));
|
2017-11-04 01:34:43 +08:00
|
|
|
if (!xfs_iext_lookup_extent(dp, ifp, map_off, &icur, &map))
|
2013-08-12 18:49:36 +08:00
|
|
|
goto out;
|
2017-06-15 12:23:05 +08:00
|
|
|
if (map.br_startoff >= last_da)
|
|
|
|
goto out;
|
|
|
|
xfs_trim_extent(&map, map_off, last_da - map_off);
|
2013-08-12 18:49:36 +08:00
|
|
|
|
2017-06-15 12:23:05 +08:00
|
|
|
/* Read the directory block of that first mapping. */
|
|
|
|
new_off = xfs_dir2_da_to_byte(geo, map.br_startoff);
|
|
|
|
if (new_off > *cur_off)
|
|
|
|
*cur_off = new_off;
|
|
|
|
error = xfs_dir3_data_read(args->trans, dp, map.br_startoff, -1, &bp);
|
2013-08-12 18:49:36 +08:00
|
|
|
if (error)
|
2017-06-15 12:23:05 +08:00
|
|
|
goto out;
|
2013-08-12 18:49:36 +08:00
|
|
|
|
|
|
|
/*
|
2017-06-15 12:23:05 +08:00
|
|
|
* Start readahead for the next bufsize's worth of dir data blocks.
|
|
|
|
* We may have already issued readahead for some of that range;
|
|
|
|
* ra_blk tracks the last block we tried to read(ahead).
|
2013-08-12 18:49:36 +08:00
|
|
|
*/
|
2017-06-15 12:23:05 +08:00
|
|
|
ra_want = howmany(bufsize + geo->blksize, (1 << geo->fsblog));
|
|
|
|
if (*ra_blk >= last_da)
|
|
|
|
goto out;
|
|
|
|
else if (*ra_blk == 0)
|
|
|
|
*ra_blk = map.br_startoff;
|
|
|
|
next_ra = map.br_startoff + geo->fsbcount;
|
|
|
|
if (next_ra >= last_da)
|
|
|
|
goto out_no_ra;
|
|
|
|
if (map.br_blockcount < geo->fsbcount &&
|
2017-11-04 01:34:43 +08:00
|
|
|
!xfs_iext_next_extent(ifp, &icur, &map))
|
2017-06-15 12:23:05 +08:00
|
|
|
goto out_no_ra;
|
|
|
|
if (map.br_startoff >= last_da)
|
|
|
|
goto out_no_ra;
|
|
|
|
xfs_trim_extent(&map, next_ra, last_da - next_ra);
|
|
|
|
|
|
|
|
/* Start ra for each dir (not fs) block that has a mapping. */
|
2013-08-12 18:49:36 +08:00
|
|
|
blk_start_plug(&plug);
|
2017-06-15 12:23:05 +08:00
|
|
|
while (ra_want > 0) {
|
|
|
|
next_ra = roundup((xfs_dablk_t)map.br_startoff, geo->fsbcount);
|
|
|
|
while (ra_want > 0 &&
|
|
|
|
next_ra < map.br_startoff + map.br_blockcount) {
|
|
|
|
if (next_ra >= last_da) {
|
|
|
|
*ra_blk = last_da;
|
|
|
|
break;
|
|
|
|
}
|
|
|
|
if (next_ra > *ra_blk) {
|
|
|
|
xfs_dir3_data_readahead(dp, next_ra, -2);
|
|
|
|
*ra_blk = next_ra;
|
2013-08-12 18:49:36 +08:00
|
|
|
}
|
2017-06-15 12:23:05 +08:00
|
|
|
ra_want -= geo->fsbcount;
|
|
|
|
next_ra += geo->fsbcount;
|
|
|
|
}
|
2017-11-04 01:34:43 +08:00
|
|
|
if (!xfs_iext_next_extent(ifp, &icur, &map)) {
|
2017-06-15 12:23:05 +08:00
|
|
|
*ra_blk = last_da;
|
|
|
|
break;
|
2013-08-12 18:49:36 +08:00
|
|
|
}
|
|
|
|
}
|
|
|
|
blk_finish_plug(&plug);
|
|
|
|
|
|
|
|
out:
|
|
|
|
*bpp = bp;
|
|
|
|
return error;
|
2017-06-15 12:23:05 +08:00
|
|
|
out_no_ra:
|
|
|
|
*ra_blk = last_da;
|
|
|
|
goto out;
|
2013-08-12 18:49:36 +08:00
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Getdents (readdir) for leaf and node directories.
|
|
|
|
* This reads the data blocks only, so is the same for both forms.
|
|
|
|
*/
|
|
|
|
STATIC int
|
|
|
|
xfs_dir2_leaf_getdents(
|
2014-06-06 13:20:32 +08:00
|
|
|
struct xfs_da_args *args,
|
2013-08-12 18:49:36 +08:00
|
|
|
struct dir_context *ctx,
|
|
|
|
size_t bufsize)
|
|
|
|
{
|
2014-06-06 13:20:32 +08:00
|
|
|
struct xfs_inode *dp = args->dp;
|
2013-08-12 18:49:36 +08:00
|
|
|
struct xfs_buf *bp = NULL; /* data block buffer */
|
|
|
|
xfs_dir2_data_hdr_t *hdr; /* data block header */
|
|
|
|
xfs_dir2_data_entry_t *dep; /* data entry */
|
|
|
|
xfs_dir2_data_unused_t *dup; /* unused entry */
|
|
|
|
char *ptr = NULL; /* pointer to current data */
|
2014-06-06 13:20:32 +08:00
|
|
|
struct xfs_da_geometry *geo = args->geo;
|
2017-06-15 12:23:05 +08:00
|
|
|
xfs_dablk_t rablk = 0; /* current readahead block */
|
|
|
|
xfs_dir2_off_t curoff; /* current overall offset */
|
|
|
|
int length; /* temporary length value */
|
|
|
|
int byteoff; /* offset in current block */
|
|
|
|
int lock_mode;
|
|
|
|
int error = 0; /* error return value */
|
2013-08-12 18:49:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* If the offset is at or past the largest allowed value,
|
|
|
|
* give up right away.
|
|
|
|
*/
|
|
|
|
if (ctx->pos >= XFS_DIR2_MAX_DATAPTR)
|
|
|
|
return 0;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Inside the loop we keep the main offset value as a byte offset
|
|
|
|
* in the directory file.
|
|
|
|
*/
|
2014-04-14 17:02:30 +08:00
|
|
|
curoff = xfs_dir2_dataptr_to_byte(ctx->pos);
|
2013-08-12 18:49:36 +08:00
|
|
|
|
|
|
|
/*
|
|
|
|
* Loop over directory entries until we reach the end offset.
|
|
|
|
* Get more blocks and readahead as necessary.
|
|
|
|
*/
|
|
|
|
while (curoff < XFS_DIR2_LEAF_OFFSET) {
|
2017-06-17 02:00:05 +08:00
|
|
|
uint8_t filetype;
|
2013-08-12 18:50:09 +08:00
|
|
|
|
2013-08-12 18:49:36 +08:00
|
|
|
/*
|
|
|
|
* If we have no buffer, or we're off the end of the
|
|
|
|
* current buffer, need to get another one.
|
|
|
|
*/
|
2014-06-06 13:15:59 +08:00
|
|
|
if (!bp || ptr >= (char *)bp->b_addr + geo->blksize) {
|
2016-05-18 22:17:26 +08:00
|
|
|
if (bp) {
|
2017-06-15 12:23:05 +08:00
|
|
|
xfs_trans_brelse(args->trans, bp);
|
2016-05-18 22:17:26 +08:00
|
|
|
bp = NULL;
|
|
|
|
}
|
2013-08-12 18:49:36 +08:00
|
|
|
|
xfs: stop holding ILOCK over filldir callbacks
The recent change to the readdir locking made in 40194ec ("xfs:
reinstate the ilock in xfs_readdir") for CXFS directory sanity was
probably the wrong thing to do. Deep in the readdir code we
can take page faults in the filldir callback, and so taking a page
fault while holding an inode ilock creates a new set of locking
issues that lockdep warns all over the place about.
The locking order for regular inodes w.r.t. page faults is io_lock
-> pagefault -> mmap_sem -> ilock. The directory readdir code now
triggers ilock -> page fault -> mmap_sem. While we cannot deadlock
at this point, it inverts all the locking patterns that lockdep
normally sees on XFS inodes, and so triggers lockdep. We worked
around this with commit 93a8614 ("xfs: fix directory inode iolock
lockdep false positive"), but that then just moved the lockdep
warning to deeper in the page fault path and triggered on security
inode locks. Fixing the shmem issue there just moved the lockdep
reports somewhere else, and now we are getting false positives from
filesystem freezing annotations getting confused.
Further, if we enter memory reclaim in a readdir path, we now get
lockdep warning about potential deadlocks because the ilock is held
when we enter reclaim. This, again, is different to a regular file
in that we never allow memory reclaim to run while holding the ilock
for regular files. Hence lockdep now throws
ilock->kmalloc->reclaim->ilock warnings.
Basically, the problem is that the ilock is being used to protect
the directory data and the inode metadata, whereas for a regular
file the iolock protects the data and the ilock protects the
metadata. From the VFS perspective, the i_mutex serialises all
accesses to the directory data, and so not holding the ilock for
readdir doesn't matter. The issue is that CXFS doesn't access
directory data via the VFS, so it has no "data serialisaton"
mechanism. Hence we need to hold the IOLOCK in the correct places to
provide this low level directory data access serialisation.
The ilock can then be used just when the extent list needs to be
read, just like we do for regular files. The directory modification
code can take the iolock exclusive when the ilock is also taken,
and this then ensures that readdir is correct excluded while
modifications are in progress.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2015-08-19 08:33:00 +08:00
|
|
|
lock_mode = xfs_ilock_data_map_shared(dp);
|
2017-06-15 12:23:05 +08:00
|
|
|
error = xfs_dir2_leaf_readbuf(args, bufsize, &curoff,
|
|
|
|
&rablk, &bp);
|
xfs: stop holding ILOCK over filldir callbacks
The recent change to the readdir locking made in 40194ec ("xfs:
reinstate the ilock in xfs_readdir") for CXFS directory sanity was
probably the wrong thing to do. Deep in the readdir code we
can take page faults in the filldir callback, and so taking a page
fault while holding an inode ilock creates a new set of locking
issues that lockdep warns all over the place about.
The locking order for regular inodes w.r.t. page faults is io_lock
-> pagefault -> mmap_sem -> ilock. The directory readdir code now
triggers ilock -> page fault -> mmap_sem. While we cannot deadlock
at this point, it inverts all the locking patterns that lockdep
normally sees on XFS inodes, and so triggers lockdep. We worked
around this with commit 93a8614 ("xfs: fix directory inode iolock
lockdep false positive"), but that then just moved the lockdep
warning to deeper in the page fault path and triggered on security
inode locks. Fixing the shmem issue there just moved the lockdep
reports somewhere else, and now we are getting false positives from
filesystem freezing annotations getting confused.
Further, if we enter memory reclaim in a readdir path, we now get
lockdep warning about potential deadlocks because the ilock is held
when we enter reclaim. This, again, is different to a regular file
in that we never allow memory reclaim to run while holding the ilock
for regular files. Hence lockdep now throws
ilock->kmalloc->reclaim->ilock warnings.
Basically, the problem is that the ilock is being used to protect
the directory data and the inode metadata, whereas for a regular
file the iolock protects the data and the ilock protects the
metadata. From the VFS perspective, the i_mutex serialises all
accesses to the directory data, and so not holding the ilock for
readdir doesn't matter. The issue is that CXFS doesn't access
directory data via the VFS, so it has no "data serialisaton"
mechanism. Hence we need to hold the IOLOCK in the correct places to
provide this low level directory data access serialisation.
The ilock can then be used just when the extent list needs to be
read, just like we do for regular files. The directory modification
code can take the iolock exclusive when the ilock is also taken,
and this then ensures that readdir is correct excluded while
modifications are in progress.
Signed-off-by: Dave Chinner <dchinner@redhat.com>
Reviewed-by: Brian Foster <bfoster@redhat.com>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2015-08-19 08:33:00 +08:00
|
|
|
xfs_iunlock(dp, lock_mode);
|
2017-06-15 12:23:05 +08:00
|
|
|
if (error || !bp)
|
2013-08-12 18:49:36 +08:00
|
|
|
break;
|
|
|
|
|
|
|
|
hdr = bp->b_addr;
|
|
|
|
xfs_dir3_data_check(dp, bp);
|
|
|
|
/*
|
|
|
|
* Find our position in the block.
|
|
|
|
*/
|
2013-10-29 19:11:49 +08:00
|
|
|
ptr = (char *)dp->d_ops->data_entry_p(hdr);
|
2014-06-06 13:20:32 +08:00
|
|
|
byteoff = xfs_dir2_byte_to_off(geo, curoff);
|
2013-08-12 18:49:36 +08:00
|
|
|
/*
|
|
|
|
* Skip past the header.
|
|
|
|
*/
|
|
|
|
if (byteoff == 0)
|
2013-10-30 06:15:02 +08:00
|
|
|
curoff += dp->d_ops->data_entry_offset;
|
2013-08-12 18:49:36 +08:00
|
|
|
/*
|
|
|
|
* Skip past entries until we reach our offset.
|
|
|
|
*/
|
|
|
|
else {
|
|
|
|
while ((char *)ptr - (char *)hdr < byteoff) {
|
|
|
|
dup = (xfs_dir2_data_unused_t *)ptr;
|
|
|
|
|
|
|
|
if (be16_to_cpu(dup->freetag)
|
|
|
|
== XFS_DIR2_DATA_FREE_TAG) {
|
|
|
|
|
|
|
|
length = be16_to_cpu(dup->length);
|
|
|
|
ptr += length;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
dep = (xfs_dir2_data_entry_t *)ptr;
|
|
|
|
length =
|
2013-10-29 19:11:48 +08:00
|
|
|
dp->d_ops->data_entsize(dep->namelen);
|
2013-08-12 18:49:36 +08:00
|
|
|
ptr += length;
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* Now set our real offset.
|
|
|
|
*/
|
|
|
|
curoff =
|
2014-06-06 13:08:18 +08:00
|
|
|
xfs_dir2_db_off_to_byte(geo,
|
|
|
|
xfs_dir2_byte_to_db(geo, curoff),
|
2013-08-12 18:49:36 +08:00
|
|
|
(char *)ptr - (char *)hdr);
|
2014-06-06 13:15:59 +08:00
|
|
|
if (ptr >= (char *)hdr + geo->blksize) {
|
2013-08-12 18:49:36 +08:00
|
|
|
continue;
|
|
|
|
}
|
|
|
|
}
|
|
|
|
}
|
|
|
|
/*
|
|
|
|
* We have a pointer to an entry.
|
|
|
|
* Is it a live one?
|
|
|
|
*/
|
|
|
|
dup = (xfs_dir2_data_unused_t *)ptr;
|
|
|
|
/*
|
|
|
|
* No, it's unused, skip over it.
|
|
|
|
*/
|
|
|
|
if (be16_to_cpu(dup->freetag) == XFS_DIR2_DATA_FREE_TAG) {
|
|
|
|
length = be16_to_cpu(dup->length);
|
|
|
|
ptr += length;
|
|
|
|
curoff += length;
|
|
|
|
continue;
|
|
|
|
}
|
|
|
|
|
|
|
|
dep = (xfs_dir2_data_entry_t *)ptr;
|
2013-10-29 19:11:48 +08:00
|
|
|
length = dp->d_ops->data_entsize(dep->namelen);
|
|
|
|
filetype = dp->d_ops->data_get_ftype(dep);
|
2013-08-12 18:49:36 +08:00
|
|
|
|
2014-04-14 17:02:30 +08:00
|
|
|
ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff;
|
2013-08-12 18:49:36 +08:00
|
|
|
if (!dir_emit(ctx, (char *)dep->name, dep->namelen,
|
2013-08-12 18:50:09 +08:00
|
|
|
be64_to_cpu(dep->inumber),
|
2014-06-06 13:20:32 +08:00
|
|
|
xfs_dir3_get_dtype(dp->i_mount, filetype)))
|
2013-08-12 18:49:36 +08:00
|
|
|
break;
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Advance to next entry in the block.
|
|
|
|
*/
|
|
|
|
ptr += length;
|
|
|
|
curoff += length;
|
|
|
|
/* bufsize may have just been a guess; don't go negative */
|
|
|
|
bufsize = bufsize > length ? bufsize - length : 0;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* All done. Set output offset value to current offset.
|
|
|
|
*/
|
2014-04-14 17:02:30 +08:00
|
|
|
if (curoff > xfs_dir2_dataptr_to_byte(XFS_DIR2_MAX_DATAPTR))
|
2013-08-12 18:49:36 +08:00
|
|
|
ctx->pos = XFS_DIR2_MAX_DATAPTR & 0x7fffffff;
|
|
|
|
else
|
2014-04-14 17:02:30 +08:00
|
|
|
ctx->pos = xfs_dir2_byte_to_dataptr(curoff) & 0x7fffffff;
|
2013-08-12 18:49:36 +08:00
|
|
|
if (bp)
|
2017-06-17 02:00:14 +08:00
|
|
|
xfs_trans_brelse(args->trans, bp);
|
2013-08-12 18:49:36 +08:00
|
|
|
return error;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
|
|
|
|
* Read a directory.
|
2017-06-17 02:00:14 +08:00
|
|
|
*
|
|
|
|
* If supplied, the transaction collects locked dir buffers to avoid
|
|
|
|
* nested buffer deadlocks. This function does not dirty the
|
|
|
|
* transaction. The caller should ensure that the inode is locked
|
|
|
|
* before calling this function.
|
2013-08-12 18:49:36 +08:00
|
|
|
*/
|
|
|
|
int
|
|
|
|
xfs_readdir(
|
2017-06-17 02:00:14 +08:00
|
|
|
struct xfs_trans *tp,
|
2014-06-06 13:20:32 +08:00
|
|
|
struct xfs_inode *dp,
|
|
|
|
struct dir_context *ctx,
|
|
|
|
size_t bufsize)
|
2013-08-12 18:49:36 +08:00
|
|
|
{
|
2014-06-10 05:30:36 +08:00
|
|
|
struct xfs_da_args args = { NULL };
|
2014-06-06 13:20:32 +08:00
|
|
|
int rval;
|
|
|
|
int v;
|
2013-08-12 18:49:36 +08:00
|
|
|
|
|
|
|
trace_xfs_readdir(dp);
|
|
|
|
|
|
|
|
if (XFS_FORCED_SHUTDOWN(dp->i_mount))
|
2014-06-25 12:58:08 +08:00
|
|
|
return -EIO;
|
2013-08-12 18:49:36 +08:00
|
|
|
|
2016-02-09 13:54:58 +08:00
|
|
|
ASSERT(S_ISDIR(VFS_I(dp)->i_mode));
|
2015-10-12 15:21:22 +08:00
|
|
|
XFS_STATS_INC(dp->i_mount, xs_dir_getdents);
|
2013-08-12 18:49:36 +08:00
|
|
|
|
2014-06-06 13:20:32 +08:00
|
|
|
args.dp = dp;
|
|
|
|
args.geo = dp->i_mount->m_dir_geo;
|
2017-06-17 02:00:14 +08:00
|
|
|
args.trans = tp;
|
2014-06-06 13:20:32 +08:00
|
|
|
|
2013-08-12 18:49:36 +08:00
|
|
|
if (dp->i_d.di_format == XFS_DINODE_FMT_LOCAL)
|
2014-06-06 13:20:32 +08:00
|
|
|
rval = xfs_dir2_sf_getdents(&args, ctx);
|
|
|
|
else if ((rval = xfs_dir2_isblock(&args, &v)))
|
2013-08-12 18:49:36 +08:00
|
|
|
;
|
|
|
|
else if (v)
|
2014-06-06 13:20:32 +08:00
|
|
|
rval = xfs_dir2_block_getdents(&args, ctx);
|
2013-08-12 18:49:36 +08:00
|
|
|
else
|
2014-06-06 13:20:32 +08:00
|
|
|
rval = xfs_dir2_leaf_getdents(&args, ctx, bufsize);
|
2013-12-07 04:30:11 +08:00
|
|
|
|
2013-08-12 18:49:36 +08:00
|
|
|
return rval;
|
|
|
|
}
|