/*
 * Copyright (c) 2000-2005 Silicon Graphics, Inc.
 * All Rights Reserved.
 *
 * This program is free software; you can redistribute it and/or
 * modify it under the terms of the GNU General Public License as
 * published by the Free Software Foundation.
 *
 * This program is distributed in the hope that it would be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write the Free Software Foundation,
 * Inc., 51 Franklin St, Fifth Floor, Boston, MA 02110-1301 USA
 */
#ifndef __XFS_MOUNT_H__
#define __XFS_MOUNT_H__

#include "xfs_sync.h"

typedef struct xfs_trans_reservations {
	uint	tr_write;	/* extent alloc trans */
	uint	tr_itruncate;	/* truncate trans */
	uint	tr_rename;	/* rename trans */
	uint	tr_link;	/* link trans */
	uint	tr_remove;	/* unlink trans */
	uint	tr_symlink;	/* symlink trans */
	uint	tr_create;	/* create trans */
	uint	tr_mkdir;	/* mkdir trans */
	uint	tr_ifree;	/* inode free trans */
	uint	tr_ichange;	/* inode update trans */
	uint	tr_growdata;	/* fs data section grow trans */
	uint	tr_swrite;	/* sync write inode trans */
	uint	tr_addafork;	/* cvt inode to attributed trans */
	uint	tr_writeid;	/* write setuid/setgid file */
	uint	tr_attrinval;	/* attr fork buffer invalidation */
	uint	tr_attrset;	/* set/create an attribute */
	uint	tr_attrrm;	/* remove an attribute */
	uint	tr_clearagi;	/* clear bad agi unlinked ino bucket */
	uint	tr_growrtalloc;	/* grow realtime allocations */
	uint	tr_growrtzero;	/* grow realtime zeroing */
	uint	tr_growrtfree;	/* grow realtime freeing */
} xfs_trans_reservations_t;
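
/*
 * Illustrative note, not part of the original header: these values are
 * precomputed at mount time (see m_reservations in xfs_mount below) and
 * consulted whenever a transaction reservation is made, so e.g. a
 * hypothetical write path would look up mp->m_reservations.tr_write
 * rather than recomputing the worst-case log space it needs.
 */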

#ifndef __KERNEL__

#define XFS_DADDR_TO_AGNO(mp,d) \
	((xfs_agnumber_t)(XFS_BB_TO_FSBT(mp, d) / (mp)->m_sb.sb_agblocks))
#define XFS_DADDR_TO_AGBNO(mp,d) \
	((xfs_agblock_t)(XFS_BB_TO_FSBT(mp, d) % (mp)->m_sb.sb_agblocks))

#else /* __KERNEL__ */

struct cred;
struct log;
struct xfs_mount_args;
struct xfs_inode;
struct xfs_bmbt_irec;
struct xfs_bmap_free;
struct xfs_extdelta;
struct xfs_swapext;
struct xfs_mru_cache;
struct xfs_nameops;

/*
 * Prototypes and functions for the Data Migration subsystem.
 */

typedef int	(*xfs_send_data_t)(int, struct xfs_inode *,
			xfs_off_t, size_t, int, int *);
typedef int	(*xfs_send_mmap_t)(struct vm_area_struct *, uint);
typedef int	(*xfs_send_destroy_t)(struct xfs_inode *, dm_right_t);
typedef int	(*xfs_send_namesp_t)(dm_eventtype_t, struct xfs_mount *,
			struct xfs_inode *, dm_right_t,
			struct xfs_inode *, dm_right_t,
			const char *, const char *, mode_t, int, int);
typedef int	(*xfs_send_mount_t)(struct xfs_mount *, dm_right_t,
			char *, char *);
typedef void	(*xfs_send_unmount_t)(struct xfs_mount *, struct xfs_inode *,
			dm_right_t, mode_t, int, int);

typedef struct xfs_dmops {
	xfs_send_data_t		xfs_send_data;
	xfs_send_mmap_t		xfs_send_mmap;
	xfs_send_destroy_t	xfs_send_destroy;
	xfs_send_namesp_t	xfs_send_namesp;
	xfs_send_mount_t	xfs_send_mount;
	xfs_send_unmount_t	xfs_send_unmount;
} xfs_dmops_t;

#define XFS_SEND_DATA(mp, ev,ip,off,len,fl,lock) \
	(*(mp)->m_dm_ops->xfs_send_data)(ev,ip,off,len,fl,lock)
#define XFS_SEND_MMAP(mp, vma,fl) \
	(*(mp)->m_dm_ops->xfs_send_mmap)(vma,fl)
#define XFS_SEND_DESTROY(mp, ip,right) \
	(*(mp)->m_dm_ops->xfs_send_destroy)(ip,right)
#define XFS_SEND_NAMESP(mp, ev,b1,r1,b2,r2,n1,n2,mode,rval,fl) \
	(*(mp)->m_dm_ops->xfs_send_namesp)(ev,NULL,b1,r1,b2,r2,n1,n2,mode,rval,fl)
#define XFS_SEND_PREUNMOUNT(mp,b1,r1,b2,r2,n1,n2,mode,rval,fl) \
	(*(mp)->m_dm_ops->xfs_send_namesp)(DM_EVENT_PREUNMOUNT,mp,b1,r1,b2,r2,n1,n2,mode,rval,fl)
#define XFS_SEND_MOUNT(mp,right,path,name) \
	(*(mp)->m_dm_ops->xfs_send_mount)(mp,right,path,name)
#define XFS_SEND_UNMOUNT(mp, ip,right,mode,rval,fl) \
	(*(mp)->m_dm_ops->xfs_send_unmount)(mp,ip,right,mode,rval,fl)
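
/*
 * Illustrative sketch, not part of the original header: callers go
 * through the XFS_SEND_* macros so they dispatch via the per-mount
 * m_dm_ops vector rather than calling a DMAPI implementation directly.
 * A hypothetical destroy-event send (DM_RIGHT_NULL is assumed here from
 * the DMAPI right tokens) would look like:
 *
 *	error = XFS_SEND_DESTROY(mp, ip, DM_RIGHT_NULL);
 */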

/*
 * Prototypes and functions for the Quota Management subsystem.
 */

struct xfs_dquot;
struct xfs_dqtrxops;
struct xfs_quotainfo;

typedef int	(*xfs_qminit_t)(struct xfs_mount *, uint *, uint *);
typedef int	(*xfs_qmmount_t)(struct xfs_mount *, uint, uint);
typedef int	(*xfs_qmunmount_t)(struct xfs_mount *);
typedef void	(*xfs_qmdone_t)(struct xfs_mount *);
typedef void	(*xfs_dqrele_t)(struct xfs_dquot *);
typedef int	(*xfs_dqattach_t)(struct xfs_inode *, uint);
typedef void	(*xfs_dqdetach_t)(struct xfs_inode *);
typedef int	(*xfs_dqpurgeall_t)(struct xfs_mount *, uint);
typedef int	(*xfs_dqvopalloc_t)(struct xfs_mount *,
			struct xfs_inode *, uid_t, gid_t, prid_t, uint,
			struct xfs_dquot **, struct xfs_dquot **);
typedef void	(*xfs_dqvopcreate_t)(struct xfs_trans *, struct xfs_inode *,
			struct xfs_dquot *, struct xfs_dquot *);
typedef int	(*xfs_dqvoprename_t)(struct xfs_inode **);
typedef struct xfs_dquot * (*xfs_dqvopchown_t)(
			struct xfs_trans *, struct xfs_inode *,
			struct xfs_dquot **, struct xfs_dquot *);
typedef int	(*xfs_dqvopchownresv_t)(struct xfs_trans *, struct xfs_inode *,
			struct xfs_dquot *, struct xfs_dquot *, uint);
typedef void	(*xfs_dqstatvfs_t)(struct xfs_inode *, bhv_statvfs_t *);
typedef int	(*xfs_dqsync_t)(struct xfs_mount *, int flags);
typedef int	(*xfs_quotactl_t)(struct xfs_mount *, int, int, xfs_caddr_t);

typedef struct xfs_qmops {
	xfs_qminit_t		xfs_qminit;
	xfs_qmdone_t		xfs_qmdone;
	xfs_qmmount_t		xfs_qmmount;
	xfs_qmunmount_t		xfs_qmunmount;
	xfs_dqrele_t		xfs_dqrele;
	xfs_dqattach_t		xfs_dqattach;
	xfs_dqdetach_t		xfs_dqdetach;
	xfs_dqpurgeall_t	xfs_dqpurgeall;
	xfs_dqvopalloc_t	xfs_dqvopalloc;
	xfs_dqvopcreate_t	xfs_dqvopcreate;
	xfs_dqvoprename_t	xfs_dqvoprename;
	xfs_dqvopchown_t	xfs_dqvopchown;
	xfs_dqvopchownresv_t	xfs_dqvopchownresv;
	xfs_dqstatvfs_t		xfs_dqstatvfs;
	xfs_dqsync_t		xfs_dqsync;
	xfs_quotactl_t		xfs_quotactl;
	struct xfs_dqtrxops	*xfs_dqtrxops;
} xfs_qmops_t;

#define XFS_QM_INIT(mp, mnt, fl) \
	(*(mp)->m_qm_ops->xfs_qminit)(mp, mnt, fl)
#define XFS_QM_MOUNT(mp, mnt, fl) \
	(*(mp)->m_qm_ops->xfs_qmmount)(mp, mnt, fl)
#define XFS_QM_UNMOUNT(mp) \
	(*(mp)->m_qm_ops->xfs_qmunmount)(mp)
#define XFS_QM_DONE(mp) \
	(*(mp)->m_qm_ops->xfs_qmdone)(mp)
#define XFS_QM_DQRELE(mp, dq) \
	(*(mp)->m_qm_ops->xfs_dqrele)(dq)
#define XFS_QM_DQATTACH(mp, ip, fl) \
	(*(mp)->m_qm_ops->xfs_dqattach)(ip, fl)
#define XFS_QM_DQDETACH(mp, ip) \
	(*(mp)->m_qm_ops->xfs_dqdetach)(ip)
#define XFS_QM_DQPURGEALL(mp, fl) \
	(*(mp)->m_qm_ops->xfs_dqpurgeall)(mp, fl)
#define XFS_QM_DQVOPALLOC(mp, ip, uid, gid, prid, fl, dq1, dq2) \
	(*(mp)->m_qm_ops->xfs_dqvopalloc)(mp, ip, uid, gid, prid, fl, dq1, dq2)
#define XFS_QM_DQVOPCREATE(mp, tp, ip, dq1, dq2) \
	(*(mp)->m_qm_ops->xfs_dqvopcreate)(tp, ip, dq1, dq2)
#define XFS_QM_DQVOPRENAME(mp, ip) \
	(*(mp)->m_qm_ops->xfs_dqvoprename)(ip)
#define XFS_QM_DQVOPCHOWN(mp, tp, ip, dqp, dq) \
	(*(mp)->m_qm_ops->xfs_dqvopchown)(tp, ip, dqp, dq)
#define XFS_QM_DQVOPCHOWNRESV(mp, tp, ip, dq1, dq2, fl) \
	(*(mp)->m_qm_ops->xfs_dqvopchownresv)(tp, ip, dq1, dq2, fl)
#define XFS_QM_DQSTATVFS(ip, statp) \
	(*(ip)->i_mount->m_qm_ops->xfs_dqstatvfs)(ip, statp)
#define XFS_QM_DQSYNC(mp, flags) \
	(*(mp)->m_qm_ops->xfs_dqsync)(mp, flags)
#define XFS_QM_QUOTACTL(mp, cmd, id, addr) \
	(*(mp)->m_qm_ops->xfs_quotactl)(mp, cmd, id, addr)
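
/*
 * Illustrative sketch, not part of the original header: the XFS_QM_*
 * macros indirect through m_qm_ops in the same way, so a hypothetical
 * caller attaching and later detaching an inode's dquots would write:
 *
 *	error = XFS_QM_DQATTACH(mp, ip, 0);
 *	...
 *	XFS_QM_DQDETACH(mp, ip);
 *
 * and the calls resolve to stubs when the quota manager is not in use.
 */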

#ifdef HAVE_PERCPU_SB

/*
 * Valid per-cpu incore superblock counters. Note that if you add new counters,
 * you may need to define new counter disabled bit field descriptors, as there
 * are more possible fields in the superblock than can fit in a bitfield on a
 * 32 bit platform. The XFS_SBS_* values for the current counters just fit.
 */
typedef struct xfs_icsb_cnts {
	uint64_t	icsb_fdblocks;
	uint64_t	icsb_ifree;
	uint64_t	icsb_icount;
	unsigned long	icsb_flags;
} xfs_icsb_cnts_t;

#define XFS_ICSB_FLAG_LOCK	(1 << 0)	/* counter lock bit */

#define XFS_ICSB_LAZY_COUNT	(1 << 1)	/* accuracy not needed */

extern int	xfs_icsb_init_counters(struct xfs_mount *);
extern void	xfs_icsb_reinit_counters(struct xfs_mount *);
extern void	xfs_icsb_destroy_counters(struct xfs_mount *);
extern void	xfs_icsb_sync_counters(struct xfs_mount *, int);
extern void	xfs_icsb_sync_counters_locked(struct xfs_mount *, int);

#else
#define xfs_icsb_init_counters(mp)		(0)
#define xfs_icsb_destroy_counters(mp)		do { } while (0)
#define xfs_icsb_reinit_counters(mp)		do { } while (0)
#define xfs_icsb_sync_counters(mp, flags)	do { } while (0)
#define xfs_icsb_sync_counters_locked(mp, flags) do { } while (0)
#endif

typedef struct xfs_ail {
	struct list_head	xa_ail;
	uint			xa_gen;
	struct task_struct	*xa_task;
	xfs_lsn_t		xa_target;
} xfs_ail_t;

typedef struct xfs_mount {
	struct super_block	*m_super;
	xfs_tid_t		m_tid;		/* next unused tid for fs */
	spinlock_t		m_ail_lock;	/* fs AIL mutex */
	xfs_ail_t		m_ail;		/* fs active log item list */
	xfs_sb_t		m_sb;		/* copy of fs superblock */
	spinlock_t		m_sb_lock;	/* sb counter lock */
	struct xfs_buf		*m_sb_bp;	/* buffer for superblock */
	char			*m_fsname;	/* filesystem name */
	int			m_fsname_len;	/* strlen of fs name */
	char			*m_rtname;	/* realtime device name */
	char			*m_logname;	/* external log device name */
	int			m_bsize;	/* fs logical block size */
	xfs_agnumber_t		m_agfrotor;	/* last ag where space found */
	xfs_agnumber_t		m_agirotor;	/* last ag dir inode alloced */
	spinlock_t		m_agirotor_lock;/* .. and lock protecting it */
	xfs_agnumber_t		m_maxagi;	/* highest inode alloc group */
	struct xfs_inode	*m_inodes;	/* active inode list */
	struct list_head	m_del_inodes;	/* inodes to reclaim */
	mutex_t			m_ilock;	/* inode list mutex */
	uint			m_ireclaims;	/* count of calls to reclaim */
	uint			m_readio_log;	/* min read size log bytes */
	uint			m_readio_blocks; /* min read size blocks */
	uint			m_writeio_log;	/* min write size log bytes */
	uint			m_writeio_blocks; /* min write size blocks */
	struct log		*m_log;		/* log specific stuff */
	int			m_logbufs;	/* number of log buffers */
	int			m_logbsize;	/* size of each log buffer */
	uint			m_rsumlevels;	/* rt summary levels */
	uint			m_rsumsize;	/* size of rt summary, bytes */
	struct xfs_inode	*m_rbmip;	/* pointer to bitmap inode */
	struct xfs_inode	*m_rsumip;	/* pointer to summary inode */
	struct xfs_inode	*m_rootip;	/* pointer to root directory */
	struct xfs_quotainfo	*m_quotainfo;	/* disk quota information */
	xfs_buftarg_t		*m_ddev_targp;	/* saves taking the address */
	xfs_buftarg_t		*m_logdev_targp;/* ptr to log device */
	xfs_buftarg_t		*m_rtdev_targp;	/* ptr to rt device */
	__uint8_t		m_blkbit_log;	/* blocklog + NBBY */
	__uint8_t		m_blkbb_log;	/* blocklog - BBSHIFT */
	__uint8_t		m_agno_log;	/* log #ag's */
	__uint8_t		m_agino_log;	/* #bits for agino in inum */
	__uint16_t		m_inode_cluster_size;/* min inode buf size */
	uint			m_blockmask;	/* sb_blocksize-1 */
	uint			m_blockwsize;	/* sb_blocksize in words */
	uint			m_blockwmask;	/* blockwsize-1 */
	uint			m_alloc_mxr[2];	/* XFS_ALLOC_BLOCK_MAXRECS */
	uint			m_alloc_mnr[2];	/* XFS_ALLOC_BLOCK_MINRECS */
	uint			m_bmap_dmxr[2];	/* XFS_BMAP_BLOCK_DMAXRECS */
	uint			m_bmap_dmnr[2];	/* XFS_BMAP_BLOCK_DMINRECS */
	uint			m_inobt_mxr[2];	/* XFS_INOBT_BLOCK_MAXRECS */
	uint			m_inobt_mnr[2];	/* XFS_INOBT_BLOCK_MINRECS */
	uint			m_ag_maxlevels;	/* XFS_AG_MAXLEVELS */
	uint			m_bm_maxlevels[2]; /* XFS_BM_MAXLEVELS */
	uint			m_in_maxlevels;	/* XFS_IN_MAXLEVELS */
	struct xfs_perag	*m_perag;	/* per-ag accounting info */
	struct rw_semaphore	m_peraglock;	/* lock for m_perag (pointer) */
	struct mutex		m_growlock;	/* growfs mutex */
	int			m_fixedfsid[2];	/* unchanged for life of FS */
	uint			m_dmevmask;	/* DMI events for this FS */
	__uint64_t		m_flags;	/* global mount flags */
	uint			m_attroffset;	/* inode attribute offset */
	uint			m_dir_node_ents; /* #entries in a dir danode */
	uint			m_attr_node_ents; /* #entries in attr danode */
	int			m_ialloc_inos;	/* inodes in inode allocation */
	int			m_ialloc_blks;	/* blocks in inode allocation */
	int			m_litino;	/* size of inode union area */
	int			m_inoalign_mask;/* mask sb_inoalignmt if used */
	uint			m_qflags;	/* quota status flags */
	xfs_trans_reservations_t m_reservations;/* precomputed res values */
	__uint64_t		m_maxicount;	/* maximum inode count */
	__uint64_t		m_maxioffset;	/* maximum inode offset */
	__uint64_t		m_resblks;	/* total reserved blocks */
	__uint64_t		m_resblks_avail;/* available reserved blocks */
#if XFS_BIG_INUMS
	xfs_ino_t		m_inoadd;	/* add value for ino64_offset */
#endif
	int			m_dalign;	/* stripe unit */
	int			m_swidth;	/* stripe width */
	int			m_sinoalign;	/* stripe unit inode alignment */
	int			m_attr_magicpct;/* 37% of the blocksize */
	int			m_dir_magicpct;	/* 37% of the dir blocksize */
	__uint8_t		m_mk_sharedro;	/* mark shared ro on unmount */
	__uint8_t		m_inode_quiesce;/* call quiesce on new inodes.
						   field governed by m_ilock */
	__uint8_t		m_sectbb_log;	/* sectlog - BBSHIFT */
	const struct xfs_nameops *m_dirnameops;	/* vector of dir name ops */
	int			m_dirblksize;	/* directory block sz--bytes */
	int			m_dirblkfsbs;	/* directory block sz--fsbs */
	xfs_dablk_t		m_dirdatablk;	/* blockno of dir data v2 */
	xfs_dablk_t		m_dirleafblk;	/* blockno of dir non-data v2 */
	xfs_dablk_t		m_dirfreeblk;	/* blockno of dirfreeindex v2 */
	uint			m_chsize;	/* size of next field */
	struct xfs_chash	*m_chash;	/* fs private inode per-cluster
						 * hash table */
	struct xfs_dmops	*m_dm_ops;	/* vector of DMI ops */
	struct xfs_qmops	*m_qm_ops;	/* vector of XQM ops */
	atomic_t		m_active_trans;	/* number trans frozen */
#ifdef HAVE_PERCPU_SB
	xfs_icsb_cnts_t		*m_sb_cnts;	/* per-cpu superblock counters */
	unsigned long		m_icsb_counters; /* disabled per-cpu counters */
	struct notifier_block	m_icsb_notifier; /* hotplug cpu notifier */
	struct mutex		m_icsb_mutex;	/* balancer sync lock */
#endif
	struct xfs_mru_cache	*m_filestream;	/* per-mount filestream data */
	struct task_struct	*m_sync_task;	/* generalised sync thread */
	bhv_vfs_sync_work_t	m_sync_work;	/* work item for VFS_SYNC */
	struct list_head	m_sync_list;	/* sync thread work item list */
	spinlock_t		m_sync_lock;	/* work item list lock */
	int			m_sync_seq;	/* sync thread generation no. */
	wait_queue_head_t	m_wait_single_sync_task;
} xfs_mount_t;

/*
 * Flags for m_flags.
 */
#define XFS_MOUNT_WSYNC		(1ULL << 0)	/* for nfs - all metadata ops
						   must be synchronous except
						   for space allocations */
#define XFS_MOUNT_INO64		(1ULL << 1)
#define XFS_MOUNT_DMAPI		(1ULL << 2)	/* dmapi is enabled */
#define XFS_MOUNT_WAS_CLEAN	(1ULL << 3)
#define XFS_MOUNT_FS_SHUTDOWN	(1ULL << 4)	/* atomic stop of all filesystem
						   operations, typically for
						   disk errors in metadata */
#define XFS_MOUNT_RETERR	(1ULL << 6)	/* return alignment errors to
						   user */
#define XFS_MOUNT_NOALIGN	(1ULL << 7)	/* turn off stripe alignment
						   allocations */
#define XFS_MOUNT_ATTR2		(1ULL << 8)	/* allow use of attr2 format */
#define XFS_MOUNT_GRPID		(1ULL << 9)	/* group-ID assigned from directory */
#define XFS_MOUNT_NORECOVERY	(1ULL << 10)	/* no recovery - dirty fs */
#define XFS_MOUNT_SHARED	(1ULL << 11)	/* shared mount */
#define XFS_MOUNT_DFLT_IOSIZE	(1ULL << 12)	/* set default i/o size */
#define XFS_MOUNT_OSYNCISOSYNC	(1ULL << 13)	/* o_sync is REALLY o_sync */
						/* osyncisdsync is now default */
#define XFS_MOUNT_32BITINODES	(1ULL << 14)	/* do not create inodes above
						 * 32 bits in size */
#define XFS_MOUNT_SMALL_INUMS	(1ULL << 15)	/* user wants 32bit inodes */
#define XFS_MOUNT_NOUUID	(1ULL << 16)	/* ignore uuid during mount */
#define XFS_MOUNT_BARRIER	(1ULL << 17)
#define XFS_MOUNT_IKEEP		(1ULL << 18)	/* keep empty inode clusters */
#define XFS_MOUNT_SWALLOC	(1ULL << 19)	/* turn on stripe width
						 * allocation */
#define XFS_MOUNT_RDONLY	(1ULL << 20)	/* read-only fs */
#define XFS_MOUNT_DIRSYNC	(1ULL << 21)	/* synchronous directory ops */
#define XFS_MOUNT_COMPAT_IOSIZE	(1ULL << 22)	/* don't report large preferred
						 * I/O size in stat() */
#define XFS_MOUNT_NO_PERCPU_SB	(1ULL << 23)	/* don't use per-cpu superblock
						   counters */
#define XFS_MOUNT_FILESTREAMS	(1ULL << 24)	/* enable the filestreams
						   allocator */
#define XFS_MOUNT_NOATTR2	(1ULL << 25)	/* disable use of attr2 format */


/*
 * Default minimum read and write sizes.
 */
#define XFS_READIO_LOG_LARGE	16
#define XFS_WRITEIO_LOG_LARGE	16

/*
 * Max and min values for mount-option defined I/O
 * preallocation sizes.
 */
#define XFS_MAX_IO_LOG		30	/* 1G */
#define XFS_MIN_IO_LOG		PAGE_SHIFT

/*
 * Synchronous read and write sizes. This should be
 * better for NFSv2 wsync filesystems.
 */
#define XFS_WSYNC_READIO_LOG	15	/* 32K */
#define XFS_WSYNC_WRITEIO_LOG	14	/* 16K */

/*
 * Allow large block sizes to be reported to userspace programs if the
 * "largeio" mount option is used.
 *
 * If compatibility mode is specified, simply return the basic unit of caching
 * so that we don't get inefficient read/modify/write I/O from user apps.
 * Otherwise....
 *
 * If the underlying volume is a stripe, then return the stripe width in bytes
 * as the recommended I/O size. If it is not a stripe and we've set a default
 * buffered I/O size, return that; otherwise return the compat default.
 */
static inline unsigned long
xfs_preferred_iosize(xfs_mount_t *mp)
{
	if (mp->m_flags & XFS_MOUNT_COMPAT_IOSIZE)
		return PAGE_CACHE_SIZE;
	return (mp->m_swidth ?
		(mp->m_swidth << mp->m_sb.sb_blocklog) :
		((mp->m_flags & XFS_MOUNT_DFLT_IOSIZE) ?
			(1 << (int)MAX(mp->m_readio_log, mp->m_writeio_log)) :
			PAGE_CACHE_SIZE));
}
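
/*
 * Worked example for the function above (assumed geometry, illustrative
 * only): with m_swidth = 16 and sb_blocklog = 12 (4k blocks), a striped
 * volume reports 16 << 12 = 65536 bytes. With no stripe and
 * XFS_MOUNT_DFLT_IOSIZE clear, the result falls back to PAGE_CACHE_SIZE.
 */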

#define XFS_MAXIOFFSET(mp)	((mp)->m_maxioffset)

#define XFS_LAST_UNMOUNT_WAS_CLEAN(mp) \
	((mp)->m_flags & XFS_MOUNT_WAS_CLEAN)
#define XFS_FORCED_SHUTDOWN(mp)	((mp)->m_flags & XFS_MOUNT_FS_SHUTDOWN)
void xfs_do_force_shutdown(struct xfs_mount *mp, int flags, char *fname,
	int lnnum);
#define xfs_force_shutdown(m,f)	\
	xfs_do_force_shutdown(m, f, __FILE__, __LINE__)
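
/*
 * Illustrative sketch, not part of the original header: callers commonly
 * bail out early once a shutdown has been forced, e.g. (XFS_ERROR() is
 * assumed here from the XFS error macros):
 *
 *	if (XFS_FORCED_SHUTDOWN(mp))
 *		return XFS_ERROR(EIO);
 */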

/*
 * Flags for xfs_mountfs
 */
#define XFS_MFSI_QUIET		0x40	/* Be silent if mount errors found */

#define XFS_DADDR_TO_AGNO(mp,d) xfs_daddr_to_agno(mp,d)
static inline xfs_agnumber_t
xfs_daddr_to_agno(struct xfs_mount *mp, xfs_daddr_t d)
{
	xfs_daddr_t ld = XFS_BB_TO_FSBT(mp, d);
	do_div(ld, mp->m_sb.sb_agblocks);
	return (xfs_agnumber_t) ld;
}

#define XFS_DADDR_TO_AGBNO(mp,d)	xfs_daddr_to_agbno(mp,d)
static inline xfs_agblock_t
xfs_daddr_to_agbno(struct xfs_mount *mp, xfs_daddr_t d)
{
	xfs_daddr_t ld = XFS_BB_TO_FSBT(mp, d);
	return (xfs_agblock_t) do_div(ld, mp->m_sb.sb_agblocks);
}
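
/*
 * Worked example for the two conversions above (assumed geometry,
 * illustrative only): if XFS_BB_TO_FSBT(mp, d) yields filesystem block
 * 2500 and sb_agblocks = 1000, xfs_daddr_to_agno() returns AG 2 and
 * xfs_daddr_to_agbno() returns block 500 within that AG. This is the
 * same quotient/remainder split as the !__KERNEL__ macros near the top
 * of this file, but done with do_div() for 64-bit safety.
 */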

/*
 * perag get/put wrappers for eventual ref counting
 */
static inline xfs_perag_t *
xfs_get_perag(struct xfs_mount *mp, xfs_ino_t ino)
{
	return &mp->m_perag[XFS_INO_TO_AGNO(mp, ino)];
}

static inline void
xfs_put_perag(struct xfs_mount *mp, xfs_perag_t *pag)
{
	/* nothing to see here, move along */
}
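
/*
 * Illustrative usage sketch (hypothetical caller): the wrappers are
 * always paired, even though xfs_put_perag() is currently a no-op, so
 * that reference counting can be added later without touching callers:
 *
 *	xfs_perag_t *pag = xfs_get_perag(mp, ino);
 *	... read or update per-AG state through pag ...
 *	xfs_put_perag(mp, pag);
 */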

/*
 * Per-cpu superblock locking functions
 */
#ifdef HAVE_PERCPU_SB
STATIC_INLINE void
xfs_icsb_lock(xfs_mount_t *mp)
{
	mutex_lock(&mp->m_icsb_mutex);
}

STATIC_INLINE void
xfs_icsb_unlock(xfs_mount_t *mp)
{
	mutex_unlock(&mp->m_icsb_mutex);
}
#else
#define xfs_icsb_lock(mp)
#define xfs_icsb_unlock(mp)
#endif
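
/*
 * Illustrative sketch (hypothetical caller): counter rebalancing code
 * brackets its work with the wrappers above, which compile away to
 * nothing when per-cpu counters are not built in:
 *
 *	xfs_icsb_lock(mp);
 *	... modify or rebalance the per-cpu counters ...
 *	xfs_icsb_unlock(mp);
 */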

/*
 * This structure is for use by the xfs_mod_incore_sb_batch() routine.
 * xfs_growfs can specify a few fields whose values exceed the int limit,
 * hence the 64-bit delta.
 */
typedef struct xfs_mod_sb {
	xfs_sb_field_t	msb_field;	/* Field to modify, see below */
	int64_t		msb_delta;	/* Change to make to specified field */
} xfs_mod_sb_t;
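
/*
 * Illustrative sketch, not part of the original header (XFS_SBS_ICOUNT
 * and XFS_SBS_IFREE are assumed here from the xfs_sb_field_t enum): a
 * hypothetical two-entry batch for xfs_mod_incore_sb_batch() might be:
 *
 *	xfs_mod_sb_t	msb[2];
 *
 *	msb[0].msb_field = XFS_SBS_ICOUNT;
 *	msb[0].msb_delta = 64;
 *	msb[1].msb_field = XFS_SBS_IFREE;
 *	msb[1].msb_delta = 64;
 *	error = xfs_mod_incore_sb_batch(mp, msb, 2, 0);
 */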

#define XFS_MOUNT_ILOCK(mp)	mutex_lock(&((mp)->m_ilock))
#define XFS_MOUNT_IUNLOCK(mp)	mutex_unlock(&((mp)->m_ilock))

extern int	xfs_log_sbcount(xfs_mount_t *, uint);
extern int	xfs_mountfs(xfs_mount_t *mp);
extern void	xfs_mountfs_check_barriers(xfs_mount_t *mp);

extern void	xfs_unmountfs(xfs_mount_t *);
extern int	xfs_unmountfs_writesb(xfs_mount_t *);
extern int	xfs_unmount_flush(xfs_mount_t *, int);
extern int	xfs_mod_incore_sb(xfs_mount_t *, xfs_sb_field_t, int64_t, int);
extern int	xfs_mod_incore_sb_unlocked(xfs_mount_t *, xfs_sb_field_t,
			int64_t, int);
extern int	xfs_mod_incore_sb_batch(xfs_mount_t *, xfs_mod_sb_t *,
			uint, int);
extern struct xfs_buf *xfs_getsb(xfs_mount_t *, int);
extern int	xfs_readsb(xfs_mount_t *, int);
extern void	xfs_freesb(xfs_mount_t *);
extern int	xfs_fs_writable(xfs_mount_t *);
extern int	xfs_syncsub(xfs_mount_t *, int, int *);
extern int	xfs_sync_inodes(xfs_mount_t *, int, int *);
extern int	xfs_sb_validate_fsb_count(struct xfs_sb *, __uint64_t);

extern int	xfs_dmops_get(struct xfs_mount *, struct xfs_mount_args *);
extern void	xfs_dmops_put(struct xfs_mount *);
extern int	xfs_qmops_get(struct xfs_mount *, struct xfs_mount_args *);
extern void	xfs_qmops_put(struct xfs_mount *);

extern struct xfs_dmops xfs_dmcore_xfs;

#endif	/* __KERNEL__ */

extern void	xfs_mod_sb(struct xfs_trans *, __int64_t);
extern xfs_agnumber_t	xfs_initialize_perag(struct xfs_mount *, xfs_agnumber_t);
extern void	xfs_sb_from_disk(struct xfs_sb *, struct xfs_dsb *);
extern void	xfs_sb_to_disk(struct xfs_dsb *, struct xfs_sb *, __int64_t);

#endif	/* __XFS_MOUNT_H__ */