xfs: fix cpu hotplug mess [v2]

Ritesh and Eric separately reported crashes in XFS's hook function for
 CPU hot remove if the remove event races with a filesystem being
 mounted.  I also noticed via generic/650 that once in a while the log
 will shut down over an apparent overrun of a transaction reservation;
 this turned out to be due to CIL percpu list aggregation failing to pick
 up the percpu list items from a dying CPU.
 
 Either way, the solution here is to eliminate the need for a CPU dying
 hook by using a private cpumask to track which CPUs have added to their
 percpu lists directly, and iterating with that mask.  This fixes the log
 problems and (I think) solves a theoretical UAF bug in the inodegc code
 too.
 
 v2: fix a few put_cpu uses, add necessary memory barriers, and use
     atomic cpumask operations
 
 This has been lightly tested with fstests.  Enjoy!
 
 Signed-off-by: Darrick J. Wong <djwong@kernel.org>
 -----BEGIN PGP SIGNATURE-----
 
 iHUEABYKAB0WIQQ2qTKExjcn+O1o2YRKO3ySh0YRpgUCZQChOQAKCRBKO3ySh0YR
 plauAQCV0RymbwD/ONbvpor3yK4R3YO1pa923KtoiQ9IAV5uswD/YBWvyI76BhNs
 B8hwbEDm3X2ZjQaikxI+Xx2cMaAhkgY=
 =EJ0b
 -----END PGP SIGNATURE-----

Merge tag 'fix-percpu-lists-6.6_2023-09-12' of https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux into xfs-6.6-fixesA

xfs: fix cpu hotplug mess

Ritesh and Eric separately reported crashes in XFS's hook function for
CPU hot remove if the remove event races with a filesystem being
mounted.  I also noticed via generic/650 that once in a while the log
will shut down over an apparent overrun of a transaction reservation;
this turned out to be due to CIL percpu list aggregation failing to pick
up the percpu list items from a dying CPU.

Either way, the solution here is to eliminate the need for a CPU dying
hook by using a private cpumask to track which CPUs have added to their
percpu lists directly, and iterating with that mask.  This fixes the log
problems and (I think) solves a theoretical UAF bug in the inodegc code
too.

Signed-off-by: Darrick J. Wong <djwong@kernel.org>
Signed-off-by: Chandan Babu R <chandanbabu@kernel.org>

* tag 'fix-percpu-lists-6.6_2023-09-12' of https://git.kernel.org/pub/scm/linux/kernel/git/djwong/xfs-linux:
  xfs: remove CPU hotplug infrastructure
  xfs: remove the all-mounts list
  xfs: use per-mount cpumask to track nonempty percpu inodegc lists
  xfs: fix per-cpu CIL structure aggregation racing with dying cpus
This commit is contained in:
Chandan Babu R 2023-09-13 10:11:44 +05:30
commit 0a229c935a
7 changed files with 56 additions and 183 deletions

View File

@ -443,7 +443,7 @@ xfs_inodegc_queue_all(
int cpu;
bool ret = false;
for_each_online_cpu(cpu) {
for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
gc = per_cpu_ptr(mp->m_inodegc, cpu);
if (!llist_empty(&gc->list)) {
mod_delayed_work_on(cpu, mp->m_inodegc_wq, &gc->work, 0);
@ -463,7 +463,7 @@ xfs_inodegc_wait_all(
int error = 0;
flush_workqueue(mp->m_inodegc_wq);
for_each_online_cpu(cpu) {
for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
struct xfs_inodegc *gc;
gc = per_cpu_ptr(mp->m_inodegc, cpu);
@ -1845,9 +1845,17 @@ xfs_inodegc_worker(
struct xfs_inodegc, work);
struct llist_node *node = llist_del_all(&gc->list);
struct xfs_inode *ip, *n;
struct xfs_mount *mp = gc->mp;
unsigned int nofs_flag;
ASSERT(gc->cpu == smp_processor_id());
/*
* Clear the cpu mask bit and ensure that we have seen the latest
* update of the gc structure associated with this CPU. This matches
* with the release semantics used when setting the cpumask bit in
* xfs_inodegc_queue.
*/
cpumask_clear_cpu(gc->cpu, &mp->m_inodegc_cpumask);
smp_mb__after_atomic();
WRITE_ONCE(gc->items, 0);
@ -1862,7 +1870,7 @@ xfs_inodegc_worker(
nofs_flag = memalloc_nofs_save();
ip = llist_entry(node, struct xfs_inode, i_gclist);
trace_xfs_inodegc_worker(ip->i_mount, READ_ONCE(gc->shrinker_hits));
trace_xfs_inodegc_worker(mp, READ_ONCE(gc->shrinker_hits));
WRITE_ONCE(gc->shrinker_hits, 0);
llist_for_each_entry_safe(ip, n, node, i_gclist) {
@ -2057,6 +2065,7 @@ xfs_inodegc_queue(
struct xfs_inodegc *gc;
int items;
unsigned int shrinker_hits;
unsigned int cpu_nr;
unsigned long queue_delay = 1;
trace_xfs_inode_set_need_inactive(ip);
@ -2064,18 +2073,28 @@ xfs_inodegc_queue(
ip->i_flags |= XFS_NEED_INACTIVE;
spin_unlock(&ip->i_flags_lock);
gc = get_cpu_ptr(mp->m_inodegc);
cpu_nr = get_cpu();
gc = this_cpu_ptr(mp->m_inodegc);
llist_add(&ip->i_gclist, &gc->list);
items = READ_ONCE(gc->items);
WRITE_ONCE(gc->items, items + 1);
shrinker_hits = READ_ONCE(gc->shrinker_hits);
/*
* Ensure the list add is always seen by anyone who finds the cpumask
* bit set. This effectively gives the cpumask bit set operation
* release ordering semantics.
*/
smp_mb__before_atomic();
if (!cpumask_test_cpu(cpu_nr, &mp->m_inodegc_cpumask))
cpumask_test_and_set_cpu(cpu_nr, &mp->m_inodegc_cpumask);
/*
* We queue the work while holding the current CPU so that the work
* is scheduled to run on this CPU.
*/
if (!xfs_is_inodegc_enabled(mp)) {
put_cpu_ptr(gc);
put_cpu();
return;
}
@ -2085,7 +2104,7 @@ xfs_inodegc_queue(
trace_xfs_inodegc_queue(mp, __return_address);
mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
queue_delay);
put_cpu_ptr(gc);
put_cpu();
if (xfs_inodegc_want_flush_work(ip, items, shrinker_hits)) {
trace_xfs_inodegc_throttle(mp, __return_address);
@ -2093,47 +2112,6 @@ xfs_inodegc_queue(
}
}
/*
* Fold the dead CPU inodegc queue into the current CPUs queue.
*/
void
xfs_inodegc_cpu_dead(
struct xfs_mount *mp,
unsigned int dead_cpu)
{
struct xfs_inodegc *dead_gc, *gc;
struct llist_node *first, *last;
unsigned int count = 0;
dead_gc = per_cpu_ptr(mp->m_inodegc, dead_cpu);
cancel_delayed_work_sync(&dead_gc->work);
if (llist_empty(&dead_gc->list))
return;
first = dead_gc->list.first;
last = first;
while (last->next) {
last = last->next;
count++;
}
dead_gc->list.first = NULL;
dead_gc->items = 0;
/* Add pending work to current CPU */
gc = get_cpu_ptr(mp->m_inodegc);
llist_add_batch(first, last, &gc->list);
count += READ_ONCE(gc->items);
WRITE_ONCE(gc->items, count);
if (xfs_is_inodegc_enabled(mp)) {
trace_xfs_inodegc_queue(mp, __return_address);
mod_delayed_work_on(current_cpu(), mp->m_inodegc_wq, &gc->work,
0);
}
put_cpu_ptr(gc);
}
/*
* We set the inode flag atomically with the radix tree tag. Once we get tag
* lookups on the radix tree, this inode flag can go away.
@ -2195,7 +2173,7 @@ xfs_inodegc_shrinker_count(
if (!xfs_is_inodegc_enabled(mp))
return 0;
for_each_online_cpu(cpu) {
for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
gc = per_cpu_ptr(mp->m_inodegc, cpu);
if (!llist_empty(&gc->list))
return XFS_INODEGC_SHRINKER_COUNT;
@ -2220,7 +2198,7 @@ xfs_inodegc_shrinker_scan(
trace_xfs_inodegc_shrinker_scan(mp, sc, __return_address);
for_each_online_cpu(cpu) {
for_each_cpu(cpu, &mp->m_inodegc_cpumask) {
gc = per_cpu_ptr(mp->m_inodegc, cpu);
if (!llist_empty(&gc->list)) {
unsigned int h = READ_ONCE(gc->shrinker_hits);

View File

@ -79,7 +79,6 @@ void xfs_inodegc_push(struct xfs_mount *mp);
int xfs_inodegc_flush(struct xfs_mount *mp);
void xfs_inodegc_stop(struct xfs_mount *mp);
void xfs_inodegc_start(struct xfs_mount *mp);
void xfs_inodegc_cpu_dead(struct xfs_mount *mp, unsigned int cpu);
int xfs_inodegc_register_shrinker(struct xfs_mount *mp);
#endif

View File

@ -124,7 +124,7 @@ xlog_cil_push_pcp_aggregate(
struct xlog_cil_pcp *cilpcp;
int cpu;
for_each_online_cpu(cpu) {
for_each_cpu(cpu, &ctx->cil_pcpmask) {
cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
ctx->ticket->t_curr_res += cilpcp->space_reserved;
@ -165,7 +165,13 @@ xlog_cil_insert_pcp_aggregate(
if (!test_and_clear_bit(XLOG_CIL_PCP_SPACE, &cil->xc_flags))
return;
for_each_online_cpu(cpu) {
/*
* We can race with other cpus setting cil_pcpmask. However, we've
* atomically cleared PCP_SPACE which forces other threads to add to
* the global space used count. cil_pcpmask is a superset of cilpcp
* structures that could have a nonzero space_used.
*/
for_each_cpu(cpu, &ctx->cil_pcpmask) {
int old, prev;
cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
@ -554,6 +560,7 @@ xlog_cil_insert_items(
int iovhdr_res = 0, split_res = 0, ctx_res = 0;
int space_used;
int order;
unsigned int cpu_nr;
struct xlog_cil_pcp *cilpcp;
ASSERT(tp);
@ -577,7 +584,12 @@ xlog_cil_insert_items(
* can't be scheduled away between split sample/update operations that
* are done without outside locking to serialise them.
*/
cilpcp = get_cpu_ptr(cil->xc_pcp);
cpu_nr = get_cpu();
cilpcp = this_cpu_ptr(cil->xc_pcp);
/* Tell the future push that there was work added by this CPU. */
if (!cpumask_test_cpu(cpu_nr, &ctx->cil_pcpmask))
cpumask_test_and_set_cpu(cpu_nr, &ctx->cil_pcpmask);
/*
* We need to take the CIL checkpoint unit reservation on the first
@ -663,7 +675,7 @@ xlog_cil_insert_items(
continue;
list_add_tail(&lip->li_cil, &cilpcp->log_items);
}
put_cpu_ptr(cilpcp);
put_cpu();
/*
* If we've overrun the reservation, dump the tx details before we move
@ -1790,38 +1802,6 @@ out_shutdown:
return 0;
}
/*
* Move dead percpu state to the relevant CIL context structures.
*
* We have to lock the CIL context here to ensure that nothing is modifying
* the percpu state, either addition or removal. Both of these are done under
* the CIL context lock, so grabbing that exclusively here will ensure we can
* safely drain the cilpcp for the CPU that is dying.
*/
void
xlog_cil_pcp_dead(
struct xlog *log,
unsigned int cpu)
{
struct xfs_cil *cil = log->l_cilp;
struct xlog_cil_pcp *cilpcp = per_cpu_ptr(cil->xc_pcp, cpu);
struct xfs_cil_ctx *ctx;
down_write(&cil->xc_ctx_lock);
ctx = cil->xc_ctx;
if (ctx->ticket)
ctx->ticket->t_curr_res += cilpcp->space_reserved;
cilpcp->space_reserved = 0;
if (!list_empty(&cilpcp->log_items))
list_splice_init(&cilpcp->log_items, &ctx->log_items);
if (!list_empty(&cilpcp->busy_extents))
list_splice_init(&cilpcp->busy_extents, &ctx->busy_extents);
atomic_add(cilpcp->space_used, &ctx->space_used);
cilpcp->space_used = 0;
up_write(&cil->xc_ctx_lock);
}
/*
* Perform initial CIL structure initialisation.
*/

View File

@ -231,6 +231,12 @@ struct xfs_cil_ctx {
struct work_struct discard_endio_work;
struct work_struct push_work;
atomic_t order_id;
/*
* CPUs that could have added items to the percpu CIL data. Access is
* coordinated with xc_ctx_lock.
*/
struct cpumask cil_pcpmask;
};
/*
@ -278,9 +284,6 @@ struct xfs_cil {
wait_queue_head_t xc_push_wait; /* background push throttle */
void __percpu *xc_pcp; /* percpu CIL structures */
#ifdef CONFIG_HOTPLUG_CPU
struct list_head xc_pcp_list;
#endif
} ____cacheline_aligned_in_smp;
/* xc_flags bit values */
@ -705,9 +708,4 @@ xlog_kvmalloc(
return p;
}
/*
* CIL CPU dead notifier
*/
void xlog_cil_pcp_dead(struct xlog *log, unsigned int cpu);
#endif /* __XFS_LOG_PRIV_H__ */

View File

@ -60,6 +60,7 @@ struct xfs_error_cfg {
* Per-cpu deferred inode inactivation GC lists.
*/
struct xfs_inodegc {
struct xfs_mount *mp;
struct llist_head list;
struct delayed_work work;
int error;
@ -67,9 +68,7 @@ struct xfs_inodegc {
/* approximate count of inodes in the list */
unsigned int items;
unsigned int shrinker_hits;
#if defined(DEBUG) || defined(XFS_WARN)
unsigned int cpu;
#endif
};
/*
@ -98,7 +97,6 @@ typedef struct xfs_mount {
xfs_buftarg_t *m_ddev_targp; /* saves taking the address */
xfs_buftarg_t *m_logdev_targp;/* ptr to log device */
xfs_buftarg_t *m_rtdev_targp; /* ptr to rt device */
struct list_head m_mount_list; /* global mount list */
void __percpu *m_inodegc; /* percpu inodegc structures */
/*
@ -249,6 +247,9 @@ typedef struct xfs_mount {
unsigned int *m_errortag;
struct xfs_kobj m_errortag_kobj;
#endif
/* cpus that have inodes queued for inactivation */
struct cpumask m_inodegc_cpumask;
} xfs_mount_t;
#define M_IGEO(mp) (&(mp)->m_ino_geo)

View File

@ -56,28 +56,6 @@ static struct kset *xfs_kset; /* top-level xfs sysfs dir */
static struct xfs_kobj xfs_dbg_kobj; /* global debug sysfs attrs */
#endif
#ifdef CONFIG_HOTPLUG_CPU
static LIST_HEAD(xfs_mount_list);
static DEFINE_SPINLOCK(xfs_mount_list_lock);
static inline void xfs_mount_list_add(struct xfs_mount *mp)
{
spin_lock(&xfs_mount_list_lock);
list_add(&mp->m_mount_list, &xfs_mount_list);
spin_unlock(&xfs_mount_list_lock);
}
static inline void xfs_mount_list_del(struct xfs_mount *mp)
{
spin_lock(&xfs_mount_list_lock);
list_del(&mp->m_mount_list);
spin_unlock(&xfs_mount_list_lock);
}
#else /* !CONFIG_HOTPLUG_CPU */
static inline void xfs_mount_list_add(struct xfs_mount *mp) {}
static inline void xfs_mount_list_del(struct xfs_mount *mp) {}
#endif
enum xfs_dax_mode {
XFS_DAX_INODE = 0,
XFS_DAX_ALWAYS = 1,
@ -1135,9 +1113,8 @@ xfs_inodegc_init_percpu(
for_each_possible_cpu(cpu) {
gc = per_cpu_ptr(mp->m_inodegc, cpu);
#if defined(DEBUG) || defined(XFS_WARN)
gc->cpu = cpu;
#endif
gc->mp = mp;
init_llist_head(&gc->list);
gc->items = 0;
gc->error = 0;
@ -1168,7 +1145,6 @@ xfs_fs_put_super(
xfs_freesb(mp);
xchk_mount_stats_free(mp);
free_percpu(mp->m_stats.xs_stats);
xfs_mount_list_del(mp);
xfs_inodegc_free_percpu(mp);
xfs_destroy_percpu_counters(mp);
xfs_destroy_mount_workqueues(mp);
@ -1577,13 +1553,6 @@ xfs_fs_fill_super(
if (error)
goto out_destroy_counters;
/*
* All percpu data structures requiring cleanup when a cpu goes offline
* must be allocated before adding this @mp to the cpu-dead handler's
* mount list.
*/
xfs_mount_list_add(mp);
/* Allocate stats memory before we do operations that might use it */
mp->m_stats.xs_stats = alloc_percpu(struct xfsstats);
if (!mp->m_stats.xs_stats) {
@ -1781,7 +1750,6 @@ xfs_fs_fill_super(
out_free_stats:
free_percpu(mp->m_stats.xs_stats);
out_destroy_inodegc:
xfs_mount_list_del(mp);
xfs_inodegc_free_percpu(mp);
out_destroy_counters:
xfs_destroy_percpu_counters(mp);
@ -2326,49 +2294,6 @@ xfs_destroy_workqueues(void)
destroy_workqueue(xfs_alloc_wq);
}
#ifdef CONFIG_HOTPLUG_CPU
static int
xfs_cpu_dead(
unsigned int cpu)
{
struct xfs_mount *mp, *n;
spin_lock(&xfs_mount_list_lock);
list_for_each_entry_safe(mp, n, &xfs_mount_list, m_mount_list) {
spin_unlock(&xfs_mount_list_lock);
xfs_inodegc_cpu_dead(mp, cpu);
xlog_cil_pcp_dead(mp->m_log, cpu);
spin_lock(&xfs_mount_list_lock);
}
spin_unlock(&xfs_mount_list_lock);
return 0;
}
static int __init
xfs_cpu_hotplug_init(void)
{
int error;
error = cpuhp_setup_state_nocalls(CPUHP_XFS_DEAD, "xfs:dead", NULL,
xfs_cpu_dead);
if (error < 0)
xfs_alert(NULL,
"Failed to initialise CPU hotplug, error %d. XFS is non-functional.",
error);
return error;
}
static void
xfs_cpu_hotplug_destroy(void)
{
cpuhp_remove_state_nocalls(CPUHP_XFS_DEAD);
}
#else /* !CONFIG_HOTPLUG_CPU */
static inline int xfs_cpu_hotplug_init(void) { return 0; }
static inline void xfs_cpu_hotplug_destroy(void) {}
#endif
STATIC int __init
init_xfs_fs(void)
{
@ -2385,13 +2310,9 @@ init_xfs_fs(void)
xfs_dir_startup();
error = xfs_cpu_hotplug_init();
if (error)
goto out;
error = xfs_init_caches();
if (error)
goto out_destroy_hp;
goto out;
error = xfs_init_workqueues();
if (error)
@ -2475,8 +2396,6 @@ init_xfs_fs(void)
xfs_destroy_workqueues();
out_destroy_caches:
xfs_destroy_caches();
out_destroy_hp:
xfs_cpu_hotplug_destroy();
out:
return error;
}
@ -2500,7 +2419,6 @@ exit_xfs_fs(void)
xfs_destroy_workqueues();
xfs_destroy_caches();
xfs_uuid_table_free();
xfs_cpu_hotplug_destroy();
}
module_init(init_xfs_fs);

View File

@ -90,7 +90,6 @@ enum cpuhp_state {
CPUHP_FS_BUFF_DEAD,
CPUHP_PRINTK_DEAD,
CPUHP_MM_MEMCQ_DEAD,
CPUHP_XFS_DEAD,
CPUHP_PERCPU_CNT_DEAD,
CPUHP_RADIX_DEAD,
CPUHP_PAGE_ALLOC,