28 hotfixes.
23 are cc:stable and the other 5 address issues which were introduced during this merge cycle. 20 are for MM and the remainder are for other subsystems. -----BEGIN PGP SIGNATURE----- iHUEABYIAB0WIQTTMBEPP41GrTpTJgfdBJ7gKXxAjgUCZDCmIAAKCRDdBJ7gKXxA jhZuAQDn8ErAotUpLn1Pq6WU1liPenGoraBo/a2ubpOjguSINwD+J7L85vgVmA78 YzoKHObW18yBW7JSzpWZ2zw8q2gLQwQ= =a1n7 -----END PGP SIGNATURE----- Merge tag 'mm-hotfixes-stable-2023-04-07-16-23' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm Pull MM fixes from Andrew Morton: "28 hotfixes. 23 are cc:stable and the other five address issues which were introduced during this merge cycle. 20 are for MM and the remainder are for other subsystems" * tag 'mm-hotfixes-stable-2023-04-07-16-23' of git://git.kernel.org/pub/scm/linux/kernel/git/akpm/mm: (28 commits) maple_tree: fix a potential concurrency bug in RCU mode maple_tree: fix get wrong data_end in mtree_lookup_walk() mm/swap: fix swap_info_struct race between swapoff and get_swap_pages() nilfs2: fix sysfs interface lifetime mm: take a page reference when removing device exclusive entries mm: vmalloc: avoid warn_alloc noise caused by fatal signal nilfs2: initialize "struct nilfs_binfo_dat"->bi_pad field nilfs2: fix potential UAF of struct nilfs_sc_info in nilfs_segctor_thread() zsmalloc: document freeable stats zsmalloc: document new fullness grouping fsdax: force clear dirty mark if CoW mm/hugetlb: fix uffd wr-protection for CoW optimization path mm: enable maple tree RCU mode by default maple_tree: add RCU lock checking to rcu callback functions maple_tree: add smp_rmb() to dead node detection maple_tree: fix write memory barrier of nodes once dead for RCU mode maple_tree: remove extra smp_wmb() from mas_dead_leaves() maple_tree: fix freeing of nodes in rcu mode maple_tree: detect dead nodes in mas_start() maple_tree: be more cautious about dead nodes ...
This commit is contained in:
commit
6fda0bb806
2
.mailmap
2
.mailmap
|
@ -265,7 +265,9 @@ Krzysztof Kozlowski <krzk@kernel.org> <k.kozlowski@samsung.com>
|
|||
Krzysztof Kozlowski <krzk@kernel.org> <krzysztof.kozlowski@canonical.com>
|
||||
Kuninori Morimoto <kuninori.morimoto.gx@renesas.com>
|
||||
Kuogee Hsieh <quic_khsieh@quicinc.com> <khsieh@codeaurora.org>
|
||||
Leonard Crestez <leonard.crestez@nxp.com> Leonard Crestez <cdleonard@gmail.com>
|
||||
Leonardo Bras <leobras.c@gmail.com> <leonardo@linux.ibm.com>
|
||||
Leonard Göhrs <l.goehrs@pengutronix.de>
|
||||
Leonid I Ananiev <leonid.i.ananiev@intel.com>
|
||||
Leon Romanovsky <leon@kernel.org> <leon@leon.nu>
|
||||
Leon Romanovsky <leon@kernel.org> <leonro@mellanox.com>
|
||||
|
|
|
@ -39,13 +39,12 @@ With CONFIG_ZSMALLOC_STAT, we could see zsmalloc internal information via
|
|||
|
||||
# cat /sys/kernel/debug/zsmalloc/zram0/classes
|
||||
|
||||
class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage
|
||||
class size 10% 20% 30% 40% 50% 60% 70% 80% 90% 99% 100% obj_allocated obj_used pages_used pages_per_zspage freeable
|
||||
...
|
||||
...
|
||||
9 176 0 1 186 129 8 4
|
||||
10 192 1 0 2880 2872 135 3
|
||||
11 208 0 1 819 795 42 2
|
||||
12 224 0 1 219 159 12 4
|
||||
30 512 0 12 4 1 0 1 0 0 1 0 414 3464 3346 433 1 14
|
||||
31 528 2 7 2 2 1 0 1 0 0 2 117 4154 3793 536 4 44
|
||||
32 544 6 3 4 1 2 1 0 0 0 1 260 4170 3965 556 2 26
|
||||
...
|
||||
...
|
||||
|
||||
|
@ -54,10 +53,28 @@ class
|
|||
index
|
||||
size
|
||||
object size zspage stores
|
||||
almost_empty
|
||||
the number of ZS_ALMOST_EMPTY zspages(see below)
|
||||
almost_full
|
||||
the number of ZS_ALMOST_FULL zspages(see below)
|
||||
10%
|
||||
the number of zspages with usage ratio less than 10% (see below)
|
||||
20%
|
||||
the number of zspages with usage ratio between 10% and 20%
|
||||
30%
|
||||
the number of zspages with usage ratio between 20% and 30%
|
||||
40%
|
||||
the number of zspages with usage ratio between 30% and 40%
|
||||
50%
|
||||
the number of zspages with usage ratio between 40% and 50%
|
||||
60%
|
||||
the number of zspages with usage ratio between 50% and 60%
|
||||
70%
|
||||
the number of zspages with usage ratio between 60% and 70%
|
||||
80%
|
||||
the number of zspages with usage ratio between 70% and 80%
|
||||
90%
|
||||
the number of zspages with usage ratio between 80% and 90%
|
||||
99%
|
||||
the number of zspages with usage ratio between 90% and 99%
|
||||
100%
|
||||
the number of zspages with usage ratio 100%
|
||||
obj_allocated
|
||||
the number of objects allocated
|
||||
obj_used
|
||||
|
@ -66,19 +83,14 @@ pages_used
|
|||
the number of pages allocated for the class
|
||||
pages_per_zspage
|
||||
the number of 0-order pages to make a zspage
|
||||
freeable
|
||||
the approximate number of pages class compaction can free
|
||||
|
||||
We assign a zspage to ZS_ALMOST_EMPTY fullness group when n <= N / f, where
|
||||
|
||||
* n = number of allocated objects
|
||||
* N = total number of objects zspage can store
|
||||
* f = fullness_threshold_frac(ie, 4 at the moment)
|
||||
|
||||
Similarly, we assign zspage to:
|
||||
|
||||
* ZS_ALMOST_FULL when n > N / f
|
||||
* ZS_EMPTY when n == 0
|
||||
* ZS_FULL when n == N
|
||||
|
||||
Each zspage maintains an inuse counter which keeps track of the number of
|
||||
objects stored in the zspage. The inuse counter determines the zspage's
|
||||
"fullness group" which is calculated as the ratio of the "inuse" objects to
|
||||
the total number of objects the zspage can hold (objs_per_zspage). The
|
||||
closer the inuse counter is to objs_per_zspage, the better.
|
||||
|
||||
Internals
|
||||
=========
|
||||
|
@ -94,10 +106,10 @@ of objects that each zspage can store.
|
|||
|
||||
For instance, consider the following size classes:::
|
||||
|
||||
class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable
|
||||
class size 10% .... 100% obj_allocated obj_used pages_used pages_per_zspage freeable
|
||||
...
|
||||
94 1536 0 0 0 0 0 3 0
|
||||
100 1632 0 0 0 0 0 2 0
|
||||
94 1536 0 .... 0 0 0 0 3 0
|
||||
100 1632 0 .... 0 0 0 0 2 0
|
||||
...
|
||||
|
||||
|
||||
|
@ -134,10 +146,11 @@ reduces memory wastage.
|
|||
|
||||
Let's take a closer look at the bottom of `/sys/kernel/debug/zsmalloc/zramX/classes`:::
|
||||
|
||||
class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable
|
||||
class size 10% .... 100% obj_allocated obj_used pages_used pages_per_zspage freeable
|
||||
|
||||
...
|
||||
202 3264 0 0 0 0 0 4 0
|
||||
254 4096 0 0 0 0 0 1 0
|
||||
202 3264 0 .. 0 0 0 0 4 0
|
||||
254 4096 0 .. 0 0 0 0 1 0
|
||||
...
|
||||
|
||||
Size class #202 stores objects of size 3264 bytes and has a maximum of 4 pages
|
||||
|
@ -151,40 +164,42 @@ efficient storage of large objects.
|
|||
|
||||
For zspage chain size of 8, huge class watermark becomes 3632 bytes:::
|
||||
|
||||
class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable
|
||||
class size 10% .... 100% obj_allocated obj_used pages_used pages_per_zspage freeable
|
||||
|
||||
...
|
||||
202 3264 0 0 0 0 0 4 0
|
||||
211 3408 0 0 0 0 0 5 0
|
||||
217 3504 0 0 0 0 0 6 0
|
||||
222 3584 0 0 0 0 0 7 0
|
||||
225 3632 0 0 0 0 0 8 0
|
||||
254 4096 0 0 0 0 0 1 0
|
||||
202 3264 0 .. 0 0 0 0 4 0
|
||||
211 3408 0 .. 0 0 0 0 5 0
|
||||
217 3504 0 .. 0 0 0 0 6 0
|
||||
222 3584 0 .. 0 0 0 0 7 0
|
||||
225 3632 0 .. 0 0 0 0 8 0
|
||||
254 4096 0 .. 0 0 0 0 1 0
|
||||
...
|
||||
|
||||
For zspage chain size of 16, huge class watermark becomes 3840 bytes:::
|
||||
|
||||
class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable
|
||||
class size 10% .... 100% obj_allocated obj_used pages_used pages_per_zspage freeable
|
||||
|
||||
...
|
||||
202 3264 0 0 0 0 0 4 0
|
||||
206 3328 0 0 0 0 0 13 0
|
||||
207 3344 0 0 0 0 0 9 0
|
||||
208 3360 0 0 0 0 0 14 0
|
||||
211 3408 0 0 0 0 0 5 0
|
||||
212 3424 0 0 0 0 0 16 0
|
||||
214 3456 0 0 0 0 0 11 0
|
||||
217 3504 0 0 0 0 0 6 0
|
||||
219 3536 0 0 0 0 0 13 0
|
||||
222 3584 0 0 0 0 0 7 0
|
||||
223 3600 0 0 0 0 0 15 0
|
||||
225 3632 0 0 0 0 0 8 0
|
||||
228 3680 0 0 0 0 0 9 0
|
||||
230 3712 0 0 0 0 0 10 0
|
||||
232 3744 0 0 0 0 0 11 0
|
||||
234 3776 0 0 0 0 0 12 0
|
||||
235 3792 0 0 0 0 0 13 0
|
||||
236 3808 0 0 0 0 0 14 0
|
||||
238 3840 0 0 0 0 0 15 0
|
||||
254 4096 0 0 0 0 0 1 0
|
||||
202 3264 0 .. 0 0 0 0 4 0
|
||||
206 3328 0 .. 0 0 0 0 13 0
|
||||
207 3344 0 .. 0 0 0 0 9 0
|
||||
208 3360 0 .. 0 0 0 0 14 0
|
||||
211 3408 0 .. 0 0 0 0 5 0
|
||||
212 3424 0 .. 0 0 0 0 16 0
|
||||
214 3456 0 .. 0 0 0 0 11 0
|
||||
217 3504 0 .. 0 0 0 0 6 0
|
||||
219 3536 0 .. 0 0 0 0 13 0
|
||||
222 3584 0 .. 0 0 0 0 7 0
|
||||
223 3600 0 .. 0 0 0 0 15 0
|
||||
225 3632 0 .. 0 0 0 0 8 0
|
||||
228 3680 0 .. 0 0 0 0 9 0
|
||||
230 3712 0 .. 0 0 0 0 10 0
|
||||
232 3744 0 .. 0 0 0 0 11 0
|
||||
234 3776 0 .. 0 0 0 0 12 0
|
||||
235 3792 0 .. 0 0 0 0 13 0
|
||||
236 3808 0 .. 0 0 0 0 14 0
|
||||
238 3840 0 .. 0 0 0 0 15 0
|
||||
254 4096 0 .. 0 0 0 0 1 0
|
||||
...
|
||||
|
||||
Overall the combined zspage chain size effect on zsmalloc pool configuration:::
|
||||
|
@ -214,9 +229,10 @@ zram as a build artifacts storage (Linux kernel compilation).
|
|||
|
||||
zsmalloc classes stats:::
|
||||
|
||||
class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable
|
||||
class size 10% .... 100% obj_allocated obj_used pages_used pages_per_zspage freeable
|
||||
|
||||
...
|
||||
Total 13 51 413836 412973 159955 3
|
||||
Total 13 .. 51 413836 412973 159955 3
|
||||
|
||||
zram mm_stat:::
|
||||
|
||||
|
@ -227,9 +243,10 @@ zram as a build artifacts storage (Linux kernel compilation).
|
|||
|
||||
zsmalloc classes stats:::
|
||||
|
||||
class size almost_full almost_empty obj_allocated obj_used pages_used pages_per_zspage freeable
|
||||
class size 10% .... 100% obj_allocated obj_used pages_used pages_per_zspage freeable
|
||||
|
||||
...
|
||||
Total 18 87 414852 412978 156666 0
|
||||
Total 18 .. 87 414852 412978 156666 0
|
||||
|
||||
zram mm_stat:::
|
||||
|
||||
|
|
52
fs/dax.c
52
fs/dax.c
|
@ -781,6 +781,33 @@ out:
|
|||
return ret;
|
||||
}
|
||||
|
||||
static int __dax_clear_dirty_range(struct address_space *mapping,
|
||||
pgoff_t start, pgoff_t end)
|
||||
{
|
||||
XA_STATE(xas, &mapping->i_pages, start);
|
||||
unsigned int scanned = 0;
|
||||
void *entry;
|
||||
|
||||
xas_lock_irq(&xas);
|
||||
xas_for_each(&xas, entry, end) {
|
||||
entry = get_unlocked_entry(&xas, 0);
|
||||
xas_clear_mark(&xas, PAGECACHE_TAG_DIRTY);
|
||||
xas_clear_mark(&xas, PAGECACHE_TAG_TOWRITE);
|
||||
put_unlocked_entry(&xas, entry, WAKE_NEXT);
|
||||
|
||||
if (++scanned % XA_CHECK_SCHED)
|
||||
continue;
|
||||
|
||||
xas_pause(&xas);
|
||||
xas_unlock_irq(&xas);
|
||||
cond_resched();
|
||||
xas_lock_irq(&xas);
|
||||
}
|
||||
xas_unlock_irq(&xas);
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* Delete DAX entry at @index from @mapping. Wait for it
|
||||
* to be unlocked before deleting it.
|
||||
|
@ -1258,15 +1285,20 @@ static s64 dax_unshare_iter(struct iomap_iter *iter)
|
|||
/* don't bother with blocks that are not shared to start with */
|
||||
if (!(iomap->flags & IOMAP_F_SHARED))
|
||||
return length;
|
||||
/* don't bother with holes or unwritten extents */
|
||||
if (srcmap->type == IOMAP_HOLE || srcmap->type == IOMAP_UNWRITTEN)
|
||||
return length;
|
||||
|
||||
id = dax_read_lock();
|
||||
ret = dax_iomap_direct_access(iomap, pos, length, &daddr, NULL);
|
||||
if (ret < 0)
|
||||
goto out_unlock;
|
||||
|
||||
/* zero the destination if srcmap is HOLE or UNWRITTEN */
|
||||
if (srcmap->flags & IOMAP_F_SHARED || srcmap->type == IOMAP_UNWRITTEN) {
|
||||
memset(daddr, 0, length);
|
||||
dax_flush(iomap->dax_dev, daddr, length);
|
||||
ret = length;
|
||||
goto out_unlock;
|
||||
}
|
||||
|
||||
ret = dax_iomap_direct_access(srcmap, pos, length, &saddr, NULL);
|
||||
if (ret < 0)
|
||||
goto out_unlock;
|
||||
|
@ -1435,6 +1467,16 @@ static loff_t dax_iomap_iter(const struct iomap_iter *iomi,
|
|||
* written by write(2) is visible in mmap.
|
||||
*/
|
||||
if (iomap->flags & IOMAP_F_NEW || cow) {
|
||||
/*
|
||||
* Filesystem allows CoW on non-shared extents. The src extents
|
||||
* may have been mmapped with dirty mark before. To be able to
|
||||
* invalidate its dax entries, we need to clear the dirty mark
|
||||
* in advance.
|
||||
*/
|
||||
if (cow)
|
||||
__dax_clear_dirty_range(iomi->inode->i_mapping,
|
||||
pos >> PAGE_SHIFT,
|
||||
(end - 1) >> PAGE_SHIFT);
|
||||
invalidate_inode_pages2_range(iomi->inode->i_mapping,
|
||||
pos >> PAGE_SHIFT,
|
||||
(end - 1) >> PAGE_SHIFT);
|
||||
|
@ -2022,8 +2064,8 @@ int dax_dedupe_file_range_compare(struct inode *src, loff_t srcoff,
|
|||
|
||||
while ((ret = iomap_iter(&src_iter, ops)) > 0 &&
|
||||
(ret = iomap_iter(&dst_iter, ops)) > 0) {
|
||||
compared = dax_range_compare_iter(&src_iter, &dst_iter, len,
|
||||
same);
|
||||
compared = dax_range_compare_iter(&src_iter, &dst_iter,
|
||||
min(src_iter.len, dst_iter.len), same);
|
||||
if (compared < 0)
|
||||
return ret;
|
||||
src_iter.processed = dst_iter.processed = compared;
|
||||
|
|
|
@ -2219,6 +2219,7 @@ static int nilfs_btree_assign_p(struct nilfs_bmap *btree,
|
|||
/* on-disk format */
|
||||
binfo->bi_dat.bi_blkoff = cpu_to_le64(key);
|
||||
binfo->bi_dat.bi_level = level;
|
||||
memset(binfo->bi_dat.bi_pad, 0, sizeof(binfo->bi_dat.bi_pad));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -314,6 +314,7 @@ static int nilfs_direct_assign_p(struct nilfs_bmap *direct,
|
|||
|
||||
binfo->bi_dat.bi_blkoff = cpu_to_le64(key);
|
||||
binfo->bi_dat.bi_level = 0;
|
||||
memset(binfo->bi_dat.bi_pad, 0, sizeof(binfo->bi_dat.bi_pad));
|
||||
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -2609,11 +2609,10 @@ static int nilfs_segctor_thread(void *arg)
|
|||
goto loop;
|
||||
|
||||
end_thread:
|
||||
spin_unlock(&sci->sc_state_lock);
|
||||
|
||||
/* end sync. */
|
||||
sci->sc_task = NULL;
|
||||
wake_up(&sci->sc_wait_task); /* for nilfs_segctor_kill_thread() */
|
||||
spin_unlock(&sci->sc_state_lock);
|
||||
return 0;
|
||||
}
|
||||
|
||||
|
|
|
@ -482,6 +482,7 @@ static void nilfs_put_super(struct super_block *sb)
|
|||
up_write(&nilfs->ns_sem);
|
||||
}
|
||||
|
||||
nilfs_sysfs_delete_device_group(nilfs);
|
||||
iput(nilfs->ns_sufile);
|
||||
iput(nilfs->ns_cpfile);
|
||||
iput(nilfs->ns_dat);
|
||||
|
@ -1105,6 +1106,7 @@ nilfs_fill_super(struct super_block *sb, void *data, int silent)
|
|||
nilfs_put_root(fsroot);
|
||||
|
||||
failed_unload:
|
||||
nilfs_sysfs_delete_device_group(nilfs);
|
||||
iput(nilfs->ns_sufile);
|
||||
iput(nilfs->ns_cpfile);
|
||||
iput(nilfs->ns_dat);
|
||||
|
|
|
@ -87,7 +87,6 @@ void destroy_nilfs(struct the_nilfs *nilfs)
|
|||
{
|
||||
might_sleep();
|
||||
if (nilfs_init(nilfs)) {
|
||||
nilfs_sysfs_delete_device_group(nilfs);
|
||||
brelse(nilfs->ns_sbh[0]);
|
||||
brelse(nilfs->ns_sbh[1]);
|
||||
}
|
||||
|
@ -305,6 +304,10 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
|
|||
goto failed;
|
||||
}
|
||||
|
||||
err = nilfs_sysfs_create_device_group(sb);
|
||||
if (unlikely(err))
|
||||
goto sysfs_error;
|
||||
|
||||
if (valid_fs)
|
||||
goto skip_recovery;
|
||||
|
||||
|
@ -366,6 +369,9 @@ int load_nilfs(struct the_nilfs *nilfs, struct super_block *sb)
|
|||
goto failed;
|
||||
|
||||
failed_unload:
|
||||
nilfs_sysfs_delete_device_group(nilfs);
|
||||
|
||||
sysfs_error:
|
||||
iput(nilfs->ns_cpfile);
|
||||
iput(nilfs->ns_sufile);
|
||||
iput(nilfs->ns_dat);
|
||||
|
@ -697,10 +703,6 @@ int init_nilfs(struct the_nilfs *nilfs, struct super_block *sb, char *data)
|
|||
if (err)
|
||||
goto failed_sbh;
|
||||
|
||||
err = nilfs_sysfs_create_device_group(sb);
|
||||
if (err)
|
||||
goto failed_sbh;
|
||||
|
||||
set_nilfs_init(nilfs);
|
||||
err = 0;
|
||||
out:
|
||||
|
|
|
@ -774,7 +774,8 @@ struct mm_struct {
|
|||
unsigned long cpu_bitmap[];
|
||||
};
|
||||
|
||||
#define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN)
|
||||
#define MM_MT_FLAGS (MT_FLAGS_ALLOC_RANGE | MT_FLAGS_LOCK_EXTERN | \
|
||||
MT_FLAGS_USE_RCU)
|
||||
extern struct mm_struct init_mm;
|
||||
|
||||
/* Pointer magic because the dynamic array size confuses some compilers. */
|
||||
|
|
|
@ -617,6 +617,7 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
|
|||
if (retval)
|
||||
goto out;
|
||||
|
||||
mt_clear_in_rcu(vmi.mas.tree);
|
||||
for_each_vma(old_vmi, mpnt) {
|
||||
struct file *file;
|
||||
|
||||
|
@ -700,6 +701,8 @@ static __latent_entropy int dup_mmap(struct mm_struct *mm,
|
|||
retval = arch_dup_mmap(oldmm, mm);
|
||||
loop_out:
|
||||
vma_iter_free(&vmi);
|
||||
if (!retval)
|
||||
mt_set_in_rcu(vmi.mas.tree);
|
||||
out:
|
||||
mmap_write_unlock(mm);
|
||||
flush_tlb_mm(oldmm);
|
||||
|
|
|
@ -1143,7 +1143,7 @@ menu "Scheduler Debugging"
|
|||
|
||||
config SCHED_DEBUG
|
||||
bool "Collect scheduler debugging info"
|
||||
depends on DEBUG_KERNEL && PROC_FS
|
||||
depends on DEBUG_KERNEL && DEBUG_FS
|
||||
default y
|
||||
help
|
||||
If you say Y here, the /sys/kernel/debug/sched file will be provided
|
||||
|
@ -1392,7 +1392,7 @@ config LOCKDEP_STACK_TRACE_HASH_BITS
|
|||
range 10 30
|
||||
default 14
|
||||
help
|
||||
Try increasing this value if you need large MAX_STACK_TRACE_ENTRIES.
|
||||
Try increasing this value if you need large STACK_TRACE_HASH_SIZE.
|
||||
|
||||
config LOCKDEP_CIRCULAR_QUEUE_BITS
|
||||
int "Bitsize for elements in circular_queue struct"
|
||||
|
|
285
lib/maple_tree.c
285
lib/maple_tree.c
|
@ -185,7 +185,7 @@ static void mt_free_rcu(struct rcu_head *head)
|
|||
*/
|
||||
static void ma_free_rcu(struct maple_node *node)
|
||||
{
|
||||
node->parent = ma_parent_ptr(node);
|
||||
WARN_ON(node->parent != ma_parent_ptr(node));
|
||||
call_rcu(&node->rcu, mt_free_rcu);
|
||||
}
|
||||
|
||||
|
@ -539,11 +539,14 @@ static inline struct maple_node *mte_parent(const struct maple_enode *enode)
|
|||
*/
|
||||
static inline bool ma_dead_node(const struct maple_node *node)
|
||||
{
|
||||
struct maple_node *parent = (void *)((unsigned long)
|
||||
node->parent & ~MAPLE_NODE_MASK);
|
||||
struct maple_node *parent;
|
||||
|
||||
/* Do not reorder reads from the node prior to the parent check */
|
||||
smp_rmb();
|
||||
parent = (void *)((unsigned long) node->parent & ~MAPLE_NODE_MASK);
|
||||
return (parent == node);
|
||||
}
|
||||
|
||||
/*
|
||||
* mte_dead_node() - check if the @enode is dead.
|
||||
* @enode: The encoded maple node
|
||||
|
@ -555,6 +558,8 @@ static inline bool mte_dead_node(const struct maple_enode *enode)
|
|||
struct maple_node *parent, *node;
|
||||
|
||||
node = mte_to_node(enode);
|
||||
/* Do not reorder reads from the node prior to the parent check */
|
||||
smp_rmb();
|
||||
parent = mte_parent(enode);
|
||||
return (parent == node);
|
||||
}
|
||||
|
@ -625,6 +630,8 @@ static inline unsigned int mas_alloc_req(const struct ma_state *mas)
|
|||
* @node - the maple node
|
||||
* @type - the node type
|
||||
*
|
||||
* In the event of a dead node, this array may be %NULL
|
||||
*
|
||||
* Return: A pointer to the maple node pivots
|
||||
*/
|
||||
static inline unsigned long *ma_pivots(struct maple_node *node,
|
||||
|
@ -817,6 +824,11 @@ static inline void *mt_slot(const struct maple_tree *mt,
|
|||
return rcu_dereference_check(slots[offset], mt_locked(mt));
|
||||
}
|
||||
|
||||
static inline void *mt_slot_locked(struct maple_tree *mt, void __rcu **slots,
|
||||
unsigned char offset)
|
||||
{
|
||||
return rcu_dereference_protected(slots[offset], mt_locked(mt));
|
||||
}
|
||||
/*
|
||||
* mas_slot_locked() - Get the slot value when holding the maple tree lock.
|
||||
* @mas: The maple state
|
||||
|
@ -828,7 +840,7 @@ static inline void *mt_slot(const struct maple_tree *mt,
|
|||
static inline void *mas_slot_locked(struct ma_state *mas, void __rcu **slots,
|
||||
unsigned char offset)
|
||||
{
|
||||
return rcu_dereference_protected(slots[offset], mt_locked(mas->tree));
|
||||
return mt_slot_locked(mas->tree, slots, offset);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -899,6 +911,45 @@ static inline void ma_set_meta(struct maple_node *mn, enum maple_type mt,
|
|||
meta->end = end;
|
||||
}
|
||||
|
||||
/*
|
||||
* mt_clear_meta() - clear the metadata information of a node, if it exists
|
||||
* @mt: The maple tree
|
||||
* @mn: The maple node
|
||||
* @type: The maple node type
|
||||
* Zeroes the gap and end fields of the node's metadata. Node types other than
|
||||
* maple_range_64 and maple_arange_64 are left untouched, as is a maple_range_64 node whose last slot may hold a node.
|
||||
*/
|
||||
static inline void mt_clear_meta(struct maple_tree *mt, struct maple_node *mn,
|
||||
enum maple_type type)
|
||||
{
|
||||
struct maple_metadata *meta;
|
||||
unsigned long *pivots;
|
||||
void __rcu **slots;
|
||||
void *next;
|
||||
|
||||
switch (type) {
|
||||
case maple_range_64:
|
||||
pivots = mn->mr64.pivot;
|
||||
if (unlikely(pivots[MAPLE_RANGE64_SLOTS - 2])) {
|
||||
slots = mn->mr64.slot;
|
||||
next = mt_slot_locked(mt, slots,
|
||||
MAPLE_RANGE64_SLOTS - 1);
|
||||
if (unlikely((mte_to_node(next) &&
|
||||
mte_node_type(next))))
|
||||
return; /* no metadata, could be node */
|
||||
}
|
||||
fallthrough;
|
||||
case maple_arange_64:
|
||||
meta = ma_meta(mn, type);
|
||||
break;
|
||||
default:
|
||||
return;
|
||||
}
|
||||
|
||||
meta->gap = 0;
|
||||
meta->end = 0;
|
||||
}
|
||||
|
||||
/*
|
||||
* ma_meta_end() - Get the data end of a node from the metadata
|
||||
* @mn: The maple node
|
||||
|
@ -1096,8 +1147,11 @@ static int mas_ascend(struct ma_state *mas)
|
|||
a_type = mas_parent_enum(mas, p_enode);
|
||||
a_node = mte_parent(p_enode);
|
||||
a_slot = mte_parent_slot(p_enode);
|
||||
pivots = ma_pivots(a_node, a_type);
|
||||
a_enode = mt_mk_node(a_node, a_type);
|
||||
pivots = ma_pivots(a_node, a_type);
|
||||
|
||||
if (unlikely(ma_dead_node(a_node)))
|
||||
return 1;
|
||||
|
||||
if (!set_min && a_slot) {
|
||||
set_min = true;
|
||||
|
@ -1354,12 +1408,16 @@ static inline struct maple_enode *mas_start(struct ma_state *mas)
|
|||
mas->max = ULONG_MAX;
|
||||
mas->depth = 0;
|
||||
|
||||
retry:
|
||||
root = mas_root(mas);
|
||||
/* Tree with nodes */
|
||||
if (likely(xa_is_node(root))) {
|
||||
mas->depth = 1;
|
||||
mas->node = mte_safe_root(root);
|
||||
mas->offset = 0;
|
||||
if (mte_dead_node(mas->node))
|
||||
goto retry;
|
||||
|
||||
return NULL;
|
||||
}
|
||||
|
||||
|
@ -1401,6 +1459,9 @@ static inline unsigned char ma_data_end(struct maple_node *node,
|
|||
{
|
||||
unsigned char offset;
|
||||
|
||||
if (!pivots)
|
||||
return 0;
|
||||
|
||||
if (type == maple_arange_64)
|
||||
return ma_meta_end(node, type);
|
||||
|
||||
|
@ -1436,6 +1497,9 @@ static inline unsigned char mas_data_end(struct ma_state *mas)
|
|||
return ma_meta_end(node, type);
|
||||
|
||||
pivots = ma_pivots(node, type);
|
||||
if (unlikely(ma_dead_node(node)))
|
||||
return 0;
|
||||
|
||||
offset = mt_pivots[type] - 1;
|
||||
if (likely(!pivots[offset]))
|
||||
return ma_meta_end(node, type);
|
||||
|
@ -1724,8 +1788,10 @@ static inline void mas_replace(struct ma_state *mas, bool advanced)
|
|||
rcu_assign_pointer(slots[offset], mas->node);
|
||||
}
|
||||
|
||||
if (!advanced)
|
||||
if (!advanced) {
|
||||
mte_set_node_dead(old_enode);
|
||||
mas_free(mas, old_enode);
|
||||
}
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -3659,10 +3725,9 @@ static inline int mas_root_expand(struct ma_state *mas, void *entry)
|
|||
slot++;
|
||||
mas->depth = 1;
|
||||
mas_set_height(mas);
|
||||
|
||||
ma_set_meta(node, maple_leaf_64, 0, slot);
|
||||
/* swap the new root into the tree */
|
||||
rcu_assign_pointer(mas->tree->ma_root, mte_mk_root(mas->node));
|
||||
ma_set_meta(node, maple_leaf_64, 0, slot);
|
||||
return slot;
|
||||
}
|
||||
|
||||
|
@ -3875,18 +3940,13 @@ static inline void *mtree_lookup_walk(struct ma_state *mas)
|
|||
end = ma_data_end(node, type, pivots, max);
|
||||
if (unlikely(ma_dead_node(node)))
|
||||
goto dead_node;
|
||||
|
||||
if (pivots[offset] >= mas->index)
|
||||
goto next;
|
||||
|
||||
do {
|
||||
offset++;
|
||||
} while ((offset < end) && (pivots[offset] < mas->index));
|
||||
if (pivots[offset] >= mas->index) {
|
||||
max = pivots[offset];
|
||||
break;
|
||||
}
|
||||
} while (++offset < end);
|
||||
|
||||
if (likely(offset > end))
|
||||
max = pivots[offset];
|
||||
|
||||
next:
|
||||
slots = ma_slots(node, type);
|
||||
next = mt_slot(mas->tree, slots, offset);
|
||||
if (unlikely(ma_dead_node(node)))
|
||||
|
@ -4164,6 +4224,7 @@ static inline bool mas_wr_node_store(struct ma_wr_state *wr_mas)
|
|||
done:
|
||||
mas_leaf_set_meta(mas, newnode, dst_pivots, maple_leaf_64, new_end);
|
||||
if (in_rcu) {
|
||||
mte_set_node_dead(mas->node);
|
||||
mas->node = mt_mk_node(newnode, wr_mas->type);
|
||||
mas_replace(mas, false);
|
||||
} else {
|
||||
|
@ -4505,6 +4566,9 @@ static inline int mas_prev_node(struct ma_state *mas, unsigned long min)
|
|||
node = mas_mn(mas);
|
||||
slots = ma_slots(node, mt);
|
||||
pivots = ma_pivots(node, mt);
|
||||
if (unlikely(ma_dead_node(node)))
|
||||
return 1;
|
||||
|
||||
mas->max = pivots[offset];
|
||||
if (offset)
|
||||
mas->min = pivots[offset - 1] + 1;
|
||||
|
@ -4526,6 +4590,9 @@ static inline int mas_prev_node(struct ma_state *mas, unsigned long min)
|
|||
slots = ma_slots(node, mt);
|
||||
pivots = ma_pivots(node, mt);
|
||||
offset = ma_data_end(node, mt, pivots, mas->max);
|
||||
if (unlikely(ma_dead_node(node)))
|
||||
return 1;
|
||||
|
||||
if (offset)
|
||||
mas->min = pivots[offset - 1] + 1;
|
||||
|
||||
|
@ -4574,6 +4641,7 @@ static inline int mas_next_node(struct ma_state *mas, struct maple_node *node,
|
|||
struct maple_enode *enode;
|
||||
int level = 0;
|
||||
unsigned char offset;
|
||||
unsigned char node_end;
|
||||
enum maple_type mt;
|
||||
void __rcu **slots;
|
||||
|
||||
|
@ -4597,7 +4665,11 @@ static inline int mas_next_node(struct ma_state *mas, struct maple_node *node,
|
|||
node = mas_mn(mas);
|
||||
mt = mte_node_type(mas->node);
|
||||
pivots = ma_pivots(node, mt);
|
||||
} while (unlikely(offset == ma_data_end(node, mt, pivots, mas->max)));
|
||||
node_end = ma_data_end(node, mt, pivots, mas->max);
|
||||
if (unlikely(ma_dead_node(node)))
|
||||
return 1;
|
||||
|
||||
} while (unlikely(offset == node_end));
|
||||
|
||||
slots = ma_slots(node, mt);
|
||||
pivot = mas_safe_pivot(mas, pivots, ++offset, mt);
|
||||
|
@ -4613,6 +4685,9 @@ static inline int mas_next_node(struct ma_state *mas, struct maple_node *node,
|
|||
mt = mte_node_type(mas->node);
|
||||
slots = ma_slots(node, mt);
|
||||
pivots = ma_pivots(node, mt);
|
||||
if (unlikely(ma_dead_node(node)))
|
||||
return 1;
|
||||
|
||||
offset = 0;
|
||||
pivot = pivots[0];
|
||||
}
|
||||
|
@ -4659,11 +4734,14 @@ static inline void *mas_next_nentry(struct ma_state *mas,
|
|||
return NULL;
|
||||
}
|
||||
|
||||
pivots = ma_pivots(node, type);
|
||||
slots = ma_slots(node, type);
|
||||
mas->index = mas_safe_min(mas, pivots, mas->offset);
|
||||
pivots = ma_pivots(node, type);
|
||||
count = ma_data_end(node, type, pivots, mas->max);
|
||||
if (ma_dead_node(node))
|
||||
if (unlikely(ma_dead_node(node)))
|
||||
return NULL;
|
||||
|
||||
mas->index = mas_safe_min(mas, pivots, mas->offset);
|
||||
if (unlikely(ma_dead_node(node)))
|
||||
return NULL;
|
||||
|
||||
if (mas->index > max)
|
||||
|
@ -4817,6 +4895,11 @@ retry:
|
|||
|
||||
slots = ma_slots(mn, mt);
|
||||
pivots = ma_pivots(mn, mt);
|
||||
if (unlikely(ma_dead_node(mn))) {
|
||||
mas_rewalk(mas, index);
|
||||
goto retry;
|
||||
}
|
||||
|
||||
if (offset == mt_pivots[mt])
|
||||
pivot = mas->max;
|
||||
else
|
||||
|
@ -5400,24 +5483,26 @@ no_gap:
|
|||
}
|
||||
|
||||
/*
|
||||
* mas_dead_leaves() - Mark all leaves of a node as dead.
|
||||
* mte_dead_leaves() - Mark all leaves of a node as dead.
|
||||
* @mas: The maple state
|
||||
* @slots: Pointer to the slot array
|
||||
* @type: The maple node type
|
||||
*
|
||||
* Must hold the write lock.
|
||||
*
|
||||
* Return: The number of leaves marked as dead.
|
||||
*/
|
||||
static inline
|
||||
unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots)
|
||||
unsigned char mte_dead_leaves(struct maple_enode *enode, struct maple_tree *mt,
|
||||
void __rcu **slots)
|
||||
{
|
||||
struct maple_node *node;
|
||||
enum maple_type type;
|
||||
void *entry;
|
||||
int offset;
|
||||
|
||||
for (offset = 0; offset < mt_slot_count(mas->node); offset++) {
|
||||
entry = mas_slot_locked(mas, slots, offset);
|
||||
for (offset = 0; offset < mt_slot_count(enode); offset++) {
|
||||
entry = mt_slot(mt, slots, offset);
|
||||
type = mte_node_type(entry);
|
||||
node = mte_to_node(entry);
|
||||
/* Use both node and type to catch LE & BE metadata */
|
||||
|
@ -5425,7 +5510,6 @@ unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots)
|
|||
break;
|
||||
|
||||
mte_set_node_dead(entry);
|
||||
smp_wmb(); /* Needed for RCU */
|
||||
node->type = type;
|
||||
rcu_assign_pointer(slots[offset], node);
|
||||
}
|
||||
|
@ -5433,151 +5517,160 @@ unsigned char mas_dead_leaves(struct ma_state *mas, void __rcu **slots)
|
|||
return offset;
|
||||
}
|
||||
|
||||
static void __rcu **mas_dead_walk(struct ma_state *mas, unsigned char offset)
|
||||
/**
|
||||
* mte_dead_walk() - Walk down a dead tree to just before the leaves
|
||||
* @enode: The maple encoded node
|
||||
* @offset: The starting offset
|
||||
*
|
||||
* Note: This can only be used from the RCU callback context.
|
||||
*/
|
||||
static void __rcu **mte_dead_walk(struct maple_enode **enode, unsigned char offset)
|
||||
{
|
||||
struct maple_node *node, *next;
|
||||
void __rcu **slots = NULL;
|
||||
|
||||
next = mas_mn(mas);
|
||||
next = mte_to_node(*enode);
|
||||
do {
|
||||
mas->node = ma_enode_ptr(next);
|
||||
node = mas_mn(mas);
|
||||
*enode = ma_enode_ptr(next);
|
||||
node = mte_to_node(*enode);
|
||||
slots = ma_slots(node, node->type);
|
||||
next = mas_slot_locked(mas, slots, offset);
|
||||
next = rcu_dereference_protected(slots[offset],
|
||||
lock_is_held(&rcu_callback_map));
|
||||
offset = 0;
|
||||
} while (!ma_is_leaf(next->type));
|
||||
|
||||
return slots;
|
||||
}
|
||||
|
||||
/**
|
||||
* mt_free_walk() - Walk & free a tree in the RCU callback context
|
||||
* @head: The RCU head that's within the node.
|
||||
*
|
||||
* Note: This can only be used from the RCU callback context.
|
||||
*/
|
||||
static void mt_free_walk(struct rcu_head *head)
|
||||
{
|
||||
void __rcu **slots;
|
||||
struct maple_node *node, *start;
|
||||
struct maple_tree mt;
|
||||
struct maple_enode *enode;
|
||||
unsigned char offset;
|
||||
enum maple_type type;
|
||||
MA_STATE(mas, &mt, 0, 0);
|
||||
|
||||
node = container_of(head, struct maple_node, rcu);
|
||||
|
||||
if (ma_is_leaf(node->type))
|
||||
goto free_leaf;
|
||||
|
||||
mt_init_flags(&mt, node->ma_flags);
|
||||
mas_lock(&mas);
|
||||
start = node;
|
||||
mas.node = mt_mk_node(node, node->type);
|
||||
slots = mas_dead_walk(&mas, 0);
|
||||
node = mas_mn(&mas);
|
||||
enode = mt_mk_node(node, node->type);
|
||||
slots = mte_dead_walk(&enode, 0);
|
||||
node = mte_to_node(enode);
|
||||
do {
|
||||
mt_free_bulk(node->slot_len, slots);
|
||||
offset = node->parent_slot + 1;
|
||||
mas.node = node->piv_parent;
|
||||
if (mas_mn(&mas) == node)
|
||||
goto start_slots_free;
|
||||
enode = node->piv_parent;
|
||||
if (mte_to_node(enode) == node)
|
||||
goto free_leaf;
|
||||
|
||||
type = mte_node_type(mas.node);
|
||||
slots = ma_slots(mte_to_node(mas.node), type);
|
||||
if ((offset < mt_slots[type]) && (slots[offset]))
|
||||
slots = mas_dead_walk(&mas, offset);
|
||||
|
||||
node = mas_mn(&mas);
|
||||
type = mte_node_type(enode);
|
||||
slots = ma_slots(mte_to_node(enode), type);
|
||||
if ((offset < mt_slots[type]) &&
|
||||
rcu_dereference_protected(slots[offset],
|
||||
lock_is_held(&rcu_callback_map)))
|
||||
slots = mte_dead_walk(&enode, offset);
|
||||
node = mte_to_node(enode);
|
||||
} while ((node != start) || (node->slot_len < offset));
|
||||
|
||||
slots = ma_slots(node, node->type);
|
||||
mt_free_bulk(node->slot_len, slots);
|
||||
|
||||
start_slots_free:
|
||||
mas_unlock(&mas);
|
||||
free_leaf:
|
||||
mt_free_rcu(&node->rcu);
|
||||
}
|
||||
|
||||
static inline void __rcu **mas_destroy_descend(struct ma_state *mas,
|
||||
struct maple_enode *prev, unsigned char offset)
|
||||
static inline void __rcu **mte_destroy_descend(struct maple_enode **enode,
|
||||
struct maple_tree *mt, struct maple_enode *prev, unsigned char offset)
|
||||
{
|
||||
struct maple_node *node;
|
||||
struct maple_enode *next = mas->node;
|
||||
struct maple_enode *next = *enode;
|
||||
void __rcu **slots = NULL;
|
||||
enum maple_type type;
|
||||
unsigned char next_offset = 0;
|
||||
|
||||
do {
|
||||
mas->node = next;
|
||||
node = mas_mn(mas);
|
||||
slots = ma_slots(node, mte_node_type(mas->node));
|
||||
next = mas_slot_locked(mas, slots, 0);
|
||||
*enode = next;
|
||||
node = mte_to_node(*enode);
|
||||
type = mte_node_type(*enode);
|
||||
slots = ma_slots(node, type);
|
||||
next = mt_slot_locked(mt, slots, next_offset);
|
||||
if ((mte_dead_node(next)))
|
||||
next = mas_slot_locked(mas, slots, 1);
|
||||
next = mt_slot_locked(mt, slots, ++next_offset);
|
||||
|
||||
mte_set_node_dead(mas->node);
|
||||
node->type = mte_node_type(mas->node);
|
||||
mte_set_node_dead(*enode);
|
||||
node->type = type;
|
||||
node->piv_parent = prev;
|
||||
node->parent_slot = offset;
|
||||
offset = 0;
|
||||
prev = mas->node;
|
||||
offset = next_offset;
|
||||
next_offset = 0;
|
||||
prev = *enode;
|
||||
} while (!mte_is_leaf(next));
|
||||
|
||||
return slots;
|
||||
}
|
||||
|
||||
static void mt_destroy_walk(struct maple_enode *enode, unsigned char ma_flags,
|
||||
static void mt_destroy_walk(struct maple_enode *enode, struct maple_tree *mt,
|
||||
bool free)
|
||||
{
|
||||
void __rcu **slots;
|
||||
struct maple_node *node = mte_to_node(enode);
|
||||
struct maple_enode *start;
|
||||
struct maple_tree mt;
|
||||
|
||||
MA_STATE(mas, &mt, 0, 0);
|
||||
|
||||
if (mte_is_leaf(enode))
|
||||
if (mte_is_leaf(enode)) {
|
||||
node->type = mte_node_type(enode);
|
||||
goto free_leaf;
|
||||
}
|
||||
|
||||
mt_init_flags(&mt, ma_flags);
|
||||
mas_lock(&mas);
|
||||
|
||||
mas.node = start = enode;
|
||||
slots = mas_destroy_descend(&mas, start, 0);
|
||||
node = mas_mn(&mas);
|
||||
start = enode;
|
||||
slots = mte_destroy_descend(&enode, mt, start, 0);
|
||||
node = mte_to_node(enode); // Updated in the above call.
|
||||
do {
|
||||
enum maple_type type;
|
||||
unsigned char offset;
|
||||
struct maple_enode *parent, *tmp;
|
||||
|
||||
node->slot_len = mas_dead_leaves(&mas, slots);
|
||||
node->slot_len = mte_dead_leaves(enode, mt, slots);
|
||||
if (free)
|
||||
mt_free_bulk(node->slot_len, slots);
|
||||
offset = node->parent_slot + 1;
|
||||
mas.node = node->piv_parent;
|
||||
if (mas_mn(&mas) == node)
|
||||
goto start_slots_free;
|
||||
enode = node->piv_parent;
|
||||
if (mte_to_node(enode) == node)
|
||||
goto free_leaf;
|
||||
|
||||
type = mte_node_type(mas.node);
|
||||
slots = ma_slots(mte_to_node(mas.node), type);
|
||||
type = mte_node_type(enode);
|
||||
slots = ma_slots(mte_to_node(enode), type);
|
||||
if (offset >= mt_slots[type])
|
||||
goto next;
|
||||
|
||||
tmp = mas_slot_locked(&mas, slots, offset);
|
||||
tmp = mt_slot_locked(mt, slots, offset);
|
||||
if (mte_node_type(tmp) && mte_to_node(tmp)) {
|
||||
parent = mas.node;
|
||||
mas.node = tmp;
|
||||
slots = mas_destroy_descend(&mas, parent, offset);
|
||||
parent = enode;
|
||||
enode = tmp;
|
||||
slots = mte_destroy_descend(&enode, mt, parent, offset);
|
||||
}
|
||||
next:
|
||||
node = mas_mn(&mas);
|
||||
} while (start != mas.node);
|
||||
node = mte_to_node(enode);
|
||||
} while (start != enode);
|
||||
|
||||
node = mas_mn(&mas);
|
||||
node->slot_len = mas_dead_leaves(&mas, slots);
|
||||
node = mte_to_node(enode);
|
||||
node->slot_len = mte_dead_leaves(enode, mt, slots);
|
||||
if (free)
|
||||
mt_free_bulk(node->slot_len, slots);
|
||||
|
||||
start_slots_free:
|
||||
mas_unlock(&mas);
|
||||
|
||||
free_leaf:
|
||||
if (free)
|
||||
mt_free_rcu(&node->rcu);
|
||||
else
|
||||
mt_clear_meta(mt, node, node->type);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -5593,10 +5686,10 @@ static inline void mte_destroy_walk(struct maple_enode *enode,
|
|||
struct maple_node *node = mte_to_node(enode);
|
||||
|
||||
if (mt_in_rcu(mt)) {
|
||||
mt_destroy_walk(enode, mt->ma_flags, false);
|
||||
mt_destroy_walk(enode, mt, false);
|
||||
call_rcu(&node->rcu, mt_free_walk);
|
||||
} else {
|
||||
mt_destroy_walk(enode, mt->ma_flags, true);
|
||||
mt_destroy_walk(enode, mt, true);
|
||||
}
|
||||
}
|
||||
|
||||
|
@ -6617,11 +6710,11 @@ static inline void *mas_first_entry(struct ma_state *mas, struct maple_node *mn,
|
|||
while (likely(!ma_is_leaf(mt))) {
|
||||
MT_BUG_ON(mas->tree, mte_dead_node(mas->node));
|
||||
slots = ma_slots(mn, mt);
|
||||
pivots = ma_pivots(mn, mt);
|
||||
max = pivots[0];
|
||||
entry = mas_slot(mas, slots, 0);
|
||||
pivots = ma_pivots(mn, mt);
|
||||
if (unlikely(ma_dead_node(mn)))
|
||||
return NULL;
|
||||
max = pivots[0];
|
||||
mas->node = entry;
|
||||
mn = mas_mn(mas);
|
||||
mt = mte_node_type(mas->node);
|
||||
|
@ -6641,13 +6734,13 @@ static inline void *mas_first_entry(struct ma_state *mas, struct maple_node *mn,
|
|||
if (likely(entry))
|
||||
return entry;
|
||||
|
||||
pivots = ma_pivots(mn, mt);
|
||||
mas->index = pivots[0] + 1;
|
||||
mas->offset = 1;
|
||||
entry = mas_slot(mas, slots, 1);
|
||||
pivots = ma_pivots(mn, mt);
|
||||
if (unlikely(ma_dead_node(mn)))
|
||||
return NULL;
|
||||
|
||||
mas->index = pivots[0] + 1;
|
||||
if (mas->index > limit)
|
||||
goto none;
|
||||
|
||||
|
|
14
mm/hugetlb.c
14
mm/hugetlb.c
|
@ -5478,7 +5478,7 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
|
|||
struct folio *pagecache_folio, spinlock_t *ptl)
|
||||
{
|
||||
const bool unshare = flags & FAULT_FLAG_UNSHARE;
|
||||
pte_t pte;
|
||||
pte_t pte = huge_ptep_get(ptep);
|
||||
struct hstate *h = hstate_vma(vma);
|
||||
struct page *old_page;
|
||||
struct folio *new_folio;
|
||||
|
@ -5487,6 +5487,17 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
|
|||
unsigned long haddr = address & huge_page_mask(h);
|
||||
struct mmu_notifier_range range;
|
||||
|
||||
/*
|
||||
* Never handle CoW for uffd-wp protected pages. It should be only
|
||||
* handled when the uffd-wp protection is removed.
|
||||
*
|
||||
* Note that only the CoW optimization path (in hugetlb_no_page())
|
||||
* can trigger this, because hugetlb_fault() will always resolve
|
||||
* uffd-wp bit first.
|
||||
*/
|
||||
if (!unshare && huge_pte_uffd_wp(pte))
|
||||
return 0;
|
||||
|
||||
/*
|
||||
* hugetlb does not support FOLL_FORCE-style write faults that keep the
|
||||
* PTE mapped R/O such as maybe_mkwrite() would do.
|
||||
|
@ -5500,7 +5511,6 @@ static vm_fault_t hugetlb_wp(struct mm_struct *mm, struct vm_area_struct *vma,
|
|||
return 0;
|
||||
}
|
||||
|
||||
pte = huge_ptep_get(ptep);
|
||||
old_page = pte_page(pte);
|
||||
|
||||
delayacct_wpcopy_start();
|
||||
|
|
|
@ -556,15 +556,11 @@ static unsigned long kfence_init_pool(void)
|
|||
* enters __slab_free() slow-path.
|
||||
*/
|
||||
for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
|
||||
struct slab *slab = page_slab(&pages[i]);
|
||||
struct slab *slab = page_slab(nth_page(pages, i));
|
||||
|
||||
if (!i || (i % 2))
|
||||
continue;
|
||||
|
||||
/* Verify we do not have a compound head page. */
|
||||
if (WARN_ON(compound_head(&pages[i]) != &pages[i]))
|
||||
return addr;
|
||||
|
||||
__folio_set_slab(slab_folio(slab));
|
||||
#ifdef CONFIG_MEMCG
|
||||
slab->memcg_data = (unsigned long)&kfence_metadata[i / 2 - 1].objcg |
|
||||
|
@ -597,12 +593,26 @@ static unsigned long kfence_init_pool(void)
|
|||
|
||||
/* Protect the right redzone. */
|
||||
if (unlikely(!kfence_protect(addr + PAGE_SIZE)))
|
||||
return addr;
|
||||
goto reset_slab;
|
||||
|
||||
addr += 2 * PAGE_SIZE;
|
||||
}
|
||||
|
||||
return 0;
|
||||
|
||||
reset_slab:
|
||||
for (i = 0; i < KFENCE_POOL_SIZE / PAGE_SIZE; i++) {
|
||||
struct slab *slab = page_slab(nth_page(pages, i));
|
||||
|
||||
if (!i || (i % 2))
|
||||
continue;
|
||||
#ifdef CONFIG_MEMCG
|
||||
slab->memcg_data = 0;
|
||||
#endif
|
||||
__folio_clear_slab(slab_folio(slab));
|
||||
}
|
||||
|
||||
return addr;
|
||||
}
|
||||
|
||||
static bool __init kfence_init_pool_early(void)
|
||||
|
@ -632,16 +642,6 @@ static bool __init kfence_init_pool_early(void)
|
|||
* fails for the first page, and therefore expect addr==__kfence_pool in
|
||||
* most failure cases.
|
||||
*/
|
||||
for (char *p = (char *)addr; p < __kfence_pool + KFENCE_POOL_SIZE; p += PAGE_SIZE) {
|
||||
struct slab *slab = virt_to_slab(p);
|
||||
|
||||
if (!slab)
|
||||
continue;
|
||||
#ifdef CONFIG_MEMCG
|
||||
slab->memcg_data = 0;
|
||||
#endif
|
||||
__folio_clear_slab(slab_folio(slab));
|
||||
}
|
||||
memblock_free_late(__pa(addr), KFENCE_POOL_SIZE - (addr - (unsigned long)__kfence_pool));
|
||||
__kfence_pool = NULL;
|
||||
return false;
|
||||
|
|
16
mm/memory.c
16
mm/memory.c
|
@ -3563,8 +3563,21 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
|
|||
struct vm_area_struct *vma = vmf->vma;
|
||||
struct mmu_notifier_range range;
|
||||
|
||||
if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags))
|
||||
/*
|
||||
* We need a reference to lock the folio because we don't hold
|
||||
* the PTL so a racing thread can remove the device-exclusive
|
||||
* entry and unmap it. If the folio is free the entry must
|
||||
* have been removed already. If it happens to have already
|
||||
* been re-allocated after being freed all we do is lock and
|
||||
* unlock it.
|
||||
*/
|
||||
if (!folio_try_get(folio))
|
||||
return 0;
|
||||
|
||||
if (!folio_lock_or_retry(folio, vma->vm_mm, vmf->flags)) {
|
||||
folio_put(folio);
|
||||
return VM_FAULT_RETRY;
|
||||
}
|
||||
mmu_notifier_range_init_owner(&range, MMU_NOTIFY_EXCLUSIVE, 0,
|
||||
vma->vm_mm, vmf->address & PAGE_MASK,
|
||||
(vmf->address & PAGE_MASK) + PAGE_SIZE, NULL);
|
||||
|
@ -3577,6 +3590,7 @@ static vm_fault_t remove_device_exclusive_entry(struct vm_fault *vmf)
|
|||
|
||||
pte_unmap_unlock(vmf->pte, vmf->ptl);
|
||||
folio_unlock(folio);
|
||||
folio_put(folio);
|
||||
|
||||
mmu_notifier_invalidate_range_end(&range);
|
||||
return 0;
|
||||
|
|
|
@ -2277,7 +2277,7 @@ do_vmi_align_munmap(struct vma_iterator *vmi, struct vm_area_struct *vma,
|
|||
int count = 0;
|
||||
int error = -ENOMEM;
|
||||
MA_STATE(mas_detach, &mt_detach, 0, 0);
|
||||
mt_init_flags(&mt_detach, MT_FLAGS_LOCK_EXTERN);
|
||||
mt_init_flags(&mt_detach, vmi->mas.tree->ma_flags & MT_FLAGS_LOCK_MASK);
|
||||
mt_set_external_lock(&mt_detach, &mm->mmap_lock);
|
||||
|
||||
/*
|
||||
|
@ -3037,6 +3037,7 @@ void exit_mmap(struct mm_struct *mm)
|
|||
*/
|
||||
set_bit(MMF_OOM_SKIP, &mm->flags);
|
||||
mmap_write_lock(mm);
|
||||
mt_clear_in_rcu(&mm->mm_mt);
|
||||
free_pgtables(&tlb, &mm->mm_mt, vma, FIRST_USER_ADDRESS,
|
||||
USER_PGTABLES_CEILING);
|
||||
tlb_finish_mmu(&tlb);
|
||||
|
|
|
@ -679,6 +679,7 @@ static void __del_from_avail_list(struct swap_info_struct *p)
|
|||
{
|
||||
int nid;
|
||||
|
||||
assert_spin_locked(&p->lock);
|
||||
for_each_node(nid)
|
||||
plist_del(&p->avail_lists[nid], &swap_avail_heads[nid]);
|
||||
}
|
||||
|
@ -2434,8 +2435,8 @@ SYSCALL_DEFINE1(swapoff, const char __user *, specialfile)
|
|||
spin_unlock(&swap_lock);
|
||||
goto out_dput;
|
||||
}
|
||||
del_from_avail_list(p);
|
||||
spin_lock(&p->lock);
|
||||
del_from_avail_list(p);
|
||||
if (p->prio < 0) {
|
||||
struct swap_info_struct *si = p;
|
||||
int nid;
|
||||
|
|
|
@ -3042,9 +3042,11 @@ static void *__vmalloc_area_node(struct vm_struct *area, gfp_t gfp_mask,
|
|||
* allocation request, free them via vfree() if any.
|
||||
*/
|
||||
if (area->nr_pages != nr_small_pages) {
|
||||
warn_alloc(gfp_mask, NULL,
|
||||
"vmalloc error: size %lu, page order %u, failed to allocate pages",
|
||||
area->nr_pages * PAGE_SIZE, page_order);
|
||||
/* vm_area_alloc_pages() can also fail due to a fatal signal */
|
||||
if (!fatal_signal_pending(current))
|
||||
warn_alloc(gfp_mask, NULL,
|
||||
"vmalloc error: size %lu, page order %u, failed to allocate pages",
|
||||
area->nr_pages * PAGE_SIZE, page_order);
|
||||
goto fail;
|
||||
}
|
||||
|
||||
|
|
|
@ -108,6 +108,7 @@ static noinline void check_new_node(struct maple_tree *mt)
|
|||
MT_BUG_ON(mt, mn->slot[1] != NULL);
|
||||
MT_BUG_ON(mt, mas_allocated(&mas) != 0);
|
||||
|
||||
mn->parent = ma_parent_ptr(mn);
|
||||
ma_free_rcu(mn);
|
||||
mas.node = MAS_START;
|
||||
mas_nomem(&mas, GFP_KERNEL);
|
||||
|
@ -160,6 +161,7 @@ static noinline void check_new_node(struct maple_tree *mt)
|
|||
MT_BUG_ON(mt, mas_allocated(&mas) != i);
|
||||
MT_BUG_ON(mt, !mn);
|
||||
MT_BUG_ON(mt, not_empty(mn));
|
||||
mn->parent = ma_parent_ptr(mn);
|
||||
ma_free_rcu(mn);
|
||||
}
|
||||
|
||||
|
@ -192,6 +194,7 @@ static noinline void check_new_node(struct maple_tree *mt)
|
|||
MT_BUG_ON(mt, not_empty(mn));
|
||||
MT_BUG_ON(mt, mas_allocated(&mas) != i - 1);
|
||||
MT_BUG_ON(mt, !mn);
|
||||
mn->parent = ma_parent_ptr(mn);
|
||||
ma_free_rcu(mn);
|
||||
}
|
||||
|
||||
|
@ -210,6 +213,7 @@ static noinline void check_new_node(struct maple_tree *mt)
|
|||
mn = mas_pop_node(&mas);
|
||||
MT_BUG_ON(mt, not_empty(mn));
|
||||
MT_BUG_ON(mt, mas_allocated(&mas) != j - 1);
|
||||
mn->parent = ma_parent_ptr(mn);
|
||||
ma_free_rcu(mn);
|
||||
}
|
||||
MT_BUG_ON(mt, mas_allocated(&mas) != 0);
|
||||
|
@ -233,6 +237,7 @@ static noinline void check_new_node(struct maple_tree *mt)
|
|||
MT_BUG_ON(mt, mas_allocated(&mas) != i - j);
|
||||
mn = mas_pop_node(&mas);
|
||||
MT_BUG_ON(mt, not_empty(mn));
|
||||
mn->parent = ma_parent_ptr(mn);
|
||||
ma_free_rcu(mn);
|
||||
MT_BUG_ON(mt, mas_allocated(&mas) != i - j - 1);
|
||||
}
|
||||
|
@ -269,6 +274,7 @@ static noinline void check_new_node(struct maple_tree *mt)
|
|||
mn = mas_pop_node(&mas); /* get the next node. */
|
||||
MT_BUG_ON(mt, mn == NULL);
|
||||
MT_BUG_ON(mt, not_empty(mn));
|
||||
mn->parent = ma_parent_ptr(mn);
|
||||
ma_free_rcu(mn);
|
||||
}
|
||||
MT_BUG_ON(mt, mas_allocated(&mas) != 0);
|
||||
|
@ -294,6 +300,7 @@ static noinline void check_new_node(struct maple_tree *mt)
|
|||
mn = mas_pop_node(&mas2); /* get the next node. */
|
||||
MT_BUG_ON(mt, mn == NULL);
|
||||
MT_BUG_ON(mt, not_empty(mn));
|
||||
mn->parent = ma_parent_ptr(mn);
|
||||
ma_free_rcu(mn);
|
||||
}
|
||||
MT_BUG_ON(mt, mas_allocated(&mas2) != 0);
|
||||
|
@ -334,10 +341,12 @@ static noinline void check_new_node(struct maple_tree *mt)
|
|||
MT_BUG_ON(mt, mas_allocated(&mas) != MAPLE_ALLOC_SLOTS + 2);
|
||||
mn = mas_pop_node(&mas);
|
||||
MT_BUG_ON(mt, not_empty(mn));
|
||||
mn->parent = ma_parent_ptr(mn);
|
||||
ma_free_rcu(mn);
|
||||
for (i = 1; i <= MAPLE_ALLOC_SLOTS + 1; i++) {
|
||||
mn = mas_pop_node(&mas);
|
||||
MT_BUG_ON(mt, not_empty(mn));
|
||||
mn->parent = ma_parent_ptr(mn);
|
||||
ma_free_rcu(mn);
|
||||
}
|
||||
MT_BUG_ON(mt, mas_allocated(&mas) != 0);
|
||||
|
@ -375,6 +384,7 @@ static noinline void check_new_node(struct maple_tree *mt)
|
|||
mas_node_count(&mas, i); /* Request */
|
||||
mas_nomem(&mas, GFP_KERNEL); /* Fill request */
|
||||
mn = mas_pop_node(&mas); /* get the next node. */
|
||||
mn->parent = ma_parent_ptr(mn);
|
||||
ma_free_rcu(mn);
|
||||
mas_destroy(&mas);
|
||||
|
||||
|
@ -382,10 +392,13 @@ static noinline void check_new_node(struct maple_tree *mt)
|
|||
mas_node_count(&mas, i); /* Request */
|
||||
mas_nomem(&mas, GFP_KERNEL); /* Fill request */
|
||||
mn = mas_pop_node(&mas); /* get the next node. */
|
||||
mn->parent = ma_parent_ptr(mn);
|
||||
ma_free_rcu(mn);
|
||||
mn = mas_pop_node(&mas); /* get the next node. */
|
||||
mn->parent = ma_parent_ptr(mn);
|
||||
ma_free_rcu(mn);
|
||||
mn = mas_pop_node(&mas); /* get the next node. */
|
||||
mn->parent = ma_parent_ptr(mn);
|
||||
ma_free_rcu(mn);
|
||||
mas_destroy(&mas);
|
||||
}
|
||||
|
@ -35369,6 +35382,7 @@ static noinline void check_prealloc(struct maple_tree *mt)
|
|||
MT_BUG_ON(mt, allocated != 1 + height * 3);
|
||||
mn = mas_pop_node(&mas);
|
||||
MT_BUG_ON(mt, mas_allocated(&mas) != allocated - 1);
|
||||
mn->parent = ma_parent_ptr(mn);
|
||||
ma_free_rcu(mn);
|
||||
MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
|
||||
mas_destroy(&mas);
|
||||
|
@ -35386,6 +35400,7 @@ static noinline void check_prealloc(struct maple_tree *mt)
|
|||
mas_destroy(&mas);
|
||||
allocated = mas_allocated(&mas);
|
||||
MT_BUG_ON(mt, allocated != 0);
|
||||
mn->parent = ma_parent_ptr(mn);
|
||||
ma_free_rcu(mn);
|
||||
|
||||
MT_BUG_ON(mt, mas_preallocate(&mas, GFP_KERNEL) != 0);
|
||||
|
@ -35756,6 +35771,7 @@ void farmer_tests(void)
|
|||
tree.ma_root = mt_mk_node(node, maple_leaf_64);
|
||||
mt_dump(&tree);
|
||||
|
||||
node->parent = ma_parent_ptr(node);
|
||||
ma_free_rcu(node);
|
||||
|
||||
/* Check things that will make lockdep angry */
|
||||
|
|
Loading…
Reference in New Issue