/* SPDX-License-Identifier: GPL-2.0 */

#ifndef BTRFS_LRU_CACHE_H
#define BTRFS_LRU_CACHE_H

#include <linux/maple_tree.h>
#include <linux/list.h>

/*
 * A cache entry. This is meant to be embedded in a structure of a user of
 * this module. Similar to how struct list_head and struct rb_node are used.
 *
 * Note: it should be embedded as the first element in a struct (offset 0), and
 * this module assumes it was allocated with kmalloc(), so it calls kfree() when
 * it needs to free an entry.
 */
struct btrfs_lru_cache_entry {
        struct list_head lru_list;
        u64 key;
        /*
         * Optional generation associated to a key. Use 0 if not needed/used.
         * Entries with the same key and different generations are stored in a
         * linked list, so use this only for cases where there's a small number
         * of different generations.
         */
        u64 gen;
        /*
         * The maple tree uses unsigned long type for the keys, which is 32 bits
         * on 32 bits systems, and 64 bits on 64 bits systems. So if we want to
         * use something like inode numbers as keys, which are always a u64, we
         * have to deal with this in a special way - we store the key in the
         * entry itself, as a u64, and the values inserted into the maple tree
         * are linked lists of entries - so in case we are on a 64 bits system,
         * that list always has a single entry, while on 32 bits systems it
         * may have more than one, with each entry having the same value for
         * their lower 32 bits of the u64 key.
         */
        struct list_head list;
};
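/*
 * Illustrative usage sketch, not part of this header's API: a user embeds the
 * entry as the first member of its own structure and allocates that structure
 * with kmalloc(), since the cache frees entries with kfree(). The structure
 * and variable names below are hypothetical.
 *
 *      struct dir_utimes_entry {
 *              struct btrfs_lru_cache_entry entry;     // must be at offset 0
 *              u64 mtime;                              // user data follows
 *      };
 *
 *      struct dir_utimes_entry *de = kmalloc(sizeof(*de), GFP_KERNEL);
 *
 *      if (de) {
 *              de->entry.key = dir_ino;        // e.g. an inode number (u64)
 *              de->entry.gen = 0;              // generation not used here
 *              if (btrfs_lru_cache_store(cache, &de->entry, GFP_KERNEL) < 0)
 *                      kfree(de);              // assume < 0 means not stored
 *      }
 */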
struct btrfs_lru_cache {
        struct list_head lru_list;
        struct maple_tree entries;
        /* Number of entries stored in the cache. */
        unsigned int size;
        /* Maximum number of entries the cache can have. */
        unsigned int max_size;
};
#define btrfs_lru_cache_for_each_entry_safe(cache, entry, tmp)         \
        list_for_each_entry_safe_reverse((entry), (tmp), &(cache)->lru_list, lru_list)
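/*
 * Illustrative use of the iterator above (variable names are hypothetical):
 * visit every cached entry; the _safe variant allows removing the entry that
 * is currently being visited.
 *
 *      struct btrfs_lru_cache_entry *entry;
 *      struct btrfs_lru_cache_entry *tmp;
 *
 *      btrfs_lru_cache_for_each_entry_safe(cache, entry, tmp) {
 *              if (entry->gen != cur_gen)      // hypothetical condition
 *                      btrfs_lru_cache_remove(cache, entry);
 *      }
 */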
static inline unsigned int btrfs_lru_cache_size(const struct btrfs_lru_cache *cache)
{
        return cache->size;
}
static inline struct btrfs_lru_cache_entry *btrfs_lru_cache_lru_entry(
                struct btrfs_lru_cache *cache)
{
        return list_first_entry_or_null(&cache->lru_list,
                                        struct btrfs_lru_cache_entry, lru_list);
}

void btrfs_lru_cache_init(struct btrfs_lru_cache *cache, unsigned int max_size);
struct btrfs_lru_cache_entry *btrfs_lru_cache_lookup(struct btrfs_lru_cache *cache,
                                                     u64 key, u64 gen);
int btrfs_lru_cache_store(struct btrfs_lru_cache *cache,
                          struct btrfs_lru_cache_entry *new_entry,
                          gfp_t gfp);
void btrfs_lru_cache_remove(struct btrfs_lru_cache *cache,
                            struct btrfs_lru_cache_entry *entry);
void btrfs_lru_cache_clear(struct btrfs_lru_cache *cache);
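/*
 * Minimal usage sketch of the API above, with hypothetical names and values.
 * A cache is initialized with a maximum size, entries are looked up by
 * (key, generation) and stored with a GFP mask; based on the fields above,
 * a full cache is expected to evict its least recently used entry on store.
 *
 *      struct btrfs_lru_cache cache;
 *      struct btrfs_lru_cache_entry *entry;
 *
 *      btrfs_lru_cache_init(&cache, 64);       // at most 64 entries
 *
 *      entry = btrfs_lru_cache_lookup(&cache, key, gen);
 *      if (!entry) {
 *              entry = kmalloc(sizeof(*entry), GFP_KERNEL);
 *              if (entry) {
 *                      entry->key = key;
 *                      entry->gen = gen;
 *                      if (btrfs_lru_cache_store(&cache, entry, GFP_KERNEL) < 0)
 *                              kfree(entry);   // assume < 0 means not stored
 *              }
 *      }
 *
 *      btrfs_lru_cache_clear(&cache);          // frees any remaining entries
 */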
#endif