2015-09-09 05:58:40 +08:00
|
|
|
#ifndef _LINUX_DAX_H
|
|
|
|
#define _LINUX_DAX_H
|
|
|
|
|
|
|
|
#include <linux/fs.h>
|
|
|
|
#include <linux/mm.h>
|
2016-05-13 00:29:17 +08:00
|
|
|
#include <linux/radix-tree.h>
|
2015-09-09 05:58:40 +08:00
|
|
|
#include <asm/pgtable.h>
|
|
|
|
|
2016-09-19 09:24:49 +08:00
|
|
|
struct iomap_ops;
|
2017-01-25 10:44:18 +08:00
|
|
|
struct dax_device;
|
|
|
|
struct dax_operations {
|
|
|
|
/*
|
|
|
|
* direct_access: translate a device-relative
|
|
|
|
* logical-page-offset into an absolute physical pfn. Return the
|
|
|
|
* number of pages available for DAX at that pfn.
|
|
|
|
*/
|
|
|
|
long (*direct_access)(struct dax_device *, pgoff_t, long,
|
|
|
|
void **, pfn_t *);
|
|
|
|
};
|
2016-09-19 09:24:49 +08:00
|
|
|
|
2017-04-12 00:49:49 +08:00
|
|
|
int dax_read_lock(void);
|
|
|
|
void dax_read_unlock(int id);
|
2017-04-20 06:14:31 +08:00
|
|
|
struct dax_device *dax_get_by_host(const char *host);
|
2017-01-25 15:02:09 +08:00
|
|
|
struct dax_device *alloc_dax(void *private, const char *host,
|
|
|
|
const struct dax_operations *ops);
|
|
|
|
void put_dax(struct dax_device *dax_dev);
|
|
|
|
bool dax_alive(struct dax_device *dax_dev);
|
|
|
|
void kill_dax(struct dax_device *dax_dev);
|
|
|
|
void *dax_get_private(struct dax_device *dax_dev);
|
2017-04-12 00:49:49 +08:00
|
|
|
|
2016-11-08 08:33:35 +08:00
|
|
|
/*
|
dax: add struct iomap based DAX PMD support
DAX PMDs have been disabled since Jan Kara introduced DAX radix tree based
locking. This patch allows DAX PMDs to participate in the DAX radix tree
based locking scheme so that they can be re-enabled using the new struct
iomap based fault handlers.
There are currently three types of DAX 4k entries: 4k zero pages, 4k DAX
mappings that have an associated block allocation, and 4k DAX empty
entries. The empty entries exist to provide locking for the duration of a
given page fault.
This patch adds three equivalent 2MiB DAX entries: Huge Zero Page (HZP)
entries, PMD DAX entries that have associated block allocations, and 2 MiB
DAX empty entries.
Unlike the 4k case where we insert a struct page* into the radix tree for
4k zero pages, for HZP we insert a DAX exceptional entry with the new
RADIX_DAX_HZP flag set. This is because we use a single 2 MiB zero page in
every 2MiB hole mapping, and it doesn't make sense to have that same struct
page* with multiple entries in multiple trees. This would cause contention
on the single page lock for the one Huge Zero Page, and it would break the
page->index and page->mapping associations that are assumed to be valid in
many other places in the kernel.
One difficult use case is when one thread is trying to use 4k entries in
radix tree for a given offset, and another thread is using 2 MiB entries
for that same offset. The current code handles this by making the 2 MiB
user fall back to 4k entries for most cases. This was done because it is
the simplest solution, and because the use of 2MiB pages is already
opportunistic.
If we were to try to upgrade from 4k pages to 2MiB pages for a given range,
we run into the problem of how we lock out 4k page faults for the entire
2MiB range while we clean out the radix tree so we can insert the 2MiB
entry. We can solve this problem if we need to, but I think that the cases
where both 2MiB entries and 4K entries are being used for the same range
will be rare enough and the gain small enough that it probably won't be
worth the complexity.
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-11-08 08:34:45 +08:00
|
|
|
* We use lowest available bit in exceptional entry for locking, one bit for
|
|
|
|
* the entry size (PMD) and two more to tell us if the entry is a huge zero
|
|
|
|
* page (HZP) or an empty entry that is just used for locking. In total four
|
|
|
|
* special bits.
|
|
|
|
*
|
|
|
|
* If the PMD bit isn't set the entry has size PAGE_SIZE, and if the HZP and
|
|
|
|
* EMPTY bits aren't set the entry is a normal DAX entry with a filesystem
|
|
|
|
* block allocation.
|
2016-11-08 08:33:35 +08:00
|
|
|
*/
|
dax: add struct iomap based DAX PMD support
DAX PMDs have been disabled since Jan Kara introduced DAX radix tree based
locking. This patch allows DAX PMDs to participate in the DAX radix tree
based locking scheme so that they can be re-enabled using the new struct
iomap based fault handlers.
There are currently three types of DAX 4k entries: 4k zero pages, 4k DAX
mappings that have an associated block allocation, and 4k DAX empty
entries. The empty entries exist to provide locking for the duration of a
given page fault.
This patch adds three equivalent 2MiB DAX entries: Huge Zero Page (HZP)
entries, PMD DAX entries that have associated block allocations, and 2 MiB
DAX empty entries.
Unlike the 4k case where we insert a struct page* into the radix tree for
4k zero pages, for HZP we insert a DAX exceptional entry with the new
RADIX_DAX_HZP flag set. This is because we use a single 2 MiB zero page in
every 2MiB hole mapping, and it doesn't make sense to have that same struct
page* with multiple entries in multiple trees. This would cause contention
on the single page lock for the one Huge Zero Page, and it would break the
page->index and page->mapping associations that are assumed to be valid in
many other places in the kernel.
One difficult use case is when one thread is trying to use 4k entries in
radix tree for a given offset, and another thread is using 2 MiB entries
for that same offset. The current code handles this by making the 2 MiB
user fall back to 4k entries for most cases. This was done because it is
the simplest solution, and because the use of 2MiB pages is already
opportunistic.
If we were to try to upgrade from 4k pages to 2MiB pages for a given range,
we run into the problem of how we lock out 4k page faults for the entire
2MiB range while we clean out the radix tree so we can insert the 2MiB
entry. We can solve this problem if we need to, but I think that the cases
where both 2MiB entries and 4K entries are being used for the same range
will be rare enough and the gain small enough that it probably won't be
worth the complexity.
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-11-08 08:34:45 +08:00
|
|
|
/*
 * Bit position at which the sector value starts inside a DAX exceptional
 * entry; the four bits directly below it carry the DAX flag bits.
 */
#define RADIX_DAX_SHIFT (RADIX_TREE_EXCEPTIONAL_SHIFT + 4)
|
2016-05-13 00:29:16 +08:00
|
|
|
/* Lock bit: the lowest bit available in an exceptional entry. */
#define RADIX_DAX_ENTRY_LOCK (1 << RADIX_TREE_EXCEPTIONAL_SHIFT)
|
dax: add struct iomap based DAX PMD support
DAX PMDs have been disabled since Jan Kara introduced DAX radix tree based
locking. This patch allows DAX PMDs to participate in the DAX radix tree
based locking scheme so that they can be re-enabled using the new struct
iomap based fault handlers.
There are currently three types of DAX 4k entries: 4k zero pages, 4k DAX
mappings that have an associated block allocation, and 4k DAX empty
entries. The empty entries exist to provide locking for the duration of a
given page fault.
This patch adds three equivalent 2MiB DAX entries: Huge Zero Page (HZP)
entries, PMD DAX entries that have associated block allocations, and 2 MiB
DAX empty entries.
Unlike the 4k case where we insert a struct page* into the radix tree for
4k zero pages, for HZP we insert a DAX exceptional entry with the new
RADIX_DAX_HZP flag set. This is because we use a single 2 MiB zero page in
every 2MiB hole mapping, and it doesn't make sense to have that same struct
page* with multiple entries in multiple trees. This would cause contention
on the single page lock for the one Huge Zero Page, and it would break the
page->index and page->mapping associations that are assumed to be valid in
many other places in the kernel.
One difficult use case is when one thread is trying to use 4k entries in
radix tree for a given offset, and another thread is using 2 MiB entries
for that same offset. The current code handles this by making the 2 MiB
user fall back to 4k entries for most cases. This was done because it is
the simplest solution, and because the use of 2MiB pages is already
opportunistic.
If we were to try to upgrade from 4k pages to 2MiB pages for a given range,
we run into the problem of how we lock out 4k page faults for the entire
2MiB range while we clean out the radix tree so we can insert the 2MiB
entry. We can solve this problem if we need to, but I think that the cases
where both 2MiB entries and 4K entries are being used for the same range
will be rare enough and the gain small enough that it probably won't be
worth the complexity.
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-11-08 08:34:45 +08:00
|
|
|
/* Entry size bit: set for a PMD-sized entry, clear for a PAGE_SIZE entry. */
#define RADIX_DAX_PMD (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 1))
|
|
|
|
/* Set when the entry stands for a huge zero page (HZP). */
#define RADIX_DAX_HZP (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 2))
|
|
|
|
/* Set when the entry is empty and is only used for locking. */
#define RADIX_DAX_EMPTY (1 << (RADIX_TREE_EXCEPTIONAL_SHIFT + 3))
|
2016-11-08 08:33:35 +08:00
|
|
|
|
dax: add struct iomap based DAX PMD support
DAX PMDs have been disabled since Jan Kara introduced DAX radix tree based
locking. This patch allows DAX PMDs to participate in the DAX radix tree
based locking scheme so that they can be re-enabled using the new struct
iomap based fault handlers.
There are currently three types of DAX 4k entries: 4k zero pages, 4k DAX
mappings that have an associated block allocation, and 4k DAX empty
entries. The empty entries exist to provide locking for the duration of a
given page fault.
This patch adds three equivalent 2MiB DAX entries: Huge Zero Page (HZP)
entries, PMD DAX entries that have associated block allocations, and 2 MiB
DAX empty entries.
Unlike the 4k case where we insert a struct page* into the radix tree for
4k zero pages, for HZP we insert a DAX exceptional entry with the new
RADIX_DAX_HZP flag set. This is because we use a single 2 MiB zero page in
every 2MiB hole mapping, and it doesn't make sense to have that same struct
page* with multiple entries in multiple trees. This would cause contention
on the single page lock for the one Huge Zero Page, and it would break the
page->index and page->mapping associations that are assumed to be valid in
many other places in the kernel.
One difficult use case is when one thread is trying to use 4k entries in
radix tree for a given offset, and another thread is using 2 MiB entries
for that same offset. The current code handles this by making the 2 MiB
user fall back to 4k entries for most cases. This was done because it is
the simplest solution, and because the use of 2MiB pages is already
opportunistic.
If we were to try to upgrade from 4k pages to 2MiB pages for a given range,
we run into the problem of how we lock out 4k page faults for the entire
2MiB range while we clean out the radix tree so we can insert the 2MiB
entry. We can solve this problem if we need to, but I think that the cases
where both 2MiB entries and 4K entries are being used for the same range
will be rare enough and the gain small enough that it probably won't be
worth the complexity.
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-11-08 08:34:45 +08:00
|
|
|
static inline unsigned long dax_radix_sector(void *entry)
|
|
|
|
{
|
|
|
|
return (unsigned long)entry >> RADIX_DAX_SHIFT;
|
|
|
|
}
|
|
|
|
|
|
|
|
/*
 * Build a locked DAX exceptional entry: the sector is shifted up by
 * RADIX_DAX_SHIFT and combined with the caller-supplied flags, the
 * exceptional-entry marker and the RADIX_DAX_ENTRY_LOCK bit.
 */
static inline void *dax_radix_locked_entry(sector_t sector, unsigned long flags)
{
	unsigned long value = (unsigned long)sector << RADIX_DAX_SHIFT;

	value |= RADIX_TREE_EXCEPTIONAL_ENTRY;
	value |= flags;
	value |= RADIX_DAX_ENTRY_LOCK;

	return (void *)value;
}
|
2016-05-13 00:29:16 +08:00
|
|
|
|
2016-11-08 08:32:46 +08:00
|
|
|
ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter,
|
2017-01-28 15:20:26 +08:00
|
|
|
const struct iomap_ops *ops);
|
2017-02-25 06:57:08 +08:00
|
|
|
int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size,
|
|
|
|
const struct iomap_ops *ops);
|
2016-05-13 00:29:18 +08:00
|
|
|
int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index);
|
2016-08-10 23:22:44 +08:00
|
|
|
int dax_invalidate_mapping_entry(struct address_space *mapping, pgoff_t index);
|
|
|
|
int dax_invalidate_mapping_entry_sync(struct address_space *mapping,
|
|
|
|
pgoff_t index);
|
2016-05-13 00:29:18 +08:00
|
|
|
void dax_wake_mapping_entry_waiter(struct address_space *mapping,
|
2016-11-08 08:32:20 +08:00
|
|
|
pgoff_t index, void *entry, bool wake_all);
|
2016-01-29 12:25:31 +08:00
|
|
|
|
|
|
|
#ifdef CONFIG_FS_DAX
|
|
|
|
struct page *read_dax_sector(struct block_device *bdev, sector_t n);
|
2016-05-09 16:47:04 +08:00
|
|
|
int __dax_zero_page_range(struct block_device *bdev, sector_t sector,
|
|
|
|
unsigned int offset, unsigned int length);
|
2016-01-29 12:25:31 +08:00
|
|
|
#else
|
|
|
|
static inline struct page *read_dax_sector(struct block_device *bdev,
|
|
|
|
sector_t n)
|
|
|
|
{
|
|
|
|
return ERR_PTR(-ENXIO);
|
|
|
|
}
|
2016-05-09 16:47:04 +08:00
|
|
|
static inline int __dax_zero_page_range(struct block_device *bdev,
|
|
|
|
sector_t sector, unsigned int offset, unsigned int length)
|
|
|
|
{
|
|
|
|
return -ENXIO;
|
|
|
|
}
|
2016-01-29 12:25:31 +08:00
|
|
|
#endif
|
|
|
|
|
dax: add struct iomap based DAX PMD support
DAX PMDs have been disabled since Jan Kara introduced DAX radix tree based
locking. This patch allows DAX PMDs to participate in the DAX radix tree
based locking scheme so that they can be re-enabled using the new struct
iomap based fault handlers.
There are currently three types of DAX 4k entries: 4k zero pages, 4k DAX
mappings that have an associated block allocation, and 4k DAX empty
entries. The empty entries exist to provide locking for the duration of a
given page fault.
This patch adds three equivalent 2MiB DAX entries: Huge Zero Page (HZP)
entries, PMD DAX entries that have associated block allocations, and 2 MiB
DAX empty entries.
Unlike the 4k case where we insert a struct page* into the radix tree for
4k zero pages, for HZP we insert a DAX exceptional entry with the new
RADIX_DAX_HZP flag set. This is because we use a single 2 MiB zero page in
every 2MiB hole mapping, and it doesn't make sense to have that same struct
page* with multiple entries in multiple trees. This would cause contention
on the single page lock for the one Huge Zero Page, and it would break the
page->index and page->mapping associations that are assumed to be valid in
many other places in the kernel.
One difficult use case is when one thread is trying to use 4k entries in
radix tree for a given offset, and another thread is using 2 MiB entries
for that same offset. The current code handles this by making the 2 MiB
user fall back to 4k entries for most cases. This was done because it is
the simplest solution, and because the use of 2MiB pages is already
opportunistic.
If we were to try to upgrade from 4k pages to 2MiB pages for a given range,
we run into the problem of how we lock out 4k page faults for the entire
2MiB range while we clean out the radix tree so we can insert the 2MiB
entry. We can solve this problem if we need to, but I think that the cases
where both 2MiB entries and 4K entries are being used for the same range
will be rare enough and the gain small enough that it probably won't be
worth the complexity.
Signed-off-by: Ross Zwisler <ross.zwisler@linux.intel.com>
Reviewed-by: Jan Kara <jack@suse.cz>
Signed-off-by: Dave Chinner <david@fromorbit.com>
2016-11-08 08:34:45 +08:00
|
|
|
#ifdef CONFIG_FS_DAX_PMD
|
|
|
|
static inline unsigned int dax_radix_order(void *entry)
|
|
|
|
{
|
|
|
|
if ((unsigned long)entry & RADIX_DAX_PMD)
|
|
|
|
return PMD_SHIFT - PAGE_SHIFT;
|
|
|
|
return 0;
|
|
|
|
}
|
|
|
|
#else
|
|
|
|
/*
 * Variant for builds without CONFIG_FS_DAX_PMD: the entry is not
 * inspected and the order is always 0.
 */
static inline unsigned int dax_radix_order(void *entry)
{
	return 0;
}
|
|
|
|
#endif
|
2017-02-25 06:56:41 +08:00
|
|
|
int dax_pfn_mkwrite(struct vm_fault *vmf);
|
2015-09-09 05:58:40 +08:00
|
|
|
|
2015-09-09 05:58:45 +08:00
|
|
|
static inline bool vma_is_dax(struct vm_area_struct *vma)
|
|
|
|
{
|
|
|
|
return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host);
|
|
|
|
}
|
2016-01-23 07:10:40 +08:00
|
|
|
|
|
|
|
static inline bool dax_mapping(struct address_space *mapping)
|
|
|
|
{
|
|
|
|
return mapping->host && IS_DAX(mapping->host);
|
|
|
|
}
|
2016-02-27 07:19:55 +08:00
|
|
|
|
|
|
|
struct writeback_control;
|
|
|
|
int dax_writeback_mapping_range(struct address_space *mapping,
|
|
|
|
struct block_device *bdev, struct writeback_control *wbc);
|
2015-09-09 05:58:40 +08:00
|
|
|
#endif
|