From d4bbe7068b60e9263f08c54e6c2a0166c0f37317 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 22 Jan 2016 15:10:31 -0800 Subject: [PATCH 01/12] dax: fix NULL pointer dereference in __dax_dbg() In __dax_pmd_fault() we currently assume that get_block() will always set bh.b_bdev and we unconditionally dereference it in __dax_dbg(). This assumption isn't always true - when called for reads of holes ext4_dax_mmap_get_block() returns a buffer head where bh->b_bdev is never set. I hit this BUG while testing the DAX PMD fault path. Instead, initialize bh.b_bdev before passing bh into get_block(). It is possible that the filesystem's get_block() will update bh.b_bdev, and this is fine - we just want to initialize bh.b_bdev to something reasonable so that the calls to __dax_dbg() work and print something useful. Signed-off-by: Ross Zwisler Reported-by: Dan Williams Cc: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 1 + 1 file changed, 1 insertion(+) diff --git a/fs/dax.c b/fs/dax.c index 7af879759064..513bba5f1e02 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -624,6 +624,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, } memset(&bh, 0, sizeof(bh)); + bh.b_bdev = inode->i_sb->s_bdev; block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); bh.b_size = PMD_SIZE; From de14b9cb5e02b5daaea139590393af5ccccc4229 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 22 Jan 2016 15:10:34 -0800 Subject: [PATCH 02/12] dax: fix conversion of holes to PMDs When we get a DAX PMD fault for a write it is possible that there could be some number of 4k zero pages already present for the same range that were inserted to service reads from a hole. These 4k zero pages need to be unmapped from the VMAs and removed from the struct address_space radix tree before the real DAX PMD entry can be inserted. For PTE faults this same use case also exists and is handled by a combination of unmap_mapping_range() to unmap the VMAs and delete_from_page_cache() to remove the page from the address_space radix tree. For PMD faults we do have a call to unmap_mapping_range() (protected by a buffer_new() check), but nothing clears out the radix tree entry. The buffer_new() check is also incorrect as the current ext4 and XFS filesystem code will never return a buffer_head with BH_New set, even when allocating new blocks over a hole. Instead the filesystem will zero the blocks manually and return a buffer_head with only BH_Mapped set. Fix this situation by removing the buffer_new() check and adding a call to truncate_inode_pages_range() to clear out the radix tree entries before we insert the DAX PMD. Signed-off-by: Ross Zwisler Reported-by: Dan Williams Tested-by: Dan Williams Reviewed-by: Jan Kara Cc: "H. Peter Anvin" Cc: "J. Bruce Fields" Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Andreas Dilger Cc: Dave Chinner Cc: Ingo Molnar Cc: Jeff Layton Cc: Matthew Wilcox Cc: Thomas Gleixner Cc: Matthew Wilcox Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 513bba5f1e02..5b84a46201c2 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -589,6 +589,7 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, bool write = flags & FAULT_FLAG_WRITE; struct block_device *bdev; pgoff_t size, pgoff; + loff_t lstart, lend; sector_t block; int result = 0; @@ -643,15 +644,13 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, goto fallback; } - /* - * If we allocated new storage, make sure no process has any - * zero pages covering this hole - */ - if (buffer_new(&bh)) { - i_mmap_unlock_read(mapping); - unmap_mapping_range(mapping, pgoff << PAGE_SHIFT, PMD_SIZE, 0); - i_mmap_lock_read(mapping); - } + /* make sure no process has any zero pages covering this hole */ + lstart = pgoff << PAGE_SHIFT; + lend = lstart + PMD_SIZE - 1; /* inclusive */ + i_mmap_unlock_read(mapping); + unmap_mapping_range(mapping, lstart, PMD_SIZE, 0); + truncate_inode_pages_range(mapping, lstart, lend); + i_mmap_lock_read(mapping); /* * If a truncate happened while we were allocating blocks, we may @@ -665,7 +664,8 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, goto out; } if ((pgoff | PG_PMD_COLOUR) >= size) { - dax_pmd_dbg(&bh, address, "pgoff unaligned"); + dax_pmd_dbg(&bh, address, + "offset + huge page size > file size"); goto fallback; } From 3f4a2670deea53e3765e24a7f46aafe6f077cb68 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 22 Jan 2016 15:10:37 -0800 Subject: [PATCH 03/12] pmem: add wb_cache_pmem() to the PMEM API __arch_wb_cache_pmem() was already an internal implementation detail of the x86 PMEM API, but this functionality needs to be exported as part of the general PMEM API to handle the fsync/msync case for DAX mmaps. One thing worth noting is that we really do want this to be part of the PMEM API as opposed to a stand-alone function like clflush_cache_range() because of ordering restrictions. By having wb_cache_pmem() as part of the PMEM API we can leave it unordered, call it multiple times to write back large amounts of memory, and then order the multiple calls with a single wmb_pmem(). Signed-off-by: Ross Zwisler Cc: "H. Peter Anvin" Cc: "J. Bruce Fields" Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Andreas Dilger Cc: Dave Chinner Cc: Ingo Molnar Cc: Jan Kara Cc: Jeff Layton Cc: Matthew Wilcox Cc: Thomas Gleixner Cc: Dan Williams Cc: Matthew Wilcox Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/include/asm/pmem.h | 11 ++++++----- include/linux/pmem.h | 22 +++++++++++++++++++++- 2 files changed, 27 insertions(+), 6 deletions(-) diff --git a/arch/x86/include/asm/pmem.h b/arch/x86/include/asm/pmem.h index 1544fabcd7f9..c57fd1ea9689 100644 --- a/arch/x86/include/asm/pmem.h +++ b/arch/x86/include/asm/pmem.h @@ -67,18 +67,19 @@ static inline void arch_wmb_pmem(void) } /** - * __arch_wb_cache_pmem - write back a cache range with CLWB + * arch_wb_cache_pmem - write back a cache range with CLWB * @vaddr: virtual start address * @size: number of bytes to write back * * Write back a cache range using the CLWB (cache line write back) * instruction. This function requires explicit ordering with an - * arch_wmb_pmem() call. This API is internal to the x86 PMEM implementation. + * arch_wmb_pmem() call. */ -static inline void __arch_wb_cache_pmem(void *vaddr, size_t size) +static inline void arch_wb_cache_pmem(void __pmem *addr, size_t size) { u16 x86_clflush_size = boot_cpu_data.x86_clflush_size; unsigned long clflush_mask = x86_clflush_size - 1; + void *vaddr = (void __force *)addr; void *vend = vaddr + size; void *p; @@ -115,7 +116,7 @@ static inline size_t arch_copy_from_iter_pmem(void __pmem *addr, size_t bytes, len = copy_from_iter_nocache(vaddr, bytes, i); if (__iter_needs_pmem_wb(i)) - __arch_wb_cache_pmem(vaddr, bytes); + arch_wb_cache_pmem(addr, bytes); return len; } @@ -133,7 +134,7 @@ static inline void arch_clear_pmem(void __pmem *addr, size_t size) void *vaddr = (void __force *)addr; memset(vaddr, 0, size); - __arch_wb_cache_pmem(vaddr, size); + arch_wb_cache_pmem(addr, size); } static inline bool __arch_has_wmb_pmem(void) diff --git a/include/linux/pmem.h b/include/linux/pmem.h index acfea8ce4a07..7c3d11a6b4ad 100644 --- a/include/linux/pmem.h +++ b/include/linux/pmem.h @@ -53,12 +53,18 @@ static inline void arch_clear_pmem(void __pmem *addr, size_t size) { BUG(); } + +static inline void arch_wb_cache_pmem(void __pmem *addr, size_t size) +{ + BUG(); +} #endif /* * Architectures that define ARCH_HAS_PMEM_API must provide * implementations for arch_memcpy_to_pmem(), arch_wmb_pmem(), - * arch_copy_from_iter_pmem(), arch_clear_pmem() and arch_has_wmb_pmem(). + * arch_copy_from_iter_pmem(), arch_clear_pmem(), arch_wb_cache_pmem() + * and arch_has_wmb_pmem(). */ static inline void memcpy_from_pmem(void *dst, void __pmem const *src, size_t size) { @@ -178,4 +184,18 @@ static inline void clear_pmem(void __pmem *addr, size_t size) else default_clear_pmem(addr, size); } + +/** + * wb_cache_pmem - write back processor cache for PMEM memory range + * @addr: virtual start address + * @size: number of bytes to write back + * + * Write back the processor cache range starting at 'addr' for 'size' bytes. + * This function requires explicit ordering with a wmb_pmem() call. + */ +static inline void wb_cache_pmem(void __pmem *addr, size_t size) +{ + if (arch_has_pmem_api()) + arch_wb_cache_pmem(addr, size); +} #endif /* __PMEM_H__ */ From f9fe48bece3af2d60e1bad65db4825f5a025dd36 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 22 Jan 2016 15:10:40 -0800 Subject: [PATCH 04/12] dax: support dirty DAX entries in radix tree Add support for tracking dirty DAX entries in the struct address_space radix tree. This tree is already used for dirty page writeback, and it already supports the use of exceptional (non struct page*) entries. In order to properly track dirty DAX pages we will insert new exceptional entries into the radix tree that represent dirty DAX PTE or PMD pages. These exceptional entries will also contain the writeback addresses for the PTE or PMD faults that we can use at fsync/msync time. There are currently two types of exceptional entries (shmem and shadow) that can be placed into the radix tree, and this adds a third. We rely on the fact that only one type of exceptional entry can be found in a given radix tree based on its usage. This happens for free with DAX vs shmem but we explicitly prevent shadow entries from being added to radix trees for DAX mappings. The only shadow entries that would be generated for DAX radix trees would be to track zero page mappings that were created for holes. These pages would receive minimal benefit from having shadow entries, and the choice to have only one type of exceptional entry in a given radix tree makes the logic simpler both in clear_exceptional_entry() and in the rest of DAX. Signed-off-by: Ross Zwisler Cc: "H. Peter Anvin" Cc: "J. Bruce Fields" Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Andreas Dilger Cc: Dave Chinner Cc: Ingo Molnar Cc: Jan Kara Cc: Jeff Layton Cc: Matthew Wilcox Cc: Thomas Gleixner Cc: Dan Williams Cc: Matthew Wilcox Cc: Dave Hansen Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/block_dev.c | 2 +- fs/inode.c | 2 +- include/linux/dax.h | 5 +++ include/linux/fs.h | 3 +- include/linux/radix-tree.h | 9 +++++ mm/filemap.c | 17 ++++++---- mm/truncate.c | 69 +++++++++++++++++++++----------------- mm/vmscan.c | 9 ++++- mm/workingset.c | 4 +-- 9 files changed, 78 insertions(+), 42 deletions(-) diff --git a/fs/block_dev.c b/fs/block_dev.c index ba762ea07f67..60895e500e15 100644 --- a/fs/block_dev.c +++ b/fs/block_dev.c @@ -75,7 +75,7 @@ void kill_bdev(struct block_device *bdev) { struct address_space *mapping = bdev->bd_inode->i_mapping; - if (mapping->nrpages == 0 && mapping->nrshadows == 0) + if (mapping->nrpages == 0 && mapping->nrexceptional == 0) return; invalidate_bh_lrus(); diff --git a/fs/inode.c b/fs/inode.c index e491e54d2430..1e6dd388ba7f 100644 --- a/fs/inode.c +++ b/fs/inode.c @@ -495,7 +495,7 @@ void clear_inode(struct inode *inode) */ spin_lock_irq(&inode->i_data.tree_lock); BUG_ON(inode->i_data.nrpages); - BUG_ON(inode->i_data.nrshadows); + BUG_ON(inode->i_data.nrexceptional); spin_unlock_irq(&inode->i_data.tree_lock); BUG_ON(!list_empty(&inode->i_data.private_list)); BUG_ON(!(inode->i_state & I_FREEING)); diff --git a/include/linux/dax.h b/include/linux/dax.h index b415e521528d..e9d57f680f50 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -36,4 +36,9 @@ static inline bool vma_is_dax(struct vm_area_struct *vma) { return vma->vm_file && IS_DAX(vma->vm_file->f_mapping->host); } + +static inline bool dax_mapping(struct address_space *mapping) +{ + return mapping->host && IS_DAX(mapping->host); +} #endif diff --git a/include/linux/fs.h b/include/linux/fs.h index eb73d74ed992..0d7570320d63 100644 --- a/include/linux/fs.h +++ b/include/linux/fs.h @@ -433,7 +433,8 @@ struct address_space { struct rw_semaphore i_mmap_rwsem; /* protect tree, count, list */ /* Protected by tree_lock together with the radix tree */ unsigned long nrpages; /* number of total pages */ - unsigned long nrshadows; /* number of shadow entries */ + /* number of shadow or DAX exceptional entries */ + unsigned long nrexceptional; pgoff_t writeback_index;/* writeback starts here */ const struct address_space_operations *a_ops; /* methods */ unsigned long flags; /* error bits/gfp mask */ diff --git a/include/linux/radix-tree.h b/include/linux/radix-tree.h index 57e7d87d2d4c..7c88ad156a29 100644 --- a/include/linux/radix-tree.h +++ b/include/linux/radix-tree.h @@ -51,6 +51,15 @@ #define RADIX_TREE_EXCEPTIONAL_ENTRY 2 #define RADIX_TREE_EXCEPTIONAL_SHIFT 2 +#define RADIX_DAX_MASK 0xf +#define RADIX_DAX_SHIFT 4 +#define RADIX_DAX_PTE (0x4 | RADIX_TREE_EXCEPTIONAL_ENTRY) +#define RADIX_DAX_PMD (0x8 | RADIX_TREE_EXCEPTIONAL_ENTRY) +#define RADIX_DAX_TYPE(entry) ((unsigned long)entry & RADIX_DAX_MASK) +#define RADIX_DAX_SECTOR(entry) (((unsigned long)entry >> RADIX_DAX_SHIFT)) +#define RADIX_DAX_ENTRY(sector, pmd) ((void *)((unsigned long)sector << \ + RADIX_DAX_SHIFT | (pmd ? RADIX_DAX_PMD : RADIX_DAX_PTE))) + static inline int radix_tree_is_indirect_ptr(void *ptr) { return (int)((unsigned long)ptr & RADIX_TREE_INDIRECT_PTR); diff --git a/mm/filemap.c b/mm/filemap.c index 847ee43c2806..7b8be78cfd9e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -11,6 +11,7 @@ */ #include #include +#include #include #include #include @@ -123,9 +124,9 @@ static void page_cache_tree_delete(struct address_space *mapping, __radix_tree_lookup(&mapping->page_tree, page->index, &node, &slot); if (shadow) { - mapping->nrshadows++; + mapping->nrexceptional++; /* - * Make sure the nrshadows update is committed before + * Make sure the nrexceptional update is committed before * the nrpages update so that final truncate racing * with reclaim does not see both counters 0 at the * same time and miss a shadow entry. @@ -579,9 +580,13 @@ static int page_cache_tree_insert(struct address_space *mapping, p = radix_tree_deref_slot_protected(slot, &mapping->tree_lock); if (!radix_tree_exceptional_entry(p)) return -EEXIST; + + if (WARN_ON(dax_mapping(mapping))) + return -EINVAL; + if (shadowp) *shadowp = p; - mapping->nrshadows--; + mapping->nrexceptional--; if (node) workingset_node_shadows_dec(node); } @@ -1245,9 +1250,9 @@ repeat: if (radix_tree_deref_retry(page)) goto restart; /* - * A shadow entry of a recently evicted page, - * or a swap entry from shmem/tmpfs. Return - * it without attempting to raise page count. + * A shadow entry of a recently evicted page, a swap + * entry from shmem/tmpfs or a DAX entry. Return it + * without attempting to raise page count. */ goto export; } diff --git a/mm/truncate.c b/mm/truncate.c index 76e35ad97102..e3ee0e27cd17 100644 --- a/mm/truncate.c +++ b/mm/truncate.c @@ -9,6 +9,7 @@ #include #include +#include #include #include #include @@ -34,31 +35,39 @@ static void clear_exceptional_entry(struct address_space *mapping, return; spin_lock_irq(&mapping->tree_lock); - /* - * Regular page slots are stabilized by the page lock even - * without the tree itself locked. These unlocked entries - * need verification under the tree lock. - */ - if (!__radix_tree_lookup(&mapping->page_tree, index, &node, &slot)) - goto unlock; - if (*slot != entry) - goto unlock; - radix_tree_replace_slot(slot, NULL); - mapping->nrshadows--; - if (!node) - goto unlock; - workingset_node_shadows_dec(node); - /* - * Don't track node without shadow entries. - * - * Avoid acquiring the list_lru lock if already untracked. - * The list_empty() test is safe as node->private_list is - * protected by mapping->tree_lock. - */ - if (!workingset_node_shadows(node) && - !list_empty(&node->private_list)) - list_lru_del(&workingset_shadow_nodes, &node->private_list); - __radix_tree_delete_node(&mapping->page_tree, node); + + if (dax_mapping(mapping)) { + if (radix_tree_delete_item(&mapping->page_tree, index, entry)) + mapping->nrexceptional--; + } else { + /* + * Regular page slots are stabilized by the page lock even + * without the tree itself locked. These unlocked entries + * need verification under the tree lock. + */ + if (!__radix_tree_lookup(&mapping->page_tree, index, &node, + &slot)) + goto unlock; + if (*slot != entry) + goto unlock; + radix_tree_replace_slot(slot, NULL); + mapping->nrexceptional--; + if (!node) + goto unlock; + workingset_node_shadows_dec(node); + /* + * Don't track node without shadow entries. + * + * Avoid acquiring the list_lru lock if already untracked. + * The list_empty() test is safe as node->private_list is + * protected by mapping->tree_lock. + */ + if (!workingset_node_shadows(node) && + !list_empty(&node->private_list)) + list_lru_del(&workingset_shadow_nodes, + &node->private_list); + __radix_tree_delete_node(&mapping->page_tree, node); + } unlock: spin_unlock_irq(&mapping->tree_lock); } @@ -228,7 +237,7 @@ void truncate_inode_pages_range(struct address_space *mapping, int i; cleancache_invalidate_inode(mapping); - if (mapping->nrpages == 0 && mapping->nrshadows == 0) + if (mapping->nrpages == 0 && mapping->nrexceptional == 0) return; /* Offsets within partial pages */ @@ -402,7 +411,7 @@ EXPORT_SYMBOL(truncate_inode_pages); */ void truncate_inode_pages_final(struct address_space *mapping) { - unsigned long nrshadows; + unsigned long nrexceptional; unsigned long nrpages; /* @@ -416,14 +425,14 @@ void truncate_inode_pages_final(struct address_space *mapping) /* * When reclaim installs eviction entries, it increases - * nrshadows first, then decreases nrpages. Make sure we see + * nrexceptional first, then decreases nrpages. Make sure we see * this in the right order or we might miss an entry. */ nrpages = mapping->nrpages; smp_rmb(); - nrshadows = mapping->nrshadows; + nrexceptional = mapping->nrexceptional; - if (nrpages || nrshadows) { + if (nrpages || nrexceptional) { /* * As truncation uses a lockless tree lookup, cycle * the tree lock to make sure any ongoing tree diff --git a/mm/vmscan.c b/mm/vmscan.c index bd620b65db52..eb3dd37ccd7c 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -46,6 +46,7 @@ #include #include #include +#include #include #include @@ -671,9 +672,15 @@ static int __remove_mapping(struct address_space *mapping, struct page *page, * inode reclaim needs to empty out the radix tree or * the nodes are lost. Don't plant shadows behind its * back. + * + * We also don't store shadows for DAX mappings because the + * only page cache pages found in these are zero pages + * covering holes, and because we don't want to mix DAX + * exceptional entries and shadow exceptional entries in the + * same page_tree. */ if (reclaimed && page_is_file_cache(page) && - !mapping_exiting(mapping)) + !mapping_exiting(mapping) && !dax_mapping(mapping)) shadow = workingset_eviction(mapping, page); __delete_from_page_cache(page, shadow, memcg); spin_unlock_irqrestore(&mapping->tree_lock, flags); diff --git a/mm/workingset.c b/mm/workingset.c index aa017133744b..61ead9e5549d 100644 --- a/mm/workingset.c +++ b/mm/workingset.c @@ -351,8 +351,8 @@ static enum lru_status shadow_lru_isolate(struct list_head *item, node->slots[i] = NULL; BUG_ON(node->count < (1U << RADIX_TREE_COUNT_SHIFT)); node->count -= 1U << RADIX_TREE_COUNT_SHIFT; - BUG_ON(!mapping->nrshadows); - mapping->nrshadows--; + BUG_ON(!mapping->nrexceptional); + mapping->nrexceptional--; } } BUG_ON(node->count); From 7e7f774984cd88c45c18e7ffaf0256c3e9118043 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 22 Jan 2016 15:10:44 -0800 Subject: [PATCH 05/12] mm: add find_get_entries_tag() Add find_get_entries_tag() to the family of functions that include find_get_entries(), find_get_pages() and find_get_pages_tag(). This is needed for DAX dirty page handling because we need a list of both page offsets and radix tree entries ('indices' and 'entries' in this function) that are marked with the PAGECACHE_TAG_TOWRITE tag. Signed-off-by: Ross Zwisler Reviewed-by: Jan Kara Cc: "H. Peter Anvin" Cc: "J. Bruce Fields" Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Andreas Dilger Cc: Dave Chinner Cc: Ingo Molnar Cc: Jeff Layton Cc: Matthew Wilcox Cc: Thomas Gleixner Cc: Dan Williams Cc: Matthew Wilcox Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/pagemap.h | 3 ++ mm/filemap.c | 68 +++++++++++++++++++++++++++++++++++++++++ 2 files changed, 71 insertions(+) diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h index 4d08b6c33557..92395a0a7dc5 100644 --- a/include/linux/pagemap.h +++ b/include/linux/pagemap.h @@ -361,6 +361,9 @@ unsigned find_get_pages_contig(struct address_space *mapping, pgoff_t start, unsigned int nr_pages, struct page **pages); unsigned find_get_pages_tag(struct address_space *mapping, pgoff_t *index, int tag, unsigned int nr_pages, struct page **pages); +unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, + int tag, unsigned int nr_entries, + struct page **entries, pgoff_t *indices); struct page *grab_cache_page_write_begin(struct address_space *mapping, pgoff_t index, unsigned flags); diff --git a/mm/filemap.c b/mm/filemap.c index 7b8be78cfd9e..1e215fc36c83 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -1499,6 +1499,74 @@ repeat: } EXPORT_SYMBOL(find_get_pages_tag); +/** + * find_get_entries_tag - find and return entries that match @tag + * @mapping: the address_space to search + * @start: the starting page cache index + * @tag: the tag index + * @nr_entries: the maximum number of entries + * @entries: where the resulting entries are placed + * @indices: the cache indices corresponding to the entries in @entries + * + * Like find_get_entries, except we only return entries which are tagged with + * @tag. + */ +unsigned find_get_entries_tag(struct address_space *mapping, pgoff_t start, + int tag, unsigned int nr_entries, + struct page **entries, pgoff_t *indices) +{ + void **slot; + unsigned int ret = 0; + struct radix_tree_iter iter; + + if (!nr_entries) + return 0; + + rcu_read_lock(); +restart: + radix_tree_for_each_tagged(slot, &mapping->page_tree, + &iter, start, tag) { + struct page *page; +repeat: + page = radix_tree_deref_slot(slot); + if (unlikely(!page)) + continue; + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + /* + * Transient condition which can only trigger + * when entry at index 0 moves out of or back + * to root: none yet gotten, safe to restart. + */ + goto restart; + } + + /* + * A shadow entry of a recently evicted page, a swap + * entry from shmem/tmpfs or a DAX entry. Return it + * without attempting to raise page count. + */ + goto export; + } + if (!page_cache_get_speculative(page)) + goto repeat; + + /* Has the page moved? */ + if (unlikely(page != *slot)) { + page_cache_release(page); + goto repeat; + } +export: + indices[ret] = iter.index; + entries[ret] = page; + if (++ret == nr_entries) + break; + } + rcu_read_unlock(); + return ret; +} +EXPORT_SYMBOL(find_get_entries_tag); + /* * CD/DVDs are error prone. When a medium error occurs, the driver may fail * a _large_ part of the i/o request. Imagine the worst scenario: From 9973c98ecfda3a1dfcab981665b5f1e39bcde64a Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 22 Jan 2016 15:10:47 -0800 Subject: [PATCH 06/12] dax: add support for fsync/sync To properly handle fsync/msync in an efficient way DAX needs to track dirty pages so it is able to flush them durably to media on demand. The tracking of dirty pages is done via the radix tree in struct address_space. This radix tree is already used by the page writeback infrastructure for tracking dirty pages associated with an open file, and it already has support for exceptional (non struct page*) entries. We build upon these features to add exceptional entries to the radix tree for DAX dirty PMD or PTE pages at fault time. [dan.j.williams@intel.com: fix dax_pmd_dbg build warning] Signed-off-by: Ross Zwisler Cc: "H. Peter Anvin" Cc: "J. Bruce Fields" Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Andreas Dilger Cc: Dave Chinner Cc: Ingo Molnar Cc: Jan Kara Cc: Jeff Layton Cc: Matthew Wilcox Cc: Thomas Gleixner Cc: Matthew Wilcox Cc: Dave Hansen Signed-off-by: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 274 +++++++++++++++++++++++++++++++++++++++++--- include/linux/dax.h | 2 + mm/filemap.c | 6 + 3 files changed, 266 insertions(+), 16 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index 5b84a46201c2..d5f6aca5a4d7 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -24,6 +24,7 @@ #include #include #include +#include #include #include #include @@ -324,6 +325,199 @@ static int copy_user_bh(struct page *to, struct inode *inode, return 0; } +#define NO_SECTOR -1 +#define DAX_PMD_INDEX(page_index) (page_index & (PMD_MASK >> PAGE_CACHE_SHIFT)) + +static int dax_radix_entry(struct address_space *mapping, pgoff_t index, + sector_t sector, bool pmd_entry, bool dirty) +{ + struct radix_tree_root *page_tree = &mapping->page_tree; + pgoff_t pmd_index = DAX_PMD_INDEX(index); + int type, error = 0; + void *entry; + + WARN_ON_ONCE(pmd_entry && !dirty); + __mark_inode_dirty(mapping->host, I_DIRTY_PAGES); + + spin_lock_irq(&mapping->tree_lock); + + entry = radix_tree_lookup(page_tree, pmd_index); + if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) { + index = pmd_index; + goto dirty; + } + + entry = radix_tree_lookup(page_tree, index); + if (entry) { + type = RADIX_DAX_TYPE(entry); + if (WARN_ON_ONCE(type != RADIX_DAX_PTE && + type != RADIX_DAX_PMD)) { + error = -EIO; + goto unlock; + } + + if (!pmd_entry || type == RADIX_DAX_PMD) + goto dirty; + + /* + * We only insert dirty PMD entries into the radix tree. This + * means we don't need to worry about removing a dirty PTE + * entry and inserting a clean PMD entry, thus reducing the + * range we would flush with a follow-up fsync/msync call. + */ + radix_tree_delete(&mapping->page_tree, index); + mapping->nrexceptional--; + } + + if (sector == NO_SECTOR) { + /* + * This can happen during correct operation if our pfn_mkwrite + * fault raced against a hole punch operation. If this + * happens the pte that was hole punched will have been + * unmapped and the radix tree entry will have been removed by + * the time we are called, but the call will still happen. We + * will return all the way up to wp_pfn_shared(), where the + * pte_same() check will fail, eventually causing page fault + * to be retried by the CPU. + */ + goto unlock; + } + + error = radix_tree_insert(page_tree, index, + RADIX_DAX_ENTRY(sector, pmd_entry)); + if (error) + goto unlock; + + mapping->nrexceptional++; + dirty: + if (dirty) + radix_tree_tag_set(page_tree, index, PAGECACHE_TAG_DIRTY); + unlock: + spin_unlock_irq(&mapping->tree_lock); + return error; +} + +static int dax_writeback_one(struct block_device *bdev, + struct address_space *mapping, pgoff_t index, void *entry) +{ + struct radix_tree_root *page_tree = &mapping->page_tree; + int type = RADIX_DAX_TYPE(entry); + struct radix_tree_node *node; + struct blk_dax_ctl dax; + void **slot; + int ret = 0; + + spin_lock_irq(&mapping->tree_lock); + /* + * Regular page slots are stabilized by the page lock even + * without the tree itself locked. These unlocked entries + * need verification under the tree lock. + */ + if (!__radix_tree_lookup(page_tree, index, &node, &slot)) + goto unlock; + if (*slot != entry) + goto unlock; + + /* another fsync thread may have already written back this entry */ + if (!radix_tree_tag_get(page_tree, index, PAGECACHE_TAG_TOWRITE)) + goto unlock; + + if (WARN_ON_ONCE(type != RADIX_DAX_PTE && type != RADIX_DAX_PMD)) { + ret = -EIO; + goto unlock; + } + + dax.sector = RADIX_DAX_SECTOR(entry); + dax.size = (type == RADIX_DAX_PMD ? PMD_SIZE : PAGE_SIZE); + spin_unlock_irq(&mapping->tree_lock); + + /* + * We cannot hold tree_lock while calling dax_map_atomic() because it + * eventually calls cond_resched(). + */ + ret = dax_map_atomic(bdev, &dax); + if (ret < 0) + return ret; + + if (WARN_ON_ONCE(ret < dax.size)) { + ret = -EIO; + goto unmap; + } + + wb_cache_pmem(dax.addr, dax.size); + + spin_lock_irq(&mapping->tree_lock); + radix_tree_tag_clear(page_tree, index, PAGECACHE_TAG_TOWRITE); + spin_unlock_irq(&mapping->tree_lock); + unmap: + dax_unmap_atomic(bdev, &dax); + return ret; + + unlock: + spin_unlock_irq(&mapping->tree_lock); + return ret; +} + +/* + * Flush the mapping to the persistent domain within the byte range of [start, + * end]. This is required by data integrity operations to ensure file data is + * on persistent storage prior to completion of the operation. + */ +int dax_writeback_mapping_range(struct address_space *mapping, loff_t start, + loff_t end) +{ + struct inode *inode = mapping->host; + struct block_device *bdev = inode->i_sb->s_bdev; + pgoff_t start_index, end_index, pmd_index; + pgoff_t indices[PAGEVEC_SIZE]; + struct pagevec pvec; + bool done = false; + int i, ret = 0; + void *entry; + + if (WARN_ON_ONCE(inode->i_blkbits != PAGE_SHIFT)) + return -EIO; + + start_index = start >> PAGE_CACHE_SHIFT; + end_index = end >> PAGE_CACHE_SHIFT; + pmd_index = DAX_PMD_INDEX(start_index); + + rcu_read_lock(); + entry = radix_tree_lookup(&mapping->page_tree, pmd_index); + rcu_read_unlock(); + + /* see if the start of our range is covered by a PMD entry */ + if (entry && RADIX_DAX_TYPE(entry) == RADIX_DAX_PMD) + start_index = pmd_index; + + tag_pages_for_writeback(mapping, start_index, end_index); + + pagevec_init(&pvec, 0); + while (!done) { + pvec.nr = find_get_entries_tag(mapping, start_index, + PAGECACHE_TAG_TOWRITE, PAGEVEC_SIZE, + pvec.pages, indices); + + if (pvec.nr == 0) + break; + + for (i = 0; i < pvec.nr; i++) { + if (indices[i] > end_index) { + done = true; + break; + } + + ret = dax_writeback_one(bdev, mapping, indices[i], + pvec.pages[i]); + if (ret < 0) + return ret; + } + } + wmb_pmem(); + return 0; +} +EXPORT_SYMBOL_GPL(dax_writeback_mapping_range); + static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, struct vm_area_struct *vma, struct vm_fault *vmf) { @@ -363,6 +557,11 @@ static int dax_insert_mapping(struct inode *inode, struct buffer_head *bh, } dax_unmap_atomic(bdev, &dax); + error = dax_radix_entry(mapping, vmf->pgoff, dax.sector, false, + vmf->flags & FAULT_FLAG_WRITE); + if (error) + goto out; + error = vm_insert_mixed(vma, vaddr, dax.pfn); out: @@ -487,6 +686,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, delete_from_page_cache(page); unlock_page(page); page_cache_release(page); + page = NULL; } /* @@ -589,9 +789,9 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, bool write = flags & FAULT_FLAG_WRITE; struct block_device *bdev; pgoff_t size, pgoff; - loff_t lstart, lend; sector_t block; - int result = 0; + int error, result = 0; + bool alloc = false; /* dax pmd mappings require pfn_t_devmap() */ if (!IS_ENABLED(CONFIG_FS_DAX_PMD)) @@ -629,10 +829,17 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, block = (sector_t)pgoff << (PAGE_SHIFT - blkbits); bh.b_size = PMD_SIZE; - if (get_block(inode, block, &bh, write) != 0) + + if (get_block(inode, block, &bh, 0) != 0) return VM_FAULT_SIGBUS; + + if (!buffer_mapped(&bh) && write) { + if (get_block(inode, block, &bh, 1) != 0) + return VM_FAULT_SIGBUS; + alloc = true; + } + bdev = bh.b_bdev; - i_mmap_lock_read(mapping); /* * If the filesystem isn't willing to tell us the length of a hole, @@ -641,15 +848,20 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, */ if (!buffer_size_valid(&bh) || bh.b_size < PMD_SIZE) { dax_pmd_dbg(&bh, address, "allocated block too small"); - goto fallback; + return VM_FAULT_FALLBACK; + } + + /* + * If we allocated new storage, make sure no process has any + * zero pages covering this hole + */ + if (alloc) { + loff_t lstart = pgoff << PAGE_SHIFT; + loff_t lend = lstart + PMD_SIZE - 1; /* inclusive */ + + truncate_pagecache_range(inode, lstart, lend); } - /* make sure no process has any zero pages covering this hole */ - lstart = pgoff << PAGE_SHIFT; - lend = lstart + PMD_SIZE - 1; /* inclusive */ - i_mmap_unlock_read(mapping); - unmap_mapping_range(mapping, lstart, PMD_SIZE, 0); - truncate_inode_pages_range(mapping, lstart, lend); i_mmap_lock_read(mapping); /* @@ -733,6 +945,31 @@ int __dax_pmd_fault(struct vm_area_struct *vma, unsigned long address, } dax_unmap_atomic(bdev, &dax); + /* + * For PTE faults we insert a radix tree entry for reads, and + * leave it clean. Then on the first write we dirty the radix + * tree entry via the dax_pfn_mkwrite() path. This sequence + * allows the dax_pfn_mkwrite() call to be simpler and avoid a + * call into get_block() to translate the pgoff to a sector in + * order to be able to create a new radix tree entry. + * + * The PMD path doesn't have an equivalent to + * dax_pfn_mkwrite(), though, so for a read followed by a + * write we traverse all the way through __dax_pmd_fault() + * twice. This means we can just skip inserting a radix tree + * entry completely on the initial read and just wait until + * the write to insert a dirty entry. + */ + if (write) { + error = dax_radix_entry(mapping, pgoff, dax.sector, + true, true); + if (error) { + dax_pmd_dbg(&bh, address, + "PMD radix insertion failed"); + goto fallback; + } + } + dev_dbg(part_to_dev(bdev->bd_part), "%s: %s addr: %lx pfn: %lx sect: %llx\n", __func__, current->comm, address, @@ -791,15 +1028,20 @@ EXPORT_SYMBOL_GPL(dax_pmd_fault); * dax_pfn_mkwrite - handle first write to DAX page * @vma: The virtual memory area where the fault occurred * @vmf: The description of the fault - * */ int dax_pfn_mkwrite(struct vm_area_struct *vma, struct vm_fault *vmf) { - struct super_block *sb = file_inode(vma->vm_file)->i_sb; + struct file *file = vma->vm_file; - sb_start_pagefault(sb); - file_update_time(vma->vm_file); - sb_end_pagefault(sb); + /* + * We pass NO_SECTOR to dax_radix_entry() because we expect that a + * RADIX_DAX_PTE entry already exists in the radix tree from a + * previous call to __dax_fault(). We just want to look up that PTE + * entry using vmf->pgoff and make sure the dirty tag is set. This + * saves us from having to make a call to get_block() here to look + * up the sector. + */ + dax_radix_entry(file->f_mapping, vmf->pgoff, NO_SECTOR, false, true); return VM_FAULT_NOPAGE; } EXPORT_SYMBOL_GPL(dax_pfn_mkwrite); diff --git a/include/linux/dax.h b/include/linux/dax.h index e9d57f680f50..8204c3dc3800 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -41,4 +41,6 @@ static inline bool dax_mapping(struct address_space *mapping) { return mapping->host && IS_DAX(mapping->host); } +int dax_writeback_mapping_range(struct address_space *mapping, loff_t start, + loff_t end); #endif diff --git a/mm/filemap.c b/mm/filemap.c index 1e215fc36c83..2e7c8d980d5e 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -482,6 +482,12 @@ int filemap_write_and_wait_range(struct address_space *mapping, { int err = 0; + if (dax_mapping(mapping) && mapping->nrexceptional) { + err = dax_writeback_mapping_range(mapping, lstart, lend); + if (err) + return err; + } + if (mapping->nrpages) { err = __filemap_fdatawrite_range(mapping, lstart, lend, WB_SYNC_ALL); From 80b4adcafc076d4179431656b7e83afea99ddec8 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 22 Jan 2016 15:10:50 -0800 Subject: [PATCH 07/12] ext2: call dax_pfn_mkwrite() for DAX fsync/msync To properly support the new DAX fsync/msync infrastructure filesystems need to call dax_pfn_mkwrite() so that DAX can track when user pages are dirtied. Signed-off-by: Ross Zwisler Cc: "H. Peter Anvin" Cc: "J. Bruce Fields" Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Andreas Dilger Cc: Dave Chinner Cc: Ingo Molnar Cc: Jan Kara Cc: Jeff Layton Cc: Matthew Wilcox Cc: Thomas Gleixner Cc: Dan Williams Cc: Matthew Wilcox Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext2/file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ext2/file.c b/fs/ext2/file.c index 11a42c5a09ae..2c88d683cd91 100644 --- a/fs/ext2/file.c +++ b/fs/ext2/file.c @@ -102,8 +102,8 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, { struct inode *inode = file_inode(vma->vm_file); struct ext2_inode_info *ei = EXT2_I(inode); - int ret = VM_FAULT_NOPAGE; loff_t size; + int ret; sb_start_pagefault(inode->i_sb); file_update_time(vma->vm_file); @@ -113,6 +113,8 @@ static int ext2_dax_pfn_mkwrite(struct vm_area_struct *vma, size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) ret = VM_FAULT_SIGBUS; + else + ret = dax_pfn_mkwrite(vma, vmf); up_read(&ei->dax_sem); sb_end_pagefault(inode->i_sb); From d5be7a03b002a0faec136687ec03dbee6c579930 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 22 Jan 2016 15:10:53 -0800 Subject: [PATCH 08/12] ext4: call dax_pfn_mkwrite() for DAX fsync/msync To properly support the new DAX fsync/msync infrastructure filesystems need to call dax_pfn_mkwrite() so that DAX can track when user pages are dirtied. Signed-off-by: Ross Zwisler Cc: "H. Peter Anvin" Cc: "J. Bruce Fields" Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Andreas Dilger Cc: Dave Chinner Cc: Ingo Molnar Cc: Jan Kara Cc: Jeff Layton Cc: Matthew Wilcox Cc: Thomas Gleixner Cc: Dan Williams Cc: Matthew Wilcox Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ext4/file.c | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/ext4/file.c b/fs/ext4/file.c index 749b222e6498..8c8965cc4aab 100644 --- a/fs/ext4/file.c +++ b/fs/ext4/file.c @@ -291,8 +291,8 @@ static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma, { struct inode *inode = file_inode(vma->vm_file); struct super_block *sb = inode->i_sb; - int ret = VM_FAULT_NOPAGE; loff_t size; + int ret; sb_start_pagefault(sb); file_update_time(vma->vm_file); @@ -300,6 +300,8 @@ static int ext4_dax_pfn_mkwrite(struct vm_area_struct *vma, size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) ret = VM_FAULT_SIGBUS; + else + ret = dax_pfn_mkwrite(vma, vmf); up_read(&EXT4_I(inode)->i_mmap_sem); sb_end_pagefault(sb); From 5eb88dca9ce4546bdfca6e5defcd50fbe8ea8411 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 22 Jan 2016 15:10:56 -0800 Subject: [PATCH 09/12] xfs: call dax_pfn_mkwrite() for DAX fsync/msync To properly support the new DAX fsync/msync infrastructure filesystems need to call dax_pfn_mkwrite() so that DAX can track when user pages are dirtied. Signed-off-by: Ross Zwisler Cc: "H. Peter Anvin" Cc: "J. Bruce Fields" Cc: "Theodore Ts'o" Cc: Alexander Viro Cc: Andreas Dilger Cc: Dave Chinner Cc: Ingo Molnar Cc: Jan Kara Cc: Jeff Layton Cc: Matthew Wilcox Cc: Thomas Gleixner Cc: Dan Williams Cc: Matthew Wilcox Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/xfs/xfs_file.c | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/fs/xfs/xfs_file.c b/fs/xfs/xfs_file.c index ebe9b8290a70..55e16e2402a7 100644 --- a/fs/xfs/xfs_file.c +++ b/fs/xfs/xfs_file.c @@ -1610,9 +1610,8 @@ xfs_filemap_pmd_fault( /* * pfn_mkwrite was originally inteneded to ensure we capture time stamp * updates on write faults. In reality, it's need to serialise against - * truncate similar to page_mkwrite. Hence we open-code dax_pfn_mkwrite() - * here and cycle the XFS_MMAPLOCK_SHARED to ensure we serialise the fault - * barrier in place. + * truncate similar to page_mkwrite. Hence we cycle the XFS_MMAPLOCK_SHARED + * to ensure we serialise the fault barrier in place. */ static int xfs_filemap_pfn_mkwrite( @@ -1635,6 +1634,8 @@ xfs_filemap_pfn_mkwrite( size = (i_size_read(inode) + PAGE_SIZE - 1) >> PAGE_SHIFT; if (vmf->pgoff >= size) ret = VM_FAULT_SIGBUS; + else if (IS_DAX(inode)) + ret = dax_pfn_mkwrite(vma, vmf); xfs_iunlock(ip, XFS_MMAPLOCK_SHARED); sb_end_pagefault(inode->i_sb); return ret; From eab95db69d334745d3034072f4a7204084136c88 Mon Sep 17 00:00:00 2001 From: Ross Zwisler Date: Fri, 22 Jan 2016 15:10:59 -0800 Subject: [PATCH 10/12] dax: never rely on bh.b_dev being set by get_block() Previously in DAX we assumed that calls to get_block() would set bh.b_bdev, and we would then use that value even in error cases for debugging. This caused a NULL pointer dereference in __dax_dbg() which was fixed by a previous commit, but that commit only changed the one place where we were hitting an error. Instead, update dax.c so that we always initialize bh.b_bdev as best we can based on the information that DAX has. get_block() may or may not update to a new value, but this at least lets us get something helpful from bh.b_bdev for error messages and not have to worry about whether it was set by get_block() or not. Signed-off-by: Ross Zwisler Reported-by: Jan Kara Reviewed-by: Jan Kara Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/fs/dax.c b/fs/dax.c index d5f6aca5a4d7..206650f82762 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -246,6 +246,7 @@ ssize_t dax_do_io(struct kiocb *iocb, struct inode *inode, loff_t end = pos + iov_iter_count(iter); memset(&bh, 0, sizeof(bh)); + bh.b_bdev = inode->i_sb->s_bdev; if ((flags & DIO_LOCKING) && iov_iter_rw(iter) == READ) { struct address_space *mapping = inode->i_mapping; @@ -607,6 +608,7 @@ int __dax_fault(struct vm_area_struct *vma, struct vm_fault *vmf, memset(&bh, 0, sizeof(bh)); block = (sector_t)vmf->pgoff << (PAGE_SHIFT - blkbits); + bh.b_bdev = inode->i_sb->s_bdev; bh.b_size = PAGE_SIZE; repeat: @@ -1078,6 +1080,7 @@ int dax_zero_page_range(struct inode *inode, loff_t from, unsigned length, BUG_ON((offset + length) > PAGE_CACHE_SIZE); memset(&bh, 0, sizeof(bh)); + bh.b_bdev = inode->i_sb->s_bdev; bh.b_size = PAGE_CACHE_SIZE; err = get_block(inode, index, &bh, 0); if (err < 0) From 1d5cfdb076288df5eb95545a547a39905e95c930 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Fri, 22 Jan 2016 15:11:02 -0800 Subject: [PATCH 11/12] tree wide: use kvfree() than conditional kfree()/vfree() There are many locations that do if (memory_was_allocated_by_vmalloc) vfree(ptr); else kfree(ptr); but kvfree() can handle both kmalloc()ed memory and vmalloc()ed memory using is_vmalloc_addr(). Unless callers have special reasons, we can replace this branch with kvfree(). Please check and reply if you found problems. Signed-off-by: Tetsuo Handa Acked-by: Michal Hocko Acked-by: Jan Kara Acked-by: Russell King Reviewed-by: Andreas Dilger Acked-by: "Rafael J. Wysocki" Acked-by: David Rientjes Cc: "Luck, Tony" Cc: Oleg Drokin Cc: Boris Petkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/arm/mm/dma-mapping.c | 11 ++------ drivers/acpi/apei/erst.c | 6 ++--- drivers/block/drbd/drbd_bitmap.c | 26 +++++-------------- drivers/block/drbd/drbd_int.h | 3 --- drivers/char/mspec.c | 15 +++-------- drivers/gpu/drm/drm_hashtab.c | 5 +--- .../include/linux/libcfs/libcfs_private.h | 8 ++---- fs/coda/coda_linux.h | 3 +-- fs/jffs2/build.c | 8 ++---- fs/jffs2/fs.c | 5 +--- fs/jffs2/super.c | 5 +--- fs/udf/super.c | 7 +---- ipc/sem.c | 2 +- ipc/util.c | 11 +++----- ipc/util.h | 2 +- mm/percpu.c | 18 +++++-------- net/ipv4/fib_trie.c | 4 +-- 17 files changed, 36 insertions(+), 103 deletions(-) diff --git a/arch/arm/mm/dma-mapping.c b/arch/arm/mm/dma-mapping.c index 534a60ae282e..0eca3812527e 100644 --- a/arch/arm/mm/dma-mapping.c +++ b/arch/arm/mm/dma-mapping.c @@ -1200,10 +1200,7 @@ error: while (i--) if (pages[i]) __free_pages(pages[i], 0); - if (array_size <= PAGE_SIZE) - kfree(pages); - else - vfree(pages); + kvfree(pages); return NULL; } @@ -1211,7 +1208,6 @@ static int __iommu_free_buffer(struct device *dev, struct page **pages, size_t size, struct dma_attrs *attrs) { int count = size >> PAGE_SHIFT; - int array_size = count * sizeof(struct page *); int i; if (dma_get_attr(DMA_ATTR_FORCE_CONTIGUOUS, attrs)) { @@ -1222,10 +1218,7 @@ static int __iommu_free_buffer(struct device *dev, struct page **pages, __free_pages(pages[i], 0); } - if (array_size <= PAGE_SIZE) - kfree(pages); - else - vfree(pages); + kvfree(pages); return 0; } diff --git a/drivers/acpi/apei/erst.c b/drivers/acpi/apei/erst.c index 6682c5daf742..6e6bc1059301 100644 --- a/drivers/acpi/apei/erst.c +++ b/drivers/acpi/apei/erst.c @@ -32,6 +32,7 @@ #include #include #include +#include /* kvfree() */ #include #include "apei-internal.h" @@ -532,10 +533,7 @@ retry: return -ENOMEM; memcpy(new_entries, entries, erst_record_id_cache.len * sizeof(entries[0])); - if (erst_record_id_cache.size < PAGE_SIZE) - kfree(entries); - else - vfree(entries); + kvfree(entries); erst_record_id_cache.entries = entries = new_entries; erst_record_id_cache.size = new_size; } diff --git a/drivers/block/drbd/drbd_bitmap.c b/drivers/block/drbd/drbd_bitmap.c index 0dabc9b93725..92d6fc020a65 100644 --- a/drivers/block/drbd/drbd_bitmap.c +++ b/drivers/block/drbd/drbd_bitmap.c @@ -364,12 +364,9 @@ static void bm_free_pages(struct page **pages, unsigned long number) } } -static void bm_vk_free(void *ptr, int v) +static inline void bm_vk_free(void *ptr) { - if (v) - vfree(ptr); - else - kfree(ptr); + kvfree(ptr); } /* @@ -379,7 +376,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) { struct page **old_pages = b->bm_pages; struct page **new_pages, *page; - unsigned int i, bytes, vmalloced = 0; + unsigned int i, bytes; unsigned long have = b->bm_number_of_pages; BUG_ON(have == 0 && old_pages != NULL); @@ -401,7 +398,6 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) PAGE_KERNEL); if (!new_pages) return NULL; - vmalloced = 1; } if (want >= have) { @@ -411,7 +407,7 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) page = alloc_page(GFP_NOIO | __GFP_HIGHMEM); if (!page) { bm_free_pages(new_pages + have, i - have); - bm_vk_free(new_pages, vmalloced); + bm_vk_free(new_pages); return NULL; } /* we want to know which page it is @@ -427,11 +423,6 @@ static struct page **bm_realloc_pages(struct drbd_bitmap *b, unsigned long want) */ } - if (vmalloced) - b->bm_flags |= BM_P_VMALLOCED; - else - b->bm_flags &= ~BM_P_VMALLOCED; - return new_pages; } @@ -469,7 +460,7 @@ void drbd_bm_cleanup(struct drbd_device *device) if (!expect(device->bitmap)) return; bm_free_pages(device->bitmap->bm_pages, device->bitmap->bm_number_of_pages); - bm_vk_free(device->bitmap->bm_pages, (BM_P_VMALLOCED & device->bitmap->bm_flags)); + bm_vk_free(device->bitmap->bm_pages); kfree(device->bitmap); device->bitmap = NULL; } @@ -643,7 +634,6 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi unsigned long want, have, onpages; /* number of pages */ struct page **npages, **opages = NULL; int err = 0, growing; - int opages_vmalloced; if (!expect(b)) return -ENOMEM; @@ -656,8 +646,6 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi if (capacity == b->bm_dev_capacity) goto out; - opages_vmalloced = (BM_P_VMALLOCED & b->bm_flags); - if (capacity == 0) { spin_lock_irq(&b->bm_lock); opages = b->bm_pages; @@ -671,7 +659,7 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi b->bm_dev_capacity = 0; spin_unlock_irq(&b->bm_lock); bm_free_pages(opages, onpages); - bm_vk_free(opages, opages_vmalloced); + bm_vk_free(opages); goto out; } bits = BM_SECT_TO_BIT(ALIGN(capacity, BM_SECT_PER_BIT)); @@ -744,7 +732,7 @@ int drbd_bm_resize(struct drbd_device *device, sector_t capacity, int set_new_bi spin_unlock_irq(&b->bm_lock); if (opages != npages) - bm_vk_free(opages, opages_vmalloced); + bm_vk_free(opages); if (!growing) b->bm_set = bm_count_bits(b); drbd_info(device, "resync bitmap: bits=%lu words=%lu pages=%lu\n", bits, words, want); diff --git a/drivers/block/drbd/drbd_int.h b/drivers/block/drbd/drbd_int.h index b6844feb9f9b..34bc84efc29e 100644 --- a/drivers/block/drbd/drbd_int.h +++ b/drivers/block/drbd/drbd_int.h @@ -536,9 +536,6 @@ struct drbd_bitmap; /* opaque for drbd_device */ /* definition of bits in bm_flags to be used in drbd_bm_lock * and drbd_bitmap_io and friends. */ enum bm_flag { - /* do we need to kfree, or vfree bm_pages? */ - BM_P_VMALLOCED = 0x10000, /* internal use only, will be masked out */ - /* currently locked for bulk operation */ BM_LOCKED_MASK = 0xf, diff --git a/drivers/char/mspec.c b/drivers/char/mspec.c index f1d7fa45c275..f3f92d5fcda0 100644 --- a/drivers/char/mspec.c +++ b/drivers/char/mspec.c @@ -93,14 +93,11 @@ struct vma_data { spinlock_t lock; /* Serialize access to this structure. */ int count; /* Number of pages allocated. */ enum mspec_page_type type; /* Type of pages allocated. */ - int flags; /* See VMD_xxx below. */ unsigned long vm_start; /* Original (unsplit) base. */ unsigned long vm_end; /* Original (unsplit) end. */ unsigned long maddr[0]; /* Array of MSPEC addresses. */ }; -#define VMD_VMALLOCED 0x1 /* vmalloc'd rather than kmalloc'd */ - /* used on shub2 to clear FOP cache in the HUB */ static unsigned long scratch_page[MAX_NUMNODES]; #define SH2_AMO_CACHE_ENTRIES 4 @@ -185,10 +182,7 @@ mspec_close(struct vm_area_struct *vma) "failed to zero page %ld\n", my_page); } - if (vdata->flags & VMD_VMALLOCED) - vfree(vdata); - else - kfree(vdata); + kvfree(vdata); } /* @@ -256,7 +250,7 @@ mspec_mmap(struct file *file, struct vm_area_struct *vma, enum mspec_page_type type) { struct vma_data *vdata; - int pages, vdata_size, flags = 0; + int pages, vdata_size; if (vma->vm_pgoff != 0) return -EINVAL; @@ -271,16 +265,13 @@ mspec_mmap(struct file *file, struct vm_area_struct *vma, vdata_size = sizeof(struct vma_data) + pages * sizeof(long); if (vdata_size <= PAGE_SIZE) vdata = kzalloc(vdata_size, GFP_KERNEL); - else { + else vdata = vzalloc(vdata_size); - flags = VMD_VMALLOCED; - } if (!vdata) return -ENOMEM; vdata->vm_start = vma->vm_start; vdata->vm_end = vma->vm_end; - vdata->flags = flags; vdata->type = type; spin_lock_init(&vdata->lock); atomic_set(&vdata->refcnt, 1); diff --git a/drivers/gpu/drm/drm_hashtab.c b/drivers/gpu/drm/drm_hashtab.c index c3b80fd65d62..7b30b307674b 100644 --- a/drivers/gpu/drm/drm_hashtab.c +++ b/drivers/gpu/drm/drm_hashtab.c @@ -198,10 +198,7 @@ EXPORT_SYMBOL(drm_ht_remove_item); void drm_ht_remove(struct drm_open_hash *ht) { if (ht->table) { - if ((PAGE_SIZE / sizeof(*ht->table)) >> ht->order) - kfree(ht->table); - else - vfree(ht->table); + kvfree(ht->table); ht->table = NULL; } } diff --git a/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h b/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h index d6273e143324..a80d993b882e 100644 --- a/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h +++ b/drivers/staging/lustre/include/linux/libcfs/libcfs_private.h @@ -151,16 +151,12 @@ do { \ #define LIBCFS_FREE(ptr, size) \ do { \ - int s = (size); \ if (unlikely((ptr) == NULL)) { \ CERROR("LIBCFS: free NULL '" #ptr "' (%d bytes) at " \ - "%s:%d\n", s, __FILE__, __LINE__); \ + "%s:%d\n", (int)(size), __FILE__, __LINE__); \ break; \ } \ - if (unlikely(s > LIBCFS_VMALLOC_SIZE)) \ - vfree(ptr); \ - else \ - kfree(ptr); \ + kvfree(ptr); \ } while (0) /******************************************************************************/ diff --git a/fs/coda/coda_linux.h b/fs/coda/coda_linux.h index f829fe963f5b..5104d84c4f64 100644 --- a/fs/coda/coda_linux.h +++ b/fs/coda/coda_linux.h @@ -72,8 +72,7 @@ void coda_sysctl_clean(void); } while (0) -#define CODA_FREE(ptr,size) \ - do { if (size < PAGE_SIZE) kfree((ptr)); else vfree((ptr)); } while (0) +#define CODA_FREE(ptr, size) kvfree((ptr)) /* inode to cnode access functions */ diff --git a/fs/jffs2/build.c b/fs/jffs2/build.c index a3750f902adc..0ae91ad6df2d 100644 --- a/fs/jffs2/build.c +++ b/fs/jffs2/build.c @@ -17,6 +17,7 @@ #include #include #include +#include /* kvfree() */ #include "nodelist.h" static void jffs2_build_remove_unlinked_inode(struct jffs2_sb_info *, @@ -383,12 +384,7 @@ int jffs2_do_mount_fs(struct jffs2_sb_info *c) return 0; out_free: -#ifndef __ECOS - if (jffs2_blocks_use_vmalloc(c)) - vfree(c->blocks); - else -#endif - kfree(c->blocks); + kvfree(c->blocks); return ret; } diff --git a/fs/jffs2/fs.c b/fs/jffs2/fs.c index 2caf1682036d..bead25ae8fe4 100644 --- a/fs/jffs2/fs.c +++ b/fs/jffs2/fs.c @@ -596,10 +596,7 @@ int jffs2_do_fill_super(struct super_block *sb, void *data, int silent) out_root: jffs2_free_ino_caches(c); jffs2_free_raw_node_refs(c); - if (jffs2_blocks_use_vmalloc(c)) - vfree(c->blocks); - else - kfree(c->blocks); + kvfree(c->blocks); out_inohash: jffs2_clear_xattr_subsystem(c); kfree(c->inocache_list); diff --git a/fs/jffs2/super.c b/fs/jffs2/super.c index bb080c272149..0a9a114bb9d1 100644 --- a/fs/jffs2/super.c +++ b/fs/jffs2/super.c @@ -331,10 +331,7 @@ static void jffs2_put_super (struct super_block *sb) jffs2_free_ino_caches(c); jffs2_free_raw_node_refs(c); - if (jffs2_blocks_use_vmalloc(c)) - vfree(c->blocks); - else - kfree(c->blocks); + kvfree(c->blocks); jffs2_flash_cleanup(c); kfree(c->inocache_list); jffs2_clear_xattr_subsystem(c); diff --git a/fs/udf/super.c b/fs/udf/super.c index 0fbb4c7c72e8..a522c15a0bfd 100644 --- a/fs/udf/super.c +++ b/fs/udf/super.c @@ -279,17 +279,12 @@ static void udf_sb_free_bitmap(struct udf_bitmap *bitmap) { int i; int nr_groups = bitmap->s_nr_groups; - int size = sizeof(struct udf_bitmap) + (sizeof(struct buffer_head *) * - nr_groups); for (i = 0; i < nr_groups; i++) if (bitmap->s_block_bitmap[i]) brelse(bitmap->s_block_bitmap[i]); - if (size <= PAGE_SIZE) - kfree(bitmap); - else - vfree(bitmap); + kvfree(bitmap); } static void udf_free_partition(struct udf_part_map *map) diff --git a/ipc/sem.c b/ipc/sem.c index b471e5a3863d..cddd5b5fde51 100644 --- a/ipc/sem.c +++ b/ipc/sem.c @@ -1493,7 +1493,7 @@ out_rcu_wakeup: wake_up_sem_queue_do(&tasks); out_free: if (sem_io != fast_sem_io) - ipc_free(sem_io, sizeof(ushort)*nsems); + ipc_free(sem_io); return err; } diff --git a/ipc/util.c b/ipc/util.c index 0f401d94b7c6..798cad18dd87 100644 --- a/ipc/util.c +++ b/ipc/util.c @@ -414,17 +414,12 @@ void *ipc_alloc(int size) /** * ipc_free - free ipc space * @ptr: pointer returned by ipc_alloc - * @size: size of block * - * Free a block created with ipc_alloc(). The caller must know the size - * used in the allocation call. + * Free a block created with ipc_alloc(). */ -void ipc_free(void *ptr, int size) +void ipc_free(void *ptr) { - if (size > PAGE_SIZE) - vfree(ptr); - else - kfree(ptr); + kvfree(ptr); } /** diff --git a/ipc/util.h b/ipc/util.h index 3a8a5a0eca62..51f7ca58ac67 100644 --- a/ipc/util.h +++ b/ipc/util.h @@ -118,7 +118,7 @@ int ipcperms(struct ipc_namespace *ns, struct kern_ipc_perm *ipcp, short flg); * both function can sleep */ void *ipc_alloc(int size); -void ipc_free(void *ptr, int size); +void ipc_free(void *ptr); /* * For allocation that need to be freed by RCU. diff --git a/mm/percpu.c b/mm/percpu.c index 8a943b97a053..998607adf6eb 100644 --- a/mm/percpu.c +++ b/mm/percpu.c @@ -305,16 +305,12 @@ static void *pcpu_mem_zalloc(size_t size) /** * pcpu_mem_free - free memory * @ptr: memory to free - * @size: size of the area * * Free @ptr. @ptr should have been allocated using pcpu_mem_zalloc(). */ -static void pcpu_mem_free(void *ptr, size_t size) +static void pcpu_mem_free(void *ptr) { - if (size <= PAGE_SIZE) - kfree(ptr); - else - vfree(ptr); + kvfree(ptr); } /** @@ -463,8 +459,8 @@ out_unlock: * pcpu_mem_free() might end up calling vfree() which uses * IRQ-unsafe lock and thus can't be called under pcpu_lock. */ - pcpu_mem_free(old, old_size); - pcpu_mem_free(new, new_size); + pcpu_mem_free(old); + pcpu_mem_free(new); return 0; } @@ -732,7 +728,7 @@ static struct pcpu_chunk *pcpu_alloc_chunk(void) chunk->map = pcpu_mem_zalloc(PCPU_DFL_MAP_ALLOC * sizeof(chunk->map[0])); if (!chunk->map) { - pcpu_mem_free(chunk, pcpu_chunk_struct_size); + pcpu_mem_free(chunk); return NULL; } @@ -753,8 +749,8 @@ static void pcpu_free_chunk(struct pcpu_chunk *chunk) { if (!chunk) return; - pcpu_mem_free(chunk->map, chunk->map_alloc * sizeof(chunk->map[0])); - pcpu_mem_free(chunk, pcpu_chunk_struct_size); + pcpu_mem_free(chunk->map); + pcpu_mem_free(chunk); } /** diff --git a/net/ipv4/fib_trie.c b/net/ipv4/fib_trie.c index 744e5936c10d..7aea0ccb6be6 100644 --- a/net/ipv4/fib_trie.c +++ b/net/ipv4/fib_trie.c @@ -289,10 +289,8 @@ static void __node_free_rcu(struct rcu_head *head) if (!n->tn_bits) kmem_cache_free(trie_leaf_kmem, n); - else if (n->tn_bits <= TNODE_KMALLOC_MAX) - kfree(n); else - vfree(n); + kvfree(n); } #define node_free(n) call_rcu(&tn_info(n)->rcu, __node_free_rcu) From 114bf37e04d839b555b3dc460b5e6ce156f49cf0 Mon Sep 17 00:00:00 2001 From: Rich Felker Date: Fri, 22 Jan 2016 15:11:05 -0800 Subject: [PATCH 12/12] MAINTAINERS: return arch/sh to maintained state, with new maintainers Add Yoshinori Sato and Rich Felker as maintainers for arch/sh (SUPERH). Signed-off-by: Rich Felker Signed-off-by: Yoshinori Sato Acked-by: D. Jeff Dionne Acked-by: Rob Landley Acked-by: Peter Zijlstra (Intel) Acked-by: Simon Horman Acked-by: Geert Uytterhoeven Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/MAINTAINERS b/MAINTAINERS index b8a717c4f863..fc1718500181 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -10453,9 +10453,11 @@ S: Maintained F: drivers/net/ethernet/dlink/sundance.c SUPERH +M: Yoshinori Sato +M: Rich Felker L: linux-sh@vger.kernel.org Q: http://patchwork.kernel.org/project/linux-sh/list/ -S: Orphan +S: Maintained F: Documentation/sh/ F: arch/sh/ F: drivers/sh/