From ab77dab46210bb630e06c6803c5d84074bacd351 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Thu, 7 Jun 2018 17:04:29 -0700 Subject: [PATCH 001/118] fs/dax.c: use new return type vm_fault_t Use new return type vm_fault_t for fault handler. For now, this is just documenting that the function returns a VM_FAULT value rather than an errno. Once all instances are converted, vm_fault_t will become a distinct type. See commit 1c8f422059ae ("mm: change return type to vm_fault_t"). There was an existing bug inside dax_load_hole(): if vm_insert_mixed() had failed to allocate a page table, we'd return VM_FAULT_NOPAGE instead of VM_FAULT_OOM. With the new vmf_insert_mixed() this issue is addressed. vm_insert_mixed_mkwrite() is inefficient when it returns an error value: the driver has to convert it to the vm_fault_t type. With the new vmf_insert_mixed_mkwrite() this limitation is addressed. Link: http://lkml.kernel.org/r/20180510181121.GA15239@jordon-HP-15-Notebook-PC Signed-off-by: Souptick Joarder Reviewed-by: Jan Kara Reviewed-by: Matthew Wilcox Reviewed-by: Ross Zwisler Cc: Alexander Viro Cc: Dan Williams Cc: Michal Hocko Cc: "Kirill A. Shutemov" Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/dax.c | 78 +++++++++++++++++++++------------------ include/linux/dax.h | 4 +-- include/linux/mm.h | 4 +-- mm/memory.c | 21 +++++++++--- 4 files changed, 58 insertions(+), 49 deletions(-) diff --git a/fs/dax.c b/fs/dax.c index aa86d9f971a4..08656a2f2aa6 100644 --- a/fs/dax.c +++ b/fs/dax.c @@ -905,12 +905,12 @@ out: * If this page is ever written to we will re-fault and change the mapping to * point to real DAX storage instead. */ -static int dax_load_hole(struct address_space *mapping, void *entry, +static vm_fault_t dax_load_hole(struct address_space *mapping, void *entry, struct vm_fault *vmf) { struct inode *inode = mapping->host; unsigned long vaddr = vmf->address; - int ret = VM_FAULT_NOPAGE; + vm_fault_t ret = VM_FAULT_NOPAGE; struct page *zero_page; void *entry2; pfn_t pfn; @@ -929,7 +929,7 @@ static int dax_load_hole(struct address_space *mapping, void *entry, goto out; } - vm_insert_mixed(vmf->vma, vaddr, pfn); + ret = vmf_insert_mixed(vmf->vma, vaddr, pfn); out: trace_dax_load_hole(inode, vmf, ret); return ret; @@ -1112,7 +1112,7 @@ dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, } EXPORT_SYMBOL_GPL(dax_iomap_rw); -static int dax_fault_return(int error) +static vm_fault_t dax_fault_return(int error) { if (error == 0) return VM_FAULT_NOPAGE; @@ -1132,7 +1132,7 @@ static bool dax_fault_is_synchronous(unsigned long flags, && (iomap->flags & IOMAP_F_DIRTY); } -static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, +static vm_fault_t dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops) { struct vm_area_struct *vma = vmf->vma; @@ -1145,18 +1145,18 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, int error, major = 0; bool write = vmf->flags & FAULT_FLAG_WRITE; bool sync; - int vmf_ret = 0; + vm_fault_t ret = 0; void *entry; pfn_t pfn; - trace_dax_pte_fault(inode, vmf, vmf_ret); + trace_dax_pte_fault(inode, vmf, ret); /* * Check whether offset isn't beyond end of file now. Caller is supposed * to hold locks serializing us with truncate / punch hole so this is * a reliable test.
*/ if (pos >= i_size_read(inode)) { - vmf_ret = VM_FAULT_SIGBUS; + ret = VM_FAULT_SIGBUS; goto out; } @@ -1165,7 +1165,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, entry = grab_mapping_entry(mapping, vmf->pgoff, 0); if (IS_ERR(entry)) { - vmf_ret = dax_fault_return(PTR_ERR(entry)); + ret = dax_fault_return(PTR_ERR(entry)); goto out; } @@ -1176,7 +1176,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, * retried. */ if (pmd_trans_huge(*vmf->pmd) || pmd_devmap(*vmf->pmd)) { - vmf_ret = VM_FAULT_NOPAGE; + ret = VM_FAULT_NOPAGE; goto unlock_entry; } @@ -1189,7 +1189,7 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, if (iomap_errp) *iomap_errp = error; if (error) { - vmf_ret = dax_fault_return(error); + ret = dax_fault_return(error); goto unlock_entry; } if (WARN_ON_ONCE(iomap.offset + iomap.length < pos + PAGE_SIZE)) { @@ -1219,9 +1219,9 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, goto error_finish_iomap; __SetPageUptodate(vmf->cow_page); - vmf_ret = finish_fault(vmf); - if (!vmf_ret) - vmf_ret = VM_FAULT_DONE_COW; + ret = finish_fault(vmf); + if (!ret) + ret = VM_FAULT_DONE_COW; goto finish_iomap; } @@ -1257,23 +1257,20 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, goto error_finish_iomap; } *pfnp = pfn; - vmf_ret = VM_FAULT_NEEDDSYNC | major; + ret = VM_FAULT_NEEDDSYNC | major; goto finish_iomap; } trace_dax_insert_mapping(inode, vmf, entry); if (write) - error = vm_insert_mixed_mkwrite(vma, vaddr, pfn); + ret = vmf_insert_mixed_mkwrite(vma, vaddr, pfn); else - error = vm_insert_mixed(vma, vaddr, pfn); + ret = vmf_insert_mixed(vma, vaddr, pfn); - /* -EBUSY is fine, somebody else faulted on the same PTE */ - if (error == -EBUSY) - error = 0; - break; + goto finish_iomap; case IOMAP_UNWRITTEN: case IOMAP_HOLE: if (!write) { - vmf_ret = dax_load_hole(mapping, entry, vmf); + ret = dax_load_hole(mapping, entry, vmf); goto finish_iomap; } /*FALLTHRU*/ @@ -1284,12 +1281,12 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, } error_finish_iomap: - vmf_ret = dax_fault_return(error) | major; + ret = dax_fault_return(error); finish_iomap: if (ops->iomap_end) { int copied = PAGE_SIZE; - if (vmf_ret & VM_FAULT_ERROR) + if (ret & VM_FAULT_ERROR) copied = 0; /* * The fault is done by now and there's no way back (other @@ -1302,12 +1299,12 @@ static int dax_iomap_pte_fault(struct vm_fault *vmf, pfn_t *pfnp, unlock_entry: put_locked_mapping_entry(mapping, vmf->pgoff); out: - trace_dax_pte_fault_done(inode, vmf, vmf_ret); - return vmf_ret; + trace_dax_pte_fault_done(inode, vmf, ret); + return ret | major; } #ifdef CONFIG_FS_DAX_PMD -static int dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, +static vm_fault_t dax_pmd_load_hole(struct vm_fault *vmf, struct iomap *iomap, void *entry) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; @@ -1348,7 +1345,7 @@ fallback: return VM_FAULT_FALLBACK; } -static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, +static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { struct vm_area_struct *vma = vmf->vma; @@ -1358,7 +1355,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, bool sync; unsigned int iomap_flags = (write ? 
IOMAP_WRITE : 0) | IOMAP_FAULT; struct inode *inode = mapping->host; - int result = VM_FAULT_FALLBACK; + vm_fault_t result = VM_FAULT_FALLBACK; struct iomap iomap = { 0 }; pgoff_t max_pgoff, pgoff; void *entry; @@ -1509,7 +1506,7 @@ out: return result; } #else -static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, +static vm_fault_t dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, const struct iomap_ops *ops) { return VM_FAULT_FALLBACK; @@ -1529,7 +1526,7 @@ static int dax_iomap_pmd_fault(struct vm_fault *vmf, pfn_t *pfnp, * has done all the necessary locking for page fault to proceed * successfully. */ -int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, +vm_fault_t dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, pfn_t *pfnp, int *iomap_errp, const struct iomap_ops *ops) { switch (pe_size) { @@ -1553,14 +1550,14 @@ EXPORT_SYMBOL_GPL(dax_iomap_fault); * DAX file. It takes care of marking corresponding radix tree entry as dirty * as well. */ -static int dax_insert_pfn_mkwrite(struct vm_fault *vmf, +static vm_fault_t dax_insert_pfn_mkwrite(struct vm_fault *vmf, enum page_entry_size pe_size, pfn_t pfn) { struct address_space *mapping = vmf->vma->vm_file->f_mapping; void *entry, **slot; pgoff_t index = vmf->pgoff; - int vmf_ret, error; + vm_fault_t ret; xa_lock_irq(&mapping->i_pages); entry = get_unlocked_mapping_entry(mapping, index, &slot); @@ -1579,21 +1576,20 @@ static int dax_insert_pfn_mkwrite(struct vm_fault *vmf, xa_unlock_irq(&mapping->i_pages); switch (pe_size) { case PE_SIZE_PTE: - error = vm_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); - vmf_ret = dax_fault_return(error); + ret = vmf_insert_mixed_mkwrite(vmf->vma, vmf->address, pfn); break; #ifdef CONFIG_FS_DAX_PMD case PE_SIZE_PMD: - vmf_ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, + ret = vmf_insert_pfn_pmd(vmf->vma, vmf->address, vmf->pmd, pfn, true); break; #endif default: - vmf_ret = VM_FAULT_FALLBACK; + ret = VM_FAULT_FALLBACK; } put_locked_mapping_entry(mapping, index); - trace_dax_insert_pfn_mkwrite(mapping->host, vmf, vmf_ret); - return vmf_ret; + trace_dax_insert_pfn_mkwrite(mapping->host, vmf, ret); + return ret; } /** @@ -1606,8 +1602,8 @@ static int dax_insert_pfn_mkwrite(struct vm_fault *vmf, * stored persistently on the media and handles inserting of appropriate page * table entry. 
*/ -int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size, - pfn_t pfn) +vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, + enum page_entry_size pe_size, pfn_t pfn) { int err; loff_t start = ((loff_t)vmf->pgoff) << PAGE_SHIFT; diff --git a/include/linux/dax.h b/include/linux/dax.h index c99692ddd4b5..88504e87cd6c 100644 --- a/include/linux/dax.h +++ b/include/linux/dax.h @@ -125,8 +125,8 @@ ssize_t dax_iomap_rw(struct kiocb *iocb, struct iov_iter *iter, const struct iomap_ops *ops); int dax_iomap_fault(struct vm_fault *vmf, enum page_entry_size pe_size, pfn_t *pfnp, int *errp, const struct iomap_ops *ops); -int dax_finish_sync_fault(struct vm_fault *vmf, enum page_entry_size pe_size, - pfn_t pfn); +vm_fault_t dax_finish_sync_fault(struct vm_fault *vmf, + enum page_entry_size pe_size, pfn_t pfn); int dax_delete_mapping_entry(struct address_space *mapping, pgoff_t index); int dax_invalidate_mapping_entry_sync(struct address_space *mapping, pgoff_t index); diff --git a/include/linux/mm.h b/include/linux/mm.h index 0495e6f97fae..b7012bb1d218 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2431,8 +2431,8 @@ int vm_insert_pfn_prot(struct vm_area_struct *vma, unsigned long addr, unsigned long pfn, pgprot_t pgprot); int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, pfn_t pfn); -int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn); +vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, + unsigned long addr, pfn_t pfn); int vm_iomap_memory(struct vm_area_struct *vma, phys_addr_t start, unsigned long len); static inline vm_fault_t vmf_insert_page(struct vm_area_struct *vma, diff --git a/mm/memory.c b/mm/memory.c index 01f5464e0fd2..6a97893a8de7 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -1955,12 +1955,25 @@ int vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, } EXPORT_SYMBOL(vm_insert_mixed); -int vm_insert_mixed_mkwrite(struct vm_area_struct *vma, unsigned long addr, - pfn_t pfn) +/* + * If the insertion of PTE failed because someone else already added a + * different entry in the mean time, we treat that as success as we assume + * the same entry was actually inserted. + */ + +vm_fault_t vmf_insert_mixed_mkwrite(struct vm_area_struct *vma, + unsigned long addr, pfn_t pfn) { - return __vm_insert_mixed(vma, addr, pfn, true); + int err; + + err = __vm_insert_mixed(vma, addr, pfn, true); + if (err == -ENOMEM) + return VM_FAULT_OOM; + if (err < 0 && err != -EBUSY) + return VM_FAULT_SIGBUS; + return VM_FAULT_NOPAGE; } -EXPORT_SYMBOL(vm_insert_mixed_mkwrite); +EXPORT_SYMBOL(vmf_insert_mixed_mkwrite); /* * maps a range of physical memory into the requested pages. the old From 882ea1d64eb3956f7d877bcbc6370bd3c6d81543 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 7 Jun 2018 17:04:33 -0700 Subject: [PATCH 002/118] scripts: use SPDX tag in get_maintainer and checkpatch Add the appropriate SPDX tag to these scripts. 
Miscellanea: o Add my copyright to checkpatch Link: http://lkml.kernel.org/r/d08e49e8f6562c58a63792aa64306d1851f81f4b.camel@perches.com Signed-off-by: Joe Perches Cc: Andy Whitcroft Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 4 +++- scripts/get_maintainer.pl | 4 ++-- 2 files changed, 5 insertions(+), 3 deletions(-) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index e6033d3c48d3..dfd265e036f3 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -1,9 +1,11 @@ #!/usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 +# # (c) 2001, Dave Jones. (the file handling bit) # (c) 2005, Joel Schopp (the ugly bit) # (c) 2007,2008, Andy Whitcroft (new conditions, test suite) # (c) 2008-2010 Andy Whitcroft -# Licensed under the terms of the GNU GPL License version 2 +# (c) 2010-2018 Joe Perches use strict; use warnings; diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 99c96e86eccb..30eca36b4dad 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -1,4 +1,6 @@ #!/usr/bin/env perl +# SPDX-License-Identifier: GPL-2.0 +# # (c) 2007, Joe Perches # created from checkpatch.pl # @@ -7,8 +9,6 @@ # # usage: perl scripts/get_maintainer.pl [OPTIONS] # perl scripts/get_maintainer.pl [OPTIONS] -f -# -# Licensed under the terms of the GNU GPL License version 2 use warnings; use strict; From 5bc55d654bdd679b8fee0ca7a3b180cbeaa4168d Mon Sep 17 00:00:00 2001 From: Jia Guo Date: Thu, 7 Jun 2018 17:04:38 -0700 Subject: [PATCH 003/118] ocfs2: clean up redundant function declarations ocfs2_extend_allocation() has been deleted, clean up its declaration. Also change the static function name from __ocfs2_extend_allocation() to ocfs2_extend_allocation() to be consistent with the corresponding trace events as well as comments for ocfs2_lock_allocators(). 
Link: http://lkml.kernel.org/r/09cf7125-6f12-e53e-20f5-e606b2c16b48@huawei.com Fixes: 964f14a0d350 ("ocfs2: clean up some dead code") Signed-off-by: Jia Guo Acked-by: Joseph Qi Reviewed-by: Jun Piao Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/file.c | 10 +++++----- fs/ocfs2/file.h | 2 -- 2 files changed, 5 insertions(+), 7 deletions(-) diff --git a/fs/ocfs2/file.c b/fs/ocfs2/file.c index 6ee94bc23f5b..a2a8603d27e0 100644 --- a/fs/ocfs2/file.c +++ b/fs/ocfs2/file.c @@ -563,8 +563,8 @@ int ocfs2_add_inode_data(struct ocfs2_super *osb, return ret; } -static int __ocfs2_extend_allocation(struct inode *inode, u32 logical_start, - u32 clusters_to_add, int mark_unwritten) +static int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, + u32 clusters_to_add, int mark_unwritten) { int status = 0; int restart_func = 0; @@ -1035,8 +1035,8 @@ int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, clusters_to_add -= oi->ip_clusters; if (clusters_to_add) { - ret = __ocfs2_extend_allocation(inode, oi->ip_clusters, - clusters_to_add, 0); + ret = ocfs2_extend_allocation(inode, oi->ip_clusters, + clusters_to_add, 0); if (ret) { mlog_errno(ret); goto out; @@ -1493,7 +1493,7 @@ static int ocfs2_allocate_unwritten_extents(struct inode *inode, goto next; } - ret = __ocfs2_extend_allocation(inode, cpos, alloc_size, 1); + ret = ocfs2_extend_allocation(inode, cpos, alloc_size, 1); if (ret) { if (ret != -ENOSPC) mlog_errno(ret); diff --git a/fs/ocfs2/file.h b/fs/ocfs2/file.h index 1fdc9839cd93..7eb7f03531f6 100644 --- a/fs/ocfs2/file.h +++ b/fs/ocfs2/file.h @@ -65,8 +65,6 @@ int ocfs2_extend_no_holes(struct inode *inode, struct buffer_head *di_bh, u64 new_i_size, u64 zero_to); int ocfs2_zero_extend(struct inode *inode, struct buffer_head *di_bh, loff_t zero_to); -int ocfs2_extend_allocation(struct inode *inode, u32 logical_start, - u32 clusters_to_add, int mark_unwritten); int ocfs2_setattr(struct dentry *dentry, struct iattr *attr); int ocfs2_getattr(const struct path *path, struct kstat *stat, u32 request_mask, unsigned int flags); From 133b81f28edebfd9b20486351589e460e8c052e4 Mon Sep 17 00:00:00 2001 From: Larry Chen Date: Thu, 7 Jun 2018 17:04:43 -0700 Subject: [PATCH 004/118] ocfs2: ocfs2_inode_lock_tracker does not distinguish lock level ocfs2_inode_lock_tracker, a variant of ocfs2_inode_lock, is used to prevent deadlock due to recursive lock acquisition. But this function does not distinguish whether the requested level is EX or PR. If a PR lock has been attained, this function will immediately return success even when an EX lock is requested. But actually the return value does not mean that the process got an EX lock, because ocfs2_inode_lock has not been called. When taking lock levels into account, we face some different situations: 1. no lock is held In this case, just lock the inode and return 0 2. We are holding a lock For this situation, things diverge into several cases wanted holding what to do ex ex see 2.1 below ex pr see 2.2 below pr ex see 2.1 below pr pr see 2.1 below 2.1 The lock level already held is compatible with the wanted level, so no lock action will be taken. 2.2 Otherwise, an upgrade is needed, but it is forbidden. The reason an upgrade within a process is forbidden is that a lock upgrade may cause deadlock. The following illustrates how it happens.
process 1 process 2 ocfs2_inode_lock_tracker(ex=0) <====== ocfs2_inode_lock_tracker(ex=1) ocfs2_inode_lock_tracker(ex=1) For the status quo of ocfs2, without this patch, neither a bug nor end-user impact will be caused because the wrong logic is avoided. But I'm afraid this generic interface, may be called by other developers in future and used in this situation. a process ocfs2_inode_lock_tracker(ex=0) ocfs2_inode_lock_tracker(ex=1) Link: http://lkml.kernel.org/r/20180510053230.17217-1-lchen@suse.com Signed-off-by: Larry Chen Reviewed-by: Gang He Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/dlmglue.c | 125 +++++++++++++++++++++++++++++++++------------ fs/ocfs2/dlmglue.h | 1 + 2 files changed, 93 insertions(+), 33 deletions(-) diff --git a/fs/ocfs2/dlmglue.c b/fs/ocfs2/dlmglue.c index 97a972efab83..68728de12864 100644 --- a/fs/ocfs2/dlmglue.c +++ b/fs/ocfs2/dlmglue.c @@ -788,6 +788,23 @@ static inline void ocfs2_add_holder(struct ocfs2_lock_res *lockres, spin_unlock(&lockres->l_lock); } +static struct ocfs2_lock_holder * +ocfs2_pid_holder(struct ocfs2_lock_res *lockres, + struct pid *pid) +{ + struct ocfs2_lock_holder *oh; + + spin_lock(&lockres->l_lock); + list_for_each_entry(oh, &lockres->l_holders, oh_list) { + if (oh->oh_owner_pid == pid) { + spin_unlock(&lockres->l_lock); + return oh; + } + } + spin_unlock(&lockres->l_lock); + return NULL; +} + static inline void ocfs2_remove_holder(struct ocfs2_lock_res *lockres, struct ocfs2_lock_holder *oh) { @@ -798,24 +815,6 @@ static inline void ocfs2_remove_holder(struct ocfs2_lock_res *lockres, put_pid(oh->oh_owner_pid); } -static inline int ocfs2_is_locked_by_me(struct ocfs2_lock_res *lockres) -{ - struct ocfs2_lock_holder *oh; - struct pid *pid; - - /* look in the list of holders for one with the current task as owner */ - spin_lock(&lockres->l_lock); - pid = task_pid(current); - list_for_each_entry(oh, &lockres->l_holders, oh_list) { - if (oh->oh_owner_pid == pid) { - spin_unlock(&lockres->l_lock); - return 1; - } - } - spin_unlock(&lockres->l_lock); - - return 0; -} static inline void ocfs2_inc_holders(struct ocfs2_lock_res *lockres, int level) @@ -2610,34 +2609,93 @@ void ocfs2_inode_unlock(struct inode *inode, * * return < 0 on error, return == 0 if there's no lock holder on the stack * before this call, return == 1 if this call would be a recursive locking. + * return == -1 if this lock attempt will cause an upgrade which is forbidden. + * + * When taking lock levels into account,we face some different situations. + * + * 1. no lock is held + * In this case, just lock the inode as requested and return 0 + * + * 2. We are holding a lock + * For this situation, things diverges into several cases + * + * wanted holding what to do + * ex ex see 2.1 below + * ex pr see 2.2 below + * pr ex see 2.1 below + * pr pr see 2.1 below + * + * 2.1 lock level that is been held is compatible + * with the wanted level, so no lock action will be tacken. + * + * 2.2 Otherwise, an upgrade is needed, but it is forbidden. + * + * Reason why upgrade within a process is forbidden is that + * lock upgrade may cause dead lock. The following illustrates + * how it happens. 
+ * + * thread on node1 thread on node2 + * ocfs2_inode_lock_tracker(ex=0) + * + * <====== ocfs2_inode_lock_tracker(ex=1) + * + * ocfs2_inode_lock_tracker(ex=1) */ int ocfs2_inode_lock_tracker(struct inode *inode, struct buffer_head **ret_bh, int ex, struct ocfs2_lock_holder *oh) { - int status; - int arg_flags = 0, has_locked; + int status = 0; struct ocfs2_lock_res *lockres; + struct ocfs2_lock_holder *tmp_oh; + struct pid *pid = task_pid(current); + lockres = &OCFS2_I(inode)->ip_inode_lockres; - has_locked = ocfs2_is_locked_by_me(lockres); - /* Just get buffer head if the cluster lock has been taken */ - if (has_locked) - arg_flags = OCFS2_META_LOCK_GETBH; + tmp_oh = ocfs2_pid_holder(lockres, pid); - if (likely(!has_locked || ret_bh)) { - status = ocfs2_inode_lock_full(inode, ret_bh, ex, arg_flags); + if (!tmp_oh) { + /* + * This corresponds to the case 1. + * We haven't got any lock before. + */ + status = ocfs2_inode_lock_full(inode, ret_bh, ex, 0); + if (status < 0) { + if (status != -ENOENT) + mlog_errno(status); + return status; + } + + oh->oh_ex = ex; + ocfs2_add_holder(lockres, oh); + return 0; + } + + if (unlikely(ex && !tmp_oh->oh_ex)) { + /* + * case 2.2 upgrade may cause dead lock, forbid it. + */ + mlog(ML_ERROR, "Recursive locking is not permitted to " + "upgrade to EX level from PR level.\n"); + dump_stack(); + return -EINVAL; + } + + /* + * case 2.1 OCFS2_META_LOCK_GETBH flag make ocfs2_inode_lock_full. + * ignore the lock level and just update it. + */ + if (ret_bh) { + status = ocfs2_inode_lock_full(inode, ret_bh, ex, + OCFS2_META_LOCK_GETBH); if (status < 0) { if (status != -ENOENT) mlog_errno(status); return status; } } - if (!has_locked) - ocfs2_add_holder(lockres, oh); - - return has_locked; + return tmp_oh ? 1 : 0; } void ocfs2_inode_unlock_tracker(struct inode *inode, @@ -2649,12 +2707,13 @@ void ocfs2_inode_unlock_tracker(struct inode *inode, lockres = &OCFS2_I(inode)->ip_inode_lockres; /* had_lock means that the currect process already takes the cluster - * lock previously. If had_lock is 1, we have nothing to do here, and - * it will get unlocked where we got the lock. + * lock previously. + * If had_lock is 1, we have nothing to do here. + * If had_lock is 0, we will release the lock. */ if (!had_lock) { + ocfs2_inode_unlock(inode, oh->oh_ex); ocfs2_remove_holder(lockres, oh); - ocfs2_inode_unlock(inode, ex); } } diff --git a/fs/ocfs2/dlmglue.h b/fs/ocfs2/dlmglue.h index 256e0a9067b8..4ec1c828f6e0 100644 --- a/fs/ocfs2/dlmglue.h +++ b/fs/ocfs2/dlmglue.h @@ -96,6 +96,7 @@ struct ocfs2_trim_fs_info { struct ocfs2_lock_holder { struct list_head oh_list; struct pid *oh_owner_pid; + int oh_ex; }; /* ocfs2_inode_lock_full() 'arg_flags' flags */ From 731a40fab1b305ad71e2b769c0e3550c092dc215 Mon Sep 17 00:00:00 2001 From: Zhen Lei Date: Thu, 7 Jun 2018 17:04:47 -0700 Subject: [PATCH 005/118] ocfs2: eliminate a misreported warning The warning is invalid because the parameter chunksize passed from ocfs2_info_freefrag_scan_chain-->ocfs2_info_update_ffg is guaranteed to be positive. So __ilog2_u32 cannot return -1. 
fs/ocfs2/ioctl.c: In function 'ocfs2_info_update_ffg': fs/ocfs2/ioctl.c:411:17: warning: array subscript is below array bounds [-Warray-bounds] hist->fc_chunks[index]++; ^ fs/ocfs2/ioctl.c:411:17: warning: array subscript is below array bounds [-Warray-bounds] Link: http://lkml.kernel.org/r/1524655799-12112-1-git-send-email-thunder.leizhen@huawei.com Signed-off-by: Zhen Lei Reviewed-by: Andrew Morton Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/ioctl.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/ocfs2/ioctl.c b/fs/ocfs2/ioctl.c index ab30c005cc4b..994726ada857 100644 --- a/fs/ocfs2/ioctl.c +++ b/fs/ocfs2/ioctl.c @@ -402,7 +402,7 @@ out_err: static void o2ffg_update_histogram(struct ocfs2_info_free_chunk_list *hist, unsigned int chunksize) { - int index; + u32 index; index = __ilog2_u32(chunksize); if (index >= OCFS2_INFO_MAX_HIST) From f3797d8ae596636ab42e59493b3bcbe4a04f296e Mon Sep 17 00:00:00 2001 From: Guozhonghua Date: Thu, 7 Jun 2018 17:04:51 -0700 Subject: [PATCH 006/118] ocfs2: correct the comments position of struct ocfs2_dir_block_trailer Correct the comments position of the structure ocfs2_dir_block_trailer. Link: http://lkml.kernel.org/r/71604351584F6A4EBAE558C676F37CA401071C5FDE@H3CMLB12-EX.srv.huawei-3com.com Signed-off-by: guozhonghua Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/ocfs2_fs.h | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/fs/ocfs2/ocfs2_fs.h b/fs/ocfs2/ocfs2_fs.h index 5bb4a89f9045..7071ad0dec90 100644 --- a/fs/ocfs2/ocfs2_fs.h +++ b/fs/ocfs2/ocfs2_fs.h @@ -807,11 +807,11 @@ struct ocfs2_dir_block_trailer { * in this block. (unused) */ /*10*/ __u8 db_signature[8]; /* Signature for verification */ __le64 db_reserved2; - __le64 db_free_next; /* Next block in list (unused) */ -/*20*/ __le64 db_blkno; /* Offset on disk, in blocks */ - __le64 db_parent_dinode; /* dinode which owns me, in +/*20*/ __le64 db_free_next; /* Next block in list (unused) */ + __le64 db_blkno; /* Offset on disk, in blocks */ +/*30*/ __le64 db_parent_dinode; /* dinode which owns me, in blocks */ -/*30*/ struct ocfs2_block_check db_check; /* Error checking */ + struct ocfs2_block_check db_check; /* Error checking */ /*40*/ }; From 64202a21a449837aa155764029216d69019cfb15 Mon Sep 17 00:00:00 2001 From: Salvatore Mesoraca Date: Thu, 7 Jun 2018 17:04:55 -0700 Subject: [PATCH 007/118] ocfs2: drop a VLA in ocfs2_orphan_del() Avoid a VLA by using a real constant expression instead of a variable. The compiler should be able to optimize the original code and avoid using an actual VLA. Anyway, this change is useful because it avoids a false positive with -Wvla, and it might also help the compiler generate better code.
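To make the -Wvla point concrete, here is a minimal standalone sketch; the macro names and values are illustrative stand-ins, not the real ocfs2 constants. In C, a 'const int' is not a constant expression, so the first declaration is formally a VLA even though every operand is known at compile time, while the second uses a true constant expression and compiles cleanly with -Wvla:

#define PREFIX_LEN	8	/* stand-in, not OCFS2_DIO_ORPHAN_PREFIX_LEN */
#define NAME_LEN	20	/* stand-in, not OCFS2_ORPHAN_NAMELEN */

void before(void)
{
	const int namelen = PREFIX_LEN + NAME_LEN;
	char name[namelen + 1];	/* a VLA as far as -Wvla is concerned */

	name[0] = '\0';
}

void after(void)
{
	char name[PREFIX_LEN + NAME_LEN + 1];	/* true constant expression */

	name[0] = '\0';
}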
Link: http://lkml.kernel.org/r/1520970710-19732-1-git-send-email-s.mesoraca16@gmail.com Signed-off-by: Salvatore Mesoraca Reviewed-by: Kees Cook Reviewed-by: Andrew Morton Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/namei.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/fs/ocfs2/namei.c b/fs/ocfs2/namei.c index 8dd6f703c819..b7ca84bc3df7 100644 --- a/fs/ocfs2/namei.c +++ b/fs/ocfs2/namei.c @@ -2332,8 +2332,7 @@ int ocfs2_orphan_del(struct ocfs2_super *osb, struct buffer_head *orphan_dir_bh, bool dio) { - const int namelen = OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN; - char name[namelen + 1]; + char name[OCFS2_DIO_ORPHAN_PREFIX_LEN + OCFS2_ORPHAN_NAMELEN + 1]; struct ocfs2_dinode *orphan_fe; int status = 0; struct ocfs2_dir_lookup_result lookup = { NULL, }; From c6137fe36d2ac160d919bf97bc3dfd6494b0b471 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Thu, 7 Jun 2018 17:04:59 -0700 Subject: [PATCH 008/118] fs: ocfs2: use new return type vm_fault_t Use new return type vm_fault_t for fault handler. For now, this is just documenting that the function returns a VM_FAULT value rather than an errno. Once all instances are converted, vm_fault_t will become a distinct type. Ref-> commit 1c8f422059ae ("mm: change return type to vm_fault_t") vmf_error() is the newly introduce inline function in 4.18. Fix one checkpatch.pl warning by replacing BUG_ON() with WARN_ON() [akpm@linux-foundation.org: undo BUG_ON->WARN_ON change] Link: http://lkml.kernel.org/r/20180523153258.GA28451@jordon-HP-15-Notebook-PC Signed-off-by: Souptick Joarder Reviewed-by: Matthew Wilcox Cc: Mark Fasheh Cc: Joel Becker Cc: Junxiao Bi Cc: Joseph Qi Cc: Changwei Ge Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/ocfs2/mmap.c | 44 ++++++++++++++++++++------------------------ 1 file changed, 20 insertions(+), 24 deletions(-) diff --git a/fs/ocfs2/mmap.c b/fs/ocfs2/mmap.c index fb9a20e3d608..05220b365fb9 100644 --- a/fs/ocfs2/mmap.c +++ b/fs/ocfs2/mmap.c @@ -44,11 +44,11 @@ #include "ocfs2_trace.h" -static int ocfs2_fault(struct vm_fault *vmf) +static vm_fault_t ocfs2_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; sigset_t oldset; - int ret; + vm_fault_t ret; ocfs2_block_signals(&oldset); ret = filemap_fault(vmf); @@ -59,10 +59,11 @@ static int ocfs2_fault(struct vm_fault *vmf) return ret; } -static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, - struct page *page) +static vm_fault_t __ocfs2_page_mkwrite(struct file *file, + struct buffer_head *di_bh, struct page *page) { - int ret = VM_FAULT_NOPAGE; + int err; + vm_fault_t ret = VM_FAULT_NOPAGE; struct inode *inode = file_inode(file); struct address_space *mapping = inode->i_mapping; loff_t pos = page_offset(page); @@ -105,15 +106,12 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, if (page->index == last_index) len = ((size - 1) & ~PAGE_MASK) + 1; - ret = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP, + err = ocfs2_write_begin_nolock(mapping, pos, len, OCFS2_WRITE_MMAP, &locked_page, &fsdata, di_bh, page); - if (ret) { - if (ret != -ENOSPC) - mlog_errno(ret); - if (ret == -ENOMEM) - ret = VM_FAULT_OOM; - else - ret = VM_FAULT_SIGBUS; + if (err) { + if (err != -ENOSPC) + mlog_errno(err); + ret = vmf_error(err); goto out; } @@ -121,20 +119,21 @@ static int __ocfs2_page_mkwrite(struct file *file, struct buffer_head *di_bh, ret = 
VM_FAULT_NOPAGE; goto out; } - ret = ocfs2_write_end_nolock(mapping, pos, len, len, fsdata); - BUG_ON(ret != len); + err = ocfs2_write_end_nolock(mapping, pos, len, len, fsdata); + BUG_ON(err != len); ret = VM_FAULT_LOCKED; out: return ret; } -static int ocfs2_page_mkwrite(struct vm_fault *vmf) +static vm_fault_t ocfs2_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); struct buffer_head *di_bh = NULL; sigset_t oldset; - int ret; + int err; + vm_fault_t ret; sb_start_pagefault(inode->i_sb); ocfs2_block_signals(&oldset); @@ -144,13 +143,10 @@ static int ocfs2_page_mkwrite(struct vm_fault *vmf) * node. Taking the data lock will also ensure that we don't * attempt page truncation as part of a downconvert. */ - ret = ocfs2_inode_lock(inode, &di_bh, 1); - if (ret < 0) { - mlog_errno(ret); - if (ret == -ENOMEM) - ret = VM_FAULT_OOM; - else - ret = VM_FAULT_SIGBUS; + err = ocfs2_inode_lock(inode, &di_bh, 1); + if (err < 0) { + mlog_errno(err); + ret = vmf_error(err); goto out; } From 88aa7cc688d48ddd84558b41d5905a0db9535c4b Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Thu, 7 Jun 2018 17:05:03 -0700 Subject: [PATCH 009/118] net/9p: detect invalid options as much as possible Currently, when invalid options are detected during option parsing, some options (e.g. msize) just set errno and keep validating the remaining options, so that as many invalid options as possible are detected and the proper error messages can be given together. This patch applies the same rule to the 'trans' and 'version' options when detecting -EINVAL. Link: http://lkml.kernel.org/r/1525340676-34072-1-git-send-email-cgxu519@gmx.com Signed-off-by: Chengguang Xu Reviewed-by: Andrew Morton Cc: Eric Van Hensbergen Cc: Ron Minnich Cc: Latchesar Ionkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/9p/client.c | 13 +++++-------- 1 file changed, 5 insertions(+), 8 deletions(-) diff --git a/net/9p/client.c b/net/9p/client.c index 21e6df1cc70f..18c5271910dc 100644 --- a/net/9p/client.c +++ b/net/9p/client.c @@ -198,8 +198,6 @@ static int parse_opts(char *opts, struct p9_client *clnt) pr_info("Could not find request transport: %s\n", s); ret = -EINVAL; - kfree(s); - goto free_and_return; } kfree(s); break; @@ -214,13 +212,12 @@ static int parse_opts(char *opts, struct p9_client *clnt) "problem allocating copy of version arg\n"); goto free_and_return; } - ret = get_protocol_version(s); - if (ret == -EINVAL) { - kfree(s); - goto free_and_return; - } + r = get_protocol_version(s); + if (r < 0) + ret = r; + else + clnt->proto_version = r; kfree(s); - clnt->proto_version = ret; break; default: continue; From 478ae0ca08c239e83803191317ae99d66708306c Mon Sep 17 00:00:00 2001 From: Chengguang Xu Date: Thu, 7 Jun 2018 17:05:07 -0700 Subject: [PATCH 010/118] fs/9p: detect invalid options as much as possible Currently, when invalid options are detected during option parsing, some options (e.g. msize) just set errno and keep validating the remaining options, so that as many invalid options as possible are detected and the proper error messages can be given together. This patch applies the same rule to the 'cache' and 'access' options when detecting -EINVAL.
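As a rough illustration of the parsing policy both 9p patches follow, here is a hedged userspace sketch; the option grammar and names are made up for the example and are not the net/9p code. An invalid value is reported and remembered in ret, but the loop moves on, so every bad option produces its own message and the caller still sees an error at the end:

#include <errno.h>
#include <stdio.h>
#include <stdlib.h>
#include <string.h>

/* Parse "msize=N,version=S"; record errors but keep validating. */
static int parse_opts(char *opts, long *msize, const char **version)
{
	int ret = 0;
	char *s, *save = NULL;

	for (s = strtok_r(opts, ",", &save); s;
	     s = strtok_r(NULL, ",", &save)) {
		if (!strncmp(s, "msize=", 6)) {
			long v = strtol(s + 6, NULL, 10);

			if (v <= 0) {
				fprintf(stderr, "invalid msize '%s'\n", s + 6);
				ret = -EINVAL;	/* remember, don't bail out */
			} else {
				*msize = v;
			}
		} else if (!strncmp(s, "version=", 8)) {
			if (strcmp(s + 8, "9p2000") &&
			    strcmp(s + 8, "9p2000.L")) {
				fprintf(stderr, "unknown version '%s'\n", s + 8);
				ret = -EINVAL;	/* remember, don't bail out */
			} else {
				*version = s + 8;
			}
		}
	}

	return ret;	/* 0 only if every option was valid */
}

The kernel code above has the same shape: set ret, fall through, and keep iterating instead of jumping to the error exit.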
Link: http://lkml.kernel.org/r/1525340676-34072-2-git-send-email-cgxu519@gmx.com Signed-off-by: Chengguang Xu Reviewed-by: Andrew Morton Cc: Eric Van Hensbergen Cc: Ron Minnich Cc: Latchesar Ionkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/9p/v9fs.c | 29 ++++++++++++----------------- 1 file changed, 12 insertions(+), 17 deletions(-) diff --git a/fs/9p/v9fs.c b/fs/9p/v9fs.c index e622f0f10502..0429c8ee58f1 100644 --- a/fs/9p/v9fs.c +++ b/fs/9p/v9fs.c @@ -210,12 +210,12 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) p9_debug(P9_DEBUG_ERROR, "integer field, but no integer?\n"); ret = r; - continue; - } - v9ses->debug = option; + } else { + v9ses->debug = option; #ifdef CONFIG_NET_9P_DEBUG - p9_debug_level = option; + p9_debug_level = option; #endif + } break; case Opt_dfltuid: @@ -231,7 +231,6 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) p9_debug(P9_DEBUG_ERROR, "uid field, but not a uid?\n"); ret = -EINVAL; - continue; } break; case Opt_dfltgid: @@ -247,7 +246,6 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) p9_debug(P9_DEBUG_ERROR, "gid field, but not a gid?\n"); ret = -EINVAL; - continue; } break; case Opt_afid: @@ -256,9 +254,9 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) p9_debug(P9_DEBUG_ERROR, "integer field, but no integer?\n"); ret = r; - continue; + } else { + v9ses->afid = option; } - v9ses->afid = option; break; case Opt_uname: kfree(v9ses->uname); @@ -306,13 +304,12 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) "problem allocating copy of cache arg\n"); goto free_and_return; } - ret = get_cache_mode(s); - if (ret == -EINVAL) { - kfree(s); - goto free_and_return; - } + r = get_cache_mode(s); + if (r < 0) + ret = r; + else + v9ses->cache = r; - v9ses->cache = ret; kfree(s); break; @@ -341,14 +338,12 @@ static int v9fs_parse_options(struct v9fs_session_info *v9ses, char *opts) pr_info("Unknown access argument %s\n", s); kfree(s); - goto free_and_return; + continue; } v9ses->uid = make_kuid(current_user_ns(), uid); if (!uid_valid(v9ses->uid)) { ret = -EINVAL; pr_info("Uknown uid %s\n", s); - kfree(s); - goto free_and_return; } } From e56ee574bc46b9734066008b3dc105bd7d4cbc0c Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 7 Jun 2018 17:05:10 -0700 Subject: [PATCH 011/118] net/9p/trans_xen.c: don't include rwlock.h directly rwlock.h should not be included directly; linux/spinlock.h should be included instead. One thing including rwlock.h directly does is break the RT build. Link: http://lkml.kernel.org/r/20180504100319.11880-1-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Andrew Morton Cc: Eric Van Hensbergen Cc: Ron Minnich Cc: Latchesar Ionkov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- net/9p/trans_xen.c | 1 - 1 file changed, 1 deletion(-) diff --git a/net/9p/trans_xen.c b/net/9p/trans_xen.c index 0f19960390a6..2e2b8bca54f3 100644 --- a/net/9p/trans_xen.c +++ b/net/9p/trans_xen.c @@ -38,7 +38,6 @@ #include #include -#include #include #include #include From 128227e7fe4087b60f1bd31f762e61237eb23790 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:05:13 -0700 Subject: [PATCH 012/118] slab: __GFP_ZERO is incompatible with a constructor __GFP_ZERO requests that the object be initialised to all-zeroes, while the purpose of a constructor is to initialise an object to a particular pattern. We cannot do both.
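To see why the two requests contradict each other, consider this illustrative kernel-style snippet; it is an assumed example cache, not code from the patch. The constructor deliberately establishes non-zero state, so zeroing the object afterwards would corrupt it:

#include <linux/list.h>
#include <linux/slab.h>

struct foo {
	int refcount;
	struct list_head list;
};

static void foo_ctor(void *obj)
{
	struct foo *f = obj;

	f->refcount = 1;		/* deliberately non-zero */
	INIT_LIST_HEAD(&f->list);	/* must point at itself, not NULL */
}

/* cache = kmem_cache_create("foo", sizeof(struct foo), 0, 0, foo_ctor); */

/* The allocation the new WARN_ON_ONCE() is meant to catch: zeroing
 * would wipe refcount and break the list_head's self-references. */
/* f = kmem_cache_alloc(cache, GFP_KERNEL | __GFP_ZERO); */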
Add a warning to catch any users who mistakenly pass a __GFP_ZERO flag when allocating a slab with a constructor. Link: http://lkml.kernel.org/r/20180412191322.GA21205@bombadil.infradead.org Fixes: d07dbea46405 ("Slab allocators: support __GFP_ZERO in all allocators") Signed-off-by: Matthew Wilcox Acked-by: Johannes Weiner Acked-by: Vlastimil Babka Acked-by: Michal Hocko Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 2 ++ mm/slob.c | 4 +++- mm/slub.c | 2 ++ 3 files changed, 7 insertions(+), 1 deletion(-) diff --git a/mm/slab.c b/mm/slab.c index 2f308253c3d7..c1fe8099b3cd 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -2665,6 +2665,7 @@ static struct page *cache_grow_begin(struct kmem_cache *cachep, invalid_mask, &invalid_mask, flags, &flags); dump_stack(); } + WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO)); local_flags = flags & (GFP_CONSTRAINT_MASK|GFP_RECLAIM_MASK); check_irq_off(); @@ -3071,6 +3072,7 @@ static inline void cache_alloc_debugcheck_before(struct kmem_cache *cachep, static void *cache_alloc_debugcheck_after(struct kmem_cache *cachep, gfp_t flags, void *objp, unsigned long caller) { + WARN_ON_ONCE(cachep->ctor && (flags & __GFP_ZERO)); if (!objp) return objp; if (cachep->flags & SLAB_POISON) { diff --git a/mm/slob.c b/mm/slob.c index 623e8a5c46ce..307c2c9feb44 100644 --- a/mm/slob.c +++ b/mm/slob.c @@ -555,8 +555,10 @@ static void *slob_alloc_node(struct kmem_cache *c, gfp_t flags, int node) flags, node); } - if (b && c->ctor) + if (b && c->ctor) { + WARN_ON_ONCE(flags & __GFP_ZERO); c->ctor(b); + } kmemleak_alloc_recursive(b, c->size, 1, c->flags, flags); return b; diff --git a/mm/slub.c b/mm/slub.c index 44aa7847324a..dc960401ce90 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -2444,6 +2444,8 @@ static inline void *new_slab_objects(struct kmem_cache *s, gfp_t flags, struct kmem_cache_cpu *c = *pc; struct page *page; + WARN_ON_ONCE(s->ctor && (flags & __GFP_ZERO)); + freelist = get_partial(s, flags, node, c); if (freelist) From a38965bf941b7c2af50de09c96bc5f03e136caef Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Thu, 7 Jun 2018 17:05:17 -0700 Subject: [PATCH 013/118] mm/slub.c: add __printf verification to slab_err() __printf is useful to verify format and arguments. Remove the following warning (with W=1): mm/slub.c:721:2: warning: function might be possible candidate for `gnu_printf' format attribute [-Wsuggest-attribute=format] Link: http://lkml.kernel.org/r/20180505200706.19986-1-malat@debian.org Signed-off-by: Mathieu Malaterre Reviewed-by: Andrew Morton Cc: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/slub.c b/mm/slub.c index dc960401ce90..ec5916483b07 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -711,7 +711,7 @@ void object_err(struct kmem_cache *s, struct page *page, print_trailer(s, page, object); } -static void slab_err(struct kmem_cache *s, struct page *page, +static __printf(3, 4) void slab_err(struct kmem_cache *s, struct page *page, const char *fmt, ...) { va_list args; From 05088e5de02d274efba5e902d6cc9618e5de671a Mon Sep 17 00:00:00 2001 From: Canjiang Lu Date: Thu, 7 Jun 2018 17:05:20 -0700 Subject: [PATCH 014/118] mm/slub: remove obsolete comment The obsolete comment removed in this patch was introduced by 51df1142816e4 ("slub: Dynamically size kmalloc cache allocations"). 
I've pasted the related modification from that commit: +#ifdef CONFIG_NUMA + /* + * Allocate kmem_cache_node properly from the kmem_cache slab. + * kmem_cache_node is separately allocated so no need to + * update any list pointers. + */ + temp_kmem_cache_node = kmem_cache_node; + kmem_cache_node = kmem_cache_alloc(kmem_cache, GFP_NOWAIT); + memcpy(kmem_cache_node, temp_kmem_cache_node, kmem_size); + + kmem_cache_bootstrap_fixup(kmem_cache_node); + + caches++; +#else + /* + * kmem_cache has kmem_cache_node embedded and we moved it! + * Update the list heads + */ + INIT_LIST_HEAD(&kmem_cache->local_node.partial); + list_splice(&temp_kmem_cache->local_node.partial, &kmem_cache->local_node.partial); +#ifdef CONFIG_SLUB_DEBUG + INIT_LIST_HEAD(&kmem_cache->local_node.full); + list_splice(&temp_kmem_cache->local_node.full, &kmem_cache->local_node.full); +#endif As we can see, they're used to distinguish the different handling between NUMA and non-NUMA configurations in the original commit. I think they don't make any sense in the current implementation, which sits above kmem_cache_node = bootstrap(&boot_kmem_cache_node); So maybe it's better to remove them now? Link: http://lkml.kernel.org/r/5af26f58.1c69fb81.1be0e.c520SMTPIN_ADDED_BROKEN@mx.google.com Signed-off-by: Canjiang Lu Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 6 ------ 1 file changed, 6 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index ec5916483b07..48f75872c356 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -4241,12 +4241,6 @@ void __init kmem_cache_init(void) SLAB_HWCACHE_ALIGN, 0, 0); kmem_cache = bootstrap(&boot_kmem_cache); - - /* - * Allocate kmem_cache_node properly from the kmem_cache slab. - * kmem_cache_node is separately allocated so no need to - * update any list pointers. - */ kmem_cache_node = bootstrap(&boot_kmem_cache_node); /* Now we can use the kmem_cache to allocate kmalloc slabs */ From 05fec35ebb0e0e1f4603cc241fdf65fd3abb3092 Mon Sep 17 00:00:00 2001 From: Baoquan He Date: Thu, 7 Jun 2018 17:05:24 -0700 Subject: [PATCH 015/118] slab: clean up the code comment in slab kmem_cache struct In commit 3b0efdfa1e7 ("mm, sl[aou]b: Extract common fields from struct kmem_cache") the variable 'obj_size' was moved above, however the related code comment was not updated accordingly. Do it here. Link: http://lkml.kernel.org/r/20180603032402.27526-1-bhe@redhat.com Signed-off-by: Baoquan He Reviewed-by: Andrew Morton Acked-by: Christoph Lameter Cc: Pekka Enberg Cc: David Rientjes Cc: Joonsoo Kim Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slab_def.h | 7 ++++--- 1 file changed, 4 insertions(+), 3 deletions(-) diff --git a/include/linux/slab_def.h b/include/linux/slab_def.h index d9228e4d0320..3485c58cfd1c 100644 --- a/include/linux/slab_def.h +++ b/include/linux/slab_def.h @@ -67,9 +67,10 @@ struct kmem_cache { /* * If debugging is enabled, then the allocator can add additional - * fields and/or padding to every object. size contains the total - * object size including these internal fields, the following two - * variables contain the offset to the user object and its size. + * fields and/or padding to every object. 'size' contains the total + * object size including these internal fields, while 'obj_offset' + * and 'object_size' contain the offset to the user object and its + * size.
*/ int obj_offset; #endif /* CONFIG_DEBUG_SLAB */ From 88aa7cc688d48ddd84558b41d5905a0db9535c4b Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 7 Jun 2018 17:05:28 -0700 Subject: [PATCH 016/118] mm: introduce arg_lock to protect arg_start|end and env_start|end in mm_struct mmap_sem is on the kernel's hot path and is very contended, but it is also abused. It is used to protect arg_start|end and env_start|end when reading /proc/$PID/cmdline and /proc/$PID/environ, but this doesn't make sense: those proc files just expect to read 4 values atomically, the values are not related to VM, and they could be set to arbitrary values by C/R. And, the mmap_sem contention may cause unexpected issues like the one below: INFO: task ps:14018 blocked for more than 120 seconds. Tainted: G E 4.9.79-009.ali3000.alios7.x86_64 #1 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. ps D 0 14018 1 0x00000004 Call Trace: schedule+0x36/0x80 rwsem_down_read_failed+0xf0/0x150 call_rwsem_down_read_failed+0x18/0x30 down_read+0x20/0x40 proc_pid_cmdline_read+0xd9/0x4e0 __vfs_read+0x37/0x150 vfs_read+0x96/0x130 SyS_read+0x55/0xc0 entry_SYSCALL_64_fastpath+0x1a/0xc5 Both Alexey Dobriyan and Michal Hocko suggested using a dedicated lock for these fields to mitigate the abuse of mmap_sem. So, introduce a new spinlock in mm_struct to protect the concurrent access to arg_start|end, env_start|end and others, and replace the write acquisition of mmap_sem with a read acquisition, which still protects against the race between prctl and sys_brk that might break check_data_rlimit(), while making prctl more friendly to other VM operations. This patch just eliminates the abuse of mmap_sem, but it can't resolve the above hung task warning completely, since the later access_remote_vm() call needs to acquire mmap_sem. The mmap_sem scalability issue will be solved in the future.
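A simplified sketch of the locking rule this patch establishes (not a verbatim kernel excerpt): readers of the cmdline/environ bounds now take only the new spinlock, while the prctl() writer takes mmap_sem for read, to keep excluding sys_brk, plus arg_lock around the field updates.

#include <linux/mm_types.h>
#include <linux/spinlock.h>

static void snapshot_arg_bounds(struct mm_struct *mm,
				unsigned long *start, unsigned long *end)
{
	spin_lock(&mm->arg_lock);	/* mmap_sem is no longer needed here */
	*start = mm->arg_start;
	*end = mm->arg_end;
	spin_unlock(&mm->arg_lock);
}

This is the pattern proc_pid_cmdline_read() and environ_read() switch to in the diff below.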
[yang.shi@linux.alibaba.com: add comment about mmap_sem and arg_lock] Link: http://lkml.kernel.org/r/1524077799-80690-1-git-send-email-yang.shi@linux.alibaba.com Link: http://lkml.kernel.org/r/1523730291-109696-1-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Reviewed-by: Cyrill Gorcunov Acked-by: Michal Hocko Cc: Alexey Dobriyan Cc: Matthew Wilcox Cc: Mateusz Guzik Cc: Kirill Tkhai Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 8 ++++---- include/linux/mm_types.h | 2 ++ kernel/fork.c | 1 + kernel/sys.c | 10 ++++++++-- mm/init-mm.c | 1 + 5 files changed, 16 insertions(+), 6 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index af128b374143..ef3f7df50023 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -239,12 +239,12 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, goto out_mmput; } - down_read(&mm->mmap_sem); + spin_lock(&mm->arg_lock); arg_start = mm->arg_start; arg_end = mm->arg_end; env_start = mm->env_start; env_end = mm->env_end; - up_read(&mm->mmap_sem); + spin_unlock(&mm->arg_lock); BUG_ON(arg_start > arg_end); BUG_ON(env_start > env_end); @@ -927,10 +927,10 @@ static ssize_t environ_read(struct file *file, char __user *buf, if (!mmget_not_zero(mm)) goto free; - down_read(&mm->mmap_sem); + spin_lock(&mm->arg_lock); env_start = mm->env_start; env_end = mm->env_end; - up_read(&mm->mmap_sem); + spin_unlock(&mm->arg_lock); while (count > 0) { size_t this_len, max_len; diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 21612347d311..49dd59ea90f7 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -413,6 +413,8 @@ struct mm_struct { unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */ unsigned long stack_vm; /* VM_STACK */ unsigned long def_flags; + + spinlock_t arg_lock; /* protect the below fields */ unsigned long start_code, end_code, start_data, end_data; unsigned long start_brk, brk, start_stack; unsigned long arg_start, arg_end, env_start, env_end; diff --git a/kernel/fork.c b/kernel/fork.c index 80b48a8fb47b..c6d1c1ce9ed7 100644 --- a/kernel/fork.c +++ b/kernel/fork.c @@ -899,6 +899,7 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, mm->pinned_vm = 0; memset(&mm->rss_stat, 0, sizeof(mm->rss_stat)); spin_lock_init(&mm->page_table_lock); + spin_lock_init(&mm->arg_lock); mm_init_cpumask(mm); mm_init_aio(mm); mm_init_owner(mm, p); diff --git a/kernel/sys.c b/kernel/sys.c index d1b2b8d934bb..38509dc1f77b 100644 --- a/kernel/sys.c +++ b/kernel/sys.c @@ -2018,7 +2018,11 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data return error; } - down_write(&mm->mmap_sem); + /* + * arg_lock protects concurent updates but we still need mmap_sem for + * read to exclude races with sys_brk. 
+ */ + down_read(&mm->mmap_sem); /* * We don't validate if these members are pointing to @@ -2032,6 +2036,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data * to any problem in kernel itself */ + spin_lock(&mm->arg_lock); mm->start_code = prctl_map.start_code; mm->end_code = prctl_map.end_code; mm->start_data = prctl_map.start_data; @@ -2043,6 +2048,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data mm->arg_end = prctl_map.arg_end; mm->env_start = prctl_map.env_start; mm->env_end = prctl_map.env_end; + spin_unlock(&mm->arg_lock); /* * Note this update of @saved_auxv is lockless thus @@ -2055,7 +2061,7 @@ static int prctl_set_mm_map(int opt, const void __user *addr, unsigned long data if (prctl_map.auxv_size) memcpy(mm->saved_auxv, user_auxv, sizeof(user_auxv)); - up_write(&mm->mmap_sem); + up_read(&mm->mmap_sem); return 0; } #endif /* CONFIG_CHECKPOINT_RESTORE */ diff --git a/mm/init-mm.c b/mm/init-mm.c index f94d5d15ebc0..f0179c9c04c2 100644 --- a/mm/init-mm.c +++ b/mm/init-mm.c @@ -22,6 +22,7 @@ struct mm_struct init_mm = { .mm_count = ATOMIC_INIT(1), .mmap_sem = __RWSEM_INITIALIZER(init_mm.mmap_sem), .page_table_lock = __SPIN_LOCK_UNLOCKED(init_mm.page_table_lock), + .arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock), .mmlist = LIST_HEAD_INIT(init_mm.mmlist), .user_ns = &init_user_ns, INIT_MM_CONTEXT(init_mm) From bb98f2c5ac5db92d34908dbac81a8de7c47c8353 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 Jun 2018 17:05:31 -0700 Subject: [PATCH 017/118] mm, memcontrol: move swap charge handling into get_swap_page() Patch series "mm, memcontrol: Implement memory.swap.events", v2. This patchset implements memory.swap.events which contains max and fail events so that userland can monitor and respond to swap running out. This patch (of 2): get_swap_page() is always followed by mem_cgroup_try_charge_swap(). This patch moves mem_cgroup_try_charge_swap() into get_swap_page() and makes get_swap_page() call the function even after swap allocation failure. This simplifies the callers and consolidates memcg related logic and will ease adding swap related memcg events. Link: http://lkml.kernel.org/r/20180416230934.GH1911913@devbig577.frc2.facebook.com Signed-off-by: Tejun Heo Reviewed-by: Andrew Morton Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Roman Gushchin Cc: Rik van Riel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 +++ mm/shmem.c | 4 ---- mm/swap_slots.c | 10 +++++++--- mm/swap_state.c | 3 --- 4 files changed, 10 insertions(+), 10 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 1695f38630f1..e8166521a474 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6012,6 +6012,9 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) if (!memcg) return 0; + if (!entry.val) + return 0; + memcg = mem_cgroup_id_get_online(memcg); if (!mem_cgroup_is_root(memcg) && diff --git a/mm/shmem.c b/mm/shmem.c index 9d6c7e595415..8c43e207cd3b 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1322,9 +1322,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) if (!swap.val) goto redirty; - if (mem_cgroup_try_charge_swap(page, swap)) - goto free_swap; - /* * Add inode to shmem_unuse()'s list of swapped-out inodes, * if it's not already there. 
Do it now before the page is @@ -1353,7 +1350,6 @@ static int shmem_writepage(struct page *page, struct writeback_control *wbc) } mutex_unlock(&shmem_swaplist_mutex); -free_swap: put_swap_page(page, swap); redirty: set_page_dirty(page); diff --git a/mm/swap_slots.c b/mm/swap_slots.c index f2641894f440..f51ac051c0c9 100644 --- a/mm/swap_slots.c +++ b/mm/swap_slots.c @@ -317,7 +317,7 @@ swp_entry_t get_swap_page(struct page *page) if (PageTransHuge(page)) { if (IS_ENABLED(CONFIG_THP_SWAP)) get_swap_pages(1, true, &entry); - return entry; + goto out; } /* @@ -347,10 +347,14 @@ repeat: } mutex_unlock(&cache->alloc_lock); if (entry.val) - return entry; + goto out; } get_swap_pages(1, false, &entry); - +out: + if (mem_cgroup_try_charge_swap(page, entry)) { + put_swap_page(page, entry); + entry.val = 0; + } return entry; } diff --git a/mm/swap_state.c b/mm/swap_state.c index 07f9aa2340c3..ab8e59cd18ea 100644 --- a/mm/swap_state.c +++ b/mm/swap_state.c @@ -216,9 +216,6 @@ int add_to_swap(struct page *page) if (!entry.val) return 0; - if (mem_cgroup_try_charge_swap(page, entry)) - goto fail; - /* * Radix-tree node allocations from PF_MEMALLOC contexts could * completely exhaust the page allocator. __GFP_NOMEMALLOC From f3a53a3a1e5b3b9bc114b5b7930fbe89d9ffed5d Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 Jun 2018 17:05:35 -0700 Subject: [PATCH 018/118] mm, memcontrol: implement memory.swap.events Add swap max and fail events so that userland can monitor and respond to running out of swap. I'm not too sure about the fail event. Right now, it's a bit confusing which stats / events are recursive and which aren't and also which ones reflect events which originate from a given cgroup and which targets the cgroup. No idea what the right long term solution is and it could just be that growing them organically is actually the only right thing to do. Link: http://lkml.kernel.org/r/20180416231151.GI1911913@devbig577.frc2.facebook.com Signed-off-by: Tejun Heo Reviewed-by: Andrew Morton Cc: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Roman Gushchin Cc: Rik van Riel Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 16 ++++++++++++++++ include/linux/memcontrol.h | 5 +++++ mm/memcontrol.c | 24 +++++++++++++++++++++++- 3 files changed, 44 insertions(+), 1 deletion(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 74cdeaed9f7a..b0dda10b9382 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1199,6 +1199,22 @@ PAGE_SIZE multiple when read back. Swap usage hard limit. If a cgroup's swap usage reaches this limit, anonymous memory of the cgroup will not be swapped out. + memory.swap.events + A read-only flat-keyed file which exists on non-root cgroups. + The following entries are defined. Unless specified + otherwise, a value change in this file generates a file + modified event. + + max + The number of times the cgroup's swap usage was about + to go over the max boundary and swap allocation + failed. + + fail + The number of times swap allocation failed either + because of running out of swap system-wide or max + limit. 
+ Usage Guidelines ~~~~~~~~~~~~~~~~ diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index d99b71bc2c66..517096c3cc99 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -53,6 +53,8 @@ enum memcg_memory_event { MEMCG_HIGH, MEMCG_MAX, MEMCG_OOM, + MEMCG_SWAP_MAX, + MEMCG_SWAP_FAIL, MEMCG_NR_MEMORY_EVENTS, }; @@ -208,6 +210,9 @@ struct mem_cgroup { atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; struct cgroup_file events_file; + /* handle for "memory.swap.events" */ + struct cgroup_file swap_events_file; + /* protect arrays of thresholds */ struct mutex thresholds_lock; diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e8166521a474..d86665cf4a49 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -6012,13 +6012,17 @@ int mem_cgroup_try_charge_swap(struct page *page, swp_entry_t entry) if (!memcg) return 0; - if (!entry.val) + if (!entry.val) { + memcg_memory_event(memcg, MEMCG_SWAP_FAIL); return 0; + } memcg = mem_cgroup_id_get_online(memcg); if (!mem_cgroup_is_root(memcg) && !page_counter_try_charge(&memcg->swap, nr_pages, &counter)) { + memcg_memory_event(memcg, MEMCG_SWAP_MAX); + memcg_memory_event(memcg, MEMCG_SWAP_FAIL); mem_cgroup_id_put(memcg); return -ENOMEM; } @@ -6156,6 +6160,18 @@ static ssize_t swap_max_write(struct kernfs_open_file *of, return nbytes; } +static int swap_events_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + + seq_printf(m, "max %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_SWAP_MAX])); + seq_printf(m, "fail %lu\n", + atomic_long_read(&memcg->memory_events[MEMCG_SWAP_FAIL])); + + return 0; +} + static struct cftype swap_files[] = { { .name = "swap.current", @@ -6168,6 +6184,12 @@ static struct cftype swap_files[] = { .seq_show = swap_max_show, .write = swap_max_write, }, + { + .name = "swap.events", + .flags = CFTYPE_NOT_ON_ROOT, + .file_offset = offsetof(struct mem_cgroup, swap_events_file), + .seq_show = swap_events_show, + }, { } /* terminate */ }; From c4d6c4cc7bfd5ecc18548420b7fb9440cf8416ae Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 7 Jun 2018 17:05:39 -0700 Subject: [PATCH 019/118] zram: correct flag name of ZRAM_ACCESS Patch series "zram memory tracking", v5. zRam as swap is useful for small-memory devices. However, swap means those pages on zram are mostly cold pages due to VM's LRU algorithm. Especially, once an application's init data has been touched for launching, it tends not to be accessed any more and is finally swapped out. zRAM can store such cold pages in compressed form, but it's pointless to keep them in memory. As well, it's pointless to store incompressible pages in zram, so the better idea is for app developers to manage them directly, e.g. via free or mlock, rather than leaving them on the heap. This patch provides a debugfs file, /sys/kernel/debug/zram/zram0/block_state, to represent each block's state, so an admin can investigate which memory is cold|incompressible|same page by using pagemap once the pages are swapped out. The output is as follows: 300 75.033841 .wh 301 63.806904 s.. 302 63.806919 ..h The first column is zram's block index and the 3rd one represents the symbol (s: same page w: written page to backing store h: huge page) of the block state. The second column represents, in usec resolution, the time the block was last accessed. So the above example means the 300th block was accessed at 75.033841 seconds, and it was huge so it was written to the backing store. This patch (of 4): ZRAM_ACCESS is used for locking a slot of zram so correct the name.
It is also not a common flag to indicate status of the block so move the declare position on top of the flag. Lastly, let's move the function to the top of source code to be able to use it easily without forward declaration. Link: http://lkml.kernel.org/r/20180416090946.63057-2-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 20 ++++++++++---------- drivers/block/zram/zram_drv.h | 6 +++--- 2 files changed, 13 insertions(+), 13 deletions(-) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 0f3fadd71230..18dadeab775b 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -52,6 +52,16 @@ static size_t huge_class_size; static void zram_free_page(struct zram *zram, size_t index); +static void zram_slot_lock(struct zram *zram, u32 index) +{ + bit_spin_lock(ZRAM_LOCK, &zram->table[index].value); +} + +static void zram_slot_unlock(struct zram *zram, u32 index) +{ + bit_spin_unlock(ZRAM_LOCK, &zram->table[index].value); +} + static inline bool init_done(struct zram *zram) { return zram->disksize; @@ -753,16 +763,6 @@ static DEVICE_ATTR_RO(io_stat); static DEVICE_ATTR_RO(mm_stat); static DEVICE_ATTR_RO(debug_stat); -static void zram_slot_lock(struct zram *zram, u32 index) -{ - bit_spin_lock(ZRAM_ACCESS, &zram->table[index].value); -} - -static void zram_slot_unlock(struct zram *zram, u32 index) -{ - bit_spin_unlock(ZRAM_ACCESS, &zram->table[index].value); -} - static void zram_meta_free(struct zram *zram, u64 disksize) { size_t num_pages = disksize >> PAGE_SHIFT; diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 008861220723..8d8959ceabd1 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -43,9 +43,9 @@ /* Flags for zram pages (table[page_no].value) */ enum zram_pageflags { - /* Page consists the same element */ - ZRAM_SAME = ZRAM_FLAG_SHIFT, - ZRAM_ACCESS, /* page is now accessed */ + /* zram slot is locked */ + ZRAM_LOCK = ZRAM_FLAG_SHIFT, + ZRAM_SAME, /* Page consists the same element */ ZRAM_WB, /* page is stored on backing_device */ __NR_ZRAM_PAGEFLAGS, From 89e85bce4b02edb7408aebf69d5d1a6692a05f4f Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 7 Jun 2018 17:05:42 -0700 Subject: [PATCH 020/118] zram: mark incompressible page as ZRAM_HUGE Mark incompressible pages so that we could investigate who is the owner of the incompressible pages once the page is swapped out via using upcoming zram memory tracker feature. With it, we could prevent such pages to be swapped out by using mlock. Otherwise we might remove them. This patch exposes new stat for huge pages via mm_stat. Link: http://lkml.kernel.org/r/20180416090946.63057-3-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/blockdev/zram.txt | 1 + drivers/block/zram/zram_drv.c | 17 ++++++++++++++--- drivers/block/zram/zram_drv.h | 2 ++ 3 files changed, 17 insertions(+), 3 deletions(-) diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 257e65714c6a..78db38d02bc9 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -218,6 +218,7 @@ line of text and contains the following stats separated by whitespace: same_pages the number of same element filled pages written to this disk. No memory is allocated for such pages. 
pages_compacted the number of pages freed during compaction + huge_pages the number of incompressible pages 9) Deactivate: swapoff /dev/zram0 diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 18dadeab775b..777fb3339f59 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -729,14 +729,15 @@ static ssize_t mm_stat_show(struct device *dev, max_used = atomic_long_read(&zram->stats.max_used_pages); ret = scnprintf(buf, PAGE_SIZE, - "%8llu %8llu %8llu %8lu %8ld %8llu %8lu\n", + "%8llu %8llu %8llu %8lu %8ld %8llu %8lu %8llu\n", orig_size << PAGE_SHIFT, (u64)atomic64_read(&zram->stats.compr_data_size), mem_used << PAGE_SHIFT, zram->limit_pages << PAGE_SHIFT, max_used << PAGE_SHIFT, (u64)atomic64_read(&zram->stats.same_pages), - pool_stats.pages_compacted); + pool_stats.pages_compacted, + (u64)atomic64_read(&zram->stats.huge_pages)); up_read(&zram->init_lock); return ret; @@ -805,6 +806,11 @@ static void zram_free_page(struct zram *zram, size_t index) { unsigned long handle; + if (zram_test_flag(zram, index, ZRAM_HUGE)) { + zram_clear_flag(zram, index, ZRAM_HUGE); + atomic64_dec(&zram->stats.huge_pages); + } + if (zram_wb_enabled(zram) && zram_test_flag(zram, index, ZRAM_WB)) { zram_wb_clear(zram, index); atomic64_dec(&zram->stats.pages_stored); @@ -973,6 +979,7 @@ compress_again: } if (unlikely(comp_len >= huge_class_size)) { + comp_len = PAGE_SIZE; if (zram_wb_enabled(zram) && allow_wb) { zcomp_stream_put(zram->comp); ret = write_to_bdev(zram, bvec, index, bio, &element); @@ -984,7 +991,6 @@ compress_again: allow_wb = false; goto compress_again; } - comp_len = PAGE_SIZE; } /* @@ -1046,6 +1052,11 @@ out: zram_slot_lock(zram, index); zram_free_page(zram, index); + if (comp_len == PAGE_SIZE) { + zram_set_flag(zram, index, ZRAM_HUGE); + atomic64_inc(&zram->stats.huge_pages); + } + if (flags) { zram_set_flag(zram, index, flags); zram_set_element(zram, index, element); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 8d8959ceabd1..ff0547bdb586 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -47,6 +47,7 @@ enum zram_pageflags { ZRAM_LOCK = ZRAM_FLAG_SHIFT, ZRAM_SAME, /* Page consists the same element */ ZRAM_WB, /* page is stored on backing_device */ + ZRAM_HUGE, /* Incompressible page */ __NR_ZRAM_PAGEFLAGS, }; @@ -71,6 +72,7 @@ struct zram_stats { atomic64_t invalid_io; /* non-page-aligned I/O requests */ atomic64_t notify_free; /* no. of swap slot free notifications */ atomic64_t same_pages; /* no. of same element filled pages */ + atomic64_t huge_pages; /* no. of huge pages */ atomic64_t pages_stored; /* no. of pages currently stored */ atomic_long_t max_used_pages; /* no. of maximum pages stored */ atomic64_t writestall; /* no. of write slow paths */ From d7eac6b6e1838ef1a1400df4ec55daa34bbc855e Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 7 Jun 2018 17:05:45 -0700 Subject: [PATCH 021/118] zram: record accessed second zRam as swap is useful for small memory device. However, swap means those pages on zram are mostly cold pages due to VM's LRU algorithm. Especially, once init data for application are touched for launching, they tend to be not accessed any more and finally swapped out. zRAM can store such cold pages as compressed form but it's pointless to keep in memory. Better idea is app developers free them directly rather than remaining them on heap. 
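As a hedged illustration of that advice (not part of this series), an application could pin known-hot data with mlock() so it never ages into zram, and simply free known-cold data; the helper name and page size below are assumptions for the sketch.

#include <stdlib.h>
#include <sys/mman.h>

/* Illustrative only: keep a known-hot buffer out of swap (and thus out
 * of zram). Requires a sufficient RLIMIT_MEMLOCK or CAP_IPC_LOCK.
 * Known-cold data should simply be free()d instead. */
static void *alloc_hot(size_t len)
{
	void *buf = aligned_alloc(4096, len);

	if (buf && mlock(buf, len) != 0) {
		free(buf);	/* could not pin; caller may retry unpinned */
		return NULL;
	}
	return buf;
}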
This patch records the last access time of each block of zram; with the upcoming zram memory tracking, this can help userspace developers reduce their memory footprint. Link: http://lkml.kernel.org/r/20180416090946.63057-4-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Sergey Senozhatsky Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- drivers/block/zram/zram_drv.c | 16 ++++++++++++++++ drivers/block/zram/zram_drv.h | 1 + 2 files changed, 17 insertions(+) diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 777fb3339f59..7fc10e2ad734 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -107,6 +107,16 @@ static inline void zram_set_element(struct zram *zram, u32 index, zram->table[index].element = element; } +static void zram_accessed(struct zram *zram, u32 index) +{ + zram->table[index].ac_time = sched_clock(); +} + +static void zram_reset_access(struct zram *zram, u32 index) +{ + zram->table[index].ac_time = 0; +} + static unsigned long zram_get_element(struct zram *zram, u32 index) { return zram->table[index].element; @@ -806,6 +816,8 @@ static void zram_free_page(struct zram *zram, size_t index) { unsigned long handle; + zram_reset_access(zram, index); + if (zram_test_flag(zram, index, ZRAM_HUGE)) { zram_clear_flag(zram, index, ZRAM_HUGE); atomic64_dec(&zram->stats.huge_pages); @@ -1177,6 +1189,10 @@ static int zram_bvec_rw(struct zram *zram, struct bio_vec *bvec, u32 index, generic_end_io_acct(q, rw_acct, &zram->disk->part0, start_time); + zram_slot_lock(zram, index); + zram_accessed(zram, index); + zram_slot_unlock(zram, index); + if (unlikely(ret < 0)) { if (!is_write) atomic64_inc(&zram->stats.failed_reads); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index ff0547bdb586..1075218e88b2 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -61,6 +61,7 @@ struct zram_table_entry { unsigned long element; }; unsigned long value; + u64 ac_time; }; struct zram_stats { From c0265342bff4fcaa2cdf13f4596244c18d4a7ae5 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 7 Jun 2018 17:05:49 -0700 Subject: [PATCH 022/118] zram: introduce zram memory tracking zRAM as swap is useful for small-memory devices. However, swap means the pages on zram are mostly cold pages, due to the VM's LRU algorithm. Especially, once an application's init data has been touched for launching, it tends not to be accessed any more and is finally swapped out. zRAM can store such cold pages in compressed form, but it is pointless to keep them in memory. A better idea is for app developers to free them directly rather than leave them on the heap. This patch tells us the last access time of each block of zram via "cat /sys/kernel/debug/zram/zram0/block_state". The output is as follows: 300 75.033841 .wh 301 63.806904 s.. 302 63.806919 ..h The first column is zram's block index and the third one is the block state symbol (s: same page, w: written to backing store, h: huge page). The second column is the time, in usec units, at which the block was last accessed. So the example above means the 300th block was last accessed at 75.033841 seconds; it was huge, so it was written to the backing store. An admin can leverage this information, together with *pagemap*, to catch the cold or incompressible pages of a process once part of its heap has been swapped out. I used the feature a few years ago to find memory hoggers in userspace and notify them of what memory they had wasted without touching it for a long time. With it, they could reduce unnecessary memory space.
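For illustration, a minimal admin-side parser for this format could look like the following sketch. The field layout follows the description above; everything else is an assumption and not part of this patch.

#include <stdio.h>

int main(void)
{
	long index;
	double atime;
	char state[4];
	FILE *f = fopen("/sys/kernel/debug/zram/zram0/block_state", "r");

	if (!f)
		return 1;
	/* Each line: "<index> <sec.usec> <swh>", with '.' for a clear flag. */
	while (fscanf(f, "%ld %lf %3s", &index, &atime, state) == 3) {
		if (state[2] == 'h')	/* third flag: huge (incompressible) */
			printf("block %ld is huge, last touched at %.6f\n",
			       index, atime);
	}
	fclose(f);
	return 0;
}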
However, at that time, I hacked up zram for the feature, but now I need the feature again, so I decided it would be better to upstream it rather than keep it to myself. I hope to submit the userspace tool that uses this feature soon. [akpm@linux-foundation.org: fix i386 printk warning] [minchan@kernel.org: use ktime_get_boottime() instead of sched_clock()] Link: http://lkml.kernel.org/r/20180420063525.GA253739@rodete-desktop-imager.corp.google.com [akpm@linux-foundation.org: documentation tweak] [akpm@linux-foundation.org: fix i386 printk warning] [minchan@kernel.org: fix compile warning] Link: http://lkml.kernel.org/r/20180508104849.GA8209@rodete-desktop-imager.corp.google.com [rdunlap@infradead.org: fix printk formats] Link: http://lkml.kernel.org/r/3652ccb1-96ef-0b0b-05d1-f661d7733dcc@infradead.org Link: http://lkml.kernel.org/r/20180416090946.63057-5-minchan@kernel.org Signed-off-by: Minchan Kim Signed-off-by: Randy Dunlap Reviewed-by: Sergey Senozhatsky Acked-by: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/blockdev/zram.txt | 24 ++++++ drivers/block/zram/Kconfig | 14 +++- drivers/block/zram/zram_drv.c | 132 +++++++++++++++++++++++++++++--- drivers/block/zram/zram_drv.h | 7 +- 4 files changed, 163 insertions(+), 14 deletions(-) diff --git a/Documentation/blockdev/zram.txt b/Documentation/blockdev/zram.txt index 78db38d02bc9..875b2b56b87f 100644 --- a/Documentation/blockdev/zram.txt +++ b/Documentation/blockdev/zram.txt @@ -243,5 +243,29 @@ to backing storage rather than keeping it in memory. User should set up backing device via /sys/block/zramX/backing_dev before disksize setting. += memory tracking + +With CONFIG_ZRAM_MEMORY_TRACKING, the user can see information about each +zram block. It can be useful for catching the cold or incompressible +pages of a process with pagemap. +If you enable the feature, you can see the block state via +/sys/kernel/debug/zram/zram0/block_state. The output is as follows: + + 300 75.033841 .wh + 301 63.806904 s.. + 302 63.806919 ..h + +The first column is zram's block index. +The second column is the access time since the system was booted. +The third column is the state of the block +(s: same page +w: written page to backing store +h: huge page). + +The first line of the example above says the 300th block was accessed at 75.033841 sec +and the block's state is huge, so it was written back to the backing +storage. It's a debugging feature, so no one should rely on it to work +properly. + Nitin Gupta ngupta@vflare.org diff --git a/drivers/block/zram/Kconfig b/drivers/block/zram/Kconfig index ac3a31d433b2..635235759a0a 100644 --- a/drivers/block/zram/Kconfig +++ b/drivers/block/zram/Kconfig @@ -13,7 +13,7 @@ config ZRAM It has several use cases, for example: /tmp storage, use as swap disks and maybe many more. - See zram.txt for more information. + See Documentation/blockdev/zram.txt for more information. config ZRAM_WRITEBACK bool "Write back incompressible page to backing device" @@ -25,4 +25,14 @@ config ZRAM_WRITEBACK For this feature, admin should set up backing device via /sys/block/zramX/backing_dev. - See zram.txt for more infomration. + See Documentation/blockdev/zram.txt for more information. + +config ZRAM_MEMORY_TRACKING + bool "Track zRam block status" + depends on ZRAM && DEBUG_FS + help + With this feature, admin can track the state of allocated blocks + of zRAM. Admin could see the information via + /sys/kernel/debug/zram/zramX/block_state. + + See Documentation/blockdev/zram.txt for more information.
diff --git a/drivers/block/zram/zram_drv.c b/drivers/block/zram/zram_drv.c index 7fc10e2ad734..da51293e7c03 100644 --- a/drivers/block/zram/zram_drv.c +++ b/drivers/block/zram/zram_drv.c @@ -31,6 +31,7 @@ #include #include #include +#include #include #include "zram_drv.h" @@ -67,6 +68,13 @@ static inline bool init_done(struct zram *zram) return zram->disksize; } +static inline bool zram_allocated(struct zram *zram, u32 index) +{ + + return (zram->table[index].value >> (ZRAM_FLAG_SHIFT + 1)) || + zram->table[index].handle; +} + static inline struct zram *dev_to_zram(struct device *dev) { return (struct zram *)dev_to_disk(dev)->private_data; @@ -83,7 +91,7 @@ static void zram_set_handle(struct zram *zram, u32 index, unsigned long handle) } /* flag operations require table entry bit_spin_lock() being held */ -static int zram_test_flag(struct zram *zram, u32 index, +static bool zram_test_flag(struct zram *zram, u32 index, enum zram_pageflags flag) { return zram->table[index].value & BIT(flag); @@ -107,16 +115,6 @@ static inline void zram_set_element(struct zram *zram, u32 index, zram->table[index].element = element; } -static void zram_accessed(struct zram *zram, u32 index) -{ - zram->table[index].ac_time = sched_clock(); -} - -static void zram_reset_access(struct zram *zram, u32 index) -{ - zram->table[index].ac_time = 0; -} - static unsigned long zram_get_element(struct zram *zram, u32 index) { return zram->table[index].element; @@ -620,6 +618,114 @@ static int read_from_bdev(struct zram *zram, struct bio_vec *bvec, static void zram_wb_clear(struct zram *zram, u32 index) {} #endif +#ifdef CONFIG_ZRAM_MEMORY_TRACKING + +static struct dentry *zram_debugfs_root; + +static void zram_debugfs_create(void) +{ + zram_debugfs_root = debugfs_create_dir("zram", NULL); +} + +static void zram_debugfs_destroy(void) +{ + debugfs_remove_recursive(zram_debugfs_root); +} + +static void zram_accessed(struct zram *zram, u32 index) +{ + zram->table[index].ac_time = ktime_get_boottime(); +} + +static void zram_reset_access(struct zram *zram, u32 index) +{ + zram->table[index].ac_time = 0; +} + +static ssize_t read_block_state(struct file *file, char __user *buf, + size_t count, loff_t *ppos) +{ + char *kbuf; + ssize_t index, written = 0; + struct zram *zram = file->private_data; + unsigned long nr_pages = zram->disksize >> PAGE_SHIFT; + struct timespec64 ts; + + kbuf = kvmalloc(count, GFP_KERNEL); + if (!kbuf) + return -ENOMEM; + + down_read(&zram->init_lock); + if (!init_done(zram)) { + up_read(&zram->init_lock); + kvfree(kbuf); + return -EINVAL; + } + + for (index = *ppos; index < nr_pages; index++) { + int copied; + + zram_slot_lock(zram, index); + if (!zram_allocated(zram, index)) + goto next; + + ts = ktime_to_timespec64(zram->table[index].ac_time); + copied = snprintf(kbuf + written, count, + "%12zd %12lld.%06lu %c%c%c\n", + index, (s64)ts.tv_sec, + ts.tv_nsec / NSEC_PER_USEC, + zram_test_flag(zram, index, ZRAM_SAME) ? 's' : '.', + zram_test_flag(zram, index, ZRAM_WB) ? 'w' : '.', + zram_test_flag(zram, index, ZRAM_HUGE) ? 
'h' : '.'); + + if (count < copied) { + zram_slot_unlock(zram, index); + break; + } + written += copied; + count -= copied; +next: + zram_slot_unlock(zram, index); + *ppos += 1; + } + + up_read(&zram->init_lock); + if (copy_to_user(buf, kbuf, written)) + written = -EFAULT; + kvfree(kbuf); + + return written; +} + +static const struct file_operations proc_zram_block_state_op = { + .open = simple_open, + .read = read_block_state, + .llseek = default_llseek, +}; + +static void zram_debugfs_register(struct zram *zram) +{ + if (!zram_debugfs_root) + return; + + zram->debugfs_dir = debugfs_create_dir(zram->disk->disk_name, + zram_debugfs_root); + debugfs_create_file("block_state", 0400, zram->debugfs_dir, + zram, &proc_zram_block_state_op); +} + +static void zram_debugfs_unregister(struct zram *zram) +{ + debugfs_remove_recursive(zram->debugfs_dir); +} +#else +static void zram_debugfs_create(void) {}; +static void zram_debugfs_destroy(void) {}; +static void zram_accessed(struct zram *zram, u32 index) {}; +static void zram_reset_access(struct zram *zram, u32 index) {}; +static void zram_debugfs_register(struct zram *zram) {}; +static void zram_debugfs_unregister(struct zram *zram) {}; +#endif /* * We switched to per-cpu streams and this attr is not needed anymore. @@ -1604,6 +1710,7 @@ static int zram_add(void) } strlcpy(zram->compressor, default_compressor, sizeof(zram->compressor)); + zram_debugfs_register(zram); pr_info("Added device: %s\n", zram->disk->disk_name); return device_id; @@ -1637,6 +1744,7 @@ static int zram_remove(struct zram *zram) zram->claim = true; mutex_unlock(&bdev->bd_mutex); + zram_debugfs_unregister(zram); /* * Remove sysfs first, so no one will perform a disksize * store while we destroy the devices. This also helps during @@ -1739,6 +1847,7 @@ static void destroy_devices(void) { class_unregister(&zram_control_class); idr_for_each(&zram_index_idr, &zram_remove_cb, NULL); + zram_debugfs_destroy(); idr_destroy(&zram_index_idr); unregister_blkdev(zram_major, "zram"); cpuhp_remove_multi_state(CPUHP_ZCOMP_PREPARE); @@ -1760,6 +1869,7 @@ static int __init zram_init(void) return ret; } + zram_debugfs_create(); zram_major = register_blkdev(0, "zram"); if (zram_major <= 0) { pr_err("Unable to get major number\n"); diff --git a/drivers/block/zram/zram_drv.h b/drivers/block/zram/zram_drv.h index 1075218e88b2..72c8584b6dff 100644 --- a/drivers/block/zram/zram_drv.h +++ b/drivers/block/zram/zram_drv.h @@ -61,7 +61,9 @@ struct zram_table_entry { unsigned long element; }; unsigned long value; - u64 ac_time; +#ifdef CONFIG_ZRAM_MEMORY_TRACKING + ktime_t ac_time; +#endif }; struct zram_stats { @@ -110,5 +112,8 @@ struct zram { unsigned long nr_pages; spinlock_t bitmap_lock; #endif +#ifdef CONFIG_ZRAM_MEMORY_TRACKING + struct dentry *debugfs_dir; +#endif }; #endif From 5b9c98f3082b4e8a24ef5e6bb21ed0558b54349a Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Thu, 7 Jun 2018 17:05:53 -0700 Subject: [PATCH 023/118] mm/shmem: add __rcu annotations and properly deref radix entry Patch series "restructure memfd code", v4. This patch (of 3): In preparation for memfd code restructure, clean up sparse warnings. Most changes required adding __rcu annotations. The routine find_swap_entry was modified to properly dereference radix tree entries.
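Shown in isolation, the corrected lookup idiom is roughly the following; this is a sketch of the pattern rather than the exact patched function. Under RCU, a raw *slot read can observe a transient internal node, so the slot must go through radix_tree_deref_slot() with a retry check.

	rcu_read_lock();
	radix_tree_for_each_slot(slot, root, &iter, 0) {
		void *entry = radix_tree_deref_slot(slot);

		/* The tree mutated under us: restart this iteration chunk. */
		if (radix_tree_deref_retry(entry)) {
			slot = radix_tree_iter_retry(&iter);
			continue;
		}
		if (entry == item) {
			found = iter.index;
			break;
		}
	}
	rcu_read_unlock();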
Link: http://lkml.kernel.org/r/20180415182119.4517-2-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Signed-off-by: Matthew Wilcox Cc: Hugh Dickins Cc: Andrea Arcangeli Cc: Michal Hocko Cc: Marc-Andr Lureau Cc: David Herrmann Cc: Khalid Aziz Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 20 +++++++++++++------- 1 file changed, 13 insertions(+), 7 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 8c43e207cd3b..54bec60380df 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -327,7 +327,7 @@ static int shmem_radix_tree_replace(struct address_space *mapping, pgoff_t index, void *expected, void *replacement) { struct radix_tree_node *node; - void **pslot; + void __rcu **pslot; void *item; VM_BUG_ON(!expected); @@ -395,7 +395,7 @@ static bool shmem_confirm_swap(struct address_space *mapping, #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE /* ifdef here to avoid bloating shmem.o when not necessary */ -int shmem_huge __read_mostly; +static int shmem_huge __read_mostly; #if defined(CONFIG_SYSFS) || defined(CONFIG_TMPFS) static int shmem_parse_huge(const char *str) @@ -682,7 +682,7 @@ unsigned long shmem_partial_swap_usage(struct address_space *mapping, pgoff_t start, pgoff_t end) { struct radix_tree_iter iter; - void **slot; + void __rcu **slot; struct page *page; unsigned long swapped = 0; @@ -1098,13 +1098,19 @@ static void shmem_evict_inode(struct inode *inode) static unsigned long find_swap_entry(struct radix_tree_root *root, void *item) { struct radix_tree_iter iter; - void **slot; + void __rcu **slot; unsigned long found = -1; unsigned int checked = 0; rcu_read_lock(); radix_tree_for_each_slot(slot, root, &iter, 0) { - if (*slot == item) { + void *entry = radix_tree_deref_slot(slot); + + if (radix_tree_deref_retry(entry)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + if (entry == item) { found = iter.index; break; } @@ -2622,7 +2628,7 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) static void shmem_tag_pins(struct address_space *mapping) { struct radix_tree_iter iter; - void **slot; + void __rcu **slot; pgoff_t start; struct page *page; @@ -2664,7 +2670,7 @@ static void shmem_tag_pins(struct address_space *mapping) static int shmem_wait_for_pins(struct address_space *mapping) { struct radix_tree_iter iter; - void **slot; + void __rcu **slot; pgoff_t start; struct page *page; int error, scan; From c49fcfcda8b5b06529103664f0291f433f5e6d24 Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Thu, 7 Jun 2018 17:05:57 -0700 Subject: [PATCH 024/118] mm/shmem: update file sealing comments and file checking In preparation for memfd code restructure, update comments, definitions and function names dealing with file sealing to indicate that tmpfs and hugetlbfs are the supported filesystems. Also, change file pointer checks in memfd_file_seals_ptr to use defined interfaces instead of directly referencing file_operation structs. 
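For context, the sealing machinery these helpers back is driven from userspace with fcntl(2); a minimal hedged sketch, assuming the memfd was created with MFD_ALLOW_SEALING (the helper name is an assumption):

#define _GNU_SOURCE
#include <fcntl.h>
#include <stdio.h>

/* Seal an existing memfd against resizing. Fails with EPERM if
 * F_SEAL_SEAL is already set, e.g. on an fd created without
 * MFD_ALLOW_SEALING. */
static int seal_size(int fd)
{
	if (fcntl(fd, F_ADD_SEALS, F_SEAL_SHRINK | F_SEAL_GROW) < 0) {
		perror("F_ADD_SEALS");
		return -1;
	}
	printf("seals now: 0x%x\n", fcntl(fd, F_GET_SEALS));
	return 0;
}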
Link: http://lkml.kernel.org/r/20180415182119.4517-3-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Khalid Aziz Cc: Andrea Arcangeli Cc: David Herrmann Cc: Hugh Dickins Cc: Marc-Andr Lureau Cc: Matthew Wilcox Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 50 ++++++++++++++++++++++++++------------------------ 1 file changed, 26 insertions(+), 24 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 54bec60380df..ceba031d9985 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2620,12 +2620,13 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) /* * We need a tag: a new tag would expand every radix_tree_node by 8 bytes, - * so reuse a tag which we firmly believe is never set or cleared on shmem. + * so reuse a tag which we firmly believe is never set or cleared on tmpfs + * or hugetlbfs because they are memory only filesystems. */ -#define SHMEM_TAG_PINNED PAGECACHE_TAG_TOWRITE +#define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE #define LAST_SCAN 4 /* about 150ms max */ -static void shmem_tag_pins(struct address_space *mapping) +static void memfd_tag_pins(struct address_space *mapping) { struct radix_tree_iter iter; void __rcu **slot; @@ -2646,7 +2647,7 @@ static void shmem_tag_pins(struct address_space *mapping) } else if (page_count(page) - page_mapcount(page) > 1) { xa_lock_irq(&mapping->i_pages); radix_tree_tag_set(&mapping->i_pages, iter.index, - SHMEM_TAG_PINNED); + MEMFD_TAG_PINNED); xa_unlock_irq(&mapping->i_pages); } @@ -2667,7 +2668,7 @@ static void shmem_tag_pins(struct address_space *mapping) * The caller must guarantee that no new user will acquire writable references * to those pages to avoid races. */ -static int shmem_wait_for_pins(struct address_space *mapping) +static int memfd_wait_for_pins(struct address_space *mapping) { struct radix_tree_iter iter; void __rcu **slot; @@ -2675,11 +2676,11 @@ static int shmem_wait_for_pins(struct address_space *mapping) struct page *page; int error, scan; - shmem_tag_pins(mapping); + memfd_tag_pins(mapping); error = 0; for (scan = 0; scan <= LAST_SCAN; scan++) { - if (!radix_tree_tagged(&mapping->i_pages, SHMEM_TAG_PINNED)) + if (!radix_tree_tagged(&mapping->i_pages, MEMFD_TAG_PINNED)) break; if (!scan) @@ -2690,7 +2691,7 @@ static int shmem_wait_for_pins(struct address_space *mapping) start = 0; rcu_read_lock(); radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, - start, SHMEM_TAG_PINNED) { + start, MEMFD_TAG_PINNED) { page = radix_tree_deref_slot(slot); if (radix_tree_exception(page)) { @@ -2717,7 +2718,7 @@ static int shmem_wait_for_pins(struct address_space *mapping) xa_lock_irq(&mapping->i_pages); radix_tree_tag_clear(&mapping->i_pages, - iter.index, SHMEM_TAG_PINNED); + iter.index, MEMFD_TAG_PINNED); xa_unlock_irq(&mapping->i_pages); continue_resched: if (need_resched()) { @@ -2733,11 +2734,11 @@ continue_resched: static unsigned int *memfd_file_seals_ptr(struct file *file) { - if (file->f_op == &shmem_file_operations) + if (shmem_file(file)) return &SHMEM_I(file_inode(file))->seals; #ifdef CONFIG_HUGETLBFS - if (file->f_op == &hugetlbfs_file_operations) + if (is_file_hugepages(file)) return &HUGETLBFS_I(file_inode(file))->seals; #endif @@ -2757,16 +2758,17 @@ static int memfd_add_seals(struct file *file, unsigned int seals) /* * SEALING - * Sealing allows multiple parties to share a shmem-file but restrict - * access to a specific subset of file operations. Seals can only be - * added, but never removed. 
This way, mutually untrusted parties can - * share common memory regions with a well-defined policy. A malicious - * peer can thus never perform unwanted operations on a shared object. + * Sealing allows multiple parties to share a tmpfs or hugetlbfs file + * but restrict access to a specific subset of file operations. Seals + * can only be added, but never removed. This way, mutually untrusted + * parties can share common memory regions with a well-defined policy. + * A malicious peer can thus never perform unwanted operations on a + * shared object. * - * Seals are only supported on special shmem-files and always affect - * the whole underlying inode. Once a seal is set, it may prevent some - * kinds of access to the file. Currently, the following seals are - * defined: + * Seals are only supported on special tmpfs or hugetlbfs files and + * always affect the whole underlying inode. Once a seal is set, it + * may prevent some kinds of access to the file. Currently, the + * following seals are defined: * SEAL_SEAL: Prevent further seals from being set on this file * SEAL_SHRINK: Prevent the file from shrinking * SEAL_GROW: Prevent the file from growing @@ -2780,9 +2782,9 @@ static int memfd_add_seals(struct file *file, unsigned int seals) * added. * * Semantics of sealing are only defined on volatile files. Only - * anonymous shmem files support sealing. More importantly, seals are - * never written to disk. Therefore, there's no plan to support it on - * other file types. + * anonymous tmpfs and hugetlbfs files support sealing. More + * importantly, seals are never written to disk. Therefore, there's + * no plan to support it on other file types. */ if (!(file->f_mode & FMODE_WRITE)) @@ -2808,7 +2810,7 @@ static int memfd_add_seals(struct file *file, unsigned int seals) if (error) goto unlock; - error = shmem_wait_for_pins(file->f_mapping); + error = memfd_wait_for_pins(file->f_mapping); if (error) { mapping_allow_writable(file->f_mapping); goto unlock; From 5d752600a8c373382264392f5b573b2fc9c0e8ea Mon Sep 17 00:00:00 2001 From: Mike Kravetz Date: Thu, 7 Jun 2018 17:06:01 -0700 Subject: [PATCH 025/118] mm: restructure memfd code With the addition of memfd hugetlbfs support, we now have the situation where memfd depends on TMPFS -or- HUGETLBFS. Previously, memfd was only supported on tmpfs, so it made sense that the code resided in shmem.c. In the current code, memfd is only functional if TMPFS is defined. If HUGETLBFS is defined and TMPFS is not defined, then memfd functionality will not be available for hugetlbfs. This does not cause BUGs, just a lack of potentially desired functionality. Code is restructured in the following way: - include/linux/memfd.h is a new file containing memfd specific definitions previously contained in shmem_fs.h. - mm/memfd.c is a new file containing memfd specific code previously contained in shmem.c. - memfd specific code is removed from shmem_fs.h and shmem.c. - A new config option MEMFD_CREATE is added that is defined if TMPFS or HUGETLBFS is defined. No functional changes are made to the code: restructuring only.
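Userspace is unaffected by the restructuring. For reference, a typical caller of the syscall being moved still looks roughly like the sketch below, assuming a libc that exposes memfd_create(2); the helper name is an assumption.

#define _GNU_SOURCE
#include <sys/mman.h>
#include <unistd.h>

/* Create a sealable, anonymous, tmpfs-backed region; with MFD_HUGETLB
 * the same call now reaches the hugetlbfs path in mm/memfd.c. */
static int make_region(size_t len)
{
	int fd = memfd_create("example", MFD_CLOEXEC | MFD_ALLOW_SEALING);

	if (fd < 0)
		return -1;
	if (ftruncate(fd, len) < 0) {
		close(fd);
		return -1;
	}
	return fd;	/* mmap() it, share it, seal it with fcntl() */
}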
Link: http://lkml.kernel.org/r/20180415182119.4517-4-mike.kravetz@oracle.com Signed-off-by: Mike Kravetz Reviewed-by: Khalid Aziz Cc: Andrea Arcangeli Cc: David Herrmann Cc: Hugh Dickins Cc: Marc-Andr Lureau Cc: Matthew Wilcox Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Kconfig | 3 + fs/fcntl.c | 2 +- include/linux/memfd.h | 16 ++ include/linux/shmem_fs.h | 13 -- mm/Makefile | 1 + mm/memfd.c | 345 +++++++++++++++++++++++++++++++++++++++ mm/shmem.c | 324 ------------------------------------ 7 files changed, 366 insertions(+), 338 deletions(-) create mode 100644 include/linux/memfd.h create mode 100644 mm/memfd.c diff --git a/fs/Kconfig b/fs/Kconfig index ac4ac908f001..51f78a28072a 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -203,6 +203,9 @@ config HUGETLBFS config HUGETLB_PAGE def_bool HUGETLBFS +config MEMFD_CREATE + def_bool TMPFS || HUGETLBFS + config ARCH_HAS_GIGANTIC_PAGE bool diff --git a/fs/fcntl.c b/fs/fcntl.c index c42169459298..12273b6ea56d 100644 --- a/fs/fcntl.c +++ b/fs/fcntl.c @@ -23,7 +23,7 @@ #include #include #include -#include +#include #include #include diff --git a/include/linux/memfd.h b/include/linux/memfd.h new file mode 100644 index 000000000000..4f1600413f91 --- /dev/null +++ b/include/linux/memfd.h @@ -0,0 +1,16 @@ +/* SPDX-License-Identifier: GPL-2.0 */ +#ifndef __LINUX_MEMFD_H +#define __LINUX_MEMFD_H + +#include + +#ifdef CONFIG_MEMFD_CREATE +extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg); +#else +static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned long a) +{ + return -EINVAL; +} +#endif + +#endif /* __LINUX_MEMFD_H */ diff --git a/include/linux/shmem_fs.h b/include/linux/shmem_fs.h index 73b5e655a76e..f155dc607112 100644 --- a/include/linux/shmem_fs.h +++ b/include/linux/shmem_fs.h @@ -110,19 +110,6 @@ static inline bool shmem_file(struct file *file) extern bool shmem_charge(struct inode *inode, long pages); extern void shmem_uncharge(struct inode *inode, long pages); -#ifdef CONFIG_TMPFS - -extern long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg); - -#else - -static inline long memfd_fcntl(struct file *f, unsigned int c, unsigned long a) -{ - return -EINVAL; -} - -#endif - #ifdef CONFIG_TRANSPARENT_HUGE_PAGECACHE extern bool shmem_huge_enabled(struct vm_area_struct *vma); #else diff --git a/mm/Makefile b/mm/Makefile index b4e54a9ae9c5..8716bdabe1e6 100644 --- a/mm/Makefile +++ b/mm/Makefile @@ -105,3 +105,4 @@ obj-$(CONFIG_DEBUG_PAGE_REF) += debug_page_ref.o obj-$(CONFIG_HARDENED_USERCOPY) += usercopy.o obj-$(CONFIG_PERCPU_STATS) += percpu-stats.o obj-$(CONFIG_HMM) += hmm.o +obj-$(CONFIG_MEMFD_CREATE) += memfd.o diff --git a/mm/memfd.c b/mm/memfd.c new file mode 100644 index 000000000000..27069518e3c5 --- /dev/null +++ b/mm/memfd.c @@ -0,0 +1,345 @@ +/* + * memfd_create system call and file sealing support + * + * Code was originally included in shmem.c, and broken out to facilitate + * use by hugetlbfs as well as tmpfs. + * + * This file is released under the GPL. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* + * We need a tag: a new tag would expand every radix_tree_node by 8 bytes, + * so reuse a tag which we firmly believe is never set or cleared on tmpfs + * or hugetlbfs because they are memory only filesystems. 
+ */ +#define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE +#define LAST_SCAN 4 /* about 150ms max */ + +static void memfd_tag_pins(struct address_space *mapping) +{ + struct radix_tree_iter iter; + void __rcu **slot; + pgoff_t start; + struct page *page; + + lru_add_drain(); + start = 0; + rcu_read_lock(); + + radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { + page = radix_tree_deref_slot(slot); + if (!page || radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + } else if (page_count(page) - page_mapcount(page) > 1) { + xa_lock_irq(&mapping->i_pages); + radix_tree_tag_set(&mapping->i_pages, iter.index, + MEMFD_TAG_PINNED); + xa_unlock_irq(&mapping->i_pages); + } + + if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); + cond_resched_rcu(); + } + } + rcu_read_unlock(); +} + +/* + * Setting SEAL_WRITE requires us to verify there's no pending writer. However, + * via get_user_pages(), drivers might have some pending I/O without any active + * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages + * and see whether it has an elevated ref-count. If so, we tag them and wait for + * them to be dropped. + * The caller must guarantee that no new user will acquire writable references + * to those pages to avoid races. + */ +static int memfd_wait_for_pins(struct address_space *mapping) +{ + struct radix_tree_iter iter; + void __rcu **slot; + pgoff_t start; + struct page *page; + int error, scan; + + memfd_tag_pins(mapping); + + error = 0; + for (scan = 0; scan <= LAST_SCAN; scan++) { + if (!radix_tree_tagged(&mapping->i_pages, MEMFD_TAG_PINNED)) + break; + + if (!scan) + lru_add_drain_all(); + else if (schedule_timeout_killable((HZ << scan) / 200)) + scan = LAST_SCAN; + + start = 0; + rcu_read_lock(); + radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, + start, MEMFD_TAG_PINNED) { + + page = radix_tree_deref_slot(slot); + if (radix_tree_exception(page)) { + if (radix_tree_deref_retry(page)) { + slot = radix_tree_iter_retry(&iter); + continue; + } + + page = NULL; + } + + if (page && + page_count(page) - page_mapcount(page) != 1) { + if (scan < LAST_SCAN) + goto continue_resched; + + /* + * On the last scan, we clean up all those tags + * we inserted; but make a note that we still + * found pages pinned. + */ + error = -EBUSY; + } + + xa_lock_irq(&mapping->i_pages); + radix_tree_tag_clear(&mapping->i_pages, + iter.index, MEMFD_TAG_PINNED); + xa_unlock_irq(&mapping->i_pages); +continue_resched: + if (need_resched()) { + slot = radix_tree_iter_resume(slot, &iter); + cond_resched_rcu(); + } + } + rcu_read_unlock(); + } + + return error; +} + +static unsigned int *memfd_file_seals_ptr(struct file *file) +{ + if (shmem_file(file)) + return &SHMEM_I(file_inode(file))->seals; + +#ifdef CONFIG_HUGETLBFS + if (is_file_hugepages(file)) + return &HUGETLBFS_I(file_inode(file))->seals; +#endif + + return NULL; +} + +#define F_ALL_SEALS (F_SEAL_SEAL | \ + F_SEAL_SHRINK | \ + F_SEAL_GROW | \ + F_SEAL_WRITE) + +static int memfd_add_seals(struct file *file, unsigned int seals) +{ + struct inode *inode = file_inode(file); + unsigned int *file_seals; + int error; + + /* + * SEALING + * Sealing allows multiple parties to share a tmpfs or hugetlbfs file + * but restrict access to a specific subset of file operations. Seals + * can only be added, but never removed. This way, mutually untrusted + * parties can share common memory regions with a well-defined policy. 
+ * A malicious peer can thus never perform unwanted operations on a + * shared object. + * + * Seals are only supported on special tmpfs or hugetlbfs files and + * always affect the whole underlying inode. Once a seal is set, it + * may prevent some kinds of access to the file. Currently, the + * following seals are defined: + * SEAL_SEAL: Prevent further seals from being set on this file + * SEAL_SHRINK: Prevent the file from shrinking + * SEAL_GROW: Prevent the file from growing + * SEAL_WRITE: Prevent write access to the file + * + * As we don't require any trust relationship between two parties, we + * must prevent seals from being removed. Therefore, sealing a file + * only adds a given set of seals to the file, it never touches + * existing seals. Furthermore, the "setting seals"-operation can be + * sealed itself, which basically prevents any further seal from being + * added. + * + * Semantics of sealing are only defined on volatile files. Only + * anonymous tmpfs and hugetlbfs files support sealing. More + * importantly, seals are never written to disk. Therefore, there's + * no plan to support it on other file types. + */ + + if (!(file->f_mode & FMODE_WRITE)) + return -EPERM; + if (seals & ~(unsigned int)F_ALL_SEALS) + return -EINVAL; + + inode_lock(inode); + + file_seals = memfd_file_seals_ptr(file); + if (!file_seals) { + error = -EINVAL; + goto unlock; + } + + if (*file_seals & F_SEAL_SEAL) { + error = -EPERM; + goto unlock; + } + + if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) { + error = mapping_deny_writable(file->f_mapping); + if (error) + goto unlock; + + error = memfd_wait_for_pins(file->f_mapping); + if (error) { + mapping_allow_writable(file->f_mapping); + goto unlock; + } + } + + *file_seals |= seals; + error = 0; + +unlock: + inode_unlock(inode); + return error; +} + +static int memfd_get_seals(struct file *file) +{ + unsigned int *seals = memfd_file_seals_ptr(file); + + return seals ? *seals : -EINVAL; +} + +long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg) +{ + long error; + + switch (cmd) { + case F_ADD_SEALS: + /* disallow upper 32bit */ + if (arg > UINT_MAX) + return -EINVAL; + + error = memfd_add_seals(file, arg); + break; + case F_GET_SEALS: + error = memfd_get_seals(file); + break; + default: + error = -EINVAL; + break; + } + + return error; +} + +#define MFD_NAME_PREFIX "memfd:" +#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) +#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) + +#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB) + +SYSCALL_DEFINE2(memfd_create, + const char __user *, uname, + unsigned int, flags) +{ + unsigned int *file_seals; + struct file *file; + int fd, error; + char *name; + long len; + + if (!(flags & MFD_HUGETLB)) { + if (flags & ~(unsigned int)MFD_ALL_FLAGS) + return -EINVAL; + } else { + /* Allow huge page size encoding in flags. 
*/ + if (flags & ~(unsigned int)(MFD_ALL_FLAGS | + (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) + return -EINVAL; + } + + /* length includes terminating zero */ + len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); + if (len <= 0) + return -EFAULT; + if (len > MFD_NAME_MAX_LEN + 1) + return -EINVAL; + + name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL); + if (!name) + return -ENOMEM; + + strcpy(name, MFD_NAME_PREFIX); + if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { + error = -EFAULT; + goto err_name; + } + + /* terminating-zero may have changed after strnlen_user() returned */ + if (name[len + MFD_NAME_PREFIX_LEN - 1]) { + error = -EFAULT; + goto err_name; + } + + fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? O_CLOEXEC : 0); + if (fd < 0) { + error = fd; + goto err_name; + } + + if (flags & MFD_HUGETLB) { + struct user_struct *user = NULL; + + file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user, + HUGETLB_ANONHUGE_INODE, + (flags >> MFD_HUGE_SHIFT) & + MFD_HUGE_MASK); + } else + file = shmem_file_setup(name, 0, VM_NORESERVE); + if (IS_ERR(file)) { + error = PTR_ERR(file); + goto err_fd; + } + file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; + file->f_flags |= O_RDWR | O_LARGEFILE; + + if (flags & MFD_ALLOW_SEALING) { + file_seals = memfd_file_seals_ptr(file); + *file_seals &= ~F_SEAL_SEAL; + } + + fd_install(fd, file); + kfree(name); + return fd; + +err_fd: + put_unused_fd(fd); +err_name: + kfree(name); + return error; +} diff --git a/mm/shmem.c b/mm/shmem.c index ceba031d9985..cb4d7fa389d0 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -2618,243 +2618,6 @@ static loff_t shmem_file_llseek(struct file *file, loff_t offset, int whence) return offset; } -/* - * We need a tag: a new tag would expand every radix_tree_node by 8 bytes, - * so reuse a tag which we firmly believe is never set or cleared on tmpfs - * or hugetlbfs because they are memory only filesystems. - */ -#define MEMFD_TAG_PINNED PAGECACHE_TAG_TOWRITE -#define LAST_SCAN 4 /* about 150ms max */ - -static void memfd_tag_pins(struct address_space *mapping) -{ - struct radix_tree_iter iter; - void __rcu **slot; - pgoff_t start; - struct page *page; - - lru_add_drain(); - start = 0; - rcu_read_lock(); - - radix_tree_for_each_slot(slot, &mapping->i_pages, &iter, start) { - page = radix_tree_deref_slot(slot); - if (!page || radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - } else if (page_count(page) - page_mapcount(page) > 1) { - xa_lock_irq(&mapping->i_pages); - radix_tree_tag_set(&mapping->i_pages, iter.index, - MEMFD_TAG_PINNED); - xa_unlock_irq(&mapping->i_pages); - } - - if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); - cond_resched_rcu(); - } - } - rcu_read_unlock(); -} - -/* - * Setting SEAL_WRITE requires us to verify there's no pending writer. However, - * via get_user_pages(), drivers might have some pending I/O without any active - * user-space mappings (eg., direct-IO, AIO). Therefore, we look at all pages - * and see whether it has an elevated ref-count. If so, we tag them and wait for - * them to be dropped. - * The caller must guarantee that no new user will acquire writable references - * to those pages to avoid races. 
- */ -static int memfd_wait_for_pins(struct address_space *mapping) -{ - struct radix_tree_iter iter; - void __rcu **slot; - pgoff_t start; - struct page *page; - int error, scan; - - memfd_tag_pins(mapping); - - error = 0; - for (scan = 0; scan <= LAST_SCAN; scan++) { - if (!radix_tree_tagged(&mapping->i_pages, MEMFD_TAG_PINNED)) - break; - - if (!scan) - lru_add_drain_all(); - else if (schedule_timeout_killable((HZ << scan) / 200)) - scan = LAST_SCAN; - - start = 0; - rcu_read_lock(); - radix_tree_for_each_tagged(slot, &mapping->i_pages, &iter, - start, MEMFD_TAG_PINNED) { - - page = radix_tree_deref_slot(slot); - if (radix_tree_exception(page)) { - if (radix_tree_deref_retry(page)) { - slot = radix_tree_iter_retry(&iter); - continue; - } - - page = NULL; - } - - if (page && - page_count(page) - page_mapcount(page) != 1) { - if (scan < LAST_SCAN) - goto continue_resched; - - /* - * On the last scan, we clean up all those tags - * we inserted; but make a note that we still - * found pages pinned. - */ - error = -EBUSY; - } - - xa_lock_irq(&mapping->i_pages); - radix_tree_tag_clear(&mapping->i_pages, - iter.index, MEMFD_TAG_PINNED); - xa_unlock_irq(&mapping->i_pages); -continue_resched: - if (need_resched()) { - slot = radix_tree_iter_resume(slot, &iter); - cond_resched_rcu(); - } - } - rcu_read_unlock(); - } - - return error; -} - -static unsigned int *memfd_file_seals_ptr(struct file *file) -{ - if (shmem_file(file)) - return &SHMEM_I(file_inode(file))->seals; - -#ifdef CONFIG_HUGETLBFS - if (is_file_hugepages(file)) - return &HUGETLBFS_I(file_inode(file))->seals; -#endif - - return NULL; -} - -#define F_ALL_SEALS (F_SEAL_SEAL | \ - F_SEAL_SHRINK | \ - F_SEAL_GROW | \ - F_SEAL_WRITE) - -static int memfd_add_seals(struct file *file, unsigned int seals) -{ - struct inode *inode = file_inode(file); - unsigned int *file_seals; - int error; - - /* - * SEALING - * Sealing allows multiple parties to share a tmpfs or hugetlbfs file - * but restrict access to a specific subset of file operations. Seals - * can only be added, but never removed. This way, mutually untrusted - * parties can share common memory regions with a well-defined policy. - * A malicious peer can thus never perform unwanted operations on a - * shared object. - * - * Seals are only supported on special tmpfs or hugetlbfs files and - * always affect the whole underlying inode. Once a seal is set, it - * may prevent some kinds of access to the file. Currently, the - * following seals are defined: - * SEAL_SEAL: Prevent further seals from being set on this file - * SEAL_SHRINK: Prevent the file from shrinking - * SEAL_GROW: Prevent the file from growing - * SEAL_WRITE: Prevent write access to the file - * - * As we don't require any trust relationship between two parties, we - * must prevent seals from being removed. Therefore, sealing a file - * only adds a given set of seals to the file, it never touches - * existing seals. Furthermore, the "setting seals"-operation can be - * sealed itself, which basically prevents any further seal from being - * added. - * - * Semantics of sealing are only defined on volatile files. Only - * anonymous tmpfs and hugetlbfs files support sealing. More - * importantly, seals are never written to disk. Therefore, there's - * no plan to support it on other file types. 
- */ - - if (!(file->f_mode & FMODE_WRITE)) - return -EPERM; - if (seals & ~(unsigned int)F_ALL_SEALS) - return -EINVAL; - - inode_lock(inode); - - file_seals = memfd_file_seals_ptr(file); - if (!file_seals) { - error = -EINVAL; - goto unlock; - } - - if (*file_seals & F_SEAL_SEAL) { - error = -EPERM; - goto unlock; - } - - if ((seals & F_SEAL_WRITE) && !(*file_seals & F_SEAL_WRITE)) { - error = mapping_deny_writable(file->f_mapping); - if (error) - goto unlock; - - error = memfd_wait_for_pins(file->f_mapping); - if (error) { - mapping_allow_writable(file->f_mapping); - goto unlock; - } - } - - *file_seals |= seals; - error = 0; - -unlock: - inode_unlock(inode); - return error; -} - -static int memfd_get_seals(struct file *file) -{ - unsigned int *seals = memfd_file_seals_ptr(file); - - return seals ? *seals : -EINVAL; -} - -long memfd_fcntl(struct file *file, unsigned int cmd, unsigned long arg) -{ - long error; - - switch (cmd) { - case F_ADD_SEALS: - /* disallow upper 32bit */ - if (arg > UINT_MAX) - return -EINVAL; - - error = memfd_add_seals(file, arg); - break; - case F_GET_SEALS: - error = memfd_get_seals(file); - break; - default: - error = -EINVAL; - break; - } - - return error; -} - static long shmem_fallocate(struct file *file, int mode, loff_t offset, loff_t len) { @@ -3677,93 +3440,6 @@ static int shmem_show_options(struct seq_file *seq, struct dentry *root) return 0; } -#define MFD_NAME_PREFIX "memfd:" -#define MFD_NAME_PREFIX_LEN (sizeof(MFD_NAME_PREFIX) - 1) -#define MFD_NAME_MAX_LEN (NAME_MAX - MFD_NAME_PREFIX_LEN) - -#define MFD_ALL_FLAGS (MFD_CLOEXEC | MFD_ALLOW_SEALING | MFD_HUGETLB) - -SYSCALL_DEFINE2(memfd_create, - const char __user *, uname, - unsigned int, flags) -{ - unsigned int *file_seals; - struct file *file; - int fd, error; - char *name; - long len; - - if (!(flags & MFD_HUGETLB)) { - if (flags & ~(unsigned int)MFD_ALL_FLAGS) - return -EINVAL; - } else { - /* Allow huge page size encoding in flags. */ - if (flags & ~(unsigned int)(MFD_ALL_FLAGS | - (MFD_HUGE_MASK << MFD_HUGE_SHIFT))) - return -EINVAL; - } - - /* length includes terminating zero */ - len = strnlen_user(uname, MFD_NAME_MAX_LEN + 1); - if (len <= 0) - return -EFAULT; - if (len > MFD_NAME_MAX_LEN + 1) - return -EINVAL; - - name = kmalloc(len + MFD_NAME_PREFIX_LEN, GFP_KERNEL); - if (!name) - return -ENOMEM; - - strcpy(name, MFD_NAME_PREFIX); - if (copy_from_user(&name[MFD_NAME_PREFIX_LEN], uname, len)) { - error = -EFAULT; - goto err_name; - } - - /* terminating-zero may have changed after strnlen_user() returned */ - if (name[len + MFD_NAME_PREFIX_LEN - 1]) { - error = -EFAULT; - goto err_name; - } - - fd = get_unused_fd_flags((flags & MFD_CLOEXEC) ? 
O_CLOEXEC : 0); - if (fd < 0) { - error = fd; - goto err_name; - } - - if (flags & MFD_HUGETLB) { - struct user_struct *user = NULL; - - file = hugetlb_file_setup(name, 0, VM_NORESERVE, &user, - HUGETLB_ANONHUGE_INODE, - (flags >> MFD_HUGE_SHIFT) & - MFD_HUGE_MASK); - } else - file = shmem_file_setup(name, 0, VM_NORESERVE); - if (IS_ERR(file)) { - error = PTR_ERR(file); - goto err_fd; - } - file->f_mode |= FMODE_LSEEK | FMODE_PREAD | FMODE_PWRITE; - file->f_flags |= O_RDWR | O_LARGEFILE; - - if (flags & MFD_ALLOW_SEALING) { - file_seals = memfd_file_seals_ptr(file); - *file_seals &= ~F_SEAL_SEAL; - } - - fd_install(fd, file); - kfree(name); - return fd; - -err_fd: - put_unused_fd(fd); -err_name: - kfree(name); - return error; -} - #endif /* CONFIG_TMPFS */ static void shmem_put_super(struct super_block *sb) From e69438596bb3e97809e76be315e54a4a444f4797 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 7 Jun 2018 17:06:04 -0700 Subject: [PATCH 026/118] mm/page_alloc: remove realsize in free_area_init_core() Highmem's realsize always equals to freesize, so it is not necessary to spare a variable to record this. Link: http://lkml.kernel.org/r/20180413083859.65888-1-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Andrew Morton Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 22320ea27489..4fbe211340e0 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -6229,18 +6229,18 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) for (j = 0; j < MAX_NR_ZONES; j++) { struct zone *zone = pgdat->node_zones + j; - unsigned long size, realsize, freesize, memmap_pages; + unsigned long size, freesize, memmap_pages; unsigned long zone_start_pfn = zone->zone_start_pfn; size = zone->spanned_pages; - realsize = freesize = zone->present_pages; + freesize = zone->present_pages; /* * Adjust freesize so that it accounts for how much memory * is used by this zone for memmap. This affects the watermark * and per-cpu initialisations */ - memmap_pages = calc_memmap_size(size, realsize); + memmap_pages = calc_memmap_size(size, freesize); if (!is_highmem_idx(j)) { if (freesize >= memmap_pages) { freesize -= memmap_pages; @@ -6272,7 +6272,7 @@ static void __paginginit free_area_init_core(struct pglist_data *pgdat) * when the bootmem allocator frees pages into the buddy system. * And all highmem pages will be managed by the buddy system. */ - zone->managed_pages = is_highmem_idx(j) ? realsize : freesize; + zone->managed_pages = freesize; #ifdef CONFIG_NUMA zone->node = nid; #endif From 3010a5ea665a089361e435093bd737399123fcc4 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Thu, 7 Jun 2018 17:06:08 -0700 Subject: [PATCH 027/118] mm: introduce ARCH_HAS_PTE_SPECIAL Currently the PTE special supports is turned on in per architecture header files. Most of the time, it is defined in arch/*/include/asm/pgtable.h depending or not on some other per architecture static definition. This patch introduce a new configuration variable to manage this directly in the Kconfig files. It would later replace __HAVE_ARCH_PTE_SPECIAL. Here notes for some architecture where the definition of __HAVE_ARCH_PTE_SPECIAL is not obvious: arm __HAVE_ARCH_PTE_SPECIAL which is currently defined in arch/arm/include/asm/pgtable-3level.h which is included by arch/arm/include/asm/pgtable.h when CONFIG_ARM_LPAE is set. 
So select ARCH_HAS_PTE_SPECIAL if ARM_LPAE. powerpc __HAVE_ARCH_PTE_SPECIAL is defined in 2 files: - arch/powerpc/include/asm/book3s/64/pgtable.h - arch/powerpc/include/asm/pte-common.h The first one is included if (PPC_BOOK3S & PPC64) while the second is included in all the other cases. So select ARCH_HAS_PTE_SPECIAL all the time. sparc: __HAVE_ARCH_PTE_SPECIAL is defined if defined(__sparc__) && defined(__arch64__) which are defined through the compiler in sparc/Makefile if !SPARC32 which I assume to be if SPARC64. So select ARCH_HAS_PTE_SPECIAL if SPARC64 There is no functional change introduced by this patch. Link: http://lkml.kernel.org/r/1523433816-14460-2-git-send-email-ldufour@linux.vnet.ibm.com Signed-off-by: Laurent Dufour Suggested-by: Jerome Glisse Reviewed-by: Jerome Glisse Acked-by: David Rientjes Cc: Michal Hocko Cc: "Aneesh Kumar K . V" Cc: Michael Ellerman Cc: Benjamin Herrenschmidt Cc: Paul Mackerras Cc: Jonathan Corbet Cc: Catalin Marinas Cc: Will Deacon Cc: Yoshinori Sato Cc: Rich Felker Cc: David S. Miller Cc: Thomas Gleixner Cc: Ingo Molnar Cc: Vineet Gupta Cc: Palmer Dabbelt Cc: Albert Ou Cc: Martin Schwidefsky Cc: Heiko Carstens Cc: David Rientjes Cc: Robin Murphy Cc: Christophe LEROY Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/features/vm/pte_special/arch-support.txt | 2 +- arch/arc/Kconfig | 1 + arch/arc/include/asm/pgtable.h | 2 -- arch/arm/Kconfig | 1 + arch/arm/include/asm/pgtable-3level.h | 1 - arch/arm64/Kconfig | 1 + arch/arm64/include/asm/pgtable.h | 2 -- arch/powerpc/Kconfig | 1 + arch/powerpc/include/asm/book3s/64/pgtable.h | 3 --- arch/powerpc/include/asm/pte-common.h | 3 --- arch/riscv/Kconfig | 1 + arch/riscv/include/asm/pgtable-bits.h | 3 --- arch/s390/Kconfig | 1 + arch/s390/include/asm/pgtable.h | 1 - arch/sh/Kconfig | 1 + arch/sh/include/asm/pgtable.h | 2 -- arch/sparc/Kconfig | 1 + arch/sparc/include/asm/pgtable_64.h | 3 --- arch/x86/Kconfig | 1 + arch/x86/include/asm/pgtable_types.h | 1 - include/linux/pfn_t.h | 4 ++-- mm/Kconfig | 3 +++ mm/gup.c | 4 ++-- mm/memory.c | 2 +- 24 files changed, 18 insertions(+), 27 deletions(-) diff --git a/Documentation/features/vm/pte_special/arch-support.txt b/Documentation/features/vm/pte_special/arch-support.txt index 6a608a6dcf71..a8378424bc98 100644 --- a/Documentation/features/vm/pte_special/arch-support.txt +++ b/Documentation/features/vm/pte_special/arch-support.txt @@ -1,6 +1,6 @@ # # Feature name: pte_special -# Kconfig: __HAVE_ARCH_PTE_SPECIAL +# Kconfig: ARCH_HAS_PTE_SPECIAL # description: arch supports the pte_special()/pte_mkspecial() VM APIs # ----------------------- diff --git a/arch/arc/Kconfig b/arch/arc/Kconfig index 89d47eac18b2..e81bcd271be7 100644 --- a/arch/arc/Kconfig +++ b/arch/arc/Kconfig @@ -48,6 +48,7 @@ config ARC select HAVE_GENERIC_DMA_COHERENT select HAVE_KERNEL_GZIP select HAVE_KERNEL_LZMA + select ARCH_HAS_PTE_SPECIAL config MIGHT_HAVE_PCI bool diff --git a/arch/arc/include/asm/pgtable.h b/arch/arc/include/asm/pgtable.h index 08fe33830d4b..8ec5599a0957 100644 --- a/arch/arc/include/asm/pgtable.h +++ b/arch/arc/include/asm/pgtable.h @@ -320,8 +320,6 @@ PTE_BIT_FUNC(mkexec, |= (_PAGE_EXECUTE)); PTE_BIT_FUNC(mkspecial, |= (_PAGE_SPECIAL)); PTE_BIT_FUNC(mkhuge, |= (_PAGE_HW_SZ)); -#define __HAVE_ARCH_PTE_SPECIAL - static inline pte_t pte_modify(pte_t pte, pgprot_t newprot) { return __pte((pte_val(pte) & _PAGE_CHG_MASK) | pgprot_val(newprot)); diff --git a/arch/arm/Kconfig b/arch/arm/Kconfig index 8f460bdd4be1..534563ac7f5f 100644 --- a/arch/arm/Kconfig 
+++ b/arch/arm/Kconfig @@ -8,6 +8,7 @@ config ARM select ARCH_HAS_DEVMEM_IS_ALLOWED select ARCH_HAS_ELF_RANDOMIZE select ARCH_HAS_FORTIFY_SOURCE + select ARCH_HAS_PTE_SPECIAL if ARM_LPAE select ARCH_HAS_SET_MEMORY select ARCH_HAS_PHYS_TO_DMA select ARCH_HAS_STRICT_KERNEL_RWX if MMU && !XIP_KERNEL diff --git a/arch/arm/include/asm/pgtable-3level.h b/arch/arm/include/asm/pgtable-3level.h index 2a4836087358..6d50a11d7793 100644 --- a/arch/arm/include/asm/pgtable-3level.h +++ b/arch/arm/include/asm/pgtable-3level.h @@ -219,7 +219,6 @@ static inline pte_t pte_mkspecial(pte_t pte) pte_val(pte) |= L_PTE_SPECIAL; return pte; } -#define __HAVE_ARCH_PTE_SPECIAL #define pmd_write(pmd) (pmd_isclear((pmd), L_PMD_SECT_RDONLY)) #define pmd_dirty(pmd) (pmd_isset((pmd), L_PMD_SECT_DIRTY)) diff --git a/arch/arm64/Kconfig b/arch/arm64/Kconfig index b25ed7834f6c..4759566a78cb 100644 --- a/arch/arm64/Kconfig +++ b/arch/arm64/Kconfig @@ -17,6 +17,7 @@ config ARM64 select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_HAS_KCOV select ARCH_HAS_MEMBARRIER_SYNC_CORE + select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN select ARCH_HAS_STRICT_KERNEL_RWX diff --git a/arch/arm64/include/asm/pgtable.h b/arch/arm64/include/asm/pgtable.h index 7c4c8f318ba9..9f82d6b53851 100644 --- a/arch/arm64/include/asm/pgtable.h +++ b/arch/arm64/include/asm/pgtable.h @@ -306,8 +306,6 @@ static inline int pte_same(pte_t pte_a, pte_t pte_b) #define HPAGE_MASK (~(HPAGE_SIZE - 1)) #define HUGETLB_PAGE_ORDER (HPAGE_SHIFT - PAGE_SHIFT) -#define __HAVE_ARCH_PTE_SPECIAL - static inline pte_t pgd_pte(pgd_t pgd) { return __pte(pgd_val(pgd)); diff --git a/arch/powerpc/Kconfig b/arch/powerpc/Kconfig index 076fe3094856..8f959df2de7a 100644 --- a/arch/powerpc/Kconfig +++ b/arch/powerpc/Kconfig @@ -135,6 +135,7 @@ config PPC select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_PHYS_TO_DMA select ARCH_HAS_PMEM_API if PPC64 + select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_MEMBARRIER_CALLBACKS select ARCH_HAS_SCALED_CPUTIME if VIRT_CPU_ACCOUNTING_NATIVE select ARCH_HAS_SG_CHAIN diff --git a/arch/powerpc/include/asm/book3s/64/pgtable.h b/arch/powerpc/include/asm/book3s/64/pgtable.h index 42fe7c2ff2df..63cee159022b 100644 --- a/arch/powerpc/include/asm/book3s/64/pgtable.h +++ b/arch/powerpc/include/asm/book3s/64/pgtable.h @@ -335,9 +335,6 @@ extern unsigned long pci_io_base; /* Advertise special mapping type for AGP */ #define HAVE_PAGE_AGP -/* Advertise support for _PAGE_SPECIAL */ -#define __HAVE_ARCH_PTE_SPECIAL - #ifndef __ASSEMBLY__ /* diff --git a/arch/powerpc/include/asm/pte-common.h b/arch/powerpc/include/asm/pte-common.h index 050b0d775324..bef56141a549 100644 --- a/arch/powerpc/include/asm/pte-common.h +++ b/arch/powerpc/include/asm/pte-common.h @@ -208,9 +208,6 @@ static inline bool pte_user(pte_t pte) #define PAGE_AGP (PAGE_KERNEL_NC) #define HAVE_PAGE_AGP -/* Advertise support for _PAGE_SPECIAL */ -#define __HAVE_ARCH_PTE_SPECIAL - #ifndef _PAGE_READ /* if not defined, we should not find _PAGE_WRITE too */ #define _PAGE_READ 0 diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig index 274bc064c41f..17f19e67993b 100644 --- a/arch/riscv/Kconfig +++ b/arch/riscv/Kconfig @@ -42,6 +42,7 @@ config RISCV select THREAD_INFO_IN_TASK select RISCV_TIMER select GENERIC_IRQ_MULTI_HANDLER + select ARCH_HAS_PTE_SPECIAL config MMU def_bool y diff --git a/arch/riscv/include/asm/pgtable-bits.h b/arch/riscv/include/asm/pgtable-bits.h index 997ddbb1d370..2fa2942be221 100644 --- 
a/arch/riscv/include/asm/pgtable-bits.h +++ b/arch/riscv/include/asm/pgtable-bits.h @@ -42,7 +42,4 @@ _PAGE_WRITE | _PAGE_EXEC | \ _PAGE_USER | _PAGE_GLOBAL)) -/* Advertise support for _PAGE_SPECIAL */ -#define __HAVE_ARCH_PTE_SPECIAL - #endif /* _ASM_RISCV_PGTABLE_BITS_H */ diff --git a/arch/s390/Kconfig b/arch/s390/Kconfig index b7deee7e738f..baed39772c84 100644 --- a/arch/s390/Kconfig +++ b/arch/s390/Kconfig @@ -65,6 +65,7 @@ config S390 select ARCH_HAS_GCOV_PROFILE_ALL select ARCH_HAS_GIGANTIC_PAGE if (MEMORY_ISOLATION && COMPACTION) || CMA select ARCH_HAS_KCOV + select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_SET_MEMORY select ARCH_HAS_SG_CHAIN select ARCH_HAS_STRICT_KERNEL_RWX diff --git a/arch/s390/include/asm/pgtable.h b/arch/s390/include/asm/pgtable.h index 2d24d33bf188..9809694e1389 100644 --- a/arch/s390/include/asm/pgtable.h +++ b/arch/s390/include/asm/pgtable.h @@ -171,7 +171,6 @@ static inline int is_module_addr(void *addr) #define _PAGE_WRITE 0x020 /* SW pte write bit */ #define _PAGE_SPECIAL 0x040 /* SW associated with special page */ #define _PAGE_UNUSED 0x080 /* SW bit for pgste usage state */ -#define __HAVE_ARCH_PTE_SPECIAL #ifdef CONFIG_MEM_SOFT_DIRTY #define _PAGE_SOFT_DIRTY 0x002 /* SW pte soft dirty bit */ diff --git a/arch/sh/Kconfig b/arch/sh/Kconfig index ae619d54018c..4d61a085982b 100644 --- a/arch/sh/Kconfig +++ b/arch/sh/Kconfig @@ -1,6 +1,7 @@ # SPDX-License-Identifier: GPL-2.0 config SUPERH def_bool y + select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_TICK_BROADCAST if GENERIC_CLOCKEVENTS_BROADCAST select ARCH_MIGHT_HAVE_PC_PARPORT select ARCH_NO_COHERENT_DMA_MMAP if !MMU diff --git a/arch/sh/include/asm/pgtable.h b/arch/sh/include/asm/pgtable.h index 89c513a982fc..f6abfe2bca93 100644 --- a/arch/sh/include/asm/pgtable.h +++ b/arch/sh/include/asm/pgtable.h @@ -156,8 +156,6 @@ extern void page_table_range_init(unsigned long start, unsigned long end, #define HAVE_ARCH_UNMAPPED_AREA #define HAVE_ARCH_UNMAPPED_AREA_TOPDOWN -#define __HAVE_ARCH_PTE_SPECIAL - #include #endif /* __ASM_SH_PGTABLE_H */ diff --git a/arch/sparc/Kconfig b/arch/sparc/Kconfig index b42ba888217d..9a2b8877f174 100644 --- a/arch/sparc/Kconfig +++ b/arch/sparc/Kconfig @@ -88,6 +88,7 @@ config SPARC64 select ARCH_USE_QUEUED_SPINLOCKS select GENERIC_TIME_VSYSCALL select ARCH_CLOCKSOURCE_DATA + select ARCH_HAS_PTE_SPECIAL config ARCH_DEFCONFIG string diff --git a/arch/sparc/include/asm/pgtable_64.h b/arch/sparc/include/asm/pgtable_64.h index 44d6ac47e035..1393a8ac596b 100644 --- a/arch/sparc/include/asm/pgtable_64.h +++ b/arch/sparc/include/asm/pgtable_64.h @@ -117,9 +117,6 @@ bool kern_addr_valid(unsigned long addr); #define _PAGE_PMD_HUGE _AC(0x0100000000000000,UL) /* Huge page */ #define _PAGE_PUD_HUGE _PAGE_PMD_HUGE -/* Advertise support for _PAGE_SPECIAL */ -#define __HAVE_ARCH_PTE_SPECIAL - /* SUN4U pte bits... 
*/ #define _PAGE_SZ4MB_4U _AC(0x6000000000000000,UL) /* 4MB Page */ #define _PAGE_SZ512K_4U _AC(0x4000000000000000,UL) /* 512K Page */ diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig index cb6e3a219294..f182a4e8e5bd 100644 --- a/arch/x86/Kconfig +++ b/arch/x86/Kconfig @@ -60,6 +60,7 @@ config X86 select ARCH_HAS_KCOV if X86_64 select ARCH_HAS_MEMBARRIER_SYNC_CORE select ARCH_HAS_PMEM_API if X86_64 + select ARCH_HAS_PTE_SPECIAL select ARCH_HAS_REFCOUNT select ARCH_HAS_UACCESS_FLUSHCACHE if X86_64 select ARCH_HAS_UACCESS_MCSAFE if X86_64 diff --git a/arch/x86/include/asm/pgtable_types.h b/arch/x86/include/asm/pgtable_types.h index 1e5a40673953..99fff853c944 100644 --- a/arch/x86/include/asm/pgtable_types.h +++ b/arch/x86/include/asm/pgtable_types.h @@ -65,7 +65,6 @@ #define _PAGE_PKEY_BIT2 (_AT(pteval_t, 0)) #define _PAGE_PKEY_BIT3 (_AT(pteval_t, 0)) #endif -#define __HAVE_ARCH_PTE_SPECIAL #define _PAGE_PKEY_MASK (_PAGE_PKEY_BIT0 | \ _PAGE_PKEY_BIT1 | \ diff --git a/include/linux/pfn_t.h b/include/linux/pfn_t.h index a03c2642a87c..21713dc14ce2 100644 --- a/include/linux/pfn_t.h +++ b/include/linux/pfn_t.h @@ -122,7 +122,7 @@ pud_t pud_mkdevmap(pud_t pud); #endif #endif /* __HAVE_ARCH_PTE_DEVMAP */ -#ifdef __HAVE_ARCH_PTE_SPECIAL +#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL static inline bool pfn_t_special(pfn_t pfn) { return (pfn.val & PFN_SPECIAL) == PFN_SPECIAL; @@ -132,5 +132,5 @@ static inline bool pfn_t_special(pfn_t pfn) { return false; } -#endif /* __HAVE_ARCH_PTE_SPECIAL */ +#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */ #endif /* _LINUX_PFN_T_H_ */ diff --git a/mm/Kconfig b/mm/Kconfig index 3e0b6e87f65d..00bffa7a5112 100644 --- a/mm/Kconfig +++ b/mm/Kconfig @@ -754,3 +754,6 @@ config GUP_BENCHMARK performance of get_user_pages_fast(). See tools/testing/selftests/vm/gup_benchmark.c + +config ARCH_HAS_PTE_SPECIAL + bool diff --git a/mm/gup.c b/mm/gup.c index 541904a7c60f..010153989b9b 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -1354,7 +1354,7 @@ static void undo_dev_pagemap(int *nr, int nr_start, struct page **pages) } } -#ifdef __HAVE_ARCH_PTE_SPECIAL +#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, int write, struct page **pages, int *nr) { @@ -1430,7 +1430,7 @@ static int gup_pte_range(pmd_t pmd, unsigned long addr, unsigned long end, { return 0; } -#endif /* __HAVE_ARCH_PTE_SPECIAL */ +#endif /* CONFIG_ARCH_HAS_PTE_SPECIAL */ #if defined(__HAVE_ARCH_PTE_DEVMAP) && defined(CONFIG_TRANSPARENT_HUGEPAGE) static int __gup_device_huge(unsigned long pfn, unsigned long addr, diff --git a/mm/memory.c b/mm/memory.c index 6a97893a8de7..42d2d4366c11 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -817,7 +817,7 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, * PFNMAP mappings in order to support COWable mappings. * */ -#ifdef __HAVE_ARCH_PTE_SPECIAL +#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL # define HAVE_PTE_SPECIAL 1 #else # define HAVE_PTE_SPECIAL 0 From 00b3a331fdf7f1466ac93a008ccd2dab45cb46c5 Mon Sep 17 00:00:00 2001 From: Laurent Dufour Date: Thu, 7 Jun 2018 17:06:12 -0700 Subject: [PATCH 028/118] mm: remove odd HAVE_PTE_SPECIAL Remove the additional define HAVE_PTE_SPECIAL and rely directly on CONFIG_ARCH_HAS_PTE_SPECIAL. 
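In other words, the conversion amounts to the following (a minimal sketch of the pattern, not lifted verbatim from any one hunk; the shim shown is the one deleted from mm/memory.c below, and IS_ENABLED() is the generic helper from include/linux/kconfig.h):

  /* Before: a per-file 0/1 shim around the Kconfig symbol */
  #ifdef CONFIG_ARCH_HAS_PTE_SPECIAL
  # define HAVE_PTE_SPECIAL 1
  #else
  # define HAVE_PTE_SPECIAL 0
  #endif
  	if (HAVE_PTE_SPECIAL)
  		/* pte_special()-based path */

  /* After: IS_ENABLED() expands to a compile-time 0 or 1, so the dead
   * branch is still eliminated while both branches stay visible to the
   * compiler's type checking.
   */
  	if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL))
  		/* pte_special()-based path */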
There is no functional change introduced by this patch Link: http://lkml.kernel.org/r/1523533733-25437-1-git-send-email-ldufour@linux.vnet.ibm.com Signed-off-by: Laurent Dufour Acked-by: David Rientjes Reviewed-by: Andrew Morton Cc: Jerome Glisse Cc: Michal Hocko Cc: Christophe LEROY Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memory.c | 15 ++++++--------- 1 file changed, 6 insertions(+), 9 deletions(-) diff --git a/mm/memory.c b/mm/memory.c index 42d2d4366c11..bc381486d527 100644 --- a/mm/memory.c +++ b/mm/memory.c @@ -817,17 +817,12 @@ static void print_bad_pte(struct vm_area_struct *vma, unsigned long addr, * PFNMAP mappings in order to support COWable mappings. * */ -#ifdef CONFIG_ARCH_HAS_PTE_SPECIAL -# define HAVE_PTE_SPECIAL 1 -#else -# define HAVE_PTE_SPECIAL 0 -#endif struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, pte_t pte, bool with_public_device) { unsigned long pfn = pte_pfn(pte); - if (HAVE_PTE_SPECIAL) { + if (IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL)) { if (likely(!pte_special(pte))) goto check_pfn; if (vma->vm_ops && vma->vm_ops->find_special_page) @@ -862,7 +857,7 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, return NULL; } - /* !HAVE_PTE_SPECIAL case follows: */ + /* !CONFIG_ARCH_HAS_PTE_SPECIAL case follows: */ if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { @@ -881,6 +876,7 @@ struct page *_vm_normal_page(struct vm_area_struct *vma, unsigned long addr, if (is_zero_pfn(pfn)) return NULL; + check_pfn: if (unlikely(pfn > highest_memmap_pfn)) { print_bad_pte(vma, addr, pte, NULL); @@ -904,7 +900,7 @@ struct page *vm_normal_page_pmd(struct vm_area_struct *vma, unsigned long addr, /* * There is no pmd_special() but there may be special pmds, e.g. * in a direct-access (dax) mapping, so let's just replicate the - * !HAVE_PTE_SPECIAL case from vm_normal_page() here. + * !CONFIG_ARCH_HAS_PTE_SPECIAL case from vm_normal_page() here. */ if (unlikely(vma->vm_flags & (VM_PFNMAP|VM_MIXEDMAP))) { if (vma->vm_flags & VM_MIXEDMAP) { @@ -1933,7 +1929,8 @@ static int __vm_insert_mixed(struct vm_area_struct *vma, unsigned long addr, * than insert_pfn). If a zero_pfn were inserted into a VM_MIXEDMAP * without pte special, it would there be refcounted as a normal page. */ - if (!HAVE_PTE_SPECIAL && !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) { + if (!IS_ENABLED(CONFIG_ARCH_HAS_PTE_SPECIAL) && + !pfn_t_devmap(pfn) && pfn_t_valid(pfn)) { struct page *page; /* From 1c4bc43ddfd52cbe5a08bb86ae636f55d2799424 Mon Sep 17 00:00:00 2001 From: Stefan Agner Date: Thu, 7 Jun 2018 17:06:15 -0700 Subject: [PATCH 029/118] mm/memblock: introduce PHYS_ADDR_MAX So far code was using ULLONG_MAX and type casting to obtain a phys_addr_t with all bits set. The typecast is necessary to silence compiler warnings on 32-bit platforms. Use the simpler but still type safe approach "~(phys_addr_t)0" to create a preprocessor define for all bits set. 
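As a minimal sketch of the idea (assuming the usual typedef: phys_addr_t is u32 without CONFIG_PHYS_ADDR_T_64BIT and u64 with it):

  #define PHYS_ADDR_MAX	(~(phys_addr_t)0)	/* all bits set, at the right width */

  phys_addr_t max_addr;
  max_addr = (phys_addr_t)ULLONG_MAX;	/* old idiom: the cast silences the
  					 * truncation warning on 32-bit */
  max_addr = PHYS_ADDR_MAX;		/* new idiom: correct width by construction */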
Link: http://lkml.kernel.org/r/20180406213809.566-1-stefan@agner.ch Signed-off-by: Stefan Agner Suggested-by: Linus Torvalds Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Catalin Marinas Cc: Pavel Tatashin Cc: Ard Biesheuvel Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/kernel.h | 1 + mm/memblock.c | 22 +++++++++++----------- 2 files changed, 12 insertions(+), 11 deletions(-) diff --git a/include/linux/kernel.h b/include/linux/kernel.h index 7aed92624531..7c4e8f1f72d8 100644 --- a/include/linux/kernel.h +++ b/include/linux/kernel.h @@ -29,6 +29,7 @@ #define LLONG_MIN (-LLONG_MAX - 1) #define ULLONG_MAX (~0ULL) #define SIZE_MAX (~(size_t)0) +#define PHYS_ADDR_MAX (~(phys_addr_t)0) #define U8_MAX ((u8)~0U) #define S8_MAX ((s8)(U8_MAX>>1)) diff --git a/mm/memblock.c b/mm/memblock.c index 5108356ad8aa..eec988c21c7e 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -68,7 +68,7 @@ ulong __init_memblock choose_memblock_flags(void) /* adjust *@size so that (@base + *@size) doesn't overflow, return new size */ static inline phys_addr_t memblock_cap_size(phys_addr_t base, phys_addr_t *size) { - return *size = min(*size, (phys_addr_t)ULLONG_MAX - base); + return *size = min(*size, PHYS_ADDR_MAX - base); } /* @@ -925,7 +925,7 @@ void __init_memblock __next_mem_range(u64 *idx, int nid, ulong flags, r = &type_b->regions[idx_b]; r_start = idx_b ? r[-1].base + r[-1].size : 0; r_end = idx_b < type_b->cnt ? - r->base : (phys_addr_t)ULLONG_MAX; + r->base : PHYS_ADDR_MAX; /* * if idx_b advanced past idx_a, @@ -1041,7 +1041,7 @@ void __init_memblock __next_mem_range_rev(u64 *idx, int nid, ulong flags, r = &type_b->regions[idx_b]; r_start = idx_b ? r[-1].base + r[-1].size : 0; r_end = idx_b < type_b->cnt ? - r->base : (phys_addr_t)ULLONG_MAX; + r->base : PHYS_ADDR_MAX; /* * if idx_b advanced past idx_a, * break out to advance idx_a @@ -1516,13 +1516,13 @@ phys_addr_t __init_memblock memblock_end_of_DRAM(void) static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit) { - phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; + phys_addr_t max_addr = PHYS_ADDR_MAX; struct memblock_region *r; /* * translate the memory @limit size into the max address within one of * the memory memblock regions, if the @limit exceeds the total size - * of those regions, max_addr will keep original value ULLONG_MAX + * of those regions, max_addr will keep original value PHYS_ADDR_MAX */ for_each_memblock(memory, r) { if (limit <= r->size) { @@ -1537,7 +1537,7 @@ static phys_addr_t __init_memblock __find_max_addr(phys_addr_t limit) void __init memblock_enforce_memory_limit(phys_addr_t limit) { - phys_addr_t max_addr = (phys_addr_t)ULLONG_MAX; + phys_addr_t max_addr = PHYS_ADDR_MAX; if (!limit) return; @@ -1545,14 +1545,14 @@ void __init memblock_enforce_memory_limit(phys_addr_t limit) max_addr = __find_max_addr(limit); /* @limit exceeds the total size of the memory, do nothing */ - if (max_addr == (phys_addr_t)ULLONG_MAX) + if (max_addr == PHYS_ADDR_MAX) return; /* truncate both memory and reserved regions */ memblock_remove_range(&memblock.memory, max_addr, - (phys_addr_t)ULLONG_MAX); + PHYS_ADDR_MAX); memblock_remove_range(&memblock.reserved, max_addr, - (phys_addr_t)ULLONG_MAX); + PHYS_ADDR_MAX); } void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size) @@ -1580,7 +1580,7 @@ void __init memblock_cap_memory_range(phys_addr_t base, phys_addr_t size) /* truncate the reserved regions */ memblock_remove_range(&memblock.reserved, 0, base); 
memblock_remove_range(&memblock.reserved, - base + size, (phys_addr_t)ULLONG_MAX); + base + size, PHYS_ADDR_MAX); } void __init memblock_mem_limit_remove_map(phys_addr_t limit) @@ -1593,7 +1593,7 @@ void __init memblock_mem_limit_remove_map(phys_addr_t limit) max_addr = __find_max_addr(limit); /* @limit exceeds the total size of the memory, do nothing */ - if (max_addr == (phys_addr_t)ULLONG_MAX) + if (max_addr == PHYS_ADDR_MAX) return; memblock_cap_memory_range(0, max_addr); From bbec2e15170aae3e084d7d9afc730aeebe01b654 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 7 Jun 2018 17:06:18 -0700 Subject: [PATCH 030/118] mm: rename page_counter's count/limit into usage/max This patch renames struct page_counter fields: count -> usage limit -> max and the corresponding functions: page_counter_limit() -> page_counter_set_max() mem_cgroup_get_limit() -> mem_cgroup_get_max() mem_cgroup_resize_limit() -> mem_cgroup_resize_max() memcg_update_kmem_limit() -> memcg_update_kmem_max() memcg_update_tcp_limit() -> memcg_update_tcp_max() The idea behind this renaming is to have the direct matching between memory cgroup knobs (low, high, max) and page_counters API. This is pure renaming, this patch doesn't bring any functional change. Link: http://lkml.kernel.org/r/20180405185921.4942-1-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 4 +- include/linux/page_counter.h | 12 ++-- mm/hugetlb_cgroup.c | 6 +- mm/memcontrol.c | 112 +++++++++++++++++------------------ mm/oom_kill.c | 2 +- mm/page_counter.c | 28 ++++----- 6 files changed, 82 insertions(+), 82 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 517096c3cc99..577a19a6a93b 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -467,7 +467,7 @@ unsigned long mem_cgroup_get_zone_lru_size(struct lruvec *lruvec, void mem_cgroup_handle_over_high(void); -unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg); +unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg); void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p); @@ -858,7 +858,7 @@ mem_cgroup_node_nr_lru_pages(struct mem_cgroup *memcg, return 0; } -static inline unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) +static inline unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { return 0; } diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index c15ab80ad32d..94029dad9317 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -7,8 +7,8 @@ #include struct page_counter { - atomic_long_t count; - unsigned long limit; + atomic_long_t usage; + unsigned long max; struct page_counter *parent; /* legacy */ @@ -25,14 +25,14 @@ struct page_counter { static inline void page_counter_init(struct page_counter *counter, struct page_counter *parent) { - atomic_long_set(&counter->count, 0); - counter->limit = PAGE_COUNTER_MAX; + atomic_long_set(&counter->usage, 0); + counter->max = PAGE_COUNTER_MAX; counter->parent = parent; } static inline unsigned long page_counter_read(struct page_counter *counter) { - return atomic_long_read(&counter->count); + return atomic_long_read(&counter->usage); } void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages); @@ -41,7 +41,7 @@ bool page_counter_try_charge(struct page_counter *counter, unsigned long nr_pages, struct page_counter 
**fail); void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); -int page_counter_limit(struct page_counter *counter, unsigned long limit); +int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages); int page_counter_memparse(const char *buf, const char *max, unsigned long *nr_pages); diff --git a/mm/hugetlb_cgroup.c b/mm/hugetlb_cgroup.c index eec1150125b9..68c2f2f3c05b 100644 --- a/mm/hugetlb_cgroup.c +++ b/mm/hugetlb_cgroup.c @@ -84,7 +84,7 @@ static void hugetlb_cgroup_init(struct hugetlb_cgroup *h_cgroup, limit = round_down(PAGE_COUNTER_MAX, 1 << huge_page_order(&hstates[idx])); - ret = page_counter_limit(counter, limit); + ret = page_counter_set_max(counter, limit); VM_BUG_ON(ret); } } @@ -273,7 +273,7 @@ static u64 hugetlb_cgroup_read_u64(struct cgroup_subsys_state *css, case RES_USAGE: return (u64)page_counter_read(counter) * PAGE_SIZE; case RES_LIMIT: - return (u64)counter->limit * PAGE_SIZE; + return (u64)counter->max * PAGE_SIZE; case RES_MAX_USAGE: return (u64)counter->watermark * PAGE_SIZE; case RES_FAILCNT: @@ -306,7 +306,7 @@ static ssize_t hugetlb_cgroup_write(struct kernfs_open_file *of, switch (MEMFILE_ATTR(of_cft(of)->private)) { case RES_LIMIT: mutex_lock(&hugetlb_limit_mutex); - ret = page_counter_limit(&h_cg->hugepage[idx], nr_pages); + ret = page_counter_set_max(&h_cg->hugepage[idx], nr_pages); mutex_unlock(&hugetlb_limit_mutex); break; default: diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d86665cf4a49..79bb5aeaa800 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -1034,13 +1034,13 @@ static unsigned long mem_cgroup_margin(struct mem_cgroup *memcg) unsigned long limit; count = page_counter_read(&memcg->memory); - limit = READ_ONCE(memcg->memory.limit); + limit = READ_ONCE(memcg->memory.max); if (count < limit) margin = limit - count; if (do_memsw_account()) { count = page_counter_read(&memcg->memsw); - limit = READ_ONCE(memcg->memsw.limit); + limit = READ_ONCE(memcg->memsw.max); if (count <= limit) margin = min(margin, limit - count); else @@ -1148,13 +1148,13 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) pr_info("memory: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->memory)), - K((u64)memcg->memory.limit), memcg->memory.failcnt); + K((u64)memcg->memory.max), memcg->memory.failcnt); pr_info("memory+swap: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->memsw)), - K((u64)memcg->memsw.limit), memcg->memsw.failcnt); + K((u64)memcg->memsw.max), memcg->memsw.failcnt); pr_info("kmem: usage %llukB, limit %llukB, failcnt %lu\n", K((u64)page_counter_read(&memcg->kmem)), - K((u64)memcg->kmem.limit), memcg->kmem.failcnt); + K((u64)memcg->kmem.max), memcg->kmem.failcnt); for_each_mem_cgroup_tree(iter, memcg) { pr_info("Memory cgroup stats for "); @@ -1179,21 +1179,21 @@ void mem_cgroup_print_oom_info(struct mem_cgroup *memcg, struct task_struct *p) /* * Return the memory (and swap, if configured) limit for a memcg. 
*/ -unsigned long mem_cgroup_get_limit(struct mem_cgroup *memcg) +unsigned long mem_cgroup_get_max(struct mem_cgroup *memcg) { - unsigned long limit; + unsigned long max; - limit = memcg->memory.limit; + max = memcg->memory.max; if (mem_cgroup_swappiness(memcg)) { - unsigned long memsw_limit; - unsigned long swap_limit; + unsigned long memsw_max; + unsigned long swap_max; - memsw_limit = memcg->memsw.limit; - swap_limit = memcg->swap.limit; - swap_limit = min(swap_limit, (unsigned long)total_swap_pages); - limit = min(limit + swap_limit, memsw_limit); + memsw_max = memcg->memsw.max; + swap_max = memcg->swap.max; + swap_max = min(swap_max, (unsigned long)total_swap_pages); + max = min(max + swap_max, memsw_max); } - return limit; + return max; } static bool mem_cgroup_out_of_memory(struct mem_cgroup *memcg, gfp_t gfp_mask, @@ -2444,10 +2444,10 @@ static inline int mem_cgroup_move_swap_account(swp_entry_t entry, } #endif -static DEFINE_MUTEX(memcg_limit_mutex); +static DEFINE_MUTEX(memcg_max_mutex); -static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, - unsigned long limit, bool memsw) +static int mem_cgroup_resize_max(struct mem_cgroup *memcg, + unsigned long max, bool memsw) { bool enlarge = false; int ret; @@ -2460,22 +2460,22 @@ static int mem_cgroup_resize_limit(struct mem_cgroup *memcg, break; } - mutex_lock(&memcg_limit_mutex); + mutex_lock(&memcg_max_mutex); /* * Make sure that the new limit (memsw or memory limit) doesn't - * break our basic invariant rule memory.limit <= memsw.limit. + * break our basic invariant rule memory.max <= memsw.max. */ - limits_invariant = memsw ? limit >= memcg->memory.limit : - limit <= memcg->memsw.limit; + limits_invariant = memsw ? max >= memcg->memory.max : + max <= memcg->memsw.max; if (!limits_invariant) { - mutex_unlock(&memcg_limit_mutex); + mutex_unlock(&memcg_max_mutex); ret = -EINVAL; break; } - if (limit > counter->limit) + if (max > counter->max) enlarge = true; - ret = page_counter_limit(counter, limit); - mutex_unlock(&memcg_limit_mutex); + ret = page_counter_set_max(counter, max); + mutex_unlock(&memcg_max_mutex); if (!ret) break; @@ -2757,7 +2757,7 @@ static u64 mem_cgroup_read_u64(struct cgroup_subsys_state *css, return (u64)mem_cgroup_usage(memcg, true) * PAGE_SIZE; return (u64)page_counter_read(counter) * PAGE_SIZE; case RES_LIMIT: - return (u64)counter->limit * PAGE_SIZE; + return (u64)counter->max * PAGE_SIZE; case RES_MAX_USAGE: return (u64)counter->watermark * PAGE_SIZE; case RES_FAILCNT: @@ -2871,24 +2871,24 @@ static void memcg_free_kmem(struct mem_cgroup *memcg) } #endif /* !CONFIG_SLOB */ -static int memcg_update_kmem_limit(struct mem_cgroup *memcg, - unsigned long limit) +static int memcg_update_kmem_max(struct mem_cgroup *memcg, + unsigned long max) { int ret; - mutex_lock(&memcg_limit_mutex); - ret = page_counter_limit(&memcg->kmem, limit); - mutex_unlock(&memcg_limit_mutex); + mutex_lock(&memcg_max_mutex); + ret = page_counter_set_max(&memcg->kmem, max); + mutex_unlock(&memcg_max_mutex); return ret; } -static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) +static int memcg_update_tcp_max(struct mem_cgroup *memcg, unsigned long max) { int ret; - mutex_lock(&memcg_limit_mutex); + mutex_lock(&memcg_max_mutex); - ret = page_counter_limit(&memcg->tcpmem, limit); + ret = page_counter_set_max(&memcg->tcpmem, max); if (ret) goto out; @@ -2913,7 +2913,7 @@ static int memcg_update_tcp_limit(struct mem_cgroup *memcg, unsigned long limit) memcg->tcpmem_active = true; } out: - 
mutex_unlock(&memcg_limit_mutex); + mutex_unlock(&memcg_max_mutex); return ret; } @@ -2941,16 +2941,16 @@ static ssize_t mem_cgroup_write(struct kernfs_open_file *of, } switch (MEMFILE_TYPE(of_cft(of)->private)) { case _MEM: - ret = mem_cgroup_resize_limit(memcg, nr_pages, false); + ret = mem_cgroup_resize_max(memcg, nr_pages, false); break; case _MEMSWAP: - ret = mem_cgroup_resize_limit(memcg, nr_pages, true); + ret = mem_cgroup_resize_max(memcg, nr_pages, true); break; case _KMEM: - ret = memcg_update_kmem_limit(memcg, nr_pages); + ret = memcg_update_kmem_max(memcg, nr_pages); break; case _TCP: - ret = memcg_update_tcp_limit(memcg, nr_pages); + ret = memcg_update_tcp_max(memcg, nr_pages); break; } break; @@ -3126,8 +3126,8 @@ static int memcg_stat_show(struct seq_file *m, void *v) /* Hierarchical information */ memory = memsw = PAGE_COUNTER_MAX; for (mi = memcg; mi; mi = parent_mem_cgroup(mi)) { - memory = min(memory, mi->memory.limit); - memsw = min(memsw, mi->memsw.limit); + memory = min(memory, mi->memory.max); + memsw = min(memsw, mi->memsw.max); } seq_printf(m, "hierarchical_memory_limit %llu\n", (u64)memory * PAGE_SIZE); @@ -3626,7 +3626,7 @@ void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, *pheadroom = PAGE_COUNTER_MAX; while ((parent = parent_mem_cgroup(memcg))) { - unsigned long ceiling = min(memcg->memory.limit, memcg->high); + unsigned long ceiling = min(memcg->memory.max, memcg->high); unsigned long used = page_counter_read(&memcg->memory); *pheadroom = min(*pheadroom, ceiling - min(ceiling, used)); @@ -4319,12 +4319,12 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) { struct mem_cgroup *memcg = mem_cgroup_from_css(css); - page_counter_limit(&memcg->memory, PAGE_COUNTER_MAX); - page_counter_limit(&memcg->swap, PAGE_COUNTER_MAX); - page_counter_limit(&memcg->memsw, PAGE_COUNTER_MAX); - page_counter_limit(&memcg->kmem, PAGE_COUNTER_MAX); - page_counter_limit(&memcg->tcpmem, PAGE_COUNTER_MAX); memcg->low = 0; + page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX); + page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX); + page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX); + page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); + page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; memcg_wb_domain_size_changed(memcg); @@ -5131,7 +5131,7 @@ static ssize_t memory_high_write(struct kernfs_open_file *of, static int memory_max_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long max = READ_ONCE(memcg->memory.limit); + unsigned long max = READ_ONCE(memcg->memory.max); if (max == PAGE_COUNTER_MAX) seq_puts(m, "max\n"); @@ -5155,7 +5155,7 @@ static ssize_t memory_max_write(struct kernfs_open_file *of, if (err) return err; - xchg(&memcg->memory.limit, max); + xchg(&memcg->memory.max, max); for (;;) { unsigned long nr_pages = page_counter_read(&memcg->memory); @@ -6074,7 +6074,7 @@ long mem_cgroup_get_nr_swap_pages(struct mem_cgroup *memcg) return nr_swap_pages; for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) nr_swap_pages = min_t(long, nr_swap_pages, - READ_ONCE(memcg->swap.limit) - + READ_ONCE(memcg->swap.max) - page_counter_read(&memcg->swap)); return nr_swap_pages; } @@ -6095,7 +6095,7 @@ bool mem_cgroup_swap_full(struct page *page) return false; for (; memcg != root_mem_cgroup; memcg = parent_mem_cgroup(memcg)) - if (page_counter_read(&memcg->swap) * 2 >= memcg->swap.limit) + if 
(page_counter_read(&memcg->swap) * 2 >= memcg->swap.max) return true; return false; @@ -6129,7 +6129,7 @@ static u64 swap_current_read(struct cgroup_subsys_state *css, static int swap_max_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long max = READ_ONCE(memcg->swap.limit); + unsigned long max = READ_ONCE(memcg->swap.max); if (max == PAGE_COUNTER_MAX) seq_puts(m, "max\n"); @@ -6151,9 +6151,9 @@ static ssize_t swap_max_write(struct kernfs_open_file *of, if (err) return err; - mutex_lock(&memcg_limit_mutex); - err = page_counter_limit(&memcg->swap, max); - mutex_unlock(&memcg_limit_mutex); + mutex_lock(&memcg_max_mutex); + err = page_counter_set_max(&memcg->swap, max); + mutex_unlock(&memcg_max_mutex); if (err) return err; diff --git a/mm/oom_kill.c b/mm/oom_kill.c index 8ba6cb88cf58..6694348b27e9 100644 --- a/mm/oom_kill.c +++ b/mm/oom_kill.c @@ -256,7 +256,7 @@ static enum oom_constraint constrained_alloc(struct oom_control *oc) int nid; if (is_memcg_oom(oc)) { - oc->totalpages = mem_cgroup_get_limit(oc->memcg) ?: 1; + oc->totalpages = mem_cgroup_get_max(oc->memcg) ?: 1; return CONSTRAINT_MEMCG; } diff --git a/mm/page_counter.c b/mm/page_counter.c index 2a8df3ad60a4..41937c9a9d11 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -22,7 +22,7 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) { long new; - new = atomic_long_sub_return(nr_pages, &counter->count); + new = atomic_long_sub_return(nr_pages, &counter->usage); /* More uncharges than charges? */ WARN_ON_ONCE(new < 0); } @@ -41,7 +41,7 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) for (c = counter; c; c = c->parent) { long new; - new = atomic_long_add_return(nr_pages, &c->count); + new = atomic_long_add_return(nr_pages, &c->usage); /* * This is indeed racy, but we can live with some * inaccuracy in the watermark. @@ -82,9 +82,9 @@ bool page_counter_try_charge(struct page_counter *counter, * we either see the new limit or the setter sees the * counter has changed and retries. */ - new = atomic_long_add_return(nr_pages, &c->count); - if (new > c->limit) { - atomic_long_sub(nr_pages, &c->count); + new = atomic_long_add_return(nr_pages, &c->usage); + if (new > c->max) { + atomic_long_sub(nr_pages, &c->usage); /* * This is racy, but we can live with some * inaccuracy in the failcnt. @@ -123,20 +123,20 @@ void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages) } /** - * page_counter_limit - limit the number of pages allowed + * page_counter_set_max - set the maximum number of pages allowed * @counter: counter - * @limit: limit to set + * @nr_pages: limit to set * * Returns 0 on success, -EBUSY if the current number of pages on the * counter already exceeds the specified limit. * * The caller must serialize invocations on the same counter. */ -int page_counter_limit(struct page_counter *counter, unsigned long limit) +int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages) { for (;;) { unsigned long old; - long count; + long usage; /* * Update the limit while making sure that it's not @@ -149,17 +149,17 @@ int page_counter_limit(struct page_counter *counter, unsigned long limit) * the limit, so if it sees the old limit, we see the * modified counter and retry. 
*/ - count = atomic_long_read(&counter->count); + usage = atomic_long_read(&counter->usage); - if (count > limit) + if (usage > nr_pages) return -EBUSY; - old = xchg(&counter->limit, limit); + old = xchg(&counter->max, nr_pages); - if (atomic_long_read(&counter->count) <= count) + if (atomic_long_read(&counter->usage) <= usage) return 0; - counter->limit = old; + counter->max = old; cond_resched(); } } From 230671533d64631116be3ff9d407bd9ca5a58e1b Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 7 Jun 2018 17:06:22 -0700 Subject: [PATCH 031/118] mm: memory.low hierarchical behavior This patch aims to address an issue in the current memory.low semantics which makes it hard to use in a hierarchy where some leaf memory cgroups are more valuable than others. For example, there are memcgs A, A/B, A/C, A/D and A/E: A A/memory.low = 2G, A/memory.current = 6G //\\ BC DE B/memory.low = 3G B/memory.current = 2G C/memory.low = 1G C/memory.current = 2G D/memory.low = 0 D/memory.current = 2G E/memory.low = 10G E/memory.current = 0 If we apply memory pressure, B, C and D are reclaimed at the same pace while A's usage exceeds 2G. This is obviously wrong, as B's usage is fully below B's memory.low, and C has 1G of protection as well. Also, A is pushed down to a size below its own 2G memory.low, which is also wrong. A simple bash script (provided below) can be used to reproduce the problem. Current results are: A: 1430097920 A/B: 711929856 A/C: 717426688 A/D: 741376 A/E: 0 To address the issue, a concept of effective memory.low is introduced. Effective memory.low is always equal to or less than the original memory.low. When there is no memory.low overcommitment (and also for top-level cgroups), these two values are equal. Otherwise it's a part of the parent's effective memory.low, calculated as the cgroup's memory.low usage divided by the sum of the siblings' memory.low usages (by memory.low usage I mean the size of actually protected memory: memory.current if memory.current < memory.low, 0 otherwise). It's necessary to track the actual usage, because otherwise an empty cgroup with memory.low set (A/E in my example) would affect the actual memory distribution, which makes no sense. To avoid traversing the cgroup tree twice, the page_counter code is reused. Calculating effective memory.low can be done in the reclaim path, as we are already traversing the cgroup tree from top to bottom there and checking memory.low on each level. So it's a perfect place to calculate the effective memory.low and save it for use by child cgroups. This also eliminates the need to traverse the cgroup tree from bottom to top each time to check whether the parent's guarantee is exceeded. Setting/resetting the effective memory.low is intentionally racy, but that's fine and shouldn't lead to any significant differences in the actual memory distribution.
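For concreteness, here is the arithmetic for the example above (a sketch following the elow formula introduced by the patch; it assumes A's effective low is its full 2G, since there is no overcommitment above A):

  low_usage(B) = min(2G, 3G)  = 2G
  low_usage(C) = min(2G, 1G)  = 1G
  low_usage(D) = min(2G, 0)   = 0
  low_usage(E) = min(0, 10G)  = 0
  siblings_low_usage          = 2G + 1G = 3G

  elow(B) = min(3G, 2G * 2G/3G) ~= 1.33G
  elow(C) = min(1G, 2G * 1G/3G) ~= 0.67G
  elow(D) = elow(E) = 0

which matches the measured A/B (~1.33G) and A/C (~0.67G) figures below.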
With this patch applied results are matching the expectations: A: 2147930112 A/B: 1428721664 A/C: 718393344 A/D: 815104 A/E: 0 Test script: #!/bin/bash CGPATH="/sys/fs/cgroup" truncate /file1 --size 2G truncate /file2 --size 2G truncate /file3 --size 2G truncate /file4 --size 50G mkdir "${CGPATH}/A" echo "+memory" > "${CGPATH}/A/cgroup.subtree_control" mkdir "${CGPATH}/A/B" "${CGPATH}/A/C" "${CGPATH}/A/D" "${CGPATH}/A/E" echo 2G > "${CGPATH}/A/memory.low" echo 3G > "${CGPATH}/A/B/memory.low" echo 1G > "${CGPATH}/A/C/memory.low" echo 0 > "${CGPATH}/A/D/memory.low" echo 10G > "${CGPATH}/A/E/memory.low" echo $$ > "${CGPATH}/A/B/cgroup.procs" && vmtouch -qt /file1 echo $$ > "${CGPATH}/A/C/cgroup.procs" && vmtouch -qt /file2 echo $$ > "${CGPATH}/A/D/cgroup.procs" && vmtouch -qt /file3 echo $$ > "${CGPATH}/cgroup.procs" && vmtouch -qt /file4 echo "A: " `cat "${CGPATH}/A/memory.current"` echo "A/B: " `cat "${CGPATH}/A/B/memory.current"` echo "A/C: " `cat "${CGPATH}/A/C/memory.current"` echo "A/D: " `cat "${CGPATH}/A/D/memory.current"` echo "A/E: " `cat "${CGPATH}/A/E/memory.current"` rmdir "${CGPATH}/A/B" "${CGPATH}/A/C" "${CGPATH}/A/D" "${CGPATH}/A/E" rmdir "${CGPATH}/A" rm /file1 /file2 /file3 /file4 Link: http://lkml.kernel.org/r/20180405185921.4942-2-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 3 +- include/linux/page_counter.h | 7 +++ mm/memcontrol.c | 112 ++++++++++++++++++++++++++--------- mm/page_counter.c | 43 ++++++++++++++ 4 files changed, 134 insertions(+), 31 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 577a19a6a93b..891945507044 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -181,8 +181,7 @@ struct mem_cgroup { struct page_counter kmem; struct page_counter tcpmem; - /* Normal memory consumption range */ - unsigned long low; + /* Upper bound of normal memory consumption range */ unsigned long high; /* Range enforcement for interrupt charges */ diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 94029dad9317..7902a727d3b6 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -9,8 +9,14 @@ struct page_counter { atomic_long_t usage; unsigned long max; + unsigned long low; struct page_counter *parent; + /* effective memory.low and memory.low usage tracking */ + unsigned long elow; + atomic_long_t low_usage; + atomic_long_t children_low_usage; + /* legacy */ unsigned long watermark; unsigned long failcnt; @@ -42,6 +48,7 @@ bool page_counter_try_charge(struct page_counter *counter, struct page_counter **fail); void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages); +void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages); int page_counter_memparse(const char *buf, const char *max, unsigned long *nr_pages); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 79bb5aeaa800..d0261059aab9 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4270,7 +4270,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) } spin_unlock(&memcg->event_list_lock); - memcg->low = 0; + page_counter_set_low(&memcg->memory, 0); memcg_offline_kmem(memcg); wb_memcg_offline(memcg); @@ -4319,12 +4319,12 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) { struct 
mem_cgroup *memcg = mem_cgroup_from_css(css); - memcg->low = 0; page_counter_set_max(&memcg->memory, PAGE_COUNTER_MAX); page_counter_set_max(&memcg->swap, PAGE_COUNTER_MAX); page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX); page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); + page_counter_set_low(&memcg->memory, 0); memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; memcg_wb_domain_size_changed(memcg); @@ -5064,7 +5064,7 @@ static u64 memory_current_read(struct cgroup_subsys_state *css, static int memory_low_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); - unsigned long low = READ_ONCE(memcg->low); + unsigned long low = READ_ONCE(memcg->memory.low); if (low == PAGE_COUNTER_MAX) seq_puts(m, "max\n"); @@ -5086,7 +5086,7 @@ static ssize_t memory_low_write(struct kernfs_open_file *of, if (err) return err; - memcg->low = low; + page_counter_set_low(&memcg->memory, low); return nbytes; } @@ -5348,36 +5348,72 @@ struct cgroup_subsys memory_cgrp_subsys = { * @root: the top ancestor of the sub-tree being checked * @memcg: the memory cgroup to check * - * Returns %true if memory consumption of @memcg, and that of all - * ancestors up to (but not including) @root, is below the normal range. + * WARNING: This function is not stateless! It can only be used as part + * of a top-down tree iteration, not for isolated queries. * - * @root is exclusive; it is never low when looked at directly and isn't - * checked when traversing the hierarchy. + * Returns %true if memory consumption of @memcg is below the normal range. * - * Excluding @root enables using memory.low to prioritize memory usage - * between cgroups within a subtree of the hierarchy that is limited by - * memory.high or memory.max. + * @root is exclusive; it is never low when looked at directly * - * For example, given cgroup A with children B and C: + * To provide a proper hierarchical behavior, effective memory.low value + * is used. * - * A - * / \ - * B C + * Effective memory.low is always equal or less than the original memory.low. + * If there is no memory.low overcommittment (which is always true for + * top-level memory cgroups), these two values are equal. + * Otherwise, it's a part of parent's effective memory.low, + * calculated as a cgroup's memory.low usage divided by sum of sibling's + * memory.low usages, where memory.low usage is the size of actually + * protected memory. * - * and + * low_usage + * elow = min( memory.low, parent->elow * ------------------ ), + * siblings_low_usage * - * 1. A/memory.current > A/memory.high - * 2. A/B/memory.current < A/B/memory.low - * 3. A/C/memory.current >= A/C/memory.low + * | memory.current, if memory.current < memory.low + * low_usage = | + | 0, otherwise. * - * As 'A' is high, i.e. triggers reclaim from 'A', and 'B' is low, we - * should reclaim from 'C' until 'A' is no longer high or until we can - * no longer reclaim from 'C'. If 'A', i.e. @root, isn't excluded by - * mem_cgroup_low when reclaming from 'A', then 'B' won't be considered - * low and we will reclaim indiscriminately from both 'B' and 'C'. + * + * Such definition of the effective memory.low provides the expected + * hierarchical behavior: parent's memory.low value is limiting + * children, unprotected memory is reclaimed first and cgroups, + * which are not using their guarantee do not affect actual memory + * distribution. 
+ * + * For example, if there are memcgs A, A/B, A/C, A/D and A/E: + * + * A A/memory.low = 2G, A/memory.current = 6G + * //\\ + * BC DE B/memory.low = 3G B/memory.current = 2G + * C/memory.low = 1G C/memory.current = 2G + * D/memory.low = 0 D/memory.current = 2G + * E/memory.low = 10G E/memory.current = 0 + * + * and the memory pressure is applied, the following memory distribution + * is expected (approximately): + * + * A/memory.current = 2G + * + * B/memory.current = 1.3G + * C/memory.current = 0.6G + * D/memory.current = 0 + * E/memory.current = 0 + * + * These calculations require constant tracking of the actual low usages + * (see propagate_low_usage()), as well as recursive calculation of + * effective memory.low values. But as we do call mem_cgroup_low() + * path for each memory cgroup top-down from the reclaim, + * it's possible to optimize this part, and save calculated elow + * for next usage. This part is intentionally racy, but it's ok, + * as memory.low is a best-effort mechanism. */ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) { + unsigned long usage, low_usage, siblings_low_usage; + unsigned long elow, parent_elow; + struct mem_cgroup *parent; + if (mem_cgroup_disabled()) return false; @@ -5386,12 +5422,30 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) if (memcg == root) return false; - for (; memcg != root; memcg = parent_mem_cgroup(memcg)) { - if (page_counter_read(&memcg->memory) >= memcg->low) - return false; - } + elow = memcg->memory.low; + usage = page_counter_read(&memcg->memory); + parent = parent_mem_cgroup(memcg); - return true; + if (parent == root) + goto exit; + + parent_elow = READ_ONCE(parent->memory.elow); + elow = min(elow, parent_elow); + + if (!elow || !parent_elow) + goto exit; + + low_usage = min(usage, memcg->memory.low); + siblings_low_usage = atomic_long_read( + &parent->memory.children_low_usage); + + if (!low_usage || !siblings_low_usage) + goto exit; + + elow = min(elow, parent_elow * low_usage / siblings_low_usage); +exit: + memcg->memory.elow = elow; + return usage < elow; } /** diff --git a/mm/page_counter.c b/mm/page_counter.c index 41937c9a9d11..a5ff4cbc355a 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -13,6 +13,28 @@ #include #include +static void propagate_low_usage(struct page_counter *c, unsigned long usage) +{ + unsigned long low_usage, old; + long delta; + + if (!c->parent) + return; + + if (!c->low && !atomic_long_read(&c->low_usage)) + return; + + if (usage <= c->low) + low_usage = usage; + else + low_usage = 0; + + old = atomic_long_xchg(&c->low_usage, low_usage); + delta = low_usage - old; + if (delta) + atomic_long_add(delta, &c->parent->children_low_usage); +} + /** * page_counter_cancel - take pages out of the local counter * @counter: counter @@ -23,6 +45,7 @@ void page_counter_cancel(struct page_counter *counter, unsigned long nr_pages) long new; new = atomic_long_sub_return(nr_pages, &counter->usage); + propagate_low_usage(counter, new); /* More uncharges than charges? */ WARN_ON_ONCE(new < 0); } @@ -42,6 +65,7 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) long new; new = atomic_long_add_return(nr_pages, &c->usage); + propagate_low_usage(counter, new); /* * This is indeed racy, but we can live with some * inaccuracy in the watermark. 
@@ -85,6 +109,7 @@ bool page_counter_try_charge(struct page_counter *counter, new = atomic_long_add_return(nr_pages, &c->usage); if (new > c->max) { atomic_long_sub(nr_pages, &c->usage); + propagate_low_usage(counter, new); /* * This is racy, but we can live with some * inaccuracy in the failcnt. @@ -93,6 +118,7 @@ bool page_counter_try_charge(struct page_counter *counter, *fail = c; goto failed; } + propagate_low_usage(counter, new); /* * Just like with failcnt, we can live with some * inaccuracy in the watermark. @@ -164,6 +190,23 @@ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages) } } +/** + * page_counter_set_low - set the amount of protected memory + * @counter: counter + * @nr_pages: value to set + * + * The caller must serialize invocations on the same counter. + */ +void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages) +{ + struct page_counter *c; + + counter->low = nr_pages; + + for (c = counter; c; c = c->parent) + propagate_low_usage(c, atomic_long_read(&c->usage)); +} + /** * page_counter_memparse - memparse() for page counter limits * @buf: string to parse From 5f93ad67436b3da8312db5ad4eedfa739c00787f Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 7 Jun 2018 17:06:26 -0700 Subject: [PATCH 032/118] mm: treat memory.low value inclusive If memcg's usage is equal to the memory.low value, avoid reclaiming from this cgroup while there is a surplus of reclaimable memory. This sounds more logical and also matches memory.high and memory.max behavior: both are inclusive. Empty cgroups are not considered protected, so MEMCG_LOW events are not emitted for empty cgroups, if there is no more reclaimable memory in the system. Link: http://lkml.kernel.org/r/20180406122132.GA7185@castle Signed-off-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index d0261059aab9..8bab4660e6aa 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -5344,14 +5344,14 @@ struct cgroup_subsys memory_cgrp_subsys = { }; /** - * mem_cgroup_low - check if memory consumption is below the normal range + * mem_cgroup_low - check if memory consumption is in the normal range * @root: the top ancestor of the sub-tree being checked * @memcg: the memory cgroup to check * * WARNING: This function is not stateless! It can only be used as part * of a top-down tree iteration, not for isolated queries. * - * Returns %true if memory consumption of @memcg is below the normal range. + * Returns %true if memory consumption of @memcg is in the normal range. * * @root is exclusive; it is never low when looked at directly * @@ -5445,7 +5445,7 @@ bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) elow = min(elow, parent_elow * low_usage / siblings_low_usage); exit: memcg->memory.elow = elow; - return usage < elow; + return usage && usage <= elow; } /** From 7854207fe9545181b048df4e684def36306a86ec Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 7 Jun 2018 17:06:29 -0700 Subject: [PATCH 033/118] mm/docs: describe memory.low refinements Refine cgroup v2 docs after latest memory.low changes. 
Link: http://lkml.kernel.org/r/20180405185921.4942-4-guro@fb.com Signed-off-by: Roman Gushchin Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 28 ++++++++++++------------- 1 file changed, 13 insertions(+), 15 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index b0dda10b9382..7b56ca80e37a 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1005,10 +1005,17 @@ PAGE_SIZE multiple when read back. A read-write single value file which exists on non-root cgroups. The default is "0". - Best-effort memory protection. If the memory usages of a - cgroup and all its ancestors are below their low boundaries, - the cgroup's memory won't be reclaimed unless memory can be - reclaimed from unprotected cgroups. + Best-effort memory protection. If the memory usage of a + cgroup is within its effective low boundary, the cgroup's + memory won't be reclaimed unless memory can be reclaimed + from unprotected cgroups. + + Effective low boundary is limited by memory.low values of + all ancestor cgroups. If there is memory.low overcommitment + (child cgroup or cgroups are requiring more protected memory, + than parent will allow), then each child cgroup will get + the part of parent's protection proportional to the its + actual memory usage below memory.low. Putting more memory than generally available under this protection is discouraged. @@ -1950,17 +1957,8 @@ system performance due to overreclaim, to the point where the feature becomes self-defeating. The memory.low boundary on the other hand is a top-down allocated -reserve. A cgroup enjoys reclaim protection when it and all its -ancestors are below their low boundaries, which makes delegation of -subtrees possible. Secondly, new cgroups have no reserve per default -and in the common case most cgroups are eligible for the preferred -reclaim pass. This allows the new low boundary to be efficiently -implemented with just a minor addition to the generic reclaim code, -without the need for out-of-band data structures and reclaim passes. -Because the generic reclaim code considers all cgroups except for the -ones running low in the preferred first reclaim pass, overreclaim of -individual groups is eliminated as well, resulting in much better -overall workload performance. +reserve. A cgroup enjoys reclaim protection when it's within its low, +which makes delegation of subtrees possible. The original high boundary, the hard limit, is defined as a strict limit that can not budge, even if the OOM killer has to be called. From 688272809fcce5b17fcefd5892b59f3788efb144 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 7 Jun 2018 17:06:34 -0700 Subject: [PATCH 034/118] mm, gup: prevent pmd checking race in follow_pmd_mask() mmap_sem will be read locked when calling follow_pmd_mask(). But this cannot prevent PMD from being changed for all cases when PTL is unlocked, for example, from pmd_trans_huge() to pmd_none() via MADV_DONTNEED. So it is possible for the pmd_present() check in follow_pmd_mask() to encounter an invalid PMD. This may cause an incorrect VM_BUG_ON() or an infinite loop. Fix this by reading the PMD entry into a local variable with READ_ONCE() and checking the local variable and pmd_none() in the retry loop. 
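The core of the fix, reduced to a sketch (the full diff below applies it to every unlocked *pmd access in follow_pmd_mask()):

  pmd_t pmdval;

  /* Snapshot the entry once; re-dereferencing *pmd with the PTL
   * unlocked could observe a value that changes between checks.
   */
  pmdval = READ_ONCE(*pmd);
  if (pmd_none(pmdval))
  	return no_page_table(vma, flags);
  /* ... every later unlocked check tests pmdval, not *pmd ... */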
As Kirill pointed out, with PTL unlocked, the *pmd may be changed under us, so reading it directly again and again may incur weird bugs. So although using *pmd directly other than for pmd_present() checking may be safe, it is still better to replace them: read *pmd once and check the local variable multiple times. Replacing all uses of *pmd with the local variable while the PTL is unlocked was suggested by Kirill. Link: http://lkml.kernel.org/r/20180419083514.1365-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Zi Yan Cc: "Kirill A. Shutemov" Cc: Al Viro Cc: "Aneesh Kumar K.V" Cc: Dan Williams Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/gup.c | 38 +++++++++++++++++++++++++++----------- 1 file changed, 27 insertions(+), 11 deletions(-) diff --git a/mm/gup.c b/mm/gup.c index 010153989b9b..1020c7f8f5ee 100644 --- a/mm/gup.c +++ b/mm/gup.c @@ -212,53 +212,69 @@ static struct page *follow_pmd_mask(struct vm_area_struct *vma, unsigned long address, pud_t *pudp, unsigned int flags, unsigned int *page_mask) { - pmd_t *pmd; + pmd_t *pmd, pmdval; spinlock_t *ptl; struct page *page; struct mm_struct *mm = vma->vm_mm; pmd = pmd_offset(pudp, address); - if (pmd_none(*pmd)) + /* + * The READ_ONCE() will stabilize the pmdval in a register or + * on the stack so that it will stop changing under the code. + */ + pmdval = READ_ONCE(*pmd); + if (pmd_none(pmdval)) return no_page_table(vma, flags); - if (pmd_huge(*pmd) && vma->vm_flags & VM_HUGETLB) { + if (pmd_huge(pmdval) && vma->vm_flags & VM_HUGETLB) { page = follow_huge_pmd(mm, address, pmd, flags); if (page) return page; return no_page_table(vma, flags); } - if (is_hugepd(__hugepd(pmd_val(*pmd)))) { + if (is_hugepd(__hugepd(pmd_val(pmdval)))) { page = follow_huge_pd(vma, address, - __hugepd(pmd_val(*pmd)), flags, + __hugepd(pmd_val(pmdval)), flags, PMD_SHIFT); if (page) return page; return no_page_table(vma, flags); } retry: - if (!pmd_present(*pmd)) { + if (!pmd_present(pmdval)) { if (likely(!(flags & FOLL_MIGRATION))) return no_page_table(vma, flags); VM_BUG_ON(thp_migration_supported() && - !is_pmd_migration_entry(*pmd)); - if (is_pmd_migration_entry(*pmd)) + !is_pmd_migration_entry(pmdval)); + if (is_pmd_migration_entry(pmdval)) pmd_migration_entry_wait(mm, pmd); + pmdval = READ_ONCE(*pmd); + /* + * MADV_DONTNEED may convert the pmd to null because + * mmap_sem is held in read mode + */ + if (pmd_none(pmdval)) + return no_page_table(vma, flags); goto retry; } - if (pmd_devmap(*pmd)) { + if (pmd_devmap(pmdval)) { ptl = pmd_lock(mm, pmd); page = follow_devmap_pmd(vma, address, pmd, flags); spin_unlock(ptl); if (page) return page; } - if (likely(!pmd_trans_huge(*pmd))) + if (likely(!pmd_trans_huge(pmdval))) return follow_page_pte(vma, address, pmd, flags); - if ((flags & FOLL_NUMA) && pmd_protnone(*pmd)) + if ((flags & FOLL_NUMA) && pmd_protnone(pmdval)) return no_page_table(vma, flags); retry_locked: ptl = pmd_lock(mm, pmd); + if (unlikely(pmd_none(*pmd))) { + spin_unlock(ptl); + return no_page_table(vma, flags); + } if (unlikely(!pmd_present(*pmd))) { spin_unlock(ptl); if (likely(!(flags & FOLL_MIGRATION))) From d538c164fc018bdb79f07f904fa9d48acf18bb0a Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 7 Jun 2018 17:06:39 -0700 Subject: [PATCH 035/118] mm/sparse.c: check __highest_present_section_nr only for a present section When searching for a present section, there are two boundaries: * __highest_present_section_nr * NR_MEM_SECTIONS And it is known that __highest_present_section_nr is a stricter boundary than NR_MEM_SECTIONS.
This means it is only necessary to check __highest_present_section_nr. Link: http://lkml.kernel.org/r/20180326081956.75275-2-git-send-email-richard.weiyang@gmail.com Signed-off-by: Wei Yang Acked-by: David Rientjes Reviewed-by: Andrew Morton Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 4 +--- 1 file changed, 1 insertion(+), 3 deletions(-) diff --git a/mm/sparse.c b/mm/sparse.c index 73dc2fcc0eab..3570ff294ab1 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -190,15 +190,13 @@ static inline int next_present_section_nr(int section_nr) section_nr++; if (present_section_nr(section_nr)) return section_nr; - } while ((section_nr < NR_MEM_SECTIONS) && - (section_nr <= __highest_present_section_nr)); + } while ((section_nr <= __highest_present_section_nr)); return -1; } #define for_each_present_section_nr(start, section_nr) \ for (section_nr = next_present_section_nr(start-1); \ ((section_nr >= 0) && \ - (section_nr < NR_MEM_SECTIONS) && \ (section_nr <= __highest_present_section_nr)); \ section_nr = next_present_section_nr(section_nr)) From 08994b24673b6ae33ee40fc3b5e265c6762848e4 Mon Sep 17 00:00:00 2001 From: Wei Yang Date: Thu, 7 Jun 2018 17:06:43 -0700 Subject: [PATCH 036/118] mm/sparse.c: pass the __highest_present_section_nr + 1 to alloc_func() In commit c4e1be9ec113 ("mm, sparsemem: break out of loops early") __highest_present_section_nr was introduced to reduce the loop count when searching for present sections. This is also helpful for usemap and memmap allocation. This patch uses __highest_present_section_nr + 1 to optimize the loop. Link: http://lkml.kernel.org/r/20180326081956.75275-1-git-send-email-richard.weiyang@gmail.com Signed-off-by: Wei Yang Reviewed-by: Andrew Morton Cc: David Rientjes Cc: Dave Hansen Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/sparse.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/sparse.c b/mm/sparse.c index 3570ff294ab1..f13f2723950a 100644 --- a/mm/sparse.c +++ b/mm/sparse.c @@ -522,7 +522,7 @@ static void __init alloc_usemap_and_memmap(void (*alloc_func) map_count = 1; } /* ok, last chunk */ - alloc_func(data, pnum_begin, NR_MEM_SECTIONS, + alloc_func(data, pnum_begin, __highest_present_section_nr+1, map_count, nodeid_begin); } From 82a2e924ff2c6accbc840dfa46c42b24da457a31 Mon Sep 17 00:00:00 2001 From: Chintan Pandya Date: Thu, 7 Jun 2018 17:06:46 -0700 Subject: [PATCH 037/118] mm: vmalloc: clean up vunmap to avoid pgtable ops twice vunmap does page table clear operations twice in the case when DEBUG_PAGEALLOC_ENABLE_DEFAULT is enabled. So clean up the code, as that is unintended. As a perf gain, we save a few microseconds. The ftrace data below was obtained while doing 1 MB of vmalloc/vfree on an ARM64-based SoC *without* this patch applied. After this patch, we can save ~3 us (on 1 extra vunmap_page_range). CPU DURATION FUNCTION CALLS | | | | | | | 6) | __vunmap() { 6) | vmap_debug_free_range() { 6) 3.281 us | vunmap_page_range(); 6) + 45.468 us | } 6) 2.760 us | vunmap_page_range(); 6) !
505.105 us | } [cpandya@codeaurora.org: v3] Link: http://lkml.kernel.org/r/1525176960-18408-1-git-send-email-cpandya@codeaurora.org Link: http://lkml.kernel.org/r/1523876342-10545-1-git-send-email-cpandya@codeaurora.org Signed-off-by: Chintan Pandya Reviewed-by: Andrew Morton Cc: Vlastimil Babka Cc: Laura Abbott Cc: Catalin Marinas Cc: Johannes Weiner Cc: Florian Fainelli Cc: Yisheng Xie Cc: Ard Biesheuvel Cc: Wei Yang Cc: Byungchul Park Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 29 +++++++---------------------- 1 file changed, 7 insertions(+), 22 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 63a5f502da08..12bd82e6554e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -603,26 +603,6 @@ static void unmap_vmap_area(struct vmap_area *va) vunmap_page_range(va->va_start, va->va_end); } -static void vmap_debug_free_range(unsigned long start, unsigned long end) -{ - /* - * Unmap page tables and force a TLB flush immediately if pagealloc - * debugging is enabled. This catches use after free bugs similarly to - * those in linear kernel virtual address space after a page has been - * freed. - * - * All the lazy freeing logic is still retained, in order to minimise - * intrusiveness of this debugging feature. - * - * This is going to be *slow* (linear kernel virtual address debugging - * doesn't do a broadcast TLB flush so it is a lot faster). - */ - if (debug_pagealloc_enabled()) { - vunmap_page_range(start, end); - flush_tlb_kernel_range(start, end); - } -} - /* * lazy_max_pages is the maximum amount of virtual address space we gather up * before attempting to purge with a TLB flush. @@ -756,6 +736,9 @@ static void free_unmap_vmap_area(struct vmap_area *va) { flush_cache_vunmap(va->va_start, va->va_end); unmap_vmap_area(va); + if (debug_pagealloc_enabled()) + flush_tlb_kernel_range(va->va_start, va->va_end); + free_vmap_area_noflush(va); } @@ -1053,6 +1036,10 @@ static void vb_free(const void *addr, unsigned long size) vunmap_page_range((unsigned long)addr, (unsigned long)addr + size); + if (debug_pagealloc_enabled()) + flush_tlb_kernel_range((unsigned long)addr, + (unsigned long)addr + size); + spin_lock(&vb->lock); /* Expand dirty range */ @@ -1142,7 +1129,6 @@ void vm_unmap_ram(const void *mem, unsigned int count) BUG_ON(!PAGE_ALIGNED(addr)); debug_check_no_locks_freed(mem, size); - vmap_debug_free_range(addr, addr+size); if (likely(count <= VMAP_MAX_ALLOC)) { vb_free(mem, size); @@ -1499,7 +1485,6 @@ struct vm_struct *remove_vm_area(const void *addr) va->flags |= VM_LAZY_FREE; spin_unlock(&vmap_area_lock); - vmap_debug_free_range(va->va_start, va->va_end); kasan_free_shadow(vm); free_unmap_vmap_area(va); From f3c01d2f3ade6790db67f80fef60df84424f8964 Mon Sep 17 00:00:00 2001 From: Chintan Pandya Date: Thu, 7 Jun 2018 17:06:50 -0700 Subject: [PATCH 038/118] mm: vmalloc: avoid racy handling of debugobjects in vunmap Currently, the __vunmap flow is: 1) Release the VM area 2) Free the debug objects corresponding to that vm area. This leaves a race window open: 1) Release the VM area 1.5) Some other client gets the same vm area 1.6) This client allocates new debug objects on the same vm area 2) Free the debug objects corresponding to this vm area. Here, we actually free the other client's debug objects. Fix this by freeing the debug objects first and then releasing the VM area.
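Sketched as a timeline (hypothetical CPUs A and B):

  /*  CPU A: __vunmap(addr)           CPU B
   *
   *  remove_vm_area(addr)            <- VM area released
   *                                   allocates the same area
   *                                   creates debug objects on it
   *  debug_check_no_obj_freed()      <- frees CPU B's objects
   *
   *  With the fix, debug_check_no_obj_freed() runs before
   *  remove_vm_area(), so the objects freed are always our own.
   */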
Link: http://lkml.kernel.org/r/1523961828-9485-2-git-send-email-cpandya@codeaurora.org Signed-off-by: Chintan Pandya Reviewed-by: Andrew Morton Cc: Ard Biesheuvel Cc: Byungchul Park Cc: Catalin Marinas Cc: Florian Fainelli Cc: Johannes Weiner Cc: Laura Abbott Cc: Vlastimil Babka Cc: Wei Yang Cc: Yisheng Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 12bd82e6554e..4df66e1abeb1 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1504,7 +1504,7 @@ static void __vunmap(const void *addr, int deallocate_pages) addr)) return; - area = remove_vm_area(addr); + area = find_vmap_area((unsigned long)addr)->vm; if (unlikely(!area)) { WARN(1, KERN_ERR "Trying to vfree() nonexistent vm area (%p)\n", addr); @@ -1514,6 +1514,7 @@ static void __vunmap(const void *addr, int deallocate_pages) debug_check_no_locks_freed(addr, get_vm_area_size(area)); debug_check_no_obj_freed(addr, get_vm_area_size(area)); + remove_vm_area(addr); if (deallocate_pages) { int i; From 05e3ff9505858a39dc696ca195b5d79e524aac03 Mon Sep 17 00:00:00 2001 From: Chintan Pandya Date: Thu, 7 Jun 2018 17:06:53 -0700 Subject: [PATCH 039/118] mm: vmalloc: pass proper vm_start into debugobjects Client can call vunmap with some intermediate 'addr' which may not be the start of the VM area. Entire unmap code works with vm->vm_start which is proper but debug object API is called with 'addr'. This could be a problem within debug objects. Pass proper start address into debug object API. [akpm@linux-foundation.org: fix warning] Link: http://lkml.kernel.org/r/1523961828-9485-3-git-send-email-cpandya@codeaurora.org Signed-off-by: Chintan Pandya Reviewed-by: Andrew Morton Cc: Ard Biesheuvel Cc: Byungchul Park Cc: Catalin Marinas Cc: Florian Fainelli Cc: Johannes Weiner Cc: Laura Abbott Cc: Vlastimil Babka Cc: Wei Yang Cc: Yisheng Xie Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmalloc.c | 9 +++++---- 1 file changed, 5 insertions(+), 4 deletions(-) diff --git a/mm/vmalloc.c b/mm/vmalloc.c index 4df66e1abeb1..89efac3a020e 100644 --- a/mm/vmalloc.c +++ b/mm/vmalloc.c @@ -1128,15 +1128,16 @@ void vm_unmap_ram(const void *mem, unsigned int count) BUG_ON(addr > VMALLOC_END); BUG_ON(!PAGE_ALIGNED(addr)); - debug_check_no_locks_freed(mem, size); - if (likely(count <= VMAP_MAX_ALLOC)) { + debug_check_no_locks_freed(mem, size); vb_free(mem, size); return; } va = find_vmap_area(addr); BUG_ON(!va); + debug_check_no_locks_freed((void *)va->va_start, + (va->va_end - va->va_start)); free_unmap_vmap_area(va); } EXPORT_SYMBOL(vm_unmap_ram); @@ -1511,8 +1512,8 @@ static void __vunmap(const void *addr, int deallocate_pages) return; } - debug_check_no_locks_freed(addr, get_vm_area_size(area)); - debug_check_no_obj_freed(addr, get_vm_area_size(area)); + debug_check_no_locks_freed(area->addr, get_vm_area_size(area)); + debug_check_no_obj_freed(area->addr, get_vm_area_size(area)); remove_vm_area(addr); if (deallocate_pages) { From 89fdcd262fd40712da65e922558872a12b203332 Mon Sep 17 00:00:00 2001 From: Yang Shi Date: Thu, 7 Jun 2018 17:06:59 -0700 Subject: [PATCH 040/118] mm: shmem: make stat.st_blksize return huge page size if THP is on Since tmpfs THP was supported in 4.8, hugetlbfs is not the only filesystem with huge page support anymore. tmpfs can use huge page via THP when mounting by "huge=" mount option. 
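For reference, huge page use on tmpfs is decided per mount via that option; a hypothetical invocation (mount point made up for illustration):

    $ mount -t tmpfs -o huge=always tmpfs /mnt/tmpfs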
When applications use huge pages on hugetlbfs, they just need to check the filesystem magic number, but that is not enough for tmpfs. Make stat.st_blksize return the huge page size if the filesystem is mounted with an appropriate "huge=" option, to give applications a hint to optimize their behavior with THP. Some applications may not behave wisely with THP. For example, QEMU may mmap a file at a non-huge-page-aligned hint address with MAP_FIXED, which results in no pages being PMD mapped even though THP is used. Some applications may mmap a file with a non-huge-page-aligned offset. Both behaviors make THP pointless. statfs.f_bsize still returns 4KB for tmpfs since THP could be split, and tmpfs may also fall back to 4KB pages silently if there are not enough huge pages. Furthermore, a different f_bsize makes the max_blocks and free_blocks calculations harder without much benefit. Returning the huge page size via stat.st_blksize sounds good enough. Since PUD-size huge pages are not yet supported for THP, this just returns HPAGE_PMD_SIZE for now. Hugh said: : Sorry, I have no enthusiasm for this patch; but do I feel strongly : enough to override you and everyone else to NAK it? No, I don't feel : that strongly, maybe st_blksize isn't worth arguing over. : : We did look at struct stat when designing huge tmpfs, to see if there : were any fields that should be adjusted for it; but concluded none. : Yes, it would sometimes be nice to have a quickly accessible indicator : for when tmpfs has been mounted huge (scanning /proc/mounts for options : can be tiresome, agreed); but since tmpfs tries to supply huge (or not) : pages transparently, no difference seemed right. : : So, because st_blksize is a not very useful field of struct stat, with : "size" in the name, we're going to put HPAGE_PMD_SIZE in there instead : of PAGE_SIZE, if the tmpfs was mounted with one of the huge "huge" : options (force or always, okay; within_size or advise, not so much). : Though HPAGE_PMD_SIZE is no more its "preferred I/O size" or "blocksize : for file system I/O" than PAGE_SIZE was. : : Which we can expect to speed up some applications and disadvantage : others, depending on how they interpret st_blksize: just like if we : changed it in the same way on non-huge tmpfs. (Did I actually try : changing st_blksize early on, and find it broke something? If so, I've : now forgotten what, and a search through commit messages didn't find : it; but I guess we'll find out soon enough.) : : If there were an mstat() syscall, returning a field "preferred : alignment", then we could certainly agree to put HPAGE_PMD_SIZE in : there; but in stat()'s st_blksize? And what happens when (in future) : mm maps this or that hard-disk filesystem's blocks with a pmd mapping - : should that filesystem then advertise a bigger st_blksize, despite the : same disk layout as before? What happens with DAX? : : And this change is not going to help the QEMU suboptimality that : brought you here (or does QEMU align mmaps according to st_blksize?). : QEMU ought to work well with kernels without this change, and kernels : with this change; and I hope it can easily deal with both by avoiding : that use of MAP_FIXED which prevented the kernel's intended alignment. [akpm@linux-foundation.org: remove unneeded `else'] Link: http://lkml.kernel.org/r/1524665633-83806-1-git-send-email-yang.shi@linux.alibaba.com Signed-off-by: Yang Shi Suggested-by: Christoph Hellwig Reviewed-by: Christoph Hellwig Acked-by: Kirill A.
Shutemov Cc: Hugh Dickins Cc: Michal Hocko Cc: Alexander Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 14 ++++++++++++++ 1 file changed, 14 insertions(+) diff --git a/mm/shmem.c b/mm/shmem.c index cb4d7fa389d0..103f3d1040f8 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -571,6 +571,15 @@ static unsigned long shmem_unused_huge_shrink(struct shmem_sb_info *sbinfo, } #endif /* CONFIG_TRANSPARENT_HUGE_PAGECACHE */ +static inline bool is_huge_enabled(struct shmem_sb_info *sbinfo) +{ + if (IS_ENABLED(CONFIG_TRANSPARENT_HUGE_PAGECACHE) && + (shmem_huge == SHMEM_HUGE_FORCE || sbinfo->huge) && + shmem_huge != SHMEM_HUGE_DENY) + return true; + return false; +} + /* * Like add_to_page_cache_locked, but error if expected item has gone. */ @@ -988,6 +997,7 @@ static int shmem_getattr(const struct path *path, struct kstat *stat, { struct inode *inode = path->dentry->d_inode; struct shmem_inode_info *info = SHMEM_I(inode); + struct shmem_sb_info *sb_info = SHMEM_SB(inode->i_sb); if (info->alloced - info->swapped != inode->i_mapping->nrpages) { spin_lock_irq(&info->lock); @@ -995,6 +1005,10 @@ static int shmem_getattr(const struct path *path, struct kstat *stat, spin_unlock_irq(&info->lock); } generic_fillattr(inode, stat); + + if (is_huge_enabled(sb_info)) + stat->blksize = HPAGE_PMD_SIZE; + return 0; } From 93781325da6e07af57a21f111d1f2b9f128fa397 Mon Sep 17 00:00:00 2001 From: Omar Sandoval Date: Thu, 7 Jun 2018 17:07:02 -0700 Subject: [PATCH 041/118] lockdep: fix fs_reclaim annotation While revisiting my Btrfs swapfile series [1], I introduced a situation in which reclaim would lock i_rwsem, and even though the swapon() path clearly made GFP_KERNEL allocations while holding i_rwsem, I got no complaints from lockdep. It turns out that the rework of the fs_reclaim annotation was broken: if the current task has PF_MEMALLOC set, we don't acquire the dummy fs_reclaim lock, but when reclaiming we always check this _after_ we've just set the PF_MEMALLOC flag. In most cases, we can fix this by moving the fs_reclaim_{acquire,release}() outside of the memalloc_noreclaim_{save,restore}(), although kswapd is slightly different. After applying this, I got the expected lockdep splats.
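As a sketch of the required ordering in a direct reclaim path (illustrative only; the actual hunks are below), the annotation has to be taken while PF_MEMALLOC is still clear, because __need_fs_reclaim() bails out once it is set:

    fs_reclaim_acquire(gfp_mask);               /* PF_MEMALLOC still clear */
    noreclaim_flag = memalloc_noreclaim_save(); /* sets PF_MEMALLOC */

    /* ... do the actual reclaim work ... */

    memalloc_noreclaim_restore(noreclaim_flag); /* clears PF_MEMALLOC */
    fs_reclaim_release(gfp_mask);               /* annotation released last */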
1: https://lwn.net/Articles/625412/ Link: http://lkml.kernel.org/r/9f8aa70652a98e98d7c4de0fc96a4addcee13efe.1523778026.git.osandov@fb.com Fixes: d92a8cfcb37e ("locking/lockdep: Rework FS_RECLAIM annotation") Signed-off-by: Omar Sandoval Reviewed-by: Andrew Morton Cc: Peter Zijlstra Cc: Tetsuo Handa Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/sched/mm.h | 4 ++++ mm/page_alloc.c | 20 +++++++++++++++----- mm/vmscan.c | 20 +++++++++++++------- 3 files changed, 32 insertions(+), 12 deletions(-) diff --git a/include/linux/sched/mm.h b/include/linux/sched/mm.h index 76a8cb4ef178..44d356f5e47c 100644 --- a/include/linux/sched/mm.h +++ b/include/linux/sched/mm.h @@ -163,9 +163,13 @@ static inline gfp_t current_gfp_context(gfp_t flags) } #ifdef CONFIG_LOCKDEP +extern void __fs_reclaim_acquire(void); +extern void __fs_reclaim_release(void); extern void fs_reclaim_acquire(gfp_t gfp_mask); extern void fs_reclaim_release(gfp_t gfp_mask); #else +static inline void __fs_reclaim_acquire(void) { } +static inline void __fs_reclaim_release(void) { } static inline void fs_reclaim_acquire(gfp_t gfp_mask) { } static inline void fs_reclaim_release(gfp_t gfp_mask) { } #endif diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 4fbe211340e0..b93cc79a8db4 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -3701,7 +3701,7 @@ should_compact_retry(struct alloc_context *ac, unsigned int order, int alloc_fla #endif /* CONFIG_COMPACTION */ #ifdef CONFIG_LOCKDEP -struct lockdep_map __fs_reclaim_map = +static struct lockdep_map __fs_reclaim_map = STATIC_LOCKDEP_MAP_INIT("fs_reclaim", &__fs_reclaim_map); static bool __need_fs_reclaim(gfp_t gfp_mask) @@ -3726,17 +3726,27 @@ static bool __need_fs_reclaim(gfp_t gfp_mask) return true; } +void __fs_reclaim_acquire(void) +{ + lock_map_acquire(&__fs_reclaim_map); +} + +void __fs_reclaim_release(void) +{ + lock_map_release(&__fs_reclaim_map); +} + void fs_reclaim_acquire(gfp_t gfp_mask) { if (__need_fs_reclaim(gfp_mask)) - lock_map_acquire(&__fs_reclaim_map); + __fs_reclaim_acquire(); } EXPORT_SYMBOL_GPL(fs_reclaim_acquire); void fs_reclaim_release(gfp_t gfp_mask) { if (__need_fs_reclaim(gfp_mask)) - lock_map_release(&__fs_reclaim_map); + __fs_reclaim_release(); } EXPORT_SYMBOL_GPL(fs_reclaim_release); #endif @@ -3754,8 +3764,8 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, /* We now go into synchronous reclaim */ cpuset_memory_pressure_bump(); - noreclaim_flag = memalloc_noreclaim_save(); fs_reclaim_acquire(gfp_mask); + noreclaim_flag = memalloc_noreclaim_save(); reclaim_state.reclaimed_slab = 0; current->reclaim_state = &reclaim_state; @@ -3763,8 +3773,8 @@ __perform_reclaim(gfp_t gfp_mask, unsigned int order, ac->nodemask); current->reclaim_state = NULL; - fs_reclaim_release(gfp_mask); memalloc_noreclaim_restore(noreclaim_flag); + fs_reclaim_release(gfp_mask); cond_resched(); diff --git a/mm/vmscan.c b/mm/vmscan.c index 9270a4370d54..0e67b477ecef 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -3318,11 +3318,15 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) .may_unmap = 1, .may_swap = 1, }; + + __fs_reclaim_acquire(); + count_vm_event(PAGEOUTRUN); do { unsigned long nr_reclaimed = sc.nr_reclaimed; bool raise_priority = true; + bool ret; sc.reclaim_idx = classzone_idx; @@ -3395,7 +3399,10 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) wake_up_all(&pgdat->pfmemalloc_wait); /* Check if kswapd should be suspending */ - if (try_to_freeze() || kthread_should_stop()) + 
__fs_reclaim_release(); + ret = try_to_freeze(); + __fs_reclaim_acquire(); + if (ret || kthread_should_stop()) break; /* @@ -3412,6 +3419,7 @@ static int balance_pgdat(pg_data_t *pgdat, int order, int classzone_idx) out: snapshot_refaults(NULL, pgdat); + __fs_reclaim_release(); /* * Return the order kswapd stopped reclaiming at as * prepare_kswapd_sleep() takes it into account. If another caller @@ -3600,9 +3608,7 @@ kswapd_try_sleep: */ trace_mm_vmscan_kswapd_wake(pgdat->node_id, classzone_idx, alloc_order); - fs_reclaim_acquire(GFP_KERNEL); reclaim_order = balance_pgdat(pgdat, alloc_order, classzone_idx); - fs_reclaim_release(GFP_KERNEL); if (reclaim_order < alloc_order) goto kswapd_try_sleep; } @@ -3684,16 +3690,16 @@ unsigned long shrink_all_memory(unsigned long nr_to_reclaim) unsigned long nr_reclaimed; unsigned int noreclaim_flag; - noreclaim_flag = memalloc_noreclaim_save(); fs_reclaim_acquire(sc.gfp_mask); + noreclaim_flag = memalloc_noreclaim_save(); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; nr_reclaimed = do_try_to_free_pages(zonelist, &sc); p->reclaim_state = NULL; - fs_reclaim_release(sc.gfp_mask); memalloc_noreclaim_restore(noreclaim_flag); + fs_reclaim_release(sc.gfp_mask); return nr_reclaimed; } @@ -3870,6 +3876,7 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in }; cond_resched(); + fs_reclaim_acquire(sc.gfp_mask); /* * We need to be able to allocate from the reserves for RECLAIM_UNMAP * and we also need to be able to write out pages for RECLAIM_WRITE @@ -3877,7 +3884,6 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in */ noreclaim_flag = memalloc_noreclaim_save(); p->flags |= PF_SWAPWRITE; - fs_reclaim_acquire(sc.gfp_mask); reclaim_state.reclaimed_slab = 0; p->reclaim_state = &reclaim_state; @@ -3892,9 +3898,9 @@ static int __node_reclaim(struct pglist_data *pgdat, gfp_t gfp_mask, unsigned in } p->reclaim_state = NULL; - fs_reclaim_release(gfp_mask); current->flags &= ~PF_SWAPWRITE; memalloc_noreclaim_restore(noreclaim_flag); + fs_reclaim_release(sc.gfp_mask); return sc.nr_reclaimed >= nr_pages; } From 48f49e1f66854ef44439b84a5cf47179539f3804 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 7 Jun 2018 17:07:07 -0700 Subject: [PATCH 042/118] mm/ksm: remove unused page_referenced_ksm declaration Commit 9f32624be943 ("mm/rmap: use rmap_walk() in page_referenced()") removed the declaration of page_referenced_ksm for the case CONFIG_KSM=y, but left one for CONFIG_KSM=n. Remove the unused leftover. 
Link: http://lkml.kernel.org/r/1524552106-7356-2-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Andrew Morton Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ksm.h | 6 ------ 1 file changed, 6 deletions(-) diff --git a/include/linux/ksm.h b/include/linux/ksm.h index 44368b19b27e..bbdfca3db6e1 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -89,12 +89,6 @@ static inline struct page *ksm_might_need_to_copy(struct page *page, return page; } -static inline int page_referenced_ksm(struct page *page, - struct mem_cgroup *memcg, unsigned long *vm_flags) -{ - return 0; -} - static inline void rmap_walk_ksm(struct page *page, struct rmap_walk_control *rwc) { From 88484826bc67844eeae1855ebe32cce9edd937c9 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 7 Jun 2018 17:07:11 -0700 Subject: [PATCH 043/118] mm/ksm: move [set_]page_stable_node from ksm.h to ksm.c page_stable_node() and set_page_stable_node() are only used in mm/ksm.c and there is no point in keeping them in include/linux/ksm.h. [akpm@linux-foundation.org: fix SYSFS=n build] Link: http://lkml.kernel.org/r/1524552106-7356-3-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Reviewed-by: Andrew Morton Cc: Andrea Arcangeli Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/ksm.h | 11 ----------- mm/ksm.c | 11 +++++++++++ 2 files changed, 11 insertions(+), 11 deletions(-) diff --git a/include/linux/ksm.h b/include/linux/ksm.h index bbdfca3db6e1..161e8164abcf 100644 --- a/include/linux/ksm.h +++ b/include/linux/ksm.h @@ -37,17 +37,6 @@ static inline void ksm_exit(struct mm_struct *mm) __ksm_exit(mm); } -static inline struct stable_node *page_stable_node(struct page *page) -{ - return PageKsm(page) ? page_rmapping(page) : NULL; -} - -static inline void set_page_stable_node(struct page *page, - struct stable_node *stable_node) -{ - page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); -} - /* * When do_swap_page() first faults in from swap what used to be a KSM page, * no problem, it will be assigned to this vma's anon_vma; but thereafter, diff --git a/mm/ksm.c b/mm/ksm.c index 7d6558f3bac9..e2d2886fb1df 100644 --- a/mm/ksm.c +++ b/mm/ksm.c @@ -840,6 +840,17 @@ static int unmerge_ksm_pages(struct vm_area_struct *vma, return err; } +static inline struct stable_node *page_stable_node(struct page *page) +{ + return PageKsm(page) ? page_rmapping(page) : NULL; +} + +static inline void set_page_stable_node(struct page *page, + struct stable_node *stable_node) +{ + page->mapping = (void *)((unsigned long)stable_node | PAGE_MAPPING_KSM); +} + #ifdef CONFIG_SYSFS /* * Only called through the sysfs control interface: From 12ba780d64f6c96d40de9c4d1468fb732e64be4c Mon Sep 17 00:00:00 2001 From: Amir Goldstein Date: Thu, 7 Jun 2018 17:07:15 -0700 Subject: [PATCH 044/118] tmpfs: allow decoding a file handle of an unlinked file tmpfs uses the helper d_find_alias() to find a dentry from a decoded inode, but d_find_alias() skips unhashed dentries, so unlinked files cannot be decoded from a file handle. This can be reproduced using the xfstests test program open_by_handle: $ open_by_handle -c /tmp/testdir $ open_by_handle -dk /tmp/testdir open_by_handle(/tmp/testdir/file000000) returned 116 incorrectly on an unlinked open file! To fix this, if d_find_alias() can't find a hashed alias, call d_find_any_alias() to return an unhashed one.
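The failing sequence can also be shown with a minimal standalone program (a sketch assuming a tmpfs at /tmp/testdir with a pre-created file, run with CAP_DAC_READ_SEARCH; the xfstests program above is the canonical reproducer):

    #define _GNU_SOURCE
    #include <errno.h>
    #include <fcntl.h>
    #include <stdio.h>
    #include <stdlib.h>
    #include <unistd.h>

    int main(void)
    {
            struct file_handle *fh = malloc(sizeof(*fh) + MAX_HANDLE_SZ);
            int mount_id, mount_fd, fd;

            fh->handle_bytes = MAX_HANDLE_SZ;
            name_to_handle_at(AT_FDCWD, "/tmp/testdir/file000000",
                              fh, &mount_id, 0);

            fd = open("/tmp/testdir/file000000", O_RDONLY); /* keep inode live */
            unlink("/tmp/testdir/file000000"); /* only an unhashed dentry remains */

            mount_fd = open("/tmp/testdir", O_RDONLY | O_DIRECTORY);
            /* Without this patch: returns -1 with errno ESTALE (116). */
            fd = open_by_handle_at(mount_fd, fh, O_RDONLY);
            printf("open_by_handle_at: %d (errno %d)\n", fd, errno);
            return 0;
    }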
Link: http://lkml.kernel.org/r/CAOQ4uxg+qSLP0KwdW+h1tcPqOCQd+_pGZVXiePQB1TXCMBMctQ@mail.gmail.com Signed-off-by: Amir Goldstein Reviewed-by: NeilBrown Cc: Hugh Dickins Cc: Jeff Layton Cc: "J. Bruce Fields" Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 11 ++++++++++- 1 file changed, 10 insertions(+), 1 deletion(-) diff --git a/mm/shmem.c b/mm/shmem.c index 103f3d1040f8..9a1b553e30e0 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -3209,6 +3209,15 @@ static int shmem_match(struct inode *ino, void *vfh) return ino->i_ino == inum && fh[0] == ino->i_generation; } +/* Find any alias of inode, but prefer a hashed alias */ +static struct dentry *shmem_find_alias(struct inode *inode) +{ + struct dentry *alias = d_find_alias(inode); + + return alias ?: d_find_any_alias(inode); +} + + static struct dentry *shmem_fh_to_dentry(struct super_block *sb, struct fid *fid, int fh_len, int fh_type) { @@ -3225,7 +3234,7 @@ static struct dentry *shmem_fh_to_dentry(struct super_block *sb, inode = ilookup5(sb, (unsigned long)(inum + fid->raw[0]), shmem_match, fid->raw); if (inode) { - dentry = d_find_alias(inode); + dentry = shmem_find_alias(inode); iput(inode); } From 9ccc361716362e8abb01d9944f8df97b4aac3e23 Mon Sep 17 00:00:00 2001 From: Wang Long Date: Thu, 7 Jun 2018 17:07:19 -0700 Subject: [PATCH 045/118] memcg: writeback: use memcg->cgwb_list directly mem_cgroup_cgwb_list is a very simple wrapper and it will never be used outside of code under CONFIG_CGROUP_WRITEBACK. so use memcg->cgwb_list directly. Link: http://lkml.kernel.org/r/1524406173-212182-1-git-send-email-wanglong19@meituan.com Signed-off-by: Wang Long Reviewed-by: Jan Kara Acked-by: Tejun Heo Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: Johannes Weiner Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memcontrol.h | 1 - mm/backing-dev.c | 4 ++-- mm/memcontrol.c | 5 ----- 3 files changed, 2 insertions(+), 8 deletions(-) diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 891945507044..10d741e8fe51 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -1097,7 +1097,6 @@ static inline void dec_lruvec_page_state(struct page *page, #ifdef CONFIG_CGROUP_WRITEBACK -struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg); struct wb_domain *mem_cgroup_wb_domain(struct bdi_writeback *wb); void mem_cgroup_wb_stats(struct bdi_writeback *wb, unsigned long *pfilepages, unsigned long *pheadroom, unsigned long *pdirty, diff --git a/mm/backing-dev.c b/mm/backing-dev.c index 8fe3ebd6ac00..347cc834c04a 100644 --- a/mm/backing-dev.c +++ b/mm/backing-dev.c @@ -557,7 +557,7 @@ static int cgwb_create(struct backing_dev_info *bdi, memcg = mem_cgroup_from_css(memcg_css); blkcg_css = cgroup_get_e_css(memcg_css->cgroup, &io_cgrp_subsys); blkcg = css_to_blkcg(blkcg_css); - memcg_cgwb_list = mem_cgroup_cgwb_list(memcg); + memcg_cgwb_list = &memcg->cgwb_list; blkcg_cgwb_list = &blkcg->cgwb_list; /* look up again under lock and discard on blkcg mismatch */ @@ -736,7 +736,7 @@ static void cgwb_bdi_unregister(struct backing_dev_info *bdi) */ void wb_memcg_offline(struct mem_cgroup *memcg) { - struct list_head *memcg_cgwb_list = mem_cgroup_cgwb_list(memcg); + struct list_head *memcg_cgwb_list = &memcg->cgwb_list; struct bdi_writeback *wb, *next; spin_lock_irq(&cgwb_lock); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 8bab4660e6aa..2751259f0a76 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3562,11 +3562,6 @@ static int 
mem_cgroup_oom_control_write(struct cgroup_subsys_state *css, #ifdef CONFIG_CGROUP_WRITEBACK -struct list_head *mem_cgroup_cgwb_list(struct mem_cgroup *memcg) -{ - return &memcg->cgwb_list; -} - static int memcg_wb_domain_init(struct mem_cgroup *memcg, gfp_t gfp) { return wb_domain_init(&memcg->cgwb_domain, gfp); From 8dd53fd3b702c12050ec5ddd07b56f8c4608ebea Mon Sep 17 00:00:00 2001 From: Greg Thelen Date: Thu, 7 Jun 2018 17:07:23 -0700 Subject: [PATCH 046/118] memcg: mark memcg1_events static const Mark memcg1_events static: it's only used by memcontrol.c. And mark it const: it's not modified. Link: http://lkml.kernel.org/r/20180503192940.94971-1-gthelen@google.com Signed-off-by: Greg Thelen Acked-by: Michal Hocko Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 2751259f0a76..847a775af225 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -3083,7 +3083,7 @@ static int memcg_numa_stat_show(struct seq_file *m, void *v) #endif /* CONFIG_NUMA */ /* Universal VM events cgroup1 shows, original sort order */ -unsigned int memcg1_events[] = { +static const unsigned int memcg1_events[] = { PGPGIN, PGPGOUT, PGFAULT, From bb4a7ea2b1449722cec9d787dca5a74ca36e8eeb Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 7 Jun 2018 17:07:27 -0700 Subject: [PATCH 047/118] mm: memcontrol: drain stocks on resize limit Resizing the memcg limit for cgroup-v2 drains the stocks before triggering the memcg reclaim. Do the same for cgroup-v1 to make the behavior consistent. Link: http://lkml.kernel.org/r/20180504205548.110696-1-shakeelb@google.com Signed-off-by: Shakeel Butt Acked-by: Johannes Weiner Acked-by: Michal Hocko Cc: Greg Thelen Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 7 +++++++ 1 file changed, 7 insertions(+) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 847a775af225..18dcdac0b158 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2450,6 +2450,7 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg, unsigned long max, bool memsw) { bool enlarge = false; + bool drained = false; int ret; bool limits_invariant; struct page_counter *counter = memsw ? &memcg->memsw : &memcg->memory; @@ -2480,6 +2481,12 @@ static int mem_cgroup_resize_max(struct mem_cgroup *memcg, if (!ret) break; + if (!drained) { + drain_all_stock(memcg); + drained = true; + continue; + } + if (!try_to_free_mem_cgroup_pages(memcg, 1, GFP_KERNEL, !memsw)) { ret = -EBUSY; From d12c60f64cf8c768fddfd848eaf7ee57a13f18b1 Mon Sep 17 00:00:00 2001 From: Junaid Shahid Date: Thu, 7 Jun 2018 17:07:31 -0700 Subject: [PATCH 048/118] mm: memcontrol: drain memcg stock on force_empty The per-cpu memcg stock can retain a charge of up to 32 pages. On a machine with a large number of CPUs, this can amount to a decent amount of memory. Additionally, the force_empty interface might be triggering unneeded memcg reclaims.
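As a rough worked example of the scale involved (illustrative figures, assuming 4 KiB pages and the 32-page stock mentioned above):

    128 CPUs * 32 pages * 4 KiB = 16 MiB

That much memory can stay charged to a memcg purely in per-cpu stocks, which is why force_empty should drain them before resorting to reclaim.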
Link: http://lkml.kernel.org/r/20180507201651.165879-1-shakeelb@google.com Signed-off-by: Junaid Shahid Signed-off-by: Shakeel Butt Acked-by: Michal Hocko Cc: Greg Thelen Cc: Johannes Weiner Cc: Vladimir Davydov Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memcontrol.c | 3 +++ 1 file changed, 3 insertions(+) diff --git a/mm/memcontrol.c b/mm/memcontrol.c index 18dcdac0b158..e6de0d6a3a8d 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -2610,6 +2610,9 @@ static int mem_cgroup_force_empty(struct mem_cgroup *memcg) /* we call try-to-free pages for make this cgroup empty */ lru_add_drain_all(); + + drain_all_stock(memcg); + /* try to free all pages in this cgroup */ while (nr_retries && page_counter_read(&memcg->memory)) { int progress; From 25cf23d7a95716fc6eb165208b5eb2e3b2e86f82 Mon Sep 17 00:00:00 2001 From: Minchan Kim Date: Thu, 7 Jun 2018 17:07:35 -0700 Subject: [PATCH 049/118] mm/memblock: print memblock_remove A memblock_remove report is useful for seeing why MemTotal in /proc/meminfo differs between two kernels. Link: http://lkml.kernel.org/r/20180508104223.8028-1-minchan@kernel.org Signed-off-by: Minchan Kim Reviewed-by: Andrew Morton Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/memblock.c | 5 +++++ 1 file changed, 5 insertions(+) diff --git a/mm/memblock.c b/mm/memblock.c index eec988c21c7e..93ad42bc8a73 100644 --- a/mm/memblock.c +++ b/mm/memblock.c @@ -697,6 +697,11 @@ static int __init_memblock memblock_remove_range(struct memblock_type *type, int __init_memblock memblock_remove(phys_addr_t base, phys_addr_t size) { + phys_addr_t end = base + size - 1; + + memblock_dbg("memblock_remove: [%pa-%pa] %pS\n", + &base, &end, (void *)_RET_IP_); + return memblock_remove_range(&memblock.memory, base, size); } From ab6ecf247a9321e3180e021a6a60164dee53ab2e Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 7 Jun 2018 17:07:39 -0700 Subject: [PATCH 050/118] mm: /proc/pid/pagemap: hide swap entries from unprivileged users In commit ab676b7d6fbf ("pagemap: do not leak physical addresses to non-privileged userspace"), /proc/PID/pagemap was restricted to be readable only by CAP_SYS_ADMIN to address a security issue. In commit 1c90308e7a77 ("pagemap: hide physical addresses from non-privileged users"), the restriction was relaxed to make /proc/PID/pagemap readable again, while hiding the physical addresses from non-privileged users. But the swap entries remained readable by non-privileged users, which has the same kind of security issue: for a page under migration, for example, the swap entry carries physical address information. So, in this patch, the swap entries are hidden from non-privileged users too. Link: http://lkml.kernel.org/r/20180508012745.7238-1-ying.huang@intel.com Fixes: 1c90308e7a77 ("pagemap: hide physical addresses from non-privileged users") Signed-off-by: "Huang, Ying" Suggested-by: Kirill A.
Shutemov Reviewed-by: Naoya Horiguchi Reviewed-by: Konstantin Khlebnikov Acked-by: Michal Hocko Cc: Konstantin Khlebnikov Cc: Andrei Vagin Cc: Jerome Glisse Cc: Daniel Colascione Cc: Zi Yan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/task_mmu.c | 26 ++++++++++++++++---------- 1 file changed, 16 insertions(+), 10 deletions(-) diff --git a/fs/proc/task_mmu.c b/fs/proc/task_mmu.c index 7e074138d2f2..597969db9e90 100644 --- a/fs/proc/task_mmu.c +++ b/fs/proc/task_mmu.c @@ -1259,8 +1259,9 @@ static pagemap_entry_t pte_to_pagemap_entry(struct pagemapread *pm, if (pte_swp_soft_dirty(pte)) flags |= PM_SOFT_DIRTY; entry = pte_to_swp_entry(pte); - frame = swp_type(entry) | - (swp_offset(entry) << MAX_SWAPFILES_SHIFT); + if (pm->show_pfn) + frame = swp_type(entry) | + (swp_offset(entry) << MAX_SWAPFILES_SHIFT); flags |= PM_SWAP; if (is_migration_entry(entry)) page = migration_entry_to_page(entry); @@ -1311,11 +1312,14 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, #ifdef CONFIG_ARCH_ENABLE_THP_MIGRATION else if (is_swap_pmd(pmd)) { swp_entry_t entry = pmd_to_swp_entry(pmd); - unsigned long offset = swp_offset(entry); + unsigned long offset; - offset += (addr & ~PMD_MASK) >> PAGE_SHIFT; - frame = swp_type(entry) | - (offset << MAX_SWAPFILES_SHIFT); + if (pm->show_pfn) { + offset = swp_offset(entry) + + ((addr & ~PMD_MASK) >> PAGE_SHIFT); + frame = swp_type(entry) | + (offset << MAX_SWAPFILES_SHIFT); + } flags |= PM_SWAP; if (pmd_swp_soft_dirty(pmd)) flags |= PM_SOFT_DIRTY; @@ -1333,10 +1337,12 @@ static int pagemap_pmd_range(pmd_t *pmdp, unsigned long addr, unsigned long end, err = add_to_pagemap(addr, &pme, pm); if (err) break; - if (pm->show_pfn && (flags & PM_PRESENT)) - frame++; - else if (flags & PM_SWAP) - frame += (1 << MAX_SWAPFILES_SHIFT); + if (pm->show_pfn) { + if (flags & PM_PRESENT) + frame++; + else if (flags & PM_SWAP) + frame += (1 << MAX_SWAPFILES_SHIFT); + } } spin_unlock(ptl); return err; From fb52bbaee598f58352d8732637ebe7013b2df79f Mon Sep 17 00:00:00 2001 From: Mathieu Malaterre Date: Thu, 7 Jun 2018 17:07:43 -0700 Subject: [PATCH 051/118] mm: move is_pageblock_removable_nolock() to mm/memory_hotplug.c is_pageblock_removable_nolock() is not used outside of mm/memory_hotplug.c. Move it next to unique caller is_mem_section_removable() and make it static. 
Remove prototype in to silence gcc warning (W=1): mm/page_alloc.c:7704:6: warning: no previous prototype for `is_pageblock_removable_nolock' [-Wmissing-prototypes] Link: http://lkml.kernel.org/r/20180509190001.24789-1-malat@debian.org Signed-off-by: Mathieu Malaterre Suggested-by: Michal Hocko Reviewed-by: Andrew Morton Acked-by: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/memory_hotplug.h | 1 - mm/memory_hotplug.c | 23 +++++++++++++++++++++++ mm/page_alloc.c | 23 ----------------------- 3 files changed, 23 insertions(+), 24 deletions(-) diff --git a/include/linux/memory_hotplug.h b/include/linux/memory_hotplug.h index 2b0265265c28..4e9828cda7a2 100644 --- a/include/linux/memory_hotplug.h +++ b/include/linux/memory_hotplug.h @@ -107,7 +107,6 @@ static inline bool movable_node_is_enabled(void) } #ifdef CONFIG_MEMORY_HOTREMOVE -extern bool is_pageblock_removable_nolock(struct page *page); extern int arch_remove_memory(u64 start, u64 size, struct vmem_altmap *altmap); extern int __remove_pages(struct zone *zone, unsigned long start_pfn, diff --git a/mm/memory_hotplug.c b/mm/memory_hotplug.c index 25982467800b..7deb49f69e27 100644 --- a/mm/memory_hotplug.c +++ b/mm/memory_hotplug.c @@ -1237,6 +1237,29 @@ static struct page *next_active_pageblock(struct page *page) return page + pageblock_nr_pages; } +static bool is_pageblock_removable_nolock(struct page *page) +{ + struct zone *zone; + unsigned long pfn; + + /* + * We have to be careful here because we are iterating over memory + * sections which are not zone aware so we might end up outside of + * the zone but still within the section. + * We have to take care about the node as well. If the node is offline + * its NODE_DATA will be NULL - see page_zone. + */ + if (!node_online(page_to_nid(page))) + return false; + + zone = page_zone(page); + pfn = page_to_pfn(page); + if (!zone_spans_pfn(zone, pfn)) + return false; + + return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true); +} + /* Checks if this range of memory is likely to be hot-removable. */ bool is_mem_section_removable(unsigned long start_pfn, unsigned long nr_pages) { diff --git a/mm/page_alloc.c b/mm/page_alloc.c index b93cc79a8db4..d0d26da12086 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -7692,29 +7692,6 @@ unmovable: return true; } -bool is_pageblock_removable_nolock(struct page *page) -{ - struct zone *zone; - unsigned long pfn; - - /* - * We have to be careful here because we are iterating over memory - * sections which are not zone aware so we might end up outside of - * the zone but still within the section. - * We have to take care about the node as well. If the node is offline - * its NODE_DATA will be NULL - see page_zone. - */ - if (!node_online(page_to_nid(page))) - return false; - - zone = page_zone(page); - pfn = page_to_pfn(page); - if (!zone_spans_pfn(zone, pfn)) - return false; - - return !has_unmovable_pages(zone, page, 0, MIGRATE_MOVABLE, true); -} - #if (defined(CONFIG_MEMORY_ISOLATION) && defined(CONFIG_COMPACTION)) || defined(CONFIG_CMA) static unsigned long pfn_max_align_down(unsigned long pfn) From bf8d5d52ffe89aac5b46ddb39dd1a4351fae5df4 Mon Sep 17 00:00:00 2001 From: Roman Gushchin Date: Thu, 7 Jun 2018 17:07:46 -0700 Subject: [PATCH 052/118] memcg: introduce memory.min Memory controller implements the memory.low best-effort memory protection mechanism, which works perfectly in many cases and allows protecting working sets of important workloads from sudden reclaim. 
But its semantics has a significant limitation: it works only as long as there is a supply of reclaimable memory. This makes it pretty useless against any sort of slow memory leaks or memory usage increases. This is especially true for swapless systems. If swap is enabled, memory soft protection effectively postpones problems, allowing a leaking application to fill all swap area, which makes no sense. The only effective way to guarantee the memory protection in this case is to invoke the OOM killer. It's possible to handle this case in userspace by reacting on MEMCG_LOW events; but there is still a place for a fail-safe in-kernel mechanism to provide stronger guarantees. This patch introduces the memory.min interface for cgroup v2 memory controller. It works very similarly to memory.low (sharing the same hierarchical behavior), except that it's not disabled if there is no more reclaimable memory in the system. If cgroup is not populated, its memory.min is ignored, because otherwise even the OOM killer wouldn't be able to reclaim the protected memory, and the system can stall. [guro@fb.com: s/low/min/ in docs] Link: http://lkml.kernel.org/r/20180510130758.GA9129@castle.DHCP.thefacebook.com Link: http://lkml.kernel.org/r/20180509180734.GA4856@castle.DHCP.thefacebook.com Signed-off-by: Roman Gushchin Reviewed-by: Randy Dunlap Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Vladimir Davydov Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/admin-guide/cgroup-v2.rst | 27 +++++- include/linux/memcontrol.h | 15 ++- include/linux/page_counter.h | 11 ++- mm/memcontrol.c | 118 +++++++++++++++++++----- mm/page_counter.c | 63 +++++++++---- mm/vmscan.c | 18 +++- 6 files changed, 202 insertions(+), 50 deletions(-) diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index 7b56ca80e37a..e34d3c938729 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst @@ -1001,6 +1001,29 @@ PAGE_SIZE multiple when read back. The total amount of memory currently being used by the cgroup and its descendants. + memory.min + A read-write single value file which exists on non-root + cgroups. The default is "0". + + Hard memory protection. If the memory usage of a cgroup + is within its effective min boundary, the cgroup's memory + won't be reclaimed under any conditions. If there is no + unprotected reclaimable memory available, OOM killer + is invoked. + + Effective min boundary is limited by memory.min values of + all ancestor cgroups. If there is memory.min overcommitment + (child cgroup or cgroups are requiring more protected memory + than parent will allow), then each child cgroup will get + the part of parent's protection proportional to its + actual memory usage below memory.min. + + Putting more memory than generally available under this + protection is discouraged and may lead to constant OOMs. + + If a memory cgroup is not populated with processes, + its memory.min is ignored. + memory.low A read-write single value file which exists on non-root cgroups. The default is "0". @@ -1012,9 +1035,9 @@ PAGE_SIZE multiple when read back. Effective low boundary is limited by memory.low values of all ancestor cgroups. 
If there is memory.low overcommitment - (child cgroup or cgroups are requiring more protected memory, + (child cgroup or cgroups are requiring more protected memory than parent will allow), then each child cgroup will get - the part of parent's protection proportional to the its + the part of parent's protection proportional to its actual memory usage below memory.low. Putting more memory than generally available under this diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 10d741e8fe51..9c04cf8e6487 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h @@ -58,6 +58,12 @@ enum memcg_memory_event { MEMCG_NR_MEMORY_EVENTS, }; +enum mem_cgroup_protection { + MEMCG_PROT_NONE, + MEMCG_PROT_LOW, + MEMCG_PROT_MIN, +}; + struct mem_cgroup_reclaim_cookie { pg_data_t *pgdat; int priority; @@ -289,7 +295,8 @@ static inline bool mem_cgroup_disabled(void) return !cgroup_subsys_enabled(memory_cgrp_subsys); } -bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg); +enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, + struct mem_cgroup *memcg); int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, gfp_t gfp_mask, struct mem_cgroup **memcgp, @@ -734,10 +741,10 @@ static inline void memcg_memory_event(struct mem_cgroup *memcg, { } -static inline bool mem_cgroup_low(struct mem_cgroup *root, - struct mem_cgroup *memcg) +static inline enum mem_cgroup_protection mem_cgroup_protected( + struct mem_cgroup *root, struct mem_cgroup *memcg) { - return false; + return MEMCG_PROT_NONE; } static inline int mem_cgroup_try_charge(struct page *page, struct mm_struct *mm, diff --git a/include/linux/page_counter.h b/include/linux/page_counter.h index 7902a727d3b6..bab7e57f659b 100644 --- a/include/linux/page_counter.h +++ b/include/linux/page_counter.h @@ -8,10 +8,16 @@ struct page_counter { atomic_long_t usage; - unsigned long max; + unsigned long min; unsigned long low; + unsigned long max; struct page_counter *parent; + /* effective memory.min and memory.min usage tracking */ + unsigned long emin; + atomic_long_t min_usage; + atomic_long_t children_min_usage; + /* effective memory.low and memory.low usage tracking */ unsigned long elow; atomic_long_t low_usage; @@ -47,8 +53,9 @@ bool page_counter_try_charge(struct page_counter *counter, unsigned long nr_pages, struct page_counter **fail); void page_counter_uncharge(struct page_counter *counter, unsigned long nr_pages); -int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages); +void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages); void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages); +int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages); int page_counter_memparse(const char *buf, const char *max, unsigned long *nr_pages); diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e6de0d6a3a8d..e3d56927a724 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c @@ -4275,6 +4275,7 @@ static void mem_cgroup_css_offline(struct cgroup_subsys_state *css) } spin_unlock(&memcg->event_list_lock); + page_counter_set_min(&memcg->memory, 0); page_counter_set_low(&memcg->memory, 0); memcg_offline_kmem(memcg); @@ -4329,6 +4330,7 @@ static void mem_cgroup_css_reset(struct cgroup_subsys_state *css) page_counter_set_max(&memcg->memsw, PAGE_COUNTER_MAX); page_counter_set_max(&memcg->kmem, PAGE_COUNTER_MAX); page_counter_set_max(&memcg->tcpmem, PAGE_COUNTER_MAX); + page_counter_set_min(&memcg->memory, 0); 
page_counter_set_low(&memcg->memory, 0); memcg->high = PAGE_COUNTER_MAX; memcg->soft_limit = PAGE_COUNTER_MAX; @@ -5066,6 +5068,36 @@ static u64 memory_current_read(struct cgroup_subsys_state *css, return (u64)page_counter_read(&memcg->memory) * PAGE_SIZE; } +static int memory_min_show(struct seq_file *m, void *v) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); + unsigned long min = READ_ONCE(memcg->memory.min); + + if (min == PAGE_COUNTER_MAX) + seq_puts(m, "max\n"); + else + seq_printf(m, "%llu\n", (u64)min * PAGE_SIZE); + + return 0; +} + +static ssize_t memory_min_write(struct kernfs_open_file *of, + char *buf, size_t nbytes, loff_t off) +{ + struct mem_cgroup *memcg = mem_cgroup_from_css(of_css(of)); + unsigned long min; + int err; + + buf = strstrip(buf); + err = page_counter_memparse(buf, "max", &min); + if (err) + return err; + + page_counter_set_min(&memcg->memory, min); + + return nbytes; +} + static int memory_low_show(struct seq_file *m, void *v) { struct mem_cgroup *memcg = mem_cgroup_from_css(seq_css(m)); @@ -5300,6 +5332,12 @@ static struct cftype memory_files[] = { .flags = CFTYPE_NOT_ON_ROOT, .read_u64 = memory_current_read, }, + { + .name = "min", + .flags = CFTYPE_NOT_ON_ROOT, + .seq_show = memory_min_show, + .write = memory_min_write, + }, { .name = "low", .flags = CFTYPE_NOT_ON_ROOT, @@ -5349,19 +5387,24 @@ struct cgroup_subsys memory_cgrp_subsys = { }; /** - * mem_cgroup_low - check if memory consumption is in the normal range + * mem_cgroup_protected - check if memory consumption is in the normal range * @root: the top ancestor of the sub-tree being checked * @memcg: the memory cgroup to check * * WARNING: This function is not stateless! It can only be used as part * of a top-down tree iteration, not for isolated queries. * - * Returns %true if memory consumption of @memcg is in the normal range. + * Returns one of the following: + * MEMCG_PROT_NONE: cgroup memory is not protected + * MEMCG_PROT_LOW: cgroup memory is protected as long there is + * an unprotected supply of reclaimable memory from other cgroups. + * MEMCG_PROT_MIN: cgroup memory is protected * - * @root is exclusive; it is never low when looked at directly + * @root is exclusive; it is never protected when looked at directly * - * To provide a proper hierarchical behavior, effective memory.low value - * is used. + * To provide a proper hierarchical behavior, effective memory.min/low values + * are used. Below is the description of how effective memory.low is calculated. + * Effective memory.min values is calculated in the same way. * * Effective memory.low is always equal or less than the original memory.low. * If there is no memory.low overcommittment (which is always true for @@ -5406,51 +5449,78 @@ struct cgroup_subsys memory_cgrp_subsys = { * E/memory.current = 0 * * These calculations require constant tracking of the actual low usages - * (see propagate_low_usage()), as well as recursive calculation of - * effective memory.low values. But as we do call mem_cgroup_low() + * (see propagate_protected_usage()), as well as recursive calculation of + * effective memory.low values. But as we do call mem_cgroup_protected() * path for each memory cgroup top-down from the reclaim, * it's possible to optimize this part, and save calculated elow * for next usage. This part is intentionally racy, but it's ok, * as memory.low is a best-effort mechanism. 
*/ -bool mem_cgroup_low(struct mem_cgroup *root, struct mem_cgroup *memcg) +enum mem_cgroup_protection mem_cgroup_protected(struct mem_cgroup *root, + struct mem_cgroup *memcg) { - unsigned long usage, low_usage, siblings_low_usage; - unsigned long elow, parent_elow; struct mem_cgroup *parent; + unsigned long emin, parent_emin; + unsigned long elow, parent_elow; + unsigned long usage; if (mem_cgroup_disabled()) - return false; + return MEMCG_PROT_NONE; if (!root) root = root_mem_cgroup; if (memcg == root) - return false; + return MEMCG_PROT_NONE; - elow = memcg->memory.low; usage = page_counter_read(&memcg->memory); - parent = parent_mem_cgroup(memcg); + if (!usage) + return MEMCG_PROT_NONE; + emin = memcg->memory.min; + elow = memcg->memory.low; + + parent = parent_mem_cgroup(memcg); if (parent == root) goto exit; + parent_emin = READ_ONCE(parent->memory.emin); + emin = min(emin, parent_emin); + if (emin && parent_emin) { + unsigned long min_usage, siblings_min_usage; + + min_usage = min(usage, memcg->memory.min); + siblings_min_usage = atomic_long_read( + &parent->memory.children_min_usage); + + if (min_usage && siblings_min_usage) + emin = min(emin, parent_emin * min_usage / + siblings_min_usage); + } + parent_elow = READ_ONCE(parent->memory.elow); elow = min(elow, parent_elow); + if (elow && parent_elow) { + unsigned long low_usage, siblings_low_usage; - if (!elow || !parent_elow) - goto exit; + low_usage = min(usage, memcg->memory.low); + siblings_low_usage = atomic_long_read( + &parent->memory.children_low_usage); - low_usage = min(usage, memcg->memory.low); - siblings_low_usage = atomic_long_read( - &parent->memory.children_low_usage); + if (low_usage && siblings_low_usage) + elow = min(elow, parent_elow * low_usage / + siblings_low_usage); + } - if (!low_usage || !siblings_low_usage) - goto exit; - - elow = min(elow, parent_elow * low_usage / siblings_low_usage); exit: + memcg->memory.emin = emin; memcg->memory.elow = elow; - return usage && usage <= elow; + + if (usage <= emin) + return MEMCG_PROT_MIN; + else if (usage <= elow) + return MEMCG_PROT_LOW; + else + return MEMCG_PROT_NONE; } /** diff --git a/mm/page_counter.c b/mm/page_counter.c index a5ff4cbc355a..de31470655f6 100644 --- a/mm/page_counter.c +++ b/mm/page_counter.c @@ -13,26 +13,38 @@ #include #include -static void propagate_low_usage(struct page_counter *c, unsigned long usage) +static void propagate_protected_usage(struct page_counter *c, + unsigned long usage) { - unsigned long low_usage, old; + unsigned long protected, old_protected; long delta; if (!c->parent) return; - if (!c->low && !atomic_long_read(&c->low_usage)) - return; + if (c->min || atomic_long_read(&c->min_usage)) { + if (usage <= c->min) + protected = usage; + else + protected = 0; - if (usage <= c->low) - low_usage = usage; - else - low_usage = 0; + old_protected = atomic_long_xchg(&c->min_usage, protected); + delta = protected - old_protected; + if (delta) + atomic_long_add(delta, &c->parent->children_min_usage); + } - old = atomic_long_xchg(&c->low_usage, low_usage); - delta = low_usage - old; - if (delta) - atomic_long_add(delta, &c->parent->children_low_usage); + if (c->low || atomic_long_read(&c->low_usage)) { + if (usage <= c->low) + protected = usage; + else + protected = 0; + + old_protected = atomic_long_xchg(&c->low_usage, protected); + delta = protected - old_protected; + if (delta) + atomic_long_add(delta, &c->parent->children_low_usage); + } } /** @@ -45,7 +57,7 @@ void page_counter_cancel(struct page_counter *counter, unsigned long 
nr_pages) long new; new = atomic_long_sub_return(nr_pages, &counter->usage); - propagate_low_usage(counter, new); + propagate_protected_usage(counter, new); /* More uncharges than charges? */ WARN_ON_ONCE(new < 0); } @@ -65,7 +77,7 @@ void page_counter_charge(struct page_counter *counter, unsigned long nr_pages) long new; new = atomic_long_add_return(nr_pages, &c->usage); - propagate_low_usage(counter, new); + propagate_protected_usage(counter, new); /* * This is indeed racy, but we can live with some * inaccuracy in the watermark. @@ -109,7 +121,7 @@ bool page_counter_try_charge(struct page_counter *counter, new = atomic_long_add_return(nr_pages, &c->usage); if (new > c->max) { atomic_long_sub(nr_pages, &c->usage); - propagate_low_usage(counter, new); + propagate_protected_usage(counter, new); /* * This is racy, but we can live with some * inaccuracy in the failcnt. @@ -118,7 +130,7 @@ bool page_counter_try_charge(struct page_counter *counter, *fail = c; goto failed; } - propagate_low_usage(counter, new); + propagate_protected_usage(counter, new); /* * Just like with failcnt, we can live with some * inaccuracy in the watermark. @@ -190,6 +202,23 @@ int page_counter_set_max(struct page_counter *counter, unsigned long nr_pages) } } +/** + * page_counter_set_min - set the amount of protected memory + * @counter: counter + * @nr_pages: value to set + * + * The caller must serialize invocations on the same counter. + */ +void page_counter_set_min(struct page_counter *counter, unsigned long nr_pages) +{ + struct page_counter *c; + + counter->min = nr_pages; + + for (c = counter; c; c = c->parent) + propagate_protected_usage(c, atomic_long_read(&c->usage)); +} + /** * page_counter_set_low - set the amount of protected memory * @counter: counter @@ -204,7 +233,7 @@ void page_counter_set_low(struct page_counter *counter, unsigned long nr_pages) counter->low = nr_pages; for (c = counter; c; c = c->parent) - propagate_low_usage(c, atomic_long_read(&c->usage)); + propagate_protected_usage(c, atomic_long_read(&c->usage)); } /** diff --git a/mm/vmscan.c b/mm/vmscan.c index 0e67b477ecef..03822f86f288 100644 --- a/mm/vmscan.c +++ b/mm/vmscan.c @@ -2544,12 +2544,28 @@ static bool shrink_node(pg_data_t *pgdat, struct scan_control *sc) unsigned long reclaimed; unsigned long scanned; - if (mem_cgroup_low(root, memcg)) { + switch (mem_cgroup_protected(root, memcg)) { + case MEMCG_PROT_MIN: + /* + * Hard protection. + * If there is no reclaimable memory, OOM. + */ + continue; + case MEMCG_PROT_LOW: + /* + * Soft protection. + * Respect the protection only as long as + * there is an unprotected supply + * of reclaimable memory from other cgroups. + */ if (!sc->memcg_low_reclaim) { sc->memcg_low_skipped = 1; continue; } memcg_memory_event(memcg, MEMCG_LOW); + break; + case MEMCG_PROT_NONE: + break; } reclaimed = sc->nr_reclaimed; From d62ff365b812ecd7415252568c1836811b3dde02 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 7 Jun 2018 17:07:50 -0700 Subject: [PATCH 053/118] mm/vmpressure.c: use kstrndup instead of kmalloc+strncpy Using kstrndup() simplifies the code. 
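The general shape of the simplification (a generic sketch, not the vmpressure hunk itself):

    /* before: allocate, then copy, relying on kzalloc() for the NUL */
    buf = kzalloc(len + 1, GFP_KERNEL);
    if (!buf)
            return -ENOMEM;
    strncpy(buf, src, len);

    /* after: one call that allocates, bounds the copy and terminates */
    buf = kstrndup(src, len, GFP_KERNEL);
    if (!buf)
            return -ENOMEM;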
Link: http://lkml.kernel.org/r/20180503201807.24941-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Acked-by: Michal Hocko Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmpressure.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-) diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 85350ce2d25d..7142207224d3 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -390,12 +390,11 @@ int vmpressure_register_event(struct mem_cgroup *memcg, char *token; int ret = 0; - spec_orig = spec = kzalloc(MAX_VMPRESSURE_ARGS_LEN + 1, GFP_KERNEL); + spec_orig = spec = kstrndup(args, MAX_VMPRESSURE_ARGS_LEN, GFP_KERNEL); if (!spec) { ret = -ENOMEM; goto out; } - strncpy(spec, args, MAX_VMPRESSURE_ARGS_LEN); /* Find required level */ token = strsep(&spec, ","); From 3cadfa2b9497abbd3ae11a790d53ed51b4cf8ac5 Mon Sep 17 00:00:00 2001 From: Andy Shevchenko Date: Thu, 7 Jun 2018 17:07:53 -0700 Subject: [PATCH 054/118] mm/vmpressure.c: convert to use match_string() helper The new helper returns index of the matching string in an array. We are going to use it here. Link: http://lkml.kernel.org/r/20180503203206.44046-1-andriy.shevchenko@linux.intel.com Signed-off-by: Andy Shevchenko Acked-by: Michal Hocko Reviewed-by: Andrew Morton Cc: David Rientjes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/vmpressure.c | 32 ++++++-------------------------- 1 file changed, 6 insertions(+), 26 deletions(-) diff --git a/mm/vmpressure.c b/mm/vmpressure.c index 7142207224d3..4854584ec436 100644 --- a/mm/vmpressure.c +++ b/mm/vmpressure.c @@ -342,26 +342,6 @@ void vmpressure_prio(gfp_t gfp, struct mem_cgroup *memcg, int prio) vmpressure(gfp, memcg, true, vmpressure_win, 0); } -static enum vmpressure_levels str_to_level(const char *arg) -{ - enum vmpressure_levels level; - - for (level = 0; level < VMPRESSURE_NUM_LEVELS; level++) - if (!strcmp(vmpressure_str_levels[level], arg)) - return level; - return -1; -} - -static enum vmpressure_modes str_to_mode(const char *arg) -{ - enum vmpressure_modes mode; - - for (mode = 0; mode < VMPRESSURE_NUM_MODES; mode++) - if (!strcmp(vmpressure_str_modes[mode], arg)) - return mode; - return -1; -} - #define MAX_VMPRESSURE_ARGS_LEN (strlen("critical") + strlen("hierarchy") + 2) /** @@ -398,18 +378,18 @@ int vmpressure_register_event(struct mem_cgroup *memcg, /* Find required level */ token = strsep(&spec, ","); - level = str_to_level(token); - if (level == -1) { - ret = -EINVAL; + level = match_string(vmpressure_str_levels, VMPRESSURE_NUM_LEVELS, token); + if (level < 0) { + ret = level; goto out; } /* Find optional mode */ token = strsep(&spec, ","); if (token) { - mode = str_to_mode(token); - if (mode == -1) { - ret = -EINVAL; + mode = match_string(vmpressure_str_modes, VMPRESSURE_NUM_MODES, token); + if (mode < 0) { + ret = mode; goto out; } } From a380b40abbcecda28037b712fc437e157a1e9b57 Mon Sep 17 00:00:00 2001 From: Huaisheng Ye Date: Thu, 7 Jun 2018 17:07:57 -0700 Subject: [PATCH 055/118] mm/page_alloc.c: remove useless parameter of finalise_ac() finalise_ac() has parameter order which is not used at all. Remove it. 
Signed-off-by: Huaisheng Ye Acked-by: Michal Hocko Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/page_alloc.c | 5 ++--- 1 file changed, 2 insertions(+), 3 deletions(-) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index d0d26da12086..96d48d24e1ef 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -4336,8 +4336,7 @@ static inline bool prepare_alloc_pages(gfp_t gfp_mask, unsigned int order, } /* Determine whether to spread dirty pages and what the first usable zone */ -static inline void finalise_ac(gfp_t gfp_mask, - unsigned int order, struct alloc_context *ac) +static inline void finalise_ac(gfp_t gfp_mask, struct alloc_context *ac) { /* Dirty zone balancing only done in the fast path */ ac->spread_dirty_pages = (gfp_mask & __GFP_WRITE); @@ -4368,7 +4367,7 @@ __alloc_pages_nodemask(gfp_t gfp_mask, unsigned int order, int preferred_nid, if (!prepare_alloc_pages(gfp_mask, order, preferred_nid, nodemask, &ac, &alloc_mask, &alloc_flags)) return NULL; - finalise_ac(gfp_mask, order, &ac); + finalise_ac(gfp_mask, &ac); /* First allocation attempt */ page = get_page_from_freelist(alloc_mask, order, alloc_flags, &ac); From 2bcd6454bae78775632e1848ce1eabf65c6fbd4f Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Thu, 7 Jun 2018 17:08:00 -0700 Subject: [PATCH 056/118] mm: use new return type vm_fault_t Use new return type vm_fault_t for fault handler in struct vm_operations_struct. For now, this is just documenting that the function returns a VM_FAULT value rather than an errno. Once all instances are converted, vm_fault_t will become a distinct type. Link: http://lkml.kernel.org/r/20180511190542.GA2412@jordon-HP-15-Notebook-PC Signed-off-by: Souptick Joarder Reviewed-by: Matthew Wilcox Cc: Dan Williams Cc: Jan Kara Cc: Ross Zwisler Cc: Rik van Riel Cc: Matthew Wilcox Cc: Hugh Dickins Cc: Pavel Tatashin Cc: Michal Hocko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm.h | 4 ++-- mm/filemap.c | 8 ++++---- mm/nommu.c | 2 +- 3 files changed, 7 insertions(+), 7 deletions(-) diff --git a/include/linux/mm.h b/include/linux/mm.h index b7012bb1d218..d1dc618a56bf 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -2303,10 +2303,10 @@ extern void truncate_inode_pages_range(struct address_space *, extern void truncate_inode_pages_final(struct address_space *); /* generic vm_area_ops exported for stackable file systems */ -extern int filemap_fault(struct vm_fault *vmf); +extern vm_fault_t filemap_fault(struct vm_fault *vmf); extern void filemap_map_pages(struct vm_fault *vmf, pgoff_t start_pgoff, pgoff_t end_pgoff); -extern int filemap_page_mkwrite(struct vm_fault *vmf); +extern vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf); /* mm/page-writeback.c */ int __must_check write_one_page(struct page *page); diff --git a/mm/filemap.c b/mm/filemap.c index 0604cb02e6f3..52517f28e6f4 100644 --- a/mm/filemap.c +++ b/mm/filemap.c @@ -2489,7 +2489,7 @@ static void do_async_mmap_readahead(struct vm_area_struct *vma, * * We never return with VM_FAULT_RETRY and a bit from VM_FAULT_ERROR set. 
*/ -int filemap_fault(struct vm_fault *vmf) +vm_fault_t filemap_fault(struct vm_fault *vmf) { int error; struct file *file = vmf->vma->vm_file; @@ -2499,7 +2499,7 @@ int filemap_fault(struct vm_fault *vmf) pgoff_t offset = vmf->pgoff; pgoff_t max_off; struct page *page; - int ret = 0; + vm_fault_t ret = 0; max_off = DIV_ROUND_UP(i_size_read(inode), PAGE_SIZE); if (unlikely(offset >= max_off)) @@ -2693,11 +2693,11 @@ next: } EXPORT_SYMBOL(filemap_map_pages); -int filemap_page_mkwrite(struct vm_fault *vmf) +vm_fault_t filemap_page_mkwrite(struct vm_fault *vmf) { struct page *page = vmf->page; struct inode *inode = file_inode(vmf->vma->vm_file); - int ret = VM_FAULT_LOCKED; + vm_fault_t ret = VM_FAULT_LOCKED; sb_start_pagefault(inode->i_sb); file_update_time(vmf->vma->vm_file); diff --git a/mm/nommu.c b/mm/nommu.c index 13723736d38f..4452d8bd9ae4 100644 --- a/mm/nommu.c +++ b/mm/nommu.c @@ -1763,7 +1763,7 @@ unsigned long arch_get_unmapped_area(struct file *file, unsigned long addr, return -ENOMEM; } -int filemap_fault(struct vm_fault *vmf) +vm_fault_t filemap_fault(struct vm_fault *vmf) { BUG(); return 0; From b3ec9f33acb8d3a6173515d3d547be969dc70566 Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Thu, 7 Jun 2018 17:08:04 -0700 Subject: [PATCH 057/118] mm: change return type to vm_fault_t Use new return type vm_fault_t for fault handler in struct vm_operations_struct. For now, this is just documenting that the function returns a VM_FAULT value rather than an errno. Once all instances are converted, vm_fault_t will become a distinct type. See commit 1c8f422059ae ("mm: change return type to vm_fault_t") Link: http://lkml.kernel.org/r/20180512063745.GA26866@jordon-HP-15-Notebook-PC Signed-off-by: Souptick Joarder Reviewed-by: Matthew Wilcox Reviewed-by: Andrew Morton Cc: Joe Perches Cc: Michal Hocko Cc: Hugh Dickins Cc: Dan Williams Cc: David Rientjes Cc: Mike Kravetz Cc: Naoya Horiguchi Cc: Aneesh Kumar K.V Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 6 +++--- mm/hugetlb.c | 2 +- mm/mmap.c | 4 ++-- 3 files changed, 6 insertions(+), 6 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 49dd59ea90f7..eb9da245286d 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -629,9 +629,9 @@ struct vm_special_mapping { * If non-NULL, then this is called to resolve page faults * on the special mapping. If used, .pages is not checked. */ - int (*fault)(const struct vm_special_mapping *sm, - struct vm_area_struct *vma, - struct vm_fault *vmf); + vm_fault_t (*fault)(const struct vm_special_mapping *sm, + struct vm_area_struct *vma, + struct vm_fault *vmf); int (*mremap)(const struct vm_special_mapping *sm, struct vm_area_struct *new_vma); diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 129088710510..6c6decc63469 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3159,7 +3159,7 @@ static unsigned long hugetlb_vm_op_pagesize(struct vm_area_struct *vma) * hugegpage VMA. do_page_fault() is supposed to trap this, so BUG is we get * this far. 
*/ -static int hugetlb_vm_op_fault(struct vm_fault *vmf) +static vm_fault_t hugetlb_vm_op_fault(struct vm_fault *vmf) { BUG(); return 0; diff --git a/mm/mmap.c b/mm/mmap.c index d817764a9974..d1eb87ef4b1a 100644 --- a/mm/mmap.c +++ b/mm/mmap.c @@ -3277,7 +3277,7 @@ void vm_stat_account(struct mm_struct *mm, vm_flags_t flags, long npages) mm->data_vm += npages; } -static int special_mapping_fault(struct vm_fault *vmf); +static vm_fault_t special_mapping_fault(struct vm_fault *vmf); /* * Having a close hook prevents vma merging regardless of flags. @@ -3316,7 +3316,7 @@ static const struct vm_operations_struct legacy_special_mapping_vmops = { .fault = special_mapping_fault, }; -static int special_mapping_fault(struct vm_fault *vmf) +static vm_fault_t special_mapping_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; pgoff_t pgoff; From 285b8dcaacfc36b0468aaa03e3c628006ae31381 Mon Sep 17 00:00:00 2001 From: Huang Ying Date: Thu, 7 Jun 2018 17:08:08 -0700 Subject: [PATCH 058/118] mm, hugetlbfs: pass fault address to no page handler This is to take better advantage of the general huge page clearing optimization (commit c79b57e462b5: "mm: hugetlb: clear target sub-page last when clearing huge page") for hugetlbfs. In that optimization, the sub-page to be accessed is cleared last, so that its cache lines are not evicted while the other sub-pages are being cleared. This works better if we know the address of the sub-page to be accessed, that is, the fault address inside the huge page, so the hugetlbfs no-page fault handler is changed to pass that information along. This benefits workloads which do not access the beginning of the hugetlbfs huge page after the page fault, under heavy contention for the shared last-level cache. The patch is a generic optimization which should benefit quite a few workloads, not just one specific use case. To demonstrate the performance benefit of the patch, we tested it with vm-scalability run on hugetlbfs. With this patch, throughput increases ~28.1% in the vm-scalability anon-w-seq test case with 88 processes on a 2-socket Xeon E5 2699 v4 system (44 cores, 88 threads). The test case creates 88 processes, each of which mmaps a big anonymous memory area with MAP_HUGETLB and writes to it from the end to the beginning. From each process's point of view, the other processes act as a background workload that generates heavy cache pressure. At the same time, the cache miss rate drops from ~36.3% to ~25.6%, the IPC (instructions per cycle) increases from 0.3 to 0.37, and the time spent in user space is reduced by ~19.3%. Link: http://lkml.kernel.org/r/20180517083539.9242-1-ying.huang@intel.com Signed-off-by: "Huang, Ying" Reviewed-by: Mike Kravetz Cc: Michal Hocko Cc: David Rientjes Cc: Andrea Arcangeli Cc: "Kirill A.
Shutemov" Cc: Andi Kleen Cc: Jan Kara Cc: Matthew Wilcox Cc: Hugh Dickins Cc: Minchan Kim Cc: Shaohua Li Cc: Christopher Lameter Cc: "Aneesh Kumar K.V" Cc: Punit Agrawal Cc: Anshuman Khandual Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/hugetlb.c | 42 +++++++++++++++++++++--------------------- 1 file changed, 21 insertions(+), 21 deletions(-) diff --git a/mm/hugetlb.c b/mm/hugetlb.c index 6c6decc63469..696befffe6f7 100644 --- a/mm/hugetlb.c +++ b/mm/hugetlb.c @@ -3686,6 +3686,7 @@ static int hugetlb_no_page(struct mm_struct *mm, struct vm_area_struct *vma, struct page *page; pte_t new_pte; spinlock_t *ptl; + unsigned long haddr = address & huge_page_mask(h); /* * Currently, we are forced to kill the process in the event the @@ -3716,7 +3717,7 @@ retry: u32 hash; struct vm_fault vmf = { .vma = vma, - .address = address, + .address = haddr, .flags = flags, /* * Hard to debug if it ends up being @@ -3733,14 +3734,14 @@ retry: * fault to make calling code simpler. */ hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, - idx, address); + idx, haddr); mutex_unlock(&hugetlb_fault_mutex_table[hash]); ret = handle_userfault(&vmf, VM_UFFD_MISSING); mutex_lock(&hugetlb_fault_mutex_table[hash]); goto out; } - page = alloc_huge_page(vma, address, 0); + page = alloc_huge_page(vma, haddr, 0); if (IS_ERR(page)) { ret = PTR_ERR(page); if (ret == -ENOMEM) @@ -3789,12 +3790,12 @@ retry: * the spinlock. */ if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { - if (vma_needs_reservation(h, vma, address) < 0) { + if (vma_needs_reservation(h, vma, haddr) < 0) { ret = VM_FAULT_OOM; goto backout_unlocked; } /* Just decrements count, does not deallocate */ - vma_end_reservation(h, vma, address); + vma_end_reservation(h, vma, haddr); } ptl = huge_pte_lock(h, mm, ptep); @@ -3808,17 +3809,17 @@ retry: if (anon_rmap) { ClearPagePrivate(page); - hugepage_add_new_anon_rmap(page, vma, address); + hugepage_add_new_anon_rmap(page, vma, haddr); } else page_dup_rmap(page, true); new_pte = make_huge_pte(vma, page, ((vma->vm_flags & VM_WRITE) && (vma->vm_flags & VM_SHARED))); - set_huge_pte_at(mm, address, ptep, new_pte); + set_huge_pte_at(mm, haddr, ptep, new_pte); hugetlb_count_add(pages_per_huge_page(h), mm); if ((flags & FAULT_FLAG_WRITE) && !(vma->vm_flags & VM_SHARED)) { /* Optimization, do the COW without a second fault */ - ret = hugetlb_cow(mm, vma, address, ptep, page, ptl); + ret = hugetlb_cow(mm, vma, haddr, ptep, page, ptl); } spin_unlock(ptl); @@ -3830,7 +3831,7 @@ backout: spin_unlock(ptl); backout_unlocked: unlock_page(page); - restore_reserve_on_error(h, vma, address, page); + restore_reserve_on_error(h, vma, haddr, page); put_page(page); goto out; } @@ -3883,10 +3884,9 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, struct hstate *h = hstate_vma(vma); struct address_space *mapping; int need_wait_lock = 0; + unsigned long haddr = address & huge_page_mask(h); - address &= huge_page_mask(h); - - ptep = huge_pte_offset(mm, address, huge_page_size(h)); + ptep = huge_pte_offset(mm, haddr, huge_page_size(h)); if (ptep) { entry = huge_ptep_get(ptep); if (unlikely(is_hugetlb_entry_migration(entry))) { @@ -3896,20 +3896,20 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, return VM_FAULT_HWPOISON_LARGE | VM_FAULT_SET_HINDEX(hstate_index(h)); } else { - ptep = huge_pte_alloc(mm, address, huge_page_size(h)); + ptep = huge_pte_alloc(mm, haddr, huge_page_size(h)); if (!ptep) return VM_FAULT_OOM; } mapping = vma->vm_file->f_mapping; - idx = 
vma_hugecache_offset(h, vma, address); + idx = vma_hugecache_offset(h, vma, haddr); /* * Serialize hugepage allocation and instantiation, so that we don't * get spurious allocation failures if two CPUs race to instantiate * the same page in the page cache. */ - hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, address); + hash = hugetlb_fault_mutex_hash(h, mm, vma, mapping, idx, haddr); mutex_lock(&hugetlb_fault_mutex_table[hash]); entry = huge_ptep_get(ptep); @@ -3939,16 +3939,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, * consumed. */ if ((flags & FAULT_FLAG_WRITE) && !huge_pte_write(entry)) { - if (vma_needs_reservation(h, vma, address) < 0) { + if (vma_needs_reservation(h, vma, haddr) < 0) { ret = VM_FAULT_OOM; goto out_mutex; } /* Just decrements count, does not deallocate */ - vma_end_reservation(h, vma, address); + vma_end_reservation(h, vma, haddr); if (!(vma->vm_flags & VM_MAYSHARE)) pagecache_page = hugetlbfs_pagecache_page(h, - vma, address); + vma, haddr); } ptl = huge_pte_lock(h, mm, ptep); @@ -3973,16 +3973,16 @@ int hugetlb_fault(struct mm_struct *mm, struct vm_area_struct *vma, if (flags & FAULT_FLAG_WRITE) { if (!huge_pte_write(entry)) { - ret = hugetlb_cow(mm, vma, address, ptep, + ret = hugetlb_cow(mm, vma, haddr, ptep, pagecache_page, ptl); goto out_put_page; } entry = huge_pte_mkdirty(entry); } entry = pte_mkyoung(entry); - if (huge_ptep_set_access_flags(vma, address, ptep, entry, + if (huge_ptep_set_access_flags(vma, haddr, ptep, entry, flags & FAULT_FLAG_WRITE)) - update_mmu_cache(vma, address, ptep); + update_mmu_cache(vma, haddr, ptep); out_put_page: if (page != pagecache_page) unlock_page(page); From e67d4ca79aaf9d13a00d229b1b1c96b86828e8ba Mon Sep 17 00:00:00 2001 From: Shakeel Butt Date: Thu, 7 Jun 2018 17:08:11 -0700 Subject: [PATCH 059/118] mm: save two stranded bits in gfp_mask ___GFP_COLD and ___GFP_OTHER_NODE were removed but their bits were stranded. Fill the gaps by moving the existing gfp masks around. 
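To make the constraint concrete, here is a minimal sketch (not part of the patch) of a compile-time check that every ___GFP_* bit still fits under __GFP_BITS_SHIFT after the renumbering; ___GFP_NOLOCKDEP is the highest bit when CONFIG_LOCKDEP is enabled, ___GFP_KSWAPD_RECLAIM otherwise:

/* Illustrative only: both assertions hold with the new layout. */
static inline void gfp_bits_sanity_check(void)
{
	BUILD_BUG_ON(___GFP_KSWAPD_RECLAIM >= (1u << __GFP_BITS_SHIFT));
	BUILD_BUG_ON(___GFP_NOLOCKDEP >= (1u << __GFP_BITS_SHIFT));
}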
Link: http://lkml.kernel.org/r/20180516211439.177440-1-shakeelb@google.com Signed-off-by: Shakeel Butt Suggested-by: Vlastimil Babka Acked-by: Michal Hocko Acked-by: Vlastimil Babka Reviewed-by: Andrew Morton Cc: Greg Thelen Cc: Mel Gorman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/gfp.h | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/include/linux/gfp.h b/include/linux/gfp.h index fc5ab85278d5..7f860ea29ec6 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h @@ -24,6 +24,7 @@ struct vm_area_struct; #define ___GFP_HIGH 0x20u #define ___GFP_IO 0x40u #define ___GFP_FS 0x80u +#define ___GFP_WRITE 0x100u #define ___GFP_NOWARN 0x200u #define ___GFP_RETRY_MAYFAIL 0x400u #define ___GFP_NOFAIL 0x800u @@ -36,11 +37,10 @@ struct vm_area_struct; #define ___GFP_THISNODE 0x40000u #define ___GFP_ATOMIC 0x80000u #define ___GFP_ACCOUNT 0x100000u -#define ___GFP_DIRECT_RECLAIM 0x400000u -#define ___GFP_WRITE 0x800000u -#define ___GFP_KSWAPD_RECLAIM 0x1000000u +#define ___GFP_DIRECT_RECLAIM 0x200000u +#define ___GFP_KSWAPD_RECLAIM 0x400000u #ifdef CONFIG_LOCKDEP -#define ___GFP_NOLOCKDEP 0x2000000u +#define ___GFP_NOLOCKDEP 0x800000u #else #define ___GFP_NOLOCKDEP 0 #endif @@ -205,7 +205,7 @@ struct vm_area_struct; #define __GFP_NOLOCKDEP ((__force gfp_t)___GFP_NOLOCKDEP) /* Room for N __GFP_FOO bits */ -#define __GFP_BITS_SHIFT (25 + IS_ENABLED(CONFIG_LOCKDEP)) +#define __GFP_BITS_SHIFT (23 + IS_ENABLED(CONFIG_LOCKDEP)) #define __GFP_BITS_MASK ((__force gfp_t)((1 << __GFP_BITS_SHIFT) - 1)) /* From 620b4e903179d58342503fa09d9c680d93bf7db8 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:15 -0700 Subject: [PATCH 060/118] s390: use _refcount for pgtables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Patch series "Rearrange struct page", v6. As presented at LSFMM, this patch-set rearranges struct page to give more contiguous usable space to users who have allocated a struct page for their own purposes. For a graphical view of before-and-after, see the first two tabs of https://docs.google.com/spreadsheets/d/1tvCszs_7FXrjei9_mtFiKV6nW1FLnYyvPvW-qNZhdog/edit?usp=sharing Highlights: - deferred_list now really exists in struct page instead of just a comment. - hmm_data also exists in struct page instead of being a nasty hack. - x86's PGD pages have a real pointer to the mm_struct. - VMalloc pages now have all sorts of extra information stored in them to help with debugging and tuning. - rcu_head is no longer tied to slab in case anyone else wants to free pages by RCU. - slub's counters no longer share space with _refcount. - slub's freelist+counters are now naturally dword aligned. - slub loses a parameter to a lot of functions and a sysfs file. This patch (of 17): s390 borrows the storage used for _mapcount in struct page in order to account whether the bottom or top half is being used for 2kB page tables. I want to use that for something else, so use the top byte of _refcount instead of the bottom byte of _mapcount. _refcount may temporarily be incremented by other CPUs that see a stale pointer to this page in the page cache, but each CPU can only increment it by one, and there are no systems with 2^24 CPUs today, so they will not change the upper byte of _refcount. We do have to be a little careful not to lose any of their writes (as they will subsequently decrement the counter). 
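A condensed sketch of the scheme (the helper names here are illustrative; atomic_xor_bits() is the existing s390 helper used in the diff below, which returns the new counter value):

/*
 * The 2K-fragment state lives in bits 24+ of _refcount. Toggling it
 * with an atomic XOR preserves concurrent increments of the low 24
 * bits, which a plain atomic_set() would silently overwrite.
 */
static inline unsigned int pgtable_frag_mask(struct page *page)
{
	return atomic_read(&page->_refcount) >> 24;
}

static inline unsigned int pgtable_frag_toggle(struct page *page,
					       unsigned int bit)
{
	return atomic_xor_bits(&page->_refcount, 1U << (bit + 24)) >> 24;
}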
Link: http://lkml.kernel.org/r/20180518194519.3820-2-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Martin Schwidefsky Cc: "Kirill A . Shutemov" Cc: Christoph Lameter Cc: Lai Jiangshan Cc: Pekka Enberg Cc: Vlastimil Babka Cc: Dave Hansen Cc: Jérôme Glisse Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/s390/mm/pgalloc.c | 21 ++++++++++++--------- 1 file changed, 12 insertions(+), 9 deletions(-) diff --git a/arch/s390/mm/pgalloc.c b/arch/s390/mm/pgalloc.c index 562f72955956..84bd6329a88d 100644 --- a/arch/s390/mm/pgalloc.c +++ b/arch/s390/mm/pgalloc.c @@ -190,14 +190,15 @@ unsigned long *page_table_alloc(struct mm_struct *mm) if (!list_empty(&mm->context.pgtable_list)) { page = list_first_entry(&mm->context.pgtable_list, struct page, lru); - mask = atomic_read(&page->_mapcount); + mask = atomic_read(&page->_refcount) >> 24; mask = (mask | (mask >> 4)) & 3; if (mask != 3) { table = (unsigned long *) page_to_phys(page); bit = mask & 1; /* =1 -> second 2K */ if (bit) table += PTRS_PER_PTE; - atomic_xor_bits(&page->_mapcount, 1U << bit); + atomic_xor_bits(&page->_refcount, + 1U << (bit + 24)); list_del(&page->lru); } } @@ -218,12 +219,12 @@ unsigned long *page_table_alloc(struct mm_struct *mm) table = (unsigned long *) page_to_phys(page); if (mm_alloc_pgste(mm)) { /* Return 4K page table with PGSTEs */ - atomic_set(&page->_mapcount, 3); + atomic_xor_bits(&page->_refcount, 3 << 24); memset64((u64 *)table, _PAGE_INVALID, PTRS_PER_PTE); memset64((u64 *)table + PTRS_PER_PTE, 0, PTRS_PER_PTE); } else { /* Return the first 2K fragment of the page */ - atomic_set(&page->_mapcount, 1); + atomic_xor_bits(&page->_refcount, 1 << 24); memset64((u64 *)table, _PAGE_INVALID, 2 * PTRS_PER_PTE); spin_lock_bh(&mm->context.lock); list_add(&page->lru, &mm->context.pgtable_list); @@ -242,7 +243,8 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) /* Free 2K page table fragment of a 4K page */ bit = (__pa(table) & ~PAGE_MASK)/(PTRS_PER_PTE*sizeof(pte_t)); spin_lock_bh(&mm->context.lock); - mask = atomic_xor_bits(&page->_mapcount, 1U << bit); + mask = atomic_xor_bits(&page->_refcount, 1U << (bit + 24)); + mask >>= 24; if (mask & 3) list_add(&page->lru, &mm->context.pgtable_list); else @@ -253,7 +255,6 @@ void page_table_free(struct mm_struct *mm, unsigned long *table) } pgtable_page_dtor(page); - atomic_set(&page->_mapcount, -1); __free_page(page); } @@ -274,7 +275,8 @@ void page_table_free_rcu(struct mmu_gather *tlb, unsigned long *table, } bit = (__pa(table) & ~PAGE_MASK) / (PTRS_PER_PTE*sizeof(pte_t)); spin_lock_bh(&mm->context.lock); - mask = atomic_xor_bits(&page->_mapcount, 0x11U << bit); + mask = atomic_xor_bits(&page->_refcount, 0x11U << (bit + 24)); + mask >>= 24; if (mask & 3) list_add_tail(&page->lru, &mm->context.pgtable_list); else @@ -296,12 +298,13 @@ static void __tlb_remove_table(void *_table) break; case 1: /* lower 2K of a 4K page table */ case 2: /* higher 2K of a 4K page table */ - if (atomic_xor_bits(&page->_mapcount, mask << 4) != 0) + mask = atomic_xor_bits(&page->_refcount, mask << (4 + 24)); + mask >>= 24; + if (mask != 0) break; /* fallthrough */ case 3: /* 4K page table with pgstes */ pgtable_page_dtor(page); - atomic_set(&page->_mapcount, -1); __free_page(page); break; } From 6e292b9be7f4358985ce33ae1f59ab30a8c09e08 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:18 -0700 Subject: [PATCH 061/118] mm: split page_type out from _mapcount MIME-Version: 1.0 Content-Type: text/plain; 
charset=UTF-8 Content-Transfer-Encoding: 8bit We're already using a union of many fields here, so stop abusing the _mapcount and make page_type its own field. That implies renaming some of the machinery that creates PageBuddy, PageBalloon and PageKmemcg; bring back the PG_buddy, PG_balloon and PG_kmemcg names. As suggested by Kirill, make page_type a bitmask. Because it starts out life as -1 (thanks to sharing the storage with _mapcount), setting a page flag means clearing the appropriate bit. This gives us space for probably twenty or so extra bits (depending how paranoid we want to be about _mapcount underflow). Link: http://lkml.kernel.org/r/20180518194519.3820-3-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 13 ++++++----- include/linux/page-flags.h | 45 ++++++++++++++++++++++---------------- kernel/crash_core.c | 1 + mm/page_alloc.c | 13 +++++------ scripts/tags.sh | 6 ++--- 5 files changed, 43 insertions(+), 35 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index eb9da245286d..3a554fdf45c2 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -96,6 +96,14 @@ struct page { }; union { + /* + * If the page is neither PageSlab nor mappable to userspace, + * the value stored here may help determine what this page + * is used for. See page-flags.h for a list of page types + * which are currently stored here. + */ + unsigned int page_type; + _slub_counter_t counters; unsigned int active; /* SLAB */ struct { /* SLUB */ @@ -109,11 +117,6 @@ struct page { /* * Count of ptes mapped in mms, to show when * page is mapped & limit reverse map searches. - * - * Extra information about page type may be - * stored here for pages that are never mapped, - * in which case the value MUST BE <= -2. - * See page-flags.h for more details. */ atomic_t _mapcount; diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index e34a27727b9a..8c25b28a35aa 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -642,49 +642,56 @@ PAGEFLAG_FALSE(DoubleMap) #endif /* - * For pages that are never mapped to userspace, page->mapcount may be - * used for storing extra information about page type. Any value used - * for this purpose must be <= -2, but it's better start not too close - * to -2 so that an underflow of the page_mapcount() won't be mistaken - * for a special page. + * For pages that are never mapped to userspace (and aren't PageSlab), + * page_type may be used. Because it is initialised to -1, we invert the + * sense of the bit, so __SetPageFoo *clears* the bit used for PageFoo, and + * __ClearPageFoo *sets* the bit used for PageFoo. We reserve a few high and + * low bits so that an underflow or overflow of page_mapcount() won't be + * mistaken for a page type value. 
*/ -#define PAGE_MAPCOUNT_OPS(uname, lname) \ + +#define PAGE_TYPE_BASE 0xf0000000 +/* Reserve 0x0000007f to catch underflows of page_mapcount */ +#define PG_buddy 0x00000080 +#define PG_balloon 0x00000100 +#define PG_kmemcg 0x00000200 + +#define PageType(page, flag) \ + ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) + +#define PAGE_TYPE_OPS(uname, lname) \ static __always_inline int Page##uname(struct page *page) \ { \ - return atomic_read(&page->_mapcount) == \ - PAGE_##lname##_MAPCOUNT_VALUE; \ + return PageType(page, PG_##lname); \ } \ static __always_inline void __SetPage##uname(struct page *page) \ { \ - VM_BUG_ON_PAGE(atomic_read(&page->_mapcount) != -1, page); \ - atomic_set(&page->_mapcount, PAGE_##lname##_MAPCOUNT_VALUE); \ + VM_BUG_ON_PAGE(!PageType(page, 0), page); \ + page->page_type &= ~PG_##lname; \ } \ static __always_inline void __ClearPage##uname(struct page *page) \ { \ VM_BUG_ON_PAGE(!Page##uname(page), page); \ - atomic_set(&page->_mapcount, -1); \ + page->page_type |= PG_##lname; \ } /* - * PageBuddy() indicate that the page is free and in the buddy system + * PageBuddy() indicates that the page is free and in the buddy system * (see mm/page_alloc.c). */ -#define PAGE_BUDDY_MAPCOUNT_VALUE (-128) -PAGE_MAPCOUNT_OPS(Buddy, BUDDY) +PAGE_TYPE_OPS(Buddy, buddy) /* - * PageBalloon() is set on pages that are on the balloon page list + * PageBalloon() is true for pages that are on the balloon page list * (see mm/balloon_compaction.c). */ -#define PAGE_BALLOON_MAPCOUNT_VALUE (-256) -PAGE_MAPCOUNT_OPS(Balloon, BALLOON) +PAGE_TYPE_OPS(Balloon, balloon) /* * If kmemcg is enabled, the buddy allocator will set PageKmemcg() on * pages allocated with __GFP_ACCOUNT. It gets cleared on page free. */ -#define PAGE_KMEMCG_MAPCOUNT_VALUE (-512) -PAGE_MAPCOUNT_OPS(Kmemcg, KMEMCG) +PAGE_TYPE_OPS(Kmemcg, kmemcg) extern bool is_free_buddy_page(struct page *page); diff --git a/kernel/crash_core.c b/kernel/crash_core.c index f7674d676889..b66aced5e8c2 100644 --- a/kernel/crash_core.c +++ b/kernel/crash_core.c @@ -460,6 +460,7 @@ static int __init crash_save_vmcoreinfo_init(void) VMCOREINFO_NUMBER(PG_hwpoison); #endif VMCOREINFO_NUMBER(PG_head_mask); +#define PAGE_BUDDY_MAPCOUNT_VALUE (~PG_buddy) VMCOREINFO_NUMBER(PAGE_BUDDY_MAPCOUNT_VALUE); #ifdef CONFIG_HUGETLB_PAGE VMCOREINFO_NUMBER(HUGETLB_PAGE_DTOR); diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 96d48d24e1ef..5afdf495c374 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -705,16 +705,14 @@ static inline void rmv_page_order(struct page *page) /* * This function checks whether a page is free && is the buddy - * we can do coalesce a page and its buddy if + * we can coalesce a page and its buddy if * (a) the buddy is not in a hole (check before calling!) && * (b) the buddy is in the buddy system && * (c) a page and its buddy have the same order && * (d) a page and its buddy are in the same zone. * - * For recording whether a page is in the buddy system, we set ->_mapcount - * PAGE_BUDDY_MAPCOUNT_VALUE. - * Setting, clearing, and testing _mapcount PAGE_BUDDY_MAPCOUNT_VALUE is - * serialized by zone->lock. + * For recording whether a page is in the buddy system, we set PageBuddy. + * Setting, clearing, and testing PageBuddy is serialized by zone->lock. * * For recording page's order, we use page_private(page). */ @@ -759,9 +757,8 @@ static inline int page_is_buddy(struct page *page, struct page *buddy, * as necessary, plus some accounting needed to play nicely with other * parts of the VM system. 
* At each level, we keep a list of pages, which are heads of continuous - * free pages of length of (1 << order) and marked with _mapcount - * PAGE_BUDDY_MAPCOUNT_VALUE. Page's order is recorded in page_private(page) - * field. + * free pages of length of (1 << order) and marked with PageBuddy. + * Page's order is recorded in page_private(page) field. * So when we are allocating or freeing one, we can derive the state of the * other. That is, if we allocate a small block, and both were * free, the remainder of the region must be split into blocks. diff --git a/scripts/tags.sh b/scripts/tags.sh index e587610d1492..66f08bb1cce9 100755 --- a/scripts/tags.sh +++ b/scripts/tags.sh @@ -179,9 +179,9 @@ regex_c=( '/\ Date: Thu, 7 Jun 2018 17:08:23 -0700 Subject: [PATCH 062/118] mm: mark pages in use for page tables MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Define a new PageTable bit in the page_type and use it to mark pages in use as page tables. This can be helpful when debugging crashdumps or analysing memory fragmentation. Add a KPF flag to report these pages to userspace and update page-types.c to interpret that flag. Note that only pages currently accounted as NR_PAGETABLES are tracked as PageTable; this does not include pgd/p4d/pud/pmd pages. Those will be the subject of a later patch. Link: http://lkml.kernel.org/r/20180518194519.3820-4-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/page.c | 2 ++ include/linux/mm.h | 2 ++ include/linux/page-flags.h | 6 ++++++ include/uapi/linux/kernel-page-flags.h | 2 +- tools/vm/page-types.c | 1 + 5 files changed, 12 insertions(+), 1 deletion(-) diff --git a/fs/proc/page.c b/fs/proc/page.c index 1491918a33c3..792c78a49174 100644 --- a/fs/proc/page.c +++ b/fs/proc/page.c @@ -154,6 +154,8 @@ u64 stable_page_flags(struct page *page) if (PageBalloon(page)) u |= 1 << KPF_BALLOON; + if (PageTable(page)) + u |= 1 << KPF_PGTABLE; if (page_is_idle(page)) u |= 1 << KPF_IDLE; diff --git a/include/linux/mm.h b/include/linux/mm.h index d1dc618a56bf..1dd588b7c649 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h @@ -1851,6 +1851,7 @@ static inline bool pgtable_page_ctor(struct page *page) { if (!ptlock_init(page)) return false; + __SetPageTable(page); inc_zone_page_state(page, NR_PAGETABLE); return true; } @@ -1858,6 +1859,7 @@ static inline bool pgtable_page_ctor(struct page *page) static inline void pgtable_page_dtor(struct page *page) { pte_lock_deinit(page); + __ClearPageTable(page); dec_zone_page_state(page, NR_PAGETABLE); } diff --git a/include/linux/page-flags.h b/include/linux/page-flags.h index 8c25b28a35aa..901943e4754b 100644 --- a/include/linux/page-flags.h +++ b/include/linux/page-flags.h @@ -655,6 +655,7 @@ PAGEFLAG_FALSE(DoubleMap) #define PG_buddy 0x00000080 #define PG_balloon 0x00000100 #define PG_kmemcg 0x00000200 +#define PG_table 0x00000400 #define PageType(page, flag) \ ((page->page_type & (PAGE_TYPE_BASE | flag)) == PAGE_TYPE_BASE) @@ -693,6 +694,11 @@ PAGE_TYPE_OPS(Balloon, balloon) */ PAGE_TYPE_OPS(Kmemcg, kmemcg) +/* + * Marks pages in use as page tables. 
+ */ +PAGE_TYPE_OPS(Table, table) + extern bool is_free_buddy_page(struct page *page); __PAGEFLAG(Isolated, isolated, PF_ANY); diff --git a/include/uapi/linux/kernel-page-flags.h b/include/uapi/linux/kernel-page-flags.h index fa139841ec18..21b9113c69da 100644 --- a/include/uapi/linux/kernel-page-flags.h +++ b/include/uapi/linux/kernel-page-flags.h @@ -35,6 +35,6 @@ #define KPF_BALLOON 23 #define KPF_ZERO_PAGE 24 #define KPF_IDLE 25 - +#define KPF_PGTABLE 26 #endif /* _UAPILINUX_KERNEL_PAGE_FLAGS_H */ diff --git a/tools/vm/page-types.c b/tools/vm/page-types.c index a8783f48f77f..cce853dca691 100644 --- a/tools/vm/page-types.c +++ b/tools/vm/page-types.c @@ -131,6 +131,7 @@ static const char * const page_flag_names[] = { [KPF_KSM] = "x:ksm", [KPF_THP] = "t:thp", [KPF_BALLOON] = "o:balloon", + [KPF_PGTABLE] = "g:pgtable", [KPF_ZERO_PAGE] = "z:zero_page", [KPF_IDLE] = "i:idle_page", From d4fc5069a39405d67937b25c0dc9249a9b6c50f1 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:26 -0700 Subject: [PATCH 063/118] mm: switch s_mem and slab_cache in struct page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This will allow us to store slub's counters in the same bits as slab's s_mem. slub now needs to set page->mapping to NULL as it frees the page, just like slab does. Link: http://lkml.kernel.org/r/20180518194519.3820-5-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Christoph Lameter Acked-by: Vlastimil Babka Cc: Dave Hansen Cc: Jérôme Glisse Cc: "Kirill A . Shutemov" Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 4 ++-- mm/slub.c | 1 + 2 files changed, 3 insertions(+), 2 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 3a554fdf45c2..6f153f2fab50 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -83,7 +83,7 @@ struct page { /* See page-flags.h for the definition of PAGE_MAPPING_FLAGS */ struct address_space *mapping; - void *s_mem; /* slab first object */ + struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */ atomic_t compound_mapcount; /* first tail page */ /* page_deferred_list().next -- second tail page */ }; @@ -194,7 +194,7 @@ struct page { spinlock_t ptl; #endif #endif - struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */ + void *s_mem; /* slab first object */ }; #ifdef CONFIG_MEMCG diff --git a/mm/slub.c b/mm/slub.c index 48f75872c356..0170ea8a97fe 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1695,6 +1695,7 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlab(page); page_mapcount_reset(page); + page->mapping = NULL; if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; memcg_uncharge_slab(page, order, s); From 7d27a04bb2b5bd665f147439d18ae236080eef32 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:31 -0700 Subject: [PATCH 064/118] mm: move 'private' union within struct page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit By moving page->private to the fourth word of struct page, we can put the SLUB counters in the same word as SLAB's s_mem and still do the cmpxchg_double trick. Now the SLUB counters no longer overlap with the mapcount or refcount so we can drop the call to page_mapcount_reset() and simplify set_page_slub_counters() to a single line. 
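For context, a minimal sketch (illustrative, not the patch itself) of the slab_lock() fallback path this simplifies: cmpxchg_double() requires freelist and counters to be adjacent and double-word aligned, and now that counters no longer shares storage with _refcount, a direct assignment cannot clobber the refcount:

static bool slab_update_freelist_demo(struct page *page,
		void *freelist_old, unsigned long counters_old,
		void *freelist_new, unsigned long counters_new)
{
	bool ok = false;

	slab_lock(page);
	if (page->freelist == freelist_old &&
	    page->counters == counters_old) {
		page->freelist = freelist_new;
		page->counters = counters_new;	/* safe: no _refcount overlap */
		ok = true;
	}
	slab_unlock(page);
	return ok;
}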
Link: http://lkml.kernel.org/r/20180518194519.3820-6-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Vlastimil Babka Acked-by: Kirill A. Shutemov Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 56 ++++++++++++++++++---------------------- mm/slub.c | 20 ++------------ 2 files changed, 27 insertions(+), 49 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 6f153f2fab50..bcc5ee8b7b07 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -65,15 +65,9 @@ struct hmm; */ #ifdef CONFIG_HAVE_ALIGNED_STRUCT_PAGE #define _struct_page_alignment __aligned(2 * sizeof(unsigned long)) -#if defined(CONFIG_HAVE_CMPXCHG_DOUBLE) -#define _slub_counter_t unsigned long #else -#define _slub_counter_t unsigned int -#endif -#else /* !CONFIG_HAVE_ALIGNED_STRUCT_PAGE */ #define _struct_page_alignment -#define _slub_counter_t unsigned int -#endif /* !CONFIG_HAVE_ALIGNED_STRUCT_PAGE */ +#endif struct page { /* First double word block */ @@ -95,6 +89,30 @@ struct page { /* page_deferred_list().prev -- second tail page */ }; + union { + /* + * Mapping-private opaque data: + * Usually used for buffer_heads if PagePrivate + * Used for swp_entry_t if PageSwapCache + * Indicates order in the buddy system if PageBuddy + */ + unsigned long private; +#if USE_SPLIT_PTE_PTLOCKS +#if ALLOC_SPLIT_PTLOCKS + spinlock_t *ptl; +#else + spinlock_t ptl; +#endif +#endif + void *s_mem; /* slab first object */ + unsigned long counters; /* SLUB */ + struct { /* SLUB */ + unsigned inuse:16; + unsigned objects:15; + unsigned frozen:1; + }; + }; + union { /* * If the page is neither PageSlab nor mappable to userspace, @@ -104,13 +122,7 @@ struct page { */ unsigned int page_type; - _slub_counter_t counters; unsigned int active; /* SLAB */ - struct { /* SLUB */ - unsigned inuse:16; - unsigned objects:15; - unsigned frozen:1; - }; int units; /* SLOB */ struct { /* Page cache */ @@ -179,24 +191,6 @@ struct page { #endif }; - union { - /* - * Mapping-private opaque data: - * Usually used for buffer_heads if PagePrivate - * Used for swp_entry_t if PageSwapCache - * Indicates order in the buddy system if PageBuddy - */ - unsigned long private; -#if USE_SPLIT_PTE_PTLOCKS -#if ALLOC_SPLIT_PTLOCKS - spinlock_t *ptl; -#else - spinlock_t ptl; -#endif -#endif - void *s_mem; /* slab first object */ - }; - #ifdef CONFIG_MEMCG struct mem_cgroup *mem_cgroup; #endif diff --git a/mm/slub.c b/mm/slub.c index 0170ea8a97fe..f5db87839ab4 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -356,21 +356,6 @@ static __always_inline void slab_unlock(struct page *page) __bit_spin_unlock(PG_locked, &page->flags); } -static inline void set_page_slub_counters(struct page *page, unsigned long counters_new) -{ - struct page tmp; - tmp.counters = counters_new; - /* - * page->counters can cover frozen/inuse/objects as well - * as page->_refcount. If we assign to ->counters directly - * we run the risk of losing updates to page->_refcount, so - * be careful and only assign to the fields we need. 
- */ - page->frozen = tmp.frozen; - page->inuse = tmp.inuse; - page->objects = tmp.objects; -} - /* Interrupts must be disabled (for the fallback code to work right) */ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page, void *freelist_old, unsigned long counters_old, @@ -392,7 +377,7 @@ static inline bool __cmpxchg_double_slab(struct kmem_cache *s, struct page *page if (page->freelist == freelist_old && page->counters == counters_old) { page->freelist = freelist_new; - set_page_slub_counters(page, counters_new); + page->counters = counters_new; slab_unlock(page); return true; } @@ -431,7 +416,7 @@ static inline bool cmpxchg_double_slab(struct kmem_cache *s, struct page *page, if (page->freelist == freelist_old && page->counters == counters_old) { page->freelist = freelist_new; - set_page_slub_counters(page, counters_new); + page->counters = counters_new; slab_unlock(page); local_irq_restore(flags); return true; @@ -1694,7 +1679,6 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __ClearPageSlabPfmemalloc(page); __ClearPageSlab(page); - page_mapcount_reset(page); page->mapping = NULL; if (current->reclaim_state) current->reclaim_state->reclaimed_slab += pages; From b21999da02f891aafa362252f4db7ba5c34a9a8d Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:35 -0700 Subject: [PATCH 065/118] mm: move _refcount out of struct page union MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Keeping the refcount in the union only encourages people to put something else in the union which will overlap with _refcount and eventually explode messily. pahole reports no fields change location. Link: http://lkml.kernel.org/r/20180518194519.3820-7-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Vlastimil Babka Acked-by: Kirill A. Shutemov Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 25 ++++++++++--------------- 1 file changed, 10 insertions(+), 15 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index bcc5ee8b7b07..0b0c0e224011 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -113,7 +113,13 @@ struct page { }; }; - union { + union { /* This union is 4 bytes in size. */ + /* + * If the page can be mapped to userspace, encodes the number + * of times this page is referenced by a page table. + */ + atomic_t _mapcount; + /* * If the page is neither PageSlab nor mappable to userspace, * the value stored here may help determine what this page @@ -124,22 +130,11 @@ struct page { unsigned int active; /* SLAB */ int units; /* SLOB */ - - struct { /* Page cache */ - /* - * Count of ptes mapped in mms, to show when - * page is mapped & limit reverse map searches. - */ - atomic_t _mapcount; - - /* - * Usage count, *USE WRAPPER FUNCTION* when manual - * accounting. See page_ref.h - */ - atomic_t _refcount; - }; }; + /* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */ + atomic_t _refcount; + /* * WARNING: bit 0 of the first word encode PageTail(). 
That means * the rest users of the storage space MUST NOT use the bit to From 66a6ffd2af42b0bafaecccbd7c1c30d386de21ab Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:39 -0700 Subject: [PATCH 066/118] mm: combine first three unions in struct page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit By combining these three one-word unions into one three-word union, we make it easier for users to add their own multi-word fields to struct page, as well as making it obvious that SLUB needs to keep its double-word alignment for its freelist & counters. No field moves position; verified with pahole. Link: http://lkml.kernel.org/r/20180518194519.3820-8-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Kirill A. Shutemov Acked-by: Vlastimil Babka Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 66 ++++++++++++++++++++-------------------- 1 file changed, 33 insertions(+), 33 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 0b0c0e224011..158f9693f865 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -70,46 +70,46 @@ struct hmm; #endif struct page { - /* First double word block */ unsigned long flags; /* Atomic flags, some possibly * updated asynchronously */ + /* Three words (12/24 bytes) are available in this union. */ union { - /* See page-flags.h for the definition of PAGE_MAPPING_FLAGS */ - struct address_space *mapping; - - struct kmem_cache *slab_cache; /* SL[AU]B: Pointer to slab */ + struct { /* Page cache and anonymous pages */ + /* See page-flags.h for PAGE_MAPPING_FLAGS */ + struct address_space *mapping; + pgoff_t index; /* Our offset within mapping. */ + /** + * @private: Mapping-private opaque data. + * Usually used for buffer_heads if PagePrivate. + * Used for swp_entry_t if PageSwapCache. + * Indicates order in the buddy system if PageBuddy. + */ + unsigned long private; + }; + struct { /* slab, slob and slub */ + struct kmem_cache *slab_cache; /* not slob */ + /* Double-word boundary */ + void *freelist; /* first free object */ + union { + void *s_mem; /* slab: first object */ + unsigned long counters; /* SLUB */ + struct { /* SLUB */ + unsigned inuse:16; + unsigned objects:15; + unsigned frozen:1; + }; + }; + }; atomic_t compound_mapcount; /* first tail page */ - /* page_deferred_list().next -- second tail page */ - }; - - /* Second double word */ - union { - pgoff_t index; /* Our offset within mapping. 
*/ - void *freelist; /* sl[aou]b first free object */ - /* page_deferred_list().prev -- second tail page */ - }; - - union { - /* - * Mapping-private opaque data: - * Usually used for buffer_heads if PagePrivate - * Used for swp_entry_t if PageSwapCache - * Indicates order in the buddy system if PageBuddy - */ - unsigned long private; -#if USE_SPLIT_PTE_PTLOCKS + struct list_head deferred_list; /* second tail page */ + struct { /* Page table pages */ + unsigned long _pt_pad_2; /* mapping */ + unsigned long _pt_pad_3; #if ALLOC_SPLIT_PTLOCKS - spinlock_t *ptl; + spinlock_t *ptl; #else - spinlock_t ptl; + spinlock_t ptl; #endif -#endif - void *s_mem; /* slab first object */ - unsigned long counters; /* SLUB */ - struct { /* SLUB */ - unsigned inuse:16; - unsigned objects:15; - unsigned frozen:1; }; }; From fa3015b7eed5da68a40c7667e8b3d5c20ef56e82 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:42 -0700 Subject: [PATCH 067/118] mm: use page->deferred_list MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Now that we can represent the location of 'deferred_list' in C instead of comments, make use of that ability. Link: http://lkml.kernel.org/r/20180518194519.3820-9-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Vlastimil Babka Acked-by: Kirill A. Shutemov Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/huge_memory.c | 7 ++----- mm/page_alloc.c | 2 +- 2 files changed, 3 insertions(+), 6 deletions(-) diff --git a/mm/huge_memory.c b/mm/huge_memory.c index ac5591d8622c..ba8fdc0b6e7f 100644 --- a/mm/huge_memory.c +++ b/mm/huge_memory.c @@ -483,11 +483,8 @@ pmd_t maybe_pmd_mkwrite(pmd_t pmd, struct vm_area_struct *vma) static inline struct list_head *page_deferred_list(struct page *page) { - /* - * ->lru in the tail pages is occupied by compound_head. - * Let's use ->mapping + ->index in the second tail page as list_head. - */ - return (struct list_head *)&page[2].mapping; + /* ->lru in the tail pages is occupied by compound_head. */ + return &page[2].deferred_list; } void prep_transhuge_page(struct page *page) diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 5afdf495c374..628fd3e24731 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -952,7 +952,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) case 2: /* * the second tail page: ->mapping is - * page_deferred_list().next -- ignore value. + * deferred_list.next -- ignore value. */ break; default: From b7ccc7f8c643de0b71cd2da3245d0c27038a8dd7 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:46 -0700 Subject: [PATCH 068/118] mm: move lru union within struct page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Since the LRU is two words, this does not affect the double-word alignment of SLUB's freelist. Link: http://lkml.kernel.org/r/20180518194519.3820-10-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Vlastimil Babka Acked-by: Kirill A. 
Shutemov Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 102 +++++++++++++++++++-------------------- mm/slub.c | 8 +-- 2 files changed, 55 insertions(+), 55 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 158f9693f865..5b97bd445ada 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -72,6 +72,57 @@ struct hmm; struct page { unsigned long flags; /* Atomic flags, some possibly * updated asynchronously */ + /* + * WARNING: bit 0 of the first word encode PageTail(). That means + * the rest users of the storage space MUST NOT use the bit to + * avoid collision and false-positive PageTail(). + */ + union { + struct list_head lru; /* Pageout list, eg. active_list + * protected by zone_lru_lock ! + * Can be used as a generic list + * by the page owner. + */ + struct dev_pagemap *pgmap; /* ZONE_DEVICE pages are never on an + * lru or handled by a slab + * allocator, this points to the + * hosting device page map. + */ + struct { /* slub per cpu partial pages */ + struct page *next; /* Next partial slab */ +#ifdef CONFIG_64BIT + int pages; /* Nr of partial slabs left */ + int pobjects; /* Approximate # of objects */ +#else + short int pages; + short int pobjects; +#endif + }; + + struct rcu_head rcu_head; /* Used by SLAB + * when destroying via RCU + */ + /* Tail pages of compound page */ + struct { + unsigned long compound_head; /* If bit zero is set */ + + /* First tail page only */ + unsigned char compound_dtor; + unsigned char compound_order; + /* two/six bytes available here */ + }; + +#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS + struct { + unsigned long __pad; /* do not overlay pmd_huge_pte + * with compound_head to avoid + * possible bit 0 collision. + */ + pgtable_t pmd_huge_pte; /* protected by page->ptl */ + }; +#endif + }; + /* Three words (12/24 bytes) are available in this union. */ union { struct { /* Page cache and anonymous pages */ @@ -135,57 +186,6 @@ struct page { /* Usage count. *DO NOT USE DIRECTLY*. See page_ref.h */ atomic_t _refcount; - /* - * WARNING: bit 0 of the first word encode PageTail(). That means - * the rest users of the storage space MUST NOT use the bit to - * avoid collision and false-positive PageTail(). - */ - union { - struct list_head lru; /* Pageout list, eg. active_list - * protected by zone_lru_lock ! - * Can be used as a generic list - * by the page owner. - */ - struct dev_pagemap *pgmap; /* ZONE_DEVICE pages are never on an - * lru or handled by a slab - * allocator, this points to the - * hosting device page map. - */ - struct { /* slub per cpu partial pages */ - struct page *next; /* Next partial slab */ -#ifdef CONFIG_64BIT - int pages; /* Nr of partial slabs left */ - int pobjects; /* Approximate # of objects */ -#else - short int pages; - short int pobjects; -#endif - }; - - struct rcu_head rcu_head; /* Used by SLAB - * when destroying via RCU - */ - /* Tail pages of compound page */ - struct { - unsigned long compound_head; /* If bit zero is set */ - - /* First tail page only */ - unsigned char compound_dtor; - unsigned char compound_order; - /* two/six bytes available here */ - }; - -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS - struct { - unsigned long __pad; /* do not overlay pmd_huge_pte - * with compound_head to avoid - * possible bit 0 collision. 
- */ - pgtable_t pmd_huge_pte; /* protected by page->ptl */ - }; -#endif - }; - #ifdef CONFIG_MEMCG struct mem_cgroup *mem_cgroup; #endif diff --git a/mm/slub.c b/mm/slub.c index f5db87839ab4..a96bf429af08 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -52,11 +52,11 @@ * and to synchronize major metadata changes to slab cache structures. * * The slab_lock is only used for debugging and on arches that do not - * have the ability to do a cmpxchg_double. It only protects the second - * double word in the page struct. Meaning + * have the ability to do a cmpxchg_double. It only protects: * A. page->freelist -> List of object free in a page - * B. page->counters -> Counters of objects - * C. page->frozen -> frozen state + * B. page->inuse -> Number of objects in use + * C. page->objects -> Number of objects in page + * D. page->frozen -> frozen state * * If a slab is frozen then it is exempt from list management. It is not * on any list. The processor that froze the slab is the one who can From 4da1984edbbed0ffc961006d265d78a7b0095ea4 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:50 -0700 Subject: [PATCH 069/118] mm: combine LRU and main union in struct page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit This gives us five words of space in a single union in struct page. The compound_mapcount moves position (from offset 24 to offset 20) on 64-bit systems, but that does not seem likely to cause any trouble. Link: http://lkml.kernel.org/r/20180518194519.3820-11-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Vlastimil Babka Acked-by: Kirill A. Shutemov Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 97 +++++++++++++++++++--------------------- mm/page_alloc.c | 2 +- 2 files changed, 47 insertions(+), 52 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 5b97bd445ada..dfe6d648e055 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -73,59 +73,19 @@ struct page { unsigned long flags; /* Atomic flags, some possibly * updated asynchronously */ /* - * WARNING: bit 0 of the first word encode PageTail(). That means - * the rest users of the storage space MUST NOT use the bit to + * Five words (20/40 bytes) are available in this union. + * WARNING: bit 0 of the first word is used for PageTail(). That + * means the other users of this union MUST NOT use the bit to * avoid collision and false-positive PageTail(). */ - union { - struct list_head lru; /* Pageout list, eg. active_list - * protected by zone_lru_lock ! - * Can be used as a generic list - * by the page owner. - */ - struct dev_pagemap *pgmap; /* ZONE_DEVICE pages are never on an - * lru or handled by a slab - * allocator, this points to the - * hosting device page map. 
- */ - struct { /* slub per cpu partial pages */ - struct page *next; /* Next partial slab */ -#ifdef CONFIG_64BIT - int pages; /* Nr of partial slabs left */ - int pobjects; /* Approximate # of objects */ -#else - short int pages; - short int pobjects; -#endif - }; - - struct rcu_head rcu_head; /* Used by SLAB - * when destroying via RCU - */ - /* Tail pages of compound page */ - struct { - unsigned long compound_head; /* If bit zero is set */ - - /* First tail page only */ - unsigned char compound_dtor; - unsigned char compound_order; - /* two/six bytes available here */ - }; - -#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && USE_SPLIT_PMD_PTLOCKS - struct { - unsigned long __pad; /* do not overlay pmd_huge_pte - * with compound_head to avoid - * possible bit 0 collision. - */ - pgtable_t pmd_huge_pte; /* protected by page->ptl */ - }; -#endif - }; - - /* Three words (12/24 bytes) are available in this union. */ union { struct { /* Page cache and anonymous pages */ + /** + * @lru: Pageout list, eg. active_list protected by + * zone_lru_lock. Sometimes used as a generic list + * by the page owner. + */ + struct list_head lru; /* See page-flags.h for PAGE_MAPPING_FLAGS */ struct address_space *mapping; pgoff_t index; /* Our offset within mapping. */ @@ -138,6 +98,19 @@ struct page { unsigned long private; }; struct { /* slab, slob and slub */ + union { + struct list_head slab_list; /* uses lru */ + struct { /* Partial pages */ + struct page *next; +#ifdef CONFIG_64BIT + int pages; /* Nr of pages left */ + int pobjects; /* Approximate count */ +#else + short int pages; + short int pobjects; +#endif + }; + }; struct kmem_cache *slab_cache; /* not slob */ /* Double-word boundary */ void *freelist; /* first free object */ @@ -151,9 +124,22 @@ struct page { }; }; }; - atomic_t compound_mapcount; /* first tail page */ - struct list_head deferred_list; /* second tail page */ + struct { /* Tail pages of compound page */ + unsigned long compound_head; /* Bit zero is set */ + + /* First tail page only */ + unsigned char compound_dtor; + unsigned char compound_order; + atomic_t compound_mapcount; + }; + struct { /* Second tail page of compound page */ + unsigned long _compound_pad_1; /* compound_head */ + unsigned long _compound_pad_2; + struct list_head deferred_list; + }; struct { /* Page table pages */ + unsigned long _pt_pad_1; /* compound_head */ + pgtable_t pmd_huge_pte; /* protected by page->ptl */ unsigned long _pt_pad_2; /* mapping */ unsigned long _pt_pad_3; #if ALLOC_SPLIT_PTLOCKS @@ -162,6 +148,15 @@ struct page { spinlock_t ptl; #endif }; + + /** @rcu_head: You can use this to free a page by RCU. */ + struct rcu_head rcu_head; + + /** + * @pgmap: For ZONE_DEVICE pages, this points to the hosting + * device page map. + */ + struct dev_pagemap *pgmap; }; union { /* This union is 4 bytes in size. 
*/ diff --git a/mm/page_alloc.c b/mm/page_alloc.c index 628fd3e24731..ef1811531999 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c @@ -943,7 +943,7 @@ static int free_tail_pages_check(struct page *head_page, struct page *page) } switch (page - head_page) { case 1: - /* the first tail page: ->mapping is compound_mapcount() */ + /* the first tail page: ->mapping may be compound_mapcount() */ if (unlikely(compound_mapcount(page))) { bad_page(page, "nonzero compound_mapcount", 0); goto out; From 97b4a671987123009724dfbdf188ec6e122573d4 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:53 -0700 Subject: [PATCH 070/118] mm: improve struct page documentation MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Rewrite the documentation to describe what you can use in struct page rather than what you can't. Link: http://lkml.kernel.org/r/20180518194519.3820-12-willy@infradead.org Signed-off-by: Matthew Wilcox Reviewed-by: Randy Dunlap Acked-by: Vlastimil Babka Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: "Kirill A . Shutemov" Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mm_types.h | 40 +++++++++++++++++++--------------------- 1 file changed, 19 insertions(+), 21 deletions(-) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index dfe6d648e055..9b45760cc797 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -33,29 +33,27 @@ struct hmm; * it to keep track of whatever it is we are using the page for at the * moment. Note that we have no way to track which tasks are using * a page, though if it is a pagecache page, rmap structures can tell us - * who is mapping it. If you allocate the page using alloc_pages(), you - * can use some of the space in struct page for your own purposes. + * who is mapping it. * - * Pages that were once in the page cache may be found under the RCU lock - * even after they have been recycled to a different purpose. The page - * cache reads and writes some of the fields in struct page to pin the - * page before checking that it's still in the page cache. It is vital - * that all users of struct page: - * 1. Use the first word as PageFlags. - * 2. Clear or preserve bit 0 of page->compound_head. It is used as - * PageTail for compound pages, and the page cache must not see false - * positives. Some users put a pointer here (guaranteed to be at least - * 4-byte aligned), other users avoid using the field altogether. - * 3. page->_refcount must either not be used, or must be used in such a - * way that other CPUs temporarily incrementing and then decrementing the - * refcount does not cause problems. On receiving the page from - * alloc_pages(), the refcount will be positive. - * 4. Either preserve page->_mapcount or restore it to -1 before freeing it. + * If you allocate the page using alloc_pages(), you can use some of the + * space in struct page for your own purposes. The five words in the main + * union are available, except for bit 0 of the first word which must be + * kept clear. Many users use this word to store a pointer to an object + * which is guaranteed to be aligned. If you use the same storage as + * page->mapping, you must restore it to NULL before freeing the page. 
* - * If you allocate pages of order > 0, you can use the fields in the struct - * page associated with each page, but bear in mind that the pages may have - * been inserted individually into the page cache, so you must use the above - * four fields in a compatible way for each struct page. + * If your page will not be mapped to userspace, you can also use the four + * bytes in the mapcount union, but you must call page_mapcount_reset() + * before freeing it. + * + * If you want to use the refcount field, it must be used in such a way + * that other CPUs temporarily incrementing and then decrementing the + * refcount does not cause problems. On receiving the page from + * alloc_pages(), the refcount will be positive. + * + * If you allocate pages of order > 0, you can use some of the fields + * in each subpage, but you may need to restore some of their values + * afterwards. * * SLUB uses cmpxchg_double() to atomically update its freelist and * counters. That requires that freelist & counters be adjacent and From a052f0a5168ef99c517f6638aaec59f39060f81d Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:08:57 -0700 Subject: [PATCH 071/118] mm: add pt_mm to struct page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit For pgd page table pages, x86 overloads the page->index field to store a pointer to the mm_struct. Rename this to pt_mm so it's visible to other users. Link: http://lkml.kernel.org/r/20180518194519.3820-13-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Vlastimil Babka Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: "Kirill A . Shutemov" Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- arch/x86/mm/pgtable.c | 5 ++--- include/linux/mm_types.h | 2 +- 2 files changed, 3 insertions(+), 4 deletions(-) diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c index ffc8c13c50e4..938dbcd46b97 100644 --- a/arch/x86/mm/pgtable.c +++ b/arch/x86/mm/pgtable.c @@ -114,13 +114,12 @@ static inline void pgd_list_del(pgd_t *pgd) static void pgd_set_mm(pgd_t *pgd, struct mm_struct *mm) { - BUILD_BUG_ON(sizeof(virt_to_page(pgd)->index) < sizeof(mm)); - virt_to_page(pgd)->index = (pgoff_t)mm; + virt_to_page(pgd)->pt_mm = mm; } struct mm_struct *pgd_page_get_mm(struct page *page) { - return (struct mm_struct *)page->index; + return page->pt_mm; } static void pgd_ctor(struct mm_struct *mm, pgd_t *pgd) diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 9b45760cc797..811aa0c71e1a 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -139,7 +139,7 @@ struct page { unsigned long _pt_pad_1; /* compound_head */ pgtable_t pmd_huge_pte; /* protected by page->ptl */ unsigned long _pt_pad_2; /* mapping */ - unsigned long _pt_pad_3; + struct mm_struct *pt_mm; /* x86 pgds only */ #if ALLOC_SPLIT_PTLOCKS spinlock_t *ptl; #else From 50e7fbc3bf0c3e41af0075f17a9e4ddefa82e1b5 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:09:01 -0700 Subject: [PATCH 072/118] mm: add hmm_data to struct page MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Make hmm_data an explicit member of the struct page union. Link: http://lkml.kernel.org/r/20180518194519.3820-14-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Vlastimil Babka Cc: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: "Kirill A . 
Shutemov" Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/hmm.h | 8 ++------ include/linux/mm_types.h | 12 ++++++------ 2 files changed, 8 insertions(+), 12 deletions(-) diff --git a/include/linux/hmm.h b/include/linux/hmm.h index 2f1327c37a63..4c92e3ba3e16 100644 --- a/include/linux/hmm.h +++ b/include/linux/hmm.h @@ -522,9 +522,7 @@ void hmm_devmem_remove(struct hmm_devmem *devmem); static inline void hmm_devmem_page_set_drvdata(struct page *page, unsigned long data) { - unsigned long *drvdata = (unsigned long *)&page->pgmap; - - drvdata[1] = data; + page->hmm_data = data; } /* @@ -535,9 +533,7 @@ static inline void hmm_devmem_page_set_drvdata(struct page *page, */ static inline unsigned long hmm_devmem_page_get_drvdata(const struct page *page) { - const unsigned long *drvdata = (const unsigned long *)&page->pgmap; - - return drvdata[1]; + return page->hmm_data; } diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h index 811aa0c71e1a..99ce070e7dcb 100644 --- a/include/linux/mm_types.h +++ b/include/linux/mm_types.h @@ -146,15 +146,15 @@ struct page { spinlock_t ptl; #endif }; + struct { /* ZONE_DEVICE pages */ + /** @pgmap: Points to the hosting device page map. */ + struct dev_pagemap *pgmap; + unsigned long hmm_data; + unsigned long _zd_pad_1; /* uses mapping */ + }; /** @rcu_head: You can use this to free a page by RCU. */ struct rcu_head rcu_head; - - /** - * @pgmap: For ZONE_DEVICE pages, this points to the hosting - * device page map. - */ - struct dev_pagemap *pgmap; }; union { /* This union is 4 bytes in size. */ From bf68c214df6d73e85fc19b6bbcb642554b491b98 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:09:05 -0700 Subject: [PATCH 073/118] slab,slub: remove rcu_head size checks MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit rcu_head may now grow larger than list_head without affecting slab or slub. Link: http://lkml.kernel.org/r/20180518194519.3820-15-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Christoph Lameter Acked-by: Vlastimil Babka Cc: Dave Hansen Cc: Jérôme Glisse Cc: "Kirill A . 
Shutemov" Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slab.c | 2 -- mm/slub.c | 27 ++------------------------- 2 files changed, 2 insertions(+), 27 deletions(-) diff --git a/mm/slab.c b/mm/slab.c index c1fe8099b3cd..36688f6c87eb 100644 --- a/mm/slab.c +++ b/mm/slab.c @@ -1235,8 +1235,6 @@ void __init kmem_cache_init(void) { int i; - BUILD_BUG_ON(sizeof(((struct page *)NULL)->lru) < - sizeof(struct rcu_head)); kmem_cache = &kmem_cache_boot; if (!IS_ENABLED(CONFIG_NUMA) || num_possible_nodes() == 1) diff --git a/mm/slub.c b/mm/slub.c index a96bf429af08..d5bddf0f4792 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -1686,17 +1686,9 @@ static void __free_slab(struct kmem_cache *s, struct page *page) __free_pages(page, order); } -#define need_reserve_slab_rcu \ - (sizeof(((struct page *)NULL)->lru) < sizeof(struct rcu_head)) - static void rcu_free_slab(struct rcu_head *h) { - struct page *page; - - if (need_reserve_slab_rcu) - page = virt_to_head_page(h); - else - page = container_of((struct list_head *)h, struct page, lru); + struct page *page = container_of(h, struct page, rcu_head); __free_slab(page->slab_cache, page); } @@ -1704,19 +1696,7 @@ static void rcu_free_slab(struct rcu_head *h) static void free_slab(struct kmem_cache *s, struct page *page) { if (unlikely(s->flags & SLAB_TYPESAFE_BY_RCU)) { - struct rcu_head *head; - - if (need_reserve_slab_rcu) { - int order = compound_order(page); - int offset = (PAGE_SIZE << order) - s->reserved; - - VM_BUG_ON(s->reserved != sizeof(*head)); - head = page_address(page) + offset; - } else { - head = &page->rcu_head; - } - - call_rcu(head, rcu_free_slab); + call_rcu(&page->rcu_head, rcu_free_slab); } else __free_slab(s, page); } @@ -3583,9 +3563,6 @@ static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) s->random = get_random_long(); #endif - if (need_reserve_slab_rcu && (s->flags & SLAB_TYPESAFE_BY_RCU)) - s->reserved = sizeof(struct rcu_head); - if (!calculate_sizes(s, -1)) goto error; if (disable_higher_order_debug) { From 9736d2a95e36ac3f60b063f498961103f3d4f165 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:09:10 -0700 Subject: [PATCH 074/118] slub: remove kmem_cache->reserved MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The reserved field was only used for embedding an rcu_head in the data structure. With the previous commit, we no longer need it. That lets us remove the 'reserved' argument to a lot of functions. Link: http://lkml.kernel.org/r/20180518194519.3820-16-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: "Kirill A . 
Shutemov" Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Vlastimil Babka Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/slub_def.h | 1 - mm/slub.c | 41 ++++++++++++++++++++-------------------- 2 files changed, 20 insertions(+), 22 deletions(-) diff --git a/include/linux/slub_def.h b/include/linux/slub_def.h index 3773e26c08c1..09fa2c6f0e68 100644 --- a/include/linux/slub_def.h +++ b/include/linux/slub_def.h @@ -101,7 +101,6 @@ struct kmem_cache { void (*ctor)(void *); unsigned int inuse; /* Offset to metadata */ unsigned int align; /* Alignment */ - unsigned int reserved; /* Reserved bytes at the end of slabs */ unsigned int red_left_pad; /* Left redzone padding size */ const char *name; /* Name (only for display!) */ struct list_head list; /* List of slab caches */ diff --git a/mm/slub.c b/mm/slub.c index d5bddf0f4792..f885dcf09750 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -316,16 +316,16 @@ static inline unsigned int slab_index(void *p, struct kmem_cache *s, void *addr) return (p - addr) / s->size; } -static inline unsigned int order_objects(unsigned int order, unsigned int size, unsigned int reserved) +static inline unsigned int order_objects(unsigned int order, unsigned int size) { - return (((unsigned int)PAGE_SIZE << order) - reserved) / size; + return ((unsigned int)PAGE_SIZE << order) / size; } static inline struct kmem_cache_order_objects oo_make(unsigned int order, - unsigned int size, unsigned int reserved) + unsigned int size) { struct kmem_cache_order_objects x = { - (order << OO_SHIFT) + order_objects(order, size, reserved) + (order << OO_SHIFT) + order_objects(order, size) }; return x; @@ -832,7 +832,7 @@ static int slab_pad_check(struct kmem_cache *s, struct page *page) return 1; start = page_address(page); - length = (PAGE_SIZE << compound_order(page)) - s->reserved; + length = PAGE_SIZE << compound_order(page); end = start + length; remainder = length % s->size; if (!remainder) @@ -921,7 +921,7 @@ static int check_slab(struct kmem_cache *s, struct page *page) return 0; } - maxobj = order_objects(compound_order(page), s->size, s->reserved); + maxobj = order_objects(compound_order(page), s->size); if (page->objects > maxobj) { slab_err(s, page, "objects %u > max %u", page->objects, maxobj); @@ -971,7 +971,7 @@ static int on_freelist(struct kmem_cache *s, struct page *page, void *search) nr++; } - max_objects = order_objects(compound_order(page), s->size, s->reserved); + max_objects = order_objects(compound_order(page), s->size); if (max_objects > MAX_OBJS_PER_PAGE) max_objects = MAX_OBJS_PER_PAGE; @@ -3193,21 +3193,21 @@ static unsigned int slub_min_objects; */ static inline unsigned int slab_order(unsigned int size, unsigned int min_objects, unsigned int max_order, - unsigned int fract_leftover, unsigned int reserved) + unsigned int fract_leftover) { unsigned int min_order = slub_min_order; unsigned int order; - if (order_objects(min_order, size, reserved) > MAX_OBJS_PER_PAGE) + if (order_objects(min_order, size) > MAX_OBJS_PER_PAGE) return get_order(size * MAX_OBJS_PER_PAGE) - 1; - for (order = max(min_order, (unsigned int)get_order(min_objects * size + reserved)); + for (order = max(min_order, (unsigned int)get_order(min_objects * size)); order <= max_order; order++) { unsigned int slab_size = (unsigned int)PAGE_SIZE << order; unsigned int rem; - rem = (slab_size - reserved) % size; + rem = slab_size % size; if (rem <= slab_size / fract_leftover) break; @@ -3216,7 +3216,7 @@ static inline 
unsigned int slab_order(unsigned int size, return order; } -static inline int calculate_order(unsigned int size, unsigned int reserved) +static inline int calculate_order(unsigned int size) { unsigned int order; unsigned int min_objects; @@ -3233,7 +3233,7 @@ static inline int calculate_order(unsigned int size, unsigned int reserved) min_objects = slub_min_objects; if (!min_objects) min_objects = 4 * (fls(nr_cpu_ids) + 1); - max_objects = order_objects(slub_max_order, size, reserved); + max_objects = order_objects(slub_max_order, size); min_objects = min(min_objects, max_objects); while (min_objects > 1) { @@ -3242,7 +3242,7 @@ static inline int calculate_order(unsigned int size, unsigned int reserved) fraction = 16; while (fraction >= 4) { order = slab_order(size, min_objects, - slub_max_order, fraction, reserved); + slub_max_order, fraction); if (order <= slub_max_order) return order; fraction /= 2; @@ -3254,14 +3254,14 @@ static inline int calculate_order(unsigned int size, unsigned int reserved) * We were unable to place multiple objects in a slab. Now * lets see if we can place a single object there. */ - order = slab_order(size, 1, slub_max_order, 1, reserved); + order = slab_order(size, 1, slub_max_order, 1); if (order <= slub_max_order) return order; /* * Doh this slab cannot be placed using slub_max_order. */ - order = slab_order(size, 1, MAX_ORDER, 1, reserved); + order = slab_order(size, 1, MAX_ORDER, 1); if (order < MAX_ORDER) return order; return -ENOSYS; @@ -3529,7 +3529,7 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) if (forced_order >= 0) order = forced_order; else - order = calculate_order(size, s->reserved); + order = calculate_order(size); if ((int)order < 0) return 0; @@ -3547,8 +3547,8 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) /* * Determine the number of objects per slab */ - s->oo = oo_make(order, size, s->reserved); - s->min = oo_make(get_order(size), size, s->reserved); + s->oo = oo_make(order, size); + s->min = oo_make(get_order(size), size); if (oo_objects(s->oo) > oo_objects(s->max)) s->max = s->oo; @@ -3558,7 +3558,6 @@ static int calculate_sizes(struct kmem_cache *s, int forced_order) static int kmem_cache_open(struct kmem_cache *s, slab_flags_t flags) { s->flags = kmem_cache_flags(s->size, flags, s->name, s->ctor); - s->reserved = 0; #ifdef CONFIG_SLAB_FREELIST_HARDENED s->random = get_random_long(); #endif @@ -5077,7 +5076,7 @@ SLAB_ATTR_RO(destroy_by_rcu); static ssize_t reserved_show(struct kmem_cache *s, char *buf) { - return sprintf(buf, "%u\n", s->reserved); + return sprintf(buf, "0\n"); } SLAB_ATTR_RO(reserved); From 325d7d4a968669bec18c054db6b5de5f4fc82874 Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:09:14 -0700 Subject: [PATCH 075/118] slub: remove 'reserved' file from sysfs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Christoph doubts anyone was using the 'reserved' file in sysfs, so remove it. Link: http://lkml.kernel.org/r/20180518194519.3820-17-willy@infradead.org Signed-off-by: Matthew Wilcox Acked-by: Christoph Lameter Cc: Dave Hansen Cc: Jérôme Glisse Cc: "Kirill A . 
Shutemov" Cc: Lai Jiangshan Cc: Martin Schwidefsky Cc: Pekka Enberg Cc: Randy Dunlap Cc: Vlastimil Babka Cc: Andrey Ryabinin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/slub.c | 7 ------- 1 file changed, 7 deletions(-) diff --git a/mm/slub.c b/mm/slub.c index f885dcf09750..15505479c3ab 100644 --- a/mm/slub.c +++ b/mm/slub.c @@ -5074,12 +5074,6 @@ static ssize_t destroy_by_rcu_show(struct kmem_cache *s, char *buf) } SLAB_ATTR_RO(destroy_by_rcu); -static ssize_t reserved_show(struct kmem_cache *s, char *buf) -{ - return sprintf(buf, "0\n"); -} -SLAB_ATTR_RO(reserved); - #ifdef CONFIG_SLUB_DEBUG static ssize_t slabs_show(struct kmem_cache *s, char *buf) { @@ -5392,7 +5386,6 @@ static struct attribute *slab_attrs[] = { &reclaim_account_attr.attr, &destroy_by_rcu_attr.attr, &shrink_attr.attr, - &reserved_attr.attr, &slabs_cpu_partial_attr.attr, #ifdef CONFIG_SLUB_DEBUG &total_objects_attr.attr, From 20acce679910fb6d30a89fa02586a3b4a134dfeb Mon Sep 17 00:00:00 2001 From: Souptick Joarder Date: Thu, 7 Jun 2018 17:09:17 -0700 Subject: [PATCH 076/118] mm/shmem.c: use new return type vm_fault_t Use new return type vm_fault_t for fault handler. For now, this is just documenting that the function returns a VM_FAULT value rather than an errno. Once all instances are converted, vm_fault_t will become a distinct type. See commit 1c8f422059ae ("mm: change return type to vm_fault_t") vmf_error() is the newly introduce inline function in 4.17-rc6. Link: http://lkml.kernel.org/r/20180521202410.GA17912@jordon-HP-15-Notebook-PC Signed-off-by: Souptick Joarder Reviewed-by: Matthew Wilcox Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- mm/shmem.c | 12 ++++++------ 1 file changed, 6 insertions(+), 6 deletions(-) diff --git a/mm/shmem.c b/mm/shmem.c index 9a1b553e30e0..2b686e3f53ad 100644 --- a/mm/shmem.c +++ b/mm/shmem.c @@ -1947,14 +1947,14 @@ static int synchronous_wake_function(wait_queue_entry_t *wait, unsigned mode, in return ret; } -static int shmem_fault(struct vm_fault *vmf) +static vm_fault_t shmem_fault(struct vm_fault *vmf) { struct vm_area_struct *vma = vmf->vma; struct inode *inode = file_inode(vma->vm_file); gfp_t gfp = mapping_gfp_mask(inode->i_mapping); enum sgp_type sgp; - int error; - int ret = VM_FAULT_LOCKED; + int err; + vm_fault_t ret = VM_FAULT_LOCKED; /* * Trinity finds that probing a hole which tmpfs is punching can @@ -2022,10 +2022,10 @@ static int shmem_fault(struct vm_fault *vmf) else if (vma->vm_flags & VM_HUGEPAGE) sgp = SGP_HUGE; - error = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp, + err = shmem_getpage_gfp(inode, vmf->pgoff, &vmf->page, sgp, gfp, vma, vmf, &ret); - if (error) - return ((error == -ENOMEM) ? VM_FAULT_OOM : VM_FAULT_SIGBUS); + if (err) + return vmf_error(err); return ret; } From be09102b4190561b67e3809b07a7fd29c9774152 Mon Sep 17 00:00:00 2001 From: Tejun Heo Date: Thu, 7 Jun 2018 17:09:21 -0700 Subject: [PATCH 077/118] mm: memcg: allow lowering memory.swap.max below the current usage Currently an attempt to set swap.max into a value lower than the actual swap usage fails, which causes configuration problems as there's no way of lowering the configuration below the current usage short of turning off swap entirely. This makes swap.max difficult to use and allows delegatees to lock the delegator out of reducing swap allocation. This patch updates swap_max_write() so that the limit can be lowered below the current usage. It doesn't implement active reclaiming of swap entries for the following reasons. 
* mem_cgroup_swap_full() already tells the swap machinery to aggressively reclaim swap entries if the usage is above 50% of the limit, so simply lowering the limit automatically triggers gradual reclaim.
* Forcing back swapped out pages is likely to heavily impact the workload and mess up the working set. Given that swap usually is a lot less valuable and less scarce, letting the existing usage dissipate over time through the above gradual reclaim and as they're faulted back in is likely the better behavior.
Link: http://lkml.kernel.org/r/20180523185041.GR1718769@devbig577.frc2.facebook.com Signed-off-by: Tejun Heo Acked-by: Roman Gushchin Acked-by: Rik van Riel Acked-by: Johannes Weiner Cc: Michal Hocko Cc: Shaohua Li Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds
--- Documentation/admin-guide/cgroup-v2.rst | 5 +++++ mm/memcontrol.c | 6 +----- 2 files changed, 6 insertions(+), 5 deletions(-)
diff --git a/Documentation/admin-guide/cgroup-v2.rst b/Documentation/admin-guide/cgroup-v2.rst index e34d3c938729..8a2c52d5c53b 100644 --- a/Documentation/admin-guide/cgroup-v2.rst +++ b/Documentation/admin-guide/cgroup-v2.rst
@@ -1245,6 +1245,11 @@ PAGE_SIZE multiple when read back. because of running out of swap system-wide or max limit. + When reduced under the current usage, the existing swap + entries are reclaimed gradually and the swap usage may stay + higher than the limit for an extended period of time. This + reduces the impact on the workload and memory management. + Usage Guidelines ~~~~~~~~~~~~~~~~
diff --git a/mm/memcontrol.c b/mm/memcontrol.c index e3d56927a724..c1e64d60ed02 100644 --- a/mm/memcontrol.c +++ b/mm/memcontrol.c
@@ -6280,11 +6280,7 @@ static ssize_t swap_max_write(struct kernfs_open_file *of, if (err) return err; - mutex_lock(&memcg_max_mutex); - err = page_counter_set_max(&memcg->swap, max); - mutex_unlock(&memcg_max_mutex); - if (err) - return err; + xchg(&memcg->swap.max, max); return nbytes; }
From df2cc96e77011cf7989208b206da9817e0321028 Mon Sep 17 00:00:00 2001 From: Mike Rapoport Date: Thu, 7 Jun 2018 17:09:25 -0700 Subject: [PATCH 078/118] userfaultfd: prevent non-cooperative events vs mcopy_atomic races
If a process monitored with userfaultfd changes its memory mappings or forks() at the same time as the uffd monitor fills the process memory with UFFDIO_COPY, the actual creation of page table entries and copying of the data in mcopy_atomic may happen either before or after the memory mapping modifications and there is no way for the uffd monitor to maintain a consistent view of the process memory layout.
For instance, let's consider fork() running in parallel with userfaultfd_copy():

 process                          | uffd monitor
 ---------------------------------+------------------------------
 fork()                           | userfaultfd_copy()
 ...                              | ...
 dup_mmap()                       | down_read(mmap_sem)
 down_write(mmap_sem)             | /* create PTEs, copy data */
 dup_uffd()                       | up_read(mmap_sem)
 copy_page_range()                |
 up_write(mmap_sem)               |
 dup_uffd_complete()              |
 /* notify monitor */             |

If the userfaultfd_copy() takes the mmap_sem first, the new page(s) will be present by the time copy_page_range() is called and they will appear in the child's memory mappings. However, if the fork() is the first to take the mmap_sem, the new pages won't be mapped in the child's address space.
If the pages are not present and the child tries to access them, the monitor will get a page fault notification and everything is fine. However, if the pages *are present*, the child can access them without uffd noticing. And if we copy them into the child it'll see the wrong data.
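(Aside, purely for illustration: with the -EAGAIN behaviour this patch introduces, described next, a monitor-side helper could resynchronize and retry roughly as in the following hypothetical sketch, which is not part of the patch:)

#include <errno.h>
#include <string.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/types.h>
#include <linux/userfaultfd.h>

/* Hypothetical monitor-side helper, not part of this patch. */
static int monitor_copy(int uffd, __u64 dst, __u64 src, __u64 len)
{
	struct uffdio_copy uc;
	struct uffd_msg msg;

	for (;;) {
		memset(&uc, 0, sizeof(uc));
		uc.dst = dst;
		uc.src = src;
		uc.len = len;
		if (ioctl(uffd, UFFDIO_COPY, &uc) == 0)
			return 0;
		if (errno != EAGAIN)
			return -1;
		/*
		 * The copy clashed with fork()/mremap()/munmap():
		 * consume the pending event so the monitor can bring
		 * its view of the address space up to date, then retry.
		 */
		if (read(uffd, &msg, sizeof(msg)) != sizeof(msg))
			return -1;
	}
}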
Since we are talking about background copy, we'd need to decide whether the pages should be copied or not regardless of #PF notifications. Since the userfaultfd monitor has no way to determine what the order was, let's disallow userfaultfd_copy in parallel with the non-cooperative events. In such a case we return -EAGAIN and the uffd monitor can understand that userfaultfd_copy() clashed with a non-cooperative event and take an appropriate action.
Link: http://lkml.kernel.org/r/1527061324-19949-1-git-send-email-rppt@linux.vnet.ibm.com Signed-off-by: Mike Rapoport Acked-by: Pavel Emelyanov Cc: Andrea Arcangeli Cc: Mike Kravetz Cc: Andrei Vagin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds
--- fs/userfaultfd.c | 22 ++++++++++++++++++++-- include/linux/userfaultfd_k.h | 6 ++++-- mm/userfaultfd.c | 22 +++++++++++++++++----- 3 files changed, 41 insertions(+), 9 deletions(-)
diff --git a/fs/userfaultfd.c b/fs/userfaultfd.c index cec550c8468f..123bf7d516fc 100644 --- a/fs/userfaultfd.c +++ b/fs/userfaultfd.c
@@ -62,6 +62,8 @@ struct userfaultfd_ctx { enum userfaultfd_state state; /* released */ bool released; + /* memory mappings are changing because of non-cooperative event */ + bool mmap_changing; /* mm with one ore more vmas attached to this userfaultfd_ctx */ struct mm_struct *mm; };
@@ -641,6 +643,7 @@ static void userfaultfd_event_wait_completion(struct userfaultfd_ctx *ctx, * already released. */ out: + WRITE_ONCE(ctx->mmap_changing, false); userfaultfd_ctx_put(ctx); }
@@ -686,10 +689,12 @@ int dup_userfaultfd(struct vm_area_struct *vma, struct list_head *fcs) ctx->state = UFFD_STATE_RUNNING; ctx->features = octx->features; ctx->released = false; + ctx->mmap_changing = false; ctx->mm = vma->vm_mm; mmgrab(ctx->mm); userfaultfd_ctx_get(octx); + WRITE_ONCE(octx->mmap_changing, true); fctx->orig = octx; fctx->new = ctx; list_add_tail(&fctx->list, fcs);
@@ -732,6 +737,7 @@ void mremap_userfaultfd_prep(struct vm_area_struct *vma, if (ctx && (ctx->features & UFFD_FEATURE_EVENT_REMAP)) { vm_ctx->ctx = ctx; userfaultfd_ctx_get(ctx); + WRITE_ONCE(ctx->mmap_changing, true); } }
@@ -772,6 +778,7 @@ bool userfaultfd_remove(struct vm_area_struct *vma, return true; userfaultfd_ctx_get(ctx); + WRITE_ONCE(ctx->mmap_changing, true); up_read(&mm->mmap_sem); msg_init(&ewq.msg);
@@ -815,6 +822,7 @@ int userfaultfd_unmap_prep(struct vm_area_struct *vma, return -ENOMEM; userfaultfd_ctx_get(ctx); + WRITE_ONCE(ctx->mmap_changing, true); unmap_ctx->ctx = ctx; unmap_ctx->start = start; unmap_ctx->end = end;
@@ -1653,6 +1661,10 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, user_uffdio_copy = (struct uffdio_copy __user *) arg; + ret = -EAGAIN; + if (READ_ONCE(ctx->mmap_changing)) + goto out; + ret = -EFAULT; if (copy_from_user(&uffdio_copy, user_uffdio_copy, /* don't copy "copy" last field */
@@ -1674,7 +1686,7 @@ static int userfaultfd_copy(struct userfaultfd_ctx *ctx, goto out; if (mmget_not_zero(ctx->mm)) { ret = mcopy_atomic(ctx->mm, uffdio_copy.dst, uffdio_copy.src, - uffdio_copy.len); + uffdio_copy.len, &ctx->mmap_changing); mmput(ctx->mm); } else { return -ESRCH; }
@@ -1705,6 +1717,10 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, user_uffdio_zeropage = (struct uffdio_zeropage __user *) arg; + ret = -EAGAIN; + if (READ_ONCE(ctx->mmap_changing)) + goto out; + ret = -EFAULT; if (copy_from_user(&uffdio_zeropage, user_uffdio_zeropage, /* don't copy "zeropage" last field */
@@ -1721,7 +1737,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx, if (mmget_not_zero(ctx->mm))
{ ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start, - uffdio_zeropage.range.len); + uffdio_zeropage.range.len, + &ctx->mmap_changing); mmput(ctx->mm); } else { return -ESRCH; @@ -1900,6 +1917,7 @@ SYSCALL_DEFINE1(userfaultfd, int, flags) ctx->features = 0; ctx->state = UFFD_STATE_WAIT_API; ctx->released = false; + ctx->mmap_changing = false; ctx->mm = current->mm; /* prevent the mm struct to be freed */ mmgrab(ctx->mm); diff --git a/include/linux/userfaultfd_k.h b/include/linux/userfaultfd_k.h index f2f3b68ba910..e091f0a11b11 100644 --- a/include/linux/userfaultfd_k.h +++ b/include/linux/userfaultfd_k.h @@ -31,10 +31,12 @@ extern int handle_userfault(struct vm_fault *vmf, unsigned long reason); extern ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long src_start, unsigned long len); + unsigned long src_start, unsigned long len, + bool *mmap_changing); extern ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long len); + unsigned long len, + bool *mmap_changing); /* mm helpers */ static inline bool is_mergeable_vm_userfaultfd_ctx(struct vm_area_struct *vma, diff --git a/mm/userfaultfd.c b/mm/userfaultfd.c index 39791b81ede7..5029f241908f 100644 --- a/mm/userfaultfd.c +++ b/mm/userfaultfd.c @@ -404,7 +404,8 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, unsigned long src_start, unsigned long len, - bool zeropage) + bool zeropage, + bool *mmap_changing) { struct vm_area_struct *dst_vma; ssize_t err; @@ -430,6 +431,15 @@ static __always_inline ssize_t __mcopy_atomic(struct mm_struct *dst_mm, retry: down_read(&dst_mm->mmap_sem); + /* + * If memory mappings are changing because of non-cooperative + * operation (e.g. mremap) running in parallel, bail out and + * request the user to retry later + */ + err = -EAGAIN; + if (mmap_changing && READ_ONCE(*mmap_changing)) + goto out_unlock; + /* * Make sure the vma is not shared, that the dst range is * both valid and fully within a single existing vma. @@ -563,13 +573,15 @@ out: } ssize_t mcopy_atomic(struct mm_struct *dst_mm, unsigned long dst_start, - unsigned long src_start, unsigned long len) + unsigned long src_start, unsigned long len, + bool *mmap_changing) { - return __mcopy_atomic(dst_mm, dst_start, src_start, len, false); + return __mcopy_atomic(dst_mm, dst_start, src_start, len, false, + mmap_changing); } ssize_t mfill_zeropage(struct mm_struct *dst_mm, unsigned long start, - unsigned long len) + unsigned long len, bool *mmap_changing) { - return __mcopy_atomic(dst_mm, start, 0, len, true); + return __mcopy_atomic(dst_mm, start, 0, len, true, mmap_changing); } From 7810e6781e0fcbca78b91cf65053f895bf59e85f Mon Sep 17 00:00:00 2001 From: Vlastimil Babka Date: Thu, 7 Jun 2018 17:09:29 -0700 Subject: [PATCH 079/118] mm, page_alloc: do not break __GFP_THISNODE by zonelist reset In __alloc_pages_slowpath() we reset zonelist and preferred_zoneref for allocations that can ignore memory policies. The zonelist is obtained from current CPU's node. This is a problem for __GFP_THISNODE allocations that want to allocate on a different node, e.g. because the allocating thread has been migrated to a different CPU. This has been observed to break SLAB in our 4.4-based kernel, because there it relies on __GFP_THISNODE working as intended. 
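Roughly, the contract such callers assume is the following (an illustrative sketch, not code taken from SLAB):

/* Illustration of the caller-side expectation, not SLAB code: */
static struct page *alloc_on_node(int nid, unsigned int order)
{
	struct page *page;

	page = alloc_pages_node(nid, GFP_KERNEL | __GFP_THISNODE, order);
	/* with the buggy zonelist reset in place, this could fire: */
	if (page)
		WARN_ON(page_to_nid(page) != nid);
	return page;
}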
If a slab page is put on the wrong node's list, then further list manipulations may corrupt the list because page_to_nid() is used to determine which node's list_lock should be locked, and thus we may take the wrong lock and race.
The current SLAB implementation seems to be immune by luck thanks to commit 511e3a058812 ("mm/slab: make cache_grow() handle the page allocated on arbitrary node") but there may be others assuming that __GFP_THISNODE works as promised.
We can fix it by simply removing the zonelist reset completely. There is actually no reason to reset it, because memory policies and cpusets don't affect the zonelist choice in the first place. This was different when commit 183f6371aac2 ("mm: ignore mempolicies when using ALLOC_NO_WATERMARK") introduced the code, as mempolicies provided their own restricted zonelists.
We might consider this for 4.17 although I don't know if there's anything currently broken. SLAB is currently not affected, but in kernels older than 4.7 that don't yet have 511e3a058812 ("mm/slab: make cache_grow() handle the page allocated on arbitrary node") it is. That's at least 4.4 LTS. Older ones I'll have to check. So stable backports should be more important, but will have to be reviewed carefully, as the code went through many changes.
BTW I think that also the ac->preferred_zoneref reset is currently useless if we don't also reset ac->nodemask from a mempolicy to NULL first (which we probably should for the OOM victims etc?), but I would leave that for a separate patch.
Link: http://lkml.kernel.org/r/20180525130853.13915-1-vbabka@suse.cz Signed-off-by: Vlastimil Babka Fixes: 183f6371aac2 ("mm: ignore mempolicies when using ALLOC_NO_WATERMARK") Acked-by: Mel Gorman Cc: Michal Hocko Cc: David Rientjes Cc: Joonsoo Kim Cc: Vlastimil Babka Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds
--- mm/page_alloc.c | 1 - 1 file changed, 1 deletion(-)
diff --git a/mm/page_alloc.c b/mm/page_alloc.c index ef1811531999..07b3c23762ad 100644 --- a/mm/page_alloc.c +++ b/mm/page_alloc.c
@@ -4169,7 +4169,6 @@ retry: * orientated. */ if (!(alloc_flags & ALLOC_CPUSET) || reserve_flags) { - ac->zonelist = node_zonelist(numa_node_id(), gfp_mask); ac->preferred_zoneref = first_zones_zonelist(ac->zonelist, ac->high_zoneidx, ac->nodemask); }
From daa280753cefa692607190852a98a9c06ae9ec9a Mon Sep 17 00:00:00 2001 From: "Kirill A. Shutemov" Date: Thu, 7 Jun 2018 17:09:32 -0700 Subject: [PATCH 080/118] mm/shmem.c: zero out unused vma fields in shmem_pseudo_vma_init()
shmem/tmpfs uses a pseudo vma to allocate pages with the correct NUMA policy. The pseudo vma doesn't have vm_page_prot set. We are going to encode the encryption KeyID in vm_page_prot. Having garbage there causes problems. Zero out all unused fields in the pseudo vma.
Link: http://lkml.kernel.org/r/20180531135602.20321-1-kirill.shutemov@linux.intel.com Signed-off-by: Kirill A. Shutemov
Reviewed-by: Andrew Morton Cc: Hugh Dickins Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds
--- mm/shmem.c | 3 +-- 1 file changed, 1 insertion(+), 2 deletions(-)
diff --git a/mm/shmem.c b/mm/shmem.c index 2b686e3f53ad..e9a7ac74823d 100644 --- a/mm/shmem.c +++ b/mm/shmem.c
@@ -1420,10 +1420,9 @@ static void shmem_pseudo_vma_init(struct vm_area_struct *vma, struct shmem_inode_info *info, pgoff_t index) { /* Create a pseudo vma that just contains the policy */ - vma->vm_start = 0; + memset(vma, 0, sizeof(*vma)); /* Bias interleave by inode number to distribute better across nodes */ vma->vm_pgoff = index + info->vfs_inode.i_ino; - vma->vm_ops = NULL; vma->vm_policy = mpol_shared_policy_lookup(&info->policy, index); }
From 4b33b6959581d5093af2badb489d914911d99eaf Mon Sep 17 00:00:00 2001 From: Huaisheng Ye Date: Thu, 7 Jun 2018 17:09:36 -0700 Subject: [PATCH 081/118] include/linux/gfp.h: fix the annotation of GFP_ZONE_TABLE
When bit is equal to 0x4, OPT_ZONE_DMA32 should be obtained from GFP_ZONE_TABLE. OPT_ZONE_DMA32 shall be equal to ZONE_DMA32 or ZONE_NORMAL according to the status of CONFIG_ZONE_DMA32. Similarly, when bit is equal to 0xc, OPT_ZONE_DMA32 should be obtained together with the GFP_MOVABLE allocation policy. So ZONE_DMA32 or ZONE_NORMAL is the possible result value.
Link: http://lkml.kernel.org/r/20180601163403.1032-1-yehs2007@zoho.com Signed-off-by: Huaisheng Ye Reviewed-by: Andrew Morton Cc: Vlastimil Babka Cc: Michal Hocko Cc: Mel Gorman Cc: Kate Stewart Cc: "Levin, Alexander (Sasha Levin)" Cc: Greg Kroah-Hartman Cc: Christoph Hellwig Cc: Matthew Wilcox Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds
--- include/linux/gfp.h | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-)
diff --git a/include/linux/gfp.h b/include/linux/gfp.h index 7f860ea29ec6..a6afcec53795 100644 --- a/include/linux/gfp.h +++ b/include/linux/gfp.h
@@ -343,7 +343,7 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) * 0x1 => DMA or NORMAL * 0x2 => HIGHMEM or NORMAL * 0x3 => BAD (DMA+HIGHMEM) - * 0x4 => DMA32 or DMA or NORMAL + * 0x4 => DMA32 or NORMAL * 0x5 => BAD (DMA+DMA32) * 0x6 => BAD (HIGHMEM+DMA32) * 0x7 => BAD (HIGHMEM+DMA32+DMA)
@@ -351,7 +351,7 @@ static inline bool gfpflags_allow_blocking(const gfp_t gfp_flags) * 0x9 => DMA or NORMAL (MOVABLE+DMA) * 0xa => MOVABLE (Movable is valid only if HIGHMEM is set too) * 0xb => BAD (MOVABLE+HIGHMEM+DMA) - * 0xc => DMA32 (MOVABLE+DMA32) + * 0xc => DMA32 or NORMAL (MOVABLE+DMA32) * 0xd => BAD (MOVABLE+DMA32+DMA) * 0xe => BAD (MOVABLE+DMA32+HIGHMEM) * 0xf => BAD (MOVABLE+DMA32+HIGHMEM+DMA)
From ce91f6ee5b3bbbad8caff61b1c46d845c8db19bf Mon Sep 17 00:00:00 2001 From: Michal Hocko Date: Thu, 7 Jun 2018 17:09:40 -0700 Subject: [PATCH 082/118] mm: kvmalloc does not fallback to vmalloc for incompatible gfp flags
kvmalloc warned about an incompatible gfp_mask to catch abusers (mostly GFP_NOFS) with the intention that this would motivate authors of the code to fix those. Linus argues that this just motivates people to do even more hacks like

	if (gfp == GFP_KERNEL)
		kvmalloc
	else
		kmalloc

I haven't seen this happening much (Linus pointed to bucket_lock, which special-cases an atomic allocation, but my git foo hasn't found much more) but it is true that we can grow those in the future. Therefore Linus suggested simply not falling back to vmalloc for incompatible gfp flags and sticking with the kmalloc path.
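In caller terms the change means the following (an illustrative sketch, not code from this patch):

	void *p;

	/* before: callers dodged the kvmalloc warning by hand */
	if ((flags & GFP_KERNEL) == GFP_KERNEL)
		p = kvmalloc(size, flags);
	else
		p = kmalloc(size, flags);

	/* after this patch the branch is unnecessary: kvmalloc() itself
	 * takes the plain kmalloc path for vmalloc-incompatible flags
	 * such as GFP_NOFS or GFP_ATOMIC */
	p = kvmalloc(size, flags);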
Link: http://lkml.kernel.org/r/20180601115329.27807-1-mhocko@kernel.org Signed-off-by: Michal Hocko Suggested-by: Linus Torvalds Cc: Tom Herbert Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds
--- lib/bucket_locks.c | 5 +---- mm/util.c | 6 ++++-- 2 files changed, 5 insertions(+), 6 deletions(-)
diff --git a/lib/bucket_locks.c b/lib/bucket_locks.c index 266a97c5708b..ade3ce6c4af6 100644 --- a/lib/bucket_locks.c +++ b/lib/bucket_locks.c
@@ -30,10 +30,7 @@ int alloc_bucket_spinlocks(spinlock_t **locks, unsigned int *locks_mask, } if (sizeof(spinlock_t) != 0) { - if (gfpflags_allow_blocking(gfp)) - tlocks = kvmalloc(size * sizeof(spinlock_t), gfp); - else - tlocks = kmalloc_array(size, sizeof(spinlock_t), gfp); + tlocks = kvmalloc_array(size, sizeof(spinlock_t), gfp); if (!tlocks) return -ENOMEM; for (i = 0; i < size; i++)
diff --git a/mm/util.c b/mm/util.c index c2d0a7cdb189..3351659200e6 100644 --- a/mm/util.c +++ b/mm/util.c
@@ -391,7 +391,8 @@ EXPORT_SYMBOL(vm_mmap); * __GFP_RETRY_MAYFAIL is supported, and it should be used only if kmalloc is * preferable to the vmalloc fallback, due to visible performance drawbacks. * - * Any use of gfp flags outside of GFP_KERNEL should be consulted with mm people. + * Please note that any use of gfp flags outside of GFP_KERNEL is careful to not + * fall back to vmalloc. */ void *kvmalloc_node(size_t size, gfp_t flags, int node) {
@@ -402,7 +403,8 @@ void *kvmalloc_node(size_t size, gfp_t flags, int node) * vmalloc uses GFP_KERNEL for some internal allocations (e.g page tables) * so the given set of flags has to be compatible. */ - WARN_ON_ONCE((flags & GFP_KERNEL) != GFP_KERNEL); + if ((flags & GFP_KERNEL) != GFP_KERNEL) + return kmalloc_node(size, flags, node); /* * We want to attempt a large physically contiguous block first because
From e81bf9793b1861d74953ef041b4f6c7faecc2dbd Mon Sep 17 00:00:00 2001 From: Aaron Lu Date: Thu, 7 Jun 2018 17:09:44 -0700 Subject: [PATCH 083/118] mem_cgroup: make sure moving_account, move_lock_task and stat_cpu in the same cacheline
The LKP robot found a 27% will-it-scale/page_fault3 performance regression regarding commit e27be240df53 ("mm: memcg: make sure memory.events is uptodate when waking pollers").
What the test does is:
1. mkstemp() a 128M file on a tmpfs;
2. start $nr_cpu processes, each to loop the following:
   2.1 mmap() this file in shared write mode;
   2.2 write 0 to this file in a PAGE_SIZE step till the end of the file;
   2.3 unmap() this file and repeat this process.
3. After 5 minutes, check how many loops they managed to complete; the higher the better.
The commit itself looks innocent enough as it merely changed some event counting mechanism and this test didn't trigger those events at all. Perf shows increased cycles spent on accessing root_mem_cgroup->stat_cpu in count_memcg_event_mm() (called by handle_mm_fault()) and in __mod_memcg_state() called by page_add_file_rmap(). So it's likely due to the changed layout of 'struct mem_cgroup': either stat_cpu now falls into a constantly modified cacheline, or some hot fields no longer share a cacheline.
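(For background: a zero-size member with cacheline alignment is the standard kernel trick for controlling such layout, and the MEMCG_PADDING construct added below applies it. In miniature, and purely as an illustration with made-up field names:)

	/* Illustration only, reusing the macro added below: the padding
	 * member is an empty struct aligned to a cacheline boundary, so
	 * everything after it starts in a fresh cacheline and writers of
	 * 'hot' do not bounce readers of 'warm'. */
	struct example {
		atomic_t hot;		/* written on every fault */
		MEMCG_PADDING(_pad_);
		unsigned long warm;	/* read-mostly fast-path data */
	};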
I verified this by moving memory_events[] back to where it was:

: --- a/include/linux/memcontrol.h
: +++ b/include/linux/memcontrol.h
: @@ -205,7 +205,6 @@ struct mem_cgroup {
:  	int oom_kill_disable;
:
:  	/* memory.events */
: -	atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
:  	struct cgroup_file events_file;
:
:  	/* protect arrays of thresholds */
: @@ -238,6 +237,7 @@ struct mem_cgroup {
:  	struct mem_cgroup_stat_cpu __percpu *stat_cpu;
:  	atomic_long_t stat[MEMCG_NR_STAT];
:  	atomic_long_t events[NR_VM_EVENT_ITEMS];
: +	atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS];
:
:  	unsigned long socket_pressure;

And performance was restored. Later investigation found that as long as the following three fields (moving_account, move_lock_task and stat_cpu) are in the same cacheline, performance will be good. To avoid future performance surprises from other commits changing the layout of 'struct mem_cgroup', this patch makes sure the three fields stay in the same cacheline.
One concern with this approach is that moving_account and move_lock_task can be modified when a process changes memory cgroup, while stat_cpu is a read-mostly field, so it might hurt to place them in the same cacheline. I assume it is rare for a process to change memory cgroup, so this should be OK.
Link: https://lkml.kernel.org/r/20180528114019.GF9904@yexl-desktop Link: http://lkml.kernel.org/r/20180601071115.GA27302@intel.com Signed-off-by: Aaron Lu Reported-by: kernel test robot Cc: Johannes Weiner Cc: Michal Hocko Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds
--- include/linux/memcontrol.h | 23 +++++++++++++++++++---- 1 file changed, 19 insertions(+), 4 deletions(-)
diff --git a/include/linux/memcontrol.h b/include/linux/memcontrol.h index 9c04cf8e6487..4f52ec755725 100644 --- a/include/linux/memcontrol.h +++ b/include/linux/memcontrol.h
@@ -166,6 +166,15 @@ enum memcg_kmem_state { KMEM_ONLINE, }; +#if defined(CONFIG_SMP) +struct memcg_padding { + char x[0]; +} ____cacheline_internodealigned_in_smp; +#define MEMCG_PADDING(name) struct memcg_padding name; +#else +#define MEMCG_PADDING(name) +#endif + /* * The memory controller data structure. The memory controller controls both * page cache and RSS per cgroup. We would eventually like to provide
@@ -212,7 +221,6 @@ struct mem_cgroup { int oom_kill_disable; /* memory.events */ - atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; struct cgroup_file events_file; /* handle for "memory.swap.events" */
@@ -235,19 +243,26 @@ struct mem_cgroup { * mem_cgroup ? And what type of charges should we move ? */ unsigned long move_charge_at_immigrate; + /* taken only while moving_account > 0 */ + spinlock_t move_lock; + unsigned long move_lock_flags; + + MEMCG_PADDING(_pad1_); + /* * set > 0 if pages under this cgroup are moving to other cgroup.
*/ atomic_t moving_account; - /* taken only while moving_account > 0 */ - spinlock_t move_lock; struct task_struct *move_lock_task; - unsigned long move_lock_flags; /* memory.stat */ struct mem_cgroup_stat_cpu __percpu *stat_cpu; + + MEMCG_PADDING(_pad2_); + atomic_long_t stat[MEMCG_NR_STAT]; atomic_long_t events[NR_VM_EVENT_ITEMS]; + atomic_long_t memory_events[MEMCG_NR_MEMORY_EVENTS]; unsigned long socket_pressure;
From 72eb7de9c1580cf5705d4f592aca21a8438fde22 Mon Sep 17 00:00:00 2001 From: Sahara Date: Thu, 7 Jun 2018 17:09:48 -0700 Subject: [PATCH 084/118] mm: remove page_is_poisoned() from linux/mm.h
When commit bd33ef368135 ("mm: enable page poisoning early at boot") got rid of PAGE_EXT_DEBUG_POISON, page_is_poisoned() was left behind in the header. This patch cleans up the leftovers.
Link: http://lkml.kernel.org/r/1528101069-21637-1-git-send-email-kpark3469@gmail.com Signed-off-by: Sahara Acked-by: Michal Hocko Reviewed-by: Andrew Morton Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds
--- include/linux/mm.h | 2 -- 1 file changed, 2 deletions(-)
diff --git a/include/linux/mm.h b/include/linux/mm.h index 1dd588b7c649..f0f3905f8bb8 100644 --- a/include/linux/mm.h +++ b/include/linux/mm.h
@@ -2532,12 +2532,10 @@ extern int apply_to_page_range(struct mm_struct *mm, unsigned long address, #ifdef CONFIG_PAGE_POISONING extern bool page_poisoning_enabled(void); extern void kernel_poison_pages(struct page *page, int numpages, int enable); -extern bool page_is_poisoned(struct page *page); #else static inline bool page_poisoning_enabled(void) { return false; } static inline void kernel_poison_pages(struct page *page, int numpages, int enable) { } -static inline bool page_is_poisoned(struct page *page) { return false; } #endif #ifdef CONFIG_DEBUG_PAGEALLOC
From b42262af5ecfc64f92423dc1e5ef634d9195f4b0 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 7 Jun 2018 17:09:52 -0700 Subject: [PATCH 085/118] proc: more "unsigned int" in /proc/*/cmdline
access_remote_vm() doesn't return negative errors, it returns the number of bytes read/written (0 if an error occurs). This allows deleting some comparisons that never trigger. Reuse the "nr_read" variable while I'm at it.
Link: http://lkml.kernel.org/r/20180221192605.GB28548@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Cc: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds
--- fs/proc/base.c | 27 +++++++++++---------------- 1 file changed, 11 insertions(+), 16 deletions(-)
diff --git a/fs/proc/base.c b/fs/proc/base.c index ef3f7df50023..159abd09b411 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c
@@ -261,9 +261,10 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, * Inherently racy -- command line shares address space * with code and data.
*/ - rv = access_remote_vm(mm, arg_end - 1, &c, 1, FOLL_ANON); - if (rv <= 0) + if (access_remote_vm(mm, arg_end - 1, &c, 1, FOLL_ANON) != 1) { + rv = 0; goto out_free_page; + } rv = 0;
@@ -275,14 +276,11 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, p = arg_start + *pos; len = len1 - *pos; while (count > 0 && len > 0) { - unsigned int _count; - int nr_read; + unsigned int nr_read; - _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, FOLL_ANON); - if (nr_read < 0) - rv = nr_read; - if (nr_read <= 0) + nr_read = min3(count, len, PAGE_SIZE); + nr_read = access_remote_vm(mm, p, page, nr_read, FOLL_ANON); + if (nr_read == 0) goto out_free_page; if (copy_to_user(buf, page, nr_read)) {
@@ -320,15 +318,12 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, p = cmdline[i].p + pos1; len = cmdline[i].len - pos1; while (count > 0 && len > 0) { - unsigned int _count, l; - int nr_read; + unsigned int nr_read, l; bool final; - _count = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, _count, FOLL_ANON); - if (nr_read < 0) - rv = nr_read; - if (nr_read <= 0) + nr_read = min3(count, len, PAGE_SIZE); + nr_read = access_remote_vm(mm, p, page, nr_read, FOLL_ANON); + if (nr_read == 0) goto out_free_page; /*
From 6a6b9c4c11f8da5156b72e868baf0c6c392a5014 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 7 Jun 2018 17:09:55 -0700 Subject: [PATCH 086/118] proc: somewhat simpler code for /proc/*/cmdline
The "final" variable is OK but we can get away with fewer lines.
Link: http://lkml.kernel.org/r/20180221192751.GC28548@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Cc: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds
--- fs/proc/base.c | 24 +++++++++--------------- 1 file changed, 9 insertions(+), 15 deletions(-)
diff --git a/fs/proc/base.c b/fs/proc/base.c index 159abd09b411..4b27e26fc6de 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c
@@ -318,8 +318,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, p = cmdline[i].p + pos1; len = cmdline[i].len - pos1; while (count > 0 && len > 0) { - unsigned int nr_read, l; - bool final; + unsigned int nr_read, nr_write; nr_read = min3(count, len, PAGE_SIZE); nr_read = access_remote_vm(mm, p, page, nr_read, FOLL_ANON);
@@ -330,25 +329,20 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, * Command line can be shorter than whole ARGV * even if last "marker" byte says it is not. */ - final = false; - l = strnlen(page, nr_read); - if (l < nr_read) { - nr_read = l; - final = true; - } + nr_write = strnlen(page, nr_read); - if (copy_to_user(buf, page, nr_read)) { + if (copy_to_user(buf, page, nr_write)) { rv = -EFAULT; goto out_free_page; } - p += nr_read; - len -= nr_read; - buf += nr_read; - count -= nr_read; - rv += nr_read; + p += nr_write; + len -= nr_write; + buf += nr_write; + count -= nr_write; + rv += nr_write; - if (final) + if (nr_write < nr_read) goto out_free_page; }
From 6a6cbe75dbdfaed6c4bbc2f109d69808f9cfa421 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 7 Jun 2018 17:09:59 -0700 Subject: [PATCH 087/118] proc: simpler iterations for /proc/*/cmdline
The "rv" variable is used both as a counter of bytes transferred and an error value holder, but it can be reduced solely to error values if the original start of the userspace buffer is stashed and used at the very end.
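Condensed from the diff below, the resulting shape of the function is (a sketch of the idiom, not the full code):

	char __user *buf0 = buf;	/* stash the original start */
	int rv;
	...
	/* the transfer loops only advance the cursor: */
	buf += nr_write;
	count -= nr_write;
	...
end:
	*pos += buf - buf0;
	rv = buf - buf0;	/* bytes copied; error paths set rv explicitly */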
[akpm@linux-foundation.org: simplify cleanup code] Link: http://lkml.kernel.org/r/20180221193009.GA28678@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Cc: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 33 ++++++++++++++------------------- 1 file changed, 14 insertions(+), 19 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 4b27e26fc6de..3fcca77d45f1 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -215,8 +215,9 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, unsigned long arg_start, arg_end, env_start, env_end; unsigned long len1, len2, len; unsigned long p; + char __user *buf0 = buf; char c; - ssize_t rv; + int rv; BUG_ON(*pos < 0); @@ -253,25 +254,20 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, len2 = env_end - env_start; /* Empty ARGV. */ - if (len1 == 0) { - rv = 0; - goto out_free_page; - } + if (len1 == 0) + goto end; + /* * Inherently racy -- command line shares address space * with code and data. */ - if (access_remote_vm(mm, arg_end - 1, &c, 1, FOLL_ANON) != 1) { - rv = 0; - goto out_free_page; - } - - rv = 0; + if (access_remote_vm(mm, arg_end - 1, &c, 1, FOLL_ANON) != 1) + goto end; if (c == '\0') { /* Command line (set of strings) occupies whole ARGV. */ if (len1 <= *pos) - goto out_free_page; + goto end; p = arg_start + *pos; len = len1 - *pos; @@ -281,7 +277,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, nr_read = min3(count, len, PAGE_SIZE); nr_read = access_remote_vm(mm, p, page, nr_read, FOLL_ANON); if (nr_read == 0) - goto out_free_page; + goto end; if (copy_to_user(buf, page, nr_read)) { rv = -EFAULT; @@ -292,7 +288,6 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, len -= nr_read; buf += nr_read; count -= nr_read; - rv += nr_read; } } else { /* @@ -323,7 +318,7 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, nr_read = min3(count, len, PAGE_SIZE); nr_read = access_remote_vm(mm, p, page, nr_read, FOLL_ANON); if (nr_read == 0) - goto out_free_page; + goto end; /* * Command line can be shorter than whole ARGV @@ -340,10 +335,9 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, len -= nr_write; buf += nr_write; count -= nr_write; - rv += nr_write; if (nr_write < nr_read) - goto out_free_page; + goto end; } /* Only first chunk can be read partially. 
*/ @@ -352,12 +346,13 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, } } +end: + *pos += buf - buf0; + rv = buf - buf0; out_free_page: free_page((unsigned long)page); out_mmput: mmput(mm); - if (rv > 0) - *pos += rv; return rv; }
From 3cb4e162e4e258d906c07c411283b42b7f20a285 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 7 Jun 2018 17:10:02 -0700 Subject: [PATCH 088/118] proc: deduplicate /proc/*/cmdline implementation
Code can be consolidated if a dummy region of 0 length is used in the normal case of a \0-separated command line:
1) [arg_start, arg_end) + [dummy len=0]
2) [arg_start, arg_end) + [env_start, env_end)
Link: http://lkml.kernel.org/r/20180221193335.GB28678@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Cc: Andy Shevchenko Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds
--- fs/proc/base.c | 53 +++++++++++++++++++------------------------------- 1 file changed, 20 insertions(+), 33 deletions(-)
diff --git a/fs/proc/base.c b/fs/proc/base.c index 3fcca77d45f1..3954f50805b8 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c
@@ -213,9 +213,12 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, char *page; unsigned long count = _count; unsigned long arg_start, arg_end, env_start, env_end; - unsigned long len1, len2, len; - unsigned long p; + unsigned long len1, len2; char __user *buf0 = buf; + struct { + unsigned long p; + unsigned long len; + } cmdline[2]; char c; int rv;
@@ -264,43 +267,21 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, if (access_remote_vm(mm, arg_end - 1, &c, 1, FOLL_ANON) != 1) goto end; + cmdline[0].p = arg_start; + cmdline[0].len = len1; if (c == '\0') { /* Command line (set of strings) occupies whole ARGV. */ - if (len1 <= *pos) - goto end; - - p = arg_start + *pos; - len = len1 - *pos; - while (count > 0 && len > 0) { - unsigned int nr_read; - - nr_read = min3(count, len, PAGE_SIZE); - nr_read = access_remote_vm(mm, p, page, nr_read, FOLL_ANON); - if (nr_read == 0) - goto end; - - if (copy_to_user(buf, page, nr_read)) { - rv = -EFAULT; - goto out_free_page; - } - - p += nr_read; - len -= nr_read; - buf += nr_read; - count -= nr_read; - } + cmdline[1].len = 0; } else { /* * Command line (1 string) occupies ARGV and * extends into ENVP. */ - struct { - unsigned long p; - unsigned long len; - } cmdline[2] = { - { .p = arg_start, .len = len1 }, - { .p = env_start, .len = len2 }, - }; + cmdline[1].p = env_start; + cmdline[1].len = len2; + } + + { loff_t pos1 = *pos; unsigned int i;
@@ -310,6 +291,9 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, i++; } while (i < 2) { + unsigned long p; + unsigned long len; + p = cmdline[i].p + pos1; len = cmdline[i].len - pos1; while (count > 0 && len > 0) {
@@ -324,7 +308,10 @@ static ssize_t proc_pid_cmdline_read(struct file *file, char __user *buf, * Command line can be shorter than whole ARGV * even if last "marker" byte says it is not. */ - nr_write = strnlen(page, nr_read); + if (c == '\0') + nr_write = nr_read; + else + nr_write = strnlen(page, nr_read); if (copy_to_user(buf, page, nr_write)) { rv = -EFAULT;
From 941169298a3e60db694373acbb49cd6569c8398b Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 7 Jun 2018 17:10:07 -0700 Subject: [PATCH 089/118] proc: smaller RCU section in ->getattr()
struct kstat is thread local.
Link: http://lkml.kernel.org/r/20180423213626.GB9043@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 3954f50805b8..97af285c63e1 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1755,9 +1755,9 @@ int pid_getattr(const struct path *path, struct kstat *stat, generic_fillattr(inode, stat); - rcu_read_lock(); stat->uid = GLOBAL_ROOT_UID; stat->gid = GLOBAL_ROOT_GID; + rcu_read_lock(); task = pid_task(proc_pid(inode), PIDTYPE_PID); if (task) { if (!has_pid_permissions(pid, task, HIDEPID_INVISIBLE)) { From a4ef3895655ce9dcda679dde160b1543104bbe55 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 7 Jun 2018 17:10:10 -0700 Subject: [PATCH 090/118] proc: use "unsigned int" in proc_fill_cache() All those lengths are unsigned as they should be. Link: http://lkml.kernel.org/r/20180423213751.GC9043@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 6 +++--- fs/proc/fd.c | 2 +- fs/proc/internal.h | 2 +- 3 files changed, 5 insertions(+), 5 deletions(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 97af285c63e1..720205aa1e34 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -1846,7 +1846,7 @@ const struct dentry_operations pid_dentry_operations = * by stat. */ bool proc_fill_cache(struct file *file, struct dir_context *ctx, - const char *name, int len, + const char *name, unsigned int len, instantiate_t instantiate, struct task_struct *task, const void *ptr) { struct dentry *child, *dir = file->f_path.dentry; @@ -3222,7 +3222,7 @@ int proc_pid_readdir(struct file *file, struct dir_context *ctx) iter.task; iter.tgid += 1, iter = next_tgid(ns, iter)) { char name[10 + 1]; - int len; + unsigned int len; cond_resched(); if (!has_pid_permissions(ns, iter.task, HIDEPID_INVISIBLE)) @@ -3549,7 +3549,7 @@ static int proc_task_readdir(struct file *file, struct dir_context *ctx) task; task = next_tid(task), ctx->pos++) { char name[10 + 1]; - int len; + unsigned int len; tid = task_pid_nr_ns(task, ns); len = snprintf(name, sizeof(name), "%u", tid); if (!proc_fill_cache(file, ctx, name, len, diff --git a/fs/proc/fd.c b/fs/proc/fd.c index 05b9893e9a22..81882a13212d 100644 --- a/fs/proc/fd.c +++ b/fs/proc/fd.c @@ -248,7 +248,7 @@ static int proc_readfd_common(struct file *file, struct dir_context *ctx, struct file *f; struct fd_data data; char name[10 + 1]; - int len; + unsigned int len; f = fcheck_files(files, fd); if (!f) diff --git a/fs/proc/internal.h b/fs/proc/internal.h index 93eb1906c28d..50cb22a08c2f 100644 --- a/fs/proc/internal.h +++ b/fs/proc/internal.h @@ -163,7 +163,7 @@ extern loff_t mem_lseek(struct file *, loff_t, int); /* Lookups */ typedef struct dentry *instantiate_t(struct dentry *, struct task_struct *, const void *); -extern bool proc_fill_cache(struct file *, struct dir_context *, const char *, int, +bool proc_fill_cache(struct file *, struct dir_context *, const char *, unsigned int, instantiate_t, struct task_struct *, const void *); /* From 197850a1e04ab586953a4434a6c4c6575b7d2122 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 7 Jun 2018 17:10:13 -0700 Subject: [PATCH 091/118] proc: use "unsigned int" for sigqueue length It's defined as atomic_t and really long signal queues are unheard of. 
Link: http://lkml.kernel.org/r/20180423215119.GF9043@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/array.c | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/fs/proc/array.c b/fs/proc/array.c index 004077f1a7bf..0ceb3b6b37e7 100644 --- a/fs/proc/array.c +++ b/fs/proc/array.c @@ -268,7 +268,7 @@ static inline void task_sig(struct seq_file *m, struct task_struct *p) unsigned long flags; sigset_t pending, shpending, blocked, ignored, caught; int num_threads = 0; - unsigned long qsize = 0; + unsigned int qsize = 0; unsigned long qlim = 0; sigemptyset(&pending); From 5d008fb414f7c73e0761835d941943e17d32463d Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 7 Jun 2018 17:10:17 -0700 Subject: [PATCH 092/118] proc: use "unsigned int" for /proc/*/stack struct stack_trace::nr_entries is defined as "unsigned int" (YAY!) so the iterator should be unsigned as well. It saves 1 byte of code or something like that. Link: http://lkml.kernel.org/r/20180423215248.GG9043@avx2 Signed-off-by: Alexey Dobriyan Reviewed-by: Andrew Morton Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/proc/base.c | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/fs/proc/base.c b/fs/proc/base.c index 720205aa1e34..44dec22e5e9e 100644 --- a/fs/proc/base.c +++ b/fs/proc/base.c @@ -401,7 +401,6 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, struct stack_trace trace; unsigned long *entries; int err; - int i; entries = kmalloc(MAX_STACK_TRACE_DEPTH * sizeof(*entries), GFP_KERNEL); if (!entries) @@ -414,6 +413,8 @@ static int proc_pid_stack(struct seq_file *m, struct pid_namespace *ns, err = lock_trace(task); if (!err) { + unsigned int i; + save_stack_trace_tsk(task, &trace); for (i = 0; i < trace.nr_entries; i++) { From b2f5de0334f03e90ae7dee72a7fc597ef555a9a8 Mon Sep 17 00:00:00 2001 From: Alexey Dobriyan Date: Thu, 7 Jun 2018 17:10:20 -0700 Subject: [PATCH 093/118] tools/testing/selftests/proc: test /proc/*/fd a bit (+ PF_KTHREAD is ABI!) * Test lookup in /proc/self/fd. "map_files" lookup story showed that lookup is not that simple. * Test that all those symlinks open the same file. Check with (st_dev, st_ino). * Test that kernel threads do not have anything in their /proc/*/fd/ directory. Now this is where things get interesting. First, kernel threads aren't pinned by /proc/self or equivalent, thus some "atomicity" is required. Second, ->comm can contain whitespace and ')'. No, they are not escaped. Third, the only reliable way to check if a process is a kernel thread appears to be field #9 in /proc/*/stat. This field is struct task_struct::flags in decimal! The check is done by testing the PF_KTHREAD flag like we do in the kernel. The PF_KTHREAD value is a part of the userspace ABI !!! Other methods for determining kernel threadness are not reliable: * RSS can be 0 if everything is swapped, even while reading from /proc/self. * ->total_vm CAN BE ZERO if a process is finishing munmap(NULL, whole address space); * /proc/*/maps and similar files can be empty because unmapping everything works. Read returning 0 can't distinguish between a kernel thread and such a suicide process.
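To make the parsing rule concrete, here is an illustrative userspace helper (editor's sketch, not part of the patch; the helper name and the forward scan from the last ')' are assumptions, and it relies on no field after comm containing whitespace):

	#include <stdlib.h>
	#include <string.h>

	#define PF_KTHREAD 0x00200000	/* from include/linux/sched.h */

	/* Extract field #9 (task_struct::flags) from a /proc/$PID/stat line. */
	static int stat_flags(const char *line, unsigned long *flags)
	{
		/* comm may contain ' ' and ')', so skip past the LAST ')' */
		const char *p = strrchr(line, ')');
		int field = 2;		/* the ')' terminates field 2, comm */

		if (!p)
			return -1;
		while (*++p) {
			if (*p == ' ' && ++field == 9) {
				*flags = strtoul(p + 1, NULL, 10);
				return 0;	/* caller tests *flags & PF_KTHREAD */
			}
		}
		return -1;
	}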
Link: http://lkml.kernel.org/r/20180505000414.GA15090@avx2 Signed-off-by: Alexey Dobriyan Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- tools/testing/selftests/proc/.gitignore | 3 + tools/testing/selftests/proc/Makefile | 5 +- tools/testing/selftests/proc/fd-001-lookup.c | 168 +++++++++++++++++ .../testing/selftests/proc/fd-002-posix-eq.c | 57 ++++++ tools/testing/selftests/proc/fd-003-kthread.c | 178 ++++++++++++++++++ tools/testing/selftests/proc/proc-uptime.h | 16 +- tools/testing/selftests/proc/proc.h | 39 ++++ tools/testing/selftests/proc/read.c | 17 +- 8 files changed, 451 insertions(+), 32 deletions(-) create mode 100644 tools/testing/selftests/proc/fd-001-lookup.c create mode 100644 tools/testing/selftests/proc/fd-002-posix-eq.c create mode 100644 tools/testing/selftests/proc/fd-003-kthread.c create mode 100644 tools/testing/selftests/proc/proc.h diff --git a/tools/testing/selftests/proc/.gitignore b/tools/testing/selftests/proc/.gitignore index 6c16f77c722c..74e5912e9f2e 100644 --- a/tools/testing/selftests/proc/.gitignore +++ b/tools/testing/selftests/proc/.gitignore @@ -1,3 +1,6 @@ +/fd-001-lookup +/fd-002-posix-eq +/fd-003-kthread /proc-loadavg-001 /proc-self-map-files-001 /proc-self-map-files-002 diff --git a/tools/testing/selftests/proc/Makefile b/tools/testing/selftests/proc/Makefile index dbb87e56264c..db310eedc268 100644 --- a/tools/testing/selftests/proc/Makefile +++ b/tools/testing/selftests/proc/Makefile @@ -1,6 +1,9 @@ -CFLAGS += -Wall -O2 +CFLAGS += -Wall -O2 -Wno-unused-function TEST_GEN_PROGS := +TEST_GEN_PROGS += fd-001-lookup +TEST_GEN_PROGS += fd-002-posix-eq +TEST_GEN_PROGS += fd-003-kthread TEST_GEN_PROGS += proc-loadavg-001 TEST_GEN_PROGS += proc-self-map-files-001 TEST_GEN_PROGS += proc-self-map-files-002 diff --git a/tools/testing/selftests/proc/fd-001-lookup.c b/tools/testing/selftests/proc/fd-001-lookup.c new file mode 100644 index 000000000000..a2010dfb2110 --- /dev/null +++ b/tools/testing/selftests/proc/fd-001-lookup.c @@ -0,0 +1,168 @@ +/* + * Copyright © 2018 Alexey Dobriyan + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +// Test /proc/*/fd lookup. +#define _GNU_SOURCE +#undef NDEBUG +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "proc.h" + +/* lstat(2) has more "coverage" in case non-symlink pops up somehow. 
*/ +static void test_lookup_pass(const char *pathname) +{ + struct stat st; + ssize_t rv; + + memset(&st, 0, sizeof(struct stat)); + rv = lstat(pathname, &st); + assert(rv == 0); + assert(S_ISLNK(st.st_mode)); +} + +static void test_lookup_fail(const char *pathname) +{ + struct stat st; + ssize_t rv; + + rv = lstat(pathname, &st); + assert(rv == -1 && errno == ENOENT); +} + +static void test_lookup(unsigned int fd) +{ + char buf[64]; + unsigned int c; + unsigned int u; + int i; + + snprintf(buf, sizeof(buf), "/proc/self/fd/%u", fd); + test_lookup_pass(buf); + + /* leading junk */ + for (c = 1; c <= 255; c++) { + if (c == '/') + continue; + snprintf(buf, sizeof(buf), "/proc/self/fd/%c%u", c, fd); + test_lookup_fail(buf); + } + + /* trailing junk */ + for (c = 1; c <= 255; c++) { + if (c == '/') + continue; + snprintf(buf, sizeof(buf), "/proc/self/fd/%u%c", fd, c); + test_lookup_fail(buf); + } + + for (i = INT_MIN; i < INT_MIN + 1024; i++) { + snprintf(buf, sizeof(buf), "/proc/self/fd/%d", i); + test_lookup_fail(buf); + } + for (i = -1024; i < 0; i++) { + snprintf(buf, sizeof(buf), "/proc/self/fd/%d", i); + test_lookup_fail(buf); + } + for (u = INT_MAX - 1024; u <= (unsigned int)INT_MAX + 1024; u++) { + snprintf(buf, sizeof(buf), "/proc/self/fd/%u", u); + test_lookup_fail(buf); + } + for (u = UINT_MAX - 1024; u != 0; u++) { + snprintf(buf, sizeof(buf), "/proc/self/fd/%u", u); + test_lookup_fail(buf); + } + + +} + +int main(void) +{ + struct dirent *de; + unsigned int fd, target_fd; + + if (unshare(CLONE_FILES) == -1) + return 1; + + /* Wipe fdtable. */ + do { + DIR *d; + + d = opendir("/proc/self/fd"); + if (!d) + return 1; + + de = xreaddir(d); + assert(de->d_type == DT_DIR); + assert(streq(de->d_name, ".")); + + de = xreaddir(d); + assert(de->d_type == DT_DIR); + assert(streq(de->d_name, "..")); +next: + de = xreaddir(d); + if (de) { + unsigned long long fd_ull; + unsigned int fd; + char *end; + + assert(de->d_type == DT_LNK); + + fd_ull = xstrtoull(de->d_name, &end); + assert(*end == '\0'); + assert(fd_ull == (unsigned int)fd_ull); + + fd = fd_ull; + if (fd == dirfd(d)) + goto next; + close(fd); + } + + closedir(d); + } while (de); + + /* Now fdtable is clean. */ + + fd = open("/", O_PATH|O_DIRECTORY); + assert(fd == 0); + test_lookup(fd); + close(fd); + + /* Clean again! */ + + fd = open("/", O_PATH|O_DIRECTORY); + assert(fd == 0); + /* Default RLIMIT_NOFILE-1 */ + target_fd = 1023; + while (target_fd > 0) { + if (dup2(fd, target_fd) == target_fd) + break; + target_fd /= 2; + } + assert(target_fd > 0); + close(fd); + test_lookup(target_fd); + close(target_fd); + + return 0; +} diff --git a/tools/testing/selftests/proc/fd-002-posix-eq.c b/tools/testing/selftests/proc/fd-002-posix-eq.c new file mode 100644 index 000000000000..417322ca9c53 --- /dev/null +++ b/tools/testing/selftests/proc/fd-002-posix-eq.c @@ -0,0 +1,57 @@ +/* + * Copyright © 2018 Alexey Dobriyan + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. 
IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +// Test that open(/proc/*/fd/*) opens the same file. +#undef NDEBUG +#include +#include +#include +#include +#include +#include + +int main(void) +{ + int fd0, fd1, fd2; + struct stat st0, st1, st2; + char buf[64]; + int rv; + + fd0 = open("/", O_DIRECTORY|O_RDONLY); + assert(fd0 >= 0); + + snprintf(buf, sizeof(buf), "/proc/self/fd/%u", fd0); + fd1 = open(buf, O_RDONLY); + assert(fd1 >= 0); + + snprintf(buf, sizeof(buf), "/proc/thread-self/fd/%u", fd0); + fd2 = open(buf, O_RDONLY); + assert(fd2 >= 0); + + rv = fstat(fd0, &st0); + assert(rv == 0); + rv = fstat(fd1, &st1); + assert(rv == 0); + rv = fstat(fd2, &st2); + assert(rv == 0); + + assert(st0.st_dev == st1.st_dev); + assert(st0.st_ino == st1.st_ino); + + assert(st0.st_dev == st2.st_dev); + assert(st0.st_ino == st2.st_ino); + + return 0; +} diff --git a/tools/testing/selftests/proc/fd-003-kthread.c b/tools/testing/selftests/proc/fd-003-kthread.c new file mode 100644 index 000000000000..1d659d55368c --- /dev/null +++ b/tools/testing/selftests/proc/fd-003-kthread.c @@ -0,0 +1,178 @@ +/* + * Copyright © 2018 Alexey Dobriyan + * + * Permission to use, copy, modify, and distribute this software for any + * purpose with or without fee is hereby granted, provided that the above + * copyright notice and this permission notice appear in all copies. + * + * THE SOFTWARE IS PROVIDED "AS IS" AND THE AUTHOR DISCLAIMS ALL WARRANTIES + * WITH REGARD TO THIS SOFTWARE INCLUDING ALL IMPLIED WARRANTIES OF + * MERCHANTABILITY AND FITNESS. IN NO EVENT SHALL THE AUTHOR BE LIABLE FOR + * ANY SPECIAL, DIRECT, INDIRECT, OR CONSEQUENTIAL DAMAGES OR ANY DAMAGES + * WHATSOEVER RESULTING FROM LOSS OF USE, DATA OR PROFITS, WHETHER IN AN + * ACTION OF CONTRACT, NEGLIGENCE OR OTHER TORTIOUS ACTION, ARISING OUT OF + * OR IN CONNECTION WITH THE USE OR PERFORMANCE OF THIS SOFTWARE. + */ +// Test that /proc/$KERNEL_THREAD/fd/ is empty. +#define _GNU_SOURCE +#undef NDEBUG +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "proc.h" + +#define PF_KHTREAD 0x00200000 + +/* + * Test for kernel threadness atomically with openat(). + * + * Return /proc/$PID/fd descriptor if process is kernel thread. + * Return -1 if a process is userspace process. + */ +static int kernel_thread_fd(unsigned int pid) +{ + unsigned int flags = 0; + char buf[4096]; + int dir_fd, fd; + ssize_t rv; + + snprintf(buf, sizeof(buf), "/proc/%u", pid); + dir_fd = open(buf, O_RDONLY|O_DIRECTORY); + if (dir_fd == -1) + return -1; + + /* + * Believe it or not, struct task_struct::flags is directly exposed + * to userspace! + */ + fd = openat(dir_fd, "stat", O_RDONLY); + if (fd == -1) { + close(dir_fd); + return -1; + } + rv = read(fd, buf, sizeof(buf)); + close(fd); + if (0 < rv && rv <= sizeof(buf)) { + unsigned long long flags_ull; + char *p, *end; + int i; + + assert(buf[rv - 1] == '\n'); + buf[rv - 1] = '\0'; + + /* Search backwards: ->comm can contain whitespace and ')'. 
*/ + for (i = 0; i < 43; i++) { + p = strrchr(buf, ' '); + assert(p); + *p = '\0'; + } + + p = strrchr(buf, ' '); + assert(p); + + flags_ull = xstrtoull(p + 1, &end); + assert(*end == '\0'); + assert(flags_ull == (unsigned int)flags_ull); + + flags = flags_ull; + } + + fd = -1; + if (flags & PF_KHTREAD) { + fd = openat(dir_fd, "fd", O_RDONLY|O_DIRECTORY); + } + close(dir_fd); + return fd; +} + +static void test_readdir(int fd) +{ + DIR *d; + struct dirent *de; + + d = fdopendir(fd); + assert(d); + + de = xreaddir(d); + assert(streq(de->d_name, ".")); + assert(de->d_type == DT_DIR); + + de = xreaddir(d); + assert(streq(de->d_name, "..")); + assert(de->d_type == DT_DIR); + + de = xreaddir(d); + assert(!de); +} + +static inline int sys_statx(int dirfd, const char *pathname, int flags, + unsigned int mask, void *stx) +{ + return syscall(SYS_statx, dirfd, pathname, flags, mask, stx); +} + +static void test_lookup_fail(int fd, const char *pathname) +{ + char stx[256] __attribute__((aligned(8))); + int rv; + + rv = sys_statx(fd, pathname, AT_SYMLINK_NOFOLLOW, 0, (void *)stx); + assert(rv == -1 && errno == ENOENT); +} + +static void test_lookup(int fd) +{ + char buf[64]; + unsigned int u; + int i; + + for (i = INT_MIN; i < INT_MIN + 1024; i++) { + snprintf(buf, sizeof(buf), "%d", i); + test_lookup_fail(fd, buf); + } + for (i = -1024; i < 1024; i++) { + snprintf(buf, sizeof(buf), "%d", i); + test_lookup_fail(fd, buf); + } + for (u = INT_MAX - 1024; u < (unsigned int)INT_MAX + 1024; u++) { + snprintf(buf, sizeof(buf), "%u", u); + test_lookup_fail(fd, buf); + } + for (u = UINT_MAX - 1024; u != 0; u++) { + snprintf(buf, sizeof(buf), "%u", u); + test_lookup_fail(fd, buf); + } +} + +int main(void) +{ + unsigned int pid; + int fd; + + /* + * In theory this will loop indefinitely if kernel threads are exiled + * from /proc. + * + * Start with kthreadd. + */ + pid = 2; + while ((fd = kernel_thread_fd(pid)) == -1 && pid < 1024) { + pid++; + } + /* EACCES if run as non-root. 
*/ + if (pid >= 1024) + return 1; + + test_readdir(fd); + test_lookup(fd); + + return 0; +} diff --git a/tools/testing/selftests/proc/proc-uptime.h b/tools/testing/selftests/proc/proc-uptime.h index 0e464b50e9d9..dc6a42b1d6b0 100644 --- a/tools/testing/selftests/proc/proc-uptime.h +++ b/tools/testing/selftests/proc/proc-uptime.h @@ -20,21 +20,7 @@ #include #include -static unsigned long long xstrtoull(const char *p, char **end) -{ - if (*p == '0') { - *end = (char *)p + 1; - return 0; - } else if ('1' <= *p && *p <= '9') { - unsigned long long val; - - errno = 0; - val = strtoull(p, end, 10); - assert(errno == 0); - return val; - } else - assert(0); -} +#include "proc.h" static void proc_uptime(int fd, uint64_t *uptime, uint64_t *idle) { diff --git a/tools/testing/selftests/proc/proc.h b/tools/testing/selftests/proc/proc.h new file mode 100644 index 000000000000..4e178166fd84 --- /dev/null +++ b/tools/testing/selftests/proc/proc.h @@ -0,0 +1,39 @@ +#pragma once +#undef NDEBUG +#include +#include +#include +#include +#include +#include + +static inline bool streq(const char *s1, const char *s2) +{ + return strcmp(s1, s2) == 0; +} + +static unsigned long long xstrtoull(const char *p, char **end) +{ + if (*p == '0') { + *end = (char *)p + 1; + return 0; + } else if ('1' <= *p && *p <= '9') { + unsigned long long val; + + errno = 0; + val = strtoull(p, end, 10); + assert(errno == 0); + return val; + } else + assert(0); +} + +static struct dirent *xreaddir(DIR *d) +{ + struct dirent *de; + + errno = 0; + de = readdir(d); + assert(de || errno == 0); + return de; +} diff --git a/tools/testing/selftests/proc/read.c b/tools/testing/selftests/proc/read.c index 1e73c2232097..563e752e6eba 100644 --- a/tools/testing/selftests/proc/read.c +++ b/tools/testing/selftests/proc/read.c @@ -31,22 +31,7 @@ #include #include -static inline bool streq(const char *s1, const char *s2) -{ - return strcmp(s1, s2) == 0; -} - -static struct dirent *xreaddir(DIR *d) -{ - struct dirent *de; - - errno = 0; - de = readdir(d); - if (!de && errno != 0) { - exit(1); - } - return de; -} +#include "proc.h" static void f_reg(DIR *d, const char *filename) { From 6d8e410807829aaaa0c32013d1c8afe91d67c0c5 Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 7 Jun 2018 17:10:24 -0700 Subject: [PATCH 094/118] int-ll64.h: define u{8,16,32,64} and s{8,16,32,64} based on uapi header The uapi version of this header has the same typedefs except that it prefixes them with double-underscore for user space. Use them for the kernel space typedefs.
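A condensed view of the relationship (editor's sketch assuming the usual exported-header layout; only two of the eight types are shown):

	/* include/uapi/asm-generic/int-ll64.h: exported to user space,
	 * so the names carry a double-underscore prefix */
	typedef unsigned int		__u32;
	typedef unsigned long long	__u64;

	/* include/asm-generic/int-ll64.h: kernel only; after this patch
	 * it aliases the uapi names instead of repeating the raw C types */
	typedef __u32 u32;
	typedef __u64 u64;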
Link: http://lkml.kernel.org/r/1526350925-14922-1-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Reviewed-by: Andrew Morton Cc: Geert Uytterhoeven Cc: Alexey Dobriyan Cc: Lihao Liang Cc: Philippe Ombredanne Cc: Pekka Enberg Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/asm-generic/int-ll64.h | 19 ++++++++----------- 1 file changed, 8 insertions(+), 11 deletions(-) diff --git a/include/asm-generic/int-ll64.h b/include/asm-generic/int-ll64.h index ffb68d67be5f..a248545f1e18 100644 --- a/include/asm-generic/int-ll64.h +++ b/include/asm-generic/int-ll64.h @@ -13,17 +13,14 @@ #ifndef __ASSEMBLY__ -typedef signed char s8; -typedef unsigned char u8; - -typedef signed short s16; -typedef unsigned short u16; - -typedef signed int s32; -typedef unsigned int u32; - -typedef signed long long s64; -typedef unsigned long long u64; +typedef __s8 s8; +typedef __u8 u8; +typedef __s16 s16; +typedef __u16 u16; +typedef __s32 s32; +typedef __u32 u32; +typedef __s64 s64; +typedef __u64 u64; #define S8_C(x) x #define U8_C(x) x ## U From 6d0e8d53849fb491549a01893d786bf33d4b28df Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 7 Jun 2018 17:10:27 -0700 Subject: [PATCH 095/118] include/linux/types.h: define aligned_ types based on uapi header The uapi version of this header has the same typedefs except that it prefixes them with double-underscore for user space. Use them for the kernel space typedefs. Link: http://lkml.kernel.org/r/1526350925-14922-2-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Reviewed-by: Andrew Morton Cc: Geert Uytterhoeven Cc: Alexey Dobriyan Cc: Lihao Liang Cc: Philippe Ombredanne Cc: Pekka Enberg Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/types.h | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/include/linux/types.h b/include/linux/types.h index ec13d02b3481..be1589763e0a 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -115,9 +115,9 @@ typedef __s64 int64_t; #endif /* this is a special 64bit data type that is 8-byte aligned */ -#define aligned_u64 __u64 __attribute__((aligned(8))) -#define aligned_be64 __be64 __attribute__((aligned(8))) -#define aligned_le64 __le64 __attribute__((aligned(8))) +#define aligned_u64 __aligned_u64 +#define aligned_be64 __aligned_be64 +#define aligned_le64 __aligned_le64 /** * The type used for indexing onto a disc or disc partition. From b22f22a3c199b3282b05a927c60eb8edbe42d25d Mon Sep 17 00:00:00 2001 From: Masahiro Yamada Date: Thu, 7 Jun 2018 17:10:30 -0700 Subject: [PATCH 096/118] include/linux/types.h: use fixed width types without double-underscore prefix This header file is not exported. It is safe to reference types without double-underscore prefix.
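The rule of thumb behind these three type patches, sketched for illustration: exported headers share the application namespace and must hide behind the reserved double-underscore spelling, while kernel-only headers may claim the short names.

	/* uapi header: visible to applications, so it must not claim
	 * short names like "u32" that user code may define itself */
	typedef unsigned int __u32;

	/* kernel-only header such as include/linux/types.h: free to
	 * use the short form, e.g. after this patch */
	typedef u32 nlink_t;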
Link: http://lkml.kernel.org/r/1526350925-14922-3-git-send-email-yamada.masahiro@socionext.com Signed-off-by: Masahiro Yamada Cc: Geert Uytterhoeven Cc: Alexey Dobriyan Cc: Lihao Liang Cc: Philippe Ombredanne Cc: Pekka Enberg Cc: Greg Kroah-Hartman Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/types.h | 28 ++++++++++++++-------------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/include/linux/types.h b/include/linux/types.h index be1589763e0a..9834e90aa010 100644 --- a/include/linux/types.h +++ b/include/linux/types.h @@ -10,14 +10,14 @@ #define DECLARE_BITMAP(name,bits) \ unsigned long name[BITS_TO_LONGS(bits)] -typedef __u32 __kernel_dev_t; +typedef u32 __kernel_dev_t; typedef __kernel_fd_set fd_set; typedef __kernel_dev_t dev_t; typedef __kernel_ino_t ino_t; typedef __kernel_mode_t mode_t; typedef unsigned short umode_t; -typedef __u32 nlink_t; +typedef u32 nlink_t; typedef __kernel_off_t off_t; typedef __kernel_pid_t pid_t; typedef __kernel_daddr_t daddr_t; @@ -95,23 +95,23 @@ typedef unsigned long ulong; #ifndef __BIT_TYPES_DEFINED__ #define __BIT_TYPES_DEFINED__ -typedef __u8 u_int8_t; -typedef __s8 int8_t; -typedef __u16 u_int16_t; -typedef __s16 int16_t; -typedef __u32 u_int32_t; -typedef __s32 int32_t; +typedef u8 u_int8_t; +typedef s8 int8_t; +typedef u16 u_int16_t; +typedef s16 int16_t; +typedef u32 u_int32_t; +typedef s32 int32_t; #endif /* !(__BIT_TYPES_DEFINED__) */ -typedef __u8 uint8_t; -typedef __u16 uint16_t; -typedef __u32 uint32_t; +typedef u8 uint8_t; +typedef u16 uint16_t; +typedef u32 uint32_t; #if defined(__GNUC__) -typedef __u64 uint64_t; -typedef __u64 u_int64_t; -typedef __s64 int64_t; +typedef u64 uint64_t; +typedef u64 u_int64_t; +typedef s64 int64_t; #endif /* this is a special 64bit data type that is 8-byte aligned */ From 401c636a0eeb0d51862fce222da1bf08e3a0ffd0 Mon Sep 17 00:00:00 2001 From: Tetsuo Handa Date: Thu, 7 Jun 2018 17:10:34 -0700 Subject: [PATCH 097/118] kernel/hung_task.c: show all hung tasks before panic When we get a hung task it can often be valuable to see _all_ the hung tasks on the system before calling panic(). Quoting from https://syzkaller.appspot.com/text?tag=CrashReport&id=5316056503549952 ---------------------------------------- INFO: task syz-executor0:6540 blocked for more than 120 seconds. Not tainted 4.16.0+ #13 "echo 0 > /proc/sys/kernel/hung_task_timeout_secs" disables this message. 
syz-executor0 D23560 6540 4521 0x80000004 Call Trace: context_switch kernel/sched/core.c:2848 [inline] __schedule+0x8fb/0x1ef0 kernel/sched/core.c:3490 schedule+0xf5/0x430 kernel/sched/core.c:3549 schedule_preempt_disabled+0x10/0x20 kernel/sched/core.c:3607 __mutex_lock_common kernel/locking/mutex.c:833 [inline] __mutex_lock+0xb7f/0x1810 kernel/locking/mutex.c:893 mutex_lock_nested+0x16/0x20 kernel/locking/mutex.c:908 lo_ioctl+0x8b/0x1b70 drivers/block/loop.c:1355 __blkdev_driver_ioctl block/ioctl.c:303 [inline] blkdev_ioctl+0x1759/0x1e00 block/ioctl.c:601 ioctl_by_bdev+0xa5/0x110 fs/block_dev.c:2060 isofs_get_last_session fs/isofs/inode.c:567 [inline] isofs_fill_super+0x2ba9/0x3bc0 fs/isofs/inode.c:660 mount_bdev+0x2b7/0x370 fs/super.c:1119 isofs_mount+0x34/0x40 fs/isofs/inode.c:1560 mount_fs+0x66/0x2d0 fs/super.c:1222 vfs_kern_mount.part.26+0xc6/0x4a0 fs/namespace.c:1037 vfs_kern_mount fs/namespace.c:2514 [inline] do_new_mount fs/namespace.c:2517 [inline] do_mount+0xea4/0x2b90 fs/namespace.c:2847 ksys_mount+0xab/0x120 fs/namespace.c:3063 SYSC_mount fs/namespace.c:3077 [inline] SyS_mount+0x39/0x50 fs/namespace.c:3074 do_syscall_64+0x281/0x940 arch/x86/entry/common.c:287 entry_SYSCALL_64_after_hwframe+0x42/0xb7 (...snipped...) Showing all locks held in the system: (...snipped...) 2 locks held by syz-executor0/6540: #0: 00000000566d4c39 (&type->s_umount_key#49/1){+.+.}, at: alloc_super fs/super.c:211 [inline] #0: 00000000566d4c39 (&type->s_umount_key#49/1){+.+.}, at: sget_userns+0x3b2/0xe60 fs/super.c:502 /* down_write_nested(&s->s_umount, SINGLE_DEPTH_NESTING); */ #1: 0000000043ca8836 (&lo->lo_ctl_mutex/1){+.+.}, at: lo_ioctl+0x8b/0x1b70 drivers/block/loop.c:1355 /* mutex_lock_nested(&lo->lo_ctl_mutex, 1); */ (...snipped...) 3 locks held by syz-executor7/6541: #0: 0000000043ca8836 (&lo->lo_ctl_mutex/1){+.+.}, at: lo_ioctl+0x8b/0x1b70 drivers/block/loop.c:1355 /* mutex_lock_nested(&lo->lo_ctl_mutex, 1); */ #1: 000000007bf3d3f9 (&bdev->bd_mutex){+.+.}, at: blkdev_reread_part+0x1e/0x40 block/ioctl.c:192 #2: 00000000566d4c39 (&type->s_umount_key#50){.+.+}, at: __get_super.part.10+0x1d3/0x280 fs/super.c:663 /* down_read(&sb->s_umount); */ ---------------------------------------- When reporting an AB-BA deadlock like shown above, it would be nice if trace of PID=6541 is printed as well as trace of PID=6540 before calling panic(). Showing hung tasks up to /proc/sys/kernel/hung_task_warnings could delay calling panic() but normally there should not be so many hung tasks. Link: http://lkml.kernel.org/r/201804050705.BHE57833.HVFOFtSOMQJFOL@I-love.SAKURA.ne.jp Signed-off-by: Tetsuo Handa Acked-by: Paul E. 
McKenney Acked-by: Dmitry Vyukov Cc: Vegard Nossum Cc: Mandeep Singh Baines Cc: Peter Zijlstra Cc: Thomas Gleixner Cc: Ingo Molnar Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- kernel/hung_task.c | 11 +++++++---- 1 file changed, 7 insertions(+), 4 deletions(-) diff --git a/kernel/hung_task.c b/kernel/hung_task.c index 751593ed7c0b..32b479468e4d 100644 --- a/kernel/hung_task.c +++ b/kernel/hung_task.c @@ -44,6 +44,7 @@ int __read_mostly sysctl_hung_task_warnings = 10; static int __read_mostly did_panic; static bool hung_task_show_lock; +static bool hung_task_call_panic; static struct task_struct *watchdog_task; @@ -127,10 +128,8 @@ static void check_hung_task(struct task_struct *t, unsigned long timeout) touch_nmi_watchdog(); if (sysctl_hung_task_panic) { - if (hung_task_show_lock) - debug_show_all_locks(); - trigger_all_cpu_backtrace(); - panic("hung_task: blocked tasks"); + hung_task_show_lock = true; + hung_task_call_panic = true; } } @@ -193,6 +192,10 @@ static void check_hung_uninterruptible_tasks(unsigned long timeout) rcu_read_unlock(); if (hung_task_show_lock) debug_show_all_locks(); + if (hung_task_call_panic) { + trigger_all_cpu_backtrace(); + panic("hung_task: blocked tasks"); + } } static long hung_timeout_jiffies(unsigned long last_checked, From 0455c74788fd5aad4399f00e3fbbb7e87450ca58 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 7 Jun 2018 17:10:38 -0700 Subject: [PATCH 098/118] get_maintainer: improve patch recognition There are mode change and rename only patches that are unrecognized by the get_maintainer.pl script. Recognize them. Link: http://lkml.kernel.org/r/bf63101a908d0ff51948164aa60e672368066186.1526949367.git.joe@perches.com Signed-off-by: Joe Perches Reported-by: Heinrich Schuchardt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/get_maintainer.pl | 13 ++++++++++++- 1 file changed, 12 insertions(+), 1 deletion(-) diff --git a/scripts/get_maintainer.pl b/scripts/get_maintainer.pl index 30eca36b4dad..c87fa734e3e1 100755 --- a/scripts/get_maintainer.pl +++ b/scripts/get_maintainer.pl @@ -542,7 +542,18 @@ foreach my $file (@ARGV) { while (<$patch>) { my $patch_line = $_; - if (m/^\+\+\+\s+(\S+)/ or m/^---\s+(\S+)/) { + if (m/^ mode change [0-7]+ => [0-7]+ (\S+)\s*$/) { + my $filename = $1; + push(@files, $filename); + } elsif (m/^rename (?:from|to) (\S+)\s*$/) { + my $filename = $1; + push(@files, $filename); + } elsif (m/^diff --git a\/(\S+) b\/(\S+)\s*$/) { + my $filename1 = $1; + my $filename2 = $2; + push(@files, $filename1); + push(@files, $filename2); + } elsif (m/^\+\+\+\s+(\S+)/ or m/^---\s+(\S+)/) { my $filename = $1; $filename =~ s@^[^/]*/@@; $filename =~ s@\n@@; From ca1250bbd4e0b852cafbc94d4d2610171107b252 Mon Sep 17 00:00:00 2001 From: Yury Norov Date: Thu, 7 Jun 2018 17:10:41 -0700 Subject: [PATCH 099/118] lib/bitmap.c: micro-optimization for __bitmap_complement() Use the BITS_TO_LONGS() macro to avoid calculating the remainder (bits % BITS_PER_LONG). On ARM64 it saves 5 instructions for the function - 16 before and 11 after.
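The reason the trailing fixup can be dropped is that BITS_TO_LONGS() rounds up, so the loop already covers a partial last word. A sketch (the macro shown is essentially its kernel definition) with a worked example for BITS_PER_LONG == 64:

	#define DIV_ROUND_UP(n, d)	(((n) + (d) - 1) / (d))
	#define BITS_TO_LONGS(nr)	DIV_ROUND_UP(nr, BITS_PER_LONG)

	/* bits = 130, BITS_PER_LONG = 64:
	 *   old: lim = 130 / 64 = 2 words, plus a separate tail word
	 *   new: lim = BITS_TO_LONGS(130) = (130 + 63) / 64 = 3 words,
	 *        so the loop alone complements the partial word too
	 */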
Link: http://lkml.kernel.org/r/20180411145914.6011-1-ynorov@caviumnetworks.com Signed-off-by: Yury Norov Reviewed-by: Andrew Morton Cc: Matthew Wilcox Cc: Rasmus Villemoes Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/bitmap.c | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/lib/bitmap.c b/lib/bitmap.c index a42eff7e8c48..58f9750e49c6 100644 --- a/lib/bitmap.c +++ b/lib/bitmap.c @@ -64,12 +64,9 @@ EXPORT_SYMBOL(__bitmap_equal); void __bitmap_complement(unsigned long *dst, const unsigned long *src, unsigned int bits) { - unsigned int k, lim = bits/BITS_PER_LONG; + unsigned int k, lim = BITS_TO_LONGS(bits); for (k = 0; k < lim; ++k) dst[k] = ~src[k]; - - if (bits % BITS_PER_LONG) - dst[k] = ~src[k]; } EXPORT_SYMBOL(__bitmap_complement); From b94078e69533ba237e2c229bca61bae47e6fafcc Mon Sep 17 00:00:00 2001 From: Matthew Wilcox Date: Thu, 7 Jun 2018 17:10:45 -0700 Subject: [PATCH 100/118] lib/idr.c: remove simple_ida_lock Improve the scalability of the IDA by using the per-IDA xa_lock rather than the global simple_ida_lock. IDAs are not typically used in performance-sensitive locations, but since we have this lock anyway, we can use it. It is also a step towards converting the IDA from the radix tree to the XArray. [akpm@linux-foundation.org: idr.c needs xarray.h] Link: http://lkml.kernel.org/r/20180331125332.GF13332@bombadil.infradead.org Signed-off-by: Matthew Wilcox Reviewed-by: Andrew Morton Cc: Rasmus Villemoes Cc: Daniel Vetter Cc: Tejun Heo Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/idr.c | 10 +++++----- 1 file changed, 5 insertions(+), 5 deletions(-) diff --git a/lib/idr.c b/lib/idr.c index 823b813f08f8..ed9c169c12bd 100644 --- a/lib/idr.c +++ b/lib/idr.c @@ -4,9 +4,9 @@ #include #include #include +#include DEFINE_PER_CPU(struct ida_bitmap *, ida_bitmap); -static DEFINE_SPINLOCK(simple_ida_lock); /** * idr_alloc_u32() - Allocate an ID. @@ -581,7 +581,7 @@ again: if (!ida_pre_get(ida, gfp_mask)) return -ENOMEM; - spin_lock_irqsave(&simple_ida_lock, flags); + xa_lock_irqsave(&ida->ida_rt, flags); ret = ida_get_new_above(ida, start, &id); if (!ret) { if (id > max) { @@ -591,7 +591,7 @@ again: ret = id; } } - spin_unlock_irqrestore(&simple_ida_lock, flags); + xa_unlock_irqrestore(&ida->ida_rt, flags); if (unlikely(ret == -EAGAIN)) goto again; @@ -615,8 +615,8 @@ void ida_simple_remove(struct ida *ida, unsigned int id) unsigned long flags; BUG_ON((int)id < 0); - spin_lock_irqsave(&simple_ida_lock, flags); + xa_lock_irqsave(&ida->ida_rt, flags); ida_remove(ida, id); - spin_unlock_irqrestore(&simple_ida_lock, flags); + xa_unlock_irqrestore(&ida->ida_rt, flags); } EXPORT_SYMBOL(ida_simple_remove); From 804209d8a0096d9ed8c9891f987f42bc1a98b8f9 Mon Sep 17 00:00:00 2001 From: Sebastian Andrzej Siewior Date: Thu, 7 Jun 2018 17:10:48 -0700 Subject: [PATCH 101/118] lib/percpu_ida.c: use _irqsave() instead of local_irq_save() + spin_lock percpu_ida() decouples disabling interrupts from the locking operations. This breaks some assumptions if the locking operations are replaced like they are under -RT. The same locking can be achieved by avoiding local_irq_save() and using spin_lock_irqsave() instead. percpu_ida_alloc() gains one more preemption point because after unlocking the fastpath and before the pool lock is acquired, the interrupts are briefly enabled. 
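The shape of the conversion, as a minimal sketch (grounded in the hunks below, with the surrounding code elided): coupling interrupt state to the lock keeps the IRQs-off window exactly as wide as the lock hold time, which is what RT's substituted sleeping locks require.

	/* before: IRQ state handled separately from the lock */
	local_irq_save(flags);
	spin_lock(&tags->lock);
	/* ... critical section ... */
	spin_unlock(&tags->lock);
	local_irq_restore(flags);

	/* after: one coupled operation; between two such regions the
	 * CPU runs with IRQs enabled, adding a preemption point */
	spin_lock_irqsave(&tags->lock, flags);
	/* ... critical section ... */
	spin_unlock_irqrestore(&tags->lock, flags);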
Link: http://lkml.kernel.org/r/20180504153218.7301-1-bigeasy@linutronix.de Signed-off-by: Sebastian Andrzej Siewior Reviewed-by: Andrew Morton Cc: Thomas Gleixner Cc: Nicholas Bellinger Cc: Shaohua Li Cc: Kent Overstreet Cc: Matthew Wilcox Cc: Jens Axboe Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/percpu_ida.c | 63 ++++++++++++++++-------------------------------- 1 file changed, 21 insertions(+), 42 deletions(-) diff --git a/lib/percpu_ida.c b/lib/percpu_ida.c index 6016f1deb1f5..9bbd9c5d375a 100644 --- a/lib/percpu_ida.c +++ b/lib/percpu_ida.c @@ -112,18 +112,6 @@ static inline void alloc_global_tags(struct percpu_ida *pool, min(pool->nr_free, pool->percpu_batch_size)); } -static inline unsigned alloc_local_tag(struct percpu_ida_cpu *tags) -{ - int tag = -ENOSPC; - - spin_lock(&tags->lock); - if (tags->nr_free) - tag = tags->freelist[--tags->nr_free]; - spin_unlock(&tags->lock); - - return tag; -} - /** * percpu_ida_alloc - allocate a tag * @pool: pool to allocate from @@ -147,20 +135,22 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state) DEFINE_WAIT(wait); struct percpu_ida_cpu *tags; unsigned long flags; - int tag; + int tag = -ENOSPC; - local_irq_save(flags); - tags = this_cpu_ptr(pool->tag_cpu); + tags = raw_cpu_ptr(pool->tag_cpu); + spin_lock_irqsave(&tags->lock, flags); /* Fastpath */ - tag = alloc_local_tag(tags); - if (likely(tag >= 0)) { - local_irq_restore(flags); + if (likely(tags->nr_free >= 0)) { + tag = tags->freelist[--tags->nr_free]; + spin_unlock_irqrestore(&tags->lock, flags); return tag; } + spin_unlock_irqrestore(&tags->lock, flags); while (1) { - spin_lock(&pool->lock); + spin_lock_irqsave(&pool->lock, flags); + tags = this_cpu_ptr(pool->tag_cpu); /* * prepare_to_wait() must come before steal_tags(), in case @@ -184,8 +174,7 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state) &pool->cpus_have_tags); } - spin_unlock(&pool->lock); - local_irq_restore(flags); + spin_unlock_irqrestore(&pool->lock, flags); if (tag >= 0 || state == TASK_RUNNING) break; @@ -196,9 +185,6 @@ int percpu_ida_alloc(struct percpu_ida *pool, int state) } schedule(); - - local_irq_save(flags); - tags = this_cpu_ptr(pool->tag_cpu); } if (state != TASK_RUNNING) finish_wait(&pool->wait, &wait); @@ -222,28 +208,24 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag) BUG_ON(tag >= pool->nr_tags); - local_irq_save(flags); - tags = this_cpu_ptr(pool->tag_cpu); + tags = raw_cpu_ptr(pool->tag_cpu); - spin_lock(&tags->lock); + spin_lock_irqsave(&tags->lock, flags); tags->freelist[tags->nr_free++] = tag; nr_free = tags->nr_free; - spin_unlock(&tags->lock); if (nr_free == 1) { cpumask_set_cpu(smp_processor_id(), &pool->cpus_have_tags); wake_up(&pool->wait); } + spin_unlock_irqrestore(&tags->lock, flags); if (nr_free == pool->percpu_max_size) { - spin_lock(&pool->lock); + spin_lock_irqsave(&pool->lock, flags); + spin_lock(&tags->lock); - /* - * Global lock held and irqs disabled, don't need percpu - * lock - */ if (tags->nr_free == pool->percpu_max_size) { move_tags(pool->freelist, &pool->nr_free, tags->freelist, &tags->nr_free, @@ -251,10 +233,9 @@ void percpu_ida_free(struct percpu_ida *pool, unsigned tag) wake_up(&pool->wait); } - spin_unlock(&pool->lock); + spin_unlock(&tags->lock); + spin_unlock_irqrestore(&pool->lock, flags); } - - local_irq_restore(flags); } EXPORT_SYMBOL_GPL(percpu_ida_free); @@ -346,29 +327,27 @@ int percpu_ida_for_each_free(struct percpu_ida *pool, percpu_ida_cb fn, struct percpu_ida_cpu *remote; unsigned cpu, i, err = 0; - 
local_irq_save(flags); for_each_possible_cpu(cpu) { remote = per_cpu_ptr(pool->tag_cpu, cpu); - spin_lock(&remote->lock); + spin_lock_irqsave(&remote->lock, flags); for (i = 0; i < remote->nr_free; i++) { err = fn(remote->freelist[i], data); if (err) break; } - spin_unlock(&remote->lock); + spin_unlock_irqrestore(&remote->lock, flags); if (err) goto out; } - spin_lock(&pool->lock); + spin_lock_irqsave(&pool->lock, flags); for (i = 0; i < pool->nr_free; i++) { err = fn(pool->freelist[i], data); if (err) break; } - spin_unlock(&pool->lock); + spin_unlock_irqrestore(&pool->lock, flags); out: - local_irq_restore(flags); return err; } EXPORT_SYMBOL_GPL(percpu_ida_for_each_free); From cbdc61ae1fa5d824fcfd59282b040f21144999ab Mon Sep 17 00:00:00 2001 From: Vasily Averin Date: Thu, 7 Jun 2018 17:10:51 -0700 Subject: [PATCH 102/118] lib/mpi: headers cleanup MPI headers contain definitions for a huge number of non-existent functions. Most of these functions were removed in 2012 by Dmitry Kasatkin - 7cf4206a99d1 ("Remove unused code from MPI library") - 9e235dcaf4f6 ("Revert "crypto: GnuPG based MPI lib - additional ...") - bc95eeadf5c6 ("lib/mpi: removed unused functions") however the headers were not updated properly. Also I deleted some unused macros. Link: http://lkml.kernel.org/r/fb2fc1ef-1185-f0a3-d8d0-173d2f97bbaf@virtuozzo.com Signed-off-by: Vasily Averin Reviewed-by: Andrew Morton Cc: Dmitry Kasatkin Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- include/linux/mpi.h | 61 ---------------------------------- lib/mpi/mpi-internal.h | 75 +++--------------------------------------- 2 files changed, 5 insertions(+), 131 deletions(-) diff --git a/include/linux/mpi.h b/include/linux/mpi.h index 1cc5ffb769af..7cd1473c64a4 100644 --- a/include/linux/mpi.h +++ b/include/linux/mpi.h @@ -53,93 +53,32 @@ struct gcry_mpi { typedef struct gcry_mpi *MPI; #define mpi_get_nlimbs(a) ((a)->nlimbs) -#define mpi_is_neg(a) ((a)->sign) /*-- mpiutil.c --*/ MPI mpi_alloc(unsigned nlimbs); -MPI mpi_alloc_secure(unsigned nlimbs); -MPI mpi_alloc_like(MPI a); void mpi_free(MPI a); int mpi_resize(MPI a, unsigned nlimbs); -int mpi_copy(MPI *copy, const MPI a); -void mpi_clear(MPI a); -int mpi_set(MPI w, MPI u); -int mpi_set_ui(MPI w, ulong u); -MPI mpi_alloc_set_ui(unsigned long u); -void mpi_m_check(MPI a); -void mpi_swap(MPI a, MPI b); /*-- mpicoder.c --*/ -MPI do_encode_md(const void *sha_buffer, unsigned nbits); MPI mpi_read_raw_data(const void *xbuffer, size_t nbytes); MPI mpi_read_from_buffer(const void *buffer, unsigned *ret_nread); MPI mpi_read_raw_from_sgl(struct scatterlist *sgl, unsigned int len); -int mpi_fromstr(MPI val, const char *str); -u32 mpi_get_keyid(MPI a, u32 *keyid); void *mpi_get_buffer(MPI a, unsigned *nbytes, int *sign); int mpi_read_buffer(MPI a, uint8_t *buf, unsigned buf_len, unsigned *nbytes, int *sign); -void *mpi_get_secure_buffer(MPI a, unsigned *nbytes, int *sign); int mpi_write_to_sgl(MPI a, struct scatterlist *sg, unsigned nbytes, int *sign); -#define log_mpidump g10_log_mpidump - -/*-- mpi-add.c --*/ -int mpi_add_ui(MPI w, MPI u, ulong v); -int mpi_add(MPI w, MPI u, MPI v); -int mpi_addm(MPI w, MPI u, MPI v, MPI m); -int mpi_sub_ui(MPI w, MPI u, ulong v); -int mpi_sub(MPI w, MPI u, MPI v); -int mpi_subm(MPI w, MPI u, MPI v, MPI m); - -/*-- mpi-mul.c --*/ -int mpi_mul_ui(MPI w, MPI u, ulong v); -int mpi_mul_2exp(MPI w, MPI u, ulong cnt); -int mpi_mul(MPI w, MPI u, MPI v); -int mpi_mulm(MPI w, MPI u, MPI v, MPI m); - -/*-- mpi-div.c --*/ -ulong mpi_fdiv_r_ui(MPI rem, MPI
dividend, ulong divisor); -int mpi_fdiv_r(MPI rem, MPI dividend, MPI divisor); -int mpi_fdiv_q(MPI quot, MPI dividend, MPI divisor); -int mpi_fdiv_qr(MPI quot, MPI rem, MPI dividend, MPI divisor); -int mpi_tdiv_r(MPI rem, MPI num, MPI den); -int mpi_tdiv_qr(MPI quot, MPI rem, MPI num, MPI den); -int mpi_tdiv_q_2exp(MPI w, MPI u, unsigned count); -int mpi_divisible_ui(const MPI dividend, ulong divisor); - -/*-- mpi-gcd.c --*/ -int mpi_gcd(MPI g, const MPI a, const MPI b); - /*-- mpi-pow.c --*/ -int mpi_pow(MPI w, MPI u, MPI v); int mpi_powm(MPI res, MPI base, MPI exp, MPI mod); -/*-- mpi-mpow.c --*/ -int mpi_mulpowm(MPI res, MPI *basearray, MPI *exparray, MPI mod); - /*-- mpi-cmp.c --*/ int mpi_cmp_ui(MPI u, ulong v); int mpi_cmp(MPI u, MPI v); -/*-- mpi-scan.c --*/ -int mpi_getbyte(MPI a, unsigned idx); -void mpi_putbyte(MPI a, unsigned idx, int value); -unsigned mpi_trailing_zeros(MPI a); - /*-- mpi-bit.c --*/ void mpi_normalize(MPI a); unsigned mpi_get_nbits(MPI a); -int mpi_test_bit(MPI a, unsigned n); -int mpi_set_bit(MPI a, unsigned n); -int mpi_set_highbit(MPI a, unsigned n); -void mpi_clear_highbit(MPI a, unsigned n); -void mpi_clear_bit(MPI a, unsigned n); -int mpi_rshift(MPI x, MPI a, unsigned n); - -/*-- mpi-inv.c --*/ -int mpi_invm(MPI x, MPI u, MPI v); /* inline functions */ diff --git a/lib/mpi/mpi-internal.h b/lib/mpi/mpi-internal.h index 7eceeddb3fb8..c2d6f4efcfbc 100644 --- a/lib/mpi/mpi-internal.h +++ b/lib/mpi/mpi-internal.h @@ -65,13 +65,6 @@ typedef mpi_limb_t *mpi_ptr_t; /* pointer to a limb */ typedef int mpi_size_t; /* (must be a signed type) */ -static inline int RESIZE_IF_NEEDED(MPI a, unsigned b) -{ - if (a->alloced < b) - return mpi_resize(a, b); - return 0; -} - /* Copy N limbs from S to D. */ #define MPN_COPY(d, s, n) \ do { \ @@ -80,13 +73,6 @@ static inline int RESIZE_IF_NEEDED(MPI a, unsigned b) (d)[_i] = (s)[_i]; \ } while (0) -#define MPN_COPY_INCR(d, s, n) \ - do { \ - mpi_size_t _i; \ - for (_i = 0; _i < (n); _i++) \ - (d)[_i] = (s)[_i]; \ - } while (0) - #define MPN_COPY_DECR(d, s, n) \ do { \ mpi_size_t _i; \ @@ -111,15 +97,6 @@ static inline int RESIZE_IF_NEEDED(MPI a, unsigned b) } \ } while (0) -#define MPN_NORMALIZE_NOT_ZERO(d, n) \ - do { \ - for (;;) { \ - if ((d)[(n)-1]) \ - break; \ - (n)--; \ - } \ - } while (0) - #define MPN_MUL_N_RECURSE(prodp, up, vp, size, tspace) \ do { \ if ((size) < KARATSUBA_THRESHOLD) \ @@ -128,46 +105,11 @@ static inline int RESIZE_IF_NEEDED(MPI a, unsigned b) mul_n(prodp, up, vp, size, tspace); \ } while (0); -/* Divide the two-limb number in (NH,,NL) by D, with DI being the largest - * limb not larger than (2**(2*BITS_PER_MP_LIMB))/D - (2**BITS_PER_MP_LIMB). - * If this would yield overflow, DI should be the largest possible number - * (i.e., only ones). For correct operation, the most significant bit of D - * has to be set. Put the quotient in Q and the remainder in R. 
- */ -#define UDIV_QRNND_PREINV(q, r, nh, nl, d, di) \ - do { \ - mpi_limb_t _q, _ql, _r; \ - mpi_limb_t _xh, _xl; \ - umul_ppmm(_q, _ql, (nh), (di)); \ - _q += (nh); /* DI is 2**BITS_PER_MPI_LIMB too small */ \ - umul_ppmm(_xh, _xl, _q, (d)); \ - sub_ddmmss(_xh, _r, (nh), (nl), _xh, _xl); \ - if (_xh) { \ - sub_ddmmss(_xh, _r, _xh, _r, 0, (d)); \ - _q++; \ - if (_xh) { \ - sub_ddmmss(_xh, _r, _xh, _r, 0, (d)); \ - _q++; \ - } \ - } \ - if (_r >= (d)) { \ - _r -= (d); \ - _q++; \ - } \ - (r) = _r; \ - (q) = _q; \ - } while (0) - /*-- mpiutil.c --*/ mpi_ptr_t mpi_alloc_limb_space(unsigned nlimbs); void mpi_free_limb_space(mpi_ptr_t a); void mpi_assign_limb_space(MPI a, mpi_ptr_t ap, unsigned nlimbs); -/*-- mpi-bit.c --*/ -void mpi_rshift_limbs(MPI a, unsigned int count); -int mpi_lshift_limbs(MPI a, unsigned int count); - -/*-- mpihelp-add.c --*/ static inline mpi_limb_t mpihelp_add_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size, mpi_limb_t s2_limb); mpi_limb_t mpihelp_add_n(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, @@ -175,7 +117,6 @@ mpi_limb_t mpihelp_add_n(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, static inline mpi_limb_t mpihelp_add(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size, mpi_ptr_t s2_ptr, mpi_size_t s2_size); -/*-- mpihelp-sub.c --*/ static inline mpi_limb_t mpihelp_sub_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size, mpi_limb_t s2_limb); mpi_limb_t mpihelp_sub_n(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, @@ -183,10 +124,10 @@ mpi_limb_t mpihelp_sub_n(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, static inline mpi_limb_t mpihelp_sub(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size, mpi_ptr_t s2_ptr, mpi_size_t s2_size); -/*-- mpihelp-cmp.c --*/ +/*-- mpih-cmp.c --*/ int mpihelp_cmp(mpi_ptr_t op1_ptr, mpi_ptr_t op2_ptr, mpi_size_t size); -/*-- mpihelp-mul.c --*/ +/*-- mpih-mul.c --*/ struct karatsuba_ctx { struct karatsuba_ctx *next; @@ -202,7 +143,6 @@ mpi_limb_t mpihelp_addmul_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size, mpi_limb_t s2_limb); mpi_limb_t mpihelp_submul_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size, mpi_limb_t s2_limb); -int mpihelp_mul_n(mpi_ptr_t prodp, mpi_ptr_t up, mpi_ptr_t vp, mpi_size_t size); int mpihelp_mul(mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t usize, mpi_ptr_t vp, mpi_size_t vsize, mpi_limb_t *_result); void mpih_sqr_n_basecase(mpi_ptr_t prodp, mpi_ptr_t up, mpi_size_t size); @@ -214,21 +154,16 @@ int mpihelp_mul_karatsuba_case(mpi_ptr_t prodp, mpi_ptr_t vp, mpi_size_t vsize, struct karatsuba_ctx *ctx); -/*-- mpihelp-mul_1.c (or xxx/cpu/ *.S) --*/ +/*-- generic_mpih-mul1.c --*/ mpi_limb_t mpihelp_mul_1(mpi_ptr_t res_ptr, mpi_ptr_t s1_ptr, mpi_size_t s1_size, mpi_limb_t s2_limb); -/*-- mpihelp-div.c --*/ -mpi_limb_t mpihelp_mod_1(mpi_ptr_t dividend_ptr, mpi_size_t dividend_size, - mpi_limb_t divisor_limb); +/*-- mpih-div.c --*/ mpi_limb_t mpihelp_divrem(mpi_ptr_t qp, mpi_size_t qextra_limbs, mpi_ptr_t np, mpi_size_t nsize, mpi_ptr_t dp, mpi_size_t dsize); -mpi_limb_t mpihelp_divmod_1(mpi_ptr_t quot_ptr, - mpi_ptr_t dividend_ptr, mpi_size_t dividend_size, - mpi_limb_t divisor_limb); -/*-- mpihelp-shift.c --*/ +/*-- generic_mpih-[lr]shift.c --*/ mpi_limb_t mpihelp_lshift(mpi_ptr_t wp, mpi_ptr_t up, mpi_size_t usize, unsigned cnt); mpi_limb_t mpihelp_rshift(mpi_ptr_t wp, mpi_ptr_t up, mpi_size_t usize, From 09088a4047e2d1e749f8ceab1f39c5c0055032e5 Mon Sep 17 00:00:00 2001 From: Randy Dunlap Date: Thu, 7 Jun 2018 17:10:55 -0700 Subject: [PATCH 103/118] lib/ucs2_string.c: add MODULE_LICENSE() Fix missing 
MODULE_LICENSE() warning in lib/ucs2_string.c: WARNING: modpost: missing MODULE_LICENSE() in lib/ucs2_string.o see include/linux/module.h for more information Link: http://lkml.kernel.org/r/b2505bb4-dcf5-fc46-443d-e47db1cb2f59@infradead.org Signed-off-by: Randy Dunlap Cc: Greg Kroah-Hartman Cc: Matthew Garrett Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- lib/ucs2_string.c | 2 ++ 1 file changed, 2 insertions(+) diff --git a/lib/ucs2_string.c b/lib/ucs2_string.c index d7e06b28de38..0a559a42359b 100644 --- a/lib/ucs2_string.c +++ b/lib/ucs2_string.c @@ -112,3 +112,5 @@ ucs2_as_utf8(u8 *dest, const ucs2_char_t *src, unsigned long maxlength) return j; } EXPORT_SYMBOL(ucs2_as_utf8); + +MODULE_LICENSE("GPL v2"); From 12c253abb9c9d8e01d9365fc8cbde4688731f401 Mon Sep 17 00:00:00 2001 From: Joe Perches Date: Thu, 7 Jun 2018 17:10:58 -0700 Subject: [PATCH 104/118] checkpatch: improve patch recognition There are mode change and rename only patches that are unrecognized by checkpatch. Recognize them. [joe@perches.com: fix missing close parenthesis] Link: http://lkml.kernel.org/r/af44c893f6973393f2a5b11f1a8e5cd4c8bbbba5.camel@perches.com Link: http://lkml.kernel.org/r/974a407e6fa18abd5a965da39cc68986a4c4f091.1526949367.git.joe@perches.com Signed-off-by: Joe Perches Reported-by: Heinrich Schuchardt Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- scripts/checkpatch.pl | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/scripts/checkpatch.pl b/scripts/checkpatch.pl index dfd265e036f3..e3b7362b0ee4 100755 --- a/scripts/checkpatch.pl +++ b/scripts/checkpatch.pl @@ -2377,6 +2377,14 @@ sub process { my $rawline = $rawlines[$linenr - 1]; +# check if it's a mode change, rename or start of a patch + if (!$in_commit_log && + ($line =~ /^ mode change [0-7]+ => [0-7]+ \S+\s*$/ || + ($line =~ /^rename (?:from|to) \S+\s*$/ || + $line =~ /^diff --git a\/[\w\/\.\_\-]+ b\/\S+\s*$/))) { + $is_patch = 1; + } + #extract the line range in the file after the patch is applied if (!$in_commit_log && $line =~ /^\@\@ -\d+(?:,\d+)? \+(\d+)(,(\d+))? \@\@(.*)/) { From 5cc41e099504b77014358b58567c5ea6293dd220 Mon Sep 17 00:00:00 2001 From: Thadeu Lima de Souza Cascardo Date: Thu, 7 Jun 2018 17:11:01 -0700 Subject: [PATCH 105/118] fs/binfmt_misc.c: do not allow offset overflow When registering a new binfmt_misc handler, it is possible to overflow the offset to get a negative value, which might crash the system, or possibly leak kernel data. Here is a crash log when 2500000000 was used as an offset: BUG: unable to handle kernel paging request at ffff989cfd6edca0 IP: load_misc_binary+0x22b/0x470 [binfmt_misc] PGD 1ef3e067 P4D 1ef3e067 PUD 0 Oops: 0000 [#1] SMP NOPTI Modules linked in: binfmt_misc kvm_intel ppdev kvm irqbypass joydev input_leds serio_raw mac_hid parport_pc qemu_fw_cfg parpy CPU: 0 PID: 2499 Comm: bash Not tainted 4.15.0-22-generic #24-Ubuntu Hardware name: QEMU Standard PC (i440FX + PIIX, 1996), BIOS 1.11.1-1 04/01/2014 RIP: 0010:load_misc_binary+0x22b/0x470 [binfmt_misc] Call Trace: search_binary_handler+0x97/0x1d0 do_execveat_common.isra.34+0x667/0x810 SyS_execve+0x31/0x40 do_syscall_64+0x73/0x130 entry_SYSCALL_64_after_hwframe+0x3d/0xa2 Use kstrtoint instead of simple_strtoul. It will work as the code already sets the delimiter byte to '\0' and we only do it when the field is not empty. Tested with offsets -1, 2500000000, UINT_MAX and INT_MAX. Also tested with examples documented at Documentation/admin-guide/binfmt-misc.rst and other registrations from packages on Ubuntu.
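The arithmetic behind the second hunk, restated as an illustrative userspace check (editor's sketch; the helper name is an assumption, and BINPRM_BUF_SIZE was 128 at the time):

	#include <stdbool.h>

	#define BINPRM_BUF_SIZE 128

	static bool entry_fits(int size, int offset)
	{
		if (size < 0 || offset < 0)
			return false;
		/* "size + offset > BINPRM_BUF_SIZE" could wrap around for
		 * large values; comparing by subtraction cannot overflow */
		return size <= BINPRM_BUF_SIZE &&
		       BINPRM_BUF_SIZE - size >= offset;
	}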
Link: http://lkml.kernel.org/r/20180529135648.14254-1-cascardo@canonical.com Fixes: 1da177e4c3f4 ("Linux-2.6.12-rc2") Signed-off-by: Thadeu Lima de Souza Cascardo Reviewed-by: Andrew Morton Cc: Alexander Viro Cc: Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/binfmt_misc.c | 12 +++++++++--- 1 file changed, 9 insertions(+), 3 deletions(-) diff --git a/fs/binfmt_misc.c b/fs/binfmt_misc.c index a41b48f82a70..4de191563261 100644 --- a/fs/binfmt_misc.c +++ b/fs/binfmt_misc.c @@ -387,8 +387,13 @@ static Node *create_entry(const char __user *buffer, size_t count) s = strchr(p, del); if (!s) goto einval; - *s++ = '\0'; - e->offset = simple_strtoul(p, &p, 10); + *s = '\0'; + if (p != s) { + int r = kstrtoint(p, 10, &e->offset); + if (r != 0 || e->offset < 0) + goto einval; + } + p = s; if (*p++) goto einval; pr_debug("register: offset: %#x\n", e->offset); @@ -428,7 +433,8 @@ static Node *create_entry(const char __user *buffer, size_t count) if (e->mask && string_unescape_inplace(e->mask, UNESCAPE_HEX) != e->size) goto einval; - if (e->size + e->offset > BINPRM_BUF_SIZE) + if (e->size > BINPRM_BUF_SIZE || + BINPRM_BUF_SIZE - e->size < e->offset) goto einval; pr_debug("register: magic/mask length: %i\n", e->size); if (USE_DEBUG) { From ef8b42f78e883cb72fd781c2eff3a42d89c1c0c3 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Thu, 7 Jun 2018 17:11:05 -0700 Subject: [PATCH 106/118] autofs4: merge auto_fs.h and auto_fs4.h The autofs module has long since been removed so there's no need to have two separate include files for autofs. Link: http://lkml.kernel.org/r/152626703024.28589.9571964661718767929.stgit@pluto.themaw.net Signed-off-by: Ian Kent Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/autofs_i.h | 2 +- fs/compat_ioctl.c | 1 - include/uapi/linux/auto_fs.h | 169 +++++++++++++++++++++++++++++++--- include/uapi/linux/auto_fs4.h | 153 +----------------------------- 4 files changed, 161 insertions(+), 164 deletions(-) diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 4737615f0eaa..01636f3945d5 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -9,7 +9,7 @@ /* Internal header file for autofs */ -#include +#include #include #include diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index ef80085ed564..b3e1768b636e 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -39,7 +39,6 @@ #include #include #include -#include #include #include #include diff --git a/include/uapi/linux/auto_fs.h b/include/uapi/linux/auto_fs.h index 2a4432c7a4b4..e13eec3dfb2f 100644 --- a/include/uapi/linux/auto_fs.h +++ b/include/uapi/linux/auto_fs.h @@ -1,6 +1,8 @@ /* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ /* - * Copyright 1997 Transmeta Corporation - All Rights Reserved + * Copyright 1997 Transmeta Corporation - All Rights Reserved + * Copyright 1999-2000 Jeremy Fitzhardinge + * Copyright 2005-2006,2013,2017-2018 Ian Kent * * This file is part of the Linux kernel and is made available under * the terms of the GNU General Public License, version 2, or at your @@ -8,7 +10,6 @@ * * ----------------------------------------------------------------------- */ - #ifndef _UAPI_LINUX_AUTO_FS_H #define _UAPI_LINUX_AUTO_FS_H @@ -18,13 +19,11 @@ #include #endif /* __KERNEL__ */ +#define AUTOFS_PROTO_VERSION 5 +#define AUTOFS_MIN_PROTO_VERSION 3 +#define AUTOFS_MAX_PROTO_VERSION 5 -/* This file describes autofs v3 */ -#define AUTOFS_PROTO_VERSION 3 - -/* Range of protocol versions defined */ -#define AUTOFS_MAX_PROTO_VERSION 
AUTOFS_PROTO_VERSION -#define AUTOFS_MIN_PROTO_VERSION AUTOFS_PROTO_VERSION +#define AUTOFS_PROTO_SUBVERSION 2 /* * The wait_queue_token (autofs_wqt_t) is part of a structure which is passed @@ -76,9 +75,155 @@ enum { #define AUTOFS_IOC_READY _IO(AUTOFS_IOCTL, AUTOFS_IOC_READY_CMD) #define AUTOFS_IOC_FAIL _IO(AUTOFS_IOCTL, AUTOFS_IOC_FAIL_CMD) #define AUTOFS_IOC_CATATONIC _IO(AUTOFS_IOCTL, AUTOFS_IOC_CATATONIC_CMD) -#define AUTOFS_IOC_PROTOVER _IOR(AUTOFS_IOCTL, AUTOFS_IOC_PROTOVER_CMD, int) -#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(AUTOFS_IOCTL, AUTOFS_IOC_SETTIMEOUT_CMD, compat_ulong_t) -#define AUTOFS_IOC_SETTIMEOUT _IOWR(AUTOFS_IOCTL, AUTOFS_IOC_SETTIMEOUT_CMD, unsigned long) -#define AUTOFS_IOC_EXPIRE _IOR(AUTOFS_IOCTL, AUTOFS_IOC_EXPIRE_CMD, struct autofs_packet_expire) +#define AUTOFS_IOC_PROTOVER _IOR(AUTOFS_IOCTL, \ + AUTOFS_IOC_PROTOVER_CMD, int) +#define AUTOFS_IOC_SETTIMEOUT32 _IOWR(AUTOFS_IOCTL, \ + AUTOFS_IOC_SETTIMEOUT_CMD, \ + compat_ulong_t) +#define AUTOFS_IOC_SETTIMEOUT _IOWR(AUTOFS_IOCTL, \ + AUTOFS_IOC_SETTIMEOUT_CMD, \ + unsigned long) +#define AUTOFS_IOC_EXPIRE _IOR(AUTOFS_IOCTL, \ + AUTOFS_IOC_EXPIRE_CMD, \ + struct autofs_packet_expire) + +/* autofs version 4 and later definitions */ + +/* Mask for expire behaviour */ +#define AUTOFS_EXP_IMMEDIATE 1 +#define AUTOFS_EXP_LEAVES 2 + +#define AUTOFS_TYPE_ANY 0U +#define AUTOFS_TYPE_INDIRECT 1U +#define AUTOFS_TYPE_DIRECT 2U +#define AUTOFS_TYPE_OFFSET 4U + +static inline void set_autofs_type_indirect(unsigned int *type) +{ + *type = AUTOFS_TYPE_INDIRECT; +} + +static inline unsigned int autofs_type_indirect(unsigned int type) +{ + return (type == AUTOFS_TYPE_INDIRECT); +} + +static inline void set_autofs_type_direct(unsigned int *type) +{ + *type = AUTOFS_TYPE_DIRECT; +} + +static inline unsigned int autofs_type_direct(unsigned int type) +{ + return (type == AUTOFS_TYPE_DIRECT); +} + +static inline void set_autofs_type_offset(unsigned int *type) +{ + *type = AUTOFS_TYPE_OFFSET; +} + +static inline unsigned int autofs_type_offset(unsigned int type) +{ + return (type == AUTOFS_TYPE_OFFSET); +} + +static inline unsigned int autofs_type_trigger(unsigned int type) +{ + return (type == AUTOFS_TYPE_DIRECT || type == AUTOFS_TYPE_OFFSET); +} + +/* + * This isn't really a type as we use it to say "no type set" to + * indicate we want to search for "any" mount in the + * autofs_dev_ioctl_ismountpoint() device ioctl function. + */ +static inline void set_autofs_type_any(unsigned int *type) +{ + *type = AUTOFS_TYPE_ANY; +} + +static inline unsigned int autofs_type_any(unsigned int type) +{ + return (type == AUTOFS_TYPE_ANY); +} + +/* Daemon notification packet types */ +enum autofs_notify { + NFY_NONE, + NFY_MOUNT, + NFY_EXPIRE +}; + +/* Kernel protocol version 4 packet types */ + +/* Expire entry (umount request) */ +#define autofs_ptype_expire_multi 2 + +/* Kernel protocol version 5 packet types */ + +/* Indirect mount missing and expire requests. 
*/ +#define autofs_ptype_missing_indirect 3 +#define autofs_ptype_expire_indirect 4 + +/* Direct mount missing and expire requests */ +#define autofs_ptype_missing_direct 5 +#define autofs_ptype_expire_direct 6 + +/* v4 multi expire (via pipe) */ +struct autofs_packet_expire_multi { + struct autofs_packet_hdr hdr; + autofs_wqt_t wait_queue_token; + int len; + char name[NAME_MAX+1]; +}; + +union autofs_packet_union { + struct autofs_packet_hdr hdr; + struct autofs_packet_missing missing; + struct autofs_packet_expire expire; + struct autofs_packet_expire_multi expire_multi; +}; + +/* autofs v5 common packet struct */ +struct autofs_v5_packet { + struct autofs_packet_hdr hdr; + autofs_wqt_t wait_queue_token; + __u32 dev; + __u64 ino; + __u32 uid; + __u32 gid; + __u32 pid; + __u32 tgid; + __u32 len; + char name[NAME_MAX+1]; +}; + +typedef struct autofs_v5_packet autofs_packet_missing_indirect_t; +typedef struct autofs_v5_packet autofs_packet_expire_indirect_t; +typedef struct autofs_v5_packet autofs_packet_missing_direct_t; +typedef struct autofs_v5_packet autofs_packet_expire_direct_t; + +union autofs_v5_packet_union { + struct autofs_packet_hdr hdr; + struct autofs_v5_packet v5_packet; + autofs_packet_missing_indirect_t missing_indirect; + autofs_packet_expire_indirect_t expire_indirect; + autofs_packet_missing_direct_t missing_direct; + autofs_packet_expire_direct_t expire_direct; +}; + +enum { + AUTOFS_IOC_EXPIRE_MULTI_CMD = 0x66, /* AUTOFS_IOC_EXPIRE_CMD + 1 */ + AUTOFS_IOC_PROTOSUBVER_CMD, + AUTOFS_IOC_ASKUMOUNT_CMD = 0x70, /* AUTOFS_DEV_IOCTL_VERSION_CMD - 1 */ +}; + +#define AUTOFS_IOC_EXPIRE_MULTI _IOW(AUTOFS_IOCTL, \ + AUTOFS_IOC_EXPIRE_MULTI_CMD, int) +#define AUTOFS_IOC_PROTOSUBVER _IOR(AUTOFS_IOCTL, \ + AUTOFS_IOC_PROTOSUBVER_CMD, int) +#define AUTOFS_IOC_ASKUMOUNT _IOR(AUTOFS_IOCTL, \ + AUTOFS_IOC_ASKUMOUNT_CMD, int) #endif /* _UAPI_LINUX_AUTO_FS_H */ diff --git a/include/uapi/linux/auto_fs4.h b/include/uapi/linux/auto_fs4.h index 1f608e27a06f..d01ef0a0189c 100644 --- a/include/uapi/linux/auto_fs4.h +++ b/include/uapi/linux/auto_fs4.h @@ -7,156 +7,9 @@ * option, any later version, incorporated herein by reference. 
*/ -#ifndef _LINUX_AUTO_FS4_H -#define _LINUX_AUTO_FS4_H +#ifndef _UAPI_LINUX_AUTO_FS4_H +#define _UAPI_LINUX_AUTO_FS4_H -/* Include common v3 definitions */ -#include <linux/types.h> #include <linux/auto_fs.h> -/* autofs v4 definitions */ -#undef AUTOFS_PROTO_VERSION -#undef AUTOFS_MIN_PROTO_VERSION -#undef AUTOFS_MAX_PROTO_VERSION - -#define AUTOFS_PROTO_VERSION 5 -#define AUTOFS_MIN_PROTO_VERSION 3 -#define AUTOFS_MAX_PROTO_VERSION 5 - -#define AUTOFS_PROTO_SUBVERSION 2 - -/* Mask for expire behaviour */ -#define AUTOFS_EXP_IMMEDIATE 1 -#define AUTOFS_EXP_LEAVES 2 - -#define AUTOFS_TYPE_ANY 0U -#define AUTOFS_TYPE_INDIRECT 1U -#define AUTOFS_TYPE_DIRECT 2U -#define AUTOFS_TYPE_OFFSET 4U - -static inline void set_autofs_type_indirect(unsigned int *type) -{ - *type = AUTOFS_TYPE_INDIRECT; -} - -static inline unsigned int autofs_type_indirect(unsigned int type) -{ - return (type == AUTOFS_TYPE_INDIRECT); -} - -static inline void set_autofs_type_direct(unsigned int *type) -{ - *type = AUTOFS_TYPE_DIRECT; -} - -static inline unsigned int autofs_type_direct(unsigned int type) -{ - return (type == AUTOFS_TYPE_DIRECT); -} - -static inline void set_autofs_type_offset(unsigned int *type) -{ - *type = AUTOFS_TYPE_OFFSET; -} - -static inline unsigned int autofs_type_offset(unsigned int type) -{ - return (type == AUTOFS_TYPE_OFFSET); -} - -static inline unsigned int autofs_type_trigger(unsigned int type) -{ - return (type == AUTOFS_TYPE_DIRECT || type == AUTOFS_TYPE_OFFSET); -} - -/* - * This isn't really a type as we use it to say "no type set" to - * indicate we want to search for "any" mount in the - * autofs_dev_ioctl_ismountpoint() device ioctl function. - */ -static inline void set_autofs_type_any(unsigned int *type) -{ - *type = AUTOFS_TYPE_ANY; -} - -static inline unsigned int autofs_type_any(unsigned int type) -{ - return (type == AUTOFS_TYPE_ANY); -} - -/* Daemon notification packet types */ -enum autofs_notify { - NFY_NONE, - NFY_MOUNT, - NFY_EXPIRE -}; - -/* Kernel protocol version 4 packet types */ - -/* Expire entry (umount request) */ -#define autofs_ptype_expire_multi 2 - -/* Kernel protocol version 5 packet types */ - -/* Indirect mount missing and expire requests. 
*/ -#define autofs_ptype_missing_indirect 3 -#define autofs_ptype_expire_indirect 4 - -/* Direct mount missing and expire requests */ -#define autofs_ptype_missing_direct 5 -#define autofs_ptype_expire_direct 6 - -/* v4 multi expire (via pipe) */ -struct autofs_packet_expire_multi { - struct autofs_packet_hdr hdr; - autofs_wqt_t wait_queue_token; - int len; - char name[NAME_MAX+1]; -}; - -union autofs_packet_union { - struct autofs_packet_hdr hdr; - struct autofs_packet_missing missing; - struct autofs_packet_expire expire; - struct autofs_packet_expire_multi expire_multi; -}; - -/* autofs v5 common packet struct */ -struct autofs_v5_packet { - struct autofs_packet_hdr hdr; - autofs_wqt_t wait_queue_token; - __u32 dev; - __u64 ino; - __u32 uid; - __u32 gid; - __u32 pid; - __u32 tgid; - __u32 len; - char name[NAME_MAX+1]; -}; - -typedef struct autofs_v5_packet autofs_packet_missing_indirect_t; -typedef struct autofs_v5_packet autofs_packet_expire_indirect_t; -typedef struct autofs_v5_packet autofs_packet_missing_direct_t; -typedef struct autofs_v5_packet autofs_packet_expire_direct_t; - -union autofs_v5_packet_union { - struct autofs_packet_hdr hdr; - struct autofs_v5_packet v5_packet; - autofs_packet_missing_indirect_t missing_indirect; - autofs_packet_expire_indirect_t expire_indirect; - autofs_packet_missing_direct_t missing_direct; - autofs_packet_expire_direct_t expire_direct; -}; - -enum { - AUTOFS_IOC_EXPIRE_MULTI_CMD = 0x66, /* AUTOFS_IOC_EXPIRE_CMD + 1 */ - AUTOFS_IOC_PROTOSUBVER_CMD, - AUTOFS_IOC_ASKUMOUNT_CMD = 0x70, /* AUTOFS_DEV_IOCTL_VERSION_CMD - 1 */ -}; - -#define AUTOFS_IOC_EXPIRE_MULTI _IOW(AUTOFS_IOCTL, AUTOFS_IOC_EXPIRE_MULTI_CMD, int) -#define AUTOFS_IOC_PROTOSUBVER _IOR(AUTOFS_IOCTL, AUTOFS_IOC_PROTOSUBVER_CMD, int) -#define AUTOFS_IOC_ASKUMOUNT _IOR(AUTOFS_IOCTL, AUTOFS_IOC_ASKUMOUNT_CMD, int) - -#endif /* _LINUX_AUTO_FS4_H */ +#endif /* _UAPI_LINUX_AUTO_FS4_H */ From 47206e012a0ad05f358342c083cc6021160b61f9 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Thu, 7 Jun 2018 17:11:09 -0700 Subject: [PATCH 107/118] autofs4: use autofs instead of autofs4 everywhere Update naming within autofs source to be consistent by changing occurrences of autofs4 to autofs. Link: http://lkml.kernel.org/r/152626703688.28589.8315406711135226803.stgit@pluto.themaw.net Signed-off-by: Ian Kent Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/autofs_i.h | 88 +++++++------- fs/autofs4/dev-ioctl.c | 18 +-- fs/autofs4/expire.c | 132 ++++++++++---------- fs/autofs4/init.c | 12 +- fs/autofs4/inode.c | 48 ++++---- fs/autofs4/root.c | 269 ++++++++++++++++++++--------------------- fs/autofs4/symlink.c | 16 +-- fs/autofs4/waitq.c | 53 ++++---- 8 files changed, 318 insertions(+), 318 deletions(-) diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h index 01636f3945d5..9110b66c7ef1 100644 --- a/fs/autofs4/autofs_i.h +++ b/fs/autofs4/autofs_i.h @@ -122,44 +122,44 @@ struct autofs_sb_info { struct rcu_head rcu; }; -static inline struct autofs_sb_info *autofs4_sbi(struct super_block *sb) +static inline struct autofs_sb_info *autofs_sbi(struct super_block *sb) { return (struct autofs_sb_info *)(sb->s_fs_info); } -static inline struct autofs_info *autofs4_dentry_ino(struct dentry *dentry) +static inline struct autofs_info *autofs_dentry_ino(struct dentry *dentry) { return (struct autofs_info *)(dentry->d_fsdata); } -/* autofs4_oz_mode(): do we see the man behind the curtain? (The +/* autofs_oz_mode(): do we see the man behind the curtain? 
(The * processes which do manipulations for us in user space sees the raw * filesystem without "magic".) */ -static inline int autofs4_oz_mode(struct autofs_sb_info *sbi) +static inline int autofs_oz_mode(struct autofs_sb_info *sbi) { return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp; } -struct inode *autofs4_get_inode(struct super_block *, umode_t); -void autofs4_free_ino(struct autofs_info *); +struct inode *autofs_get_inode(struct super_block *, umode_t); +void autofs_free_ino(struct autofs_info *); /* Expiration */ -int is_autofs4_dentry(struct dentry *); -int autofs4_expire_wait(const struct path *path, int rcu_walk); -int autofs4_expire_run(struct super_block *, struct vfsmount *, - struct autofs_sb_info *, - struct autofs_packet_expire __user *); -int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, - struct autofs_sb_info *sbi, int when); -int autofs4_expire_multi(struct super_block *, struct vfsmount *, - struct autofs_sb_info *, int __user *); -struct dentry *autofs4_expire_direct(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, int how); -struct dentry *autofs4_expire_indirect(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, int how); +int is_autofs_dentry(struct dentry *); +int autofs_expire_wait(const struct path *path, int rcu_walk); +int autofs_expire_run(struct super_block *, struct vfsmount *, + struct autofs_sb_info *, + struct autofs_packet_expire __user *); +int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, + struct autofs_sb_info *sbi, int when); +int autofs_expire_multi(struct super_block *, struct vfsmount *, + struct autofs_sb_info *, int __user *); +struct dentry *autofs_expire_direct(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, int how); +struct dentry *autofs_expire_indirect(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, int how); /* Device node initialization */ @@ -168,11 +168,11 @@ void autofs_dev_ioctl_exit(void); /* Operations structures */ -extern const struct inode_operations autofs4_symlink_inode_operations; -extern const struct inode_operations autofs4_dir_inode_operations; -extern const struct file_operations autofs4_dir_operations; -extern const struct file_operations autofs4_root_operations; -extern const struct dentry_operations autofs4_dentry_operations; +extern const struct inode_operations autofs_symlink_inode_operations; +extern const struct inode_operations autofs_dir_inode_operations; +extern const struct file_operations autofs_dir_operations; +extern const struct file_operations autofs_root_operations; +extern const struct dentry_operations autofs_dentry_operations; /* VFS automount flags management functions */ static inline void __managed_dentry_set_managed(struct dentry *dentry) @@ -201,9 +201,9 @@ static inline void managed_dentry_clear_managed(struct dentry *dentry) /* Initializing function */ -int autofs4_fill_super(struct super_block *, void *, int); -struct autofs_info *autofs4_new_ino(struct autofs_sb_info *); -void autofs4_clean_ino(struct autofs_info *); +int autofs_fill_super(struct super_block *, void *, int); +struct autofs_info *autofs_new_ino(struct autofs_sb_info *); +void autofs_clean_ino(struct autofs_info *); static inline int autofs_prepare_pipe(struct file *pipe) { @@ -218,25 +218,25 @@ static inline int autofs_prepare_pipe(struct file *pipe) /* Queue management functions */ -int autofs4_wait(struct autofs_sb_info *, +int autofs_wait(struct 
autofs_sb_info *, const struct path *, enum autofs_notify); -int autofs4_wait_release(struct autofs_sb_info *, autofs_wqt_t, int); -void autofs4_catatonic_mode(struct autofs_sb_info *); +int autofs_wait_release(struct autofs_sb_info *, autofs_wqt_t, int); +void autofs_catatonic_mode(struct autofs_sb_info *); -static inline u32 autofs4_get_dev(struct autofs_sb_info *sbi) +static inline u32 autofs_get_dev(struct autofs_sb_info *sbi) { return new_encode_dev(sbi->sb->s_dev); } -static inline u64 autofs4_get_ino(struct autofs_sb_info *sbi) +static inline u64 autofs_get_ino(struct autofs_sb_info *sbi) { return d_inode(sbi->sb->s_root)->i_ino; } -static inline void __autofs4_add_expiring(struct dentry *dentry) +static inline void __autofs_add_expiring(struct dentry *dentry) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); if (ino) { if (list_empty(&ino->expiring)) @@ -244,10 +244,10 @@ static inline void __autofs4_add_expiring(struct dentry *dentry) } } -static inline void autofs4_add_expiring(struct dentry *dentry) +static inline void autofs_add_expiring(struct dentry *dentry) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); if (ino) { spin_lock(&sbi->lookup_lock); @@ -257,10 +257,10 @@ static inline void autofs4_add_expiring(struct dentry *dentry) } } -static inline void autofs4_del_expiring(struct dentry *dentry) +static inline void autofs_del_expiring(struct dentry *dentry) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); if (ino) { spin_lock(&sbi->lookup_lock); @@ -270,4 +270,4 @@ static inline void autofs4_del_expiring(struct dentry *dentry) } } -void autofs4_kill_sb(struct super_block *); +void autofs_kill_sb(struct super_block *); diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c index 26f6b4f41ce6..a2281ab2b957 100644 --- a/fs/autofs4/dev-ioctl.c +++ b/fs/autofs4/dev-ioctl.c @@ -166,7 +166,7 @@ static struct autofs_sb_info *autofs_dev_ioctl_sbi(struct file *f) if (f) { inode = file_inode(f); - sbi = autofs4_sbi(inode->i_sb); + sbi = autofs_sbi(inode->i_sb); } return sbi; } @@ -236,7 +236,7 @@ static int test_by_dev(const struct path *path, void *p) static int test_by_type(const struct path *path, void *p) { - struct autofs_info *ino = autofs4_dentry_ino(path->dentry); + struct autofs_info *ino = autofs_dentry_ino(path->dentry); return ino && ino->sbi->type & *(unsigned *)p; } @@ -324,7 +324,7 @@ static int autofs_dev_ioctl_ready(struct file *fp, autofs_wqt_t token; token = (autofs_wqt_t) param->ready.token; - return autofs4_wait_release(sbi, token, 0); + return autofs_wait_release(sbi, token, 0); } /* @@ -340,7 +340,7 @@ static int autofs_dev_ioctl_fail(struct file *fp, token = (autofs_wqt_t) param->fail.token; status = param->fail.status < 0 ? 
param->fail.status : -ENOENT; - return autofs4_wait_release(sbi, token, status); + return autofs_wait_release(sbi, token, status); } /* @@ -412,7 +412,7 @@ static int autofs_dev_ioctl_catatonic(struct file *fp, struct autofs_sb_info *sbi, struct autofs_dev_ioctl *param) { - autofs4_catatonic_mode(sbi); + autofs_catatonic_mode(sbi); return 0; } @@ -459,10 +459,10 @@ static int autofs_dev_ioctl_requester(struct file *fp, if (err) goto out; - ino = autofs4_dentry_ino(path.dentry); + ino = autofs_dentry_ino(path.dentry); if (ino) { err = 0; - autofs4_expire_wait(&path, 0); + autofs_expire_wait(&path, 0); spin_lock(&sbi->fs_lock); param->requester.uid = from_kuid_munged(current_user_ns(), ino->uid); @@ -489,7 +489,7 @@ static int autofs_dev_ioctl_expire(struct file *fp, how = param->expire.how; mnt = fp->f_path.mnt; - return autofs4_do_expire_multi(sbi->sb, mnt, sbi, how); + return autofs_do_expire_multi(sbi->sb, mnt, sbi, how); } /* Check if autofs mount point is in use */ @@ -686,7 +686,7 @@ static int _autofs_dev_ioctl(unsigned int command, * Admin needs to be able to set the mount catatonic in * order to be able to perform the re-open. */ - if (!autofs4_oz_mode(sbi) && + if (!autofs_oz_mode(sbi) && cmd != AUTOFS_DEV_IOCTL_CATATONIC_CMD) { err = -EACCES; fput(fp); diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c index 57725d4a8c59..36f16b67a3bf 100644 --- a/fs/autofs4/expire.c +++ b/fs/autofs4/expire.c @@ -13,10 +13,10 @@ static unsigned long now; /* Check if a dentry can be expired */ -static inline int autofs4_can_expire(struct dentry *dentry, - unsigned long timeout, int do_now) +static inline int autofs_can_expire(struct dentry *dentry, + unsigned long timeout, int do_now) { - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_info *ino = autofs_dentry_ino(dentry); /* dentry in the process of being deleted */ if (ino == NULL) @@ -31,7 +31,7 @@ static inline int autofs4_can_expire(struct dentry *dentry, } /* Check a mount point for busyness */ -static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) +static int autofs_mount_busy(struct vfsmount *mnt, struct dentry *dentry) { struct dentry *top = dentry; struct path path = {.mnt = mnt, .dentry = dentry}; @@ -44,8 +44,8 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) if (!follow_down_one(&path)) goto done; - if (is_autofs4_dentry(path.dentry)) { - struct autofs_sb_info *sbi = autofs4_sbi(path.dentry->d_sb); + if (is_autofs_dentry(path.dentry)) { + struct autofs_sb_info *sbi = autofs_sbi(path.dentry->d_sb); /* This is an autofs submount, we can't expire it */ if (autofs_type_indirect(sbi->type)) @@ -56,7 +56,7 @@ static int autofs4_mount_busy(struct vfsmount *mnt, struct dentry *dentry) if (!may_umount_tree(path.mnt)) { struct autofs_info *ino; - ino = autofs4_dentry_ino(top); + ino = autofs_dentry_ino(top); ino->last_used = jiffies; goto done; } @@ -74,7 +74,7 @@ done: static struct dentry *get_next_positive_subdir(struct dentry *prev, struct dentry *root) { - struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(root->d_sb); struct list_head *next; struct dentry *q; @@ -121,7 +121,7 @@ cont: static struct dentry *get_next_positive_dentry(struct dentry *prev, struct dentry *root) { - struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(root->d_sb); struct list_head *next; struct dentry *p, *ret; @@ -184,10 +184,10 @@ again: * The tree is not busy iff no mountpoints are busy and there 
are no * autofs submounts. */ -static int autofs4_direct_busy(struct vfsmount *mnt, - struct dentry *top, - unsigned long timeout, - int do_now) +static int autofs_direct_busy(struct vfsmount *mnt, + struct dentry *top, + unsigned long timeout, + int do_now) { pr_debug("top %p %pd\n", top, top); @@ -195,14 +195,14 @@ static int autofs4_direct_busy(struct vfsmount *mnt, if (!may_umount_tree(mnt)) { struct autofs_info *ino; - ino = autofs4_dentry_ino(top); + ino = autofs_dentry_ino(top); if (ino) ino->last_used = jiffies; return 1; } /* Timeout of a direct mount is determined by its top dentry */ - if (!autofs4_can_expire(top, timeout, do_now)) + if (!autofs_can_expire(top, timeout, do_now)) return 1; return 0; @@ -212,12 +212,12 @@ static int autofs4_direct_busy(struct vfsmount *mnt, * Check a directory tree of mount points for busyness * The tree is not busy iff no mountpoints are busy */ -static int autofs4_tree_busy(struct vfsmount *mnt, - struct dentry *top, - unsigned long timeout, - int do_now) +static int autofs_tree_busy(struct vfsmount *mnt, + struct dentry *top, + unsigned long timeout, + int do_now) { - struct autofs_info *top_ino = autofs4_dentry_ino(top); + struct autofs_info *top_ino = autofs_dentry_ino(top); struct dentry *p; pr_debug("top %p %pd\n", top, top); @@ -237,13 +237,13 @@ static int autofs4_tree_busy(struct vfsmount *mnt, * If the fs is busy update the expiry counter. */ if (d_mountpoint(p)) { - if (autofs4_mount_busy(mnt, p)) { + if (autofs_mount_busy(mnt, p)) { top_ino->last_used = jiffies; dput(p); return 1; } } else { - struct autofs_info *ino = autofs4_dentry_ino(p); + struct autofs_info *ino = autofs_dentry_ino(p); unsigned int ino_count = atomic_read(&ino->count); /* allow for dget above and top is already dgot */ @@ -261,16 +261,16 @@ static int autofs4_tree_busy(struct vfsmount *mnt, } /* Timeout of a tree mount is ultimately determined by its top dentry */ - if (!autofs4_can_expire(top, timeout, do_now)) + if (!autofs_can_expire(top, timeout, do_now)) return 1; return 0; } -static struct dentry *autofs4_check_leaves(struct vfsmount *mnt, - struct dentry *parent, - unsigned long timeout, - int do_now) +static struct dentry *autofs_check_leaves(struct vfsmount *mnt, + struct dentry *parent, + unsigned long timeout, + int do_now) { struct dentry *p; @@ -282,11 +282,11 @@ static struct dentry *autofs4_check_leaves(struct vfsmount *mnt, if (d_mountpoint(p)) { /* Can we umount this guy */ - if (autofs4_mount_busy(mnt, p)) + if (autofs_mount_busy(mnt, p)) continue; /* Can we expire this guy */ - if (autofs4_can_expire(p, timeout, do_now)) + if (autofs_can_expire(p, timeout, do_now)) return p; } } @@ -294,10 +294,10 @@ static struct dentry *autofs4_check_leaves(struct vfsmount *mnt, } /* Check if we can expire a direct mount (possibly a tree) */ -struct dentry *autofs4_expire_direct(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, - int how) +struct dentry *autofs_expire_direct(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + int how) { unsigned long timeout; struct dentry *root = dget(sb->s_root); @@ -310,9 +310,9 @@ struct dentry *autofs4_expire_direct(struct super_block *sb, now = jiffies; timeout = sbi->exp_timeout; - if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { + if (!autofs_direct_busy(mnt, root, timeout, do_now)) { spin_lock(&sbi->fs_lock); - ino = autofs4_dentry_ino(root); + ino = autofs_dentry_ino(root); /* No point expiring a pending mount */ if (ino->flags & AUTOFS_INF_PENDING) { 
spin_unlock(&sbi->fs_lock); @@ -321,7 +321,7 @@ struct dentry *autofs4_expire_direct(struct super_block *sb, ino->flags |= AUTOFS_INF_WANT_EXPIRE; spin_unlock(&sbi->fs_lock); synchronize_rcu(); - if (!autofs4_direct_busy(mnt, root, timeout, do_now)) { + if (!autofs_direct_busy(mnt, root, timeout, do_now)) { spin_lock(&sbi->fs_lock); ino->flags |= AUTOFS_INF_EXPIRING; init_completion(&ino->expire_complete); @@ -350,7 +350,7 @@ static struct dentry *should_expire(struct dentry *dentry, { int do_now = how & AUTOFS_EXP_IMMEDIATE; int exp_leaves = how & AUTOFS_EXP_LEAVES; - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_info *ino = autofs_dentry_ino(dentry); unsigned int ino_count; /* No point expiring a pending mount */ @@ -367,11 +367,11 @@ static struct dentry *should_expire(struct dentry *dentry, pr_debug("checking mountpoint %p %pd\n", dentry, dentry); /* Can we umount this guy */ - if (autofs4_mount_busy(mnt, dentry)) + if (autofs_mount_busy(mnt, dentry)) return NULL; /* Can we expire this guy */ - if (autofs4_can_expire(dentry, timeout, do_now)) + if (autofs_can_expire(dentry, timeout, do_now)) return dentry; return NULL; } @@ -382,7 +382,7 @@ static struct dentry *should_expire(struct dentry *dentry, * A symlink can't be "busy" in the usual sense so * just check last used for expire timeout. */ - if (autofs4_can_expire(dentry, timeout, do_now)) + if (autofs_can_expire(dentry, timeout, do_now)) return dentry; return NULL; } @@ -397,7 +397,7 @@ static struct dentry *should_expire(struct dentry *dentry, if (d_count(dentry) > ino_count) return NULL; - if (!autofs4_tree_busy(mnt, dentry, timeout, do_now)) + if (!autofs_tree_busy(mnt, dentry, timeout, do_now)) return dentry; /* * Case 3: pseudo direct mount, expire individual leaves @@ -411,7 +411,7 @@ static struct dentry *should_expire(struct dentry *dentry, if (d_count(dentry) > ino_count) return NULL; - expired = autofs4_check_leaves(mnt, dentry, timeout, do_now); + expired = autofs_check_leaves(mnt, dentry, timeout, do_now); if (expired) { if (expired == dentry) dput(dentry); @@ -427,10 +427,10 @@ static struct dentry *should_expire(struct dentry *dentry, * - it is unused by any user process * - it has been unused for exp_timeout time */ -struct dentry *autofs4_expire_indirect(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, - int how) +struct dentry *autofs_expire_indirect(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + int how) { unsigned long timeout; struct dentry *root = sb->s_root; @@ -450,7 +450,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, int flags = how; spin_lock(&sbi->fs_lock); - ino = autofs4_dentry_ino(dentry); + ino = autofs_dentry_ino(dentry); if (ino->flags & AUTOFS_INF_WANT_EXPIRE) { spin_unlock(&sbi->fs_lock); continue; @@ -462,7 +462,7 @@ struct dentry *autofs4_expire_indirect(struct super_block *sb, continue; spin_lock(&sbi->fs_lock); - ino = autofs4_dentry_ino(expired); + ino = autofs_dentry_ino(expired); ino->flags |= AUTOFS_INF_WANT_EXPIRE; spin_unlock(&sbi->fs_lock); synchronize_rcu(); @@ -498,11 +498,11 @@ found: return expired; } -int autofs4_expire_wait(const struct path *path, int rcu_walk) +int autofs_expire_wait(const struct path *path, int rcu_walk) { struct dentry *dentry = path->dentry; - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino = 
autofs_dentry_ino(dentry); int status; int state; @@ -529,7 +529,7 @@ retry: pr_debug("waiting for expire %p name=%pd\n", dentry, dentry); - status = autofs4_wait(sbi, path, NFY_NONE); + status = autofs_wait(sbi, path, NFY_NONE); wait_for_completion(&ino->expire_complete); pr_debug("expire done status=%d\n", status); @@ -545,10 +545,10 @@ retry: } /* Perform an expiry operation */ -int autofs4_expire_run(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, - struct autofs_packet_expire __user *pkt_p) +int autofs_expire_run(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + struct autofs_packet_expire __user *pkt_p) { struct autofs_packet_expire pkt; struct autofs_info *ino; @@ -560,7 +560,7 @@ int autofs4_expire_run(struct super_block *sb, pkt.hdr.proto_version = sbi->version; pkt.hdr.type = autofs_ptype_expire; - dentry = autofs4_expire_indirect(sb, mnt, sbi, 0); + dentry = autofs_expire_indirect(sb, mnt, sbi, 0); if (!dentry) return -EAGAIN; @@ -573,7 +573,7 @@ int autofs4_expire_run(struct super_block *sb, ret = -EFAULT; spin_lock(&sbi->fs_lock); - ino = autofs4_dentry_ino(dentry); + ino = autofs_dentry_ino(dentry); /* avoid rapid-fire expire attempts if expiry fails */ ino->last_used = now; ino->flags &= ~(AUTOFS_INF_EXPIRING|AUTOFS_INF_WANT_EXPIRE); @@ -583,25 +583,25 @@ int autofs4_expire_run(struct super_block *sb, return ret; } -int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, - struct autofs_sb_info *sbi, int when) +int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, + struct autofs_sb_info *sbi, int when) { struct dentry *dentry; int ret = -EAGAIN; if (autofs_type_trigger(sbi->type)) - dentry = autofs4_expire_direct(sb, mnt, sbi, when); + dentry = autofs_expire_direct(sb, mnt, sbi, when); else - dentry = autofs4_expire_indirect(sb, mnt, sbi, when); + dentry = autofs_expire_indirect(sb, mnt, sbi, when); if (dentry) { - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_info *ino = autofs_dentry_ino(dentry); const struct path path = { .mnt = mnt, .dentry = dentry }; /* This is synchronous because it makes the daemon a * little easier */ - ret = autofs4_wait(sbi, &path, NFY_EXPIRE); + ret = autofs_wait(sbi, &path, NFY_EXPIRE); spin_lock(&sbi->fs_lock); /* avoid rapid-fire expire attempts if expiry fails */ @@ -619,7 +619,7 @@ int autofs4_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, * Call repeatedly until it returns -EAGAIN, meaning there's nothing * more to be done. 
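As the comment above says, the daemon drives a full expire pass by looping on this ioctl. A hypothetical user-space sketch (note that the expire packets the kernel emits during each call must be serviced from the pipe, typically by another thread, since the ioctl does not return until the expire completes):

	#include <sys/ioctl.h>
	#include <linux/auto_fs.h>

	static void expire_all(int rootfd, int now)
	{
		/* Each successful call expires at most one mount or
		 * tree; the loop ends when the kernel has nothing
		 * left and returns -EAGAIN (errno == EAGAIN here).
		 */
		while (ioctl(rootfd, AUTOFS_IOC_EXPIRE_MULTI, &now) == 0)
			;
	}
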
*/ -int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt, +int autofs_expire_multi(struct super_block *sb, struct vfsmount *mnt, struct autofs_sb_info *sbi, int __user *arg) { int do_now = 0; @@ -627,6 +627,6 @@ int autofs4_expire_multi(struct super_block *sb, struct vfsmount *mnt, if (arg && get_user(do_now, arg)) return -EFAULT; - return autofs4_do_expire_multi(sb, mnt, sbi, do_now); + return autofs_do_expire_multi(sb, mnt, sbi, do_now); } diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c index 8cf0e63389ae..16fb61315843 100644 --- a/fs/autofs4/init.c +++ b/fs/autofs4/init.c @@ -13,18 +13,18 @@ static struct dentry *autofs_mount(struct file_system_type *fs_type, int flags, const char *dev_name, void *data) { - return mount_nodev(fs_type, flags, data, autofs4_fill_super); + return mount_nodev(fs_type, flags, data, autofs_fill_super); } static struct file_system_type autofs_fs_type = { .owner = THIS_MODULE, .name = "autofs", .mount = autofs_mount, - .kill_sb = autofs4_kill_sb, + .kill_sb = autofs_kill_sb, }; MODULE_ALIAS_FS("autofs"); -static int __init init_autofs4_fs(void) +static int __init init_autofs_fs(void) { int err; @@ -37,12 +37,12 @@ static int __init init_autofs4_fs(void) return err; } -static void __exit exit_autofs4_fs(void) +static void __exit exit_autofs_fs(void) { autofs_dev_ioctl_exit(); unregister_filesystem(&autofs_fs_type); } -module_init(init_autofs4_fs) -module_exit(exit_autofs4_fs) +module_init(init_autofs_fs) +module_exit(exit_autofs_fs) MODULE_LICENSE("GPL"); diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c index 09e7d68dff02..6262819ede45 100644 --- a/fs/autofs4/inode.c +++ b/fs/autofs4/inode.c @@ -18,7 +18,7 @@ #include "autofs_i.h" #include -struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi) +struct autofs_info *autofs_new_ino(struct autofs_sb_info *sbi) { struct autofs_info *ino; @@ -32,21 +32,21 @@ struct autofs_info *autofs4_new_ino(struct autofs_sb_info *sbi) return ino; } -void autofs4_clean_ino(struct autofs_info *ino) +void autofs_clean_ino(struct autofs_info *ino) { ino->uid = GLOBAL_ROOT_UID; ino->gid = GLOBAL_ROOT_GID; ino->last_used = jiffies; } -void autofs4_free_ino(struct autofs_info *ino) +void autofs_free_ino(struct autofs_info *ino) { kfree(ino); } -void autofs4_kill_sb(struct super_block *sb) +void autofs_kill_sb(struct super_block *sb) { - struct autofs_sb_info *sbi = autofs4_sbi(sb); + struct autofs_sb_info *sbi = autofs_sbi(sb); /* * In the event of a failure in get_sb_nodev the superblock @@ -56,7 +56,7 @@ void autofs4_kill_sb(struct super_block *sb) */ if (sbi) { /* Free wait queues, close pipe */ - autofs4_catatonic_mode(sbi); + autofs_catatonic_mode(sbi); put_pid(sbi->oz_pgrp); } @@ -66,9 +66,9 @@ void autofs4_kill_sb(struct super_block *sb) kfree_rcu(sbi, rcu); } -static int autofs4_show_options(struct seq_file *m, struct dentry *root) +static int autofs_show_options(struct seq_file *m, struct dentry *root) { - struct autofs_sb_info *sbi = autofs4_sbi(root->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(root->d_sb); struct inode *root_inode = d_inode(root->d_sb->s_root); if (!sbi) @@ -101,16 +101,16 @@ static int autofs4_show_options(struct seq_file *m, struct dentry *root) return 0; } -static void autofs4_evict_inode(struct inode *inode) +static void autofs_evict_inode(struct inode *inode) { clear_inode(inode); kfree(inode->i_private); } -static const struct super_operations autofs4_sops = { +static const struct super_operations autofs_sops = { .statfs = simple_statfs, - .show_options = 
autofs4_show_options, - .evict_inode = autofs4_evict_inode, + .show_options = autofs_show_options, + .evict_inode = autofs_evict_inode, }; enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto, @@ -206,7 +206,7 @@ static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, return (*pipefd < 0); } -int autofs4_fill_super(struct super_block *s, void *data, int silent) +int autofs_fill_super(struct super_block *s, void *data, int silent) { struct inode *root_inode; struct dentry *root; @@ -246,19 +246,19 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) s->s_blocksize = 1024; s->s_blocksize_bits = 10; s->s_magic = AUTOFS_SUPER_MAGIC; - s->s_op = &autofs4_sops; - s->s_d_op = &autofs4_dentry_operations; + s->s_op = &autofs_sops; + s->s_d_op = &autofs_dentry_operations; s->s_time_gran = 1; /* * Get the root inode and dentry, but defer checking for errors. */ - ino = autofs4_new_ino(sbi); + ino = autofs_new_ino(sbi); if (!ino) { ret = -ENOMEM; goto fail_free; } - root_inode = autofs4_get_inode(s, S_IFDIR | 0755); + root_inode = autofs_get_inode(s, S_IFDIR | 0755); root = d_make_root(root_inode); if (!root) goto fail_ino; @@ -305,8 +305,8 @@ int autofs4_fill_super(struct super_block *s, void *data, int silent) if (autofs_type_trigger(sbi->type)) __managed_dentry_set_managed(root); - root_inode->i_fop = &autofs4_root_operations; - root_inode->i_op = &autofs4_dir_inode_operations; + root_inode->i_fop = &autofs_root_operations; + root_inode->i_op = &autofs_dir_inode_operations; pr_debug("pipe fd = %d, pgrp = %u\n", pipefd, pid_nr(sbi->oz_pgrp)); pipe = fget(pipefd); @@ -340,14 +340,14 @@ fail_dput: dput(root); goto fail_free; fail_ino: - autofs4_free_ino(ino); + autofs_free_ino(ino); fail_free: kfree(sbi); s->s_fs_info = NULL; return ret; } -struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode) +struct inode *autofs_get_inode(struct super_block *sb, umode_t mode) { struct inode *inode = new_inode(sb); @@ -364,10 +364,10 @@ struct inode *autofs4_get_inode(struct super_block *sb, umode_t mode) if (S_ISDIR(mode)) { set_nlink(inode, 2); - inode->i_op = &autofs4_dir_inode_operations; - inode->i_fop = &autofs4_dir_operations; + inode->i_op = &autofs_dir_inode_operations; + inode->i_fop = &autofs_dir_operations; } else if (S_ISLNK(mode)) { - inode->i_op = &autofs4_symlink_inode_operations; + inode->i_op = &autofs_symlink_inode_operations; } else WARN_ON(1); diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c index b12e37f27530..a4b36e44f73c 100644 --- a/fs/autofs4/root.c +++ b/fs/autofs4/root.c @@ -19,62 +19,62 @@ #include "autofs_i.h" -static int autofs4_dir_symlink(struct inode *, struct dentry *, const char *); -static int autofs4_dir_unlink(struct inode *, struct dentry *); -static int autofs4_dir_rmdir(struct inode *, struct dentry *); -static int autofs4_dir_mkdir(struct inode *, struct dentry *, umode_t); -static long autofs4_root_ioctl(struct file *, unsigned int, unsigned long); +static int autofs_dir_symlink(struct inode *, struct dentry *, const char *); +static int autofs_dir_unlink(struct inode *, struct dentry *); +static int autofs_dir_rmdir(struct inode *, struct dentry *); +static int autofs_dir_mkdir(struct inode *, struct dentry *, umode_t); +static long autofs_root_ioctl(struct file *, unsigned int, unsigned long); #ifdef CONFIG_COMPAT -static long autofs4_root_compat_ioctl(struct file *, - unsigned int, unsigned long); +static long autofs_root_compat_ioctl(struct file *, + unsigned int, unsigned long); 
#endif -static int autofs4_dir_open(struct inode *inode, struct file *file); -static struct dentry *autofs4_lookup(struct inode *, - struct dentry *, unsigned int); -static struct vfsmount *autofs4_d_automount(struct path *); -static int autofs4_d_manage(const struct path *, bool); -static void autofs4_dentry_release(struct dentry *); +static int autofs_dir_open(struct inode *inode, struct file *file); +static struct dentry *autofs_lookup(struct inode *, + struct dentry *, unsigned int); +static struct vfsmount *autofs_d_automount(struct path *); +static int autofs_d_manage(const struct path *, bool); +static void autofs_dentry_release(struct dentry *); -const struct file_operations autofs4_root_operations = { +const struct file_operations autofs_root_operations = { .open = dcache_dir_open, .release = dcache_dir_close, .read = generic_read_dir, .iterate_shared = dcache_readdir, .llseek = dcache_dir_lseek, - .unlocked_ioctl = autofs4_root_ioctl, + .unlocked_ioctl = autofs_root_ioctl, #ifdef CONFIG_COMPAT - .compat_ioctl = autofs4_root_compat_ioctl, + .compat_ioctl = autofs_root_compat_ioctl, #endif }; -const struct file_operations autofs4_dir_operations = { - .open = autofs4_dir_open, +const struct file_operations autofs_dir_operations = { + .open = autofs_dir_open, .release = dcache_dir_close, .read = generic_read_dir, .iterate_shared = dcache_readdir, .llseek = dcache_dir_lseek, }; -const struct inode_operations autofs4_dir_inode_operations = { - .lookup = autofs4_lookup, - .unlink = autofs4_dir_unlink, - .symlink = autofs4_dir_symlink, - .mkdir = autofs4_dir_mkdir, - .rmdir = autofs4_dir_rmdir, +const struct inode_operations autofs_dir_inode_operations = { + .lookup = autofs_lookup, + .unlink = autofs_dir_unlink, + .symlink = autofs_dir_symlink, + .mkdir = autofs_dir_mkdir, + .rmdir = autofs_dir_rmdir, }; -const struct dentry_operations autofs4_dentry_operations = { - .d_automount = autofs4_d_automount, - .d_manage = autofs4_d_manage, - .d_release = autofs4_dentry_release, +const struct dentry_operations autofs_dentry_operations = { + .d_automount = autofs_d_automount, + .d_manage = autofs_d_manage, + .d_release = autofs_dentry_release, }; -static void autofs4_add_active(struct dentry *dentry) +static void autofs_add_active(struct dentry *dentry) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); struct autofs_info *ino; - ino = autofs4_dentry_ino(dentry); + ino = autofs_dentry_ino(dentry); if (ino) { spin_lock(&sbi->lookup_lock); if (!ino->active_count) { @@ -86,12 +86,12 @@ static void autofs4_add_active(struct dentry *dentry) } } -static void autofs4_del_active(struct dentry *dentry) +static void autofs_del_active(struct dentry *dentry) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); struct autofs_info *ino; - ino = autofs4_dentry_ino(dentry); + ino = autofs_dentry_ino(dentry); if (ino) { spin_lock(&sbi->lookup_lock); ino->active_count--; @@ -103,14 +103,14 @@ static void autofs4_del_active(struct dentry *dentry) } } -static int autofs4_dir_open(struct inode *inode, struct file *file) +static int autofs_dir_open(struct inode *inode, struct file *file) { struct dentry *dentry = file->f_path.dentry; - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); pr_debug("file=%p dentry=%p %pd\n", file, dentry, dentry); - if (autofs4_oz_mode(sbi)) + if (autofs_oz_mode(sbi)) goto out; /* @@ -133,10 
+133,10 @@ out: return dcache_dir_open(inode, file); } -static void autofs4_dentry_release(struct dentry *de) +static void autofs_dentry_release(struct dentry *de) { - struct autofs_info *ino = autofs4_dentry_ino(de); - struct autofs_sb_info *sbi = autofs4_sbi(de->d_sb); + struct autofs_info *ino = autofs_dentry_ino(de); + struct autofs_sb_info *sbi = autofs_sbi(de->d_sb); pr_debug("releasing %p\n", de); @@ -152,12 +152,12 @@ static void autofs4_dentry_release(struct dentry *de) spin_unlock(&sbi->lookup_lock); } - autofs4_free_ino(ino); + autofs_free_ino(ino); } -static struct dentry *autofs4_lookup_active(struct dentry *dentry) +static struct dentry *autofs_lookup_active(struct dentry *dentry) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); struct dentry *parent = dentry->d_parent; const struct qstr *name = &dentry->d_name; unsigned int len = name->len; @@ -209,10 +209,10 @@ next: return NULL; } -static struct dentry *autofs4_lookup_expiring(struct dentry *dentry, - bool rcu_walk) +static struct dentry *autofs_lookup_expiring(struct dentry *dentry, + bool rcu_walk) { - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); struct dentry *parent = dentry->d_parent; const struct qstr *name = &dentry->d_name; unsigned int len = name->len; @@ -269,17 +269,17 @@ next: return NULL; } -static int autofs4_mount_wait(const struct path *path, bool rcu_walk) +static int autofs_mount_wait(const struct path *path, bool rcu_walk) { - struct autofs_sb_info *sbi = autofs4_sbi(path->dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(path->dentry); + struct autofs_sb_info *sbi = autofs_sbi(path->dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(path->dentry); int status = 0; if (ino->flags & AUTOFS_INF_PENDING) { if (rcu_walk) return -ECHILD; pr_debug("waiting for mount name=%pd\n", path->dentry); - status = autofs4_wait(sbi, path, NFY_MOUNT); + status = autofs_wait(sbi, path, NFY_MOUNT); pr_debug("mount wait done status=%d\n", status); } ino->last_used = jiffies; @@ -291,11 +291,11 @@ static int do_expire_wait(const struct path *path, bool rcu_walk) struct dentry *dentry = path->dentry; struct dentry *expiring; - expiring = autofs4_lookup_expiring(dentry, rcu_walk); + expiring = autofs_lookup_expiring(dentry, rcu_walk); if (IS_ERR(expiring)) return PTR_ERR(expiring); if (!expiring) - return autofs4_expire_wait(path, rcu_walk); + return autofs_expire_wait(path, rcu_walk); else { const struct path this = { .mnt = path->mnt, .dentry = expiring }; /* @@ -303,17 +303,17 @@ static int do_expire_wait(const struct path *path, bool rcu_walk) * be quite complete, but the directory has been removed * so it must have been successful, just wait for it. 
*/ - autofs4_expire_wait(&this, 0); - autofs4_del_expiring(expiring); + autofs_expire_wait(&this, 0); + autofs_del_expiring(expiring); dput(expiring); } return 0; } -static struct dentry *autofs4_mountpoint_changed(struct path *path) +static struct dentry *autofs_mountpoint_changed(struct path *path) { struct dentry *dentry = path->dentry; - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); /* * If this is an indirect mount the dentry could have gone away @@ -327,7 +327,7 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path) new = d_lookup(parent, &dentry->d_name); if (!new) return NULL; - ino = autofs4_dentry_ino(new); + ino = autofs_dentry_ino(new); ino->last_used = jiffies; dput(path->dentry); path->dentry = new; @@ -335,17 +335,17 @@ static struct dentry *autofs4_mountpoint_changed(struct path *path) return path->dentry; } -static struct vfsmount *autofs4_d_automount(struct path *path) +static struct vfsmount *autofs_d_automount(struct path *path) { struct dentry *dentry = path->dentry; - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); int status; pr_debug("dentry=%p %pd\n", dentry, dentry); /* The daemon never triggers a mount. */ - if (autofs4_oz_mode(sbi)) + if (autofs_oz_mode(sbi)) return NULL; /* @@ -364,7 +364,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) spin_lock(&sbi->fs_lock); if (ino->flags & AUTOFS_INF_PENDING) { spin_unlock(&sbi->fs_lock); - status = autofs4_mount_wait(path, 0); + status = autofs_mount_wait(path, 0); if (status) return ERR_PTR(status); goto done; @@ -405,7 +405,7 @@ static struct vfsmount *autofs4_d_automount(struct path *path) } ino->flags |= AUTOFS_INF_PENDING; spin_unlock(&sbi->fs_lock); - status = autofs4_mount_wait(path, 0); + status = autofs_mount_wait(path, 0); spin_lock(&sbi->fs_lock); ino->flags &= ~AUTOFS_INF_PENDING; if (status) { @@ -416,24 +416,24 @@ static struct vfsmount *autofs4_d_automount(struct path *path) spin_unlock(&sbi->fs_lock); done: /* Mount succeeded, check if we ended up with a new dentry */ - dentry = autofs4_mountpoint_changed(path); + dentry = autofs_mountpoint_changed(path); if (!dentry) return ERR_PTR(-ENOENT); return NULL; } -static int autofs4_d_manage(const struct path *path, bool rcu_walk) +static int autofs_d_manage(const struct path *path, bool rcu_walk) { struct dentry *dentry = path->dentry; - struct autofs_sb_info *sbi = autofs4_sbi(dentry->d_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); int status; pr_debug("dentry=%p %pd\n", dentry, dentry); /* The daemon never waits. */ - if (autofs4_oz_mode(sbi)) { + if (autofs_oz_mode(sbi)) { if (!path_is_mountpoint(path)) return -EISDIR; return 0; @@ -447,7 +447,7 @@ static int autofs4_d_manage(const struct path *path, bool rcu_walk) * This dentry may be under construction so wait on mount * completion. 
*/ - status = autofs4_mount_wait(path, rcu_walk); + status = autofs_mount_wait(path, rcu_walk); if (status) return status; @@ -500,8 +500,8 @@ static int autofs4_d_manage(const struct path *path, bool rcu_walk) } /* Lookups in the root directory */ -static struct dentry *autofs4_lookup(struct inode *dir, - struct dentry *dentry, unsigned int flags) +static struct dentry *autofs_lookup(struct inode *dir, + struct dentry *dentry, unsigned int flags) { struct autofs_sb_info *sbi; struct autofs_info *ino; @@ -513,13 +513,13 @@ static struct dentry *autofs4_lookup(struct inode *dir, if (dentry->d_name.len > NAME_MAX) return ERR_PTR(-ENAMETOOLONG); - sbi = autofs4_sbi(dir->i_sb); + sbi = autofs_sbi(dir->i_sb); pr_debug("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d\n", current->pid, task_pgrp_nr(current), sbi->catatonic, - autofs4_oz_mode(sbi)); + autofs_oz_mode(sbi)); - active = autofs4_lookup_active(dentry); + active = autofs_lookup_active(dentry); if (active) return active; else { @@ -529,7 +529,7 @@ static struct dentry *autofs4_lookup(struct inode *dir, * can return fail immediately. The daemon however does need * to create directories within the file system. */ - if (!autofs4_oz_mode(sbi) && !IS_ROOT(dentry->d_parent)) + if (!autofs_oz_mode(sbi) && !IS_ROOT(dentry->d_parent)) return ERR_PTR(-ENOENT); /* Mark entries in the root as mount triggers */ @@ -537,24 +537,24 @@ static struct dentry *autofs4_lookup(struct inode *dir, autofs_type_indirect(sbi->type)) __managed_dentry_set_managed(dentry); - ino = autofs4_new_ino(sbi); + ino = autofs_new_ino(sbi); if (!ino) return ERR_PTR(-ENOMEM); dentry->d_fsdata = ino; ino->dentry = dentry; - autofs4_add_active(dentry); + autofs_add_active(dentry); } return NULL; } -static int autofs4_dir_symlink(struct inode *dir, +static int autofs_dir_symlink(struct inode *dir, struct dentry *dentry, const char *symname) { - struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); struct autofs_info *p_ino; struct inode *inode; size_t size = strlen(symname); @@ -562,14 +562,14 @@ static int autofs4_dir_symlink(struct inode *dir, pr_debug("%s <- %pd\n", symname, dentry); - if (!autofs4_oz_mode(sbi)) + if (!autofs_oz_mode(sbi)) return -EACCES; BUG_ON(!ino); - autofs4_clean_ino(ino); + autofs_clean_ino(ino); - autofs4_del_active(dentry); + autofs_del_active(dentry); cp = kmalloc(size + 1, GFP_KERNEL); if (!cp) @@ -577,7 +577,7 @@ static int autofs4_dir_symlink(struct inode *dir, strcpy(cp, symname); - inode = autofs4_get_inode(dir->i_sb, S_IFLNK | 0555); + inode = autofs_get_inode(dir->i_sb, S_IFLNK | 0555); if (!inode) { kfree(cp); return -ENOMEM; @@ -588,7 +588,7 @@ static int autofs4_dir_symlink(struct inode *dir, dget(dentry); atomic_inc(&ino->count); - p_ino = autofs4_dentry_ino(dentry->d_parent); + p_ino = autofs_dentry_ino(dentry->d_parent); if (p_ino && !IS_ROOT(dentry)) atomic_inc(&p_ino->count); @@ -610,20 +610,20 @@ static int autofs4_dir_symlink(struct inode *dir, * If a process is blocked on the dentry waiting for the expire to finish, * it will invalidate the dentry and try to mount with a new one. * - * Also see autofs4_dir_rmdir().. + * Also see autofs_dir_rmdir().. 
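For context, not part of this patch: these directory inode operations exist to service the daemon, which, running in the oz process group, builds and tears down map entries with ordinary syscalls. A hypothetical daemon-side sequence, with made-up paths:

	#include <sys/stat.h>
	#include <unistd.h>

	static void build_and_drop_entry(void)
	{
		/* Served by autofs_dir_mkdir(), autofs_dir_symlink()
		 * and autofs_dir_unlink(); callers outside oz mode
		 * are refused, except that unlink also lets
		 * CAP_SYS_ADMIN remove symlinks.
		 */
		mkdir("/mnt/auto/server", 0555);
		symlink("/export/share", "/mnt/auto/server/share");
		unlink("/mnt/auto/server/share");
	}
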
*/ -static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) +static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry) { - struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); struct autofs_info *p_ino; /* This allows root to remove symlinks */ - if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) + if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) return -EPERM; if (atomic_dec_and_test(&ino->count)) { - p_ino = autofs4_dentry_ino(dentry->d_parent); + p_ino = autofs_dentry_ino(dentry->d_parent); if (p_ino && !IS_ROOT(dentry)) atomic_dec(&p_ino->count); } @@ -635,7 +635,7 @@ static int autofs4_dir_unlink(struct inode *dir, struct dentry *dentry) dir->i_mtime = current_time(dir); spin_lock(&sbi->lookup_lock); - __autofs4_add_expiring(dentry); + __autofs_add_expiring(dentry); d_drop(dentry); spin_unlock(&sbi->lookup_lock); @@ -692,15 +692,15 @@ static void autofs_clear_leaf_automount_flags(struct dentry *dentry) managed_dentry_set_managed(parent); } -static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) +static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry) { - struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); struct autofs_info *p_ino; pr_debug("dentry %p, removing %pd\n", dentry, dentry); - if (!autofs4_oz_mode(sbi)) + if (!autofs_oz_mode(sbi)) return -EACCES; spin_lock(&sbi->lookup_lock); @@ -708,7 +708,7 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) spin_unlock(&sbi->lookup_lock); return -ENOTEMPTY; } - __autofs4_add_expiring(dentry); + __autofs_add_expiring(dentry); d_drop(dentry); spin_unlock(&sbi->lookup_lock); @@ -716,7 +716,7 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) autofs_clear_leaf_automount_flags(dentry); if (atomic_dec_and_test(&ino->count)) { - p_ino = autofs4_dentry_ino(dentry->d_parent); + p_ino = autofs_dentry_ino(dentry->d_parent); if (p_ino && dentry->d_parent != dentry) atomic_dec(&p_ino->count); } @@ -730,26 +730,26 @@ static int autofs4_dir_rmdir(struct inode *dir, struct dentry *dentry) return 0; } -static int autofs4_dir_mkdir(struct inode *dir, - struct dentry *dentry, umode_t mode) +static int autofs_dir_mkdir(struct inode *dir, + struct dentry *dentry, umode_t mode) { - struct autofs_sb_info *sbi = autofs4_sbi(dir->i_sb); - struct autofs_info *ino = autofs4_dentry_ino(dentry); + struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); struct autofs_info *p_ino; struct inode *inode; - if (!autofs4_oz_mode(sbi)) + if (!autofs_oz_mode(sbi)) return -EACCES; pr_debug("dentry %p, creating %pd\n", dentry, dentry); BUG_ON(!ino); - autofs4_clean_ino(ino); + autofs_clean_ino(ino); - autofs4_del_active(dentry); + autofs_del_active(dentry); - inode = autofs4_get_inode(dir->i_sb, S_IFDIR | mode); + inode = autofs_get_inode(dir->i_sb, S_IFDIR | mode); if (!inode) return -ENOMEM; d_add(dentry, inode); @@ -759,7 +759,7 @@ static int autofs4_dir_mkdir(struct inode *dir, dget(dentry); atomic_inc(&ino->count); - p_ino = autofs4_dentry_ino(dentry->d_parent); + p_ino = autofs_dentry_ino(dentry->d_parent); if (p_ino && !IS_ROOT(dentry)) atomic_inc(&p_ino->count); inc_nlink(dir); @@ 
-770,7 +770,7 @@ static int autofs4_dir_mkdir(struct inode *dir, /* Get/set timeout ioctl() operation */ #ifdef CONFIG_COMPAT -static inline int autofs4_compat_get_set_timeout(struct autofs_sb_info *sbi, +static inline int autofs_compat_get_set_timeout(struct autofs_sb_info *sbi, compat_ulong_t __user *p) { unsigned long ntimeout; @@ -795,7 +795,7 @@ error: } #endif -static inline int autofs4_get_set_timeout(struct autofs_sb_info *sbi, +static inline int autofs_get_set_timeout(struct autofs_sb_info *sbi, unsigned long __user *p) { unsigned long ntimeout; @@ -820,14 +820,14 @@ error: } /* Return protocol version */ -static inline int autofs4_get_protover(struct autofs_sb_info *sbi, +static inline int autofs_get_protover(struct autofs_sb_info *sbi, int __user *p) { return put_user(sbi->version, p); } /* Return protocol sub version */ -static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi, +static inline int autofs_get_protosubver(struct autofs_sb_info *sbi, int __user *p) { return put_user(sbi->sub_version, p); @@ -836,7 +836,7 @@ static inline int autofs4_get_protosubver(struct autofs_sb_info *sbi, /* * Tells the daemon whether it can umount the autofs mount. */ -static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p) +static inline int autofs_ask_umount(struct vfsmount *mnt, int __user *p) { int status = 0; @@ -850,14 +850,14 @@ static inline int autofs4_ask_umount(struct vfsmount *mnt, int __user *p) return status; } -/* Identify autofs4_dentries - this is so we can tell if there's +/* Identify autofs_dentries - this is so we can tell if there's * an extra dentry refcount or not. We only hold a refcount on the * dentry if its non-negative (ie, d_inode != NULL) */ -int is_autofs4_dentry(struct dentry *dentry) +int is_autofs_dentry(struct dentry *dentry) { return dentry && d_really_is_positive(dentry) && - dentry->d_op == &autofs4_dentry_operations && + dentry->d_op == &autofs_dentry_operations && dentry->d_fsdata != NULL; } @@ -865,10 +865,10 @@ int is_autofs4_dentry(struct dentry *dentry) * ioctl()'s on the root directory is the chief method for the daemon to * generate kernel reactions */ -static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp, +static int autofs_root_ioctl_unlocked(struct inode *inode, struct file *filp, unsigned int cmd, unsigned long arg) { - struct autofs_sb_info *sbi = autofs4_sbi(inode->i_sb); + struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb); void __user *p = (void __user *)arg; pr_debug("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n", @@ -878,64 +878,63 @@ static int autofs4_root_ioctl_unlocked(struct inode *inode, struct file *filp, _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT) return -ENOTTY; - if (!autofs4_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) + if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) return -EPERM; switch (cmd) { case AUTOFS_IOC_READY: /* Wait queue: go ahead and retry */ - return autofs4_wait_release(sbi, (autofs_wqt_t) arg, 0); + return autofs_wait_release(sbi, (autofs_wqt_t) arg, 0); case AUTOFS_IOC_FAIL: /* Wait queue: fail with ENOENT */ - return autofs4_wait_release(sbi, (autofs_wqt_t) arg, -ENOENT); + return autofs_wait_release(sbi, (autofs_wqt_t) arg, -ENOENT); case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */ - autofs4_catatonic_mode(sbi); + autofs_catatonic_mode(sbi); return 0; case AUTOFS_IOC_PROTOVER: /* Get protocol version */ - return autofs4_get_protover(sbi, p); + return autofs_get_protover(sbi, p); case 
AUTOFS_IOC_PROTOSUBVER: /* Get protocol sub version */ - return autofs4_get_protosubver(sbi, p); + return autofs_get_protosubver(sbi, p); case AUTOFS_IOC_SETTIMEOUT: - return autofs4_get_set_timeout(sbi, p); + return autofs_get_set_timeout(sbi, p); #ifdef CONFIG_COMPAT case AUTOFS_IOC_SETTIMEOUT32: - return autofs4_compat_get_set_timeout(sbi, p); + return autofs_compat_get_set_timeout(sbi, p); #endif case AUTOFS_IOC_ASKUMOUNT: - return autofs4_ask_umount(filp->f_path.mnt, p); + return autofs_ask_umount(filp->f_path.mnt, p); /* return a single thing to expire */ case AUTOFS_IOC_EXPIRE: - return autofs4_expire_run(inode->i_sb, - filp->f_path.mnt, sbi, p); + return autofs_expire_run(inode->i_sb, filp->f_path.mnt, sbi, p); /* same as above, but can send multiple expires through pipe */ case AUTOFS_IOC_EXPIRE_MULTI: - return autofs4_expire_multi(inode->i_sb, - filp->f_path.mnt, sbi, p); + return autofs_expire_multi(inode->i_sb, + filp->f_path.mnt, sbi, p); default: return -EINVAL; } } -static long autofs4_root_ioctl(struct file *filp, +static long autofs_root_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); - return autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); + return autofs_root_ioctl_unlocked(inode, filp, cmd, arg); } #ifdef CONFIG_COMPAT -static long autofs4_root_compat_ioctl(struct file *filp, +static long autofs_root_compat_ioctl(struct file *filp, unsigned int cmd, unsigned long arg) { struct inode *inode = file_inode(filp); int ret; if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL) - ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, arg); + ret = autofs_root_ioctl_unlocked(inode, filp, cmd, arg); else - ret = autofs4_root_ioctl_unlocked(inode, filp, cmd, + ret = autofs_root_ioctl_unlocked(inode, filp, cmd, (unsigned long) compat_ptr(arg)); return ret; diff --git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c index ab0b4285a202..aad3902c0cc1 100644 --- a/fs/autofs4/symlink.c +++ b/fs/autofs4/symlink.c @@ -8,22 +8,22 @@ #include "autofs_i.h" -static const char *autofs4_get_link(struct dentry *dentry, - struct inode *inode, - struct delayed_call *done) +static const char *autofs_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) { struct autofs_sb_info *sbi; struct autofs_info *ino; if (!dentry) return ERR_PTR(-ECHILD); - sbi = autofs4_sbi(dentry->d_sb); - ino = autofs4_dentry_ino(dentry); - if (ino && !autofs4_oz_mode(sbi)) + sbi = autofs_sbi(dentry->d_sb); + ino = autofs_dentry_ino(dentry); + if (ino && !autofs_oz_mode(sbi)) ino->last_used = jiffies; return d_inode(dentry)->i_private; } -const struct inode_operations autofs4_symlink_inode_operations = { - .get_link = autofs4_get_link +const struct inode_operations autofs_symlink_inode_operations = { + .get_link = autofs_get_link }; diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c index be9c3dc048ab..8a566fa66afe 100644 --- a/fs/autofs4/waitq.c +++ b/fs/autofs4/waitq.c @@ -17,9 +17,9 @@ /* We make this a static variable rather than a part of the superblock; it * is better if we don't reassign numbers easily even across filesystems */ -static autofs_wqt_t autofs4_next_wait_queue = 1; +static autofs_wqt_t autofs_next_wait_queue = 1; -void autofs4_catatonic_mode(struct autofs_sb_info *sbi) +void autofs_catatonic_mode(struct autofs_sb_info *sbi) { struct autofs_wait_queue *wq, *nwq; @@ -49,8 +49,8 @@ void autofs4_catatonic_mode(struct autofs_sb_info *sbi) mutex_unlock(&sbi->wq_mutex); } -static int autofs4_write(struct autofs_sb_info *sbi, 
- struct file *file, const void *addr, int bytes) +static int autofs_write(struct autofs_sb_info *sbi, + struct file *file, const void *addr, int bytes) { unsigned long sigpipe, flags; const char *data = (const char *)addr; @@ -82,7 +82,7 @@ static int autofs4_write(struct autofs_sb_info *sbi, return bytes == 0 ? 0 : wr < 0 ? wr : -EIO; } -static void autofs4_notify_daemon(struct autofs_sb_info *sbi, +static void autofs_notify_daemon(struct autofs_sb_info *sbi, struct autofs_wait_queue *wq, int type) { @@ -167,23 +167,23 @@ static void autofs4_notify_daemon(struct autofs_sb_info *sbi, mutex_unlock(&sbi->wq_mutex); - switch (ret = autofs4_write(sbi, pipe, &pkt, pktsz)) { + switch (ret = autofs_write(sbi, pipe, &pkt, pktsz)) { case 0: break; case -ENOMEM: case -ERESTARTSYS: /* Just fail this one */ - autofs4_wait_release(sbi, wq->wait_queue_token, ret); + autofs_wait_release(sbi, wq->wait_queue_token, ret); break; default: - autofs4_catatonic_mode(sbi); + autofs_catatonic_mode(sbi); break; } fput(pipe); } -static int autofs4_getpath(struct autofs_sb_info *sbi, - struct dentry *dentry, char **name) +static int autofs_getpath(struct autofs_sb_info *sbi, + struct dentry *dentry, char **name) { struct dentry *root = sbi->sb->s_root; struct dentry *tmp; @@ -228,7 +228,7 @@ rename_retry: } static struct autofs_wait_queue * -autofs4_find_wait(struct autofs_sb_info *sbi, const struct qstr *qstr) +autofs_find_wait(struct autofs_sb_info *sbi, const struct qstr *qstr) { struct autofs_wait_queue *wq; @@ -263,7 +263,7 @@ static int validate_request(struct autofs_wait_queue **wait, return -ENOENT; /* Wait in progress, continue; */ - wq = autofs4_find_wait(sbi, qstr); + wq = autofs_find_wait(sbi, qstr); if (wq) { *wait = wq; return 1; @@ -272,7 +272,7 @@ static int validate_request(struct autofs_wait_queue **wait, *wait = NULL; /* If we don't yet have any info this is a new request */ - ino = autofs4_dentry_ino(dentry); + ino = autofs_dentry_ino(dentry); if (!ino) return 1; @@ -297,7 +297,7 @@ static int validate_request(struct autofs_wait_queue **wait, if (sbi->catatonic) return -ENOENT; - wq = autofs4_find_wait(sbi, qstr); + wq = autofs_find_wait(sbi, qstr); if (wq) { *wait = wq; return 1; @@ -351,7 +351,7 @@ static int validate_request(struct autofs_wait_queue **wait, return 1; } -int autofs4_wait(struct autofs_sb_info *sbi, +int autofs_wait(struct autofs_sb_info *sbi, const struct path *path, enum autofs_notify notify) { struct dentry *dentry = path->dentry; @@ -399,7 +399,7 @@ int autofs4_wait(struct autofs_sb_info *sbi, if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type)) qstr.len = sprintf(name, "%p", dentry); else { - qstr.len = autofs4_getpath(sbi, dentry, &name); + qstr.len = autofs_getpath(sbi, dentry, &name); if (!qstr.len) { kfree(name); return -ENOENT; @@ -430,15 +430,15 @@ int autofs4_wait(struct autofs_sb_info *sbi, return -ENOMEM; } - wq->wait_queue_token = autofs4_next_wait_queue; - if (++autofs4_next_wait_queue == 0) - autofs4_next_wait_queue = 1; + wq->wait_queue_token = autofs_next_wait_queue; + if (++autofs_next_wait_queue == 0) + autofs_next_wait_queue = 1; wq->next = sbi->queues; sbi->queues = wq; init_waitqueue_head(&wq->queue); memcpy(&wq->name, &qstr, sizeof(struct qstr)); - wq->dev = autofs4_get_dev(sbi); - wq->ino = autofs4_get_ino(sbi); + wq->dev = autofs_get_dev(sbi); + wq->ino = autofs_get_ino(sbi); wq->uid = current_uid(); wq->gid = current_gid(); wq->pid = pid; @@ -467,9 +467,9 @@ int autofs4_wait(struct autofs_sb_info *sbi, wq->name.name, notify); /* - * 
autofs4_notify_daemon() may block; it will unlock ->wq_mutex + * autofs_notify_daemon() may block; it will unlock ->wq_mutex */ - autofs4_notify_daemon(sbi, wq, type); + autofs_notify_daemon(sbi, wq, type); } else { wq->wait_ctr++; pr_debug("existing wait id = 0x%08lx, name = %.*s, nfy=%d\n", @@ -500,12 +500,12 @@ int autofs4_wait(struct autofs_sb_info *sbi, struct dentry *de = NULL; /* direct mount or browsable map */ - ino = autofs4_dentry_ino(dentry); + ino = autofs_dentry_ino(dentry); if (!ino) { /* If not lookup actual dentry used */ de = d_lookup(dentry->d_parent, &dentry->d_name); if (de) - ino = autofs4_dentry_ino(de); + ino = autofs_dentry_ino(de); } /* Set mount requester */ @@ -530,7 +530,8 @@ int autofs4_wait(struct autofs_sb_info *sbi, } -int autofs4_wait_release(struct autofs_sb_info *sbi, autofs_wqt_t wait_queue_token, int status) +int autofs_wait_release(struct autofs_sb_info *sbi, + autofs_wqt_t wait_queue_token, int status) { struct autofs_wait_queue *wq, **wql; From ebc921ca9b92a3cf304d99bd7b7f373ec78c7ed7 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Thu, 7 Jun 2018 17:11:13 -0700 Subject: [PATCH 108/118] autofs: copy autofs4 to autofs Copy source files from the autofs4 directory to the autofs directory. Link: http://lkml.kernel.org/r/152626705013.28589.931913083997578251.stgit@pluto.themaw.net Signed-off-by: Ian Kent Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs/autofs_i.h | 273 ++++++++++++ fs/autofs/dev-ioctl.c | 761 ++++++++++++++++++++++++++++++++++ fs/autofs/expire.c | 631 ++++++++++++++++++++++++++++ fs/autofs/init.c | 48 +++ fs/autofs/inode.c | 375 +++++++++++++++++ fs/autofs/root.c | 942 ++++++++++++++++++++++++++++++++++++++++++ fs/autofs/symlink.c | 29 ++ fs/autofs/waitq.c | 559 +++++++++++++++++++++++++ 8 files changed, 3618 insertions(+) create mode 100644 fs/autofs/autofs_i.h create mode 100644 fs/autofs/dev-ioctl.c create mode 100644 fs/autofs/expire.c create mode 100644 fs/autofs/init.c create mode 100644 fs/autofs/inode.c create mode 100644 fs/autofs/root.c create mode 100644 fs/autofs/symlink.c create mode 100644 fs/autofs/waitq.c diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h new file mode 100644 index 000000000000..9110b66c7ef1 --- /dev/null +++ b/fs/autofs/autofs_i.h @@ -0,0 +1,273 @@ +/* + * Copyright 1997-1998 Transmeta Corporation - All Rights Reserved + * Copyright 2005-2006 Ian Kent + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + */ + +/* Internal header file for autofs */ + +#include +#include + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +/* This is the range of ioctl() numbers we claim as ours */ +#define AUTOFS_IOC_FIRST AUTOFS_IOC_READY +#define AUTOFS_IOC_COUNT 32 + +#define AUTOFS_DEV_IOCTL_IOC_FIRST (AUTOFS_DEV_IOCTL_VERSION) +#define AUTOFS_DEV_IOCTL_IOC_COUNT \ + (AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD - AUTOFS_DEV_IOCTL_VERSION_CMD) + +#ifdef pr_fmt +#undef pr_fmt +#endif +#define pr_fmt(fmt) KBUILD_MODNAME ":pid:%d:%s: " fmt, current->pid, __func__ + +/* + * Unified info structure. This is pointed to by both the dentry and + * inode structures. Each file in the filesystem has an instance of this + * structure. It holds a reference to the dentry, so dentries are never + * flushed while the file exists. 
All name lookups are dealt with at the + * dentry level, although the filesystem can interfere in the validation + * process. Readdir is implemented by traversing the dentry lists. + */ +struct autofs_info { + struct dentry *dentry; + struct inode *inode; + + int flags; + + struct completion expire_complete; + + struct list_head active; + int active_count; + + struct list_head expiring; + + struct autofs_sb_info *sbi; + unsigned long last_used; + atomic_t count; + + kuid_t uid; + kgid_t gid; +}; + +#define AUTOFS_INF_EXPIRING (1<<0) /* dentry in the process of expiring */ +#define AUTOFS_INF_WANT_EXPIRE (1<<1) /* the dentry is being considered + * for expiry, so RCU_walk is + * not permitted. If it progresses to + * actual expiry attempt, the flag is + * not cleared when EXPIRING is set - + * in that case it gets cleared only + * when it comes to clearing EXPIRING. + */ +#define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */ + +struct autofs_wait_queue { + wait_queue_head_t queue; + struct autofs_wait_queue *next; + autofs_wqt_t wait_queue_token; + /* We use the following to see what we are waiting for */ + struct qstr name; + u32 dev; + u64 ino; + kuid_t uid; + kgid_t gid; + pid_t pid; + pid_t tgid; + /* This is for status reporting upon return */ + int status; + unsigned int wait_ctr; +}; + +#define AUTOFS_SBI_MAGIC 0x6d4a556d + +struct autofs_sb_info { + u32 magic; + int pipefd; + struct file *pipe; + struct pid *oz_pgrp; + int catatonic; + int version; + int sub_version; + int min_proto; + int max_proto; + unsigned long exp_timeout; + unsigned int type; + struct super_block *sb; + struct mutex wq_mutex; + struct mutex pipe_mutex; + spinlock_t fs_lock; + struct autofs_wait_queue *queues; /* Wait queue pointer */ + spinlock_t lookup_lock; + struct list_head active_list; + struct list_head expiring_list; + struct rcu_head rcu; +}; + +static inline struct autofs_sb_info *autofs_sbi(struct super_block *sb) +{ + return (struct autofs_sb_info *)(sb->s_fs_info); +} + +static inline struct autofs_info *autofs_dentry_ino(struct dentry *dentry) +{ + return (struct autofs_info *)(dentry->d_fsdata); +} + +/* autofs_oz_mode(): do we see the man behind the curtain? (The + * processes which do manipulations for us in user space see the raw + * filesystem without "magic".) 
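+ *
+ * As a brief sketch of how this is used (it mirrors the check in
+ * autofs_d_automount() in root.c below), callers simply skip mount
+ * triggering when the daemon itself is the caller:
+ *
+ *	if (autofs_oz_mode(sbi))
+ *		return NULL;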
+ */ +static inline int autofs_oz_mode(struct autofs_sb_info *sbi) +{ + return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp; +} + +struct inode *autofs_get_inode(struct super_block *, umode_t); +void autofs_free_ino(struct autofs_info *); + +/* Expiration */ +int is_autofs_dentry(struct dentry *); +int autofs_expire_wait(const struct path *path, int rcu_walk); +int autofs_expire_run(struct super_block *, struct vfsmount *, + struct autofs_sb_info *, + struct autofs_packet_expire __user *); +int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, + struct autofs_sb_info *sbi, int when); +int autofs_expire_multi(struct super_block *, struct vfsmount *, + struct autofs_sb_info *, int __user *); +struct dentry *autofs_expire_direct(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, int how); +struct dentry *autofs_expire_indirect(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, int how); + +/* Device node initialization */ + +int autofs_dev_ioctl_init(void); +void autofs_dev_ioctl_exit(void); + +/* Operations structures */ + +extern const struct inode_operations autofs_symlink_inode_operations; +extern const struct inode_operations autofs_dir_inode_operations; +extern const struct file_operations autofs_dir_operations; +extern const struct file_operations autofs_root_operations; +extern const struct dentry_operations autofs_dentry_operations; + +/* VFS automount flags management functions */ +static inline void __managed_dentry_set_managed(struct dentry *dentry) +{ + dentry->d_flags |= (DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT); +} + +static inline void managed_dentry_set_managed(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_set_managed(dentry); + spin_unlock(&dentry->d_lock); +} + +static inline void __managed_dentry_clear_managed(struct dentry *dentry) +{ + dentry->d_flags &= ~(DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT); +} + +static inline void managed_dentry_clear_managed(struct dentry *dentry) +{ + spin_lock(&dentry->d_lock); + __managed_dentry_clear_managed(dentry); + spin_unlock(&dentry->d_lock); +} + +/* Initializing function */ + +int autofs_fill_super(struct super_block *, void *, int); +struct autofs_info *autofs_new_ino(struct autofs_sb_info *); +void autofs_clean_ino(struct autofs_info *); + +static inline int autofs_prepare_pipe(struct file *pipe) +{ + if (!(pipe->f_mode & FMODE_CAN_WRITE)) + return -EINVAL; + if (!S_ISFIFO(file_inode(pipe)->i_mode)) + return -EINVAL; + /* We want a packet pipe */ + pipe->f_flags |= O_DIRECT; + return 0; +} + +/* Queue management functions */ + +int autofs_wait(struct autofs_sb_info *, + const struct path *, enum autofs_notify); +int autofs_wait_release(struct autofs_sb_info *, autofs_wqt_t, int); +void autofs_catatonic_mode(struct autofs_sb_info *); + +static inline u32 autofs_get_dev(struct autofs_sb_info *sbi) +{ + return new_encode_dev(sbi->sb->s_dev); +} + +static inline u64 autofs_get_ino(struct autofs_sb_info *sbi) +{ + return d_inode(sbi->sb->s_root)->i_ino; +} + +static inline void __autofs_add_expiring(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); + + if (ino) { + if (list_empty(&ino->expiring)) + list_add(&ino->expiring, &sbi->expiring_list); + } +} + +static inline void autofs_add_expiring(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); + + if 
(ino) { + spin_lock(&sbi->lookup_lock); + if (list_empty(&ino->expiring)) + list_add(&ino->expiring, &sbi->expiring_list); + spin_unlock(&sbi->lookup_lock); + } +} + +static inline void autofs_del_expiring(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); + + if (ino) { + spin_lock(&sbi->lookup_lock); + if (!list_empty(&ino->expiring)) + list_del_init(&ino->expiring); + spin_unlock(&sbi->lookup_lock); + } +} + +void autofs_kill_sb(struct super_block *); diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c new file mode 100644 index 000000000000..a2281ab2b957 --- /dev/null +++ b/fs/autofs/dev-ioctl.c @@ -0,0 +1,761 @@ +/* + * Copyright 2008 Red Hat, Inc. All rights reserved. + * Copyright 2008 Ian Kent + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +#include "autofs_i.h" + +/* + * This module implements an interface for routing autofs ioctl control + * commands via a miscellaneous device file. + * + * The alternate interface is needed because we need to be able to open + * an ioctl file descriptor on an autofs mount that may be covered by + * another mount. This situation arises when starting automount(8) + * or another user space daemon which uses direct mounts or offset + * mounts (used for autofs lazy mount/umount of nested mount trees), + * which have been left busy at service shutdown. + */ + +typedef int (*ioctl_fn)(struct file *, struct autofs_sb_info *, + struct autofs_dev_ioctl *); + +static int check_name(const char *name) +{ + if (!strchr(name, '/')) + return -EINVAL; + return 0; +} + +/* + * Check a string doesn't overrun the chunk of + * memory we copied from user land. + */ +static int invalid_str(char *str, size_t size) +{ + if (memchr(str, 0, size)) + return 0; + return -EINVAL; +} + +/* + * Check that the user compiled against the correct version of autofs + * misc device code. + * + * As well as checking the version compatibility this always copies + * the kernel interface version out. + */ +static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param) +{ + int err = 0; + + if ((param->ver_major != AUTOFS_DEV_IOCTL_VERSION_MAJOR) || + (param->ver_minor > AUTOFS_DEV_IOCTL_VERSION_MINOR)) { + pr_warn("ioctl control interface version mismatch: " + "kernel(%u.%u), user(%u.%u), cmd(0x%08x)\n", + AUTOFS_DEV_IOCTL_VERSION_MAJOR, + AUTOFS_DEV_IOCTL_VERSION_MINOR, + param->ver_major, param->ver_minor, cmd); + err = -EINVAL; + } + + /* Fill in the kernel version. */ + param->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR; + param->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; + + return err; +} + +/* + * Copy parameter control struct, including a possible path allocated + * at the end of the struct. 
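+ *
+ * A hedged sketch of the layout the caller is expected to provide
+ * (AUTOFS_DEV_IOCTL_SIZE is sizeof(struct autofs_dev_ioctl) in the
+ * uapi header); when a path is sent the size accounts for it:
+ *
+ *	param->size = AUTOFS_DEV_IOCTL_SIZE + strlen(path) + 1;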
+ */ +static struct autofs_dev_ioctl * +copy_dev_ioctl(struct autofs_dev_ioctl __user *in) +{ + struct autofs_dev_ioctl tmp, *res; + + if (copy_from_user(&tmp, in, AUTOFS_DEV_IOCTL_SIZE)) + return ERR_PTR(-EFAULT); + + if (tmp.size < AUTOFS_DEV_IOCTL_SIZE) + return ERR_PTR(-EINVAL); + + if (tmp.size > AUTOFS_DEV_IOCTL_SIZE + PATH_MAX) + return ERR_PTR(-ENAMETOOLONG); + + res = memdup_user(in, tmp.size); + if (!IS_ERR(res)) + res->size = tmp.size; + + return res; +} + +static inline void free_dev_ioctl(struct autofs_dev_ioctl *param) +{ + kfree(param); +} + +/* + * Check sanity of parameter control fields and if a path is present + * check that it is terminated and contains at least one "/". + */ +static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) +{ + int err; + + err = check_dev_ioctl_version(cmd, param); + if (err) { + pr_warn("invalid device control module version " + "supplied for cmd(0x%08x)\n", cmd); + goto out; + } + + if (param->size > AUTOFS_DEV_IOCTL_SIZE) { + err = invalid_str(param->path, param->size - AUTOFS_DEV_IOCTL_SIZE); + if (err) { + pr_warn( + "path string terminator missing for cmd(0x%08x)\n", + cmd); + goto out; + } + + err = check_name(param->path); + if (err) { + pr_warn("invalid path supplied for cmd(0x%08x)\n", + cmd); + goto out; + } + } + + err = 0; +out: + return err; +} + +/* + * Get the autofs super block info struct from the file opened on + * the autofs mount point. + */ +static struct autofs_sb_info *autofs_dev_ioctl_sbi(struct file *f) +{ + struct autofs_sb_info *sbi = NULL; + struct inode *inode; + + if (f) { + inode = file_inode(f); + sbi = autofs_sbi(inode->i_sb); + } + return sbi; +} + +/* Return autofs dev ioctl version */ +static int autofs_dev_ioctl_version(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + /* This should have already been set. */ + param->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR; + param->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; + return 0; +} + +/* Return autofs module protocol version */ +static int autofs_dev_ioctl_protover(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + param->protover.version = sbi->version; + return 0; +} + +/* Return autofs module protocol sub version */ +static int autofs_dev_ioctl_protosubver(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + param->protosubver.sub_version = sbi->sub_version; + return 0; +} + +/* Find the topmost mount satisfying test() */ +static int find_autofs_mount(const char *pathname, + struct path *res, + int test(const struct path *path, void *data), + void *data) +{ + struct path path; + int err; + + err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0); + if (err) + return err; + err = -ENOENT; + while (path.dentry == path.mnt->mnt_root) { + if (path.dentry->d_sb->s_magic == AUTOFS_SUPER_MAGIC) { + if (test(&path, data)) { + path_get(&path); + *res = path; + err = 0; + break; + } + } + if (!follow_up(&path)) + break; + } + path_put(&path); + return err; +} + +static int test_by_dev(const struct path *path, void *p) +{ + return path->dentry->d_sb->s_dev == *(dev_t *)p; +} + +static int test_by_type(const struct path *path, void *p) +{ + struct autofs_info *ino = autofs_dentry_ino(path->dentry); + + return ino && ino->sbi->type & *(unsigned *)p; +} + +/* + * Open a file descriptor on the autofs mount point corresponding + * to the given path and device number (aka. new_encode_dev(sb->s_dev)). 
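+ *
+ * A rough daemon-side sketch, assuming the uapi command names from
+ * linux/auto_dev-ioctl.h (devfd being a descriptor on /dev/autofs):
+ *
+ *	param->openmount.devid = devid;
+ *	err = ioctl(devfd, AUTOFS_DEV_IOCTL_OPENMOUNT, param);
+ *	ioctlfd = param->ioctlfd;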
+ */ +static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid) +{ + int err, fd; + + fd = get_unused_fd_flags(O_CLOEXEC); + if (likely(fd >= 0)) { + struct file *filp; + struct path path; + + err = find_autofs_mount(name, &path, test_by_dev, &devid); + if (err) + goto out; + + filp = dentry_open(&path, O_RDONLY, current_cred()); + path_put(&path); + if (IS_ERR(filp)) { + err = PTR_ERR(filp); + goto out; + } + + fd_install(fd, filp); + } + + return fd; + +out: + put_unused_fd(fd); + return err; +} + +/* Open a file descriptor on an autofs mount point */ +static int autofs_dev_ioctl_openmount(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + const char *path; + dev_t devid; + int err, fd; + + /* param->path has already been checked */ + if (!param->openmount.devid) + return -EINVAL; + + param->ioctlfd = -1; + + path = param->path; + devid = new_decode_dev(param->openmount.devid); + + err = 0; + fd = autofs_dev_ioctl_open_mountpoint(path, devid); + if (unlikely(fd < 0)) { + err = fd; + goto out; + } + + param->ioctlfd = fd; +out: + return err; +} + +/* Close file descriptor allocated above (user can also use close(2)). */ +static int autofs_dev_ioctl_closemount(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + return ksys_close(param->ioctlfd); +} + +/* + * Send "ready" status for an existing wait (either a mount or an expire + * request). + */ +static int autofs_dev_ioctl_ready(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + autofs_wqt_t token; + + token = (autofs_wqt_t) param->ready.token; + return autofs_wait_release(sbi, token, 0); +} + +/* + * Send "fail" status for an existing wait (either a mount or an expire + * request). + */ +static int autofs_dev_ioctl_fail(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + autofs_wqt_t token; + int status; + + token = (autofs_wqt_t) param->fail.token; + status = param->fail.status < 0 ? param->fail.status : -ENOENT; + return autofs_wait_release(sbi, token, status); +} + +/* + * Set the pipe fd for kernel communication to the daemon. + * + * Normally this is set at mount using an option but if we + * are reconnecting to a busy mount then we need to use this + * to tell the autofs mount about the new kernel pipe fd. In + * order to protect mounts against incorrectly setting the + * pipefd we also require that the autofs mount be catatonic. + * + * This also sets the process group id used to identify the + * controlling process (eg. the owning automount(8) daemon). 
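+ *
+ * A reconnecting daemon might issue, roughly (uapi names assumed;
+ * newpipefd is a placeholder for the write end of the new pipe):
+ *
+ *	param->setpipefd.pipefd = newpipefd;
+ *	err = ioctl(ioctlfd, AUTOFS_DEV_IOCTL_SETPIPEFD, param);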
+ */ +static int autofs_dev_ioctl_setpipefd(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + int pipefd; + int err = 0; + struct pid *new_pid = NULL; + + if (param->setpipefd.pipefd == -1) + return -EINVAL; + + pipefd = param->setpipefd.pipefd; + + mutex_lock(&sbi->wq_mutex); + if (!sbi->catatonic) { + mutex_unlock(&sbi->wq_mutex); + return -EBUSY; + } else { + struct file *pipe; + + new_pid = get_task_pid(current, PIDTYPE_PGID); + + if (ns_of_pid(new_pid) != ns_of_pid(sbi->oz_pgrp)) { + pr_warn("not allowed to change PID namespace\n"); + err = -EINVAL; + goto out; + } + + pipe = fget(pipefd); + if (!pipe) { + err = -EBADF; + goto out; + } + if (autofs_prepare_pipe(pipe) < 0) { + err = -EPIPE; + fput(pipe); + goto out; + } + swap(sbi->oz_pgrp, new_pid); + sbi->pipefd = pipefd; + sbi->pipe = pipe; + sbi->catatonic = 0; + } +out: + put_pid(new_pid); + mutex_unlock(&sbi->wq_mutex); + return err; +} + +/* + * Make the autofs mount point catatonic, no longer responsive to + * mount requests. Also closes the kernel pipe file descriptor. + */ +static int autofs_dev_ioctl_catatonic(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + autofs_catatonic_mode(sbi); + return 0; +} + +/* Set the autofs mount timeout */ +static int autofs_dev_ioctl_timeout(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + unsigned long timeout; + + timeout = param->timeout.timeout; + param->timeout.timeout = sbi->exp_timeout / HZ; + sbi->exp_timeout = timeout * HZ; + return 0; +} + +/* + * Return the uid and gid of the last request for the mount + * + * When reconstructing an autofs mount tree with active mounts + * we need to re-connect to mounts that may have used the original + * process uid and gid (or string variations of them) for mount + * lookups within the map entry. + */ +static int autofs_dev_ioctl_requester(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + struct autofs_info *ino; + struct path path; + dev_t devid; + int err = -ENOENT; + + if (param->size <= AUTOFS_DEV_IOCTL_SIZE) { + err = -EINVAL; + goto out; + } + + devid = sbi->sb->s_dev; + + param->requester.uid = param->requester.gid = -1; + + err = find_autofs_mount(param->path, &path, test_by_dev, &devid); + if (err) + goto out; + + ino = autofs_dentry_ino(path.dentry); + if (ino) { + err = 0; + autofs_expire_wait(&path, 0); + spin_lock(&sbi->fs_lock); + param->requester.uid = + from_kuid_munged(current_user_ns(), ino->uid); + param->requester.gid = + from_kgid_munged(current_user_ns(), ino->gid); + spin_unlock(&sbi->fs_lock); + } + path_put(&path); +out: + return err; +} + +/* + * Call repeatedly until it returns -EAGAIN, meaning there's nothing + * more that can be done. + */ +static int autofs_dev_ioctl_expire(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + struct vfsmount *mnt; + int how; + + how = param->expire.how; + mnt = fp->f_path.mnt; + + return autofs_do_expire_multi(sbi->sb, mnt, sbi, how); +} + +/* Check if autofs mount point is in use */ +static int autofs_dev_ioctl_askumount(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + param->askumount.may_umount = 0; + if (may_umount(fp->f_path.mnt)) + param->askumount.may_umount = 1; + return 0; +} + +/* + * Check if the given path is a mountpoint. + * + * If we are supplied with the file descriptor of an autofs + * mount we're looking for a specific mount. 
In this case + * the path is considered a mountpoint if it is itself a + * mountpoint or contains a mount, such as a multi-mount + * without a root mount. In this case we return 1 if the + * path is a mount point and the super magic of the covering + * mount if there is one or 0 if it isn't a mountpoint. + * + * If we aren't supplied with a file descriptor then we + * look up the path and check if it is the root of a mount. + * If a type is given we are looking for a particular autofs + * mount and if we don't find a match we return fail. If the + * located path is the root of a mount we return 1 along with + * the super magic of the mount or 0 otherwise. + * + * In both cases the device number (as returned by + * new_encode_dev()) is also returned. + */ +static int autofs_dev_ioctl_ismountpoint(struct file *fp, + struct autofs_sb_info *sbi, + struct autofs_dev_ioctl *param) +{ + struct path path; + const char *name; + unsigned int type; + unsigned int devid, magic; + int err = -ENOENT; + + if (param->size <= AUTOFS_DEV_IOCTL_SIZE) { + err = -EINVAL; + goto out; + } + + name = param->path; + type = param->ismountpoint.in.type; + + param->ismountpoint.out.devid = devid = 0; + param->ismountpoint.out.magic = magic = 0; + + if (!fp || param->ioctlfd == -1) { + if (autofs_type_any(type)) + err = kern_path_mountpoint(AT_FDCWD, + name, &path, LOOKUP_FOLLOW); + else + err = find_autofs_mount(name, &path, + test_by_type, &type); + if (err) + goto out; + devid = new_encode_dev(path.dentry->d_sb->s_dev); + err = 0; + if (path.mnt->mnt_root == path.dentry) { + err = 1; + magic = path.dentry->d_sb->s_magic; + } + } else { + dev_t dev = sbi->sb->s_dev; + + err = find_autofs_mount(name, &path, test_by_dev, &dev); + if (err) + goto out; + + devid = new_encode_dev(dev); + + err = path_has_submounts(&path); + + if (follow_down_one(&path)) + magic = path.dentry->d_sb->s_magic; + } + + param->ismountpoint.out.devid = devid; + param->ismountpoint.out.magic = magic; + path_put(&path); +out: + return err; +} + +/* + * Our range of ioctl numbers isn't 0 based so we need to shift + * the array index by _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST) for the table + * lookup. + */ +#define cmd_idx(cmd) (cmd - _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST)) + +static ioctl_fn lookup_dev_ioctl(unsigned int cmd) +{ + static ioctl_fn _ioctls[] = { + autofs_dev_ioctl_version, + autofs_dev_ioctl_protover, + autofs_dev_ioctl_protosubver, + autofs_dev_ioctl_openmount, + autofs_dev_ioctl_closemount, + autofs_dev_ioctl_ready, + autofs_dev_ioctl_fail, + autofs_dev_ioctl_setpipefd, + autofs_dev_ioctl_catatonic, + autofs_dev_ioctl_timeout, + autofs_dev_ioctl_requester, + autofs_dev_ioctl_expire, + autofs_dev_ioctl_askumount, + autofs_dev_ioctl_ismountpoint, + }; + unsigned int idx = cmd_idx(cmd); + + return (idx >= ARRAY_SIZE(_ioctls)) ? 
NULL : _ioctls[idx]; +} + +/* ioctl dispatcher */ +static int _autofs_dev_ioctl(unsigned int command, + struct autofs_dev_ioctl __user *user) +{ + struct autofs_dev_ioctl *param; + struct file *fp; + struct autofs_sb_info *sbi; + unsigned int cmd_first, cmd; + ioctl_fn fn = NULL; + int err = 0; + + cmd_first = _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST); + cmd = _IOC_NR(command); + + if (_IOC_TYPE(command) != _IOC_TYPE(AUTOFS_DEV_IOCTL_IOC_FIRST) || + cmd - cmd_first > AUTOFS_DEV_IOCTL_IOC_COUNT) { + return -ENOTTY; + } + + /* Only root can use ioctls other than AUTOFS_DEV_IOCTL_VERSION_CMD + * and AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD + */ + if (cmd != AUTOFS_DEV_IOCTL_VERSION_CMD && + cmd != AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD && + !capable(CAP_SYS_ADMIN)) + return -EPERM; + + /* Copy the parameters into kernel space. */ + param = copy_dev_ioctl(user); + if (IS_ERR(param)) + return PTR_ERR(param); + + err = validate_dev_ioctl(command, param); + if (err) + goto out; + + fn = lookup_dev_ioctl(cmd); + if (!fn) { + pr_warn("unknown command 0x%08x\n", command); + err = -ENOTTY; + goto out; + } + + fp = NULL; + sbi = NULL; + + /* + * For obvious reasons the openmount can't have a file + * descriptor yet. We don't take a reference to the + * file during close to allow for immediate release, + * and the same for retrieving ioctl version. + */ + if (cmd != AUTOFS_DEV_IOCTL_VERSION_CMD && + cmd != AUTOFS_DEV_IOCTL_OPENMOUNT_CMD && + cmd != AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD) { + fp = fget(param->ioctlfd); + if (!fp) { + if (cmd == AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD) + goto cont; + err = -EBADF; + goto out; + } + + sbi = autofs_dev_ioctl_sbi(fp); + if (!sbi || sbi->magic != AUTOFS_SBI_MAGIC) { + err = -EINVAL; + fput(fp); + goto out; + } + + /* + * Admin needs to be able to set the mount catatonic in + * order to be able to perform the re-open. 
+ */ + if (!autofs_oz_mode(sbi) && + cmd != AUTOFS_DEV_IOCTL_CATATONIC_CMD) { + err = -EACCES; + fput(fp); + goto out; + } + } +cont: + err = fn(fp, sbi, param); + + if (fp) + fput(fp); + if (err >= 0 && copy_to_user(user, param, AUTOFS_DEV_IOCTL_SIZE)) + err = -EFAULT; +out: + free_dev_ioctl(param); + return err; +} + +static long autofs_dev_ioctl(struct file *file, unsigned int command, + unsigned long u) +{ + int err; + + err = _autofs_dev_ioctl(command, (struct autofs_dev_ioctl __user *) u); + return (long) err; +} + +#ifdef CONFIG_COMPAT +static long autofs_dev_ioctl_compat(struct file *file, unsigned int command, + unsigned long u) +{ + return autofs_dev_ioctl(file, command, (unsigned long) compat_ptr(u)); +} +#else +#define autofs_dev_ioctl_compat NULL +#endif + +static const struct file_operations _dev_ioctl_fops = { + .unlocked_ioctl = autofs_dev_ioctl, + .compat_ioctl = autofs_dev_ioctl_compat, + .owner = THIS_MODULE, + .llseek = noop_llseek, +}; + +static struct miscdevice _autofs_dev_ioctl_misc = { + .minor = AUTOFS_MINOR, + .name = AUTOFS_DEVICE_NAME, + .fops = &_dev_ioctl_fops, + .mode = 0644, +}; + +MODULE_ALIAS_MISCDEV(AUTOFS_MINOR); +MODULE_ALIAS("devname:autofs"); + +/* Register/deregister misc character device */ +int __init autofs_dev_ioctl_init(void) +{ + int r; + + r = misc_register(&_autofs_dev_ioctl_misc); + if (r) { + pr_err("misc_register failed for control device\n"); + return r; + } + + return 0; +} + +void autofs_dev_ioctl_exit(void) +{ + misc_deregister(&_autofs_dev_ioctl_misc); +} diff --git a/fs/autofs/expire.c b/fs/autofs/expire.c new file mode 100644 index 000000000000..b332d3f6e730 --- /dev/null +++ b/fs/autofs/expire.c @@ -0,0 +1,631 @@ +/* + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * Copyright 1999-2000 Jeremy Fitzhardinge + * Copyright 2001-2006 Ian Kent + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + */ + +#include "autofs_i.h" + +static unsigned long now; + +/* Check if a dentry can be expired */ +static inline int autofs_can_expire(struct dentry *dentry, + unsigned long timeout, int do_now) +{ + struct autofs_info *ino = autofs_dentry_ino(dentry); + + /* dentry in the process of being deleted */ + if (ino == NULL) + return 0; + + if (!do_now) { + /* Too young to die */ + if (!timeout || time_after(ino->last_used + timeout, now)) + return 0; + } + return 1; +} + +/* Check a mount point for busyness */ +static int autofs_mount_busy(struct vfsmount *mnt, struct dentry *dentry) +{ + struct dentry *top = dentry; + struct path path = {.mnt = mnt, .dentry = dentry}; + int status = 1; + + pr_debug("dentry %p %pd\n", dentry, dentry); + + path_get(&path); + + if (!follow_down_one(&path)) + goto done; + + if (is_autofs_dentry(path.dentry)) { + struct autofs_sb_info *sbi = autofs_sbi(path.dentry->d_sb); + + /* This is an autofs submount, we can't expire it */ + if (autofs_type_indirect(sbi->type)) + goto done; + } + + /* Update the expiry counter if fs is busy */ + if (!may_umount_tree(path.mnt)) { + struct autofs_info *ino; + + ino = autofs_dentry_ino(top); + ino->last_used = jiffies; + goto done; + } + + status = 0; +done: + pr_debug("returning = %d\n", status); + path_put(&path); + return status; +} + +/* + * Calculate and dget next entry in the subdirs list under root. 
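+ *
+ * Callers treat it as a cursor that manages the dentry references
+ * itself; abbreviated from autofs_expire_indirect() below:
+ *
+ *	dentry = NULL;
+ *	while ((dentry = get_next_positive_subdir(dentry, root)))
+ *		expired = should_expire(dentry, mnt, timeout, how);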
+ */ +static struct dentry *get_next_positive_subdir(struct dentry *prev, + struct dentry *root) +{ + struct autofs_sb_info *sbi = autofs_sbi(root->d_sb); + struct list_head *next; + struct dentry *q; + + spin_lock(&sbi->lookup_lock); + spin_lock(&root->d_lock); + + if (prev) + next = prev->d_child.next; + else { + prev = dget_dlock(root); + next = prev->d_subdirs.next; + } + +cont: + if (next == &root->d_subdirs) { + spin_unlock(&root->d_lock); + spin_unlock(&sbi->lookup_lock); + dput(prev); + return NULL; + } + + q = list_entry(next, struct dentry, d_child); + + spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED); + /* Already gone or negative dentry (under construction) - try next */ + if (!d_count(q) || !simple_positive(q)) { + spin_unlock(&q->d_lock); + next = q->d_child.next; + goto cont; + } + dget_dlock(q); + spin_unlock(&q->d_lock); + spin_unlock(&root->d_lock); + spin_unlock(&sbi->lookup_lock); + + dput(prev); + + return q; +} + +/* + * Calculate and dget next entry in top down tree traversal. + */ +static struct dentry *get_next_positive_dentry(struct dentry *prev, + struct dentry *root) +{ + struct autofs_sb_info *sbi = autofs_sbi(root->d_sb); + struct list_head *next; + struct dentry *p, *ret; + + if (prev == NULL) + return dget(root); + + spin_lock(&sbi->lookup_lock); +relock: + p = prev; + spin_lock(&p->d_lock); +again: + next = p->d_subdirs.next; + if (next == &p->d_subdirs) { + while (1) { + struct dentry *parent; + + if (p == root) { + spin_unlock(&p->d_lock); + spin_unlock(&sbi->lookup_lock); + dput(prev); + return NULL; + } + + parent = p->d_parent; + if (!spin_trylock(&parent->d_lock)) { + spin_unlock(&p->d_lock); + cpu_relax(); + goto relock; + } + spin_unlock(&p->d_lock); + next = p->d_child.next; + p = parent; + if (next != &parent->d_subdirs) + break; + } + } + ret = list_entry(next, struct dentry, d_child); + + spin_lock_nested(&ret->d_lock, DENTRY_D_LOCK_NESTED); + /* Negative dentry - try next */ + if (!simple_positive(ret)) { + spin_unlock(&p->d_lock); + lock_set_subclass(&ret->d_lock.dep_map, 0, _RET_IP_); + p = ret; + goto again; + } + dget_dlock(ret); + spin_unlock(&ret->d_lock); + spin_unlock(&p->d_lock); + spin_unlock(&sbi->lookup_lock); + + dput(prev); + + return ret; +} + +/* + * Check a direct mount point for busyness. + * Direct mounts have similar expiry semantics to tree mounts. + * The tree is not busy iff no mountpoints are busy and there are no + * autofs submounts. + */ +static int autofs_direct_busy(struct vfsmount *mnt, + struct dentry *top, + unsigned long timeout, + int do_now) +{ + pr_debug("top %p %pd\n", top, top); + + /* If it's busy update the expiry counters */ + if (!may_umount_tree(mnt)) { + struct autofs_info *ino; + + ino = autofs_dentry_ino(top); + if (ino) + ino->last_used = jiffies; + return 1; + } + + /* Timeout of a direct mount is determined by its top dentry */ + if (!autofs_can_expire(top, timeout, do_now)) + return 1; + + return 0; +} + +/* + * Check a directory tree of mount points for busyness + * The tree is not busy iff no mountpoints are busy + */ +static int autofs_tree_busy(struct vfsmount *mnt, + struct dentry *top, + unsigned long timeout, + int do_now) +{ + struct autofs_info *top_ino = autofs_dentry_ino(top); + struct dentry *p; + + pr_debug("top %p %pd\n", top, top); + + /* Negative dentry - give up */ + if (!simple_positive(top)) + return 1; + + p = NULL; + while ((p = get_next_positive_dentry(p, top))) { + pr_debug("dentry %p %pd\n", p, p); + + /* + * Is someone visiting anywhere in the subtree ? 
+ * If there's no mount we need to check the usage + * count for the autofs dentry. + * If the fs is busy update the expiry counter. + */ + if (d_mountpoint(p)) { + if (autofs_mount_busy(mnt, p)) { + top_ino->last_used = jiffies; + dput(p); + return 1; + } + } else { + struct autofs_info *ino = autofs_dentry_ino(p); + unsigned int ino_count = atomic_read(&ino->count); + + /* allow for dget above and top is already dgot */ + if (p == top) + ino_count += 2; + else + ino_count++; + + if (d_count(p) > ino_count) { + top_ino->last_used = jiffies; + dput(p); + return 1; + } + } + } + + /* Timeout of a tree mount is ultimately determined by its top dentry */ + if (!autofs_can_expire(top, timeout, do_now)) + return 1; + + return 0; +} + +static struct dentry *autofs_check_leaves(struct vfsmount *mnt, + struct dentry *parent, + unsigned long timeout, + int do_now) +{ + struct dentry *p; + + pr_debug("parent %p %pd\n", parent, parent); + + p = NULL; + while ((p = get_next_positive_dentry(p, parent))) { + pr_debug("dentry %p %pd\n", p, p); + + if (d_mountpoint(p)) { + /* Can we umount this guy */ + if (autofs_mount_busy(mnt, p)) + continue; + + /* Can we expire this guy */ + if (autofs_can_expire(p, timeout, do_now)) + return p; + } + } + return NULL; +} + +/* Check if we can expire a direct mount (possibly a tree) */ +struct dentry *autofs_expire_direct(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + int how) +{ + unsigned long timeout; + struct dentry *root = dget(sb->s_root); + int do_now = how & AUTOFS_EXP_IMMEDIATE; + struct autofs_info *ino; + + if (!root) + return NULL; + + now = jiffies; + timeout = sbi->exp_timeout; + + if (!autofs_direct_busy(mnt, root, timeout, do_now)) { + spin_lock(&sbi->fs_lock); + ino = autofs_dentry_ino(root); + /* No point expiring a pending mount */ + if (ino->flags & AUTOFS_INF_PENDING) { + spin_unlock(&sbi->fs_lock); + goto out; + } + ino->flags |= AUTOFS_INF_WANT_EXPIRE; + spin_unlock(&sbi->fs_lock); + synchronize_rcu(); + if (!autofs_direct_busy(mnt, root, timeout, do_now)) { + spin_lock(&sbi->fs_lock); + ino->flags |= AUTOFS_INF_EXPIRING; + init_completion(&ino->expire_complete); + spin_unlock(&sbi->fs_lock); + return root; + } + spin_lock(&sbi->fs_lock); + ino->flags &= ~AUTOFS_INF_WANT_EXPIRE; + spin_unlock(&sbi->fs_lock); + } +out: + dput(root); + + return NULL; +} + +/* Check if 'dentry' should expire, or return a nearby + * dentry that is suitable. + * If returned dentry is different from arg dentry, + * then a dget() reference was taken, else not. + */ +static struct dentry *should_expire(struct dentry *dentry, + struct vfsmount *mnt, + unsigned long timeout, + int how) +{ + int do_now = how & AUTOFS_EXP_IMMEDIATE; + int exp_leaves = how & AUTOFS_EXP_LEAVES; + struct autofs_info *ino = autofs_dentry_ino(dentry); + unsigned int ino_count; + + /* No point expiring a pending mount */ + if (ino->flags & AUTOFS_INF_PENDING) + return NULL; + + /* + * Case 1: (i) indirect mount or top level pseudo direct mount + * (autofs-4.1). + * (ii) indirect mount with offset mount, check the "/" + * offset (autofs-5.0+). 
+ */ + if (d_mountpoint(dentry)) { + pr_debug("checking mountpoint %p %pd\n", dentry, dentry); + + /* Can we umount this guy */ + if (autofs_mount_busy(mnt, dentry)) + return NULL; + + /* Can we expire this guy */ + if (autofs_can_expire(dentry, timeout, do_now)) + return dentry; + return NULL; + } + + if (d_really_is_positive(dentry) && d_is_symlink(dentry)) { + pr_debug("checking symlink %p %pd\n", dentry, dentry); + /* + * A symlink can't be "busy" in the usual sense so + * just check last used for expire timeout. + */ + if (autofs_can_expire(dentry, timeout, do_now)) + return dentry; + return NULL; + } + + if (simple_empty(dentry)) + return NULL; + + /* Case 2: tree mount, expire iff entire tree is not busy */ + if (!exp_leaves) { + /* Path walk currently on this dentry? */ + ino_count = atomic_read(&ino->count) + 1; + if (d_count(dentry) > ino_count) + return NULL; + + if (!autofs_tree_busy(mnt, dentry, timeout, do_now)) + return dentry; + /* + * Case 3: pseudo direct mount, expire individual leaves + * (autofs-4.1). + */ + } else { + /* Path walk currently on this dentry? */ + struct dentry *expired; + + ino_count = atomic_read(&ino->count) + 1; + if (d_count(dentry) > ino_count) + return NULL; + + expired = autofs_check_leaves(mnt, dentry, timeout, do_now); + if (expired) { + if (expired == dentry) + dput(dentry); + return expired; + } + } + return NULL; +} + +/* + * Find an eligible tree to time-out + * A tree is eligible if :- + * - it is unused by any user process + * - it has been unused for exp_timeout time + */ +struct dentry *autofs_expire_indirect(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + int how) +{ + unsigned long timeout; + struct dentry *root = sb->s_root; + struct dentry *dentry; + struct dentry *expired; + struct dentry *found; + struct autofs_info *ino; + + if (!root) + return NULL; + + now = jiffies; + timeout = sbi->exp_timeout; + + dentry = NULL; + while ((dentry = get_next_positive_subdir(dentry, root))) { + int flags = how; + + spin_lock(&sbi->fs_lock); + ino = autofs_dentry_ino(dentry); + if (ino->flags & AUTOFS_INF_WANT_EXPIRE) { + spin_unlock(&sbi->fs_lock); + continue; + } + spin_unlock(&sbi->fs_lock); + + expired = should_expire(dentry, mnt, timeout, flags); + if (!expired) + continue; + + spin_lock(&sbi->fs_lock); + ino = autofs_dentry_ino(expired); + ino->flags |= AUTOFS_INF_WANT_EXPIRE; + spin_unlock(&sbi->fs_lock); + synchronize_rcu(); + + /* Make sure a reference is not taken on found if + * things have changed. 
+ */ + flags &= ~AUTOFS_EXP_LEAVES; + found = should_expire(expired, mnt, timeout, how); + if (!found || found != expired) + /* Something has changed, continue */ + goto next; + + if (expired != dentry) + dput(dentry); + + spin_lock(&sbi->fs_lock); + goto found; +next: + spin_lock(&sbi->fs_lock); + ino->flags &= ~AUTOFS_INF_WANT_EXPIRE; + spin_unlock(&sbi->fs_lock); + if (expired != dentry) + dput(expired); + } + return NULL; + +found: + pr_debug("returning %p %pd\n", expired, expired); + ino->flags |= AUTOFS_INF_EXPIRING; + init_completion(&ino->expire_complete); + spin_unlock(&sbi->fs_lock); + return expired; +} + +int autofs_expire_wait(const struct path *path, int rcu_walk) +{ + struct dentry *dentry = path->dentry; + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); + int status; + int state; + + /* Block on any pending expire */ + if (!(ino->flags & AUTOFS_INF_WANT_EXPIRE)) + return 0; + if (rcu_walk) + return -ECHILD; + +retry: + spin_lock(&sbi->fs_lock); + state = ino->flags & (AUTOFS_INF_WANT_EXPIRE | AUTOFS_INF_EXPIRING); + if (state == AUTOFS_INF_WANT_EXPIRE) { + spin_unlock(&sbi->fs_lock); + /* + * Possibly being selected for expire, wait until + * it's selected or not. + */ + schedule_timeout_uninterruptible(HZ/10); + goto retry; + } + if (state & AUTOFS_INF_EXPIRING) { + spin_unlock(&sbi->fs_lock); + + pr_debug("waiting for expire %p name=%pd\n", dentry, dentry); + + status = autofs_wait(sbi, path, NFY_NONE); + wait_for_completion(&ino->expire_complete); + + pr_debug("expire done status=%d\n", status); + + if (d_unhashed(dentry)) + return -EAGAIN; + + return status; + } + spin_unlock(&sbi->fs_lock); + + return 0; +} + +/* Perform an expiry operation */ +int autofs_expire_run(struct super_block *sb, + struct vfsmount *mnt, + struct autofs_sb_info *sbi, + struct autofs_packet_expire __user *pkt_p) +{ + struct autofs_packet_expire pkt; + struct autofs_info *ino; + struct dentry *dentry; + int ret = 0; + + memset(&pkt, 0, sizeof(pkt)); + + pkt.hdr.proto_version = sbi->version; + pkt.hdr.type = autofs_ptype_expire; + + dentry = autofs_expire_indirect(sb, mnt, sbi, 0); + if (!dentry) + return -EAGAIN; + + pkt.len = dentry->d_name.len; + memcpy(pkt.name, dentry->d_name.name, pkt.len); + pkt.name[pkt.len] = '\0'; + dput(dentry); + + if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire))) + ret = -EFAULT; + + spin_lock(&sbi->fs_lock); + ino = autofs_dentry_ino(dentry); + /* avoid rapid-fire expire attempts if expiry fails */ + ino->last_used = now; + ino->flags &= ~(AUTOFS_INF_EXPIRING|AUTOFS_INF_WANT_EXPIRE); + complete_all(&ino->expire_complete); + spin_unlock(&sbi->fs_lock); + + return ret; +} + +int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, + struct autofs_sb_info *sbi, int when) +{ + struct dentry *dentry; + int ret = -EAGAIN; + + if (autofs_type_trigger(sbi->type)) + dentry = autofs_expire_direct(sb, mnt, sbi, when); + else + dentry = autofs_expire_indirect(sb, mnt, sbi, when); + + if (dentry) { + struct autofs_info *ino = autofs_dentry_ino(dentry); + const struct path path = { .mnt = mnt, .dentry = dentry }; + + /* This is synchronous because it makes the daemon a + * little easier + */ + ret = autofs_wait(sbi, &path, NFY_EXPIRE); + + spin_lock(&sbi->fs_lock); + /* avoid rapid-fire expire attempts if expiry fails */ + ino->last_used = now; + ino->flags &= ~(AUTOFS_INF_EXPIRING|AUTOFS_INF_WANT_EXPIRE); + complete_all(&ino->expire_complete); + spin_unlock(&sbi->fs_lock); + 
dput(dentry); + } + + return ret; +} + +/* + * Call repeatedly until it returns -EAGAIN, meaning there's nothing + * more to be done. + */ +int autofs_expire_multi(struct super_block *sb, struct vfsmount *mnt, + struct autofs_sb_info *sbi, int __user *arg) +{ + int do_now = 0; + + if (arg && get_user(do_now, arg)) + return -EFAULT; + + return autofs_do_expire_multi(sb, mnt, sbi, do_now); +} diff --git a/fs/autofs/init.c b/fs/autofs/init.c new file mode 100644 index 000000000000..16fb61315843 --- /dev/null +++ b/fs/autofs/init.c @@ -0,0 +1,48 @@ +/* + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + */ + +#include +#include +#include "autofs_i.h" + +static struct dentry *autofs_mount(struct file_system_type *fs_type, + int flags, const char *dev_name, void *data) +{ + return mount_nodev(fs_type, flags, data, autofs_fill_super); +} + +static struct file_system_type autofs_fs_type = { + .owner = THIS_MODULE, + .name = "autofs", + .mount = autofs_mount, + .kill_sb = autofs_kill_sb, +}; +MODULE_ALIAS_FS("autofs"); + +static int __init init_autofs_fs(void) +{ + int err; + + autofs_dev_ioctl_init(); + + err = register_filesystem(&autofs_fs_type); + if (err) + autofs_dev_ioctl_exit(); + + return err; +} + +static void __exit exit_autofs_fs(void) +{ + autofs_dev_ioctl_exit(); + unregister_filesystem(&autofs_fs_type); +} + +module_init(init_autofs_fs) +module_exit(exit_autofs_fs) +MODULE_LICENSE("GPL"); diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c new file mode 100644 index 000000000000..6262819ede45 --- /dev/null +++ b/fs/autofs/inode.c @@ -0,0 +1,375 @@ +/* + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * Copyright 2005-2006 Ian Kent + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + */ + +#include +#include +#include +#include +#include +#include +#include +#include +#include "autofs_i.h" +#include + +struct autofs_info *autofs_new_ino(struct autofs_sb_info *sbi) +{ + struct autofs_info *ino; + + ino = kzalloc(sizeof(*ino), GFP_KERNEL); + if (ino) { + INIT_LIST_HEAD(&ino->active); + INIT_LIST_HEAD(&ino->expiring); + ino->last_used = jiffies; + ino->sbi = sbi; + } + return ino; +} + +void autofs_clean_ino(struct autofs_info *ino) +{ + ino->uid = GLOBAL_ROOT_UID; + ino->gid = GLOBAL_ROOT_GID; + ino->last_used = jiffies; +} + +void autofs_free_ino(struct autofs_info *ino) +{ + kfree(ino); +} + +void autofs_kill_sb(struct super_block *sb) +{ + struct autofs_sb_info *sbi = autofs_sbi(sb); + + /* + * In the event of a failure in get_sb_nodev the superblock + * info is not present so nothing else has been setup, so + * just call kill_anon_super when we are called from + * deactivate_super. 
+ */ + if (sbi) { + /* Free wait queues, close pipe */ + autofs_catatonic_mode(sbi); + put_pid(sbi->oz_pgrp); + } + + pr_debug("shutting down\n"); + kill_litter_super(sb); + if (sbi) + kfree_rcu(sbi, rcu); +} + +static int autofs_show_options(struct seq_file *m, struct dentry *root) +{ + struct autofs_sb_info *sbi = autofs_sbi(root->d_sb); + struct inode *root_inode = d_inode(root->d_sb->s_root); + + if (!sbi) + return 0; + + seq_printf(m, ",fd=%d", sbi->pipefd); + if (!uid_eq(root_inode->i_uid, GLOBAL_ROOT_UID)) + seq_printf(m, ",uid=%u", + from_kuid_munged(&init_user_ns, root_inode->i_uid)); + if (!gid_eq(root_inode->i_gid, GLOBAL_ROOT_GID)) + seq_printf(m, ",gid=%u", + from_kgid_munged(&init_user_ns, root_inode->i_gid)); + seq_printf(m, ",pgrp=%d", pid_vnr(sbi->oz_pgrp)); + seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ); + seq_printf(m, ",minproto=%d", sbi->min_proto); + seq_printf(m, ",maxproto=%d", sbi->max_proto); + + if (autofs_type_offset(sbi->type)) + seq_printf(m, ",offset"); + else if (autofs_type_direct(sbi->type)) + seq_printf(m, ",direct"); + else + seq_printf(m, ",indirect"); +#ifdef CONFIG_CHECKPOINT_RESTORE + if (sbi->pipe) + seq_printf(m, ",pipe_ino=%ld", file_inode(sbi->pipe)->i_ino); + else + seq_printf(m, ",pipe_ino=-1"); +#endif + return 0; +} + +static void autofs_evict_inode(struct inode *inode) +{ + clear_inode(inode); + kfree(inode->i_private); +} + +static const struct super_operations autofs_sops = { + .statfs = simple_statfs, + .show_options = autofs_show_options, + .evict_inode = autofs_evict_inode, +}; + +enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto, + Opt_indirect, Opt_direct, Opt_offset}; + +static const match_table_t tokens = { + {Opt_fd, "fd=%u"}, + {Opt_uid, "uid=%u"}, + {Opt_gid, "gid=%u"}, + {Opt_pgrp, "pgrp=%u"}, + {Opt_minproto, "minproto=%u"}, + {Opt_maxproto, "maxproto=%u"}, + {Opt_indirect, "indirect"}, + {Opt_direct, "direct"}, + {Opt_offset, "offset"}, + {Opt_err, NULL} +}; + +static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, + int *pgrp, bool *pgrp_set, unsigned int *type, + int *minproto, int *maxproto) +{ + char *p; + substring_t args[MAX_OPT_ARGS]; + int option; + + *uid = current_uid(); + *gid = current_gid(); + + *minproto = AUTOFS_MIN_PROTO_VERSION; + *maxproto = AUTOFS_MAX_PROTO_VERSION; + + *pipefd = -1; + + if (!options) + return 1; + + while ((p = strsep(&options, ",")) != NULL) { + int token; + + if (!*p) + continue; + + token = match_token(p, tokens, args); + switch (token) { + case Opt_fd: + if (match_int(args, pipefd)) + return 1; + break; + case Opt_uid: + if (match_int(args, &option)) + return 1; + *uid = make_kuid(current_user_ns(), option); + if (!uid_valid(*uid)) + return 1; + break; + case Opt_gid: + if (match_int(args, &option)) + return 1; + *gid = make_kgid(current_user_ns(), option); + if (!gid_valid(*gid)) + return 1; + break; + case Opt_pgrp: + if (match_int(args, &option)) + return 1; + *pgrp = option; + *pgrp_set = true; + break; + case Opt_minproto: + if (match_int(args, &option)) + return 1; + *minproto = option; + break; + case Opt_maxproto: + if (match_int(args, &option)) + return 1; + *maxproto = option; + break; + case Opt_indirect: + set_autofs_type_indirect(type); + break; + case Opt_direct: + set_autofs_type_direct(type); + break; + case Opt_offset: + set_autofs_type_offset(type); + break; + default: + return 1; + } + } + return (*pipefd < 0); +} + +int autofs_fill_super(struct super_block *s, void *data, int silent) +{ + struct inode 
*root_inode; + struct dentry *root; + struct file *pipe; + int pipefd; + struct autofs_sb_info *sbi; + struct autofs_info *ino; + int pgrp = 0; + bool pgrp_set = false; + int ret = -EINVAL; + + sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); + if (!sbi) + return -ENOMEM; + pr_debug("starting up, sbi = %p\n", sbi); + + s->s_fs_info = sbi; + sbi->magic = AUTOFS_SBI_MAGIC; + sbi->pipefd = -1; + sbi->pipe = NULL; + sbi->catatonic = 1; + sbi->exp_timeout = 0; + sbi->oz_pgrp = NULL; + sbi->sb = s; + sbi->version = 0; + sbi->sub_version = 0; + set_autofs_type_indirect(&sbi->type); + sbi->min_proto = 0; + sbi->max_proto = 0; + mutex_init(&sbi->wq_mutex); + mutex_init(&sbi->pipe_mutex); + spin_lock_init(&sbi->fs_lock); + sbi->queues = NULL; + spin_lock_init(&sbi->lookup_lock); + INIT_LIST_HEAD(&sbi->active_list); + INIT_LIST_HEAD(&sbi->expiring_list); + s->s_blocksize = 1024; + s->s_blocksize_bits = 10; + s->s_magic = AUTOFS_SUPER_MAGIC; + s->s_op = &autofs_sops; + s->s_d_op = &autofs_dentry_operations; + s->s_time_gran = 1; + + /* + * Get the root inode and dentry, but defer checking for errors. + */ + ino = autofs_new_ino(sbi); + if (!ino) { + ret = -ENOMEM; + goto fail_free; + } + root_inode = autofs_get_inode(s, S_IFDIR | 0755); + root = d_make_root(root_inode); + if (!root) + goto fail_ino; + pipe = NULL; + + root->d_fsdata = ino; + + /* Can this call block? */ + if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid, + &pgrp, &pgrp_set, &sbi->type, &sbi->min_proto, + &sbi->max_proto)) { + pr_err("called with bogus options\n"); + goto fail_dput; + } + + /* Test versions first */ + if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION || + sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) { + pr_err("kernel does not match daemon version " + "daemon (%d, %d) kernel (%d, %d)\n", + sbi->min_proto, sbi->max_proto, + AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION); + goto fail_dput; + } + + /* Establish highest kernel protocol version */ + if (sbi->max_proto > AUTOFS_MAX_PROTO_VERSION) + sbi->version = AUTOFS_MAX_PROTO_VERSION; + else + sbi->version = sbi->max_proto; + sbi->sub_version = AUTOFS_PROTO_SUBVERSION; + + if (pgrp_set) { + sbi->oz_pgrp = find_get_pid(pgrp); + if (!sbi->oz_pgrp) { + pr_err("could not find process group %d\n", + pgrp); + goto fail_dput; + } + } else { + sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID); + } + + if (autofs_type_trigger(sbi->type)) + __managed_dentry_set_managed(root); + + root_inode->i_fop = &autofs_root_operations; + root_inode->i_op = &autofs_dir_inode_operations; + + pr_debug("pipe fd = %d, pgrp = %u\n", pipefd, pid_nr(sbi->oz_pgrp)); + pipe = fget(pipefd); + + if (!pipe) { + pr_err("could not open pipe file descriptor\n"); + goto fail_put_pid; + } + ret = autofs_prepare_pipe(pipe); + if (ret < 0) + goto fail_fput; + sbi->pipe = pipe; + sbi->pipefd = pipefd; + sbi->catatonic = 0; + + /* + * Success! Install the root dentry now to indicate completion. + */ + s->s_root = root; + return 0; + + /* + * Failure ... clean up. 
+ */ +fail_fput: + pr_err("pipe file descriptor does not contain proper ops\n"); + fput(pipe); +fail_put_pid: + put_pid(sbi->oz_pgrp); +fail_dput: + dput(root); + goto fail_free; +fail_ino: + autofs_free_ino(ino); +fail_free: + kfree(sbi); + s->s_fs_info = NULL; + return ret; +} + +struct inode *autofs_get_inode(struct super_block *sb, umode_t mode) +{ + struct inode *inode = new_inode(sb); + + if (inode == NULL) + return NULL; + + inode->i_mode = mode; + if (sb->s_root) { + inode->i_uid = d_inode(sb->s_root)->i_uid; + inode->i_gid = d_inode(sb->s_root)->i_gid; + } + inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); + inode->i_ino = get_next_ino(); + + if (S_ISDIR(mode)) { + set_nlink(inode, 2); + inode->i_op = &autofs_dir_inode_operations; + inode->i_fop = &autofs_dir_operations; + } else if (S_ISLNK(mode)) { + inode->i_op = &autofs_symlink_inode_operations; + } else + WARN_ON(1); + + return inode; +} diff --git a/fs/autofs/root.c b/fs/autofs/root.c new file mode 100644 index 000000000000..a4b36e44f73c --- /dev/null +++ b/fs/autofs/root.c @@ -0,0 +1,942 @@ +/* + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * Copyright 1999-2000 Jeremy Fitzhardinge + * Copyright 2001-2006 Ian Kent + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + */ + +#include +#include +#include +#include +#include +#include +#include +#include + +#include "autofs_i.h" + +static int autofs_dir_symlink(struct inode *, struct dentry *, const char *); +static int autofs_dir_unlink(struct inode *, struct dentry *); +static int autofs_dir_rmdir(struct inode *, struct dentry *); +static int autofs_dir_mkdir(struct inode *, struct dentry *, umode_t); +static long autofs_root_ioctl(struct file *, unsigned int, unsigned long); +#ifdef CONFIG_COMPAT +static long autofs_root_compat_ioctl(struct file *, + unsigned int, unsigned long); +#endif +static int autofs_dir_open(struct inode *inode, struct file *file); +static struct dentry *autofs_lookup(struct inode *, + struct dentry *, unsigned int); +static struct vfsmount *autofs_d_automount(struct path *); +static int autofs_d_manage(const struct path *, bool); +static void autofs_dentry_release(struct dentry *); + +const struct file_operations autofs_root_operations = { + .open = dcache_dir_open, + .release = dcache_dir_close, + .read = generic_read_dir, + .iterate_shared = dcache_readdir, + .llseek = dcache_dir_lseek, + .unlocked_ioctl = autofs_root_ioctl, +#ifdef CONFIG_COMPAT + .compat_ioctl = autofs_root_compat_ioctl, +#endif +}; + +const struct file_operations autofs_dir_operations = { + .open = autofs_dir_open, + .release = dcache_dir_close, + .read = generic_read_dir, + .iterate_shared = dcache_readdir, + .llseek = dcache_dir_lseek, +}; + +const struct inode_operations autofs_dir_inode_operations = { + .lookup = autofs_lookup, + .unlink = autofs_dir_unlink, + .symlink = autofs_dir_symlink, + .mkdir = autofs_dir_mkdir, + .rmdir = autofs_dir_rmdir, +}; + +const struct dentry_operations autofs_dentry_operations = { + .d_automount = autofs_d_automount, + .d_manage = autofs_d_manage, + .d_release = autofs_dentry_release, +}; + +static void autofs_add_active(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino; + + ino = autofs_dentry_ino(dentry); + if (ino) { + spin_lock(&sbi->lookup_lock); + if (!ino->active_count) { + if 
(list_empty(&ino->active)) + list_add(&ino->active, &sbi->active_list); + } + ino->active_count++; + spin_unlock(&sbi->lookup_lock); + } +} + +static void autofs_del_active(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino; + + ino = autofs_dentry_ino(dentry); + if (ino) { + spin_lock(&sbi->lookup_lock); + ino->active_count--; + if (!ino->active_count) { + if (!list_empty(&ino->active)) + list_del_init(&ino->active); + } + spin_unlock(&sbi->lookup_lock); + } +} + +static int autofs_dir_open(struct inode *inode, struct file *file) +{ + struct dentry *dentry = file->f_path.dentry; + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + + pr_debug("file=%p dentry=%p %pd\n", file, dentry, dentry); + + if (autofs_oz_mode(sbi)) + goto out; + + /* + * An empty directory in an autofs file system is always a + * mount point. The daemon must have failed to mount this + * during lookup so it doesn't exist. This can happen, for + * example, if user space returns an incorrect status for a + * mount request. Otherwise we're doing a readdir on the + * autofs file system so just let the libfs routines handle + * it. + */ + spin_lock(&sbi->lookup_lock); + if (!path_is_mountpoint(&file->f_path) && simple_empty(dentry)) { + spin_unlock(&sbi->lookup_lock); + return -ENOENT; + } + spin_unlock(&sbi->lookup_lock); + +out: + return dcache_dir_open(inode, file); +} + +static void autofs_dentry_release(struct dentry *de) +{ + struct autofs_info *ino = autofs_dentry_ino(de); + struct autofs_sb_info *sbi = autofs_sbi(de->d_sb); + + pr_debug("releasing %p\n", de); + + if (!ino) + return; + + if (sbi) { + spin_lock(&sbi->lookup_lock); + if (!list_empty(&ino->active)) + list_del(&ino->active); + if (!list_empty(&ino->expiring)) + list_del(&ino->expiring); + spin_unlock(&sbi->lookup_lock); + } + + autofs_free_ino(ino); +} + +static struct dentry *autofs_lookup_active(struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct dentry *parent = dentry->d_parent; + const struct qstr *name = &dentry->d_name; + unsigned int len = name->len; + unsigned int hash = name->hash; + const unsigned char *str = name->name; + struct list_head *p, *head; + + head = &sbi->active_list; + if (list_empty(head)) + return NULL; + spin_lock(&sbi->lookup_lock); + list_for_each(p, head) { + struct autofs_info *ino; + struct dentry *active; + const struct qstr *qstr; + + ino = list_entry(p, struct autofs_info, active); + active = ino->dentry; + + spin_lock(&active->d_lock); + + /* Already gone? 
*/ + if ((int) d_count(active) <= 0) + goto next; + + qstr = &active->d_name; + + if (active->d_name.hash != hash) + goto next; + if (active->d_parent != parent) + goto next; + + if (qstr->len != len) + goto next; + if (memcmp(qstr->name, str, len)) + goto next; + + if (d_unhashed(active)) { + dget_dlock(active); + spin_unlock(&active->d_lock); + spin_unlock(&sbi->lookup_lock); + return active; + } +next: + spin_unlock(&active->d_lock); + } + spin_unlock(&sbi->lookup_lock); + + return NULL; +} + +static struct dentry *autofs_lookup_expiring(struct dentry *dentry, + bool rcu_walk) +{ + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct dentry *parent = dentry->d_parent; + const struct qstr *name = &dentry->d_name; + unsigned int len = name->len; + unsigned int hash = name->hash; + const unsigned char *str = name->name; + struct list_head *p, *head; + + head = &sbi->expiring_list; + if (list_empty(head)) + return NULL; + spin_lock(&sbi->lookup_lock); + list_for_each(p, head) { + struct autofs_info *ino; + struct dentry *expiring; + const struct qstr *qstr; + + if (rcu_walk) { + spin_unlock(&sbi->lookup_lock); + return ERR_PTR(-ECHILD); + } + + ino = list_entry(p, struct autofs_info, expiring); + expiring = ino->dentry; + + spin_lock(&expiring->d_lock); + + /* We've already been dentry_iput or unlinked */ + if (d_really_is_negative(expiring)) + goto next; + + qstr = &expiring->d_name; + + if (expiring->d_name.hash != hash) + goto next; + if (expiring->d_parent != parent) + goto next; + + if (qstr->len != len) + goto next; + if (memcmp(qstr->name, str, len)) + goto next; + + if (d_unhashed(expiring)) { + dget_dlock(expiring); + spin_unlock(&expiring->d_lock); + spin_unlock(&sbi->lookup_lock); + return expiring; + } +next: + spin_unlock(&expiring->d_lock); + } + spin_unlock(&sbi->lookup_lock); + + return NULL; +} + +static int autofs_mount_wait(const struct path *path, bool rcu_walk) +{ + struct autofs_sb_info *sbi = autofs_sbi(path->dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(path->dentry); + int status = 0; + + if (ino->flags & AUTOFS_INF_PENDING) { + if (rcu_walk) + return -ECHILD; + pr_debug("waiting for mount name=%pd\n", path->dentry); + status = autofs_wait(sbi, path, NFY_MOUNT); + pr_debug("mount wait done status=%d\n", status); + } + ino->last_used = jiffies; + return status; +} + +static int do_expire_wait(const struct path *path, bool rcu_walk) +{ + struct dentry *dentry = path->dentry; + struct dentry *expiring; + + expiring = autofs_lookup_expiring(dentry, rcu_walk); + if (IS_ERR(expiring)) + return PTR_ERR(expiring); + if (!expiring) + return autofs_expire_wait(path, rcu_walk); + else { + const struct path this = { .mnt = path->mnt, .dentry = expiring }; + /* + * If we are racing with expire the request might not + * be quite complete, but the directory has been removed + * so it must have been successful, just wait for it. + */ + autofs_expire_wait(&this, 0); + autofs_del_expiring(expiring); + dput(expiring); + } + return 0; +} + +static struct dentry *autofs_mountpoint_changed(struct path *path) +{ + struct dentry *dentry = path->dentry; + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + + /* + * If this is an indirect mount the dentry could have gone away + * as a result of an expire and a new one created. 
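+ * If it has, repeat the lookup under the parent and switch the + * path over to the replacement dentry.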
+ */ + if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) { + struct dentry *parent = dentry->d_parent; + struct autofs_info *ino; + struct dentry *new; + + new = d_lookup(parent, &dentry->d_name); + if (!new) + return NULL; + ino = autofs_dentry_ino(new); + ino->last_used = jiffies; + dput(path->dentry); + path->dentry = new; + } + return path->dentry; +} + +static struct vfsmount *autofs_d_automount(struct path *path) +{ + struct dentry *dentry = path->dentry; + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); + int status; + + pr_debug("dentry=%p %pd\n", dentry, dentry); + + /* The daemon never triggers a mount. */ + if (autofs_oz_mode(sbi)) + return NULL; + + /* + * If an expire request is pending everyone must wait. + * If the expire fails we're still mounted, so continue + * the follow and return. A return of -EAGAIN (which only + * happens with indirect mounts) means the expire completed + * and the directory was removed, so just go ahead and try + * the mount. + */ + status = do_expire_wait(path, 0); + if (status && status != -EAGAIN) + return NULL; + + /* Callback to the daemon to perform the mount or wait */ + spin_lock(&sbi->fs_lock); + if (ino->flags & AUTOFS_INF_PENDING) { + spin_unlock(&sbi->fs_lock); + status = autofs_mount_wait(path, 0); + if (status) + return ERR_PTR(status); + goto done; + } + + /* + * If the dentry is a symlink it's equivalent to a directory + * having path_is_mountpoint() true, so there's no need to call + * back to the daemon. + */ + if (d_really_is_positive(dentry) && d_is_symlink(dentry)) { + spin_unlock(&sbi->fs_lock); + goto done; + } + + if (!path_is_mountpoint(path)) { + /* + * It's possible that user space hasn't removed directories + * after umounting a rootless multi-mount, although it + * should. For v5 path_has_submounts() is sufficient to + * handle this because the leaves of the directory tree under + * the mount never trigger mounts themselves (they have an + * autofs trigger mount mounted on them). But v4 pseudo direct + * mounts do need the leaves to trigger mounts. In this case + * we have no choice but to use the list_empty() check and + * require user space to behave. + */ + if (sbi->version > 4) { + if (path_has_submounts(path)) { + spin_unlock(&sbi->fs_lock); + goto done; + } + } else { + if (!simple_empty(dentry)) { + spin_unlock(&sbi->fs_lock); + goto done; + } + } + ino->flags |= AUTOFS_INF_PENDING; + spin_unlock(&sbi->fs_lock); + status = autofs_mount_wait(path, 0); + spin_lock(&sbi->fs_lock); + ino->flags &= ~AUTOFS_INF_PENDING; + if (status) { + spin_unlock(&sbi->fs_lock); + return ERR_PTR(status); + } + } + spin_unlock(&sbi->fs_lock); +done: + /* Mount succeeded, check if we ended up with a new dentry */ + dentry = autofs_mountpoint_changed(path); + if (!dentry) + return ERR_PTR(-ENOENT); + + return NULL; +} + +static int autofs_d_manage(const struct path *path, bool rcu_walk) +{ + struct dentry *dentry = path->dentry; + struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); + int status; + + pr_debug("dentry=%p %pd\n", dentry, dentry); + + /* The daemon never waits. */ + if (autofs_oz_mode(sbi)) { + if (!path_is_mountpoint(path)) + return -EISDIR; + return 0; + } + + /* Wait for pending expires */ + if (do_expire_wait(path, rcu_walk) == -ECHILD) + return -ECHILD; + + /* + * This dentry may be under construction so wait on mount + * completion.
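+ * The wait ends when the daemon releases it via autofs_wait_release(), + * driven by its AUTOFS_IOC_READY or AUTOFS_IOC_FAIL ioctl.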
+ */ + status = autofs_mount_wait(path, rcu_walk); + if (status) + return status; + + if (rcu_walk) { + /* We don't need fs_lock in rcu_walk mode, + * just testing 'AUTOFS_INF_WANT_EXPIRE' is enough. + * simple_empty() takes a spinlock, so leave it + * to last. + * We only return -EISDIR when certain this isn't + * a mount-trap. + */ + struct inode *inode; + + if (ino->flags & AUTOFS_INF_WANT_EXPIRE) + return 0; + if (path_is_mountpoint(path)) + return 0; + inode = d_inode_rcu(dentry); + if (inode && S_ISLNK(inode->i_mode)) + return -EISDIR; + if (list_empty(&dentry->d_subdirs)) + return 0; + if (!simple_empty(dentry)) + return -EISDIR; + return 0; + } + + spin_lock(&sbi->fs_lock); + /* + * If the dentry has been selected for expire while we slept + * on the lock then it might go away. We'll deal with that in + * ->d_automount() and wait on a new mount if the expire + * succeeds or return here if it doesn't (since there's no + * mount to follow with a rootless multi-mount). + */ + if (!(ino->flags & AUTOFS_INF_EXPIRING)) { + /* + * Any needed mounting has been completed and the path + * updated so check if this is a rootless multi-mount so + * we can avoid needless calls to ->d_automount() and avoid + * an incorrect ELOOP error return. + */ + if ((!path_is_mountpoint(path) && !simple_empty(dentry)) || + (d_really_is_positive(dentry) && d_is_symlink(dentry))) + status = -EISDIR; + } + spin_unlock(&sbi->fs_lock); + + return status; +} + +/* Lookups in the root directory */ +static struct dentry *autofs_lookup(struct inode *dir, + struct dentry *dentry, unsigned int flags) +{ + struct autofs_sb_info *sbi; + struct autofs_info *ino; + struct dentry *active; + + pr_debug("name = %pd\n", dentry); + + /* File name too long to exist */ + if (dentry->d_name.len > NAME_MAX) + return ERR_PTR(-ENAMETOOLONG); + + sbi = autofs_sbi(dir->i_sb); + + pr_debug("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d\n", + current->pid, task_pgrp_nr(current), sbi->catatonic, + autofs_oz_mode(sbi)); + + active = autofs_lookup_active(dentry); + if (active) + return active; + else { + /* + * A dentry that is not within the root can never trigger a + * mount operation, unless the directory already exists, so we + * can return failure immediately. The daemon however does need + * to create directories within the file system.
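+ * The daemon does that via autofs_dir_mkdir() and autofs_dir_symlink() + * below, which refuse callers outside oz mode with -EACCES.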
+ */ + if (!autofs_oz_mode(sbi) && !IS_ROOT(dentry->d_parent)) + return ERR_PTR(-ENOENT); + + /* Mark entries in the root as mount triggers */ + if (IS_ROOT(dentry->d_parent) && + autofs_type_indirect(sbi->type)) + __managed_dentry_set_managed(dentry); + + ino = autofs_new_ino(sbi); + if (!ino) + return ERR_PTR(-ENOMEM); + + dentry->d_fsdata = ino; + ino->dentry = dentry; + + autofs_add_active(dentry); + } + return NULL; +} + +static int autofs_dir_symlink(struct inode *dir, + struct dentry *dentry, + const char *symname) +{ + struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); + struct autofs_info *p_ino; + struct inode *inode; + size_t size = strlen(symname); + char *cp; + + pr_debug("%s <- %pd\n", symname, dentry); + + if (!autofs_oz_mode(sbi)) + return -EACCES; + + BUG_ON(!ino); + + autofs_clean_ino(ino); + + autofs_del_active(dentry); + + cp = kmalloc(size + 1, GFP_KERNEL); + if (!cp) + return -ENOMEM; + + strcpy(cp, symname); + + inode = autofs_get_inode(dir->i_sb, S_IFLNK | 0555); + if (!inode) { + kfree(cp); + return -ENOMEM; + } + inode->i_private = cp; + inode->i_size = size; + d_add(dentry, inode); + + dget(dentry); + atomic_inc(&ino->count); + p_ino = autofs_dentry_ino(dentry->d_parent); + if (p_ino && !IS_ROOT(dentry)) + atomic_inc(&p_ino->count); + + dir->i_mtime = current_time(dir); + + return 0; +} + +/* + * NOTE! + * + * Normal filesystems would do a "d_delete()" to tell the VFS dcache + * that the file no longer exists. However, doing that means that the + * VFS layer can turn the dentry into a negative dentry. We don't want + * this, because the unlink is probably the result of an expire. + * We simply d_drop it and add it to an expiring list in the super block, + * which allows the dentry lookup to check for an incomplete expire. + * + * If a process is blocked on the dentry waiting for the expire to finish, + * it will invalidate the dentry and try to mount with a new one. + * + * Also see autofs_dir_rmdir(). + */ +static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); + struct autofs_info *p_ino; + + /* This allows root to remove symlinks */ + if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + if (atomic_dec_and_test(&ino->count)) { + p_ino = autofs_dentry_ino(dentry->d_parent); + if (p_ino && !IS_ROOT(dentry)) + atomic_dec(&p_ino->count); + } + dput(ino->dentry); + + d_inode(dentry)->i_size = 0; + clear_nlink(d_inode(dentry)); + + dir->i_mtime = current_time(dir); + + spin_lock(&sbi->lookup_lock); + __autofs_add_expiring(dentry); + d_drop(dentry); + spin_unlock(&sbi->lookup_lock); + + return 0; +} + +/* + * Version 4 of autofs provides a pseudo direct mount implementation + * that relies on directories at the leaves of a directory tree under + * an indirect mount to trigger mounts. To allow for this we need to + * set the DMANAGED_AUTOMOUNT and DMANAGED_TRANSIT flags on the leaves + * of the directory tree. There is no need to clear the automount flag + * following a mount or restore it after an expire because these mounts + * are always covered. However, it is necessary to ensure that these + * flags are clear on non-empty directories to avoid unnecessary calls + * during path walks.
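+ * autofs_clear_leaf_automount_flags() below sets the parent managed + * again once removal of a leaf is about to leave it empty.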
+ */ +static void autofs_set_leaf_automount_flags(struct dentry *dentry) +{ + struct dentry *parent; + + /* root and dentries in the root are already handled */ + if (IS_ROOT(dentry->d_parent)) + return; + + managed_dentry_set_managed(dentry); + + parent = dentry->d_parent; + /* only consider parents below dentries in the root */ + if (IS_ROOT(parent->d_parent)) + return; + managed_dentry_clear_managed(parent); +} + +static void autofs_clear_leaf_automount_flags(struct dentry *dentry) +{ + struct list_head *d_child; + struct dentry *parent; + + /* flags for dentries in the root are handled elsewhere */ + if (IS_ROOT(dentry->d_parent)) + return; + + managed_dentry_clear_managed(dentry); + + parent = dentry->d_parent; + /* only consider parents below dentries in the root */ + if (IS_ROOT(parent->d_parent)) + return; + d_child = &dentry->d_child; + /* Set parent managed if it's becoming empty */ + if (d_child->next == &parent->d_subdirs && + d_child->prev == &parent->d_subdirs) + managed_dentry_set_managed(parent); +} + +static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry) +{ + struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); + struct autofs_info *p_ino; + + pr_debug("dentry %p, removing %pd\n", dentry, dentry); + + if (!autofs_oz_mode(sbi)) + return -EACCES; + + spin_lock(&sbi->lookup_lock); + if (!simple_empty(dentry)) { + spin_unlock(&sbi->lookup_lock); + return -ENOTEMPTY; + } + __autofs_add_expiring(dentry); + d_drop(dentry); + spin_unlock(&sbi->lookup_lock); + + if (sbi->version < 5) + autofs_clear_leaf_automount_flags(dentry); + + if (atomic_dec_and_test(&ino->count)) { + p_ino = autofs_dentry_ino(dentry->d_parent); + if (p_ino && dentry->d_parent != dentry) + atomic_dec(&p_ino->count); + } + dput(ino->dentry); + d_inode(dentry)->i_size = 0; + clear_nlink(d_inode(dentry)); + + if (dir->i_nlink) + drop_nlink(dir); + + return 0; +} + +static int autofs_dir_mkdir(struct inode *dir, + struct dentry *dentry, umode_t mode) +{ + struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); + struct autofs_info *ino = autofs_dentry_ino(dentry); + struct autofs_info *p_ino; + struct inode *inode; + + if (!autofs_oz_mode(sbi)) + return -EACCES; + + pr_debug("dentry %p, creating %pd\n", dentry, dentry); + + BUG_ON(!ino); + + autofs_clean_ino(ino); + + autofs_del_active(dentry); + + inode = autofs_get_inode(dir->i_sb, S_IFDIR | mode); + if (!inode) + return -ENOMEM; + d_add(dentry, inode); + + if (sbi->version < 5) + autofs_set_leaf_automount_flags(dentry); + + dget(dentry); + atomic_inc(&ino->count); + p_ino = autofs_dentry_ino(dentry->d_parent); + if (p_ino && !IS_ROOT(dentry)) + atomic_inc(&p_ino->count); + inc_nlink(dir); + dir->i_mtime = current_time(dir); + + return 0; +} + +/* Get/set timeout ioctl() operation */ +#ifdef CONFIG_COMPAT +static inline int autofs_compat_get_set_timeout(struct autofs_sb_info *sbi, + compat_ulong_t __user *p) +{ + unsigned long ntimeout; + int rv; + + rv = get_user(ntimeout, p); + if (rv) + goto error; + + rv = put_user(sbi->exp_timeout/HZ, p); + if (rv) + goto error; + + if (ntimeout > UINT_MAX/HZ) + sbi->exp_timeout = 0; + else + sbi->exp_timeout = ntimeout * HZ; + + return 0; +error: + return rv; +} +#endif + +static inline int autofs_get_set_timeout(struct autofs_sb_info *sbi, + unsigned long __user *p) +{ + unsigned long ntimeout; + int rv; + + rv = get_user(ntimeout, p); + if (rv) + goto error; + + rv = put_user(sbi->exp_timeout/HZ, p); + if (rv) + goto error; + + if (ntimeout > ULONG_MAX/HZ) +
sbi->exp_timeout = 0; + else + sbi->exp_timeout = ntimeout * HZ; + + return 0; +error: + return rv; +} + +/* Return protocol version */ +static inline int autofs_get_protover(struct autofs_sb_info *sbi, + int __user *p) +{ + return put_user(sbi->version, p); +} + +/* Return protocol sub version */ +static inline int autofs_get_protosubver(struct autofs_sb_info *sbi, + int __user *p) +{ + return put_user(sbi->sub_version, p); +} + +/* + * Tells the daemon whether it can umount the autofs mount. + */ +static inline int autofs_ask_umount(struct vfsmount *mnt, int __user *p) +{ + int status = 0; + + if (may_umount(mnt)) + status = 1; + + pr_debug("may umount %d\n", status); + + status = put_user(status, p); + + return status; +} + +/* Identify autofs_dentries - this is so we can tell if there's + * an extra dentry refcount or not. We only hold a refcount on the + * dentry if it's non-negative (i.e., d_inode != NULL). + */ +int is_autofs_dentry(struct dentry *dentry) +{ + return dentry && d_really_is_positive(dentry) && + dentry->d_op == &autofs_dentry_operations && + dentry->d_fsdata != NULL; +} + +/* + * ioctl()s on the root directory are the chief method for the daemon to + * generate kernel reactions. + */ +static int autofs_root_ioctl_unlocked(struct inode *inode, struct file *filp, + unsigned int cmd, unsigned long arg) +{ + struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb); + void __user *p = (void __user *)arg; + + pr_debug("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n", + cmd, arg, sbi, task_pgrp_nr(current)); + + if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) || + _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT) + return -ENOTTY; + + if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) + return -EPERM; + + switch (cmd) { + case AUTOFS_IOC_READY: /* Wait queue: go ahead and retry */ + return autofs_wait_release(sbi, (autofs_wqt_t) arg, 0); + case AUTOFS_IOC_FAIL: /* Wait queue: fail with ENOENT */ + return autofs_wait_release(sbi, (autofs_wqt_t) arg, -ENOENT); + case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */ + autofs_catatonic_mode(sbi); + return 0; + case AUTOFS_IOC_PROTOVER: /* Get protocol version */ + return autofs_get_protover(sbi, p); + case AUTOFS_IOC_PROTOSUBVER: /* Get protocol sub version */ + return autofs_get_protosubver(sbi, p); + case AUTOFS_IOC_SETTIMEOUT: + return autofs_get_set_timeout(sbi, p); +#ifdef CONFIG_COMPAT + case AUTOFS_IOC_SETTIMEOUT32: + return autofs_compat_get_set_timeout(sbi, p); +#endif + + case AUTOFS_IOC_ASKUMOUNT: + return autofs_ask_umount(filp->f_path.mnt, p); + + /* return a single thing to expire */ + case AUTOFS_IOC_EXPIRE: + return autofs_expire_run(inode->i_sb, filp->f_path.mnt, sbi, p); + /* same as above, but can send multiple expires through pipe */ + case AUTOFS_IOC_EXPIRE_MULTI: + return autofs_expire_multi(inode->i_sb, + filp->f_path.mnt, sbi, p); + + default: + return -EINVAL; + } +} + +static long autofs_root_ioctl(struct file *filp, + unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + + return autofs_root_ioctl_unlocked(inode, filp, cmd, arg); +} + +#ifdef CONFIG_COMPAT +static long autofs_root_compat_ioctl(struct file *filp, + unsigned int cmd, unsigned long arg) +{ + struct inode *inode = file_inode(filp); + int ret; + + if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL) + ret = autofs_root_ioctl_unlocked(inode, filp, cmd, arg); + else + ret = autofs_root_ioctl_unlocked(inode, filp, cmd, + (unsigned long) compat_ptr(arg)); + + return ret; +} +#endif diff
--git a/fs/autofs/symlink.c b/fs/autofs/symlink.c new file mode 100644 index 000000000000..aad3902c0cc1 --- /dev/null +++ b/fs/autofs/symlink.c @@ -0,0 +1,29 @@ +/* + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + */ + +#include "autofs_i.h" + +static const char *autofs_get_link(struct dentry *dentry, + struct inode *inode, + struct delayed_call *done) +{ + struct autofs_sb_info *sbi; + struct autofs_info *ino; + + if (!dentry) + return ERR_PTR(-ECHILD); + sbi = autofs_sbi(dentry->d_sb); + ino = autofs_dentry_ino(dentry); + if (ino && !autofs_oz_mode(sbi)) + ino->last_used = jiffies; + return d_inode(dentry)->i_private; +} + +const struct inode_operations autofs_symlink_inode_operations = { + .get_link = autofs_get_link +}; diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c new file mode 100644 index 000000000000..8a566fa66afe --- /dev/null +++ b/fs/autofs/waitq.c @@ -0,0 +1,559 @@ +/* + * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved + * Copyright 2001-2006 Ian Kent + * + * This file is part of the Linux kernel and is made available under + * the terms of the GNU General Public License, version 2, or at your + * option, any later version, incorporated herein by reference. + */ + +#include <linux/slab.h> +#include <linux/time.h> +#include <linux/signal.h> +#include <linux/sched/signal.h> +#include <linux/file.h> +#include "autofs_i.h" + +/* We make this a static variable rather than a part of the superblock; it + * is better if we don't reassign numbers easily even across filesystems + */ +static autofs_wqt_t autofs_next_wait_queue = 1; + +void autofs_catatonic_mode(struct autofs_sb_info *sbi) +{ + struct autofs_wait_queue *wq, *nwq; + + mutex_lock(&sbi->wq_mutex); + if (sbi->catatonic) { + mutex_unlock(&sbi->wq_mutex); + return; + } + + pr_debug("entering catatonic mode\n"); + + sbi->catatonic = 1; + wq = sbi->queues; + sbi->queues = NULL; /* Erase all wait queues */ + while (wq) { + nwq = wq->next; + wq->status = -ENOENT; /* Magic is gone - report failure */ + kfree(wq->name.name); + wq->name.name = NULL; + wq->wait_ctr--; + wake_up_interruptible(&wq->queue); + wq = nwq; + } + fput(sbi->pipe); /* Close the pipe */ + sbi->pipe = NULL; + sbi->pipefd = -1; + mutex_unlock(&sbi->wq_mutex); +} + +static int autofs_write(struct autofs_sb_info *sbi, + struct file *file, const void *addr, int bytes) +{ + unsigned long sigpipe, flags; + const char *data = (const char *)addr; + ssize_t wr = 0; + + sigpipe = sigismember(&current->pending.signal, SIGPIPE); + + mutex_lock(&sbi->pipe_mutex); + while (bytes) { + wr = __kernel_write(file, data, bytes, &file->f_pos); + if (wr <= 0) + break; + data += wr; + bytes -= wr; + } + mutex_unlock(&sbi->pipe_mutex); + + /* Keep the currently executing process from receiving a + * SIGPIPE unless it was already supposed to get one + */ + if (wr == -EPIPE && !sigpipe) { + spin_lock_irqsave(&current->sighand->siglock, flags); + sigdelset(&current->pending.signal, SIGPIPE); + recalc_sigpending(); + spin_unlock_irqrestore(&current->sighand->siglock, flags); + } + + /* if 'wr' returned 0 (impossible) we assume -EIO (safe) */ + return bytes == 0 ? 0 : wr < 0 ?
wr : -EIO; +} + +static void autofs_notify_daemon(struct autofs_sb_info *sbi, + struct autofs_wait_queue *wq, + int type) +{ + union { + struct autofs_packet_hdr hdr; + union autofs_packet_union v4_pkt; + union autofs_v5_packet_union v5_pkt; + } pkt; + struct file *pipe = NULL; + size_t pktsz; + int ret; + + pr_debug("wait id = 0x%08lx, name = %.*s, type=%d\n", + (unsigned long) wq->wait_queue_token, + wq->name.len, wq->name.name, type); + + memset(&pkt, 0, sizeof(pkt)); /* For security reasons */ + + pkt.hdr.proto_version = sbi->version; + pkt.hdr.type = type; + + switch (type) { + /* Kernel protocol v4 missing and expire packets */ + case autofs_ptype_missing: + { + struct autofs_packet_missing *mp = &pkt.v4_pkt.missing; + + pktsz = sizeof(*mp); + + mp->wait_queue_token = wq->wait_queue_token; + mp->len = wq->name.len; + memcpy(mp->name, wq->name.name, wq->name.len); + mp->name[wq->name.len] = '\0'; + break; + } + case autofs_ptype_expire_multi: + { + struct autofs_packet_expire_multi *ep = + &pkt.v4_pkt.expire_multi; + + pktsz = sizeof(*ep); + + ep->wait_queue_token = wq->wait_queue_token; + ep->len = wq->name.len; + memcpy(ep->name, wq->name.name, wq->name.len); + ep->name[wq->name.len] = '\0'; + break; + } + /* + * Kernel protocol v5 packet for handling indirect and direct + * mount missing and expire requests + */ + case autofs_ptype_missing_indirect: + case autofs_ptype_expire_indirect: + case autofs_ptype_missing_direct: + case autofs_ptype_expire_direct: + { + struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet; + struct user_namespace *user_ns = sbi->pipe->f_cred->user_ns; + + pktsz = sizeof(*packet); + + packet->wait_queue_token = wq->wait_queue_token; + packet->len = wq->name.len; + memcpy(packet->name, wq->name.name, wq->name.len); + packet->name[wq->name.len] = '\0'; + packet->dev = wq->dev; + packet->ino = wq->ino; + packet->uid = from_kuid_munged(user_ns, wq->uid); + packet->gid = from_kgid_munged(user_ns, wq->gid); + packet->pid = wq->pid; + packet->tgid = wq->tgid; + break; + } + default: + pr_warn("bad type %d!\n", type); + mutex_unlock(&sbi->wq_mutex); + return; + } + + pipe = get_file(sbi->pipe); + + mutex_unlock(&sbi->wq_mutex); + + switch (ret = autofs_write(sbi, pipe, &pkt, pktsz)) { + case 0: + break; + case -ENOMEM: + case -ERESTARTSYS: + /* Just fail this one */ + autofs_wait_release(sbi, wq->wait_queue_token, ret); + break; + default: + autofs_catatonic_mode(sbi); + break; + } + fput(pipe); +} + +static int autofs_getpath(struct autofs_sb_info *sbi, + struct dentry *dentry, char **name) +{ + struct dentry *root = sbi->sb->s_root; + struct dentry *tmp; + char *buf; + char *p; + int len; + unsigned seq; + +rename_retry: + buf = *name; + len = 0; + + seq = read_seqbegin(&rename_lock); + rcu_read_lock(); + spin_lock(&sbi->fs_lock); + for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent) + len += tmp->d_name.len + 1; + + if (!len || --len > NAME_MAX) { + spin_unlock(&sbi->fs_lock); + rcu_read_unlock(); + if (read_seqretry(&rename_lock, seq)) + goto rename_retry; + return 0; + } + + *(buf + len) = '\0'; + p = buf + len - dentry->d_name.len; + strncpy(p, dentry->d_name.name, dentry->d_name.len); + + for (tmp = dentry->d_parent; tmp != root ; tmp = tmp->d_parent) { + *(--p) = '/'; + p -= tmp->d_name.len; + strncpy(p, tmp->d_name.name, tmp->d_name.len); + } + spin_unlock(&sbi->fs_lock); + rcu_read_unlock(); + if (read_seqretry(&rename_lock, seq)) + goto rename_retry; + + return len; +} + +static struct autofs_wait_queue * +autofs_find_wait(struct autofs_sb_info 
*sbi, const struct qstr *qstr) +{ + struct autofs_wait_queue *wq; + + for (wq = sbi->queues; wq; wq = wq->next) { + if (wq->name.hash == qstr->hash && + wq->name.len == qstr->len && + wq->name.name && + !memcmp(wq->name.name, qstr->name, qstr->len)) + break; + } + return wq; +} + +/* + * Check if we have a valid request. + * Returns + * 1 if the request should continue. + * In this case we can return an autofs_wait_queue entry if one is + * found or NULL to indicate a new wait needs to be created. + * 0 or a negative errno if the request shouldn't continue. + */ +static int validate_request(struct autofs_wait_queue **wait, + struct autofs_sb_info *sbi, + const struct qstr *qstr, + const struct path *path, enum autofs_notify notify) +{ + struct dentry *dentry = path->dentry; + struct autofs_wait_queue *wq; + struct autofs_info *ino; + + if (sbi->catatonic) + return -ENOENT; + + /* Wait in progress, continue */ + wq = autofs_find_wait(sbi, qstr); + if (wq) { + *wait = wq; + return 1; + } + + *wait = NULL; + + /* If we don't yet have any info this is a new request */ + ino = autofs_dentry_ino(dentry); + if (!ino) + return 1; + + /* + * If we've been asked to wait on an existing expire (NFY_NONE) + * but there is no wait in the queue ... + */ + if (notify == NFY_NONE) { + /* + * Either we've beaten the pending expire to post its + * wait or it finished while we waited on the mutex. + * So we need to wait until either the wait appears + * or the expire finishes. + */ + + while (ino->flags & AUTOFS_INF_EXPIRING) { + mutex_unlock(&sbi->wq_mutex); + schedule_timeout_interruptible(HZ/10); + if (mutex_lock_interruptible(&sbi->wq_mutex)) + return -EINTR; + + if (sbi->catatonic) + return -ENOENT; + + wq = autofs_find_wait(sbi, qstr); + if (wq) { + *wait = wq; + return 1; + } + } + + /* + * Not ideal but the status has already gone. Of the two + * cases where we wait on NFY_NONE neither depends on the + * return status of the wait. + */ + return 0; + } + + /* + * If we've been asked to trigger a mount and the request + * completed while we waited on the mutex ... + */ + if (notify == NFY_MOUNT) { + struct dentry *new = NULL; + struct path this; + int valid = 1; + + /* + * If the dentry was successfully mounted while we slept + * on the wait queue mutex we can return success. If it + * isn't mounted (doesn't have submounts for the case of + * a multi-mount with no mount at its base) we can + * continue on and create a new request. + */ + if (!IS_ROOT(dentry)) { + if (d_unhashed(dentry) && + d_really_is_positive(dentry)) { + struct dentry *parent = dentry->d_parent; + + new = d_lookup(parent, &dentry->d_name); + if (new) + dentry = new; + } + } + this.mnt = path->mnt; + this.dentry = dentry; + if (path_has_submounts(&this)) + valid = 0; + + if (new) + dput(new); + return valid; + } + + return 1; +} + +int autofs_wait(struct autofs_sb_info *sbi, + const struct path *path, enum autofs_notify notify) +{ + struct dentry *dentry = path->dentry; + struct autofs_wait_queue *wq; + struct qstr qstr; + char *name; + int status, ret, type; + pid_t pid; + pid_t tgid; + + /* In catatonic mode, we don't wait for anybody */ + if (sbi->catatonic) + return -ENOENT; + + /* + * Try translating pids to the namespace of the daemon. + * + * Zero means failure: we are in an unrelated pid namespace.
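+ * In that case the daemon could never identify the requester, so the + * wait is failed with -ENOENT.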
+ */ + pid = task_pid_nr_ns(current, ns_of_pid(sbi->oz_pgrp)); + tgid = task_tgid_nr_ns(current, ns_of_pid(sbi->oz_pgrp)); + if (pid == 0 || tgid == 0) + return -ENOENT; + + if (d_really_is_negative(dentry)) { + /* + * A wait for a negative dentry is invalid for certain + * cases. A direct or offset mount "always" has its mount + * point directory created and so the request dentry must + * be positive or the map key doesn't exist. The situation + * is very similar for indirect mounts except only dentries + * in the root of the autofs file system may be negative. + */ + if (autofs_type_trigger(sbi->type)) + return -ENOENT; + else if (!IS_ROOT(dentry->d_parent)) + return -ENOENT; + } + + name = kmalloc(NAME_MAX + 1, GFP_KERNEL); + if (!name) + return -ENOMEM; + + /* If this is a direct mount request create a dummy name */ + if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type)) + qstr.len = sprintf(name, "%p", dentry); + else { + qstr.len = autofs_getpath(sbi, dentry, &name); + if (!qstr.len) { + kfree(name); + return -ENOENT; + } + } + qstr.name = name; + qstr.hash = full_name_hash(dentry, name, qstr.len); + + if (mutex_lock_interruptible(&sbi->wq_mutex)) { + kfree(qstr.name); + return -EINTR; + } + + ret = validate_request(&wq, sbi, &qstr, path, notify); + if (ret <= 0) { + if (ret != -EINTR) + mutex_unlock(&sbi->wq_mutex); + kfree(qstr.name); + return ret; + } + + if (!wq) { + /* Create a new wait queue */ + wq = kmalloc(sizeof(struct autofs_wait_queue), GFP_KERNEL); + if (!wq) { + kfree(qstr.name); + mutex_unlock(&sbi->wq_mutex); + return -ENOMEM; + } + + wq->wait_queue_token = autofs_next_wait_queue; + if (++autofs_next_wait_queue == 0) + autofs_next_wait_queue = 1; + wq->next = sbi->queues; + sbi->queues = wq; + init_waitqueue_head(&wq->queue); + memcpy(&wq->name, &qstr, sizeof(struct qstr)); + wq->dev = autofs_get_dev(sbi); + wq->ino = autofs_get_ino(sbi); + wq->uid = current_uid(); + wq->gid = current_gid(); + wq->pid = pid; + wq->tgid = tgid; + wq->status = -EINTR; /* Status return if interrupted */ + wq->wait_ctr = 2; + + if (sbi->version < 5) { + if (notify == NFY_MOUNT) + type = autofs_ptype_missing; + else + type = autofs_ptype_expire_multi; + } else { + if (notify == NFY_MOUNT) + type = autofs_type_trigger(sbi->type) ? + autofs_ptype_missing_direct : + autofs_ptype_missing_indirect; + else + type = autofs_type_trigger(sbi->type) ? + autofs_ptype_expire_direct : + autofs_ptype_expire_indirect; + } + + pr_debug("new wait id = 0x%08lx, name = %.*s, nfy=%d\n", + (unsigned long) wq->wait_queue_token, wq->name.len, + wq->name.name, notify); + + /* + * autofs_notify_daemon() may block; it will unlock ->wq_mutex + */ + autofs_notify_daemon(sbi, wq, type); + } else { + wq->wait_ctr++; + pr_debug("existing wait id = 0x%08lx, name = %.*s, nfy=%d\n", + (unsigned long) wq->wait_queue_token, wq->name.len, + wq->name.name, notify); + mutex_unlock(&sbi->wq_mutex); + kfree(qstr.name); + } + + /* + * wq->name.name is NULL iff the lock is already released + * or the mount has been made catatonic. + */ + wait_event_killable(wq->queue, wq->name.name == NULL); + status = wq->status; + + /* + * For direct and offset mounts we need to track the requester's + * uid and gid in the dentry info struct. This is so it can be + * supplied, on request, by the misc device ioctl interface. + * This is needed during daemon restart when reconnecting + * to existing, active, autofs mounts. The uid and gid (and + * related string values) may be used for macro substitution + * in autofs mount maps.
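+ * The recorded uid and gid are handed back to user space by the + * requester ioctl (autofs_dev_ioctl_requester()).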
+ */ + if (!status) { + struct autofs_info *ino; + struct dentry *de = NULL; + + /* direct mount or browsable map */ + ino = autofs_dentry_ino(dentry); + if (!ino) { + /* If not lookup actual dentry used */ + de = d_lookup(dentry->d_parent, &dentry->d_name); + if (de) + ino = autofs_dentry_ino(de); + } + + /* Set mount requester */ + if (ino) { + spin_lock(&sbi->fs_lock); + ino->uid = wq->uid; + ino->gid = wq->gid; + spin_unlock(&sbi->fs_lock); + } + + if (de) + dput(de); + } + + /* Are we the last process to need status? */ + mutex_lock(&sbi->wq_mutex); + if (!--wq->wait_ctr) + kfree(wq); + mutex_unlock(&sbi->wq_mutex); + + return status; +} + + +int autofs_wait_release(struct autofs_sb_info *sbi, + autofs_wqt_t wait_queue_token, int status) +{ + struct autofs_wait_queue *wq, **wql; + + mutex_lock(&sbi->wq_mutex); + for (wql = &sbi->queues; (wq = *wql) != NULL; wql = &wq->next) { + if (wq->wait_queue_token == wait_queue_token) + break; + } + + if (!wq) { + mutex_unlock(&sbi->wq_mutex); + return -EINVAL; + } + + *wql = wq->next; /* Unlink from chain */ + kfree(wq->name.name); + wq->name.name = NULL; /* Do not wait on this queue */ + wq->status = status; + wake_up(&wq->queue); + if (!--wq->wait_ctr) + kfree(wq); + mutex_unlock(&sbi->wq_mutex); + + return 0; +} From 6ed38746041810219ca6b676904f05fe3a8fd333 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Thu, 7 Jun 2018 17:11:17 -0700 Subject: [PATCH 109/118] autofs: update fs/autofs4/Kconfig Update Kconfig and add a deprecation warning. [raven@themaw.net: make autofs4 Kconfig depend on AUTOFS_FS] Link: http://lkml.kernel.org/r/152687649097.8263.7046086367407522029.stgit@pluto.themaw.net Link: http://lkml.kernel.org/r/152626706133.28589.11994171621899212952.stgit@pluto.themaw.net Signed-off-by: Ian Kent Tested-by: Randy Dunlap Cc: Al Viro Cc: Arnd Bergmann Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/Kconfig | 32 ++++++++++++++++++++++---------- 1 file changed, 22 insertions(+), 10 deletions(-) diff --git a/fs/autofs4/Kconfig b/fs/autofs4/Kconfig index 44727bf18297..53bc592a250d 100644 --- a/fs/autofs4/Kconfig +++ b/fs/autofs4/Kconfig @@ -1,5 +1,6 @@ config AUTOFS4_FS - tristate "Kernel automounter version 4 support (also supports v3)" + tristate "Kernel automounter version 4 support (also supports v3 and v5)" + default n help The automounter is a tool to automatically mount remote file systems on demand. This implementation is partially kernel-based to reduce @@ -7,14 +8,25 @@ config AUTOFS4_FS automounter (amd), which is a pure user space daemon. To use the automounter you need the user-space tools from - ; you also - want to answer Y to "NFS file system support", below. + ; you also want + to answer Y to "NFS file system support", below. - To compile this support as a module, choose M here: the module will be - called autofs4. You will need to add "alias autofs autofs4" to your - modules configuration file. + This module is in the process of being renamed from autofs4 to + autofs. Since autofs is now the only module that provides the + autofs file system, the module is not version 4 specific. - If you are not a part of a fairly large, distributed network or - don't have a laptop which needs to dynamically reconfigure to the - local network, you probably do not need an automounter, and can say - N here. + The autofs4 module is now built from the source located in + fs/autofs. The autofs4 directory and its configuration entry + will be removed two kernel versions from the inclusion of this + change.
+ + Changes that will need to be made should be limited to: + - source include statements should be changed from autofs_fs4.h to + autofs_fs.h since these two header files have been merged. + - user space scripts that manually load autofs4.ko should be + changed to load autofs.ko. But since the module directory name + and the module name are the same as the file system name there + is no need to load the module manually. + - any "alias autofs autofs4" will need to be removed. + + Please configure AUTOFS_FS instead of AUTOFS4_FS from now on. From f7e095f5d113ab3b433c2302f5ea308285d370cb Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Thu, 7 Jun 2018 17:11:22 -0700 Subject: [PATCH 110/118] autofs: update fs/autofs4/Makefile Update Makefile to build from source in fs/autofs instead of fs/autofs4. Link: http://lkml.kernel.org/r/152626706824.28589.1915028175544560855.stgit@pluto.themaw.net Signed-off-by: Ian Kent Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/Makefile | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/fs/autofs4/Makefile b/fs/autofs4/Makefile index a811c1f7d9ab..417dd726d9ef 100644 --- a/fs/autofs4/Makefile +++ b/fs/autofs4/Makefile @@ -4,4 +4,6 @@ obj-$(CONFIG_AUTOFS4_FS) += autofs4.o -autofs4-objs := init.o inode.o root.o symlink.o waitq.o expire.o dev-ioctl.o +autofs4-objs := ../autofs/init.o ../autofs/inode.o ../autofs/root.o \ + ../autofs/symlink.o ../autofs/waitq.o ../autofs/expire.o \ + ../autofs/dev-ioctl.o From 8547190490759608dfa51987374ada2ce8a2331d Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Thu, 7 Jun 2018 17:11:26 -0700 Subject: [PATCH 111/118] autofs: delete fs/autofs4 source files Delete the now unused autofs4 module files. Link: http://lkml.kernel.org/r/152626707391.28589.3553309771262313504.stgit@pluto.themaw.net Signed-off-by: Ian Kent Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/autofs_i.h | 273 ------------ fs/autofs4/dev-ioctl.c | 761 --------------------------------- fs/autofs4/expire.c | 632 --------------------------- fs/autofs4/init.c | 48 --- fs/autofs4/inode.c | 375 ---------------- fs/autofs4/root.c | 942 ----------------------------------------- fs/autofs4/symlink.c | 29 -- fs/autofs4/waitq.c | 559 ------------------------ 8 files changed, 3619 deletions(-) delete mode 100644 fs/autofs4/autofs_i.h delete mode 100644 fs/autofs4/dev-ioctl.c delete mode 100644 fs/autofs4/expire.c delete mode 100644 fs/autofs4/init.c delete mode 100644 fs/autofs4/inode.c delete mode 100644 fs/autofs4/root.c delete mode 100644 fs/autofs4/symlink.c delete mode 100644 fs/autofs4/waitq.c diff --git a/fs/autofs4/autofs_i.h b/fs/autofs4/autofs_i.h deleted file mode 100644 index 9110b66c7ef1..000000000000 --- a/fs/autofs4/autofs_i.h +++ /dev/null @@ -1,273 +0,0 @@ -/* - * Copyright 1997-1998 Transmeta Corporation - All Rights Reserved - * Copyright 2005-2006 Ian Kent - * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference.
- */ - -/* Internal header file for autofs */ - -#include -#include - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -/* This is the range of ioctl() numbers we claim as ours */ -#define AUTOFS_IOC_FIRST AUTOFS_IOC_READY -#define AUTOFS_IOC_COUNT 32 - -#define AUTOFS_DEV_IOCTL_IOC_FIRST (AUTOFS_DEV_IOCTL_VERSION) -#define AUTOFS_DEV_IOCTL_IOC_COUNT \ - (AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD - AUTOFS_DEV_IOCTL_VERSION_CMD) - -#ifdef pr_fmt -#undef pr_fmt -#endif -#define pr_fmt(fmt) KBUILD_MODNAME ":pid:%d:%s: " fmt, current->pid, __func__ - -/* - * Unified info structure. This is pointed to by both the dentry and - * inode structures. Each file in the filesystem has an instance of this - * structure. It holds a reference to the dentry, so dentries are never - * flushed while the file exists. All name lookups are dealt with at the - * dentry level, although the filesystem can interfere in the validation - * process. Readdir is implemented by traversing the dentry lists. - */ -struct autofs_info { - struct dentry *dentry; - struct inode *inode; - - int flags; - - struct completion expire_complete; - - struct list_head active; - int active_count; - - struct list_head expiring; - - struct autofs_sb_info *sbi; - unsigned long last_used; - atomic_t count; - - kuid_t uid; - kgid_t gid; -}; - -#define AUTOFS_INF_EXPIRING (1<<0) /* dentry in the process of expiring */ -#define AUTOFS_INF_WANT_EXPIRE (1<<1) /* the dentry is being considered - * for expiry, so RCU_walk is - * not permitted. If it progresses to - * actual expiry attempt, the flag is - * not cleared when EXPIRING is set - - * in that case it gets cleared only - * when it comes to clearing EXPIRING. - */ -#define AUTOFS_INF_PENDING (1<<2) /* dentry pending mount */ - -struct autofs_wait_queue { - wait_queue_head_t queue; - struct autofs_wait_queue *next; - autofs_wqt_t wait_queue_token; - /* We use the following to see what we are waiting for */ - struct qstr name; - u32 dev; - u64 ino; - kuid_t uid; - kgid_t gid; - pid_t pid; - pid_t tgid; - /* This is for status reporting upon return */ - int status; - unsigned int wait_ctr; -}; - -#define AUTOFS_SBI_MAGIC 0x6d4a556d - -struct autofs_sb_info { - u32 magic; - int pipefd; - struct file *pipe; - struct pid *oz_pgrp; - int catatonic; - int version; - int sub_version; - int min_proto; - int max_proto; - unsigned long exp_timeout; - unsigned int type; - struct super_block *sb; - struct mutex wq_mutex; - struct mutex pipe_mutex; - spinlock_t fs_lock; - struct autofs_wait_queue *queues; /* Wait queue pointer */ - spinlock_t lookup_lock; - struct list_head active_list; - struct list_head expiring_list; - struct rcu_head rcu; -}; - -static inline struct autofs_sb_info *autofs_sbi(struct super_block *sb) -{ - return (struct autofs_sb_info *)(sb->s_fs_info); -} - -static inline struct autofs_info *autofs_dentry_ino(struct dentry *dentry) -{ - return (struct autofs_info *)(dentry->d_fsdata); -} - -/* autofs_oz_mode(): do we see the man behind the curtain? (The - * processes which do manipulations for us in user space sees the raw - * filesystem without "magic".) 
- */ -static inline int autofs_oz_mode(struct autofs_sb_info *sbi) -{ - return sbi->catatonic || task_pgrp(current) == sbi->oz_pgrp; -} - -struct inode *autofs_get_inode(struct super_block *, umode_t); -void autofs_free_ino(struct autofs_info *); - -/* Expiration */ -int is_autofs_dentry(struct dentry *); -int autofs_expire_wait(const struct path *path, int rcu_walk); -int autofs_expire_run(struct super_block *, struct vfsmount *, - struct autofs_sb_info *, - struct autofs_packet_expire __user *); -int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, - struct autofs_sb_info *sbi, int when); -int autofs_expire_multi(struct super_block *, struct vfsmount *, - struct autofs_sb_info *, int __user *); -struct dentry *autofs_expire_direct(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, int how); -struct dentry *autofs_expire_indirect(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, int how); - -/* Device node initialization */ - -int autofs_dev_ioctl_init(void); -void autofs_dev_ioctl_exit(void); - -/* Operations structures */ - -extern const struct inode_operations autofs_symlink_inode_operations; -extern const struct inode_operations autofs_dir_inode_operations; -extern const struct file_operations autofs_dir_operations; -extern const struct file_operations autofs_root_operations; -extern const struct dentry_operations autofs_dentry_operations; - -/* VFS automount flags management functions */ -static inline void __managed_dentry_set_managed(struct dentry *dentry) -{ - dentry->d_flags |= (DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT); -} - -static inline void managed_dentry_set_managed(struct dentry *dentry) -{ - spin_lock(&dentry->d_lock); - __managed_dentry_set_managed(dentry); - spin_unlock(&dentry->d_lock); -} - -static inline void __managed_dentry_clear_managed(struct dentry *dentry) -{ - dentry->d_flags &= ~(DCACHE_NEED_AUTOMOUNT|DCACHE_MANAGE_TRANSIT); -} - -static inline void managed_dentry_clear_managed(struct dentry *dentry) -{ - spin_lock(&dentry->d_lock); - __managed_dentry_clear_managed(dentry); - spin_unlock(&dentry->d_lock); -} - -/* Initializing function */ - -int autofs_fill_super(struct super_block *, void *, int); -struct autofs_info *autofs_new_ino(struct autofs_sb_info *); -void autofs_clean_ino(struct autofs_info *); - -static inline int autofs_prepare_pipe(struct file *pipe) -{ - if (!(pipe->f_mode & FMODE_CAN_WRITE)) - return -EINVAL; - if (!S_ISFIFO(file_inode(pipe)->i_mode)) - return -EINVAL; - /* We want a packet pipe */ - pipe->f_flags |= O_DIRECT; - return 0; -} - -/* Queue management functions */ - -int autofs_wait(struct autofs_sb_info *, - const struct path *, enum autofs_notify); -int autofs_wait_release(struct autofs_sb_info *, autofs_wqt_t, int); -void autofs_catatonic_mode(struct autofs_sb_info *); - -static inline u32 autofs_get_dev(struct autofs_sb_info *sbi) -{ - return new_encode_dev(sbi->sb->s_dev); -} - -static inline u64 autofs_get_ino(struct autofs_sb_info *sbi) -{ - return d_inode(sbi->sb->s_root)->i_ino; -} - -static inline void __autofs_add_expiring(struct dentry *dentry) -{ - struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); - struct autofs_info *ino = autofs_dentry_ino(dentry); - - if (ino) { - if (list_empty(&ino->expiring)) - list_add(&ino->expiring, &sbi->expiring_list); - } -} - -static inline void autofs_add_expiring(struct dentry *dentry) -{ - struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); - struct autofs_info *ino = autofs_dentry_ino(dentry); - - if 
(ino) { - spin_lock(&sbi->lookup_lock); - if (list_empty(&ino->expiring)) - list_add(&ino->expiring, &sbi->expiring_list); - spin_unlock(&sbi->lookup_lock); - } -} - -static inline void autofs_del_expiring(struct dentry *dentry) -{ - struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); - struct autofs_info *ino = autofs_dentry_ino(dentry); - - if (ino) { - spin_lock(&sbi->lookup_lock); - if (!list_empty(&ino->expiring)) - list_del_init(&ino->expiring); - spin_unlock(&sbi->lookup_lock); - } -} - -void autofs_kill_sb(struct super_block *); diff --git a/fs/autofs4/dev-ioctl.c b/fs/autofs4/dev-ioctl.c deleted file mode 100644 index a2281ab2b957..000000000000 --- a/fs/autofs4/dev-ioctl.c +++ /dev/null @@ -1,761 +0,0 @@ -/* - * Copyright 2008 Red Hat, Inc. All rights reserved. - * Copyright 2008 Ian Kent - * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include -#include - -#include "autofs_i.h" - -/* - * This module implements an interface for routing autofs ioctl control - * commands via a miscellaneous device file. - * - * The alternate interface is needed because we need to be able open - * an ioctl file descriptor on an autofs mount that may be covered by - * another mount. This situation arises when starting automount(8) - * or other user space daemon which uses direct mounts or offset - * mounts (used for autofs lazy mount/umount of nested mount trees), - * which have been left busy at at service shutdown. - */ - -typedef int (*ioctl_fn)(struct file *, struct autofs_sb_info *, - struct autofs_dev_ioctl *); - -static int check_name(const char *name) -{ - if (!strchr(name, '/')) - return -EINVAL; - return 0; -} - -/* - * Check a string doesn't overrun the chunk of - * memory we copied from user land. - */ -static int invalid_str(char *str, size_t size) -{ - if (memchr(str, 0, size)) - return 0; - return -EINVAL; -} - -/* - * Check that the user compiled against correct version of autofs - * misc device code. - * - * As well as checking the version compatibility this always copies - * the kernel interface version out. - */ -static int check_dev_ioctl_version(int cmd, struct autofs_dev_ioctl *param) -{ - int err = 0; - - if ((param->ver_major != AUTOFS_DEV_IOCTL_VERSION_MAJOR) || - (param->ver_minor > AUTOFS_DEV_IOCTL_VERSION_MINOR)) { - pr_warn("ioctl control interface version mismatch: " - "kernel(%u.%u), user(%u.%u), cmd(0x%08x)\n", - AUTOFS_DEV_IOCTL_VERSION_MAJOR, - AUTOFS_DEV_IOCTL_VERSION_MINOR, - param->ver_major, param->ver_minor, cmd); - err = -EINVAL; - } - - /* Fill in the kernel version. */ - param->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR; - param->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; - - return err; -} - -/* - * Copy parameter control struct, including a possible path allocated - * at the end of the struct. 
- */ -static struct autofs_dev_ioctl * -copy_dev_ioctl(struct autofs_dev_ioctl __user *in) -{ - struct autofs_dev_ioctl tmp, *res; - - if (copy_from_user(&tmp, in, AUTOFS_DEV_IOCTL_SIZE)) - return ERR_PTR(-EFAULT); - - if (tmp.size < AUTOFS_DEV_IOCTL_SIZE) - return ERR_PTR(-EINVAL); - - if (tmp.size > AUTOFS_DEV_IOCTL_SIZE + PATH_MAX) - return ERR_PTR(-ENAMETOOLONG); - - res = memdup_user(in, tmp.size); - if (!IS_ERR(res)) - res->size = tmp.size; - - return res; -} - -static inline void free_dev_ioctl(struct autofs_dev_ioctl *param) -{ - kfree(param); -} - -/* - * Check sanity of parameter control fields and if a path is present - * check that it is terminated and contains at least one "/". - */ -static int validate_dev_ioctl(int cmd, struct autofs_dev_ioctl *param) -{ - int err; - - err = check_dev_ioctl_version(cmd, param); - if (err) { - pr_warn("invalid device control module version " - "supplied for cmd(0x%08x)\n", cmd); - goto out; - } - - if (param->size > AUTOFS_DEV_IOCTL_SIZE) { - err = invalid_str(param->path, param->size - AUTOFS_DEV_IOCTL_SIZE); - if (err) { - pr_warn( - "path string terminator missing for cmd(0x%08x)\n", - cmd); - goto out; - } - - err = check_name(param->path); - if (err) { - pr_warn("invalid path supplied for cmd(0x%08x)\n", - cmd); - goto out; - } - } - - err = 0; -out: - return err; -} - -/* - * Get the autofs super block info struct from the file opened on - * the autofs mount point. - */ -static struct autofs_sb_info *autofs_dev_ioctl_sbi(struct file *f) -{ - struct autofs_sb_info *sbi = NULL; - struct inode *inode; - - if (f) { - inode = file_inode(f); - sbi = autofs_sbi(inode->i_sb); - } - return sbi; -} - -/* Return autofs dev ioctl version */ -static int autofs_dev_ioctl_version(struct file *fp, - struct autofs_sb_info *sbi, - struct autofs_dev_ioctl *param) -{ - /* This should have already been set. */ - param->ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR; - param->ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR; - return 0; -} - -/* Return autofs module protocol version */ -static int autofs_dev_ioctl_protover(struct file *fp, - struct autofs_sb_info *sbi, - struct autofs_dev_ioctl *param) -{ - param->protover.version = sbi->version; - return 0; -} - -/* Return autofs module protocol sub version */ -static int autofs_dev_ioctl_protosubver(struct file *fp, - struct autofs_sb_info *sbi, - struct autofs_dev_ioctl *param) -{ - param->protosubver.sub_version = sbi->sub_version; - return 0; -} - -/* Find the topmost mount satisfying test() */ -static int find_autofs_mount(const char *pathname, - struct path *res, - int test(const struct path *path, void *data), - void *data) -{ - struct path path; - int err; - - err = kern_path_mountpoint(AT_FDCWD, pathname, &path, 0); - if (err) - return err; - err = -ENOENT; - while (path.dentry == path.mnt->mnt_root) { - if (path.dentry->d_sb->s_magic == AUTOFS_SUPER_MAGIC) { - if (test(&path, data)) { - path_get(&path); - *res = path; - err = 0; - break; - } - } - if (!follow_up(&path)) - break; - } - path_put(&path); - return err; -} - -static int test_by_dev(const struct path *path, void *p) -{ - return path->dentry->d_sb->s_dev == *(dev_t *)p; -} - -static int test_by_type(const struct path *path, void *p) -{ - struct autofs_info *ino = autofs_dentry_ino(path->dentry); - - return ino && ino->sbi->type & *(unsigned *)p; -} - -/* - * Open a file descriptor on the autofs mount point corresponding - * to the given path and device number (aka. new_encode_dev(sb->s_dev)). 
- */ -static int autofs_dev_ioctl_open_mountpoint(const char *name, dev_t devid) -{ - int err, fd; - - fd = get_unused_fd_flags(O_CLOEXEC); - if (likely(fd >= 0)) { - struct file *filp; - struct path path; - - err = find_autofs_mount(name, &path, test_by_dev, &devid); - if (err) - goto out; - - filp = dentry_open(&path, O_RDONLY, current_cred()); - path_put(&path); - if (IS_ERR(filp)) { - err = PTR_ERR(filp); - goto out; - } - - fd_install(fd, filp); - } - - return fd; - -out: - put_unused_fd(fd); - return err; -} - -/* Open a file descriptor on an autofs mount point */ -static int autofs_dev_ioctl_openmount(struct file *fp, - struct autofs_sb_info *sbi, - struct autofs_dev_ioctl *param) -{ - const char *path; - dev_t devid; - int err, fd; - - /* param->path has already been checked */ - if (!param->openmount.devid) - return -EINVAL; - - param->ioctlfd = -1; - - path = param->path; - devid = new_decode_dev(param->openmount.devid); - - err = 0; - fd = autofs_dev_ioctl_open_mountpoint(path, devid); - if (unlikely(fd < 0)) { - err = fd; - goto out; - } - - param->ioctlfd = fd; -out: - return err; -} - -/* Close file descriptor allocated above (user can also use close(2)). */ -static int autofs_dev_ioctl_closemount(struct file *fp, - struct autofs_sb_info *sbi, - struct autofs_dev_ioctl *param) -{ - return ksys_close(param->ioctlfd); -} - -/* - * Send "ready" status for an existing wait (either a mount or an expire - * request). - */ -static int autofs_dev_ioctl_ready(struct file *fp, - struct autofs_sb_info *sbi, - struct autofs_dev_ioctl *param) -{ - autofs_wqt_t token; - - token = (autofs_wqt_t) param->ready.token; - return autofs_wait_release(sbi, token, 0); -} - -/* - * Send "fail" status for an existing wait (either a mount or an expire - * request). - */ -static int autofs_dev_ioctl_fail(struct file *fp, - struct autofs_sb_info *sbi, - struct autofs_dev_ioctl *param) -{ - autofs_wqt_t token; - int status; - - token = (autofs_wqt_t) param->fail.token; - status = param->fail.status < 0 ? param->fail.status : -ENOENT; - return autofs_wait_release(sbi, token, status); -} - -/* - * Set the pipe fd for kernel communication to the daemon. - * - * Normally this is set at mount using an option but if we - * are reconnecting to a busy mount then we need to use this - * to tell the autofs mount about the new kernel pipe fd. In - * order to protect mounts against incorrectly setting the - * pipefd we also require that the autofs mount be catatonic. - * - * This also sets the process group id used to identify the - * controlling process (eg. the owning automount(8) daemon). 
- */ -static int autofs_dev_ioctl_setpipefd(struct file *fp, - struct autofs_sb_info *sbi, - struct autofs_dev_ioctl *param) -{ - int pipefd; - int err = 0; - struct pid *new_pid = NULL; - - if (param->setpipefd.pipefd == -1) - return -EINVAL; - - pipefd = param->setpipefd.pipefd; - - mutex_lock(&sbi->wq_mutex); - if (!sbi->catatonic) { - mutex_unlock(&sbi->wq_mutex); - return -EBUSY; - } else { - struct file *pipe; - - new_pid = get_task_pid(current, PIDTYPE_PGID); - - if (ns_of_pid(new_pid) != ns_of_pid(sbi->oz_pgrp)) { - pr_warn("not allowed to change PID namespace\n"); - err = -EINVAL; - goto out; - } - - pipe = fget(pipefd); - if (!pipe) { - err = -EBADF; - goto out; - } - if (autofs_prepare_pipe(pipe) < 0) { - err = -EPIPE; - fput(pipe); - goto out; - } - swap(sbi->oz_pgrp, new_pid); - sbi->pipefd = pipefd; - sbi->pipe = pipe; - sbi->catatonic = 0; - } -out: - put_pid(new_pid); - mutex_unlock(&sbi->wq_mutex); - return err; -} - -/* - * Make the autofs mount point catatonic, no longer responsive to - * mount requests. Also closes the kernel pipe file descriptor. - */ -static int autofs_dev_ioctl_catatonic(struct file *fp, - struct autofs_sb_info *sbi, - struct autofs_dev_ioctl *param) -{ - autofs_catatonic_mode(sbi); - return 0; -} - -/* Set the autofs mount timeout */ -static int autofs_dev_ioctl_timeout(struct file *fp, - struct autofs_sb_info *sbi, - struct autofs_dev_ioctl *param) -{ - unsigned long timeout; - - timeout = param->timeout.timeout; - param->timeout.timeout = sbi->exp_timeout / HZ; - sbi->exp_timeout = timeout * HZ; - return 0; -} - -/* - * Return the uid and gid of the last request for the mount - * - * When reconstructing an autofs mount tree with active mounts - * we need to re-connect to mounts that may have used the original - * process uid and gid (or string variations of them) for mount - * lookups within the map entry. - */ -static int autofs_dev_ioctl_requester(struct file *fp, - struct autofs_sb_info *sbi, - struct autofs_dev_ioctl *param) -{ - struct autofs_info *ino; - struct path path; - dev_t devid; - int err = -ENOENT; - - if (param->size <= AUTOFS_DEV_IOCTL_SIZE) { - err = -EINVAL; - goto out; - } - - devid = sbi->sb->s_dev; - - param->requester.uid = param->requester.gid = -1; - - err = find_autofs_mount(param->path, &path, test_by_dev, &devid); - if (err) - goto out; - - ino = autofs_dentry_ino(path.dentry); - if (ino) { - err = 0; - autofs_expire_wait(&path, 0); - spin_lock(&sbi->fs_lock); - param->requester.uid = - from_kuid_munged(current_user_ns(), ino->uid); - param->requester.gid = - from_kgid_munged(current_user_ns(), ino->gid); - spin_unlock(&sbi->fs_lock); - } - path_put(&path); -out: - return err; -} - -/* - * Call repeatedly until it returns -EAGAIN, meaning there's nothing - * more that can be done. - */ -static int autofs_dev_ioctl_expire(struct file *fp, - struct autofs_sb_info *sbi, - struct autofs_dev_ioctl *param) -{ - struct vfsmount *mnt; - int how; - - how = param->expire.how; - mnt = fp->f_path.mnt; - - return autofs_do_expire_multi(sbi->sb, mnt, sbi, how); -} - -/* Check if autofs mount point is in use */ -static int autofs_dev_ioctl_askumount(struct file *fp, - struct autofs_sb_info *sbi, - struct autofs_dev_ioctl *param) -{ - param->askumount.may_umount = 0; - if (may_umount(fp->f_path.mnt)) - param->askumount.may_umount = 1; - return 0; -} - -/* - * Check if the given path is a mountpoint. - * - * If we are supplied with the file descriptor of an autofs - * mount we're looking for a specific mount. 
In this case - * the path is considered a mountpoint if it is itself a - * mountpoint or contains a mount, such as a multi-mount - * without a root mount. In this case we return 1 if the - * path is a mount point and the super magic of the covering - * mount if there is one or 0 if it isn't a mountpoint. - * - * If we aren't supplied with a file descriptor then we - * lookup the path and check if it is the root of a mount. - * If a type is given we are looking for a particular autofs - * mount and if we don't find a match we return fail. If the - * located path is the root of a mount we return 1 along with - * the super magic of the mount or 0 otherwise. - * - * In both cases the the device number (as returned by - * new_encode_dev()) is also returned. - */ -static int autofs_dev_ioctl_ismountpoint(struct file *fp, - struct autofs_sb_info *sbi, - struct autofs_dev_ioctl *param) -{ - struct path path; - const char *name; - unsigned int type; - unsigned int devid, magic; - int err = -ENOENT; - - if (param->size <= AUTOFS_DEV_IOCTL_SIZE) { - err = -EINVAL; - goto out; - } - - name = param->path; - type = param->ismountpoint.in.type; - - param->ismountpoint.out.devid = devid = 0; - param->ismountpoint.out.magic = magic = 0; - - if (!fp || param->ioctlfd == -1) { - if (autofs_type_any(type)) - err = kern_path_mountpoint(AT_FDCWD, - name, &path, LOOKUP_FOLLOW); - else - err = find_autofs_mount(name, &path, - test_by_type, &type); - if (err) - goto out; - devid = new_encode_dev(path.dentry->d_sb->s_dev); - err = 0; - if (path.mnt->mnt_root == path.dentry) { - err = 1; - magic = path.dentry->d_sb->s_magic; - } - } else { - dev_t dev = sbi->sb->s_dev; - - err = find_autofs_mount(name, &path, test_by_dev, &dev); - if (err) - goto out; - - devid = new_encode_dev(dev); - - err = path_has_submounts(&path); - - if (follow_down_one(&path)) - magic = path.dentry->d_sb->s_magic; - } - - param->ismountpoint.out.devid = devid; - param->ismountpoint.out.magic = magic; - path_put(&path); -out: - return err; -} - -/* - * Our range of ioctl numbers isn't 0 based so we need to shift - * the array index by _IOC_NR(AUTOFS_CTL_IOC_FIRST) for the table - * lookup. - */ -#define cmd_idx(cmd) (cmd - _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST)) - -static ioctl_fn lookup_dev_ioctl(unsigned int cmd) -{ - static ioctl_fn _ioctls[] = { - autofs_dev_ioctl_version, - autofs_dev_ioctl_protover, - autofs_dev_ioctl_protosubver, - autofs_dev_ioctl_openmount, - autofs_dev_ioctl_closemount, - autofs_dev_ioctl_ready, - autofs_dev_ioctl_fail, - autofs_dev_ioctl_setpipefd, - autofs_dev_ioctl_catatonic, - autofs_dev_ioctl_timeout, - autofs_dev_ioctl_requester, - autofs_dev_ioctl_expire, - autofs_dev_ioctl_askumount, - autofs_dev_ioctl_ismountpoint, - }; - unsigned int idx = cmd_idx(cmd); - - return (idx >= ARRAY_SIZE(_ioctls)) ? 
NULL : _ioctls[idx]; -} - -/* ioctl dispatcher */ -static int _autofs_dev_ioctl(unsigned int command, - struct autofs_dev_ioctl __user *user) -{ - struct autofs_dev_ioctl *param; - struct file *fp; - struct autofs_sb_info *sbi; - unsigned int cmd_first, cmd; - ioctl_fn fn = NULL; - int err = 0; - - cmd_first = _IOC_NR(AUTOFS_DEV_IOCTL_IOC_FIRST); - cmd = _IOC_NR(command); - - if (_IOC_TYPE(command) != _IOC_TYPE(AUTOFS_DEV_IOCTL_IOC_FIRST) || - cmd - cmd_first > AUTOFS_DEV_IOCTL_IOC_COUNT) { - return -ENOTTY; - } - - /* Only root can use ioctls other than AUTOFS_DEV_IOCTL_VERSION_CMD - * and AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD - */ - if (cmd != AUTOFS_DEV_IOCTL_VERSION_CMD && - cmd != AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD && - !capable(CAP_SYS_ADMIN)) - return -EPERM; - - /* Copy the parameters into kernel space. */ - param = copy_dev_ioctl(user); - if (IS_ERR(param)) - return PTR_ERR(param); - - err = validate_dev_ioctl(command, param); - if (err) - goto out; - - fn = lookup_dev_ioctl(cmd); - if (!fn) { - pr_warn("unknown command 0x%08x\n", command); - err = -ENOTTY; - goto out; - } - - fp = NULL; - sbi = NULL; - - /* - * For obvious reasons the openmount can't have a file - * descriptor yet. We don't take a reference to the - * file during close to allow for immediate release, - * and the same for retrieving ioctl version. - */ - if (cmd != AUTOFS_DEV_IOCTL_VERSION_CMD && - cmd != AUTOFS_DEV_IOCTL_OPENMOUNT_CMD && - cmd != AUTOFS_DEV_IOCTL_CLOSEMOUNT_CMD) { - fp = fget(param->ioctlfd); - if (!fp) { - if (cmd == AUTOFS_DEV_IOCTL_ISMOUNTPOINT_CMD) - goto cont; - err = -EBADF; - goto out; - } - - sbi = autofs_dev_ioctl_sbi(fp); - if (!sbi || sbi->magic != AUTOFS_SBI_MAGIC) { - err = -EINVAL; - fput(fp); - goto out; - } - - /* - * Admin needs to be able to set the mount catatonic in - * order to be able to perform the re-open. 
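[Aside: the other side of this dispatcher is plain ioctl(2) on the control device. A hedged userspace sketch querying the interface version, which is one of the two commands permitted without CAP_SYS_ADMIN; it assumes the uapi header <linux/auto_dev-ioctl.h> is installed.]

#include <fcntl.h>
#include <stdio.h>
#include <string.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/auto_dev-ioctl.h>

int main(void)
{
    struct autofs_dev_ioctl param;
    int devfd;

    devfd = open("/dev/" AUTOFS_DEVICE_NAME, O_RDONLY);
    if (devfd == -1) {
        perror("open");
        return 1;
    }

    /* callers must declare the interface version they speak */
    memset(&param, 0, sizeof(param));
    param.ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR;
    param.ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR;
    param.size = sizeof(param);    /* AUTOFS_DEV_IOCTL_SIZE: no path appended */
    param.ioctlfd = -1;

    if (ioctl(devfd, AUTOFS_DEV_IOCTL_VERSION, &param) == -1) {
        perror("AUTOFS_DEV_IOCTL_VERSION");
        close(devfd);
        return 1;
    }
    printf("dev ioctl interface %u.%u\n", param.ver_major, param.ver_minor);
    close(devfd);
    return 0;
}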
- */ - if (!autofs_oz_mode(sbi) && - cmd != AUTOFS_DEV_IOCTL_CATATONIC_CMD) { - err = -EACCES; - fput(fp); - goto out; - } - } -cont: - err = fn(fp, sbi, param); - - if (fp) - fput(fp); - if (err >= 0 && copy_to_user(user, param, AUTOFS_DEV_IOCTL_SIZE)) - err = -EFAULT; -out: - free_dev_ioctl(param); - return err; -} - -static long autofs_dev_ioctl(struct file *file, unsigned int command, - unsigned long u) -{ - int err; - - err = _autofs_dev_ioctl(command, (struct autofs_dev_ioctl __user *) u); - return (long) err; -} - -#ifdef CONFIG_COMPAT -static long autofs_dev_ioctl_compat(struct file *file, unsigned int command, - unsigned long u) -{ - return autofs_dev_ioctl(file, command, (unsigned long) compat_ptr(u)); -} -#else -#define autofs_dev_ioctl_compat NULL -#endif - -static const struct file_operations _dev_ioctl_fops = { - .unlocked_ioctl = autofs_dev_ioctl, - .compat_ioctl = autofs_dev_ioctl_compat, - .owner = THIS_MODULE, - .llseek = noop_llseek, -}; - -static struct miscdevice _autofs_dev_ioctl_misc = { - .minor = AUTOFS_MINOR, - .name = AUTOFS_DEVICE_NAME, - .fops = &_dev_ioctl_fops, - .mode = 0644, -}; - -MODULE_ALIAS_MISCDEV(AUTOFS_MINOR); -MODULE_ALIAS("devname:autofs"); - -/* Register/deregister misc character device */ -int __init autofs_dev_ioctl_init(void) -{ - int r; - - r = misc_register(&_autofs_dev_ioctl_misc); - if (r) { - pr_err("misc_register failed for control device\n"); - return r; - } - - return 0; -} - -void autofs_dev_ioctl_exit(void) -{ - misc_deregister(&_autofs_dev_ioctl_misc); -} diff --git a/fs/autofs4/expire.c b/fs/autofs4/expire.c deleted file mode 100644 index 36f16b67a3bf..000000000000 --- a/fs/autofs4/expire.c +++ /dev/null @@ -1,632 +0,0 @@ -/* - * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved - * Copyright 1999-2000 Jeremy Fitzhardinge - * Copyright 2001-2006 Ian Kent - * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. - */ - -#include "autofs_i.h" - -static unsigned long now; - -/* Check if a dentry can be expired */ -static inline int autofs_can_expire(struct dentry *dentry, - unsigned long timeout, int do_now) -{ - struct autofs_info *ino = autofs_dentry_ino(dentry); - - /* dentry in the process of being deleted */ - if (ino == NULL) - return 0; - - if (!do_now) { - /* Too young to die */ - if (!timeout || time_after(ino->last_used + timeout, now)) - return 0; - } - return 1; -} - -/* Check a mount point for busyness */ -static int autofs_mount_busy(struct vfsmount *mnt, struct dentry *dentry) -{ - struct dentry *top = dentry; - struct path path = {.mnt = mnt, .dentry = dentry}; - int status = 1; - - pr_debug("dentry %p %pd\n", dentry, dentry); - - path_get(&path); - - if (!follow_down_one(&path)) - goto done; - - if (is_autofs_dentry(path.dentry)) { - struct autofs_sb_info *sbi = autofs_sbi(path.dentry->d_sb); - - /* This is an autofs submount, we can't expire it */ - if (autofs_type_indirect(sbi->type)) - goto done; - } - - /* Update the expiry counter if fs is busy */ - if (!may_umount_tree(path.mnt)) { - struct autofs_info *ino; - - ino = autofs_dentry_ino(top); - ino->last_used = jiffies; - goto done; - } - - status = 0; -done: - pr_debug("returning = %d\n", status); - path_put(&path); - return status; -} - -/* - * Calculate and dget next entry in the subdirs list under root. 
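[Aside: autofs_can_expire() above leans on time_after() so the age test stays correct when jiffies wraps. A self-contained model of that check; the macro body is copied in here for illustration, the kernel's lives in <linux/jiffies.h>.]

#include <stdio.h>

#define time_after(a, b)    ((long)((b) - (a)) < 0)

static int can_expire(unsigned long last_used, unsigned long timeout,
                      unsigned long now)
{
    /* too young to die: last_used + timeout is still in the future */
    if (!timeout || time_after(last_used + timeout, now))
        return 0;
    return 1;
}

int main(void)
{
    unsigned long now = 1000;

    printf("%d\n", can_expire(100, 500, now));    /* 1: idle past the timeout */
    printf("%d\n", can_expire(900, 500, now));    /* 0: still too young */
    /* across the wrap point the subtraction still does the right thing */
    printf("%d\n", can_expire(~0UL - 10, 500, 489));    /* 1: exactly 500 idle */
    return 0;
}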
- */ -static struct dentry *get_next_positive_subdir(struct dentry *prev, - struct dentry *root) -{ - struct autofs_sb_info *sbi = autofs_sbi(root->d_sb); - struct list_head *next; - struct dentry *q; - - spin_lock(&sbi->lookup_lock); - spin_lock(&root->d_lock); - - if (prev) - next = prev->d_child.next; - else { - prev = dget_dlock(root); - next = prev->d_subdirs.next; - } - -cont: - if (next == &root->d_subdirs) { - spin_unlock(&root->d_lock); - spin_unlock(&sbi->lookup_lock); - dput(prev); - return NULL; - } - - q = list_entry(next, struct dentry, d_child); - - spin_lock_nested(&q->d_lock, DENTRY_D_LOCK_NESTED); - /* Already gone or negative dentry (under construction) - try next */ - if (!d_count(q) || !simple_positive(q)) { - spin_unlock(&q->d_lock); - next = q->d_child.next; - goto cont; - } - dget_dlock(q); - spin_unlock(&q->d_lock); - spin_unlock(&root->d_lock); - spin_unlock(&sbi->lookup_lock); - - dput(prev); - - return q; -} - -/* - * Calculate and dget next entry in top down tree traversal. - */ -static struct dentry *get_next_positive_dentry(struct dentry *prev, - struct dentry *root) -{ - struct autofs_sb_info *sbi = autofs_sbi(root->d_sb); - struct list_head *next; - struct dentry *p, *ret; - - if (prev == NULL) - return dget(root); - - spin_lock(&sbi->lookup_lock); -relock: - p = prev; - spin_lock(&p->d_lock); -again: - next = p->d_subdirs.next; - if (next == &p->d_subdirs) { - while (1) { - struct dentry *parent; - - if (p == root) { - spin_unlock(&p->d_lock); - spin_unlock(&sbi->lookup_lock); - dput(prev); - return NULL; - } - - parent = p->d_parent; - if (!spin_trylock(&parent->d_lock)) { - spin_unlock(&p->d_lock); - cpu_relax(); - goto relock; - } - spin_unlock(&p->d_lock); - next = p->d_child.next; - p = parent; - if (next != &parent->d_subdirs) - break; - } - } - ret = list_entry(next, struct dentry, d_child); - - spin_lock_nested(&ret->d_lock, DENTRY_D_LOCK_NESTED); - /* Negative dentry - try next */ - if (!simple_positive(ret)) { - spin_unlock(&p->d_lock); - lock_set_subclass(&ret->d_lock.dep_map, 0, _RET_IP_); - p = ret; - goto again; - } - dget_dlock(ret); - spin_unlock(&ret->d_lock); - spin_unlock(&p->d_lock); - spin_unlock(&sbi->lookup_lock); - - dput(prev); - - return ret; -} - -/* - * Check a direct mount point for busyness. - * Direct mounts have similar expiry semantics to tree mounts. - * The tree is not busy iff no mountpoints are busy and there are no - * autofs submounts. - */ -static int autofs_direct_busy(struct vfsmount *mnt, - struct dentry *top, - unsigned long timeout, - int do_now) -{ - pr_debug("top %p %pd\n", top, top); - - /* If it's busy update the expiry counters */ - if (!may_umount_tree(mnt)) { - struct autofs_info *ino; - - ino = autofs_dentry_ino(top); - if (ino) - ino->last_used = jiffies; - return 1; - } - - /* Timeout of a direct mount is determined by its top dentry */ - if (!autofs_can_expire(top, timeout, do_now)) - return 1; - - return 0; -} - -/* - * Check a directory tree of mount points for busyness - * The tree is not busy iff no mountpoints are busy - */ -static int autofs_tree_busy(struct vfsmount *mnt, - struct dentry *top, - unsigned long timeout, - int do_now) -{ - struct autofs_info *top_ino = autofs_dentry_ino(top); - struct dentry *p; - - pr_debug("top %p %pd\n", top, top); - - /* Negative dentry - give up */ - if (!simple_positive(top)) - return 1; - - p = NULL; - while ((p = get_next_positive_dentry(p, top))) { - pr_debug("dentry %p %pd\n", p, p); - - /* - * Is someone visiting anywhere in the subtree ? 
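[Aside: get_next_positive_dentry() resumes a pre-order walk from the node handed back last time: descend to a child if there is one, otherwise take the next sibling, otherwise climb parents until a sibling turns up. A toy userspace model of just that traversal shape, with illustrative names and none of the locking or refcounting the kernel needs.]

#include <stdio.h>
#include <stddef.h>

struct node {
    const char *name;
    struct node *parent, *first_child, *next_sibling;
};

static struct node *next_node(struct node *prev, struct node *root)
{
    struct node *p = prev;

    if (!p)
        return root;
    if (p->first_child)
        return p->first_child;
    while (p != root) {
        if (p->next_sibling)
            return p->next_sibling;
        p = p->parent;    /* subtree done, back up one level */
    }
    return NULL;    /* walked the whole tree */
}

int main(void)
{
    struct node root = { "root", NULL, NULL, NULL };
    struct node a = { "a", &root, NULL, NULL };
    struct node b = { "b", &root, NULL, NULL };
    struct node a1 = { "a1", &a, NULL, NULL };
    struct node *p = NULL;

    root.first_child = &a;
    a.next_sibling = &b;
    a.first_child = &a1;

    while ((p = next_node(p, &root)))
        printf("%s\n", p->name);    /* root a a1 b */
    return 0;
}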
- * If there's no mount we need to check the usage - * count for the autofs dentry. - * If the fs is busy update the expiry counter. - */ - if (d_mountpoint(p)) { - if (autofs_mount_busy(mnt, p)) { - top_ino->last_used = jiffies; - dput(p); - return 1; - } - } else { - struct autofs_info *ino = autofs_dentry_ino(p); - unsigned int ino_count = atomic_read(&ino->count); - - /* allow for dget above and top is already dgot */ - if (p == top) - ino_count += 2; - else - ino_count++; - - if (d_count(p) > ino_count) { - top_ino->last_used = jiffies; - dput(p); - return 1; - } - } - } - - /* Timeout of a tree mount is ultimately determined by its top dentry */ - if (!autofs_can_expire(top, timeout, do_now)) - return 1; - - return 0; -} - -static struct dentry *autofs_check_leaves(struct vfsmount *mnt, - struct dentry *parent, - unsigned long timeout, - int do_now) -{ - struct dentry *p; - - pr_debug("parent %p %pd\n", parent, parent); - - p = NULL; - while ((p = get_next_positive_dentry(p, parent))) { - pr_debug("dentry %p %pd\n", p, p); - - if (d_mountpoint(p)) { - /* Can we umount this guy */ - if (autofs_mount_busy(mnt, p)) - continue; - - /* Can we expire this guy */ - if (autofs_can_expire(p, timeout, do_now)) - return p; - } - } - return NULL; -} - -/* Check if we can expire a direct mount (possibly a tree) */ -struct dentry *autofs_expire_direct(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, - int how) -{ - unsigned long timeout; - struct dentry *root = dget(sb->s_root); - int do_now = how & AUTOFS_EXP_IMMEDIATE; - struct autofs_info *ino; - - if (!root) - return NULL; - - now = jiffies; - timeout = sbi->exp_timeout; - - if (!autofs_direct_busy(mnt, root, timeout, do_now)) { - spin_lock(&sbi->fs_lock); - ino = autofs_dentry_ino(root); - /* No point expiring a pending mount */ - if (ino->flags & AUTOFS_INF_PENDING) { - spin_unlock(&sbi->fs_lock); - goto out; - } - ino->flags |= AUTOFS_INF_WANT_EXPIRE; - spin_unlock(&sbi->fs_lock); - synchronize_rcu(); - if (!autofs_direct_busy(mnt, root, timeout, do_now)) { - spin_lock(&sbi->fs_lock); - ino->flags |= AUTOFS_INF_EXPIRING; - init_completion(&ino->expire_complete); - spin_unlock(&sbi->fs_lock); - return root; - } - spin_lock(&sbi->fs_lock); - ino->flags &= ~AUTOFS_INF_WANT_EXPIRE; - spin_unlock(&sbi->fs_lock); - } -out: - dput(root); - - return NULL; -} - -/* Check if 'dentry' should expire, or return a nearby - * dentry that is suitable. - * If returned dentry is different from arg dentry, - * then a dget() reference was taken, else not. - */ -static struct dentry *should_expire(struct dentry *dentry, - struct vfsmount *mnt, - unsigned long timeout, - int how) -{ - int do_now = how & AUTOFS_EXP_IMMEDIATE; - int exp_leaves = how & AUTOFS_EXP_LEAVES; - struct autofs_info *ino = autofs_dentry_ino(dentry); - unsigned int ino_count; - - /* No point expiring a pending mount */ - if (ino->flags & AUTOFS_INF_PENDING) - return NULL; - - /* - * Case 1: (i) indirect mount or top level pseudo direct mount - * (autofs-4.1). - * (ii) indirect mount with offset mount, check the "/" - * offset (autofs-5.0+). 
- */ - if (d_mountpoint(dentry)) { - pr_debug("checking mountpoint %p %pd\n", dentry, dentry); - - /* Can we umount this guy */ - if (autofs_mount_busy(mnt, dentry)) - return NULL; - - /* Can we expire this guy */ - if (autofs_can_expire(dentry, timeout, do_now)) - return dentry; - return NULL; - } - - if (d_really_is_positive(dentry) && d_is_symlink(dentry)) { - pr_debug("checking symlink %p %pd\n", dentry, dentry); - /* - * A symlink can't be "busy" in the usual sense so - * just check last used for expire timeout. - */ - if (autofs_can_expire(dentry, timeout, do_now)) - return dentry; - return NULL; - } - - if (simple_empty(dentry)) - return NULL; - - /* Case 2: tree mount, expire iff entire tree is not busy */ - if (!exp_leaves) { - /* Path walk currently on this dentry? */ - ino_count = atomic_read(&ino->count) + 1; - if (d_count(dentry) > ino_count) - return NULL; - - if (!autofs_tree_busy(mnt, dentry, timeout, do_now)) - return dentry; - /* - * Case 3: pseudo direct mount, expire individual leaves - * (autofs-4.1). - */ - } else { - /* Path walk currently on this dentry? */ - struct dentry *expired; - - ino_count = atomic_read(&ino->count) + 1; - if (d_count(dentry) > ino_count) - return NULL; - - expired = autofs_check_leaves(mnt, dentry, timeout, do_now); - if (expired) { - if (expired == dentry) - dput(dentry); - return expired; - } - } - return NULL; -} - -/* - * Find an eligible tree to time-out - * A tree is eligible if :- - * - it is unused by any user process - * - it has been unused for exp_timeout time - */ -struct dentry *autofs_expire_indirect(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, - int how) -{ - unsigned long timeout; - struct dentry *root = sb->s_root; - struct dentry *dentry; - struct dentry *expired; - struct dentry *found; - struct autofs_info *ino; - - if (!root) - return NULL; - - now = jiffies; - timeout = sbi->exp_timeout; - - dentry = NULL; - while ((dentry = get_next_positive_subdir(dentry, root))) { - int flags = how; - - spin_lock(&sbi->fs_lock); - ino = autofs_dentry_ino(dentry); - if (ino->flags & AUTOFS_INF_WANT_EXPIRE) { - spin_unlock(&sbi->fs_lock); - continue; - } - spin_unlock(&sbi->fs_lock); - - expired = should_expire(dentry, mnt, timeout, flags); - if (!expired) - continue; - - spin_lock(&sbi->fs_lock); - ino = autofs_dentry_ino(expired); - ino->flags |= AUTOFS_INF_WANT_EXPIRE; - spin_unlock(&sbi->fs_lock); - synchronize_rcu(); - - /* Make sure a reference is not taken on found if - * things have changed. 
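[Aside: the selection logic here is a two-phase commit: publish AUTOFS_INF_WANT_EXPIRE, let lockless walkers drain via synchronize_rcu(), then re-run the busyness check before going ahead. A rough userspace analogue of the pattern, where a mutex stands in for the kernel's spinlocks and RCU and all names are illustrative.]

#include <pthread.h>
#include <stdio.h>

struct entry {
    pthread_mutex_t lock;
    int want_expire;    /* AUTOFS_INF_WANT_EXPIRE analogue */
    int busy;           /* stands in for the busyness checks */
};

static int try_expire(struct entry *e)
{
    pthread_mutex_lock(&e->lock);
    if (e->busy || e->want_expire) {    /* someone got here first */
        pthread_mutex_unlock(&e->lock);
        return 0;
    }
    e->want_expire = 1;                 /* publish our intent */
    pthread_mutex_unlock(&e->lock);

    /* kernel: synchronize_rcu(), so lockless walkers see the flag */

    pthread_mutex_lock(&e->lock);
    if (e->busy) {                      /* state changed, back off */
        e->want_expire = 0;
        pthread_mutex_unlock(&e->lock);
        return 0;
    }
    pthread_mutex_unlock(&e->lock);
    return 1;                           /* safe to expire */
}

int main(void)
{
    struct entry e = { PTHREAD_MUTEX_INITIALIZER, 0, 0 };

    printf("expired: %d\n", try_expire(&e));    /* 1 */
    return 0;
}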
- */ - flags &= ~AUTOFS_EXP_LEAVES; - found = should_expire(expired, mnt, timeout, how); - if (!found || found != expired) - /* Something has changed, continue */ - goto next; - - if (expired != dentry) - dput(dentry); - - spin_lock(&sbi->fs_lock); - goto found; -next: - spin_lock(&sbi->fs_lock); - ino->flags &= ~AUTOFS_INF_WANT_EXPIRE; - spin_unlock(&sbi->fs_lock); - if (expired != dentry) - dput(expired); - } - return NULL; - -found: - pr_debug("returning %p %pd\n", expired, expired); - ino->flags |= AUTOFS_INF_EXPIRING; - init_completion(&ino->expire_complete); - spin_unlock(&sbi->fs_lock); - return expired; -} - -int autofs_expire_wait(const struct path *path, int rcu_walk) -{ - struct dentry *dentry = path->dentry; - struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); - struct autofs_info *ino = autofs_dentry_ino(dentry); - int status; - int state; - - /* Block on any pending expire */ - if (!(ino->flags & AUTOFS_INF_WANT_EXPIRE)) - return 0; - if (rcu_walk) - return -ECHILD; - -retry: - spin_lock(&sbi->fs_lock); - state = ino->flags & (AUTOFS_INF_WANT_EXPIRE | AUTOFS_INF_EXPIRING); - if (state == AUTOFS_INF_WANT_EXPIRE) { - spin_unlock(&sbi->fs_lock); - /* - * Possibly being selected for expire, wait until - * it's selected or not. - */ - schedule_timeout_uninterruptible(HZ/10); - goto retry; - } - if (state & AUTOFS_INF_EXPIRING) { - spin_unlock(&sbi->fs_lock); - - pr_debug("waiting for expire %p name=%pd\n", dentry, dentry); - - status = autofs_wait(sbi, path, NFY_NONE); - wait_for_completion(&ino->expire_complete); - - pr_debug("expire done status=%d\n", status); - - if (d_unhashed(dentry)) - return -EAGAIN; - - return status; - } - spin_unlock(&sbi->fs_lock); - - return 0; -} - -/* Perform an expiry operation */ -int autofs_expire_run(struct super_block *sb, - struct vfsmount *mnt, - struct autofs_sb_info *sbi, - struct autofs_packet_expire __user *pkt_p) -{ - struct autofs_packet_expire pkt; - struct autofs_info *ino; - struct dentry *dentry; - int ret = 0; - - memset(&pkt, 0, sizeof(pkt)); - - pkt.hdr.proto_version = sbi->version; - pkt.hdr.type = autofs_ptype_expire; - - dentry = autofs_expire_indirect(sb, mnt, sbi, 0); - if (!dentry) - return -EAGAIN; - - pkt.len = dentry->d_name.len; - memcpy(pkt.name, dentry->d_name.name, pkt.len); - pkt.name[pkt.len] = '\0'; - dput(dentry); - - if (copy_to_user(pkt_p, &pkt, sizeof(struct autofs_packet_expire))) - ret = -EFAULT; - - spin_lock(&sbi->fs_lock); - ino = autofs_dentry_ino(dentry); - /* avoid rapid-fire expire attempts if expiry fails */ - ino->last_used = now; - ino->flags &= ~(AUTOFS_INF_EXPIRING|AUTOFS_INF_WANT_EXPIRE); - complete_all(&ino->expire_complete); - spin_unlock(&sbi->fs_lock); - - return ret; -} - -int autofs_do_expire_multi(struct super_block *sb, struct vfsmount *mnt, - struct autofs_sb_info *sbi, int when) -{ - struct dentry *dentry; - int ret = -EAGAIN; - - if (autofs_type_trigger(sbi->type)) - dentry = autofs_expire_direct(sb, mnt, sbi, when); - else - dentry = autofs_expire_indirect(sb, mnt, sbi, when); - - if (dentry) { - struct autofs_info *ino = autofs_dentry_ino(dentry); - const struct path path = { .mnt = mnt, .dentry = dentry }; - - /* This is synchronous because it makes the daemon a - * little easier - */ - ret = autofs_wait(sbi, &path, NFY_EXPIRE); - - spin_lock(&sbi->fs_lock); - /* avoid rapid-fire expire attempts if expiry fails */ - ino->last_used = now; - ino->flags &= ~(AUTOFS_INF_EXPIRING|AUTOFS_INF_WANT_EXPIRE); - complete_all(&ino->expire_complete); - spin_unlock(&sbi->fs_lock); - 
dput(dentry); - } - - return ret; -} - -/* - * Call repeatedly until it returns -EAGAIN, meaning there's nothing - * more to be done. - */ -int autofs_expire_multi(struct super_block *sb, struct vfsmount *mnt, - struct autofs_sb_info *sbi, int __user *arg) -{ - int do_now = 0; - - if (arg && get_user(do_now, arg)) - return -EFAULT; - - return autofs_do_expire_multi(sb, mnt, sbi, do_now); -} - diff --git a/fs/autofs4/init.c b/fs/autofs4/init.c deleted file mode 100644 index 16fb61315843..000000000000 --- a/fs/autofs4/init.c +++ /dev/null @@ -1,48 +0,0 @@ -/* - * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved - * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. - */ - -#include -#include -#include "autofs_i.h" - -static struct dentry *autofs_mount(struct file_system_type *fs_type, - int flags, const char *dev_name, void *data) -{ - return mount_nodev(fs_type, flags, data, autofs_fill_super); -} - -static struct file_system_type autofs_fs_type = { - .owner = THIS_MODULE, - .name = "autofs", - .mount = autofs_mount, - .kill_sb = autofs_kill_sb, -}; -MODULE_ALIAS_FS("autofs"); - -static int __init init_autofs_fs(void) -{ - int err; - - autofs_dev_ioctl_init(); - - err = register_filesystem(&autofs_fs_type); - if (err) - autofs_dev_ioctl_exit(); - - return err; -} - -static void __exit exit_autofs_fs(void) -{ - autofs_dev_ioctl_exit(); - unregister_filesystem(&autofs_fs_type); -} - -module_init(init_autofs_fs) -module_exit(exit_autofs_fs) -MODULE_LICENSE("GPL"); diff --git a/fs/autofs4/inode.c b/fs/autofs4/inode.c deleted file mode 100644 index 6262819ede45..000000000000 --- a/fs/autofs4/inode.c +++ /dev/null @@ -1,375 +0,0 @@ -/* - * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved - * Copyright 2005-2006 Ian Kent - * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. - */ - -#include -#include -#include -#include -#include -#include -#include -#include -#include "autofs_i.h" -#include - -struct autofs_info *autofs_new_ino(struct autofs_sb_info *sbi) -{ - struct autofs_info *ino; - - ino = kzalloc(sizeof(*ino), GFP_KERNEL); - if (ino) { - INIT_LIST_HEAD(&ino->active); - INIT_LIST_HEAD(&ino->expiring); - ino->last_used = jiffies; - ino->sbi = sbi; - } - return ino; -} - -void autofs_clean_ino(struct autofs_info *ino) -{ - ino->uid = GLOBAL_ROOT_UID; - ino->gid = GLOBAL_ROOT_GID; - ino->last_used = jiffies; -} - -void autofs_free_ino(struct autofs_info *ino) -{ - kfree(ino); -} - -void autofs_kill_sb(struct super_block *sb) -{ - struct autofs_sb_info *sbi = autofs_sbi(sb); - - /* - * In the event of a failure in get_sb_nodev the superblock - * info is not present so nothing else has been setup, so - * just call kill_anon_super when we are called from - * deactivate_super. 
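[Aside: from the daemon's side the "call repeatedly until -EAGAIN" contract translates to a simple loop around AUTOFS_IOC_EXPIRE_MULTI. A sketch, assuming the caller is the owning daemon (or has CAP_SYS_ADMIN) and that a daemon is listening on the pipe to complete each expire.]

#include <errno.h>
#include <fcntl.h>
#include <stdio.h>
#include <sys/ioctl.h>
#include <unistd.h>
#include <linux/auto_fs4.h>

int main(int argc, char *argv[])
{
    int fd, how = 0;    /* 0: honour the timeout; AUTOFS_EXP_IMMEDIATE: now */

    if (argc != 2) {
        fprintf(stderr, "usage: %s <autofs mountpoint>\n", argv[0]);
        return 1;
    }
    fd = open(argv[1], O_RDONLY);
    if (fd == -1) {
        perror("open");
        return 1;
    }
    /* each successful call expires one candidate */
    while (ioctl(fd, AUTOFS_IOC_EXPIRE_MULTI, &how) == 0)
        ;
    if (errno != EAGAIN)
        perror("AUTOFS_IOC_EXPIRE_MULTI");
    close(fd);
    return 0;
}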
- */ - if (sbi) { - /* Free wait queues, close pipe */ - autofs_catatonic_mode(sbi); - put_pid(sbi->oz_pgrp); - } - - pr_debug("shutting down\n"); - kill_litter_super(sb); - if (sbi) - kfree_rcu(sbi, rcu); -} - -static int autofs_show_options(struct seq_file *m, struct dentry *root) -{ - struct autofs_sb_info *sbi = autofs_sbi(root->d_sb); - struct inode *root_inode = d_inode(root->d_sb->s_root); - - if (!sbi) - return 0; - - seq_printf(m, ",fd=%d", sbi->pipefd); - if (!uid_eq(root_inode->i_uid, GLOBAL_ROOT_UID)) - seq_printf(m, ",uid=%u", - from_kuid_munged(&init_user_ns, root_inode->i_uid)); - if (!gid_eq(root_inode->i_gid, GLOBAL_ROOT_GID)) - seq_printf(m, ",gid=%u", - from_kgid_munged(&init_user_ns, root_inode->i_gid)); - seq_printf(m, ",pgrp=%d", pid_vnr(sbi->oz_pgrp)); - seq_printf(m, ",timeout=%lu", sbi->exp_timeout/HZ); - seq_printf(m, ",minproto=%d", sbi->min_proto); - seq_printf(m, ",maxproto=%d", sbi->max_proto); - - if (autofs_type_offset(sbi->type)) - seq_printf(m, ",offset"); - else if (autofs_type_direct(sbi->type)) - seq_printf(m, ",direct"); - else - seq_printf(m, ",indirect"); -#ifdef CONFIG_CHECKPOINT_RESTORE - if (sbi->pipe) - seq_printf(m, ",pipe_ino=%ld", file_inode(sbi->pipe)->i_ino); - else - seq_printf(m, ",pipe_ino=-1"); -#endif - return 0; -} - -static void autofs_evict_inode(struct inode *inode) -{ - clear_inode(inode); - kfree(inode->i_private); -} - -static const struct super_operations autofs_sops = { - .statfs = simple_statfs, - .show_options = autofs_show_options, - .evict_inode = autofs_evict_inode, -}; - -enum {Opt_err, Opt_fd, Opt_uid, Opt_gid, Opt_pgrp, Opt_minproto, Opt_maxproto, - Opt_indirect, Opt_direct, Opt_offset}; - -static const match_table_t tokens = { - {Opt_fd, "fd=%u"}, - {Opt_uid, "uid=%u"}, - {Opt_gid, "gid=%u"}, - {Opt_pgrp, "pgrp=%u"}, - {Opt_minproto, "minproto=%u"}, - {Opt_maxproto, "maxproto=%u"}, - {Opt_indirect, "indirect"}, - {Opt_direct, "direct"}, - {Opt_offset, "offset"}, - {Opt_err, NULL} -}; - -static int parse_options(char *options, int *pipefd, kuid_t *uid, kgid_t *gid, - int *pgrp, bool *pgrp_set, unsigned int *type, - int *minproto, int *maxproto) -{ - char *p; - substring_t args[MAX_OPT_ARGS]; - int option; - - *uid = current_uid(); - *gid = current_gid(); - - *minproto = AUTOFS_MIN_PROTO_VERSION; - *maxproto = AUTOFS_MAX_PROTO_VERSION; - - *pipefd = -1; - - if (!options) - return 1; - - while ((p = strsep(&options, ",")) != NULL) { - int token; - - if (!*p) - continue; - - token = match_token(p, tokens, args); - switch (token) { - case Opt_fd: - if (match_int(args, pipefd)) - return 1; - break; - case Opt_uid: - if (match_int(args, &option)) - return 1; - *uid = make_kuid(current_user_ns(), option); - if (!uid_valid(*uid)) - return 1; - break; - case Opt_gid: - if (match_int(args, &option)) - return 1; - *gid = make_kgid(current_user_ns(), option); - if (!gid_valid(*gid)) - return 1; - break; - case Opt_pgrp: - if (match_int(args, &option)) - return 1; - *pgrp = option; - *pgrp_set = true; - break; - case Opt_minproto: - if (match_int(args, &option)) - return 1; - *minproto = option; - break; - case Opt_maxproto: - if (match_int(args, &option)) - return 1; - *maxproto = option; - break; - case Opt_indirect: - set_autofs_type_indirect(type); - break; - case Opt_direct: - set_autofs_type_direct(type); - break; - case Opt_offset: - set_autofs_type_offset(type); - break; - default: - return 1; - } - } - return (*pipefd < 0); -} - -int autofs_fill_super(struct super_block *s, void *data, int silent) -{ - struct inode 
*root_inode; - struct dentry *root; - struct file *pipe; - int pipefd; - struct autofs_sb_info *sbi; - struct autofs_info *ino; - int pgrp = 0; - bool pgrp_set = false; - int ret = -EINVAL; - - sbi = kzalloc(sizeof(*sbi), GFP_KERNEL); - if (!sbi) - return -ENOMEM; - pr_debug("starting up, sbi = %p\n", sbi); - - s->s_fs_info = sbi; - sbi->magic = AUTOFS_SBI_MAGIC; - sbi->pipefd = -1; - sbi->pipe = NULL; - sbi->catatonic = 1; - sbi->exp_timeout = 0; - sbi->oz_pgrp = NULL; - sbi->sb = s; - sbi->version = 0; - sbi->sub_version = 0; - set_autofs_type_indirect(&sbi->type); - sbi->min_proto = 0; - sbi->max_proto = 0; - mutex_init(&sbi->wq_mutex); - mutex_init(&sbi->pipe_mutex); - spin_lock_init(&sbi->fs_lock); - sbi->queues = NULL; - spin_lock_init(&sbi->lookup_lock); - INIT_LIST_HEAD(&sbi->active_list); - INIT_LIST_HEAD(&sbi->expiring_list); - s->s_blocksize = 1024; - s->s_blocksize_bits = 10; - s->s_magic = AUTOFS_SUPER_MAGIC; - s->s_op = &autofs_sops; - s->s_d_op = &autofs_dentry_operations; - s->s_time_gran = 1; - - /* - * Get the root inode and dentry, but defer checking for errors. - */ - ino = autofs_new_ino(sbi); - if (!ino) { - ret = -ENOMEM; - goto fail_free; - } - root_inode = autofs_get_inode(s, S_IFDIR | 0755); - root = d_make_root(root_inode); - if (!root) - goto fail_ino; - pipe = NULL; - - root->d_fsdata = ino; - - /* Can this call block? */ - if (parse_options(data, &pipefd, &root_inode->i_uid, &root_inode->i_gid, - &pgrp, &pgrp_set, &sbi->type, &sbi->min_proto, - &sbi->max_proto)) { - pr_err("called with bogus options\n"); - goto fail_dput; - } - - /* Test versions first */ - if (sbi->max_proto < AUTOFS_MIN_PROTO_VERSION || - sbi->min_proto > AUTOFS_MAX_PROTO_VERSION) { - pr_err("kernel does not match daemon version " - "daemon (%d, %d) kernel (%d, %d)\n", - sbi->min_proto, sbi->max_proto, - AUTOFS_MIN_PROTO_VERSION, AUTOFS_MAX_PROTO_VERSION); - goto fail_dput; - } - - /* Establish highest kernel protocol version */ - if (sbi->max_proto > AUTOFS_MAX_PROTO_VERSION) - sbi->version = AUTOFS_MAX_PROTO_VERSION; - else - sbi->version = sbi->max_proto; - sbi->sub_version = AUTOFS_PROTO_SUBVERSION; - - if (pgrp_set) { - sbi->oz_pgrp = find_get_pid(pgrp); - if (!sbi->oz_pgrp) { - pr_err("could not find process group %d\n", - pgrp); - goto fail_dput; - } - } else { - sbi->oz_pgrp = get_task_pid(current, PIDTYPE_PGID); - } - - if (autofs_type_trigger(sbi->type)) - __managed_dentry_set_managed(root); - - root_inode->i_fop = &autofs_root_operations; - root_inode->i_op = &autofs_dir_inode_operations; - - pr_debug("pipe fd = %d, pgrp = %u\n", pipefd, pid_nr(sbi->oz_pgrp)); - pipe = fget(pipefd); - - if (!pipe) { - pr_err("could not open pipe file descriptor\n"); - goto fail_put_pid; - } - ret = autofs_prepare_pipe(pipe); - if (ret < 0) - goto fail_fput; - sbi->pipe = pipe; - sbi->pipefd = pipefd; - sbi->catatonic = 0; - - /* - * Success! Install the root dentry now to indicate completion. - */ - s->s_root = root; - return 0; - - /* - * Failure ... clean up. 
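[Aside: the option string parsed here is produced by the daemon at mount time: it creates a pipe, passes the write end as fd= and records its process group. An illustrative userspace sketch; the mount point /mnt/auto is made up, and the call needs root.]

#include <stdio.h>
#include <unistd.h>
#include <sys/mount.h>

int main(void)
{
    char opts[64];
    int pipefd[2];

    /* kernel writes packets to pipefd[1]; the daemon reads pipefd[0] */
    if (pipe(pipefd) == -1) {
        perror("pipe");
        return 1;
    }
    snprintf(opts, sizeof(opts), "fd=%d,pgrp=%d,minproto=5,maxproto=5",
             pipefd[1], (int)getpgrp());

    /* "indirect" is the default type; "direct" or "offset" select the others */
    if (mount("automount", "/mnt/auto", "autofs", 0, opts) == -1) {
        perror("mount");
        return 1;
    }
    puts("mounted; now serve requests arriving on pipefd[0]");
    return 0;
}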
- */ -fail_fput: - pr_err("pipe file descriptor does not contain proper ops\n"); - fput(pipe); -fail_put_pid: - put_pid(sbi->oz_pgrp); -fail_dput: - dput(root); - goto fail_free; -fail_ino: - autofs_free_ino(ino); -fail_free: - kfree(sbi); - s->s_fs_info = NULL; - return ret; -} - -struct inode *autofs_get_inode(struct super_block *sb, umode_t mode) -{ - struct inode *inode = new_inode(sb); - - if (inode == NULL) - return NULL; - - inode->i_mode = mode; - if (sb->s_root) { - inode->i_uid = d_inode(sb->s_root)->i_uid; - inode->i_gid = d_inode(sb->s_root)->i_gid; - } - inode->i_atime = inode->i_mtime = inode->i_ctime = current_time(inode); - inode->i_ino = get_next_ino(); - - if (S_ISDIR(mode)) { - set_nlink(inode, 2); - inode->i_op = &autofs_dir_inode_operations; - inode->i_fop = &autofs_dir_operations; - } else if (S_ISLNK(mode)) { - inode->i_op = &autofs_symlink_inode_operations; - } else - WARN_ON(1); - - return inode; -} diff --git a/fs/autofs4/root.c b/fs/autofs4/root.c deleted file mode 100644 index a4b36e44f73c..000000000000 --- a/fs/autofs4/root.c +++ /dev/null @@ -1,942 +0,0 @@ -/* - * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved - * Copyright 1999-2000 Jeremy Fitzhardinge - * Copyright 2001-2006 Ian Kent - * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. - */ - -#include -#include -#include -#include -#include -#include -#include -#include - -#include "autofs_i.h" - -static int autofs_dir_symlink(struct inode *, struct dentry *, const char *); -static int autofs_dir_unlink(struct inode *, struct dentry *); -static int autofs_dir_rmdir(struct inode *, struct dentry *); -static int autofs_dir_mkdir(struct inode *, struct dentry *, umode_t); -static long autofs_root_ioctl(struct file *, unsigned int, unsigned long); -#ifdef CONFIG_COMPAT -static long autofs_root_compat_ioctl(struct file *, - unsigned int, unsigned long); -#endif -static int autofs_dir_open(struct inode *inode, struct file *file); -static struct dentry *autofs_lookup(struct inode *, - struct dentry *, unsigned int); -static struct vfsmount *autofs_d_automount(struct path *); -static int autofs_d_manage(const struct path *, bool); -static void autofs_dentry_release(struct dentry *); - -const struct file_operations autofs_root_operations = { - .open = dcache_dir_open, - .release = dcache_dir_close, - .read = generic_read_dir, - .iterate_shared = dcache_readdir, - .llseek = dcache_dir_lseek, - .unlocked_ioctl = autofs_root_ioctl, -#ifdef CONFIG_COMPAT - .compat_ioctl = autofs_root_compat_ioctl, -#endif -}; - -const struct file_operations autofs_dir_operations = { - .open = autofs_dir_open, - .release = dcache_dir_close, - .read = generic_read_dir, - .iterate_shared = dcache_readdir, - .llseek = dcache_dir_lseek, -}; - -const struct inode_operations autofs_dir_inode_operations = { - .lookup = autofs_lookup, - .unlink = autofs_dir_unlink, - .symlink = autofs_dir_symlink, - .mkdir = autofs_dir_mkdir, - .rmdir = autofs_dir_rmdir, -}; - -const struct dentry_operations autofs_dentry_operations = { - .d_automount = autofs_d_automount, - .d_manage = autofs_d_manage, - .d_release = autofs_dentry_release, -}; - -static void autofs_add_active(struct dentry *dentry) -{ - struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); - struct autofs_info *ino; - - ino = autofs_dentry_ino(dentry); - if (ino) { - spin_lock(&sbi->lookup_lock); - if (!ino->active_count) 
{ - if (list_empty(&ino->active)) - list_add(&ino->active, &sbi->active_list); - } - ino->active_count++; - spin_unlock(&sbi->lookup_lock); - } -} - -static void autofs_del_active(struct dentry *dentry) -{ - struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); - struct autofs_info *ino; - - ino = autofs_dentry_ino(dentry); - if (ino) { - spin_lock(&sbi->lookup_lock); - ino->active_count--; - if (!ino->active_count) { - if (!list_empty(&ino->active)) - list_del_init(&ino->active); - } - spin_unlock(&sbi->lookup_lock); - } -} - -static int autofs_dir_open(struct inode *inode, struct file *file) -{ - struct dentry *dentry = file->f_path.dentry; - struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); - - pr_debug("file=%p dentry=%p %pd\n", file, dentry, dentry); - - if (autofs_oz_mode(sbi)) - goto out; - - /* - * An empty directory in an autofs file system is always a - * mount point. The daemon must have failed to mount this - * during lookup so it doesn't exist. This can happen, for - * example, if user space returns an incorrect status for a - * mount request. Otherwise we're doing a readdir on the - * autofs file system so just let the libfs routines handle - * it. - */ - spin_lock(&sbi->lookup_lock); - if (!path_is_mountpoint(&file->f_path) && simple_empty(dentry)) { - spin_unlock(&sbi->lookup_lock); - return -ENOENT; - } - spin_unlock(&sbi->lookup_lock); - -out: - return dcache_dir_open(inode, file); -} - -static void autofs_dentry_release(struct dentry *de) -{ - struct autofs_info *ino = autofs_dentry_ino(de); - struct autofs_sb_info *sbi = autofs_sbi(de->d_sb); - - pr_debug("releasing %p\n", de); - - if (!ino) - return; - - if (sbi) { - spin_lock(&sbi->lookup_lock); - if (!list_empty(&ino->active)) - list_del(&ino->active); - if (!list_empty(&ino->expiring)) - list_del(&ino->expiring); - spin_unlock(&sbi->lookup_lock); - } - - autofs_free_ino(ino); -} - -static struct dentry *autofs_lookup_active(struct dentry *dentry) -{ - struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); - struct dentry *parent = dentry->d_parent; - const struct qstr *name = &dentry->d_name; - unsigned int len = name->len; - unsigned int hash = name->hash; - const unsigned char *str = name->name; - struct list_head *p, *head; - - head = &sbi->active_list; - if (list_empty(head)) - return NULL; - spin_lock(&sbi->lookup_lock); - list_for_each(p, head) { - struct autofs_info *ino; - struct dentry *active; - const struct qstr *qstr; - - ino = list_entry(p, struct autofs_info, active); - active = ino->dentry; - - spin_lock(&active->d_lock); - - /* Already gone? 
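[Aside: the add/del pair above implements refcounted list membership: the ino joins sbi->active_list on the 0 -> 1 transition of active_count and leaves on 1 -> 0, with list_del_init() keeping list_empty() meaningful afterwards. A self-contained model, re-implementing just enough of the kernel's list primitives for illustration.]

#include <stdio.h>

struct list_head { struct list_head *next, *prev; };

static void list_init(struct list_head *h) { h->next = h->prev = h; }
static int list_empty(const struct list_head *h) { return h->next == h; }

static void list_add(struct list_head *n, struct list_head *h)
{
    n->next = h->next;
    n->prev = h;
    h->next->prev = n;
    h->next = n;
}

static void list_del_init(struct list_head *n)
{
    n->prev->next = n->next;
    n->next->prev = n->prev;
    list_init(n);    /* node is now safely "empty" again */
}

struct info {
    struct list_head active;
    int active_count;
};

static void add_active(struct info *ino, struct list_head *list)
{
    if (!ino->active_count && list_empty(&ino->active))
        list_add(&ino->active, list);
    ino->active_count++;
}

static void del_active(struct info *ino)
{
    if (!--ino->active_count && !list_empty(&ino->active))
        list_del_init(&ino->active);
}

int main(void)
{
    struct list_head active_list;
    struct info ino = { { NULL, NULL }, 0 };

    list_init(&active_list);
    list_init(&ino.active);

    add_active(&ino, &active_list);
    add_active(&ino, &active_list);    /* nested lookup: no second insert */
    del_active(&ino);
    del_active(&ino);                  /* last reference: off the list */
    printf("on list: %d\n", !list_empty(&active_list));    /* 0 */
    return 0;
}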
*/ - if ((int) d_count(active) <= 0) - goto next; - - qstr = &active->d_name; - - if (active->d_name.hash != hash) - goto next; - if (active->d_parent != parent) - goto next; - - if (qstr->len != len) - goto next; - if (memcmp(qstr->name, str, len)) - goto next; - - if (d_unhashed(active)) { - dget_dlock(active); - spin_unlock(&active->d_lock); - spin_unlock(&sbi->lookup_lock); - return active; - } -next: - spin_unlock(&active->d_lock); - } - spin_unlock(&sbi->lookup_lock); - - return NULL; -} - -static struct dentry *autofs_lookup_expiring(struct dentry *dentry, - bool rcu_walk) -{ - struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); - struct dentry *parent = dentry->d_parent; - const struct qstr *name = &dentry->d_name; - unsigned int len = name->len; - unsigned int hash = name->hash; - const unsigned char *str = name->name; - struct list_head *p, *head; - - head = &sbi->expiring_list; - if (list_empty(head)) - return NULL; - spin_lock(&sbi->lookup_lock); - list_for_each(p, head) { - struct autofs_info *ino; - struct dentry *expiring; - const struct qstr *qstr; - - if (rcu_walk) { - spin_unlock(&sbi->lookup_lock); - return ERR_PTR(-ECHILD); - } - - ino = list_entry(p, struct autofs_info, expiring); - expiring = ino->dentry; - - spin_lock(&expiring->d_lock); - - /* We've already been dentry_iput or unlinked */ - if (d_really_is_negative(expiring)) - goto next; - - qstr = &expiring->d_name; - - if (expiring->d_name.hash != hash) - goto next; - if (expiring->d_parent != parent) - goto next; - - if (qstr->len != len) - goto next; - if (memcmp(qstr->name, str, len)) - goto next; - - if (d_unhashed(expiring)) { - dget_dlock(expiring); - spin_unlock(&expiring->d_lock); - spin_unlock(&sbi->lookup_lock); - return expiring; - } -next: - spin_unlock(&expiring->d_lock); - } - spin_unlock(&sbi->lookup_lock); - - return NULL; -} - -static int autofs_mount_wait(const struct path *path, bool rcu_walk) -{ - struct autofs_sb_info *sbi = autofs_sbi(path->dentry->d_sb); - struct autofs_info *ino = autofs_dentry_ino(path->dentry); - int status = 0; - - if (ino->flags & AUTOFS_INF_PENDING) { - if (rcu_walk) - return -ECHILD; - pr_debug("waiting for mount name=%pd\n", path->dentry); - status = autofs_wait(sbi, path, NFY_MOUNT); - pr_debug("mount wait done status=%d\n", status); - } - ino->last_used = jiffies; - return status; -} - -static int do_expire_wait(const struct path *path, bool rcu_walk) -{ - struct dentry *dentry = path->dentry; - struct dentry *expiring; - - expiring = autofs_lookup_expiring(dentry, rcu_walk); - if (IS_ERR(expiring)) - return PTR_ERR(expiring); - if (!expiring) - return autofs_expire_wait(path, rcu_walk); - else { - const struct path this = { .mnt = path->mnt, .dentry = expiring }; - /* - * If we are racing with expire the request might not - * be quite complete, but the directory has been removed - * so it must have been successful, just wait for it. - */ - autofs_expire_wait(&this, 0); - autofs_del_expiring(expiring); - dput(expiring); - } - return 0; -} - -static struct dentry *autofs_mountpoint_changed(struct path *path) -{ - struct dentry *dentry = path->dentry; - struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); - - /* - * If this is an indirect mount the dentry could have gone away - * as a result of an expire and a new one created. 
- */ - if (autofs_type_indirect(sbi->type) && d_unhashed(dentry)) { - struct dentry *parent = dentry->d_parent; - struct autofs_info *ino; - struct dentry *new; - - new = d_lookup(parent, &dentry->d_name); - if (!new) - return NULL; - ino = autofs_dentry_ino(new); - ino->last_used = jiffies; - dput(path->dentry); - path->dentry = new; - } - return path->dentry; -} - -static struct vfsmount *autofs_d_automount(struct path *path) -{ - struct dentry *dentry = path->dentry; - struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); - struct autofs_info *ino = autofs_dentry_ino(dentry); - int status; - - pr_debug("dentry=%p %pd\n", dentry, dentry); - - /* The daemon never triggers a mount. */ - if (autofs_oz_mode(sbi)) - return NULL; - - /* - * If an expire request is pending everyone must wait. - * If the expire fails we're still mounted so continue - * the follow and return. A return of -EAGAIN (which only - * happens with indirect mounts) means the expire completed - * and the directory was removed, so just go ahead and try - * the mount. - */ - status = do_expire_wait(path, 0); - if (status && status != -EAGAIN) - return NULL; - - /* Callback to the daemon to perform the mount or wait */ - spin_lock(&sbi->fs_lock); - if (ino->flags & AUTOFS_INF_PENDING) { - spin_unlock(&sbi->fs_lock); - status = autofs_mount_wait(path, 0); - if (status) - return ERR_PTR(status); - goto done; - } - - /* - * If the dentry is a symlink it's equivalent to a directory - * having path_is_mountpoint() true, so there's no need to call - * back to the daemon. - */ - if (d_really_is_positive(dentry) && d_is_symlink(dentry)) { - spin_unlock(&sbi->fs_lock); - goto done; - } - - if (!path_is_mountpoint(path)) { - /* - * It's possible that user space hasn't removed directories - * after umounting a rootless multi-mount, although it - * should. For v5 path_has_submounts() is sufficient to - * handle this because the leaves of the directory tree under - * the mount never trigger mounts themselves (they have an - * autofs trigger mount mounted on them). But v4 pseudo direct - * mounts do need the leaves to trigger mounts. In this case - * we have no choice but to use the list_empty() check and - * require user space behave. - */ - if (sbi->version > 4) { - if (path_has_submounts(path)) { - spin_unlock(&sbi->fs_lock); - goto done; - } - } else { - if (!simple_empty(dentry)) { - spin_unlock(&sbi->fs_lock); - goto done; - } - } - ino->flags |= AUTOFS_INF_PENDING; - spin_unlock(&sbi->fs_lock); - status = autofs_mount_wait(path, 0); - spin_lock(&sbi->fs_lock); - ino->flags &= ~AUTOFS_INF_PENDING; - if (status) { - spin_unlock(&sbi->fs_lock); - return ERR_PTR(status); - } - } - spin_unlock(&sbi->fs_lock); -done: - /* Mount succeeded, check if we ended up with a new dentry */ - dentry = autofs_mountpoint_changed(path); - if (!dentry) - return ERR_PTR(-ENOENT); - - return NULL; -} - -static int autofs_d_manage(const struct path *path, bool rcu_walk) -{ - struct dentry *dentry = path->dentry; - struct autofs_sb_info *sbi = autofs_sbi(dentry->d_sb); - struct autofs_info *ino = autofs_dentry_ino(dentry); - int status; - - pr_debug("dentry=%p %pd\n", dentry, dentry); - - /* The daemon never waits. */ - if (autofs_oz_mode(sbi)) { - if (!path_is_mountpoint(path)) - return -EISDIR; - return 0; - } - - /* Wait for pending expires */ - if (do_expire_wait(path, rcu_walk) == -ECHILD) - return -ECHILD; - - /* - * This dentry may be under construction so wait on mount - * completion. 
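[Aside: nothing special is needed to exercise this path from userspace: any path walk that reaches the managed dentry, even a bare stat(2), fires ->d_automount() and blocks until the daemon answers. For example, with an illustrative path:]

#include <stdio.h>
#include <sys/stat.h>

int main(void)
{
    struct stat st;

    /* this blocks in autofs_mount_wait() until the daemon responds */
    if (stat("/mnt/auto/music", &st) == -1) {
        perror("stat");
        return 1;
    }
    printf("mounted, dev=0x%lx\n", (unsigned long)st.st_dev);
    return 0;
}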
- */ - status = autofs_mount_wait(path, rcu_walk); - if (status) - return status; - - if (rcu_walk) { - /* We don't need fs_lock in rcu_walk mode, - * just testing 'AUTOFS_INFO_NO_RCU' is enough. - * simple_empty() takes a spinlock, so leave it - * to last. - * We only return -EISDIR when certain this isn't - * a mount-trap. - */ - struct inode *inode; - - if (ino->flags & AUTOFS_INF_WANT_EXPIRE) - return 0; - if (path_is_mountpoint(path)) - return 0; - inode = d_inode_rcu(dentry); - if (inode && S_ISLNK(inode->i_mode)) - return -EISDIR; - if (list_empty(&dentry->d_subdirs)) - return 0; - if (!simple_empty(dentry)) - return -EISDIR; - return 0; - } - - spin_lock(&sbi->fs_lock); - /* - * If the dentry has been selected for expire while we slept - * on the lock then it might go away. We'll deal with that in - * ->d_automount() and wait on a new mount if the expire - * succeeds or return here if it doesn't (since there's no - * mount to follow with a rootless multi-mount). - */ - if (!(ino->flags & AUTOFS_INF_EXPIRING)) { - /* - * Any needed mounting has been completed and the path - * updated so check if this is a rootless multi-mount so - * we can avoid needless calls ->d_automount() and avoid - * an incorrect ELOOP error return. - */ - if ((!path_is_mountpoint(path) && !simple_empty(dentry)) || - (d_really_is_positive(dentry) && d_is_symlink(dentry))) - status = -EISDIR; - } - spin_unlock(&sbi->fs_lock); - - return status; -} - -/* Lookups in the root directory */ -static struct dentry *autofs_lookup(struct inode *dir, - struct dentry *dentry, unsigned int flags) -{ - struct autofs_sb_info *sbi; - struct autofs_info *ino; - struct dentry *active; - - pr_debug("name = %pd\n", dentry); - - /* File name too long to exist */ - if (dentry->d_name.len > NAME_MAX) - return ERR_PTR(-ENAMETOOLONG); - - sbi = autofs_sbi(dir->i_sb); - - pr_debug("pid = %u, pgrp = %u, catatonic = %d, oz_mode = %d\n", - current->pid, task_pgrp_nr(current), sbi->catatonic, - autofs_oz_mode(sbi)); - - active = autofs_lookup_active(dentry); - if (active) - return active; - else { - /* - * A dentry that is not within the root can never trigger a - * mount operation, unless the directory already exists, so we - * can return fail immediately. The daemon however does need - * to create directories within the file system. 
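[Aside: the autofs_oz_mode() checks sprinkled through these handlers compare the caller's process group with the one recorded at mount time, so a helper the daemon spawns keeps its special status only while it stays in that group. A quick demonstration of the process-group mechanics involved, nothing more:]

#include <stdio.h>
#include <sys/wait.h>
#include <unistd.h>

int main(void)
{
    printf("daemon pgrp: %d\n", (int)getpgrp());
    if (fork() == 0) {
        /* child starts out in the parent's group: still "the daemon" */
        printf("helper pgrp: %d\n", (int)getpgrp());
        setpgid(0, 0);    /* move to a new group: oz mode would be lost */
        printf("helper pgrp now: %d\n", (int)getpgrp());
        _exit(0);
    }
    wait(NULL);
    return 0;
}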
- */ - if (!autofs_oz_mode(sbi) && !IS_ROOT(dentry->d_parent)) - return ERR_PTR(-ENOENT); - - /* Mark entries in the root as mount triggers */ - if (IS_ROOT(dentry->d_parent) && - autofs_type_indirect(sbi->type)) - __managed_dentry_set_managed(dentry); - - ino = autofs_new_ino(sbi); - if (!ino) - return ERR_PTR(-ENOMEM); - - dentry->d_fsdata = ino; - ino->dentry = dentry; - - autofs_add_active(dentry); - } - return NULL; -} - -static int autofs_dir_symlink(struct inode *dir, - struct dentry *dentry, - const char *symname) -{ - struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); - struct autofs_info *ino = autofs_dentry_ino(dentry); - struct autofs_info *p_ino; - struct inode *inode; - size_t size = strlen(symname); - char *cp; - - pr_debug("%s <- %pd\n", symname, dentry); - - if (!autofs_oz_mode(sbi)) - return -EACCES; - - BUG_ON(!ino); - - autofs_clean_ino(ino); - - autofs_del_active(dentry); - - cp = kmalloc(size + 1, GFP_KERNEL); - if (!cp) - return -ENOMEM; - - strcpy(cp, symname); - - inode = autofs_get_inode(dir->i_sb, S_IFLNK | 0555); - if (!inode) { - kfree(cp); - return -ENOMEM; - } - inode->i_private = cp; - inode->i_size = size; - d_add(dentry, inode); - - dget(dentry); - atomic_inc(&ino->count); - p_ino = autofs_dentry_ino(dentry->d_parent); - if (p_ino && !IS_ROOT(dentry)) - atomic_inc(&p_ino->count); - - dir->i_mtime = current_time(dir); - - return 0; -} - -/* - * NOTE! - * - * Normal filesystems would do a "d_delete()" to tell the VFS dcache - * that the file no longer exists. However, doing that means that the - * VFS layer can turn the dentry into a negative dentry. We don't want - * this, because the unlink is probably the result of an expire. - * We simply d_drop it and add it to an expiring list in the super block, - * which allows the dentry lookup to check for an incomplete expire. - * - * If a process is blocked on the dentry waiting for the expire to finish, - * it will invalidate the dentry and try to mount with a new one. - * - * Also see autofs_dir_rmdir(). - */ -static int autofs_dir_unlink(struct inode *dir, struct dentry *dentry) -{ - struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); - struct autofs_info *ino = autofs_dentry_ino(dentry); - struct autofs_info *p_ino; - - /* This allows root to remove symlinks */ - if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - if (atomic_dec_and_test(&ino->count)) { - p_ino = autofs_dentry_ino(dentry->d_parent); - if (p_ino && !IS_ROOT(dentry)) - atomic_dec(&p_ino->count); - } - dput(ino->dentry); - - d_inode(dentry)->i_size = 0; - clear_nlink(d_inode(dentry)); - - dir->i_mtime = current_time(dir); - - spin_lock(&sbi->lookup_lock); - __autofs_add_expiring(dentry); - d_drop(dentry); - spin_unlock(&sbi->lookup_lock); - - return 0; -} - -/* - * Version 4 of autofs provides a pseudo direct mount implementation - * that relies on directories at the leaves of a directory tree under - * an indirect mount to trigger mounts. To allow for this we need to - * set the DMANAGED_AUTOMOUNT and DMANAGED_TRANSIT flags on the leaves - * of the directory tree. There is no need to clear the automount flag - * following a mount or restore it after an expire because these mounts - * are always covered. However, it is necessary to ensure that these - * flags are clear on non-empty directories to avoid unnecessary calls - * during path walks.
- */ -static void autofs_set_leaf_automount_flags(struct dentry *dentry) -{ - struct dentry *parent; - - /* root and dentrys in the root are already handled */ - if (IS_ROOT(dentry->d_parent)) - return; - - managed_dentry_set_managed(dentry); - - parent = dentry->d_parent; - /* only consider parents below dentrys in the root */ - if (IS_ROOT(parent->d_parent)) - return; - managed_dentry_clear_managed(parent); -} - -static void autofs_clear_leaf_automount_flags(struct dentry *dentry) -{ - struct list_head *d_child; - struct dentry *parent; - - /* flags for dentrys in the root are handled elsewhere */ - if (IS_ROOT(dentry->d_parent)) - return; - - managed_dentry_clear_managed(dentry); - - parent = dentry->d_parent; - /* only consider parents below dentrys in the root */ - if (IS_ROOT(parent->d_parent)) - return; - d_child = &dentry->d_child; - /* Set parent managed if it's becoming empty */ - if (d_child->next == &parent->d_subdirs && - d_child->prev == &parent->d_subdirs) - managed_dentry_set_managed(parent); -} - -static int autofs_dir_rmdir(struct inode *dir, struct dentry *dentry) -{ - struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); - struct autofs_info *ino = autofs_dentry_ino(dentry); - struct autofs_info *p_ino; - - pr_debug("dentry %p, removing %pd\n", dentry, dentry); - - if (!autofs_oz_mode(sbi)) - return -EACCES; - - spin_lock(&sbi->lookup_lock); - if (!simple_empty(dentry)) { - spin_unlock(&sbi->lookup_lock); - return -ENOTEMPTY; - } - __autofs_add_expiring(dentry); - d_drop(dentry); - spin_unlock(&sbi->lookup_lock); - - if (sbi->version < 5) - autofs_clear_leaf_automount_flags(dentry); - - if (atomic_dec_and_test(&ino->count)) { - p_ino = autofs_dentry_ino(dentry->d_parent); - if (p_ino && dentry->d_parent != dentry) - atomic_dec(&p_ino->count); - } - dput(ino->dentry); - d_inode(dentry)->i_size = 0; - clear_nlink(d_inode(dentry)); - - if (dir->i_nlink) - drop_nlink(dir); - - return 0; -} - -static int autofs_dir_mkdir(struct inode *dir, - struct dentry *dentry, umode_t mode) -{ - struct autofs_sb_info *sbi = autofs_sbi(dir->i_sb); - struct autofs_info *ino = autofs_dentry_ino(dentry); - struct autofs_info *p_ino; - struct inode *inode; - - if (!autofs_oz_mode(sbi)) - return -EACCES; - - pr_debug("dentry %p, creating %pd\n", dentry, dentry); - - BUG_ON(!ino); - - autofs_clean_ino(ino); - - autofs_del_active(dentry); - - inode = autofs_get_inode(dir->i_sb, S_IFDIR | mode); - if (!inode) - return -ENOMEM; - d_add(dentry, inode); - - if (sbi->version < 5) - autofs_set_leaf_automount_flags(dentry); - - dget(dentry); - atomic_inc(&ino->count); - p_ino = autofs_dentry_ino(dentry->d_parent); - if (p_ino && !IS_ROOT(dentry)) - atomic_inc(&p_ino->count); - inc_nlink(dir); - dir->i_mtime = current_time(dir); - - return 0; -} - -/* Get/set timeout ioctl() operation */ -#ifdef CONFIG_COMPAT -static inline int autofs_compat_get_set_timeout(struct autofs_sb_info *sbi, - compat_ulong_t __user *p) -{ - unsigned long ntimeout; - int rv; - - rv = get_user(ntimeout, p); - if (rv) - goto error; - - rv = put_user(sbi->exp_timeout/HZ, p); - if (rv) - goto error; - - if (ntimeout > UINT_MAX/HZ) - sbi->exp_timeout = 0; - else - sbi->exp_timeout = ntimeout * HZ; - - return 0; -error: - return rv; -} -#endif - -static inline int autofs_get_set_timeout(struct autofs_sb_info *sbi, - unsigned long __user *p) -{ - unsigned long ntimeout; - int rv; - - rv = get_user(ntimeout, p); - if (rv) - goto error; - - rv = put_user(sbi->exp_timeout/HZ, p); - if (rv) - goto error; - - if (ntimeout > ULONG_MAX/HZ) - 
sbi->exp_timeout = 0; - else - sbi->exp_timeout = ntimeout * HZ; - - return 0; -error: - return rv; -} - -/* Return protocol version */ -static inline int autofs_get_protover(struct autofs_sb_info *sbi, - int __user *p) -{ - return put_user(sbi->version, p); -} - -/* Return protocol sub version */ -static inline int autofs_get_protosubver(struct autofs_sb_info *sbi, - int __user *p) -{ - return put_user(sbi->sub_version, p); -} - -/* -* Tells the daemon whether it can umount the autofs mount. -*/ -static inline int autofs_ask_umount(struct vfsmount *mnt, int __user *p) -{ - int status = 0; - - if (may_umount(mnt)) - status = 1; - - pr_debug("may umount %d\n", status); - - status = put_user(status, p); - - return status; -} - -/* Identify autofs_dentries - this is so we can tell if there's - * an extra dentry refcount or not. We only hold a refcount on the - * dentry if its non-negative (ie, d_inode != NULL) - */ -int is_autofs_dentry(struct dentry *dentry) -{ - return dentry && d_really_is_positive(dentry) && - dentry->d_op == &autofs_dentry_operations && - dentry->d_fsdata != NULL; -} - -/* - * ioctl()'s on the root directory is the chief method for the daemon to - * generate kernel reactions - */ -static int autofs_root_ioctl_unlocked(struct inode *inode, struct file *filp, - unsigned int cmd, unsigned long arg) -{ - struct autofs_sb_info *sbi = autofs_sbi(inode->i_sb); - void __user *p = (void __user *)arg; - - pr_debug("cmd = 0x%08x, arg = 0x%08lx, sbi = %p, pgrp = %u\n", - cmd, arg, sbi, task_pgrp_nr(current)); - - if (_IOC_TYPE(cmd) != _IOC_TYPE(AUTOFS_IOC_FIRST) || - _IOC_NR(cmd) - _IOC_NR(AUTOFS_IOC_FIRST) >= AUTOFS_IOC_COUNT) - return -ENOTTY; - - if (!autofs_oz_mode(sbi) && !capable(CAP_SYS_ADMIN)) - return -EPERM; - - switch (cmd) { - case AUTOFS_IOC_READY: /* Wait queue: go ahead and retry */ - return autofs_wait_release(sbi, (autofs_wqt_t) arg, 0); - case AUTOFS_IOC_FAIL: /* Wait queue: fail with ENOENT */ - return autofs_wait_release(sbi, (autofs_wqt_t) arg, -ENOENT); - case AUTOFS_IOC_CATATONIC: /* Enter catatonic mode (daemon shutdown) */ - autofs_catatonic_mode(sbi); - return 0; - case AUTOFS_IOC_PROTOVER: /* Get protocol version */ - return autofs_get_protover(sbi, p); - case AUTOFS_IOC_PROTOSUBVER: /* Get protocol sub version */ - return autofs_get_protosubver(sbi, p); - case AUTOFS_IOC_SETTIMEOUT: - return autofs_get_set_timeout(sbi, p); -#ifdef CONFIG_COMPAT - case AUTOFS_IOC_SETTIMEOUT32: - return autofs_compat_get_set_timeout(sbi, p); -#endif - - case AUTOFS_IOC_ASKUMOUNT: - return autofs_ask_umount(filp->f_path.mnt, p); - - /* return a single thing to expire */ - case AUTOFS_IOC_EXPIRE: - return autofs_expire_run(inode->i_sb, filp->f_path.mnt, sbi, p); - /* same as above, but can send multiple expires through pipe */ - case AUTOFS_IOC_EXPIRE_MULTI: - return autofs_expire_multi(inode->i_sb, - filp->f_path.mnt, sbi, p); - - default: - return -EINVAL; - } -} - -static long autofs_root_ioctl(struct file *filp, - unsigned int cmd, unsigned long arg) -{ - struct inode *inode = file_inode(filp); - - return autofs_root_ioctl_unlocked(inode, filp, cmd, arg); -} - -#ifdef CONFIG_COMPAT -static long autofs_root_compat_ioctl(struct file *filp, - unsigned int cmd, unsigned long arg) -{ - struct inode *inode = file_inode(filp); - int ret; - - if (cmd == AUTOFS_IOC_READY || cmd == AUTOFS_IOC_FAIL) - ret = autofs_root_ioctl_unlocked(inode, filp, cmd, arg); - else - ret = autofs_root_ioctl_unlocked(inode, filp, cmd, - (unsigned long) compat_ptr(arg)); - - return ret; -} -#endif diff 
--git a/fs/autofs4/symlink.c b/fs/autofs4/symlink.c deleted file mode 100644 index aad3902c0cc1..000000000000 --- a/fs/autofs4/symlink.c +++ /dev/null @@ -1,29 +0,0 @@ -/* - * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved - * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. - */ - -#include "autofs_i.h" - -static const char *autofs_get_link(struct dentry *dentry, - struct inode *inode, - struct delayed_call *done) -{ - struct autofs_sb_info *sbi; - struct autofs_info *ino; - - if (!dentry) - return ERR_PTR(-ECHILD); - sbi = autofs_sbi(dentry->d_sb); - ino = autofs_dentry_ino(dentry); - if (ino && !autofs_oz_mode(sbi)) - ino->last_used = jiffies; - return d_inode(dentry)->i_private; -} - -const struct inode_operations autofs_symlink_inode_operations = { - .get_link = autofs_get_link -}; diff --git a/fs/autofs4/waitq.c b/fs/autofs4/waitq.c deleted file mode 100644 index 8a566fa66afe..000000000000 --- a/fs/autofs4/waitq.c +++ /dev/null @@ -1,559 +0,0 @@ -/* - * Copyright 1997-1998 Transmeta Corporation -- All Rights Reserved - * Copyright 2001-2006 Ian Kent - * - * This file is part of the Linux kernel and is made available under - * the terms of the GNU General Public License, version 2, or at your - * option, any later version, incorporated herein by reference. - */ - -#include -#include -#include -#include -#include -#include "autofs_i.h" - -/* We make this a static variable rather than a part of the superblock; it - * is better if we don't reassign numbers easily even across filesystems - */ -static autofs_wqt_t autofs_next_wait_queue = 1; - -void autofs_catatonic_mode(struct autofs_sb_info *sbi) -{ - struct autofs_wait_queue *wq, *nwq; - - mutex_lock(&sbi->wq_mutex); - if (sbi->catatonic) { - mutex_unlock(&sbi->wq_mutex); - return; - } - - pr_debug("entering catatonic mode\n"); - - sbi->catatonic = 1; - wq = sbi->queues; - sbi->queues = NULL; /* Erase all wait queues */ - while (wq) { - nwq = wq->next; - wq->status = -ENOENT; /* Magic is gone - report failure */ - kfree(wq->name.name); - wq->name.name = NULL; - wq->wait_ctr--; - wake_up_interruptible(&wq->queue); - wq = nwq; - } - fput(sbi->pipe); /* Close the pipe */ - sbi->pipe = NULL; - sbi->pipefd = -1; - mutex_unlock(&sbi->wq_mutex); -} - -static int autofs_write(struct autofs_sb_info *sbi, - struct file *file, const void *addr, int bytes) -{ - unsigned long sigpipe, flags; - const char *data = (const char *)addr; - ssize_t wr = 0; - - sigpipe = sigismember(¤t->pending.signal, SIGPIPE); - - mutex_lock(&sbi->pipe_mutex); - while (bytes) { - wr = __kernel_write(file, data, bytes, &file->f_pos); - if (wr <= 0) - break; - data += wr; - bytes -= wr; - } - mutex_unlock(&sbi->pipe_mutex); - - /* Keep the currently executing process from receiving a - * SIGPIPE unless it was already supposed to get one - */ - if (wr == -EPIPE && !sigpipe) { - spin_lock_irqsave(¤t->sighand->siglock, flags); - sigdelset(¤t->pending.signal, SIGPIPE); - recalc_sigpending(); - spin_unlock_irqrestore(¤t->sighand->siglock, flags); - } - - /* if 'wr' returned 0 (impossible) we assume -EIO (safe) */ - return bytes == 0 ? 0 : wr < 0 ? 
wr : -EIO; -} - -static void autofs_notify_daemon(struct autofs_sb_info *sbi, - struct autofs_wait_queue *wq, - int type) -{ - union { - struct autofs_packet_hdr hdr; - union autofs_packet_union v4_pkt; - union autofs_v5_packet_union v5_pkt; - } pkt; - struct file *pipe = NULL; - size_t pktsz; - int ret; - - pr_debug("wait id = 0x%08lx, name = %.*s, type=%d\n", - (unsigned long) wq->wait_queue_token, - wq->name.len, wq->name.name, type); - - memset(&pkt, 0, sizeof(pkt)); /* For security reasons */ - - pkt.hdr.proto_version = sbi->version; - pkt.hdr.type = type; - - switch (type) { - /* Kernel protocol v4 missing and expire packets */ - case autofs_ptype_missing: - { - struct autofs_packet_missing *mp = &pkt.v4_pkt.missing; - - pktsz = sizeof(*mp); - - mp->wait_queue_token = wq->wait_queue_token; - mp->len = wq->name.len; - memcpy(mp->name, wq->name.name, wq->name.len); - mp->name[wq->name.len] = '\0'; - break; - } - case autofs_ptype_expire_multi: - { - struct autofs_packet_expire_multi *ep = - &pkt.v4_pkt.expire_multi; - - pktsz = sizeof(*ep); - - ep->wait_queue_token = wq->wait_queue_token; - ep->len = wq->name.len; - memcpy(ep->name, wq->name.name, wq->name.len); - ep->name[wq->name.len] = '\0'; - break; - } - /* - * Kernel protocol v5 packet for handling indirect and direct - * mount missing and expire requests - */ - case autofs_ptype_missing_indirect: - case autofs_ptype_expire_indirect: - case autofs_ptype_missing_direct: - case autofs_ptype_expire_direct: - { - struct autofs_v5_packet *packet = &pkt.v5_pkt.v5_packet; - struct user_namespace *user_ns = sbi->pipe->f_cred->user_ns; - - pktsz = sizeof(*packet); - - packet->wait_queue_token = wq->wait_queue_token; - packet->len = wq->name.len; - memcpy(packet->name, wq->name.name, wq->name.len); - packet->name[wq->name.len] = '\0'; - packet->dev = wq->dev; - packet->ino = wq->ino; - packet->uid = from_kuid_munged(user_ns, wq->uid); - packet->gid = from_kgid_munged(user_ns, wq->gid); - packet->pid = wq->pid; - packet->tgid = wq->tgid; - break; - } - default: - pr_warn("bad type %d!\n", type); - mutex_unlock(&sbi->wq_mutex); - return; - } - - pipe = get_file(sbi->pipe); - - mutex_unlock(&sbi->wq_mutex); - - switch (ret = autofs_write(sbi, pipe, &pkt, pktsz)) { - case 0: - break; - case -ENOMEM: - case -ERESTARTSYS: - /* Just fail this one */ - autofs_wait_release(sbi, wq->wait_queue_token, ret); - break; - default: - autofs_catatonic_mode(sbi); - break; - } - fput(pipe); -} - -static int autofs_getpath(struct autofs_sb_info *sbi, - struct dentry *dentry, char **name) -{ - struct dentry *root = sbi->sb->s_root; - struct dentry *tmp; - char *buf; - char *p; - int len; - unsigned seq; - -rename_retry: - buf = *name; - len = 0; - - seq = read_seqbegin(&rename_lock); - rcu_read_lock(); - spin_lock(&sbi->fs_lock); - for (tmp = dentry ; tmp != root ; tmp = tmp->d_parent) - len += tmp->d_name.len + 1; - - if (!len || --len > NAME_MAX) { - spin_unlock(&sbi->fs_lock); - rcu_read_unlock(); - if (read_seqretry(&rename_lock, seq)) - goto rename_retry; - return 0; - } - - *(buf + len) = '\0'; - p = buf + len - dentry->d_name.len; - strncpy(p, dentry->d_name.name, dentry->d_name.len); - - for (tmp = dentry->d_parent; tmp != root ; tmp = tmp->d_parent) { - *(--p) = '/'; - p -= tmp->d_name.len; - strncpy(p, tmp->d_name.name, tmp->d_name.len); - } - spin_unlock(&sbi->fs_lock); - rcu_read_unlock(); - if (read_seqretry(&rename_lock, seq)) - goto rename_retry; - - return len; -} - -static struct autofs_wait_queue * -autofs_find_wait(struct autofs_sb_info 
*sbi, const struct qstr *qstr) -{ - struct autofs_wait_queue *wq; - - for (wq = sbi->queues; wq; wq = wq->next) { - if (wq->name.hash == qstr->hash && - wq->name.len == qstr->len && - wq->name.name && - !memcmp(wq->name.name, qstr->name, qstr->len)) - break; - } - return wq; -} - -/* - * Check if we have a valid request. - * Returns - * 1 if the request should continue. - * In this case we can return an autofs_wait_queue entry if one is - * found or NULL to indicate a new wait needs to be created. - * 0 or a negative errno if the request shouldn't continue. - */ -static int validate_request(struct autofs_wait_queue **wait, - struct autofs_sb_info *sbi, - const struct qstr *qstr, - const struct path *path, enum autofs_notify notify) -{ - struct dentry *dentry = path->dentry; - struct autofs_wait_queue *wq; - struct autofs_info *ino; - - if (sbi->catatonic) - return -ENOENT; - - /* Wait in progress, continue; */ - wq = autofs_find_wait(sbi, qstr); - if (wq) { - *wait = wq; - return 1; - } - - *wait = NULL; - - /* If we don't yet have any info this is a new request */ - ino = autofs_dentry_ino(dentry); - if (!ino) - return 1; - - /* - * If we've been asked to wait on an existing expire (NFY_NONE) - * but there is no wait in the queue ... - */ - if (notify == NFY_NONE) { - /* - * Either we've beaten the pending expire to post its - * wait or it finished while we waited on the mutex. - * So we need to wait till either the wait appears - * or the expire finishes. - */ - - while (ino->flags & AUTOFS_INF_EXPIRING) { - mutex_unlock(&sbi->wq_mutex); - schedule_timeout_interruptible(HZ/10); - if (mutex_lock_interruptible(&sbi->wq_mutex)) - return -EINTR; - - if (sbi->catatonic) - return -ENOENT; - - wq = autofs_find_wait(sbi, qstr); - if (wq) { - *wait = wq; - return 1; - } - } - - /* - * Not ideal but the status has already gone. Of the two - * cases where we wait on NFY_NONE neither depend on the - * return status of the wait. - */ - return 0; - } - - /* - * If we've been asked to trigger a mount and the request - * completed while we waited on the mutex ... - */ - if (notify == NFY_MOUNT) { - struct dentry *new = NULL; - struct path this; - int valid = 1; - - /* - * If the dentry was successfully mounted while we slept - * on the wait queue mutex we can return success. If it - * isn't mounted (doesn't have submounts for the case of - * a multi-mount with no mount at its base) we can - * continue on and create a new request. - */ - if (!IS_ROOT(dentry)) { - if (d_unhashed(dentry) && - d_really_is_positive(dentry)) { - struct dentry *parent = dentry->d_parent; - - new = d_lookup(parent, &dentry->d_name); - if (new) - dentry = new; - } - } - this.mnt = path->mnt; - this.dentry = dentry; - if (path_has_submounts(&this)) - valid = 0; - - if (new) - dput(new); - return valid; - } - - return 1; -} - -int autofs_wait(struct autofs_sb_info *sbi, - const struct path *path, enum autofs_notify notify) -{ - struct dentry *dentry = path->dentry; - struct autofs_wait_queue *wq; - struct qstr qstr; - char *name; - int status, ret, type; - pid_t pid; - pid_t tgid; - - /* In catatonic mode, we don't wait for nobody */ - if (sbi->catatonic) - return -ENOENT; - - /* - * Try translating pids to the namespace of the daemon. - * - * Zero means failure: we are in an unrelated pid namespace.
- */ - pid = task_pid_nr_ns(current, ns_of_pid(sbi->oz_pgrp)); - tgid = task_tgid_nr_ns(current, ns_of_pid(sbi->oz_pgrp)); - if (pid == 0 || tgid == 0) - return -ENOENT; - - if (d_really_is_negative(dentry)) { - /* - * A wait for a negative dentry is invalid for certain - * cases. A direct or offset mount "always" has its mount - * point directory created and so the request dentry must - * be positive or the map key doesn't exist. The situation - * is very similar for indirect mounts except only dentries - * in the root of the autofs file system may be negative. - */ - if (autofs_type_trigger(sbi->type)) - return -ENOENT; - else if (!IS_ROOT(dentry->d_parent)) - return -ENOENT; - } - - name = kmalloc(NAME_MAX + 1, GFP_KERNEL); - if (!name) - return -ENOMEM; - - /* If this is a direct mount request create a dummy name */ - if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type)) - qstr.len = sprintf(name, "%p", dentry); - else { - qstr.len = autofs_getpath(sbi, dentry, &name); - if (!qstr.len) { - kfree(name); - return -ENOENT; - } - } - qstr.name = name; - qstr.hash = full_name_hash(dentry, name, qstr.len); - - if (mutex_lock_interruptible(&sbi->wq_mutex)) { - kfree(qstr.name); - return -EINTR; - } - - ret = validate_request(&wq, sbi, &qstr, path, notify); - if (ret <= 0) { - if (ret != -EINTR) - mutex_unlock(&sbi->wq_mutex); - kfree(qstr.name); - return ret; - } - - if (!wq) { - /* Create a new wait queue */ - wq = kmalloc(sizeof(struct autofs_wait_queue), GFP_KERNEL); - if (!wq) { - kfree(qstr.name); - mutex_unlock(&sbi->wq_mutex); - return -ENOMEM; - } - - wq->wait_queue_token = autofs_next_wait_queue; - if (++autofs_next_wait_queue == 0) - autofs_next_wait_queue = 1; - wq->next = sbi->queues; - sbi->queues = wq; - init_waitqueue_head(&wq->queue); - memcpy(&wq->name, &qstr, sizeof(struct qstr)); - wq->dev = autofs_get_dev(sbi); - wq->ino = autofs_get_ino(sbi); - wq->uid = current_uid(); - wq->gid = current_gid(); - wq->pid = pid; - wq->tgid = tgid; - wq->status = -EINTR; /* Status return if interrupted */ - wq->wait_ctr = 2; - - if (sbi->version < 5) { - if (notify == NFY_MOUNT) - type = autofs_ptype_missing; - else - type = autofs_ptype_expire_multi; - } else { - if (notify == NFY_MOUNT) - type = autofs_type_trigger(sbi->type) ? - autofs_ptype_missing_direct : - autofs_ptype_missing_indirect; - else - type = autofs_type_trigger(sbi->type) ? - autofs_ptype_expire_direct : - autofs_ptype_expire_indirect; - } - - pr_debug("new wait id = 0x%08lx, name = %.*s, nfy=%d\n", - (unsigned long) wq->wait_queue_token, wq->name.len, - wq->name.name, notify); - - /* - * autofs_notify_daemon() may block; it will unlock ->wq_mutex - */ - autofs_notify_daemon(sbi, wq, type); - } else { - wq->wait_ctr++; - pr_debug("existing wait id = 0x%08lx, name = %.*s, nfy=%d\n", - (unsigned long) wq->wait_queue_token, wq->name.len, - wq->name.name, notify); - mutex_unlock(&sbi->wq_mutex); - kfree(qstr.name); - } - - /* - * wq->name.name is NULL iff the lock is already released - * or the mount has been made catatonic. - */ - wait_event_killable(wq->queue, wq->name.name == NULL); - status = wq->status; - - /* - * For direct and offset mounts we need to track the requester's - * uid and gid in the dentry info struct. This is so it can be - * supplied, on request, by the misc device ioctl interface. - * This is needed during daemon restart when reconnecting - * to existing, active, autofs mounts. The uid and gid (and - * related string values) may be used for macro substitution - * in autofs mount maps.
- */ - if (!status) { - struct autofs_info *ino; - struct dentry *de = NULL; - - /* direct mount or browsable map */ - ino = autofs_dentry_ino(dentry); - if (!ino) { - /* If not lookup actual dentry used */ - de = d_lookup(dentry->d_parent, &dentry->d_name); - if (de) - ino = autofs_dentry_ino(de); - } - - /* Set mount requester */ - if (ino) { - spin_lock(&sbi->fs_lock); - ino->uid = wq->uid; - ino->gid = wq->gid; - spin_unlock(&sbi->fs_lock); - } - - if (de) - dput(de); - } - - /* Are we the last process to need status? */ - mutex_lock(&sbi->wq_mutex); - if (!--wq->wait_ctr) - kfree(wq); - mutex_unlock(&sbi->wq_mutex); - - return status; -} - - -int autofs_wait_release(struct autofs_sb_info *sbi, - autofs_wqt_t wait_queue_token, int status) -{ - struct autofs_wait_queue *wq, **wql; - - mutex_lock(&sbi->wq_mutex); - for (wql = &sbi->queues; (wq = *wql) != NULL; wql = &wq->next) { - if (wq->wait_queue_token == wait_queue_token) - break; - } - - if (!wq) { - mutex_unlock(&sbi->wq_mutex); - return -EINVAL; - } - - *wql = wq->next; /* Unlink from chain */ - kfree(wq->name.name); - wq->name.name = NULL; /* Do not wait on this queue */ - wq->status = status; - wake_up(&wq->queue); - if (!--wq->wait_ctr) - kfree(wq); - mutex_unlock(&sbi->wq_mutex); - - return 0; -} From 2a3ae0a1212dc5a0f40d79e05b9de3846663e973 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Thu, 7 Jun 2018 17:11:31 -0700 Subject: [PATCH 112/118] autofs: create autofs Kconfig and Makefile Create Makefile and Kconfig for autofs module. [raven@themaw.net: make autofs4 Kconfig depend on AUTOFS_FS] Link: http://lkml.kernel.org/r/152687649097.8263.7046086367407522029.stgit@pluto.themaw.net Link: http://lkml.kernel.org/r/152626705591.28589.356365986974038383.stgit@pluto.themaw.net Signed-off-by: Ian Kent Tested-by: Randy Dunlap Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/Kconfig | 1 + fs/Makefile | 1 + fs/autofs/Kconfig | 20 ++++++++++++++++++++ fs/autofs/Makefile | 7 +++++++ fs/autofs4/Kconfig | 8 ++++++++ 5 files changed, 37 insertions(+) create mode 100644 fs/autofs/Kconfig create mode 100644 fs/autofs/Makefile diff --git a/fs/Kconfig b/fs/Kconfig index 51f78a28072a..40cdae75e3b4 100644 --- a/fs/Kconfig +++ b/fs/Kconfig @@ -108,6 +108,7 @@ source "fs/notify/Kconfig" source "fs/quota/Kconfig" +source "fs/autofs/Kconfig" source "fs/autofs4/Kconfig" source "fs/fuse/Kconfig" source "fs/overlayfs/Kconfig" diff --git a/fs/Makefile b/fs/Makefile index c9375fd2c8c4..2e005525cc19 100644 --- a/fs/Makefile +++ b/fs/Makefile @@ -102,6 +102,7 @@ obj-$(CONFIG_AFFS_FS) += affs/ obj-$(CONFIG_ROMFS_FS) += romfs/ obj-$(CONFIG_QNX4FS_FS) += qnx4/ obj-$(CONFIG_QNX6FS_FS) += qnx6/ +obj-$(CONFIG_AUTOFS_FS) += autofs/ obj-$(CONFIG_AUTOFS4_FS) += autofs4/ obj-$(CONFIG_ADFS_FS) += adfs/ obj-$(CONFIG_FUSE_FS) += fuse/ diff --git a/fs/autofs/Kconfig b/fs/autofs/Kconfig new file mode 100644 index 000000000000..6a2064eb3b27 --- /dev/null +++ b/fs/autofs/Kconfig @@ -0,0 +1,20 @@ +config AUTOFS_FS + tristate "Kernel automounter support (supports v3, v4 and v5)" + default n + help + The automounter is a tool to automatically mount remote file systems + on demand. This implementation is partially kernel-based to reduce + overhead in the already-mounted case; this is unlike the BSD + automounter (amd), which is a pure user space daemon. + + To use the automounter you need the user-space tools from + ; you also want + to answer Y to "NFS file system support", below. 
+ + To compile this support as a module, choose M here: the module will be + called autofs. + + If you are not a part of a fairly large, distributed network or + don't have a laptop which needs to dynamically reconfigure to the + local network, you probably do not need an automounter, and can say + N here. diff --git a/fs/autofs/Makefile b/fs/autofs/Makefile new file mode 100644 index 000000000000..43fedde15c26 --- /dev/null +++ b/fs/autofs/Makefile @@ -0,0 +1,7 @@ +# +# Makefile for the linux autofs-filesystem routines. +# + +obj-$(CONFIG_AUTOFS_FS) += autofs.o + +autofs-objs := init.o inode.o root.o symlink.o waitq.o expire.o dev-ioctl.o diff --git a/fs/autofs4/Kconfig b/fs/autofs4/Kconfig index 53bc592a250d..2c2fdf989f90 100644 --- a/fs/autofs4/Kconfig +++ b/fs/autofs4/Kconfig @@ -1,6 +1,7 @@ config AUTOFS4_FS tristate "Kernel automounter version 4 support (also supports v3 and v5)" default n + depends on AUTOFS_FS = n help The automounter is a tool to automatically mount remote file systems on demand. This implementation is partially kernel-based to reduce @@ -30,3 +31,10 @@ config AUTOFS4_FS - any "alias autofs autofs4" will need to be removed. Please configure AUTOFS_FS instead of AUTOFS4_FS from now on. + + NOTE: Since the modules autofs and autofs4 use the same file system + type name of "autofs", only one can be built. The "depends" + above will result in AUTOFS4_FS not appearing in .config for + any setting of AUTOFS_FS other than n, and AUTOFS4_FS will + appear under the AUTOFS_FS entry otherwise, which is intended + to draw attention to the module rename change. From 9005d833385508d647680bfac90f9b2cf2ac5838 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Thu, 7 Jun 2018 17:11:35 -0700 Subject: [PATCH 113/118] autofs: rename autofs documentation files There are two files in Documentation/filesystems that should now use autofs rather than autofs4 in their names. Link: http://lkml.kernel.org/r/152626707957.28589.3325300375892913999.stgit@pluto.themaw.net Signed-off-by: Ian Kent Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- .../{autofs4-mount-control.txt => autofs-mount-control.txt} | 1 - Documentation/filesystems/{autofs4.txt => autofs.txt} | 0 2 files changed, 1 deletion(-) rename Documentation/filesystems/{autofs4-mount-control.txt => autofs-mount-control.txt} (99%) rename Documentation/filesystems/{autofs4.txt => autofs.txt} (100%) diff --git a/Documentation/filesystems/autofs4-mount-control.txt b/Documentation/filesystems/autofs-mount-control.txt similarity index 99% rename from Documentation/filesystems/autofs4-mount-control.txt rename to Documentation/filesystems/autofs-mount-control.txt index e5177cb31a04..52c1b6cdfc91 100644 --- a/Documentation/filesystems/autofs4-mount-control.txt +++ b/Documentation/filesystems/autofs-mount-control.txt @@ -404,4 +404,3 @@ type is also given we are looking for a particular autofs mount and if a match isn't found a fail is returned. If the located path is the root of a mount 1 is returned along with the super magic of the mount or 0 otherwise.
- diff --git a/Documentation/filesystems/autofs4.txt b/Documentation/filesystems/autofs.txt similarity index 100% rename from Documentation/filesystems/autofs4.txt rename to Documentation/filesystems/autofs.txt From b6bb226a72f0b95f0ce3edfc108776a274d21a98 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Thu, 7 Jun 2018 17:11:38 -0700 Subject: [PATCH 114/118] autofs: use autofs instead of autofs4 in documentation Finally remove autofs4 references in the filesystems documentation. Link: http://lkml.kernel.org/r/152626709055.28589.416082809460051475.stgit@pluto.themaw.net Signed-off-by: Ian Kent Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- Documentation/filesystems/00-INDEX | 4 ++-- Documentation/filesystems/autofs-mount-control.txt | 8 ++++---- Documentation/filesystems/autofs.txt | 10 +++++----- Documentation/filesystems/automount-support.txt | 2 +- Documentation/filesystems/path-lookup.md | 2 +- 5 files changed, 13 insertions(+), 13 deletions(-) diff --git a/Documentation/filesystems/00-INDEX b/Documentation/filesystems/00-INDEX index b7bd6c9009cc..a8bd4af7fbce 100644 --- a/Documentation/filesystems/00-INDEX +++ b/Documentation/filesystems/00-INDEX @@ -10,8 +10,8 @@ afs.txt - info and examples for the distributed AFS (Andrew File System) fs. affs.txt - info and mount options for the Amiga Fast File System. -autofs4-mount-control.txt - - info on device control operations for autofs4 module. +autofs-mount-control.txt + - info on device control operations for autofs module. automount-support.txt - information about filesystem automount support. befs.txt diff --git a/Documentation/filesystems/autofs-mount-control.txt b/Documentation/filesystems/autofs-mount-control.txt index 52c1b6cdfc91..45edad6933cc 100644 --- a/Documentation/filesystems/autofs-mount-control.txt +++ b/Documentation/filesystems/autofs-mount-control.txt @@ -1,5 +1,5 @@ -Miscellaneous Device control operations for the autofs4 kernel module +Miscellaneous Device control operations for the autofs kernel module ==================================================================== The problem @@ -164,7 +164,7 @@ possibility for future development due to the requirements of the message bus architecture. -autofs4 Miscellaneous Device mount control interface +autofs Miscellaneous Device mount control interface ==================================================== The control interface is opening a device node, typically /dev/autofs. @@ -244,7 +244,7 @@ The device node ioctl operations implemented by this interface are: AUTOFS_DEV_IOCTL_VERSION ------------------------ -Get the major and minor version of the autofs4 device ioctl kernel module +Get the major and minor version of the autofs device ioctl kernel module implementation. It requires an initialized struct autofs_dev_ioctl as an input parameter and sets the version information in the passed in structure. It returns 0 on success or the error -EINVAL if a version mismatch is @@ -254,7 +254,7 @@ detected. AUTOFS_DEV_IOCTL_PROTOVER_CMD and AUTOFS_DEV_IOCTL_PROTOSUBVER_CMD ------------------------------------------------------------------ -Get the major and minor version of the autofs4 protocol version understood +Get the major and minor version of the autofs protocol version understood by loaded module. 
This call requires an initialized struct autofs_dev_ioctl with the ioctlfd field set to a valid autofs mount point descriptor and sets the requested version number in version field of struct args_protover diff --git a/Documentation/filesystems/autofs.txt b/Documentation/filesystems/autofs.txt index f10dd590f69f..373ad25852d3 100644 --- a/Documentation/filesystems/autofs.txt +++ b/Documentation/filesystems/autofs.txt @@ -30,15 +30,15 @@ key advantages: Context ------- -The "autofs4" filesystem module is only one part of an autofs system. +The "autofs" filesystem module is only one part of an autofs system. There also needs to be a user-space program which looks up names and mounts filesystems. This will often be the "automount" program, -though other tools including "systemd" can make use of "autofs4". +though other tools including "systemd" can make use of "autofs". This document describes only the kernel module and the interactions required with any user-space program. Subsequent text refers to this as the "automount daemon" or simply "the daemon". -"autofs4" is a Linux kernel module with provides the "autofs" +"autofs" is a Linux kernel module which provides the "autofs" filesystem type. Several "autofs" filesystems can be mounted and they can each be managed separately, or all managed by the same daemon. @@ -215,7 +215,7 @@ of expiry. The VFS also supports "expiry" of mounts using the MNT_EXPIRE flag to the `umount` system call. Unmounting with MNT_EXPIRE will fail unless a previous attempt had been made, and the filesystem has been inactive -and untouched since that previous attempt. autofs4 does not depend on +and untouched since that previous attempt. autofs does not depend on this but has its own internal tracking of whether filesystems were recently used. This allows individual names in the autofs directory to expire separately. @@ -415,7 +415,7 @@ which can be used to communicate directly with the autofs filesystem. It requires CAP_SYS_ADMIN for access. The `ioctl`s that can be used on this device are described in a separate -document `autofs4-mount-control.txt`, and are summarized briefly here. +document `autofs-mount-control.txt`, and are summarized briefly here. Each ioctl is passed a pointer to an `autofs_dev_ioctl` structure: struct autofs_dev_ioctl { diff --git a/Documentation/filesystems/automount-support.txt b/Documentation/filesystems/automount-support.txt index 7eb762eb3136..b0afd3d55eaf 100644 --- a/Documentation/filesystems/automount-support.txt +++ b/Documentation/filesystems/automount-support.txt @@ -9,7 +9,7 @@ also be requested by userspace. IN-KERNEL AUTOMOUNTING ====================== -See section "Mount Traps" of Documentation/filesystems/autofs4.txt +See section "Mount Traps" of Documentation/filesystems/autofs.txt Then from userspace, you can just do something like: diff --git a/Documentation/filesystems/path-lookup.md b/Documentation/filesystems/path-lookup.md index 1933ef734e63..e2edd45c4bc0 100644 --- a/Documentation/filesystems/path-lookup.md +++ b/Documentation/filesystems/path-lookup.md @@ -460,7 +460,7 @@ this retry process in the next article. Automount points are locations in the filesystem where an attempt to lookup a name can trigger changes to how that lookup should be handled, in particular by mounting a filesystem there. These are -covered in greater detail in autofs4.txt in the Linux documentation +covered in greater detail in autofs.txt in the Linux documentation tree, but a few notes specifically related to path lookup are in order here.
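Before moving on to the remaining patches, it may help to see how the device node described in the two documents above is driven from user space. The following is a minimal sketch, not part of this series: it assumes a kernel exposing /dev/autofs and the installed uapi header <linux/auto_dev-ioctl.h>, and issues AUTOFS_DEV_IOCTL_VERSION exactly as autofs-mount-control.txt describes, with a fully initialized struct autofs_dev_ioctl.

/*
 * Illustrative user-space sketch (not from this patch series).
 * Assumes /dev/autofs exists and <linux/auto_dev-ioctl.h> is installed.
 */
#include <stdio.h>
#include <string.h>
#include <fcntl.h>
#include <unistd.h>
#include <sys/ioctl.h>
#include <linux/auto_dev-ioctl.h>

int main(void)
{
	struct autofs_dev_ioctl param;
	int devfd;

	devfd = open("/dev/autofs", O_RDONLY);
	if (devfd == -1) {
		perror("open /dev/autofs");
		return 1;
	}

	/* The interface rejects requests without an initialized header. */
	memset(&param, 0, sizeof(param));
	param.ver_major = AUTOFS_DEV_IOCTL_VERSION_MAJOR;
	param.ver_minor = AUTOFS_DEV_IOCTL_VERSION_MINOR;
	param.size = AUTOFS_DEV_IOCTL_SIZE;
	param.ioctlfd = -1;

	if (ioctl(devfd, AUTOFS_DEV_IOCTL_VERSION, &param) == -1) {
		perror("AUTOFS_DEV_IOCTL_VERSION");
		close(devfd);
		return 1;
	}

	/* On success the kernel writes its own version back into param. */
	printf("autofs device ioctl version %u.%u\n",
	       param.ver_major, param.ver_minor);
	close(devfd);
	return 0;
}

Every other ioctl on /dev/autofs follows the same pattern; only the command number and the union members consulted differ.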
From 0ab88bacc41df707750d8edf8ce7377d05403790 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Thu, 7 Jun 2018 17:11:42 -0700 Subject: [PATCH 115/118] autofs: update MAINTAINERS entry for autofs Update the autofs entry in MAINTAINERS to reflect the rename of autofs4 to autofs. Link: http://lkml.kernel.org/r/152626709611.28589.456596640024354223.stgit@pluto.themaw.net Signed-off-by: Ian Kent Cc: Al Viro Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- MAINTAINERS | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/MAINTAINERS b/MAINTAINERS index dc241b04d1bd..fff6439819c8 100644 --- a/MAINTAINERS +++ b/MAINTAINERS @@ -7727,11 +7727,11 @@ W: https://linuxtv.org S: Maintained F: drivers/media/radio/radio-keene* -KERNEL AUTOMOUNTER v4 (AUTOFS4) +KERNEL AUTOMOUNTER M: Ian Kent L: autofs@vger.kernel.org S: Maintained -F: fs/autofs4/ +F: fs/autofs/ KERNEL BUILD + files below scripts/ (unless maintained elsewhere) M: Masahiro Yamada From 8240b716e200b3da215ea5a22a49f76f2c846a78 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Thu, 7 Jun 2018 17:11:45 -0700 Subject: [PATCH 116/118] autofs: comment on selinux changes needed for module autoload Due to the autofs4 module using a file system type name of autofs, different from the module's containing directory name, autoload did not function properly. To work around this, kernel configurations have often elected to build the module into the kernel. This can result in selinux policies that prohibit autoloading of the autofs module, which need to be changed. Add a comment about this to the "possible changes" section of the autofs4 module help. Link: http://lkml.kernel.org/r/152686474171.6155.1239659539983577463.stgit@pluto.themaw.net Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs4/Kconfig | 6 ++++++ 1 file changed, 6 insertions(+) diff --git a/fs/autofs4/Kconfig b/fs/autofs4/Kconfig index 2c2fdf989f90..99fda4d6da25 100644 --- a/fs/autofs4/Kconfig +++ b/fs/autofs4/Kconfig @@ -29,6 +29,12 @@ config AUTOFS4_FS and the module name are the same as the file system name there is no need to manually load module. - any "alias autofs autofs4" will need to be removed. + - due to the autofs4 module directory name not being the same as + its file system name, autoloading didn't work properly. Because + of this, kernel configurations would often build the module into + the kernel. This may have resulted in selinux policies that will + prevent the autofs module from autoloading and will need to be + updated. Please configure AUTOFS_FS instead of AUTOFS4_FS from now on. From 6471e93863d6494e74e99ae2bdffac9c46571f81 Mon Sep 17 00:00:00 2001 From: Ian Kent Date: Thu, 7 Jun 2018 17:11:48 -0700 Subject: [PATCH 117/118] autofs: clean up includes Remove includes that aren't needed from autofs (and fs/compat_ioctl.c).
Link: http://lkml.kernel.org/r/152635085258.5968.9743527195522188148.stgit@pluto.themaw.net Signed-off-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs/autofs_i.h | 2 +- fs/autofs/dev-ioctl.c | 13 ------------- fs/autofs/inode.c | 6 +----- fs/autofs/root.c | 6 ------ fs/autofs/waitq.c | 4 ---- fs/compat_ioctl.c | 1 - 6 files changed, 2 insertions(+), 30 deletions(-) diff --git a/fs/autofs/autofs_i.h b/fs/autofs/autofs_i.h index 9110b66c7ef1..9400a9f6318a 100644 --- a/fs/autofs/autofs_i.h +++ b/fs/autofs/autofs_i.h @@ -25,7 +25,7 @@ #include #include #include -#include +#include /* This is the range of ioctl() numbers we claim as ours */ #define AUTOFS_IOC_FIRST AUTOFS_IOC_READY diff --git a/fs/autofs/dev-ioctl.c b/fs/autofs/dev-ioctl.c index a2281ab2b957..ea4ca1445ab7 100644 --- a/fs/autofs/dev-ioctl.c +++ b/fs/autofs/dev-ioctl.c @@ -7,23 +7,10 @@ * option, any later version, incorporated herein by reference. */ -#include -#include #include -#include -#include -#include -#include -#include -#include -#include -#include #include #include #include -#include -#include -#include #include "autofs_i.h" diff --git a/fs/autofs/inode.c b/fs/autofs/inode.c index 6262819ede45..b51980fc274e 100644 --- a/fs/autofs/inode.c +++ b/fs/autofs/inode.c @@ -7,16 +7,12 @@ * option, any later version, incorporated herein by reference. */ -#include -#include -#include #include #include #include -#include #include + #include "autofs_i.h" -#include struct autofs_info *autofs_new_ino(struct autofs_sb_info *sbi) { diff --git a/fs/autofs/root.c b/fs/autofs/root.c index a4b36e44f73c..a3d414150578 100644 --- a/fs/autofs/root.c +++ b/fs/autofs/root.c @@ -9,13 +9,7 @@ */ #include -#include -#include -#include -#include -#include #include -#include #include "autofs_i.h" diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c index 8a566fa66afe..8c858126c751 100644 --- a/fs/autofs/waitq.c +++ b/fs/autofs/waitq.c @@ -7,11 +7,7 @@ * option, any later version, incorporated herein by reference. */ -#include -#include -#include #include -#include #include "autofs_i.h" /* We make this a static variable rather than a part of the superblock; it diff --git a/fs/compat_ioctl.c b/fs/compat_ioctl.c index b3e1768b636e..9907475b4226 100644 --- a/fs/compat_ioctl.c +++ b/fs/compat_ioctl.c @@ -38,7 +38,6 @@ #include #include #include -#include #include #include #include From 016e92da037e0b43dd5e5848c19b0b9749506963 Mon Sep 17 00:00:00 2001 From: Dan Carpenter Date: Thu, 7 Jun 2018 17:11:52 -0700 Subject: [PATCH 118/118] autofs: small cleanup in autofs_getpath() We don't set "*name" so it's slightly nicer to just pass "name" instead of "&name". Link: http://lkml.kernel.org/r/20180531064736.lnisb55eajwjynvk@kili.mountain Signed-off-by: Dan Carpenter Acked-by: "Eric W. 
Biederman" Acked-by: Ian Kent Signed-off-by: Andrew Morton Signed-off-by: Linus Torvalds --- fs/autofs/waitq.c | 6 +++--- 1 file changed, 3 insertions(+), 3 deletions(-) diff --git a/fs/autofs/waitq.c b/fs/autofs/waitq.c index 8c858126c751..f6385c6ef0a5 100644 --- a/fs/autofs/waitq.c +++ b/fs/autofs/waitq.c @@ -179,7 +179,7 @@ static void autofs_notify_daemon(struct autofs_sb_info *sbi, } static int autofs_getpath(struct autofs_sb_info *sbi, - struct dentry *dentry, char **name) + struct dentry *dentry, char *name) { struct dentry *root = sbi->sb->s_root; struct dentry *tmp; @@ -189,7 +189,7 @@ static int autofs_getpath(struct autofs_sb_info *sbi, unsigned seq; rename_retry: - buf = *name; + buf = name; len = 0; seq = read_seqbegin(&rename_lock); @@ -395,7 +395,7 @@ int autofs_wait(struct autofs_sb_info *sbi, if (IS_ROOT(dentry) && autofs_type_trigger(sbi->type)) qstr.len = sprintf(name, "%p", dentry); else { - qstr.len = autofs_getpath(sbi, dentry, &name); + qstr.len = autofs_getpath(sbi, dentry, name); if (!qstr.len) { kfree(name); return -ENOENT;