mm: fix locking order in mm_take_all_locks()
Dmitry Vyukov has reported[1] possible deadlock (triggered by his syzkaller fuzzer): Possible unsafe locking scenario: CPU0 CPU1 ---- ---- lock(&hugetlbfs_i_mmap_rwsem_key); lock(&mapping->i_mmap_rwsem); lock(&hugetlbfs_i_mmap_rwsem_key); lock(&mapping->i_mmap_rwsem); Both traces points to mm_take_all_locks() as a source of the problem. It doesn't take care about ordering or hugetlbfs_i_mmap_rwsem_key (aka mapping->i_mmap_rwsem for hugetlb mapping) vs. i_mmap_rwsem. huge_pmd_share() does memory allocation under hugetlbfs_i_mmap_rwsem_key and allocator can take i_mmap_rwsem if it hit reclaim. So we need to take i_mmap_rwsem from all hugetlb VMAs before taking i_mmap_rwsem from rest of VMAs. The patch also documents locking order for hugetlbfs_i_mmap_rwsem_key. [1] http://lkml.kernel.org/r/CACT4Y+Zu95tBs-0EvdiAKzUOsb4tczRRfCRTpLr4bg_OP9HuVg@mail.gmail.com Signed-off-by: Kirill A. Shutemov <kirill.shutemov@linux.intel.com> Reported-by: Dmitry Vyukov <dvyukov@google.com> Reviewed-by: Michal Hocko <mhocko@suse.com> Cc: Peter Zijlstra <peterz@infradead.org> Cc: Andrea Arcangeli <aarcange@redhat.com> Signed-off-by: Andrew Morton <akpm@linux-foundation.org> Signed-off-by: Linus Torvalds <torvalds@linux-foundation.org>
This commit is contained in:
parent
d645fc0eab
commit
88f306b68c
|
@ -708,7 +708,7 @@ static struct inode *hugetlbfs_get_root(struct super_block *sb,
|
||||||
/*
|
/*
|
||||||
* Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
|
* Hugetlbfs is not reclaimable; therefore its i_mmap_rwsem will never
|
||||||
* be taken from reclaim -- unlike regular filesystems. This needs an
|
* be taken from reclaim -- unlike regular filesystems. This needs an
|
||||||
* annotation because huge_pmd_share() does an allocation under
|
* annotation because huge_pmd_share() does an allocation under hugetlb's
|
||||||
* i_mmap_rwsem.
|
* i_mmap_rwsem.
|
||||||
*/
|
*/
|
||||||
static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
|
static struct lock_class_key hugetlbfs_i_mmap_rwsem_key;
|
||||||
|
|
25
mm/mmap.c
25
mm/mmap.c
|
@ -3184,10 +3184,16 @@ static void vm_lock_mapping(struct mm_struct *mm, struct address_space *mapping)
|
||||||
* mapping->flags avoid to take the same lock twice, if more than one
|
* mapping->flags avoid to take the same lock twice, if more than one
|
||||||
* vma in this mm is backed by the same anon_vma or address_space.
|
* vma in this mm is backed by the same anon_vma or address_space.
|
||||||
*
|
*
|
||||||
* We can take all the locks in random order because the VM code
|
* We take locks in following order, accordingly to comment at beginning
|
||||||
* taking i_mmap_rwsem or anon_vma->rwsem outside the mmap_sem never
|
* of mm/rmap.c:
|
||||||
* takes more than one of them in a row. Secondly we're protected
|
* - all hugetlbfs_i_mmap_rwsem_key locks (aka mapping->i_mmap_rwsem for
|
||||||
* against a concurrent mm_take_all_locks() by the mm_all_locks_mutex.
|
* hugetlb mapping);
|
||||||
|
* - all i_mmap_rwsem locks;
|
||||||
|
* - all anon_vma->rwseml
|
||||||
|
*
|
||||||
|
* We can take all locks within these types randomly because the VM code
|
||||||
|
* doesn't nest them and we protected from parallel mm_take_all_locks() by
|
||||||
|
* mm_all_locks_mutex.
|
||||||
*
|
*
|
||||||
* mm_take_all_locks() and mm_drop_all_locks are expensive operations
|
* mm_take_all_locks() and mm_drop_all_locks are expensive operations
|
||||||
* that may have to take thousand of locks.
|
* that may have to take thousand of locks.
|
||||||
|
@ -3206,7 +3212,16 @@ int mm_take_all_locks(struct mm_struct *mm)
|
||||||
for (vma = mm->mmap; vma; vma = vma->vm_next) {
|
for (vma = mm->mmap; vma; vma = vma->vm_next) {
|
||||||
if (signal_pending(current))
|
if (signal_pending(current))
|
||||||
goto out_unlock;
|
goto out_unlock;
|
||||||
if (vma->vm_file && vma->vm_file->f_mapping)
|
if (vma->vm_file && vma->vm_file->f_mapping &&
|
||||||
|
is_vm_hugetlb_page(vma))
|
||||||
|
vm_lock_mapping(mm, vma->vm_file->f_mapping);
|
||||||
|
}
|
||||||
|
|
||||||
|
for (vma = mm->mmap; vma; vma = vma->vm_next) {
|
||||||
|
if (signal_pending(current))
|
||||||
|
goto out_unlock;
|
||||||
|
if (vma->vm_file && vma->vm_file->f_mapping &&
|
||||||
|
!is_vm_hugetlb_page(vma))
|
||||||
vm_lock_mapping(mm, vma->vm_file->f_mapping);
|
vm_lock_mapping(mm, vma->vm_file->f_mapping);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
31
mm/rmap.c
31
mm/rmap.c
|
@ -23,21 +23,22 @@
|
||||||
* inode->i_mutex (while writing or truncating, not reading or faulting)
|
* inode->i_mutex (while writing or truncating, not reading or faulting)
|
||||||
* mm->mmap_sem
|
* mm->mmap_sem
|
||||||
* page->flags PG_locked (lock_page)
|
* page->flags PG_locked (lock_page)
|
||||||
* mapping->i_mmap_rwsem
|
* hugetlbfs_i_mmap_rwsem_key (in huge_pmd_share)
|
||||||
* anon_vma->rwsem
|
* mapping->i_mmap_rwsem
|
||||||
* mm->page_table_lock or pte_lock
|
* anon_vma->rwsem
|
||||||
* zone->lru_lock (in mark_page_accessed, isolate_lru_page)
|
* mm->page_table_lock or pte_lock
|
||||||
* swap_lock (in swap_duplicate, swap_info_get)
|
* zone->lru_lock (in mark_page_accessed, isolate_lru_page)
|
||||||
* mmlist_lock (in mmput, drain_mmlist and others)
|
* swap_lock (in swap_duplicate, swap_info_get)
|
||||||
* mapping->private_lock (in __set_page_dirty_buffers)
|
* mmlist_lock (in mmput, drain_mmlist and others)
|
||||||
* mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
|
* mapping->private_lock (in __set_page_dirty_buffers)
|
||||||
* mapping->tree_lock (widely used)
|
* mem_cgroup_{begin,end}_page_stat (memcg->move_lock)
|
||||||
* inode->i_lock (in set_page_dirty's __mark_inode_dirty)
|
* mapping->tree_lock (widely used)
|
||||||
* bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
|
* inode->i_lock (in set_page_dirty's __mark_inode_dirty)
|
||||||
* sb_lock (within inode_lock in fs/fs-writeback.c)
|
* bdi.wb->list_lock (in set_page_dirty's __mark_inode_dirty)
|
||||||
* mapping->tree_lock (widely used, in set_page_dirty,
|
* sb_lock (within inode_lock in fs/fs-writeback.c)
|
||||||
* in arch-dependent flush_dcache_mmap_lock,
|
* mapping->tree_lock (widely used, in set_page_dirty,
|
||||||
* within bdi.wb->list_lock in __sync_single_inode)
|
* in arch-dependent flush_dcache_mmap_lock,
|
||||||
|
* within bdi.wb->list_lock in __sync_single_inode)
|
||||||
*
|
*
|
||||||
* anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon)
|
* anon_vma->rwsem,mapping->i_mutex (memory_failure, collect_procs_anon)
|
||||||
* ->tasklist_lock
|
* ->tasklist_lock
|
||||||
|
|
Loading…
Reference in New Issue