Merge branch 'akpm' (patches from Andrew)
Merge misc fixes from Andrew Morton:
"15 fixes"

[ This does not merge the "fortify: use WARN instead of BUG for now" patch, which needs a bit of extra work to build cleanly with all configurations. Arnd is on it. - Linus ]

* emailed patches from Andrew Morton <akpm@linux-foundation.org>:
- ocfs2: don't clear SGID when inheriting ACLs
- mm: allow page_cache_get_speculative in interrupt context
- userfaultfd: non-cooperative: flush event_wqh at release time
- ipc: add missing container_of()s for randstruct
- cpuset: fix a deadlock due to incomplete patching of cpusets_enabled()
- userfaultfd_zeropage: return -ENOSPC in case mm has gone
- mm: take memory hotplug lock within numa_zonelist_order_handler()
- mm/page_io.c: fix oops during block io poll in swapin path
- zram: do not free pool->size_class
- kthread: fix documentation build warning
- kasan: avoid -Wmaybe-uninitialized warning
- userfaultfd: non-cooperative: notify about unmap of destination during mremap
- mm, mprotect: flush TLB if potentially racing with a parallel reclaim leaving stale TLB entries
- pid: kill pidhash_size in pidhash_init()
- mm/hugetlb.c: __get_user_pages ignores certain follow_hugetlb_page errors
This commit is contained in:
commit
995d03ae26
|
@ -240,18 +240,6 @@ int ocfs2_set_acl(handle_t *handle,
|
|||
switch (type) {
|
||||
case ACL_TYPE_ACCESS:
|
||||
name_index = OCFS2_XATTR_INDEX_POSIX_ACL_ACCESS;
|
||||
if (acl) {
|
||||
umode_t mode;
|
||||
|
||||
ret = posix_acl_update_mode(inode, &mode, &acl);
|
||||
if (ret)
|
||||
return ret;
|
||||
|
||||
ret = ocfs2_acl_set_mode(inode, di_bh,
|
||||
handle, mode);
|
||||
if (ret)
|
||||
return ret;
|
||||
}
|
||||
break;
|
||||
case ACL_TYPE_DEFAULT:
|
||||
name_index = OCFS2_XATTR_INDEX_POSIX_ACL_DEFAULT;
|
||||
|
@ -289,7 +277,19 @@ int ocfs2_iop_set_acl(struct inode *inode, struct posix_acl *acl, int type)
|
|||
had_lock = ocfs2_inode_lock_tracker(inode, &bh, 1, &oh);
|
||||
if (had_lock < 0)
|
||||
return had_lock;
|
||||
if (type == ACL_TYPE_ACCESS && acl) {
|
||||
umode_t mode;
|
||||
|
||||
status = posix_acl_update_mode(inode, &mode, &acl);
|
||||
if (status)
|
||||
goto unlock;
|
||||
|
||||
status = ocfs2_acl_set_mode(inode, bh, NULL, mode);
|
||||
if (status)
|
||||
goto unlock;
|
||||
}
|
||||
status = ocfs2_set_acl(NULL, inode, bh, type, acl, NULL, NULL);
|
||||
unlock:
|
||||
ocfs2_inode_unlock_tracker(inode, 1, &oh, had_lock);
|
||||
brelse(bh);
|
||||
return status;
|
||||
|
|
|
@ -854,6 +854,9 @@ wakeup:
|
|||
__wake_up_locked_key(&ctx->fault_wqh, TASK_NORMAL, &range);
|
||||
spin_unlock(&ctx->fault_pending_wqh.lock);
|
||||
|
||||
/* Flush pending events that may still wait on event_wqh */
|
||||
wake_up_all(&ctx->event_wqh);
|
||||
|
||||
wake_up_poll(&ctx->fd_wqh, POLLHUP);
|
||||
userfaultfd_ctx_put(ctx);
|
||||
return 0;
|
||||
|
@ -1643,6 +1646,8 @@ static int userfaultfd_zeropage(struct userfaultfd_ctx *ctx,
|
|||
ret = mfill_zeropage(ctx->mm, uffdio_zeropage.range.start,
|
||||
uffdio_zeropage.range.len);
|
||||
mmput(ctx->mm);
|
||||
} else {
|
||||
return -ENOSPC;
|
||||
}
|
||||
if (unlikely(put_user(ret, &user_uffdio_zeropage->zeropage)))
|
||||
return -EFAULT;
|
||||
|
|
|
@ -18,6 +18,19 @@
|
|||
|
||||
#ifdef CONFIG_CPUSETS
|
||||
|
||||
/*
|
||||
* Static branch rewrites can happen in an arbitrary order for a given
|
||||
* key. In code paths where we need to loop with read_mems_allowed_begin() and
|
||||
* read_mems_allowed_retry() to get a consistent view of mems_allowed, we need
|
||||
* to ensure that begin() always gets rewritten before retry() in the
|
||||
* disabled -> enabled transition. If not, then if local irqs are disabled
|
||||
* around the loop, we can deadlock since retry() would always be
|
||||
* comparing the latest value of the mems_allowed seqcount against 0 as
|
||||
* begin() still would see cpusets_enabled() as false. The enabled -> disabled
|
||||
* transition should happen in reverse order for the same reasons (want to stop
|
||||
* looking at real value of mems_allowed.sequence in retry() first).
|
||||
*/
|
||||
extern struct static_key_false cpusets_pre_enable_key;
|
||||
extern struct static_key_false cpusets_enabled_key;
|
||||
static inline bool cpusets_enabled(void)
|
||||
{
|
||||
|
@ -32,12 +45,14 @@ static inline int nr_cpusets(void)
|
|||
|
||||
static inline void cpuset_inc(void)
|
||||
{
|
||||
static_branch_inc(&cpusets_pre_enable_key);
|
||||
static_branch_inc(&cpusets_enabled_key);
|
||||
}
|
||||
|
||||
static inline void cpuset_dec(void)
|
||||
{
|
||||
static_branch_dec(&cpusets_enabled_key);
|
||||
static_branch_dec(&cpusets_pre_enable_key);
|
||||
}
|
||||
|
||||
extern int cpuset_init(void);
|
||||
|
@ -115,7 +130,7 @@ extern void cpuset_print_current_mems_allowed(void);
|
|||
*/
|
||||
static inline unsigned int read_mems_allowed_begin(void)
|
||||
{
|
||||
if (!cpusets_enabled())
|
||||
if (!static_branch_unlikely(&cpusets_pre_enable_key))
|
||||
return 0;
|
||||
|
||||
return read_seqcount_begin(&current->mems_allowed_seq);
|
||||
|
@ -129,7 +144,7 @@ static inline unsigned int read_mems_allowed_begin(void)
|
|||
*/
|
||||
static inline bool read_mems_allowed_retry(unsigned int seq)
|
||||
{
|
||||
if (!cpusets_enabled())
|
||||
if (!static_branch_unlikely(&cpusets_enabled_key))
|
||||
return false;
|
||||
|
||||
return read_seqcount_retry(&current->mems_allowed_seq, seq);
|
||||
|
|
|
@ -15,7 +15,7 @@ struct task_struct *kthread_create_on_node(int (*threadfn)(void *data),
|
|||
* @threadfn: the function to run in the thread
|
||||
* @data: data pointer for @threadfn()
|
||||
* @namefmt: printf-style format string for the thread name
|
||||
* @...: arguments for @namefmt.
|
||||
* @arg...: arguments for @namefmt.
|
||||
*
|
||||
* This macro will create a kthread on the current node, leaving it in
|
||||
* the stopped state. This is just a helper for kthread_create_on_node();
|
||||
|
|
|
@ -494,6 +494,10 @@ struct mm_struct {
|
|||
* PROT_NONE or PROT_NUMA mapped page.
|
||||
*/
|
||||
bool tlb_flush_pending;
|
||||
#endif
|
||||
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
|
||||
/* See flush_tlb_batched_pending() */
|
||||
bool tlb_flush_batched;
|
||||
#endif
|
||||
struct uprobes_state uprobes_state;
|
||||
#ifdef CONFIG_HUGETLB_PAGE
|
||||
|
|
|
@ -163,8 +163,6 @@ void release_pages(struct page **pages, int nr, bool cold);
|
|||
*/
|
||||
static inline int page_cache_get_speculative(struct page *page)
|
||||
{
|
||||
VM_BUG_ON(in_interrupt());
|
||||
|
||||
#ifdef CONFIG_TINY_RCU
|
||||
# ifdef CONFIG_PREEMPT_COUNT
|
||||
VM_BUG_ON(!in_atomic() && !irqs_disabled());
|
||||
|
|
|
@ -1034,7 +1034,8 @@ void msg_exit_ns(struct ipc_namespace *ns)
|
|||
static int sysvipc_msg_proc_show(struct seq_file *s, void *it)
|
||||
{
|
||||
struct user_namespace *user_ns = seq_user_ns(s);
|
||||
struct msg_queue *msq = it;
|
||||
struct kern_ipc_perm *ipcp = it;
|
||||
struct msg_queue *msq = container_of(ipcp, struct msg_queue, q_perm);
|
||||
|
||||
seq_printf(s,
|
||||
"%10d %10d %4o %10lu %10lu %5u %5u %5u %5u %5u %5u %10lu %10lu %10lu\n",
|
||||
|
|
|
@ -2179,7 +2179,8 @@ void exit_sem(struct task_struct *tsk)
|
|||
static int sysvipc_sem_proc_show(struct seq_file *s, void *it)
|
||||
{
|
||||
struct user_namespace *user_ns = seq_user_ns(s);
|
||||
struct sem_array *sma = it;
|
||||
struct kern_ipc_perm *ipcp = it;
|
||||
struct sem_array *sma = container_of(ipcp, struct sem_array, sem_perm);
|
||||
time_t sem_otime;
|
||||
|
||||
/*
|
||||
|
|
|
@ -1380,9 +1380,11 @@ SYSCALL_DEFINE1(shmdt, char __user *, shmaddr)
|
|||
static int sysvipc_shm_proc_show(struct seq_file *s, void *it)
|
||||
{
|
||||
struct user_namespace *user_ns = seq_user_ns(s);
|
||||
struct shmid_kernel *shp = it;
|
||||
struct kern_ipc_perm *ipcp = it;
|
||||
struct shmid_kernel *shp;
|
||||
unsigned long rss = 0, swp = 0;
|
||||
|
||||
shp = container_of(ipcp, struct shmid_kernel, shm_perm);
|
||||
shm_add_rss_swap(shp, &rss, &swp);
|
||||
|
||||
#if BITS_PER_LONG <= 32
|
||||
|
|
|
@ -63,6 +63,7 @@
|
|||
#include <linux/cgroup.h>
|
||||
#include <linux/wait.h>
|
||||
|
||||
DEFINE_STATIC_KEY_FALSE(cpusets_pre_enable_key);
|
||||
DEFINE_STATIC_KEY_FALSE(cpusets_enabled_key);
|
||||
|
||||
/* See "Frequency meter" comments, below. */
|
||||
|
|
|
@ -575,13 +575,10 @@ struct pid *find_ge_pid(int nr, struct pid_namespace *ns)
|
|||
*/
|
||||
void __init pidhash_init(void)
|
||||
{
|
||||
unsigned int pidhash_size;
|
||||
|
||||
pid_hash = alloc_large_system_hash("PID", sizeof(*pid_hash), 0, 18,
|
||||
HASH_EARLY | HASH_SMALL | HASH_ZERO,
|
||||
&pidhash_shift, NULL,
|
||||
0, 4096);
|
||||
pidhash_size = 1U << pidhash_shift;
|
||||
}
|
||||
|
||||
void __init pidmap_init(void)
|
||||
|
|
|
@ -4078,6 +4078,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
|||
unsigned long vaddr = *position;
|
||||
unsigned long remainder = *nr_pages;
|
||||
struct hstate *h = hstate_vma(vma);
|
||||
int err = -EFAULT;
|
||||
|
||||
while (vaddr < vma->vm_end && remainder) {
|
||||
pte_t *pte;
|
||||
|
@ -4154,11 +4155,7 @@ long follow_hugetlb_page(struct mm_struct *mm, struct vm_area_struct *vma,
|
|||
}
|
||||
ret = hugetlb_fault(mm, vma, vaddr, fault_flags);
|
||||
if (ret & VM_FAULT_ERROR) {
|
||||
int err = vm_fault_to_errno(ret, flags);
|
||||
|
||||
if (err)
|
||||
return err;
|
||||
|
||||
err = vm_fault_to_errno(ret, flags);
|
||||
remainder = 0;
|
||||
break;
|
||||
}
|
||||
|
@ -4213,7 +4210,7 @@ same_page:
|
|||
*/
|
||||
*position = vaddr;
|
||||
|
||||
return i ? i : -EFAULT;
|
||||
return i ? i : err;
|
||||
}
|
||||
|
||||
#ifndef __HAVE_ARCH_FLUSH_HUGETLB_TLB_RANGE
|
||||
|
|
|
@ -498,6 +498,7 @@ extern struct workqueue_struct *mm_percpu_wq;
|
|||
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
|
||||
void try_to_unmap_flush(void);
|
||||
void try_to_unmap_flush_dirty(void);
|
||||
void flush_tlb_batched_pending(struct mm_struct *mm);
|
||||
#else
|
||||
static inline void try_to_unmap_flush(void)
|
||||
{
|
||||
|
@ -505,7 +506,9 @@ static inline void try_to_unmap_flush(void)
|
|||
static inline void try_to_unmap_flush_dirty(void)
|
||||
{
|
||||
}
|
||||
|
||||
static inline void flush_tlb_batched_pending(struct mm_struct *mm)
|
||||
{
|
||||
}
|
||||
#endif /* CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH */
|
||||
|
||||
extern const struct trace_print_flags pageflag_names[];
|
||||
|
|
|
@ -401,6 +401,7 @@ void kasan_report(unsigned long addr, size_t size,
|
|||
disable_trace_on_warning();
|
||||
|
||||
info.access_addr = (void *)addr;
|
||||
info.first_bad_addr = (void *)addr;
|
||||
info.access_size = size;
|
||||
info.is_write = is_write;
|
||||
info.ip = ip;
|
||||
|
|
|
@ -320,6 +320,7 @@ static int madvise_free_pte_range(pmd_t *pmd, unsigned long addr,
|
|||
|
||||
tlb_remove_check_page_size_change(tlb, PAGE_SIZE);
|
||||
orig_pte = pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
|
||||
flush_tlb_batched_pending(mm);
|
||||
arch_enter_lazy_mmu_mode();
|
||||
for (; addr != end; pte++, addr += PAGE_SIZE) {
|
||||
ptent = *pte;
|
||||
|
|
|
@ -1197,6 +1197,7 @@ again:
|
|||
init_rss_vec(rss);
|
||||
start_pte = pte_offset_map_lock(mm, pmd, addr, &ptl);
|
||||
pte = start_pte;
|
||||
flush_tlb_batched_pending(mm);
|
||||
arch_enter_lazy_mmu_mode();
|
||||
do {
|
||||
pte_t ptent = *pte;
|
||||
|
|
|
@ -64,6 +64,7 @@ static unsigned long change_pte_range(struct vm_area_struct *vma, pmd_t *pmd,
|
|||
atomic_read(&vma->vm_mm->mm_users) == 1)
|
||||
target_node = numa_node_id();
|
||||
|
||||
flush_tlb_batched_pending(vma->vm_mm);
|
||||
arch_enter_lazy_mmu_mode();
|
||||
do {
|
||||
oldpte = *pte;
|
||||
|
|
|
@ -152,6 +152,7 @@ static void move_ptes(struct vm_area_struct *vma, pmd_t *old_pmd,
|
|||
new_ptl = pte_lockptr(mm, new_pmd);
|
||||
if (new_ptl != old_ptl)
|
||||
spin_lock_nested(new_ptl, SINGLE_DEPTH_NESTING);
|
||||
flush_tlb_batched_pending(vma->vm_mm);
|
||||
arch_enter_lazy_mmu_mode();
|
||||
|
||||
for (; old_addr < old_end; old_pte++, old_addr += PAGE_SIZE,
|
||||
|
@ -428,6 +429,7 @@ static struct vm_area_struct *vma_to_resize(unsigned long addr,
|
|||
static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
|
||||
unsigned long new_addr, unsigned long new_len, bool *locked,
|
||||
struct vm_userfaultfd_ctx *uf,
|
||||
struct list_head *uf_unmap_early,
|
||||
struct list_head *uf_unmap)
|
||||
{
|
||||
struct mm_struct *mm = current->mm;
|
||||
|
@ -446,7 +448,7 @@ static unsigned long mremap_to(unsigned long addr, unsigned long old_len,
|
|||
if (addr + old_len > new_addr && new_addr + new_len > addr)
|
||||
goto out;
|
||||
|
||||
ret = do_munmap(mm, new_addr, new_len, NULL);
|
||||
ret = do_munmap(mm, new_addr, new_len, uf_unmap_early);
|
||||
if (ret)
|
||||
goto out;
|
||||
|
||||
|
@ -514,6 +516,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
|
|||
unsigned long charged = 0;
|
||||
bool locked = false;
|
||||
struct vm_userfaultfd_ctx uf = NULL_VM_UFFD_CTX;
|
||||
LIST_HEAD(uf_unmap_early);
|
||||
LIST_HEAD(uf_unmap);
|
||||
|
||||
if (flags & ~(MREMAP_FIXED | MREMAP_MAYMOVE))
|
||||
|
@ -541,7 +544,7 @@ SYSCALL_DEFINE5(mremap, unsigned long, addr, unsigned long, old_len,
|
|||
|
||||
if (flags & MREMAP_FIXED) {
|
||||
ret = mremap_to(addr, old_len, new_addr, new_len,
|
||||
&locked, &uf, &uf_unmap);
|
||||
&locked, &uf, &uf_unmap_early, &uf_unmap);
|
||||
goto out;
|
||||
}
|
||||
|
||||
|
@ -621,6 +624,7 @@ out:
|
|||
up_write(&current->mm->mmap_sem);
|
||||
if (locked && new_len > old_len)
|
||||
mm_populate(new_addr + old_len, new_len - old_len);
|
||||
userfaultfd_unmap_complete(mm, &uf_unmap_early);
|
||||
mremap_userfaultfd_complete(&uf, addr, new_addr, old_len);
|
||||
userfaultfd_unmap_complete(mm, &uf_unmap);
|
||||
return ret;
|
||||
|
|
|
@ -4891,9 +4891,11 @@ int numa_zonelist_order_handler(struct ctl_table *table, int write,
|
|||
NUMA_ZONELIST_ORDER_LEN);
|
||||
user_zonelist_order = oldval;
|
||||
} else if (oldval != user_zonelist_order) {
|
||||
mem_hotplug_begin();
|
||||
mutex_lock(&zonelists_mutex);
|
||||
build_all_zonelists(NULL, NULL);
|
||||
mutex_unlock(&zonelists_mutex);
|
||||
mem_hotplug_done();
|
||||
}
|
||||
}
|
||||
out:
|
||||
|
|
|
@ -22,6 +22,7 @@
|
|||
#include <linux/frontswap.h>
|
||||
#include <linux/blkdev.h>
|
||||
#include <linux/uio.h>
|
||||
#include <linux/sched/task.h>
|
||||
#include <asm/pgtable.h>
|
||||
|
||||
static struct bio *get_swap_bio(gfp_t gfp_flags,
|
||||
|
@ -136,6 +137,7 @@ out:
|
|||
WRITE_ONCE(bio->bi_private, NULL);
|
||||
bio_put(bio);
|
||||
wake_up_process(waiter);
|
||||
put_task_struct(waiter);
|
||||
}
|
||||
|
||||
int generic_swapfile_activate(struct swap_info_struct *sis,
|
||||
|
@ -378,6 +380,11 @@ int swap_readpage(struct page *page, bool do_poll)
|
|||
goto out;
|
||||
}
|
||||
bdev = bio->bi_bdev;
|
||||
/*
|
||||
* Keep this task valid during swap readpage because the oom killer may
|
||||
* attempt to access it in the page fault retry time check.
|
||||
*/
|
||||
get_task_struct(current);
|
||||
bio->bi_private = current;
|
||||
bio_set_op_attrs(bio, REQ_OP_READ, 0);
|
||||
count_vm_event(PSWPIN);
|
||||
|
|
36
mm/rmap.c
36
mm/rmap.c
|
@ -604,6 +604,13 @@ static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
|
|||
arch_tlbbatch_add_mm(&tlb_ubc->arch, mm);
|
||||
tlb_ubc->flush_required = true;
|
||||
|
||||
/*
|
||||
* Ensure compiler does not re-order the setting of tlb_flush_batched
|
||||
* before the PTE is cleared.
|
||||
*/
|
||||
barrier();
|
||||
mm->tlb_flush_batched = true;
|
||||
|
||||
/*
|
||||
* If the PTE was dirty then it's best to assume it's writable. The
|
||||
* caller must use try_to_unmap_flush_dirty() or try_to_unmap_flush()
|
||||
|
@ -631,6 +638,35 @@ static bool should_defer_flush(struct mm_struct *mm, enum ttu_flags flags)
|
|||
|
||||
return should_defer;
|
||||
}
|
||||
|
||||
/*
|
||||
* Reclaim unmaps pages under the PTL but does not flush the TLB prior to
|
||||
* releasing the PTL if TLB flushes are batched. It's possible for a parallel
|
||||
* operation such as mprotect or munmap to race between reclaim unmapping
|
||||
* the page and flushing the page. If this race occurs, it potentially allows
|
||||
* access to data via a stale TLB entry. Tracking all mm's that have TLB
|
||||
* batching in flight would be expensive during reclaim so instead track
|
||||
* whether TLB batching occurred in the past and if so then do a flush here
|
||||
* if required. This will cost one additional flush per reclaim cycle paid
|
||||
* by the first operation at risk such as mprotect and munmap.
|
||||
*
|
||||
* This must be called under the PTL so that an access to tlb_flush_batched
|
||||
* that is potentially a "reclaim vs mprotect/munmap/etc" race will synchronise
|
||||
* via the PTL.
|
||||
*/
|
||||
void flush_tlb_batched_pending(struct mm_struct *mm)
|
||||
{
|
||||
if (mm->tlb_flush_batched) {
|
||||
flush_tlb_mm(mm);
|
||||
|
||||
/*
|
||||
* Do not allow the compiler to re-order the clearing of
|
||||
* tlb_flush_batched before the tlb is flushed.
|
||||
*/
|
||||
barrier();
|
||||
mm->tlb_flush_batched = false;
|
||||
}
|
||||
}
|
||||
#else
|
||||
static void set_tlb_ubc_flush_pending(struct mm_struct *mm, bool writable)
|
||||
{
|
||||
|
|
|
@ -2453,7 +2453,6 @@ void zs_destroy_pool(struct zs_pool *pool)
|
|||
}
|
||||
|
||||
destroy_cache(pool);
|
||||
kfree(pool->size_class);
|
||||
kfree(pool->name);
|
||||
kfree(pool);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue