Merge branch 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip
Pull x86 mm updates from Thomas Gleixner:

 - Make lazy TLB mode even lazier to avoid pointless switch_mm()
   operations, which reduces CPU load by 1-2% for memcache workloads

 - Small cleanups and improvements all over the place

* 'x86-mm-for-linus' of git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip:
  x86/mm: Remove redundant check for kmem_cache_create()
  arm/asm/tlb.h: Fix build error implicit func declaration
  x86/mm/tlb: Make clear_asid_other() static
  x86/mm/tlb: Skip atomic operations for 'init_mm' in switch_mm_irqs_off()
  x86/mm/tlb: Always use lazy TLB mode
  x86/mm/tlb: Only send page table free TLB flush to lazy TLB CPUs
  x86/mm/tlb: Make lazy TLB mode lazier
  x86/mm/tlb: Restructure switch_mm_irqs_off()
  x86/mm/tlb: Leave lazy TLB mode at page table free time
  mm: Allocate the mm_cpumask (mm->cpu_bitmap[]) dynamically based on nr_cpu_ids
  x86/mm: Add TLB purge to free pmd/pte page interfaces
  ioremap: Update pgtable free interfaces with addr
  x86/mm: Disable ioremap free page handling on x86-PAE
This commit is contained in:
commit
203b4fc903
|
@ -292,5 +292,13 @@ static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
|
|||
{
|
||||
}
|
||||
|
||||
/*
 * tlb_flush_remove_tables - no-op stub.
 * @mm: address space whose page tables are being removed (unused here).
 *
 * This variant has an empty body and performs no flush at all.
 * NOTE(review): presumably this is the !CONFIG_MMU side of the header,
 * judging by the "#endif CONFIG_MMU" that follows - confirm.
 */
static inline void tlb_flush_remove_tables(struct mm_struct *mm)
{
	/* Intentionally empty. */
}
|
||||
|
||||
/*
 * tlb_flush_remove_tables_local - no-op stub.
 * @arg: opaque argument; never read or written by this variant.
 *
 * The body is empty, so calling this has no effect.
 */
static inline void tlb_flush_remove_tables_local(void *arg)
{
	/* Intentionally empty. */
}
|
||||
|
||||
#endif /* CONFIG_MMU */
|
||||
#endif
|
||||
|
|
|
@ -977,12 +977,12 @@ int pmd_clear_huge(pmd_t *pmdp)
|
|||
return 1;
|
||||
}
|
||||
|
||||
int pud_free_pmd_page(pud_t *pud)
|
||||
int pud_free_pmd_page(pud_t *pud, unsigned long addr)
|
||||
{
|
||||
return pud_none(*pud);
|
||||
}
|
||||
|
||||
int pmd_free_pte_page(pmd_t *pmd)
|
||||
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
|
||||
{
|
||||
return pmd_none(*pmd);
|
||||
}
|
||||
|
|
|
@ -148,22 +148,6 @@ static inline unsigned long build_cr3_noflush(pgd_t *pgd, u16 asid)
|
|||
#define __flush_tlb_one_user(addr) __native_flush_tlb_one_user(addr)
|
||||
#endif
|
||||
|
||||
static inline bool tlb_defer_switch_to_init_mm(void)
|
||||
{
|
||||
/*
|
||||
* If we have PCID, then switching to init_mm is reasonably
|
||||
* fast. If we don't have PCID, then switching to init_mm is
|
||||
* quite slow, so we try to defer it in the hopes that we can
|
||||
* avoid it entirely. The latter approach runs the risk of
|
||||
* receiving otherwise unnecessary IPIs.
|
||||
*
|
||||
* This choice is just a heuristic. The tlb code can handle this
|
||||
* function returning true or false regardless of whether we have
|
||||
* PCID.
|
||||
*/
|
||||
return !static_cpu_has(X86_FEATURE_PCID);
|
||||
}
|
||||
|
||||
struct tlb_context {
|
||||
u64 ctx_id;
|
||||
u64 tlb_gen;
|
||||
|
@ -554,4 +538,9 @@ extern void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch);
|
|||
native_flush_tlb_others(mask, info)
|
||||
#endif
|
||||
|
||||
extern void tlb_flush_remove_tables(struct mm_struct *mm);
|
||||
extern void tlb_flush_remove_tables_local(void *arg);
|
||||
|
||||
#define HAVE_TLB_FLUSH_REMOVE_TABLES
|
||||
|
||||
#endif /* _ASM_X86_TLBFLUSH_H */
|
||||
|
|
|
@ -329,9 +329,6 @@ static int __init pgd_cache_init(void)
|
|||
*/
|
||||
pgd_cache = kmem_cache_create("pgd_cache", PGD_SIZE, PGD_ALIGN,
|
||||
SLAB_PANIC, NULL);
|
||||
if (!pgd_cache)
|
||||
return -ENOMEM;
|
||||
|
||||
return 0;
|
||||
}
|
||||
core_initcall(pgd_cache_init);
|
||||
|
@ -719,28 +716,50 @@ int pmd_clear_huge(pmd_t *pmd)
|
|||
return 0;
|
||||
}
|
||||
|
||||
#ifdef CONFIG_X86_64
|
||||
/**
|
||||
* pud_free_pmd_page - Clear pud entry and free pmd page.
|
||||
* @pud: Pointer to a PUD.
|
||||
* @addr: Virtual address associated with pud.
|
||||
*
|
||||
* Context: The pud range has been unmaped and TLB purged.
|
||||
* Context: The pud range has been unmapped and TLB purged.
|
||||
* Return: 1 if clearing the entry succeeded. 0 otherwise.
|
||||
*
|
||||
* NOTE: Callers must allow a single page allocation.
|
||||
*/
|
||||
int pud_free_pmd_page(pud_t *pud)
|
||||
int pud_free_pmd_page(pud_t *pud, unsigned long addr)
|
||||
{
|
||||
pmd_t *pmd;
|
||||
pmd_t *pmd, *pmd_sv;
|
||||
pte_t *pte;
|
||||
int i;
|
||||
|
||||
if (pud_none(*pud))
|
||||
return 1;
|
||||
|
||||
pmd = (pmd_t *)pud_page_vaddr(*pud);
|
||||
pmd_sv = (pmd_t *)__get_free_page(GFP_KERNEL);
|
||||
if (!pmd_sv)
|
||||
return 0;
|
||||
|
||||
for (i = 0; i < PTRS_PER_PMD; i++)
|
||||
if (!pmd_free_pte_page(&pmd[i]))
|
||||
return 0;
|
||||
for (i = 0; i < PTRS_PER_PMD; i++) {
|
||||
pmd_sv[i] = pmd[i];
|
||||
if (!pmd_none(pmd[i]))
|
||||
pmd_clear(&pmd[i]);
|
||||
}
|
||||
|
||||
pud_clear(pud);
|
||||
|
||||
/* INVLPG to clear all paging-structure caches */
|
||||
flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
|
||||
|
||||
for (i = 0; i < PTRS_PER_PMD; i++) {
|
||||
if (!pmd_none(pmd_sv[i])) {
|
||||
pte = (pte_t *)pmd_page_vaddr(pmd_sv[i]);
|
||||
free_page((unsigned long)pte);
|
||||
}
|
||||
}
|
||||
|
||||
free_page((unsigned long)pmd_sv);
|
||||
free_page((unsigned long)pmd);
|
||||
|
||||
return 1;
|
||||
|
@ -749,11 +768,12 @@ int pud_free_pmd_page(pud_t *pud)
|
|||
/**
|
||||
* pmd_free_pte_page - Clear pmd entry and free pte page.
|
||||
* @pmd: Pointer to a PMD.
|
||||
* @addr: Virtual address associated with pmd.
|
||||
*
|
||||
* Context: The pmd range has been unmaped and TLB purged.
|
||||
* Context: The pmd range has been unmapped and TLB purged.
|
||||
* Return: 1 if clearing the entry succeeded. 0 otherwise.
|
||||
*/
|
||||
int pmd_free_pte_page(pmd_t *pmd)
|
||||
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
|
||||
{
|
||||
pte_t *pte;
|
||||
|
||||
|
@ -762,8 +782,30 @@ int pmd_free_pte_page(pmd_t *pmd)
|
|||
|
||||
pte = (pte_t *)pmd_page_vaddr(*pmd);
|
||||
pmd_clear(pmd);
|
||||
|
||||
/* INVLPG to clear all paging-structure caches */
|
||||
flush_tlb_kernel_range(addr, addr + PAGE_SIZE-1);
|
||||
|
||||
free_page((unsigned long)pte);
|
||||
|
||||
return 1;
|
||||
}
|
||||
|
||||
#else /* !CONFIG_X86_64 */
|
||||
|
||||
/*
 * pud_free_pmd_page - variant built when CONFIG_X86_64 is not set.
 * @pud: Pointer to a PUD.
 * @addr: Virtual address associated with pud (unused in this variant).
 *
 * Nothing is cleared or freed here; the function only reports whether
 * the entry is already empty.
 *
 * Return: the result of pud_none(*pud).
 */
int pud_free_pmd_page(pud_t *pud, unsigned long addr)
{
	int pud_is_empty = pud_none(*pud);

	return pud_is_empty;
}
|
||||
|
||||
/*
|
||||
* Disable free page handling on x86-PAE. This assures that ioremap()
|
||||
* does not update sync'd pmd entries. See vmalloc_sync_one().
|
||||
*/
|
||||
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
|
||||
{
|
||||
return pmd_none(*pmd);
|
||||
}
|
||||
|
||||
#endif /* CONFIG_X86_64 */
|
||||
#endif /* CONFIG_HAVE_ARCH_HUGE_VMAP */
|
||||
|
|
|
@ -7,6 +7,7 @@
|
|||
#include <linux/export.h>
|
||||
#include <linux/cpu.h>
|
||||
#include <linux/debugfs.h>
|
||||
#include <linux/gfp.h>
|
||||
|
||||
#include <asm/tlbflush.h>
|
||||
#include <asm/mmu_context.h>
|
||||
|
@ -35,7 +36,7 @@
|
|||
* necessary invalidation by clearing out the 'ctx_id' which
|
||||
* forces a TLB flush when the context is loaded.
|
||||
*/
|
||||
void clear_asid_other(void)
|
||||
static void clear_asid_other(void)
|
||||
{
|
||||
u16 asid;
|
||||
|
||||
|
@ -185,8 +186,11 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
|
|||
{
|
||||
struct mm_struct *real_prev = this_cpu_read(cpu_tlbstate.loaded_mm);
|
||||
u16 prev_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid);
|
||||
bool was_lazy = this_cpu_read(cpu_tlbstate.is_lazy);
|
||||
unsigned cpu = smp_processor_id();
|
||||
u64 next_tlb_gen;
|
||||
bool need_flush;
|
||||
u16 new_asid;
|
||||
|
||||
/*
|
||||
* NB: The scheduler will call us with prev == next when switching
|
||||
|
@ -240,20 +244,41 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
|
|||
next->context.ctx_id);
|
||||
|
||||
/*
|
||||
* We don't currently support having a real mm loaded without
|
||||
* our cpu set in mm_cpumask(). We have all the bookkeeping
|
||||
* in place to figure out whether we would need to flush
|
||||
* if our cpu were cleared in mm_cpumask(), but we don't
|
||||
* currently use it.
|
||||
* Even in lazy TLB mode, the CPU should stay set in the
|
||||
* mm_cpumask. The TLB shootdown code can figure out from
|
||||
* cpu_tlbstate.is_lazy whether or not to send an IPI.
|
||||
*/
|
||||
if (WARN_ON_ONCE(real_prev != &init_mm &&
|
||||
!cpumask_test_cpu(cpu, mm_cpumask(next))))
|
||||
cpumask_set_cpu(cpu, mm_cpumask(next));
|
||||
|
||||
return;
|
||||
/*
|
||||
* If the CPU is not in lazy TLB mode, we are just switching
|
||||
* from one thread in a process to another thread in the same
|
||||
* process. No TLB flush required.
|
||||
*/
|
||||
if (!was_lazy)
|
||||
return;
|
||||
|
||||
/*
|
||||
* Read the tlb_gen to check whether a flush is needed.
|
||||
* If the TLB is up to date, just use it.
|
||||
* The barrier synchronizes with the tlb_gen increment in
|
||||
* the TLB shootdown code.
|
||||
*/
|
||||
smp_mb();
|
||||
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
|
||||
if (this_cpu_read(cpu_tlbstate.ctxs[prev_asid].tlb_gen) ==
|
||||
next_tlb_gen)
|
||||
return;
|
||||
|
||||
/*
|
||||
* TLB contents went out of date while we were in lazy
|
||||
* mode. Fall through to the TLB switching code below.
|
||||
*/
|
||||
new_asid = prev_asid;
|
||||
need_flush = true;
|
||||
} else {
|
||||
u16 new_asid;
|
||||
bool need_flush;
|
||||
u64 last_ctx_id = this_cpu_read(cpu_tlbstate.last_ctx_id);
|
||||
|
||||
/*
|
||||
|
@ -285,53 +310,60 @@ void switch_mm_irqs_off(struct mm_struct *prev, struct mm_struct *next,
|
|||
sync_current_stack_to_mm(next);
|
||||
}
|
||||
|
||||
/* Stop remote flushes for the previous mm */
|
||||
VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, mm_cpumask(real_prev)) &&
|
||||
real_prev != &init_mm);
|
||||
cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
|
||||
/*
|
||||
* Stop remote flushes for the previous mm.
|
||||
* Skip kernel threads; we never send init_mm TLB flushing IPIs,
|
||||
* but the bitmap manipulation can cause cache line contention.
|
||||
*/
|
||||
if (real_prev != &init_mm) {
|
||||
VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu,
|
||||
mm_cpumask(real_prev)));
|
||||
cpumask_clear_cpu(cpu, mm_cpumask(real_prev));
|
||||
}
|
||||
|
||||
/*
|
||||
* Start remote flushes and then read tlb_gen.
|
||||
*/
|
||||
cpumask_set_cpu(cpu, mm_cpumask(next));
|
||||
if (next != &init_mm)
|
||||
cpumask_set_cpu(cpu, mm_cpumask(next));
|
||||
next_tlb_gen = atomic64_read(&next->context.tlb_gen);
|
||||
|
||||
choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush);
|
||||
}
|
||||
|
||||
if (need_flush) {
|
||||
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
|
||||
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
|
||||
load_new_mm_cr3(next->pgd, new_asid, true);
|
||||
|
||||
/*
|
||||
* NB: This gets called via leave_mm() in the idle path
|
||||
* where RCU functions differently. Tracing normally
|
||||
* uses RCU, so we need to use the _rcuidle variant.
|
||||
*
|
||||
* (There is no good reason for this. The idle code should
|
||||
* be rearranged to call this before rcu_idle_enter().)
|
||||
*/
|
||||
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
|
||||
} else {
|
||||
/* The new ASID is already up to date. */
|
||||
load_new_mm_cr3(next->pgd, new_asid, false);
|
||||
|
||||
/* See above wrt _rcuidle. */
|
||||
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
|
||||
}
|
||||
if (need_flush) {
|
||||
this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id);
|
||||
this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen);
|
||||
load_new_mm_cr3(next->pgd, new_asid, true);
|
||||
|
||||
/*
|
||||
* Record last user mm's context id, so we can avoid
|
||||
* flushing branch buffer with IBPB if we switch back
|
||||
* to the same user.
|
||||
* NB: This gets called via leave_mm() in the idle path
|
||||
* where RCU functions differently. Tracing normally
|
||||
* uses RCU, so we need to use the _rcuidle variant.
|
||||
*
|
||||
* (There is no good reason for this. The idle code should
|
||||
* be rearranged to call this before rcu_idle_enter().)
|
||||
*/
|
||||
if (next != &init_mm)
|
||||
this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
|
||||
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, TLB_FLUSH_ALL);
|
||||
} else {
|
||||
/* The new ASID is already up to date. */
|
||||
load_new_mm_cr3(next->pgd, new_asid, false);
|
||||
|
||||
this_cpu_write(cpu_tlbstate.loaded_mm, next);
|
||||
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
|
||||
/* See above wrt _rcuidle. */
|
||||
trace_tlb_flush_rcuidle(TLB_FLUSH_ON_TASK_SWITCH, 0);
|
||||
}
|
||||
|
||||
/*
|
||||
* Record last user mm's context id, so we can avoid
|
||||
* flushing branch buffer with IBPB if we switch back
|
||||
* to the same user.
|
||||
*/
|
||||
if (next != &init_mm)
|
||||
this_cpu_write(cpu_tlbstate.last_ctx_id, next->context.ctx_id);
|
||||
|
||||
this_cpu_write(cpu_tlbstate.loaded_mm, next);
|
||||
this_cpu_write(cpu_tlbstate.loaded_mm_asid, new_asid);
|
||||
|
||||
load_mm_cr4(next);
|
||||
switch_ldt(real_prev, next);
|
||||
}
|
||||
|
@ -354,20 +386,7 @@ void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk)
|
|||
if (this_cpu_read(cpu_tlbstate.loaded_mm) == &init_mm)
|
||||
return;
|
||||
|
||||
if (tlb_defer_switch_to_init_mm()) {
|
||||
/*
|
||||
* There's a significant optimization that may be possible
|
||||
* here. We have accurate enough TLB flush tracking that we
|
||||
* don't need to maintain coherence of TLB per se when we're
|
||||
* lazy. We do, however, need to maintain coherence of
|
||||
* paging-structure caches. We could, in principle, leave our
|
||||
* old mm loaded and only switch to init_mm when
|
||||
* tlb_remove_page() happens.
|
||||
*/
|
||||
this_cpu_write(cpu_tlbstate.is_lazy, true);
|
||||
} else {
|
||||
switch_mm(NULL, &init_mm, NULL);
|
||||
}
|
||||
this_cpu_write(cpu_tlbstate.is_lazy, true);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -454,6 +473,9 @@ static void flush_tlb_func_common(const struct flush_tlb_info *f,
|
|||
* paging-structure cache to avoid speculatively reading
|
||||
* garbage into our TLB. Since switching to init_mm is barely
|
||||
* slower than a minimal flush, just switch to init_mm.
|
||||
*
|
||||
* This should be rare, with native_flush_tlb_others skipping
|
||||
* IPIs to lazy TLB mode CPUs.
|
||||
*/
|
||||
switch_mm_irqs_off(NULL, &init_mm, NULL);
|
||||
return;
|
||||
|
@ -560,6 +582,9 @@ static void flush_tlb_func_remote(void *info)
|
|||
void native_flush_tlb_others(const struct cpumask *cpumask,
|
||||
const struct flush_tlb_info *info)
|
||||
{
|
||||
cpumask_var_t lazymask;
|
||||
unsigned int cpu;
|
||||
|
||||
count_vm_tlb_event(NR_TLB_REMOTE_FLUSH);
|
||||
if (info->end == TLB_FLUSH_ALL)
|
||||
trace_tlb_flush(TLB_REMOTE_SEND_IPI, TLB_FLUSH_ALL);
|
||||
|
@ -583,8 +608,6 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
|
|||
* that UV should be updated so that smp_call_function_many(),
|
||||
* etc, are optimal on UV.
|
||||
*/
|
||||
unsigned int cpu;
|
||||
|
||||
cpu = smp_processor_id();
|
||||
cpumask = uv_flush_tlb_others(cpumask, info);
|
||||
if (cpumask)
|
||||
|
@ -592,8 +615,29 @@ void native_flush_tlb_others(const struct cpumask *cpumask,
|
|||
(void *)info, 1);
|
||||
return;
|
||||
}
|
||||
smp_call_function_many(cpumask, flush_tlb_func_remote,
|
||||
|
||||
/*
|
||||
* A temporary cpumask is used in order to skip sending IPIs
|
||||
* to CPUs in lazy TLB state, while keeping them in mm_cpumask(mm).
|
||||
* If the allocation fails, simply IPI every CPU in mm_cpumask.
|
||||
*/
|
||||
if (!alloc_cpumask_var(&lazymask, GFP_ATOMIC)) {
|
||||
smp_call_function_many(cpumask, flush_tlb_func_remote,
|
||||
(void *)info, 1);
|
||||
return;
|
||||
}
|
||||
|
||||
cpumask_copy(lazymask, cpumask);
|
||||
|
||||
for_each_cpu(cpu, lazymask) {
|
||||
if (per_cpu(cpu_tlbstate.is_lazy, cpu))
|
||||
cpumask_clear_cpu(cpu, lazymask);
|
||||
}
|
||||
|
||||
smp_call_function_many(lazymask, flush_tlb_func_remote,
|
||||
(void *)info, 1);
|
||||
|
||||
free_cpumask_var(lazymask);
|
||||
}
|
||||
|
||||
/*
|
||||
|
@ -646,6 +690,68 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start,
|
|||
put_cpu();
|
||||
}
|
||||
|
||||
void tlb_flush_remove_tables_local(void *arg)
|
||||
{
|
||||
struct mm_struct *mm = arg;
|
||||
|
||||
if (this_cpu_read(cpu_tlbstate.loaded_mm) == mm &&
|
||||
this_cpu_read(cpu_tlbstate.is_lazy)) {
|
||||
/*
|
||||
* We're in lazy mode. We need to at least flush our
|
||||
* paging-structure cache to avoid speculatively reading
|
||||
* garbage into our TLB. Since switching to init_mm is barely
|
||||
* slower than a minimal flush, just switch to init_mm.
|
||||
*/
|
||||
switch_mm_irqs_off(NULL, &init_mm, NULL);
|
||||
}
|
||||
}
|
||||
|
||||
static void mm_fill_lazy_tlb_cpu_mask(struct mm_struct *mm,
|
||||
struct cpumask *lazy_cpus)
|
||||
{
|
||||
int cpu;
|
||||
|
||||
for_each_cpu(cpu, mm_cpumask(mm)) {
|
||||
if (!per_cpu(cpu_tlbstate.is_lazy, cpu))
|
||||
cpumask_set_cpu(cpu, lazy_cpus);
|
||||
}
|
||||
}
|
||||
|
||||
void tlb_flush_remove_tables(struct mm_struct *mm)
|
||||
{
|
||||
int cpu = get_cpu();
|
||||
cpumask_var_t lazy_cpus;
|
||||
|
||||
if (cpumask_any_but(mm_cpumask(mm), cpu) >= nr_cpu_ids) {
|
||||
put_cpu();
|
||||
return;
|
||||
}
|
||||
|
||||
if (!zalloc_cpumask_var(&lazy_cpus, GFP_ATOMIC)) {
|
||||
/*
|
||||
* If the cpumask allocation fails, do a brute force flush
|
||||
* on all the CPUs that have this mm loaded.
|
||||
*/
|
||||
smp_call_function_many(mm_cpumask(mm),
|
||||
tlb_flush_remove_tables_local, (void *)mm, 1);
|
||||
put_cpu();
|
||||
return;
|
||||
}
|
||||
|
||||
/*
|
||||
* CPUs with !is_lazy either received a TLB flush IPI while the user
|
||||
* pages in this address range were unmapped, or have context switched
|
||||
* and reloaded %CR3 since then.
|
||||
*
|
||||
* Shootdown IPIs at page table freeing time only need to be sent to
|
||||
* CPUs that may have out of date TLB contents.
|
||||
*/
|
||||
mm_fill_lazy_tlb_cpu_mask(mm, lazy_cpus);
|
||||
smp_call_function_many(lazy_cpus,
|
||||
tlb_flush_remove_tables_local, (void *)mm, 1);
|
||||
free_cpumask_var(lazy_cpus);
|
||||
put_cpu();
|
||||
}
|
||||
|
||||
static void do_flush_tlb_all(void *info)
|
||||
{
|
||||
|
|
|
@ -82,6 +82,7 @@ struct mm_struct efi_mm = {
|
|||
.mmap_sem = __RWSEM_INITIALIZER(efi_mm.mmap_sem),
|
||||
.page_table_lock = __SPIN_LOCK_UNLOCKED(efi_mm.page_table_lock),
|
||||
.mmlist = LIST_HEAD_INIT(efi_mm.mmlist),
|
||||
.cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0},
|
||||
};
|
||||
|
||||
struct workqueue_struct *efi_rts_wq;
|
||||
|
|
|
@ -1019,8 +1019,8 @@ int pud_set_huge(pud_t *pud, phys_addr_t addr, pgprot_t prot);
|
|||
int pmd_set_huge(pmd_t *pmd, phys_addr_t addr, pgprot_t prot);
|
||||
int pud_clear_huge(pud_t *pud);
|
||||
int pmd_clear_huge(pmd_t *pmd);
|
||||
int pud_free_pmd_page(pud_t *pud);
|
||||
int pmd_free_pte_page(pmd_t *pmd);
|
||||
int pud_free_pmd_page(pud_t *pud, unsigned long addr);
|
||||
int pmd_free_pte_page(pmd_t *pmd, unsigned long addr);
|
||||
#else /* !CONFIG_HAVE_ARCH_HUGE_VMAP */
|
||||
static inline int p4d_set_huge(p4d_t *p4d, phys_addr_t addr, pgprot_t prot)
|
||||
{
|
||||
|
@ -1046,11 +1046,11 @@ static inline int pmd_clear_huge(pmd_t *pmd)
|
|||
{
|
||||
return 0;
|
||||
}
|
||||
static inline int pud_free_pmd_page(pud_t *pud)
|
||||
static inline int pud_free_pmd_page(pud_t *pud, unsigned long addr)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
static inline int pmd_free_pte_page(pmd_t *pmd)
|
||||
static inline int pmd_free_pte_page(pmd_t *pmd, unsigned long addr)
|
||||
{
|
||||
return 0;
|
||||
}
|
||||
|
|
|
@ -303,4 +303,14 @@ static inline void tlb_remove_check_page_size_change(struct mmu_gather *tlb,
|
|||
|
||||
#define tlb_migrate_finish(mm) do {} while (0)
|
||||
|
||||
/*
|
||||
* Used to flush the TLB when page tables are removed, when lazy
|
||||
* TLB mode may cause a CPU to retain intermediate translations
|
||||
* pointing to about-to-be-freed page table memory.
|
||||
*/
|
||||
#ifndef HAVE_TLB_FLUSH_REMOVE_TABLES
|
||||
#define tlb_flush_remove_tables(mm) do {} while (0)
|
||||
#define tlb_flush_remove_tables_local(mm) do {} while (0)
|
||||
#endif
|
||||
|
||||
#endif /* _ASM_GENERIC__TLB_H */
|
||||
|
|
|
@ -335,176 +335,183 @@ struct core_state {
|
|||
|
||||
struct kioctx_table;
|
||||
struct mm_struct {
|
||||
struct vm_area_struct *mmap; /* list of VMAs */
|
||||
struct rb_root mm_rb;
|
||||
u32 vmacache_seqnum; /* per-thread vmacache */
|
||||
struct {
|
||||
struct vm_area_struct *mmap; /* list of VMAs */
|
||||
struct rb_root mm_rb;
|
||||
u32 vmacache_seqnum; /* per-thread vmacache */
|
||||
#ifdef CONFIG_MMU
|
||||
unsigned long (*get_unmapped_area) (struct file *filp,
|
||||
unsigned long (*get_unmapped_area) (struct file *filp,
|
||||
unsigned long addr, unsigned long len,
|
||||
unsigned long pgoff, unsigned long flags);
|
||||
#endif
|
||||
unsigned long mmap_base; /* base of mmap area */
|
||||
unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */
|
||||
unsigned long mmap_base; /* base of mmap area */
|
||||
unsigned long mmap_legacy_base; /* base of mmap area in bottom-up allocations */
|
||||
#ifdef CONFIG_HAVE_ARCH_COMPAT_MMAP_BASES
|
||||
/* Base addresses for compatible mmap() */
|
||||
unsigned long mmap_compat_base;
|
||||
unsigned long mmap_compat_legacy_base;
|
||||
/* Base addresses for compatible mmap() */
|
||||
unsigned long mmap_compat_base;
|
||||
unsigned long mmap_compat_legacy_base;
|
||||
#endif
|
||||
unsigned long task_size; /* size of task vm space */
|
||||
unsigned long highest_vm_end; /* highest vma end address */
|
||||
pgd_t * pgd;
|
||||
unsigned long task_size; /* size of task vm space */
|
||||
unsigned long highest_vm_end; /* highest vma end address */
|
||||
pgd_t * pgd;
|
||||
|
||||
/**
|
||||
* @mm_users: The number of users including userspace.
|
||||
*
|
||||
* Use mmget()/mmget_not_zero()/mmput() to modify. When this drops
|
||||
* to 0 (i.e. when the task exits and there are no other temporary
|
||||
* reference holders), we also release a reference on @mm_count
|
||||
* (which may then free the &struct mm_struct if @mm_count also
|
||||
* drops to 0).
|
||||
*/
|
||||
atomic_t mm_users;
|
||||
/**
|
||||
* @mm_users: The number of users including userspace.
|
||||
*
|
||||
* Use mmget()/mmget_not_zero()/mmput() to modify. When this
|
||||
* drops to 0 (i.e. when the task exits and there are no other
|
||||
* temporary reference holders), we also release a reference on
|
||||
* @mm_count (which may then free the &struct mm_struct if
|
||||
* @mm_count also drops to 0).
|
||||
*/
|
||||
atomic_t mm_users;
|
||||
|
||||
/**
|
||||
* @mm_count: The number of references to &struct mm_struct
|
||||
* (@mm_users count as 1).
|
||||
*
|
||||
* Use mmgrab()/mmdrop() to modify. When this drops to 0, the
|
||||
* &struct mm_struct is freed.
|
||||
*/
|
||||
atomic_t mm_count;
|
||||
/**
|
||||
* @mm_count: The number of references to &struct mm_struct
|
||||
* (@mm_users count as 1).
|
||||
*
|
||||
* Use mmgrab()/mmdrop() to modify. When this drops to 0, the
|
||||
* &struct mm_struct is freed.
|
||||
*/
|
||||
atomic_t mm_count;
|
||||
|
||||
#ifdef CONFIG_MMU
|
||||
atomic_long_t pgtables_bytes; /* PTE page table pages */
|
||||
atomic_long_t pgtables_bytes; /* PTE page table pages */
|
||||
#endif
|
||||
int map_count; /* number of VMAs */
|
||||
int map_count; /* number of VMAs */
|
||||
|
||||
spinlock_t page_table_lock; /* Protects page tables and some counters */
|
||||
struct rw_semaphore mmap_sem;
|
||||
spinlock_t page_table_lock; /* Protects page tables and some
|
||||
* counters
|
||||
*/
|
||||
struct rw_semaphore mmap_sem;
|
||||
|
||||
struct list_head mmlist; /* List of maybe swapped mm's. These are globally strung
|
||||
* together off init_mm.mmlist, and are protected
|
||||
* by mmlist_lock
|
||||
*/
|
||||
struct list_head mmlist; /* List of maybe swapped mm's. These
|
||||
* are globally strung together off
|
||||
* init_mm.mmlist, and are protected
|
||||
* by mmlist_lock
|
||||
*/
|
||||
|
||||
|
||||
unsigned long hiwater_rss; /* High-watermark of RSS usage */
|
||||
unsigned long hiwater_vm; /* High-water virtual memory usage */
|
||||
unsigned long hiwater_rss; /* High-watermark of RSS usage */
|
||||
unsigned long hiwater_vm; /* High-water virtual memory usage */
|
||||
|
||||
unsigned long total_vm; /* Total pages mapped */
|
||||
unsigned long locked_vm; /* Pages that have PG_mlocked set */
|
||||
unsigned long pinned_vm; /* Refcount permanently increased */
|
||||
unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
|
||||
unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
|
||||
unsigned long stack_vm; /* VM_STACK */
|
||||
unsigned long def_flags;
|
||||
unsigned long total_vm; /* Total pages mapped */
|
||||
unsigned long locked_vm; /* Pages that have PG_mlocked set */
|
||||
unsigned long pinned_vm; /* Refcount permanently increased */
|
||||
unsigned long data_vm; /* VM_WRITE & ~VM_SHARED & ~VM_STACK */
|
||||
unsigned long exec_vm; /* VM_EXEC & ~VM_WRITE & ~VM_STACK */
|
||||
unsigned long stack_vm; /* VM_STACK */
|
||||
unsigned long def_flags;
|
||||
|
||||
spinlock_t arg_lock; /* protect the below fields */
|
||||
unsigned long start_code, end_code, start_data, end_data;
|
||||
unsigned long start_brk, brk, start_stack;
|
||||
unsigned long arg_start, arg_end, env_start, env_end;
|
||||
spinlock_t arg_lock; /* protect the below fields */
|
||||
unsigned long start_code, end_code, start_data, end_data;
|
||||
unsigned long start_brk, brk, start_stack;
|
||||
unsigned long arg_start, arg_end, env_start, env_end;
|
||||
|
||||
unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
|
||||
unsigned long saved_auxv[AT_VECTOR_SIZE]; /* for /proc/PID/auxv */
|
||||
|
||||
/*
|
||||
* Special counters, in some configurations protected by the
|
||||
* page_table_lock, in other configurations by being atomic.
|
||||
*/
|
||||
struct mm_rss_stat rss_stat;
|
||||
/*
|
||||
* Special counters, in some configurations protected by the
|
||||
* page_table_lock, in other configurations by being atomic.
|
||||
*/
|
||||
struct mm_rss_stat rss_stat;
|
||||
|
||||
struct linux_binfmt *binfmt;
|
||||
struct linux_binfmt *binfmt;
|
||||
|
||||
cpumask_var_t cpu_vm_mask_var;
|
||||
/* Architecture-specific MM context */
|
||||
mm_context_t context;
|
||||
|
||||
/* Architecture-specific MM context */
|
||||
mm_context_t context;
|
||||
unsigned long flags; /* Must use atomic bitops to access */
|
||||
|
||||
unsigned long flags; /* Must use atomic bitops to access the bits */
|
||||
|
||||
struct core_state *core_state; /* coredumping support */
|
||||
struct core_state *core_state; /* coredumping support */
|
||||
#ifdef CONFIG_MEMBARRIER
|
||||
atomic_t membarrier_state;
|
||||
atomic_t membarrier_state;
|
||||
#endif
|
||||
#ifdef CONFIG_AIO
|
||||
spinlock_t ioctx_lock;
|
||||
struct kioctx_table __rcu *ioctx_table;
|
||||
spinlock_t ioctx_lock;
|
||||
struct kioctx_table __rcu *ioctx_table;
|
||||
#endif
|
||||
#ifdef CONFIG_MEMCG
|
||||
/*
|
||||
* "owner" points to a task that is regarded as the canonical
|
||||
* user/owner of this mm. All of the following must be true in
|
||||
* order for it to be changed:
|
||||
*
|
||||
* current == mm->owner
|
||||
* current->mm != mm
|
||||
* new_owner->mm == mm
|
||||
* new_owner->alloc_lock is held
|
||||
*/
|
||||
struct task_struct __rcu *owner;
|
||||
/*
|
||||
* "owner" points to a task that is regarded as the canonical
|
||||
* user/owner of this mm. All of the following must be true in
|
||||
* order for it to be changed:
|
||||
*
|
||||
* current == mm->owner
|
||||
* current->mm != mm
|
||||
* new_owner->mm == mm
|
||||
* new_owner->alloc_lock is held
|
||||
*/
|
||||
struct task_struct __rcu *owner;
|
||||
#endif
|
||||
struct user_namespace *user_ns;
|
||||
struct user_namespace *user_ns;
|
||||
|
||||
/* store ref to file /proc/<pid>/exe symlink points to */
|
||||
struct file __rcu *exe_file;
|
||||
/* store ref to file /proc/<pid>/exe symlink points to */
|
||||
struct file __rcu *exe_file;
|
||||
#ifdef CONFIG_MMU_NOTIFIER
|
||||
struct mmu_notifier_mm *mmu_notifier_mm;
|
||||
struct mmu_notifier_mm *mmu_notifier_mm;
|
||||
#endif
|
||||
#if defined(CONFIG_TRANSPARENT_HUGEPAGE) && !USE_SPLIT_PMD_PTLOCKS
|
||||
pgtable_t pmd_huge_pte; /* protected by page_table_lock */
|
||||
#endif
|
||||
#ifdef CONFIG_CPUMASK_OFFSTACK
|
||||
struct cpumask cpumask_allocation;
|
||||
pgtable_t pmd_huge_pte; /* protected by page_table_lock */
|
||||
#endif
|
||||
#ifdef CONFIG_NUMA_BALANCING
|
||||
/*
|
||||
* numa_next_scan is the next time that the PTEs will be marked
|
||||
* pte_numa. NUMA hinting faults will gather statistics and migrate
|
||||
* pages to new nodes if necessary.
|
||||
*/
|
||||
unsigned long numa_next_scan;
|
||||
/*
|
||||
* numa_next_scan is the next time that the PTEs will be marked
|
||||
* pte_numa. NUMA hinting faults will gather statistics and
|
||||
* migrate pages to new nodes if necessary.
|
||||
*/
|
||||
unsigned long numa_next_scan;
|
||||
|
||||
/* Restart point for scanning and setting pte_numa */
|
||||
unsigned long numa_scan_offset;
|
||||
/* Restart point for scanning and setting pte_numa */
|
||||
unsigned long numa_scan_offset;
|
||||
|
||||
/* numa_scan_seq prevents two threads setting pte_numa */
|
||||
int numa_scan_seq;
|
||||
/* numa_scan_seq prevents two threads setting pte_numa */
|
||||
int numa_scan_seq;
|
||||
#endif
|
||||
/*
|
||||
* An operation with batched TLB flushing is going on. Anything that
|
||||
* can move process memory needs to flush the TLB when moving a
|
||||
* PROT_NONE or PROT_NUMA mapped page.
|
||||
*/
|
||||
atomic_t tlb_flush_pending;
|
||||
/*
|
||||
* An operation with batched TLB flushing is going on. Anything
|
||||
* that can move process memory needs to flush the TLB when
|
||||
* moving a PROT_NONE or PROT_NUMA mapped page.
|
||||
*/
|
||||
atomic_t tlb_flush_pending;
|
||||
#ifdef CONFIG_ARCH_WANT_BATCHED_UNMAP_TLB_FLUSH
|
||||
/* See flush_tlb_batched_pending() */
|
||||
bool tlb_flush_batched;
|
||||
/* See flush_tlb_batched_pending() */
|
||||
bool tlb_flush_batched;
|
||||
#endif
|
||||
struct uprobes_state uprobes_state;
|
||||
struct uprobes_state uprobes_state;
|
||||
#ifdef CONFIG_HUGETLB_PAGE
|
||||
atomic_long_t hugetlb_usage;
|
||||
atomic_long_t hugetlb_usage;
|
||||
#endif
|
||||
struct work_struct async_put_work;
|
||||
struct work_struct async_put_work;
|
||||
|
||||
#if IS_ENABLED(CONFIG_HMM)
|
||||
/* HMM needs to track a few things per mm */
|
||||
struct hmm *hmm;
|
||||
/* HMM needs to track a few things per mm */
|
||||
struct hmm *hmm;
|
||||
#endif
|
||||
} __randomize_layout;
|
||||
} __randomize_layout;
|
||||
|
||||
/*
|
||||
* The mm_cpumask needs to be at the end of mm_struct, because it
|
||||
* is dynamically sized based on nr_cpu_ids.
|
||||
*/
|
||||
unsigned long cpu_bitmap[];
|
||||
};
|
||||
|
||||
extern struct mm_struct init_mm;
|
||||
|
||||
/* Pointer magic because the dynamic array size confuses some compilers. */
|
||||
static inline void mm_init_cpumask(struct mm_struct *mm)
|
||||
{
|
||||
#ifdef CONFIG_CPUMASK_OFFSTACK
|
||||
mm->cpu_vm_mask_var = &mm->cpumask_allocation;
|
||||
#endif
|
||||
cpumask_clear(mm->cpu_vm_mask_var);
|
||||
unsigned long cpu_bitmap = (unsigned long)mm;
|
||||
|
||||
cpu_bitmap += offsetof(struct mm_struct, cpu_bitmap);
|
||||
cpumask_clear((struct cpumask *)cpu_bitmap);
|
||||
}
|
||||
|
||||
/* Future-safe accessor for struct mm_struct's cpu_vm_mask. */
|
||||
static inline cpumask_t *mm_cpumask(struct mm_struct *mm)
|
||||
{
|
||||
return mm->cpu_vm_mask_var;
|
||||
return (struct cpumask *)&mm->cpu_bitmap;
|
||||
}
|
||||
|
||||
struct mmu_gather;
|
||||
|
|
|
@ -2276,6 +2276,8 @@ static void sighand_ctor(void *data)
|
|||
|
||||
void __init proc_caches_init(void)
|
||||
{
|
||||
unsigned int mm_size;
|
||||
|
||||
sighand_cachep = kmem_cache_create("sighand_cache",
|
||||
sizeof(struct sighand_struct), 0,
|
||||
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_TYPESAFE_BY_RCU|
|
||||
|
@ -2292,15 +2294,16 @@ void __init proc_caches_init(void)
|
|||
sizeof(struct fs_struct), 0,
|
||||
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
|
||||
NULL);
|
||||
|
||||
/*
|
||||
* FIXME! The "sizeof(struct mm_struct)" currently includes the
|
||||
* whole struct cpumask for the OFFSTACK case. We could change
|
||||
* this to *only* allocate as much of it as required by the
|
||||
* maximum number of CPU's we can ever have. The cpumask_allocation
|
||||
* is at the end of the structure, exactly for that reason.
|
||||
* The mm_cpumask is located at the end of mm_struct, and is
|
||||
* dynamically sized based on the maximum CPU number this system
|
||||
* can have, taking hotplug into account (nr_cpu_ids).
|
||||
*/
|
||||
mm_size = sizeof(struct mm_struct) + cpumask_size();
|
||||
|
||||
mm_cachep = kmem_cache_create_usercopy("mm_struct",
|
||||
sizeof(struct mm_struct), ARCH_MIN_MMSTRUCT_ALIGN,
|
||||
mm_size, ARCH_MIN_MMSTRUCT_ALIGN,
|
||||
SLAB_HWCACHE_ALIGN|SLAB_PANIC|SLAB_ACCOUNT,
|
||||
offsetof(struct mm_struct, saved_auxv),
|
||||
sizeof_field(struct mm_struct, saved_auxv),
|
||||
|
|
|
@ -92,7 +92,7 @@ static inline int ioremap_pmd_range(pud_t *pud, unsigned long addr,
|
|||
if (ioremap_pmd_enabled() &&
|
||||
((next - addr) == PMD_SIZE) &&
|
||||
IS_ALIGNED(phys_addr + addr, PMD_SIZE) &&
|
||||
pmd_free_pte_page(pmd)) {
|
||||
pmd_free_pte_page(pmd, addr)) {
|
||||
if (pmd_set_huge(pmd, phys_addr + addr, prot))
|
||||
continue;
|
||||
}
|
||||
|
@ -119,7 +119,7 @@ static inline int ioremap_pud_range(p4d_t *p4d, unsigned long addr,
|
|||
if (ioremap_pud_enabled() &&
|
||||
((next - addr) == PUD_SIZE) &&
|
||||
IS_ALIGNED(phys_addr + addr, PUD_SIZE) &&
|
||||
pud_free_pmd_page(pud)) {
|
||||
pud_free_pmd_page(pud, addr)) {
|
||||
if (pud_set_huge(pud, phys_addr + addr, prot))
|
||||
continue;
|
||||
}
|
||||
|
|
11
mm/init-mm.c
11
mm/init-mm.c
|
@ -15,6 +15,16 @@
|
|||
#define INIT_MM_CONTEXT(name)
|
||||
#endif
|
||||
|
||||
/*
|
||||
* For dynamically allocated mm_structs, there is a dynamically sized cpumask
|
||||
* at the end of the structure, the size of which depends on the maximum CPU
|
||||
* number the system can see. That way we allocate only as much memory for
|
||||
* mm_cpumask() as needed for the hundreds, or thousands of processes that
|
||||
* a system typically runs.
|
||||
*
|
||||
* Since there is only one init_mm in the entire system, keep it simple
|
||||
* and size this cpu_bitmap to NR_CPUS.
|
||||
*/
|
||||
struct mm_struct init_mm = {
|
||||
.mm_rb = RB_ROOT,
|
||||
.pgd = swapper_pg_dir,
|
||||
|
@ -25,5 +35,6 @@ struct mm_struct init_mm = {
|
|||
.arg_lock = __SPIN_LOCK_UNLOCKED(init_mm.arg_lock),
|
||||
.mmlist = LIST_HEAD_INIT(init_mm.mmlist),
|
||||
.user_ns = &init_user_ns,
|
||||
.cpu_bitmap = { [BITS_TO_LONGS(NR_CPUS)] = 0},
|
||||
INIT_MM_CONTEXT(init_mm)
|
||||
};
|
||||
|
|
22
mm/memory.c
22
mm/memory.c
|
@ -326,16 +326,20 @@ bool __tlb_remove_page_size(struct mmu_gather *tlb, struct page *page, int page_
|
|||
|
||||
#ifdef CONFIG_HAVE_RCU_TABLE_FREE
|
||||
|
||||
/*
|
||||
* See the comment near struct mmu_table_batch.
|
||||
*/
|
||||
|
||||
static void tlb_remove_table_smp_sync(void *arg)
|
||||
{
|
||||
/* Simply deliver the interrupt */
|
||||
struct mm_struct __maybe_unused *mm = arg;
|
||||
/*
|
||||
* On most architectures this does nothing. Simply delivering the
|
||||
* interrupt is enough to prevent races with software page table
|
||||
* walking like that done in get_user_pages_fast.
|
||||
*
|
||||
* See the comment near struct mmu_table_batch.
|
||||
*/
|
||||
tlb_flush_remove_tables_local(mm);
|
||||
}
|
||||
|
||||
static void tlb_remove_table_one(void *table)
|
||||
static void tlb_remove_table_one(void *table, struct mmu_gather *tlb)
|
||||
{
|
||||
/*
|
||||
* This isn't an RCU grace period and hence the page-tables cannot be
|
||||
|
@ -344,7 +348,7 @@ static void tlb_remove_table_one(void *table)
|
|||
* It is however sufficient for software page-table walkers that rely on
|
||||
* IRQ disabling. See the comment near struct mmu_table_batch.
|
||||
*/
|
||||
smp_call_function(tlb_remove_table_smp_sync, NULL, 1);
|
||||
smp_call_function(tlb_remove_table_smp_sync, tlb->mm, 1);
|
||||
__tlb_remove_table(table);
|
||||
}
|
||||
|
||||
|
@ -365,6 +369,8 @@ void tlb_table_flush(struct mmu_gather *tlb)
|
|||
{
|
||||
struct mmu_table_batch **batch = &tlb->batch;
|
||||
|
||||
tlb_flush_remove_tables(tlb->mm);
|
||||
|
||||
if (*batch) {
|
||||
call_rcu_sched(&(*batch)->rcu, tlb_remove_table_rcu);
|
||||
*batch = NULL;
|
||||
|
@ -387,7 +393,7 @@ void tlb_remove_table(struct mmu_gather *tlb, void *table)
|
|||
if (*batch == NULL) {
|
||||
*batch = (struct mmu_table_batch *)__get_free_page(GFP_NOWAIT | __GFP_NOWARN);
|
||||
if (*batch == NULL) {
|
||||
tlb_remove_table_one(table);
|
||||
tlb_remove_table_one(table, tlb);
|
||||
return;
|
||||
}
|
||||
(*batch)->nr = 0;
|
||||
|
|
Loading…
Reference in New Issue