powerpc/64s/radix: Optimize TLB range flush barriers

Short range flushes issue a sequences of tlbie(l) instructions for
individual effective addresses. These do not all require individual
barrier sequences, only one covering all tlbie(l) instructions.

Commit f7327e0ba3 ("powerpc/mm/radix: Remove unnecessary ptesync")
made a similar optimization for tlbiel for PID flushing.

For tlbie, the ISA says:

    The tlbsync instruction provides an ordering function for the
    effects of all tlbie instructions executed by the thread executing
    the tlbsync instruction, with respect to the memory barrier
    created by a subsequent ptesync instruction executed by the same
    thread.

Time to munmap 30 pages of memory (after mmap, touch):
         local   global
vanilla  10.9us  22.3us
patched   3.4us  14.4us

Signed-off-by: Nicholas Piggin <npiggin@gmail.com>
Signed-off-by: Michael Ellerman <mpe@ellerman.id.au>
This commit is contained in:
Nicholas Piggin 2017-11-07 18:53:05 +11:00 committed by Michael Ellerman
parent a54c61f46e
commit 14001c6093
1 changed files with 32 additions and 9 deletions

View File

@ -84,7 +84,7 @@ static inline void _tlbie_pid(unsigned long pid, unsigned long ric)
trace_tlbie(0, 0, rb, rs, ric, prs, r);
}
static inline void _tlbiel_va(unsigned long va, unsigned long pid,
static inline void __tlbiel_va(unsigned long va, unsigned long pid,
unsigned long ap, unsigned long ric)
{
unsigned long rb,rs,prs,r;
@ -95,14 +95,20 @@ static inline void _tlbiel_va(unsigned long va, unsigned long pid,
prs = 1; /* process scoped */
r = 1; /* raidx format */
asm volatile("ptesync": : :"memory");
asm volatile(PPC_TLBIEL(%0, %4, %3, %2, %1)
: : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
asm volatile("ptesync": : :"memory");
trace_tlbie(0, 1, rb, rs, ric, prs, r);
}
static inline void _tlbie_va(unsigned long va, unsigned long pid,
static inline void _tlbiel_va(unsigned long va, unsigned long pid,
unsigned long ap, unsigned long ric)
{
asm volatile("ptesync": : :"memory");
__tlbiel_va(va, pid, ap, ric);
asm volatile("ptesync": : :"memory");
}
static inline void __tlbie_va(unsigned long va, unsigned long pid,
unsigned long ap, unsigned long ric)
{
unsigned long rb,rs,prs,r;
@ -113,13 +119,20 @@ static inline void _tlbie_va(unsigned long va, unsigned long pid,
prs = 1; /* process scoped */
r = 1; /* raidx format */
asm volatile("ptesync": : :"memory");
asm volatile(PPC_TLBIE_5(%0, %4, %3, %2, %1)
: : "r"(rb), "i"(r), "i"(prs), "i"(ric), "r"(rs) : "memory");
asm volatile("eieio; tlbsync; ptesync": : :"memory");
trace_tlbie(0, 0, rb, rs, ric, prs, r);
}
static inline void _tlbie_va(unsigned long va, unsigned long pid,
unsigned long ap, unsigned long ric)
{
asm volatile("ptesync": : :"memory");
__tlbie_va(va, pid, ap, ric);
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
/*
* Base TLB flushing operations:
*
@ -341,13 +354,17 @@ void radix__flush_tlb_range_psize(struct mm_struct *mm, unsigned long start,
else
_tlbie_pid(pid, RIC_FLUSH_TLB);
} else {
asm volatile("ptesync": : :"memory");
for (addr = start; addr < end; addr += page_size) {
if (local)
_tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
__tlbiel_va(addr, pid, ap, RIC_FLUSH_TLB);
else
_tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
__tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
}
if (local)
asm volatile("ptesync": : :"memory");
else
asm volatile("eieio; tlbsync; ptesync": : :"memory");
}
preempt_enable();
}
@ -378,6 +395,7 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
_tlbie_pid(pid, RIC_FLUSH_PWC);
/* Then iterate the pages */
asm volatile("ptesync": : :"memory");
end = addr + HPAGE_PMD_SIZE;
for (; addr < end; addr += PAGE_SIZE) {
if (local)
@ -386,6 +404,11 @@ void radix__flush_tlb_collapsed_pmd(struct mm_struct *mm, unsigned long addr)
_tlbie_va(addr, pid, ap, RIC_FLUSH_TLB);
}
if (local)
asm volatile("ptesync": : :"memory");
else
asm volatile("eieio; tlbsync; ptesync": : :"memory");
preempt_enable();
}
#endif /* CONFIG_TRANSPARENT_HUGEPAGE */