/* SPDX-License-Identifier: GPL-2.0 */
#ifndef _ASM_X86_PROCESSOR_FLAGS_H
#define _ASM_X86_PROCESSOR_FLAGS_H

#include <uapi/asm/processor-flags.h>
#include <linux/mem_encrypt.h>
/*
 * X86_VM_MASK covers the EFLAGS bits that must be toggled when entering or
 * leaving virtual-8086 mode; it collapses to 0 when VM86 is compiled out so
 * callers can mask unconditionally.
 */
#ifdef CONFIG_VM86
#define X86_VM_MASK	X86_EFLAGS_VM
#else
#define X86_VM_MASK	0 /* No VM86 support */
#endif
/*
 * CR3's layout varies depending on several things.
 *
 * If CR4.PCIDE is set (64-bit only), then CR3[11:0] is the address space ID.
 * If PAE is enabled, then CR3[11:5] is part of the PDPT address
 * (i.e. it's 32-byte aligned, not page-aligned) and CR3[4:0] is ignored.
 * Otherwise (non-PAE, non-PCID), CR3[3] is PWT, CR3[4] is PCD, and
 * CR3[2:0] and CR3[11:5] are ignored.
 *
 * In all cases, Linux puts zeros in the low ignored bits and in PWT and PCD.
 *
 * CR3[63] is always read as zero.  If CR4.PCIDE is set, then CR3[63] may be
 * written as 1 to prevent the write to CR3 from flushing the TLB.
 *
 * On systems with SME, one bit (in a variable position!) is stolen to indicate
 * that the top-level paging structure is encrypted.
 *
 * All of the remaining bits indicate the physical address of the top-level
 * paging structure.
 *
 * CR3_ADDR_MASK is the mask used by read_cr3_pa().
 */
#ifdef CONFIG_X86_64
/* Mask off the address space ID and SME encryption bits. */
#define CR3_ADDR_MASK	__sme_clr(0x7FFFFFFFFFFFF000ull)
#define CR3_PCID_MASK	0xFFFull
#define CR3_NOFLUSH	(1UL << 63)
#else
/*
 * CR3_ADDR_MASK needs at least bits 31:5 set on PAE systems, and we save
 * a tiny bit of code size by setting all the bits.
 */
#define CR3_ADDR_MASK	0xFFFFFFFFull
#define CR3_PCID_MASK	0ull
#define CR3_NOFLUSH	0
#endif

#endif /* _ASM_X86_PROCESSOR_FLAGS_H */